diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 55f96014..a6171bf5 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -1,278 +1,279 @@ # Makefile for cipher modules # Copyright (C) 1998, 1999, 2000, 2001, 2002, # 2003, 2009 Free Software Foundation, Inc. # # This file is part of Libgcrypt. # # Libgcrypt is free software; you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation; either version 2.1 of # the License, or (at your option) any later version. # # Libgcrypt is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this program; if not, see . # Process this file with automake to produce Makefile.in # Need to include ../src in addition to top_srcdir because gcrypt.h is # a built header. AM_CPPFLAGS = -I../src -I$(top_srcdir)/src -I../mpi -I$(top_srcdir)/mpi AM_CFLAGS = $(GPG_ERROR_CFLAGS) AM_CCASFLAGS = $(NOEXECSTACK_FLAGS) EXTRA_DIST = gost-s-box.c CLEANFILES = gost-s-box$(EXEEXT_FOR_BUILD) DISTCLEANFILES = gost-sb.h noinst_LTLIBRARIES = libcipher.la GCRYPT_MODULES = @GCRYPT_CIPHERS@ @GCRYPT_PUBKEY_CIPHERS@ \ @GCRYPT_DIGESTS@ @GCRYPT_KDFS@ libcipher_la_DEPENDENCIES = $(GCRYPT_MODULES) libcipher_la_LIBADD = $(GCRYPT_MODULES) libcipher_la_SOURCES = \ cipher.c cipher-internal.h \ cipher-cbc.c \ cipher-cfb.c \ cipher-ofb.c \ cipher-ctr.c \ cipher-aeswrap.c \ cipher-ccm.c \ cipher-cmac.c \ cipher-gcm.c \ cipher-poly1305.c \ cipher-ocb.c \ cipher-xts.c \ cipher-eax.c \ cipher-siv.c \ cipher-gcm-siv.c \ cipher-selftest.c cipher-selftest.h \ pubkey.c pubkey-internal.h pubkey-util.c \ md.c \ mac.c mac-internal.h \ mac-hmac.c mac-cmac.c mac-gmac.c mac-poly1305.c \ poly1305.c poly1305-internal.h \ kdf.c kdf-internal.h \ bithelp.h \ bufhelp.h \ primegen.c \ hash-common.c hash-common.h \ dsa-common.c rsa-common.c \ sha1.h EXTRA_libcipher_la_SOURCES = \ asm-common-aarch64.h \ asm-common-amd64.h \ asm-common-s390x.h \ asm-inline-s390x.h \ asm-poly1305-aarch64.h \ asm-poly1305-amd64.h \ asm-poly1305-s390x.h \ arcfour.c arcfour-amd64.S \ blowfish.c blowfish-amd64.S blowfish-arm.S \ cast5.c cast5-amd64.S cast5-arm.S \ chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S \ chacha20-amd64-avx512.S chacha20-armv7-neon.S chacha20-aarch64.S \ chacha20-ppc.c chacha20-s390x.S \ cipher-gcm-ppc.c cipher-gcm-intel-pclmul.c cipher-gcm-armv7-neon.S \ cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \ crc.c crc-intel-pclmul.c crc-armv8-ce.c \ crc-armv8-aarch64-ce.S \ crc-ppc.c \ des.c des-amd64.S \ dsa.c \ elgamal.c \ ecc.c ecc-curves.c ecc-misc.c ecc-common.h \ ecc-ecdh.c ecc-ecdsa.c ecc-eddsa.c ecc-gost.c ecc-sm2.c \ idea.c \ gost28147.c gost.h \ gostr3411-94.c \ md4.c \ md5.c \ poly1305-s390x.S poly1305-amd64-avx512.S \ rijndael.c rijndael-internal.h rijndael-tables.h \ rijndael-aesni.c rijndael-padlock.c \ rijndael-amd64.S rijndael-arm.S \ rijndael-ssse3-amd64.c rijndael-ssse3-amd64-asm.S \ rijndael-vaes.c rijndael-vaes-avx2-amd64.S \ rijndael-armv8-ce.c rijndael-armv8-aarch32-ce.S \ rijndael-armv8-aarch64-ce.S rijndael-aarch64.S \ rijndael-ppc.c rijndael-ppc9le.c \ rijndael-p10le.c rijndael-gcm-p10le.s \ rijndael-ppc-common.h rijndael-ppc-functions.h \ rijndael-s390x.c \ rmd160.c \ rsa.c \ salsa20.c salsa20-amd64.S 
	salsa20-armv7-neon.S \
	scrypt.c \
	seed.c \
	serpent.c serpent-sse2-amd64.S \
	sm4.c sm4-aesni-avx-amd64.S sm4-aesni-avx2-amd64.S sm4-aarch64.S \
	sm4-armv8-aarch64-ce.S sm4-gfni-avx2-amd64.S \
	serpent-avx2-amd64.S serpent-armv7-neon.S \
	sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \
	sha1-avx2-bmi2-amd64.S sha1-armv7-neon.S sha1-armv8-aarch32-ce.S \
	sha1-armv8-aarch64-ce.S sha1-intel-shaext.c \
	sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S \
	sha256-avx2-bmi2-amd64.S \
	sha256-armv8-aarch32-ce.S sha256-armv8-aarch64-ce.S \
	sha256-intel-shaext.c sha256-ppc.c \
	sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S \
	sha512-avx2-bmi2-amd64.S sha512-avx512-amd64.S \
	sha512-armv7-neon.S sha512-arm.S \
	sha512-ppc.c sha512-ssse3-i386.c \
	sm3.c sm3-avx-bmi2-amd64.S sm3-aarch64.S sm3-armv8-aarch64-ce.S \
	keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \
	stribog.c \
	tiger.c \
	whirlpool.c whirlpool-sse2-amd64.S \
	twofish.c twofish-amd64.S twofish-arm.S twofish-aarch64.S \
	twofish-avx2-amd64.S \
	rfc2268.c \
	camellia.c camellia.h camellia-glue.c camellia-aesni-avx-amd64.S \
-	camellia-aesni-avx2-amd64.h camellia-gfni-avx2-amd64.S \
+	camellia-aesni-avx2-amd64.h \
+	camellia-gfni-avx2-amd64.S camellia-gfni-avx512-amd64.S \
	camellia-vaes-avx2-amd64.S camellia-aesni-avx2-amd64.S \
	camellia-arm.S camellia-aarch64.S \
	blake2.c \
	blake2b-amd64-avx2.S blake2s-amd64-avx.S

gost28147.lo: gost-sb.h

gost-sb.h: gost-s-box$(EXEEXT_FOR_BUILD)
	./gost-s-box$(EXEEXT_FOR_BUILD) $@

gost-s-box$(EXEEXT_FOR_BUILD): gost-s-box.c
	$(CC_FOR_BUILD) $(CFLAGS_FOR_BUILD) $(LDFLAGS_FOR_BUILD) \
	    $(CPPFLAGS_FOR_BUILD) -o $@ $(srcdir)/gost-s-box.c

if ENABLE_O_FLAG_MUNGING
o_flag_munging = sed -e 's/-O\([2-9sg][2-9sg]*\)/-O1/' -e 's/-Ofast/-O1/g'
else
o_flag_munging = cat
endif

# We need to lower the optimization for this module.
tiger.o: $(srcdir)/tiger.c Makefile
	`echo $(COMPILE) -c $< | $(o_flag_munging) `
tiger.lo: $(srcdir)/tiger.c Makefile
	`echo $(LTCOMPILE) -c $< | $(o_flag_munging) `

# We need to disable instrumentation for these modules as they use cc as
# thin assembly front-end and do not tolerate in-between function calls
# inserted by compiler as those functions may clobber the XMM registers.
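# As a rough illustration of how these munging rules work (hypothetical
# compiler invocation; GNU sed assumed): the recipe under each target is a
# backquoted shell command, so the shell first echoes the full compile
# command, pipes it through the chosen sed filter, and then executes the
# rewritten command produced by the command substitution.  For the tiger.c
# rules above that means, for example:
#
#   $ echo 'gcc -O2 -Ofast -c tiger.c' | \
#       sed -e 's/-O\([2-9sg][2-9sg]*\)/-O1/' -e 's/-Ofast/-O1/g'
#   gcc -O1 -O1 -c tiger.c
#
# so the module is effectively built with -O1 regardless of the optimization
# level in CFLAGS.  The instrumentation_munging filter defined next applies
# the same backquote-plus-sed trick to strip -fsanitize*, -fprofile* and
# -fcoverage* options from the modules listed below.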
if ENABLE_INSTRUMENTATION_MUNGING
instrumentation_munging = sed \
	-e 's/-fsanitize[=,\-][=,a-z,A-Z,0-9,\,,\-]*//g' \
	-e 's/-fprofile[=,\-][=,a-z,A-Z,0-9,\,,\-]*//g' \
	-e 's/-fcoverage[=,\-][=,a-z,A-Z,0-9,\,,\-]*//g'
else
instrumentation_munging = cat
endif

rijndael-aesni.o: $(srcdir)/rijndael-aesni.c Makefile
	`echo $(COMPILE) -c $< | $(instrumentation_munging) `
rijndael-aesni.lo: $(srcdir)/rijndael-aesni.c Makefile
	`echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `

rijndael-ssse3-amd64.o: $(srcdir)/rijndael-ssse3-amd64.c Makefile
	`echo $(COMPILE) -c $< | $(instrumentation_munging) `
rijndael-ssse3-amd64.lo: $(srcdir)/rijndael-ssse3-amd64.c Makefile
	`echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `

cipher-gcm-intel-pclmul.o: $(srcdir)/cipher-gcm-intel-pclmul.c Makefile
	`echo $(COMPILE) -c $< | $(instrumentation_munging) `
cipher-gcm-intel-pclmul.lo: $(srcdir)/cipher-gcm-intel-pclmul.c Makefile
	`echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `

sha1-intel-shaext.o: $(srcdir)/sha1-intel-shaext.c Makefile
	`echo $(COMPILE) -c $< | $(instrumentation_munging) `
sha1-intel-shaext.lo: $(srcdir)/sha1-intel-shaext.c Makefile
	`echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `

sha256-intel-shaext.o: $(srcdir)/sha256-intel-shaext.c Makefile
	`echo $(COMPILE) -c $< | $(instrumentation_munging) `
sha256-intel-shaext.lo: $(srcdir)/sha256-intel-shaext.c Makefile
	`echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `

sha256-ssse3-i386.o: $(srcdir)/sha256-ssse3-i386.c Makefile
	`echo $(COMPILE) -c $< | $(instrumentation_munging) `
sha256-ssse3-i386.lo: $(srcdir)/sha256-ssse3-i386.c Makefile
	`echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `

crc-intel-pclmul.o: $(srcdir)/crc-intel-pclmul.c Makefile
	`echo $(COMPILE) -c $< | $(instrumentation_munging) `
crc-intel-pclmul.lo: $(srcdir)/crc-intel-pclmul.c Makefile
	`echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `

if ENABLE_PPC_VCRYPTO_EXTRA_CFLAGS
ppc_vcrypto_cflags = -O2 -maltivec -mvsx -mcrypto
else
ppc_vcrypto_cflags =
endif

rijndael-ppc.o: $(srcdir)/rijndael-ppc.c Makefile
	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
rijndael-ppc.lo: $(srcdir)/rijndael-ppc.c Makefile
	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `

rijndael-ppc9le.o: $(srcdir)/rijndael-ppc9le.c Makefile
	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
rijndael-ppc9le.lo: $(srcdir)/rijndael-ppc9le.c Makefile
	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `

rijndael-p10le.o: $(srcdir)/rijndael-p10le.c Makefile
	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
rijndael-p10le.lo: $(srcdir)/rijndael-p10le.c Makefile
	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `

sha256-ppc.o: $(srcdir)/sha256-ppc.c Makefile
	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
sha256-ppc.lo: $(srcdir)/sha256-ppc.c Makefile
	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `

sha512-ppc.o: $(srcdir)/sha512-ppc.c Makefile
	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
sha512-ppc.lo: $(srcdir)/sha512-ppc.c Makefile
	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `

chacha20-ppc.o: $(srcdir)/chacha20-ppc.c Makefile
	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
chacha20-ppc.lo: $(srcdir)/chacha20-ppc.c Makefile
	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
crc-ppc.o: $(srcdir)/crc-ppc.c Makefile
	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
crc-ppc.lo: $(srcdir)/crc-ppc.c Makefile
	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `

cipher-gcm-ppc.o: $(srcdir)/cipher-gcm-ppc.c Makefile
	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
cipher-gcm-ppc.lo: $(srcdir)/cipher-gcm-ppc.c Makefile
	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
diff --git a/cipher/bulkhelp.h b/cipher/bulkhelp.h
index b1b4b2e1..8c322ede 100644
--- a/cipher/bulkhelp.h
+++ b/cipher/bulkhelp.h
@@ -1,396 +1,425 @@
/* bulkhelp.h - Some bulk processing helpers
 * Copyright (C) 2022 Jussi Kivilinna
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see .
 */

#ifndef GCRYPT_BULKHELP_H
#define GCRYPT_BULKHELP_H

#include "g10lib.h"
#include "cipher-internal.h"


#ifdef __x86_64__
/* Use u64 to store pointers for x32 support (assembly function assumes
 * 64-bit pointers). */
typedef u64 ocb_L_uintptr_t;
#else
typedef uintptr_t ocb_L_uintptr_t;
#endif

typedef unsigned int (*bulk_crypt_fn_t) (const void *ctx, byte *out,
                                         const byte *in,
                                         unsigned int num_blks);


+static inline ocb_L_uintptr_t *
+bulk_ocb_prepare_L_pointers_array_blk64 (gcry_cipher_hd_t c,
+                                         ocb_L_uintptr_t Ls[64], u64 blkn)
+{
+  unsigned int n = 64 - (blkn % 64);
+  unsigned int i;
+
+  for (i = 0; i < 64; i += 8)
+    {
+      Ls[(i + 0 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+      Ls[(i + 1 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+      Ls[(i + 2 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+      Ls[(i + 3 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+      Ls[(i + 4 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+      Ls[(i + 5 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+      Ls[(i + 6 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+    }
+
+  Ls[(7 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+  Ls[(15 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[4];
+  Ls[(23 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+  Ls[(31 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[5];
+  Ls[(39 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+  Ls[(47 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[4];
+  Ls[(55 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+  return &Ls[(63 + n) % 64];
+}
+
+
static inline ocb_L_uintptr_t *
bulk_ocb_prepare_L_pointers_array_blk32 (gcry_cipher_hd_t c,
                                         ocb_L_uintptr_t Ls[32], u64 blkn)
{
  unsigned int n = 32 - (blkn % 32);
  unsigned int i;

  for (i = 0; i < 32; i += 8)
    {
      Ls[(i + 0 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
      Ls[(i + 1 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
      Ls[(i + 2 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
      Ls[(i + 3 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
      Ls[(i + 4 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
      Ls[(i + 5 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
      Ls[(i + 6 + n) % 32] = (uintptr_t)(void
*)c->u_mode.ocb.L[0]; } Ls[(7 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[3]; Ls[(15 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[4]; Ls[(23 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[3]; return &Ls[(31 + n) % 32]; } static inline ocb_L_uintptr_t * bulk_ocb_prepare_L_pointers_array_blk16 (gcry_cipher_hd_t c, ocb_L_uintptr_t Ls[16], u64 blkn) { unsigned int n = 16 - (blkn % 16); unsigned int i; for (i = 0; i < 16; i += 8) { Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0]; Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1]; Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0]; Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2]; Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0]; Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1]; Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0]; } Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3]; return &Ls[(15 + n) % 16]; } static inline ocb_L_uintptr_t * bulk_ocb_prepare_L_pointers_array_blk8 (gcry_cipher_hd_t c, ocb_L_uintptr_t Ls[8], u64 blkn) { unsigned int n = 8 - (blkn % 8); Ls[(0 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0]; Ls[(1 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1]; Ls[(2 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0]; Ls[(3 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[2]; Ls[(4 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0]; Ls[(5 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1]; Ls[(6 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0]; Ls[(7 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[3]; return &Ls[(7 + n) % 8]; } static inline unsigned int bulk_ctr_enc_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf, const byte *inbuf, size_t nblocks, byte *ctr, byte *tmpbuf, size_t tmpbuf_nblocks, unsigned int *num_used_tmpblocks) { unsigned int tmp_used = 16; unsigned int burn_depth = 0; unsigned int nburn; while (nblocks >= 1) { size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks; size_t i; if (curr_blks * 16 > tmp_used) tmp_used = curr_blks * 16; cipher_block_cpy (tmpbuf + 0 * 16, ctr, 16); for (i = 1; i < curr_blks; i++) { cipher_block_cpy (&tmpbuf[i * 16], ctr, 16); cipher_block_add (&tmpbuf[i * 16], i, 16); } cipher_block_add (ctr, curr_blks, 16); nburn = crypt_fn (priv, tmpbuf, tmpbuf, curr_blks); burn_depth = nburn > burn_depth ? nburn : burn_depth; for (i = 0; i < curr_blks; i++) { cipher_block_xor (outbuf, &tmpbuf[i * 16], inbuf, 16); outbuf += 16; inbuf += 16; } nblocks -= curr_blks; } *num_used_tmpblocks = tmp_used; return burn_depth; } static inline unsigned int bulk_cbc_dec_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf, const byte *inbuf, size_t nblocks, byte *iv, byte *tmpbuf, size_t tmpbuf_nblocks, unsigned int *num_used_tmpblocks) { unsigned int tmp_used = 16; unsigned int burn_depth = 0; unsigned int nburn; while (nblocks >= 1) { size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks; size_t i; if (curr_blks * 16 > tmp_used) tmp_used = curr_blks * 16; nburn = crypt_fn (priv, tmpbuf, inbuf, curr_blks); burn_depth = nburn > burn_depth ? 
nburn : burn_depth; for (i = 0; i < curr_blks; i++) { cipher_block_xor_n_copy_2(outbuf, &tmpbuf[i * 16], iv, inbuf, 16); outbuf += 16; inbuf += 16; } nblocks -= curr_blks; } *num_used_tmpblocks = tmp_used; return burn_depth; } static inline unsigned int bulk_cfb_dec_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf, const byte *inbuf, size_t nblocks, byte *iv, byte *tmpbuf, size_t tmpbuf_nblocks, unsigned int *num_used_tmpblocks) { unsigned int tmp_used = 16; unsigned int burn_depth = 0; unsigned int nburn; while (nblocks >= 1) { size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks; size_t i; if (curr_blks * 16 > tmp_used) tmp_used = curr_blks * 16; cipher_block_cpy (&tmpbuf[0 * 16], iv, 16); if (curr_blks > 1) memcpy (&tmpbuf[1 * 16], &inbuf[(1 - 1) * 16], 16 * curr_blks - 16); cipher_block_cpy (iv, &inbuf[(curr_blks - 1) * 16], 16); nburn = crypt_fn (priv, tmpbuf, tmpbuf, curr_blks); burn_depth = nburn > burn_depth ? nburn : burn_depth; for (i = 0; i < curr_blks; i++) { cipher_block_xor (outbuf, inbuf, &tmpbuf[i * 16], 16); outbuf += 16; inbuf += 16; } nblocks -= curr_blks; } *num_used_tmpblocks = tmp_used; return burn_depth; } static inline unsigned int bulk_ocb_crypt_128 (gcry_cipher_hd_t c, void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf, const byte *inbuf, size_t nblocks, u64 *blkn, int encrypt, byte *tmpbuf, size_t tmpbuf_nblocks, unsigned int *num_used_tmpblocks) { unsigned int tmp_used = 16; unsigned int burn_depth = 0; unsigned int nburn; while (nblocks >= 1) { size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks; size_t i; if (curr_blks * 16 > tmp_used) tmp_used = curr_blks * 16; for (i = 0; i < curr_blks; i++) { const unsigned char *l = ocb_get_l(c, ++*blkn); /* Checksum_i = Checksum_{i-1} xor P_i */ if (encrypt) cipher_block_xor_1(c->u_ctr.ctr, &inbuf[i * 16], 16); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ cipher_block_xor_2dst (&tmpbuf[i * 16], c->u_iv.iv, l, 16); cipher_block_xor (&outbuf[i * 16], &inbuf[i * 16], c->u_iv.iv, 16); } /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ nburn = crypt_fn (priv, outbuf, outbuf, curr_blks); burn_depth = nburn > burn_depth ? nburn : burn_depth; for (i = 0; i < curr_blks; i++) { cipher_block_xor_1 (&outbuf[i * 16], &tmpbuf[i * 16], 16); /* Checksum_i = Checksum_{i-1} xor P_i */ if (!encrypt) cipher_block_xor_1(c->u_ctr.ctr, &outbuf[i * 16], 16); } outbuf += curr_blks * 16; inbuf += curr_blks * 16; nblocks -= curr_blks; } *num_used_tmpblocks = tmp_used; return burn_depth; } static inline unsigned int bulk_ocb_auth_128 (gcry_cipher_hd_t c, void *priv, bulk_crypt_fn_t crypt_fn, const byte *abuf, size_t nblocks, u64 *blkn, byte *tmpbuf, size_t tmpbuf_nblocks, unsigned int *num_used_tmpblocks) { unsigned int tmp_used = 16; unsigned int burn_depth = 0; unsigned int nburn; while (nblocks >= 1) { size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks; size_t i; if (curr_blks * 16 > tmp_used) tmp_used = curr_blks * 16; for (i = 0; i < curr_blks; i++) { const unsigned char *l = ocb_get_l(c, ++*blkn); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ cipher_block_xor_2dst (&tmpbuf[i * 16], c->u_mode.ocb.aad_offset, l, 16); cipher_block_xor_1 (&tmpbuf[i * 16], &abuf[i * 16], 16); } /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ nburn = crypt_fn (priv, tmpbuf, tmpbuf, curr_blks); burn_depth = nburn > burn_depth ? 
nburn : burn_depth; for (i = 0; i < curr_blks; i++) { cipher_block_xor_1 (c->u_mode.ocb.aad_sum, &tmpbuf[i * 16], 16); } abuf += curr_blks * 16; nblocks -= curr_blks; } *num_used_tmpblocks = tmp_used; return burn_depth; } static inline unsigned int bulk_xts_crypt_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf, const byte *inbuf, size_t nblocks, byte *tweak, byte *tmpbuf, size_t tmpbuf_nblocks, unsigned int *num_used_tmpblocks) { u64 tweak_lo, tweak_hi, tweak_next_lo, tweak_next_hi, tmp_lo, tmp_hi, carry; unsigned int tmp_used = 16; unsigned int burn_depth = 0; unsigned int nburn; tweak_next_lo = buf_get_le64 (tweak + 0); tweak_next_hi = buf_get_le64 (tweak + 8); while (nblocks >= 1) { size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks; size_t i; if (curr_blks * 16 > tmp_used) tmp_used = curr_blks * 16; for (i = 0; i < curr_blks; i++) { tweak_lo = tweak_next_lo; tweak_hi = tweak_next_hi; /* Generate next tweak. */ carry = -(tweak_next_hi >> 63) & 0x87; tweak_next_hi = (tweak_next_hi << 1) + (tweak_next_lo >> 63); tweak_next_lo = (tweak_next_lo << 1) ^ carry; /* Xor-Encrypt/Decrypt-Xor block. */ tmp_lo = buf_get_le64 (inbuf + i * 16 + 0) ^ tweak_lo; tmp_hi = buf_get_le64 (inbuf + i * 16 + 8) ^ tweak_hi; buf_put_he64 (&tmpbuf[i * 16 + 0], tweak_lo); buf_put_he64 (&tmpbuf[i * 16 + 8], tweak_hi); buf_put_le64 (outbuf + i * 16 + 0, tmp_lo); buf_put_le64 (outbuf + i * 16 + 8, tmp_hi); } nburn = crypt_fn (priv, outbuf, outbuf, curr_blks); burn_depth = nburn > burn_depth ? nburn : burn_depth; for (i = 0; i < curr_blks; i++) { /* Xor-Encrypt/Decrypt-Xor block. */ tweak_lo = buf_get_he64 (&tmpbuf[i * 16 + 0]); tweak_hi = buf_get_he64 (&tmpbuf[i * 16 + 8]); tmp_lo = buf_get_le64 (outbuf + i * 16 + 0) ^ tweak_lo; tmp_hi = buf_get_le64 (outbuf + i * 16 + 8) ^ tweak_hi; buf_put_le64 (outbuf + i * 16 + 0, tmp_lo); buf_put_le64 (outbuf + i * 16 + 8, tmp_hi); } inbuf += curr_blks * 16; outbuf += curr_blks * 16; nblocks -= curr_blks; } buf_put_le64 (tweak + 0, tweak_next_lo); buf_put_le64 (tweak + 8, tweak_next_hi); *num_used_tmpblocks = tmp_used; return burn_depth; } #endif /*GCRYPT_BULKHELP_H*/ diff --git a/cipher/camellia-aesni-avx2-amd64.h b/cipher/camellia-aesni-avx2-amd64.h index 9cc5621e..411e790f 100644 --- a/cipher/camellia-aesni-avx2-amd64.h +++ b/cipher/camellia-aesni-avx2-amd64.h @@ -1,2220 +1,2218 @@ /* camellia-aesni-avx2-amd64.h - AES-NI/VAES/GFNI/AVX2 implementation of Camellia * * Copyright (C) 2013-2015,2020-2022 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
*/ #ifndef GCRY_CAMELLIA_AESNI_AVX2_AMD64_H #define GCRY_CAMELLIA_AESNI_AVX2_AMD64_H #include "asm-common-amd64.h" #define CAMELLIA_TABLE_BYTE_LEN 272 /* struct CAMELLIA_context: */ #define key_table 0 #define key_bitlength CAMELLIA_TABLE_BYTE_LEN /* register macros */ #define CTX %rdi #define RIO %r8 /********************************************************************** helper macros **********************************************************************/ #ifndef CAMELLIA_GFNI_BUILD #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ vpand x, mask4bit, tmp0; \ vpandn x, mask4bit, x; \ vpsrld $4, x, x; \ \ vpshufb tmp0, lo_t, tmp0; \ vpshufb x, hi_t, x; \ vpxor tmp0, x, x; #endif #define ymm0_x xmm0 #define ymm1_x xmm1 #define ymm2_x xmm2 #define ymm3_x xmm3 #define ymm4_x xmm4 #define ymm5_x xmm5 #define ymm6_x xmm6 #define ymm7_x xmm7 #define ymm8_x xmm8 #define ymm9_x xmm9 #define ymm10_x xmm10 #define ymm11_x xmm11 #define ymm12_x xmm12 #define ymm13_x xmm13 #define ymm14_x xmm14 #define ymm15_x xmm15 #ifdef CAMELLIA_VAES_BUILD # define IF_AESNI(...) # define IF_VAES(...) __VA_ARGS__ #else # define IF_AESNI(...) __VA_ARGS__ # define IF_VAES(...) #endif /********************************************************************** GFNI helper macros and constants **********************************************************************/ #ifdef CAMELLIA_GFNI_BUILD #define BV8(a0,a1,a2,a3,a4,a5,a6,a7) \ ( (((a0) & 1) << 0) | \ (((a1) & 1) << 1) | \ (((a2) & 1) << 2) | \ (((a3) & 1) << 3) | \ (((a4) & 1) << 4) | \ (((a5) & 1) << 5) | \ (((a6) & 1) << 6) | \ (((a7) & 1) << 7) ) #define BM8X8(l0,l1,l2,l3,l4,l5,l6,l7) \ ( ((l7) << (0 * 8)) | \ ((l6) << (1 * 8)) | \ ((l5) << (2 * 8)) | \ ((l4) << (3 * 8)) | \ ((l3) << (4 * 8)) | \ ((l2) << (5 * 8)) | \ ((l1) << (6 * 8)) | \ ((l0) << (7 * 8)) ) /* Pre-filters and post-filters constants for Camellia sboxes s1, s2, s3 and s4. * See http://urn.fi/URN:NBN:fi:oulu-201305311409, pages 43-48. * * Pre-filters are directly from above source, "θ₁"/"θ₄". Post-filters are * combination of function "A" (AES SubBytes affine transformation) and * "ψ₁"/"ψ₂"/"ψ₃". */ /* Constant from "θ₁(x)" and "θ₄(x)" functions. 
*/ #define pre_filter_constant_s1234 BV8(1, 0, 1, 0, 0, 0, 1, 0) /* Constant from "ψ₁(A(x))" function: */ #define post_filter_constant_s14 BV8(0, 1, 1, 1, 0, 1, 1, 0) /* Constant from "ψ₂(A(x))" function: */ #define post_filter_constant_s2 BV8(0, 0, 1, 1, 1, 0, 1, 1) /* Constant from "ψ₃(A(x))" function: */ #define post_filter_constant_s3 BV8(1, 1, 1, 0, 1, 1, 0, 0) #endif /* CAMELLIA_GFNI_BUILD */ /********************************************************************** 32-way camellia **********************************************************************/ #ifdef CAMELLIA_GFNI_BUILD /* roundsm32 (GFNI version) * IN: * x0..x7: byte-sliced AB state * mem_cd: register pointer storing CD state * key: index for key material * OUT: * x0..x7: new byte-sliced CD state */ #define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, \ t6, t7, mem_cd, key) \ /* \ * S-function with AES subbytes \ */ \ vpbroadcastq .Lpre_filter_bitmatrix_s123 rRIP, t5; \ vpbroadcastq .Lpre_filter_bitmatrix_s4 rRIP, t2; \ vpbroadcastq .Lpost_filter_bitmatrix_s14 rRIP, t4; \ vpbroadcastq .Lpost_filter_bitmatrix_s2 rRIP, t3; \ vpbroadcastq .Lpost_filter_bitmatrix_s3 rRIP, t6; \ vpxor t7##_x, t7##_x, t7##_x; \ vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \ \ /* prefilter sboxes */ \ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x0, x0; \ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x7, x7; \ vgf2p8affineqb $(pre_filter_constant_s1234), t2, x3, x3; \ vgf2p8affineqb $(pre_filter_constant_s1234), t2, x6, x6; \ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x2, x2; \ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x5, x5; \ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x1, x1; \ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x4, x4; \ \ /* sbox GF8 inverse + postfilter sboxes 1 and 4 */ \ vgf2p8affineinvqb $(post_filter_constant_s14), t4, x0, x0; \ vgf2p8affineinvqb $(post_filter_constant_s14), t4, x7, x7; \ vgf2p8affineinvqb $(post_filter_constant_s14), t4, x3, x3; \ vgf2p8affineinvqb $(post_filter_constant_s14), t4, x6, x6; \ \ /* sbox GF8 inverse + postfilter sbox 3 */ \ vgf2p8affineinvqb $(post_filter_constant_s3), t6, x2, x2; \ vgf2p8affineinvqb $(post_filter_constant_s3), t6, x5, x5; \ \ /* sbox GF8 inverse + postfilter sbox 2 */ \ vgf2p8affineinvqb $(post_filter_constant_s2), t3, x1, x1; \ vgf2p8affineinvqb $(post_filter_constant_s2), t3, x4, x4; \ \ vpsrldq $1, t0, t1; \ vpsrldq $2, t0, t2; \ vpshufb t7, t1, t1; \ vpsrldq $3, t0, t3; \ \ /* P-function */ \ vpxor x5, x0, x0; \ vpxor x6, x1, x1; \ vpxor x7, x2, x2; \ vpxor x4, x3, x3; \ \ vpshufb t7, t2, t2; \ vpsrldq $4, t0, t4; \ vpshufb t7, t3, t3; \ vpsrldq $5, t0, t5; \ vpshufb t7, t4, t4; \ \ vpxor x2, x4, x4; \ vpxor x3, x5, x5; \ vpxor x0, x6, x6; \ vpxor x1, x7, x7; \ \ vpsrldq $6, t0, t6; \ vpshufb t7, t5, t5; \ vpshufb t7, t6, t6; \ \ vpxor x7, x0, x0; \ vpxor x4, x1, x1; \ vpxor x5, x2, x2; \ vpxor x6, x3, x3; \ \ vpxor x3, x4, x4; \ vpxor x0, x5, x5; \ vpxor x1, x6, x6; \ vpxor x2, x7, x7; /* note: high and low parts swapped */ \ \ /* Add key material and result to CD (x becomes new CD) */ \ \ vpxor t6, x1, x1; \ vpxor 5 * 32(mem_cd), x1, x1; \ \ vpsrldq $7, t0, t6; \ vpshufb t7, t0, t0; \ vpshufb t7, t6, t7; \ \ vpxor t7, x0, x0; \ vpxor 4 * 32(mem_cd), x0, x0; \ \ vpxor t5, x2, x2; \ vpxor 6 * 32(mem_cd), x2, x2; \ \ vpxor t4, x3, x3; \ vpxor 7 * 32(mem_cd), x3, x3; \ \ vpxor t3, x4, x4; \ vpxor 0 * 32(mem_cd), x4, x4; \ \ vpxor t2, x5, x5; \ vpxor 1 * 32(mem_cd), x5, x5; \ \ vpxor t1, x6, x6; \ vpxor 2 * 
32(mem_cd), x6, x6; \ \ vpxor t0, x7, x7; \ vpxor 3 * 32(mem_cd), x7, x7; #else /* CAMELLIA_GFNI_BUILD */ /* roundsm32 (AES-NI / VAES version) * IN: * x0..x7: byte-sliced AB state * mem_cd: register pointer storing CD state * key: index for key material * OUT: * x0..x7: new byte-sliced CD state */ #define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, \ t6, t7, mem_cd, key) \ /* \ * S-function with AES subbytes \ */ \ vbroadcasti128 .Linv_shift_row rRIP, t4; \ vpbroadcastd .L0f0f0f0f rRIP, t7; \ vbroadcasti128 .Lpre_tf_lo_s1 rRIP, t5; \ vbroadcasti128 .Lpre_tf_hi_s1 rRIP, t6; \ vbroadcasti128 .Lpre_tf_lo_s4 rRIP, t2; \ vbroadcasti128 .Lpre_tf_hi_s4 rRIP, t3; \ \ /* AES inverse shift rows */ \ vpshufb t4, x0, x0; \ vpshufb t4, x7, x7; \ vpshufb t4, x3, x3; \ vpshufb t4, x6, x6; \ vpshufb t4, x2, x2; \ vpshufb t4, x5, x5; \ vpshufb t4, x1, x1; \ vpshufb t4, x4, x4; \ \ /* prefilter sboxes 1, 2 and 3 */ \ /* prefilter sbox 4 */ \ filter_8bit(x0, t5, t6, t7, t4); \ filter_8bit(x7, t5, t6, t7, t4); \ IF_AESNI(vextracti128 $1, x0, t0##_x); \ IF_AESNI(vextracti128 $1, x7, t1##_x); \ filter_8bit(x3, t2, t3, t7, t4); \ filter_8bit(x6, t2, t3, t7, t4); \ IF_AESNI(vextracti128 $1, x3, t3##_x); \ IF_AESNI(vextracti128 $1, x6, t2##_x); \ filter_8bit(x2, t5, t6, t7, t4); \ filter_8bit(x5, t5, t6, t7, t4); \ filter_8bit(x1, t5, t6, t7, t4); \ filter_8bit(x4, t5, t6, t7, t4); \ \ vpxor t4##_x, t4##_x, t4##_x; \ \ /* AES subbytes + AES shift rows */ \ IF_AESNI(vextracti128 $1, x2, t6##_x; \ vextracti128 $1, x5, t5##_x; \ vaesenclast t4##_x, x0##_x, x0##_x; \ vaesenclast t4##_x, t0##_x, t0##_x; \ vaesenclast t4##_x, x7##_x, x7##_x; \ vaesenclast t4##_x, t1##_x, t1##_x; \ vaesenclast t4##_x, x3##_x, x3##_x; \ vaesenclast t4##_x, t3##_x, t3##_x; \ vaesenclast t4##_x, x6##_x, x6##_x; \ vaesenclast t4##_x, t2##_x, t2##_x; \ vinserti128 $1, t0##_x, x0, x0; \ vinserti128 $1, t1##_x, x7, x7; \ vinserti128 $1, t3##_x, x3, x3; \ vinserti128 $1, t2##_x, x6, x6; \ vextracti128 $1, x1, t3##_x; \ vextracti128 $1, x4, t2##_x); \ vbroadcasti128 .Lpost_tf_lo_s1 rRIP, t0; \ vbroadcasti128 .Lpost_tf_hi_s1 rRIP, t1; \ IF_AESNI(vaesenclast t4##_x, x2##_x, x2##_x; \ vaesenclast t4##_x, t6##_x, t6##_x; \ vaesenclast t4##_x, x5##_x, x5##_x; \ vaesenclast t4##_x, t5##_x, t5##_x; \ vaesenclast t4##_x, x1##_x, x1##_x; \ vaesenclast t4##_x, t3##_x, t3##_x; \ vaesenclast t4##_x, x4##_x, x4##_x; \ vaesenclast t4##_x, t2##_x, t2##_x; \ vinserti128 $1, t6##_x, x2, x2; \ vinserti128 $1, t5##_x, x5, x5; \ vinserti128 $1, t3##_x, x1, x1; \ vinserti128 $1, t2##_x, x4, x4); \ IF_VAES(vaesenclast t4, x0, x0; \ vaesenclast t4, x7, x7; \ vaesenclast t4, x3, x3; \ vaesenclast t4, x6, x6; \ vaesenclast t4, x2, x2; \ vaesenclast t4, x5, x5; \ vaesenclast t4, x1, x1; \ vaesenclast t4, x4, x4); \ \ /* postfilter sboxes 1 and 4 */ \ vbroadcasti128 .Lpost_tf_lo_s3 rRIP, t2; \ vbroadcasti128 .Lpost_tf_hi_s3 rRIP, t3; \ filter_8bit(x0, t0, t1, t7, t4); \ filter_8bit(x7, t0, t1, t7, t4); \ filter_8bit(x3, t0, t1, t7, t6); \ filter_8bit(x6, t0, t1, t7, t6); \ \ /* postfilter sbox 3 */ \ vbroadcasti128 .Lpost_tf_lo_s2 rRIP, t4; \ vbroadcasti128 .Lpost_tf_hi_s2 rRIP, t5; \ filter_8bit(x2, t2, t3, t7, t6); \ filter_8bit(x5, t2, t3, t7, t6); \ \ vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \ \ /* postfilter sbox 2 */ \ filter_8bit(x1, t4, t5, t7, t2); \ filter_8bit(x4, t4, t5, t7, t2); \ vpxor t7##_x, t7##_x, t7##_x; \ \ vpsrldq $1, t0, t1; \ vpsrldq $2, t0, t2; \ vpshufb t7, t1, t1; \ vpsrldq $3, t0, t3; \ \ /* P-function */ 
\ vpxor x5, x0, x0; \ vpxor x6, x1, x1; \ vpxor x7, x2, x2; \ vpxor x4, x3, x3; \ \ vpshufb t7, t2, t2; \ vpsrldq $4, t0, t4; \ vpshufb t7, t3, t3; \ vpsrldq $5, t0, t5; \ vpshufb t7, t4, t4; \ \ vpxor x2, x4, x4; \ vpxor x3, x5, x5; \ vpxor x0, x6, x6; \ vpxor x1, x7, x7; \ \ vpsrldq $6, t0, t6; \ vpshufb t7, t5, t5; \ vpshufb t7, t6, t6; \ \ vpxor x7, x0, x0; \ vpxor x4, x1, x1; \ vpxor x5, x2, x2; \ vpxor x6, x3, x3; \ \ vpxor x3, x4, x4; \ vpxor x0, x5, x5; \ vpxor x1, x6, x6; \ vpxor x2, x7, x7; /* note: high and low parts swapped */ \ \ /* Add key material and result to CD (x becomes new CD) */ \ \ vpxor t6, x1, x1; \ vpxor 5 * 32(mem_cd), x1, x1; \ \ vpsrldq $7, t0, t6; \ vpshufb t7, t0, t0; \ vpshufb t7, t6, t7; \ \ vpxor t7, x0, x0; \ vpxor 4 * 32(mem_cd), x0, x0; \ \ vpxor t5, x2, x2; \ vpxor 6 * 32(mem_cd), x2, x2; \ \ vpxor t4, x3, x3; \ vpxor 7 * 32(mem_cd), x3, x3; \ \ vpxor t3, x4, x4; \ vpxor 0 * 32(mem_cd), x4, x4; \ \ vpxor t2, x5, x5; \ vpxor 1 * 32(mem_cd), x5, x5; \ \ vpxor t1, x6, x6; \ vpxor 2 * 32(mem_cd), x6, x6; \ \ vpxor t0, x7, x7; \ vpxor 3 * 32(mem_cd), x7, x7; #endif /* CAMELLIA_GFNI_BUILD */ /* * IN/OUT: * x0..x7: byte-sliced AB state preloaded * mem_ab: byte-sliced AB state in memory * mem_cb: byte-sliced CD state in memory */ #define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, i, dir, store_ab) \ roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_cd, (key_table + (i) * 8)(CTX)); \ \ vmovdqu x0, 4 * 32(mem_cd); \ vmovdqu x1, 5 * 32(mem_cd); \ vmovdqu x2, 6 * 32(mem_cd); \ vmovdqu x3, 7 * 32(mem_cd); \ vmovdqu x4, 0 * 32(mem_cd); \ vmovdqu x5, 1 * 32(mem_cd); \ vmovdqu x6, 2 * 32(mem_cd); \ vmovdqu x7, 3 * 32(mem_cd); \ \ roundsm32(x4, x5, x6, x7, x0, x1, x2, x3, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, (key_table + ((i) + (dir)) * 8)(CTX)); \ \ store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab); #define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */ #define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \ /* Store new AB state */ \ vmovdqu x4, 4 * 32(mem_ab); \ vmovdqu x5, 5 * 32(mem_ab); \ vmovdqu x6, 6 * 32(mem_ab); \ vmovdqu x7, 7 * 32(mem_ab); \ vmovdqu x0, 0 * 32(mem_ab); \ vmovdqu x1, 1 * 32(mem_ab); \ vmovdqu x2, 2 * 32(mem_ab); \ vmovdqu x3, 3 * 32(mem_ab); #define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, i) \ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store); #define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, i) \ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store); /* * IN: * v0..3: byte-sliced 32-bit integers * OUT: * v0..3: (IN <<< 1) */ #define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \ vpcmpgtb v0, zero, t0; \ vpaddb v0, v0, v0; \ vpabsb t0, t0; \ \ vpcmpgtb v1, zero, t1; \ vpaddb v1, v1, 
v1; \ vpabsb t1, t1; \ \ vpcmpgtb v2, zero, t2; \ vpaddb v2, v2, v2; \ vpabsb t2, t2; \ \ vpor t0, v1, v1; \ \ vpcmpgtb v3, zero, t0; \ vpaddb v3, v3, v3; \ vpabsb t0, t0; \ \ vpor t1, v2, v2; \ vpor t2, v3, v3; \ vpor t0, v0, v0; /* * IN: * r: byte-sliced AB state in memory * l: byte-sliced CD state in memory * OUT: * x0..x7: new byte-sliced CD state */ #define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \ tt1, tt2, tt3, kll, klr, krl, krr) \ /* \ * t0 = kll; \ * t0 &= ll; \ * lr ^= rol32(t0, 1); \ */ \ vpbroadcastd kll, t0; /* only lowest 32-bit used */ \ vpxor tt0, tt0, tt0; \ vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t0; \ \ vpand l0, t0, t0; \ vpand l1, t1, t1; \ vpand l2, t2, t2; \ vpand l3, t3, t3; \ \ rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ \ vpxor l4, t0, l4; \ vpbroadcastd krr, t0; /* only lowest 32-bit used */ \ vmovdqu l4, 4 * 32(l); \ vpxor l5, t1, l5; \ vmovdqu l5, 5 * 32(l); \ vpxor l6, t2, l6; \ vmovdqu l6, 6 * 32(l); \ vpxor l7, t3, l7; \ vmovdqu l7, 7 * 32(l); \ \ /* \ * t2 = krr; \ * t2 |= rr; \ * rl ^= t2; \ */ \ \ vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t0; \ \ vpor 4 * 32(r), t0, t0; \ vpor 5 * 32(r), t1, t1; \ vpor 6 * 32(r), t2, t2; \ vpor 7 * 32(r), t3, t3; \ \ vpxor 0 * 32(r), t0, t0; \ vpxor 1 * 32(r), t1, t1; \ vpxor 2 * 32(r), t2, t2; \ vpxor 3 * 32(r), t3, t3; \ vmovdqu t0, 0 * 32(r); \ vpbroadcastd krl, t0; /* only lowest 32-bit used */ \ vmovdqu t1, 1 * 32(r); \ vmovdqu t2, 2 * 32(r); \ vmovdqu t3, 3 * 32(r); \ \ /* \ * t2 = krl; \ * t2 &= rl; \ * rr ^= rol32(t2, 1); \ */ \ vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t0; \ \ vpand 0 * 32(r), t0, t0; \ vpand 1 * 32(r), t1, t1; \ vpand 2 * 32(r), t2, t2; \ vpand 3 * 32(r), t3, t3; \ \ rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ \ vpxor 4 * 32(r), t0, t0; \ vpxor 5 * 32(r), t1, t1; \ vpxor 6 * 32(r), t2, t2; \ vpxor 7 * 32(r), t3, t3; \ vmovdqu t0, 4 * 32(r); \ vpbroadcastd klr, t0; /* only lowest 32-bit used */ \ vmovdqu t1, 5 * 32(r); \ vmovdqu t2, 6 * 32(r); \ vmovdqu t3, 7 * 32(r); \ \ /* \ * t0 = klr; \ * t0 |= lr; \ * ll ^= t0; \ */ \ \ vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t0; \ \ vpor l4, t0, t0; \ vpor l5, t1, t1; \ vpor l6, t2, t2; \ vpor l7, t3, t3; \ \ vpxor l0, t0, l0; \ vmovdqu l0, 0 * 32(l); \ vpxor l1, t1, l1; \ vmovdqu l1, 1 * 32(l); \ vpxor l2, t2, l2; \ vmovdqu l2, 2 * 32(l); \ vpxor l3, t3, l3; \ vmovdqu l3, 3 * 32(l); #define transpose_4x4(x0, x1, x2, x3, t1, t2) \ vpunpckhdq x1, x0, t2; \ vpunpckldq x1, x0, x0; \ \ vpunpckldq x3, x2, t1; \ vpunpckhdq x3, x2, x2; \ \ vpunpckhqdq t1, x0, x1; \ vpunpcklqdq t1, x0, x0; \ \ vpunpckhqdq x2, t2, x3; \ vpunpcklqdq x2, t2, x2; #define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \ a3, b3, c3, d3, st0, st1) \ vmovdqu d2, st0; \ vmovdqu d3, st1; \ transpose_4x4(a0, a1, a2, a3, d2, d3); \ transpose_4x4(b0, b1, b2, b3, d2, d3); \ vmovdqu st0, d2; \ vmovdqu st1, d3; \ \ vmovdqu a0, st0; \ vmovdqu a1, st1; \ transpose_4x4(c0, c1, c2, c3, a0, a1); \ transpose_4x4(d0, d1, d2, d3, a0, a1); \ \ vbroadcasti128 .Lshufb_16x16b rRIP, a0; \ vmovdqu st1, a1; \ vpshufb 
a0, a2, a2; \ vpshufb a0, a3, a3; \ vpshufb a0, b0, b0; \ vpshufb a0, b1, b1; \ vpshufb a0, b2, b2; \ vpshufb a0, b3, b3; \ vpshufb a0, a1, a1; \ vpshufb a0, c0, c0; \ vpshufb a0, c1, c1; \ vpshufb a0, c2, c2; \ vpshufb a0, c3, c3; \ vpshufb a0, d0, d0; \ vpshufb a0, d1, d1; \ vpshufb a0, d2, d2; \ vpshufb a0, d3, d3; \ vmovdqu d3, st1; \ vmovdqu st0, d3; \ vpshufb a0, d3, a0; \ vmovdqu d2, st0; \ \ transpose_4x4(a0, b0, c0, d0, d2, d3); \ transpose_4x4(a1, b1, c1, d1, d2, d3); \ vmovdqu st0, d2; \ vmovdqu st1, d3; \ \ vmovdqu b0, st0; \ vmovdqu b1, st1; \ transpose_4x4(a2, b2, c2, d2, b0, b1); \ transpose_4x4(a3, b3, c3, d3, b0, b1); \ vmovdqu st0, b0; \ vmovdqu st1, b1; \ /* does not adjust output bytes inside vectors */ /* load blocks to registers and apply pre-whitening */ #define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, rio, key) \ vpbroadcastq key, x0; \ vpshufb .Lpack_bswap rRIP, x0, x0; \ \ vpxor 0 * 32(rio), x0, y7; \ vpxor 1 * 32(rio), x0, y6; \ vpxor 2 * 32(rio), x0, y5; \ vpxor 3 * 32(rio), x0, y4; \ vpxor 4 * 32(rio), x0, y3; \ vpxor 5 * 32(rio), x0, y2; \ vpxor 6 * 32(rio), x0, y1; \ vpxor 7 * 32(rio), x0, y0; \ vpxor 8 * 32(rio), x0, x7; \ vpxor 9 * 32(rio), x0, x6; \ vpxor 10 * 32(rio), x0, x5; \ vpxor 11 * 32(rio), x0, x4; \ vpxor 12 * 32(rio), x0, x3; \ vpxor 13 * 32(rio), x0, x2; \ vpxor 14 * 32(rio), x0, x1; \ vpxor 15 * 32(rio), x0, x0; /* byteslice pre-whitened blocks and store to temporary memory */ #define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd) \ byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \ y4, y5, y6, y7, (mem_ab), (mem_cd)); \ \ vmovdqu x0, 0 * 32(mem_ab); \ vmovdqu x1, 1 * 32(mem_ab); \ vmovdqu x2, 2 * 32(mem_ab); \ vmovdqu x3, 3 * 32(mem_ab); \ vmovdqu x4, 4 * 32(mem_ab); \ vmovdqu x5, 5 * 32(mem_ab); \ vmovdqu x6, 6 * 32(mem_ab); \ vmovdqu x7, 7 * 32(mem_ab); \ vmovdqu y0, 0 * 32(mem_cd); \ vmovdqu y1, 1 * 32(mem_cd); \ vmovdqu y2, 2 * 32(mem_cd); \ vmovdqu y3, 3 * 32(mem_cd); \ vmovdqu y4, 4 * 32(mem_cd); \ vmovdqu y5, 5 * 32(mem_cd); \ vmovdqu y6, 6 * 32(mem_cd); \ vmovdqu y7, 7 * 32(mem_cd); /* de-byteslice, apply post-whitening and store blocks */ #define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \ y5, y6, y7, key, stack_tmp0, stack_tmp1) \ byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \ y3, y7, x3, x7, stack_tmp0, stack_tmp1); \ \ vmovdqu x0, stack_tmp0; \ \ vpbroadcastq key, x0; \ vpshufb .Lpack_bswap rRIP, x0, x0; \ \ vpxor x0, y7, y7; \ vpxor x0, y6, y6; \ vpxor x0, y5, y5; \ vpxor x0, y4, y4; \ vpxor x0, y3, y3; \ vpxor x0, y2, y2; \ vpxor x0, y1, y1; \ vpxor x0, y0, y0; \ vpxor x0, x7, x7; \ vpxor x0, x6, x6; \ vpxor x0, x5, x5; \ vpxor x0, x4, x4; \ vpxor x0, x3, x3; \ vpxor x0, x2, x2; \ vpxor x0, x1, x1; \ vpxor stack_tmp0, x0, x0; #define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, rio) \ vmovdqu x0, 0 * 32(rio); \ vmovdqu x1, 1 * 32(rio); \ vmovdqu x2, 2 * 32(rio); \ vmovdqu x3, 3 * 32(rio); \ vmovdqu x4, 4 * 32(rio); \ vmovdqu x5, 5 * 32(rio); \ vmovdqu x6, 6 * 32(rio); \ vmovdqu x7, 7 * 32(rio); \ vmovdqu y0, 8 * 32(rio); \ vmovdqu y1, 9 * 32(rio); \ vmovdqu y2, 10 * 32(rio); \ vmovdqu y3, 11 * 32(rio); \ vmovdqu y4, 12 * 32(rio); \ vmovdqu y5, 13 * 32(rio); \ vmovdqu y6, 14 * 32(rio); \ vmovdqu y7, 15 * 32(rio); .text .align 32 #define SHUFB_BYTES(idx) \ 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) FUNC_NAME(_constants): ELF(.type 
FUNC_NAME(_constants),@object;) -.Lshufb_16x16b: - .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) - .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) - .Lpack_bswap: .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 +.Lshufb_16x16b: + .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) + /* For CTR-mode IV byteswap */ .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 #ifdef CAMELLIA_GFNI_BUILD /* Pre-filters and post-filters bit-matrixes for Camellia sboxes s1, s2, s3 * and s4. * See http://urn.fi/URN:NBN:fi:oulu-201305311409, pages 43-48. * * Pre-filters are directly from above source, "θ₁"/"θ₄". Post-filters are * combination of function "A" (AES SubBytes affine transformation) and * "ψ₁"/"ψ₂"/"ψ₃". */ /* Bit-matrix from "θ₁(x)" function: */ .Lpre_filter_bitmatrix_s123: .quad BM8X8(BV8(1, 1, 1, 0, 1, 1, 0, 1), BV8(0, 0, 1, 1, 0, 0, 1, 0), BV8(1, 1, 0, 1, 0, 0, 0, 0), BV8(1, 0, 1, 1, 0, 0, 1, 1), BV8(0, 0, 0, 0, 1, 1, 0, 0), BV8(1, 0, 1, 0, 0, 1, 0, 0), BV8(0, 0, 1, 0, 1, 1, 0, 0), BV8(1, 0, 0, 0, 0, 1, 1, 0)) /* Bit-matrix from "θ₄(x)" function: */ .Lpre_filter_bitmatrix_s4: .quad BM8X8(BV8(1, 1, 0, 1, 1, 0, 1, 1), BV8(0, 1, 1, 0, 0, 1, 0, 0), BV8(1, 0, 1, 0, 0, 0, 0, 1), BV8(0, 1, 1, 0, 0, 1, 1, 1), BV8(0, 0, 0, 1, 1, 0, 0, 0), BV8(0, 1, 0, 0, 1, 0, 0, 1), BV8(0, 1, 0, 1, 1, 0, 0, 0), BV8(0, 0, 0, 0, 1, 1, 0, 1)) /* Bit-matrix from "ψ₁(A(x))" function: */ .Lpost_filter_bitmatrix_s14: .quad BM8X8(BV8(0, 0, 0, 0, 0, 0, 0, 1), BV8(0, 1, 1, 0, 0, 1, 1, 0), BV8(1, 0, 1, 1, 1, 1, 1, 0), BV8(0, 0, 0, 1, 1, 0, 1, 1), BV8(1, 0, 0, 0, 1, 1, 1, 0), BV8(0, 1, 0, 1, 1, 1, 1, 0), BV8(0, 1, 1, 1, 1, 1, 1, 1), BV8(0, 0, 0, 1, 1, 1, 0, 0)) /* Bit-matrix from "ψ₂(A(x))" function: */ .Lpost_filter_bitmatrix_s2: .quad BM8X8(BV8(0, 0, 0, 1, 1, 1, 0, 0), BV8(0, 0, 0, 0, 0, 0, 0, 1), BV8(0, 1, 1, 0, 0, 1, 1, 0), BV8(1, 0, 1, 1, 1, 1, 1, 0), BV8(0, 0, 0, 1, 1, 0, 1, 1), BV8(1, 0, 0, 0, 1, 1, 1, 0), BV8(0, 1, 0, 1, 1, 1, 1, 0), BV8(0, 1, 1, 1, 1, 1, 1, 1)) /* Bit-matrix from "ψ₃(A(x))" function: */ .Lpost_filter_bitmatrix_s3: .quad BM8X8(BV8(0, 1, 1, 0, 0, 1, 1, 0), BV8(1, 0, 1, 1, 1, 1, 1, 0), BV8(0, 0, 0, 1, 1, 0, 1, 1), BV8(1, 0, 0, 0, 1, 1, 1, 0), BV8(0, 1, 0, 1, 1, 1, 1, 0), BV8(0, 1, 1, 1, 1, 1, 1, 1), BV8(0, 0, 0, 1, 1, 1, 0, 0), BV8(0, 0, 0, 0, 0, 0, 0, 1)) #else /* CAMELLIA_GFNI_BUILD */ /* * pre-SubByte transform * * pre-lookup for sbox1, sbox2, sbox3: * swap_bitendianness( * isom_map_camellia_to_aes( * camellia_f( * swap_bitendianess(in) * ) * ) * ) * * (note: '⊕ 0xc5' inside camellia_f()) */ .Lpre_tf_lo_s1: .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86 .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88 .Lpre_tf_hi_s1: .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23 /* * pre-SubByte transform * * pre-lookup for sbox4: * swap_bitendianness( * isom_map_camellia_to_aes( * camellia_f( * swap_bitendianess(in <<< 1) * ) * ) * ) * * (note: '⊕ 0xc5' inside camellia_f()) */ .Lpre_tf_lo_s4: .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25 .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74 .Lpre_tf_hi_s4: .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72 .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf /* * post-SubByte transform * * post-lookup for sbox1, sbox4: * swap_bitendianness( * camellia_h( * isom_map_aes_to_camellia( * swap_bitendianness( * aes_inverse_affine_transform(in) * ) * ) * ) * 
) * * (note: '⊕ 0x6e' inside camellia_h()) */ .Lpost_tf_lo_s1: .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31 .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1 .Lpost_tf_hi_s1: .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8 .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c /* * post-SubByte transform * * post-lookup for sbox2: * swap_bitendianness( * camellia_h( * isom_map_aes_to_camellia( * swap_bitendianness( * aes_inverse_affine_transform(in) * ) * ) * ) * ) <<< 1 * * (note: '⊕ 0x6e' inside camellia_h()) */ .Lpost_tf_lo_s2: .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62 .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3 .Lpost_tf_hi_s2: .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51 .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18 /* * post-SubByte transform * * post-lookup for sbox3: * swap_bitendianness( * camellia_h( * isom_map_aes_to_camellia( * swap_bitendianness( * aes_inverse_affine_transform(in) * ) * ) * ) * ) >>> 1 * * (note: '⊕ 0x6e' inside camellia_h()) */ .Lpost_tf_lo_s3: .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98 .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8 .Lpost_tf_hi_s3: .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54 .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06 /* For isolating SubBytes from AESENCLAST, inverse shift row */ .Linv_shift_row: .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 .align 4 /* 4-bit mask */ .L0f0f0f0f: .long 0x0f0f0f0f #endif /* CAMELLIA_GFNI_BUILD */ ELF(.size FUNC_NAME(_constants),.-FUNC_NAME(_constants);) .align 8 -ELF(.type __camellia_enc_blk32,@function;) +ELF(.type FUNC_NAME(enc_blk32),@function;) -__camellia_enc_blk32: +FUNC_NAME(enc_blk32): /* input: * %rdi: ctx, CTX * %rax: temporary storage, 512 bytes * %r8d: 24 for 16 byte key, 32 for larger * %ymm0..%ymm15: 32 plaintext blocks * output: * %ymm0..%ymm15: 32 encrypted blocks, order swapped: * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 */ CFI_STARTPROC(); leaq 8 * 32(%rax), %rcx; leaq (-8 * 8)(CTX, %r8, 8), %r8; inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rax, %rcx); .align 8 .Lenc_loop: enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rax, %rcx, 0); cmpq %r8, CTX; je .Lenc_done; leaq (8 * 8)(CTX), CTX; fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, ((key_table) + 0)(CTX), ((key_table) + 4)(CTX), ((key_table) + 8)(CTX), ((key_table) + 12)(CTX)); jmp .Lenc_loop; .align 8 .Lenc_done: /* load CD for output */ vmovdqu 0 * 32(%rcx), %ymm8; vmovdqu 1 * 32(%rcx), %ymm9; vmovdqu 2 * 32(%rcx), %ymm10; vmovdqu 3 * 32(%rcx), %ymm11; vmovdqu 4 * 32(%rcx), %ymm12; vmovdqu 5 * 32(%rcx), %ymm13; vmovdqu 6 * 32(%rcx), %ymm14; vmovdqu 7 * 32(%rcx), %ymm15; outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, ((key_table) + 8 * 8)(%r8), (%rax), 1 * 32(%rax)); ret_spec_stop; CFI_ENDPROC(); -ELF(.size __camellia_enc_blk32,.-__camellia_enc_blk32;) +ELF(.size FUNC_NAME(enc_blk32),.-FUNC_NAME(enc_blk32);) .align 8 -ELF(.type __camellia_dec_blk32,@function;) +ELF(.type FUNC_NAME(dec_blk32),@function;) -__camellia_dec_blk32: +FUNC_NAME(dec_blk32): /* input: * %rdi: ctx, CTX * %rax: temporary storage, 512 bytes * %r8d: 24 for 16 byte key, 32 for 
larger - * %ymm0..%ymm15: 16 encrypted blocks + * %ymm0..%ymm15: 32 encrypted blocks * output: - * %ymm0..%ymm15: 16 plaintext blocks, order swapped: + * %ymm0..%ymm15: 32 plaintext blocks, order swapped: * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 */ CFI_STARTPROC(); movq %r8, %rcx; movq CTX, %r8 leaq (-8 * 8)(CTX, %rcx, 8), CTX; leaq 8 * 32(%rax), %rcx; inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rax, %rcx); .align 8 .Ldec_loop: dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rax, %rcx, 0); cmpq %r8, CTX; je .Ldec_done; fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, ((key_table) + 8)(CTX), ((key_table) + 12)(CTX), ((key_table) + 0)(CTX), ((key_table) + 4)(CTX)); leaq (-8 * 8)(CTX), CTX; jmp .Ldec_loop; .align 8 .Ldec_done: /* load CD for output */ vmovdqu 0 * 32(%rcx), %ymm8; vmovdqu 1 * 32(%rcx), %ymm9; vmovdqu 2 * 32(%rcx), %ymm10; vmovdqu 3 * 32(%rcx), %ymm11; vmovdqu 4 * 32(%rcx), %ymm12; vmovdqu 5 * 32(%rcx), %ymm13; vmovdqu 6 * 32(%rcx), %ymm14; vmovdqu 7 * 32(%rcx), %ymm15; outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax)); ret_spec_stop; CFI_ENDPROC(); -ELF(.size __camellia_dec_blk32,.-__camellia_dec_blk32;) +ELF(.size FUNC_NAME(dec_blk32),.-FUNC_NAME(dec_blk32);) #define inc_le128(x, minus_one, tmp) \ vpcmpeqq minus_one, x, tmp; \ vpsubq minus_one, x, x; \ vpslldq $8, tmp, tmp; \ vpsubq tmp, x, x; .align 8 .globl FUNC_NAME(ctr_enc) ELF(.type FUNC_NAME(ctr_enc),@function;) FUNC_NAME(ctr_enc): /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) * %rdx: src (32 blocks) * %rcx: iv (big endian, 128bit) */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); movq 8(%rcx), %r11; bswapq %r11; cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %eax; cmovel %eax, %r8d; /* max */ subq $(16 * 32), %rsp; andq $~63, %rsp; movq %rsp, %rax; vpcmpeqd %ymm15, %ymm15, %ymm15; vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */ /* load IV and byteswap */ vmovdqu (%rcx), %xmm0; vpshufb .Lbswap128_mask rRIP, %xmm0, %xmm0; vmovdqa %xmm0, %xmm1; inc_le128(%xmm0, %xmm15, %xmm14); vbroadcasti128 .Lbswap128_mask rRIP, %ymm14; vinserti128 $1, %xmm0, %ymm1, %ymm0; vpshufb %ymm14, %ymm0, %ymm13; vmovdqu %ymm13, 15 * 32(%rax); /* check need for handling 64-bit overflow and carry */ cmpq $(0xffffffffffffffff - 32), %r11; ja .Lload_ctr_carry; /* construct IVs */ vpaddq %ymm15, %ymm15, %ymm15; /* ab: -2:0 ; cd: -2:0 */ vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm13; vmovdqu %ymm13, 14 * 32(%rax); vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm13; vmovdqu %ymm13, 13 * 32(%rax); vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm12; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm11; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm10; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm9; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm8; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm7; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm6; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm5; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm4; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm3; 
vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm2; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm1; vpsubq %ymm15, %ymm0, %ymm0; /* +30 ; +31 */ vpsubq %xmm15, %xmm0, %xmm13; /* +32 */ vpshufb %ymm14, %ymm0, %ymm0; vpshufb %xmm14, %xmm13, %xmm13; vmovdqu %xmm13, (%rcx); jmp .Lload_ctr_done; .align 4 .Lload_ctr_carry: /* construct IVs */ inc_le128(%ymm0, %ymm15, %ymm13); /* ab: le1 ; cd: le2 */ inc_le128(%ymm0, %ymm15, %ymm13); /* ab: le2 ; cd: le3 */ vpshufb %ymm14, %ymm0, %ymm13; vmovdqu %ymm13, 14 * 32(%rax); inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm13; vmovdqu %ymm13, 13 * 32(%rax); inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm12; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm11; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm10; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm9; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm8; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm7; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm6; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm5; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm4; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm3; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm2; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm1; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vextracti128 $1, %ymm0, %xmm13; vpshufb %ymm14, %ymm0, %ymm0; inc_le128(%xmm13, %xmm15, %xmm14); vpshufb .Lbswap128_mask rRIP, %xmm13, %xmm13; vmovdqu %xmm13, (%rcx); .align 4 .Lload_ctr_done: - /* inpack16_pre: */ + /* inpack32_pre: */ vpbroadcastq (key_table)(CTX), %ymm15; vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; vpxor %ymm0, %ymm15, %ymm0; vpxor %ymm1, %ymm15, %ymm1; vpxor %ymm2, %ymm15, %ymm2; vpxor %ymm3, %ymm15, %ymm3; vpxor %ymm4, %ymm15, %ymm4; vpxor %ymm5, %ymm15, %ymm5; vpxor %ymm6, %ymm15, %ymm6; vpxor %ymm7, %ymm15, %ymm7; vpxor %ymm8, %ymm15, %ymm8; vpxor %ymm9, %ymm15, %ymm9; vpxor %ymm10, %ymm15, %ymm10; vpxor %ymm11, %ymm15, %ymm11; vpxor %ymm12, %ymm15, %ymm12; vpxor 13 * 32(%rax), %ymm15, %ymm13; vpxor 14 * 32(%rax), %ymm15, %ymm14; vpxor 15 * 32(%rax), %ymm15, %ymm15; - call __camellia_enc_blk32; + call FUNC_NAME(enc_blk32); vpxor 0 * 32(%rdx), %ymm7, %ymm7; vpxor 1 * 32(%rdx), %ymm6, %ymm6; vpxor 2 * 32(%rdx), %ymm5, %ymm5; vpxor 3 * 32(%rdx), %ymm4, %ymm4; vpxor 4 * 32(%rdx), %ymm3, %ymm3; vpxor 5 * 32(%rdx), %ymm2, %ymm2; vpxor 6 * 32(%rdx), %ymm1, %ymm1; vpxor 7 * 32(%rdx), %ymm0, %ymm0; vpxor 8 * 32(%rdx), %ymm15, %ymm15; vpxor 9 * 32(%rdx), %ymm14, %ymm14; vpxor 10 * 32(%rdx), %ymm13, %ymm13; vpxor 11 * 32(%rdx), %ymm12, %ymm12; vpxor 12 * 32(%rdx), %ymm11, %ymm11; vpxor 13 * 32(%rdx), %ymm10, %ymm10; vpxor 14 * 32(%rdx), %ymm9, %ymm9; vpxor 15 * 32(%rdx), %ymm8, %ymm8; - leaq 32 * 16(%rdx), %rdx; write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, %ymm8, %rsi); vzeroall; leave; CFI_LEAVE(); ret_spec_stop; 
CFI_ENDPROC(); ELF(.size FUNC_NAME(ctr_enc),.-FUNC_NAME(ctr_enc);) .align 8 .globl FUNC_NAME(cbc_dec) ELF(.type FUNC_NAME(cbc_dec),@function;) FUNC_NAME(cbc_dec): /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) * %rdx: src (32 blocks) * %rcx: iv */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); movq %rcx, %r9; cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %eax; cmovel %eax, %r8d; /* max */ subq $(16 * 32), %rsp; andq $~63, %rsp; movq %rsp, %rax; inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rdx, (key_table)(CTX, %r8, 8)); - call __camellia_dec_blk32; + call FUNC_NAME(dec_blk32); /* XOR output with IV */ vmovdqu %ymm8, (%rax); vmovdqu (%r9), %xmm8; vinserti128 $1, (%rdx), %ymm8, %ymm8; vpxor %ymm8, %ymm7, %ymm7; vmovdqu (%rax), %ymm8; vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6; vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5; vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4; vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3; vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2; vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1; vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0; vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15; vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14; vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13; vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12; vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11; vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10; vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9; vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8; movq (15 * 32 + 16 + 0)(%rdx), %rax; movq (15 * 32 + 16 + 8)(%rdx), %rcx; write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, %ymm8, %rsi); /* store new IV */ movq %rax, (0)(%r9); movq %rcx, (8)(%r9); vzeroall; leave; CFI_LEAVE(); ret_spec_stop; CFI_ENDPROC(); ELF(.size FUNC_NAME(cbc_dec),.-FUNC_NAME(cbc_dec);) .align 8 .globl FUNC_NAME(cfb_dec) ELF(.type FUNC_NAME(cfb_dec),@function;) FUNC_NAME(cfb_dec): /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) * %rdx: src (32 blocks) * %rcx: iv */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %eax; cmovel %eax, %r8d; /* max */ subq $(16 * 32), %rsp; andq $~63, %rsp; movq %rsp, %rax; - /* inpack16_pre: */ + /* inpack32_pre: */ vpbroadcastq (key_table)(CTX), %ymm0; vpshufb .Lpack_bswap rRIP, %ymm0, %ymm0; vmovdqu (%rcx), %xmm15; vinserti128 $1, (%rdx), %ymm15, %ymm15; vpxor %ymm15, %ymm0, %ymm15; vmovdqu (15 * 32 + 16)(%rdx), %xmm1; vmovdqu %xmm1, (%rcx); /* store new IV */ vpxor (0 * 32 + 16)(%rdx), %ymm0, %ymm14; vpxor (1 * 32 + 16)(%rdx), %ymm0, %ymm13; vpxor (2 * 32 + 16)(%rdx), %ymm0, %ymm12; vpxor (3 * 32 + 16)(%rdx), %ymm0, %ymm11; vpxor (4 * 32 + 16)(%rdx), %ymm0, %ymm10; vpxor (5 * 32 + 16)(%rdx), %ymm0, %ymm9; vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm8; vpxor (7 * 32 + 16)(%rdx), %ymm0, %ymm7; vpxor (8 * 32 + 16)(%rdx), %ymm0, %ymm6; vpxor (9 * 32 + 16)(%rdx), %ymm0, %ymm5; vpxor (10 * 32 + 16)(%rdx), %ymm0, %ymm4; vpxor (11 * 32 + 16)(%rdx), %ymm0, %ymm3; vpxor (12 * 32 + 16)(%rdx), %ymm0, %ymm2; vpxor (13 * 32 + 16)(%rdx), %ymm0, %ymm1; vpxor (14 * 32 + 16)(%rdx), %ymm0, %ymm0; - call __camellia_enc_blk32; + call FUNC_NAME(enc_blk32); vpxor 0 * 32(%rdx), %ymm7, %ymm7; vpxor 1 * 32(%rdx), %ymm6, %ymm6; vpxor 2 * 32(%rdx), %ymm5, %ymm5; vpxor 3 * 32(%rdx), %ymm4, %ymm4; vpxor 4 * 32(%rdx), %ymm3, %ymm3; vpxor 5 * 32(%rdx), %ymm2, %ymm2; vpxor 6 * 32(%rdx), %ymm1, %ymm1; vpxor 7 * 
32(%rdx), %ymm0, %ymm0; vpxor 8 * 32(%rdx), %ymm15, %ymm15; vpxor 9 * 32(%rdx), %ymm14, %ymm14; vpxor 10 * 32(%rdx), %ymm13, %ymm13; vpxor 11 * 32(%rdx), %ymm12, %ymm12; vpxor 12 * 32(%rdx), %ymm11, %ymm11; vpxor 13 * 32(%rdx), %ymm10, %ymm10; vpxor 14 * 32(%rdx), %ymm9, %ymm9; vpxor 15 * 32(%rdx), %ymm8, %ymm8; write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, %ymm8, %rsi); vzeroall; leave; CFI_LEAVE(); ret_spec_stop; CFI_ENDPROC(); ELF(.size FUNC_NAME(cfb_dec),.-FUNC_NAME(cfb_dec);) .align 8 .globl FUNC_NAME(ocb_enc) ELF(.type FUNC_NAME(ocb_enc),@function;) FUNC_NAME(ocb_enc): /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) * %rdx: src (32 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[32]) */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); subq $(16 * 32 + 4 * 8), %rsp; andq $~63, %rsp; movq %rsp, %rax; movq %r10, (16 * 32 + 0 * 8)(%rsp); movq %r11, (16 * 32 + 1 * 8)(%rsp); movq %r12, (16 * 32 + 2 * 8)(%rsp); movq %r13, (16 * 32 + 3 * 8)(%rsp); CFI_REG_ON_STACK(r10, 16 * 32 + 0 * 8); CFI_REG_ON_STACK(r11, 16 * 32 + 1 * 8); CFI_REG_ON_STACK(r12, 16 * 32 + 2 * 8); CFI_REG_ON_STACK(r13, 16 * 32 + 3 * 8); vmovdqu (%rcx), %xmm14; vmovdqu (%r8), %xmm13; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Checksum_i = Checksum_{i-1} xor P_i */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ #define OCB_INPUT(n, l0reg, l1reg, yreg) \ vmovdqu (n * 32)(%rdx), yreg; \ vpxor (l0reg), %xmm14, %xmm15; \ vpxor (l1reg), %xmm15, %xmm14; \ vinserti128 $1, %xmm14, %ymm15, %ymm15; \ vpxor yreg, %ymm13, %ymm13; \ vpxor yreg, %ymm15, yreg; \ vmovdqu %ymm15, (n * 32)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq (2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, %r11, %ymm0); vmovdqu %ymm0, (15 * 32)(%rax); OCB_INPUT(1, %r12, %r13, %ymm0); vmovdqu %ymm0, (14 * 32)(%rax); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(2, %r10, %r11, %ymm0); vmovdqu %ymm0, (13 * 32)(%rax); OCB_INPUT(3, %r12, %r13, %ymm12); movq (8 * 8)(%r9), %r10; movq (9 * 8)(%r9), %r11; movq (10 * 8)(%r9), %r12; movq (11 * 8)(%r9), %r13; OCB_INPUT(4, %r10, %r11, %ymm11); OCB_INPUT(5, %r12, %r13, %ymm10); movq (12 * 8)(%r9), %r10; movq (13 * 8)(%r9), %r11; movq (14 * 8)(%r9), %r12; movq (15 * 8)(%r9), %r13; OCB_INPUT(6, %r10, %r11, %ymm9); OCB_INPUT(7, %r12, %r13, %ymm8); movq (16 * 8)(%r9), %r10; movq (17 * 8)(%r9), %r11; movq (18 * 8)(%r9), %r12; movq (19 * 8)(%r9), %r13; OCB_INPUT(8, %r10, %r11, %ymm7); OCB_INPUT(9, %r12, %r13, %ymm6); movq (20 * 8)(%r9), %r10; movq (21 * 8)(%r9), %r11; movq (22 * 8)(%r9), %r12; movq (23 * 8)(%r9), %r13; OCB_INPUT(10, %r10, %r11, %ymm5); OCB_INPUT(11, %r12, %r13, %ymm4); movq (24 * 8)(%r9), %r10; movq (25 * 8)(%r9), %r11; movq (26 * 8)(%r9), %r12; movq (27 * 8)(%r9), %r13; OCB_INPUT(12, %r10, %r11, %ymm3); OCB_INPUT(13, %r12, %r13, %ymm2); movq (28 * 8)(%r9), %r10; movq (29 * 8)(%r9), %r11; movq (30 * 8)(%r9), %r12; movq (31 * 8)(%r9), %r13; OCB_INPUT(14, %r10, %r11, %ymm1); OCB_INPUT(15, %r12, %r13, %ymm0); #undef OCB_INPUT vextracti128 $1, %ymm13, %xmm15; vmovdqu %xmm14, (%rcx); vpxor %xmm13, %xmm15, %xmm15; vmovdqu %xmm15, (%r8); cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %r10d; cmovel %r10d, %r8d; /* max */ - /* inpack16_pre: */ + /* inpack32_pre: */ vpbroadcastq (key_table)(CTX), %ymm15; vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; vpxor %ymm0, %ymm15, %ymm0; 
vpxor %ymm1, %ymm15, %ymm1; vpxor %ymm2, %ymm15, %ymm2; vpxor %ymm3, %ymm15, %ymm3; vpxor %ymm4, %ymm15, %ymm4; vpxor %ymm5, %ymm15, %ymm5; vpxor %ymm6, %ymm15, %ymm6; vpxor %ymm7, %ymm15, %ymm7; vpxor %ymm8, %ymm15, %ymm8; vpxor %ymm9, %ymm15, %ymm9; vpxor %ymm10, %ymm15, %ymm10; vpxor %ymm11, %ymm15, %ymm11; vpxor %ymm12, %ymm15, %ymm12; vpxor 13 * 32(%rax), %ymm15, %ymm13; vpxor 14 * 32(%rax), %ymm15, %ymm14; vpxor 15 * 32(%rax), %ymm15, %ymm15; - call __camellia_enc_blk32; + call FUNC_NAME(enc_blk32); vpxor 0 * 32(%rsi), %ymm7, %ymm7; vpxor 1 * 32(%rsi), %ymm6, %ymm6; vpxor 2 * 32(%rsi), %ymm5, %ymm5; vpxor 3 * 32(%rsi), %ymm4, %ymm4; vpxor 4 * 32(%rsi), %ymm3, %ymm3; vpxor 5 * 32(%rsi), %ymm2, %ymm2; vpxor 6 * 32(%rsi), %ymm1, %ymm1; vpxor 7 * 32(%rsi), %ymm0, %ymm0; vpxor 8 * 32(%rsi), %ymm15, %ymm15; vpxor 9 * 32(%rsi), %ymm14, %ymm14; vpxor 10 * 32(%rsi), %ymm13, %ymm13; vpxor 11 * 32(%rsi), %ymm12, %ymm12; vpxor 12 * 32(%rsi), %ymm11, %ymm11; vpxor 13 * 32(%rsi), %ymm10, %ymm10; vpxor 14 * 32(%rsi), %ymm9, %ymm9; vpxor 15 * 32(%rsi), %ymm8, %ymm8; write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, %ymm8, %rsi); vzeroall; movq (16 * 32 + 0 * 8)(%rsp), %r10; movq (16 * 32 + 1 * 8)(%rsp), %r11; movq (16 * 32 + 2 * 8)(%rsp), %r12; movq (16 * 32 + 3 * 8)(%rsp), %r13; CFI_RESTORE(%r10); CFI_RESTORE(%r11); CFI_RESTORE(%r12); CFI_RESTORE(%r13); leave; CFI_LEAVE(); ret_spec_stop; CFI_ENDPROC(); ELF(.size FUNC_NAME(ocb_enc),.-FUNC_NAME(ocb_enc);) .align 8 .globl FUNC_NAME(ocb_dec) ELF(.type FUNC_NAME(ocb_dec),@function;) FUNC_NAME(ocb_dec): /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) * %rdx: src (32 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[32]) */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); subq $(16 * 32 + 4 * 8), %rsp; andq $~63, %rsp; movq %rsp, %rax; movq %r10, (16 * 32 + 0 * 8)(%rsp); movq %r11, (16 * 32 + 1 * 8)(%rsp); movq %r12, (16 * 32 + 2 * 8)(%rsp); movq %r13, (16 * 32 + 3 * 8)(%rsp); CFI_REG_ON_STACK(r10, 16 * 32 + 0 * 8); CFI_REG_ON_STACK(r11, 16 * 32 + 1 * 8); CFI_REG_ON_STACK(r12, 16 * 32 + 2 * 8); CFI_REG_ON_STACK(r13, 16 * 32 + 3 * 8); vmovdqu (%rcx), %xmm14; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ #define OCB_INPUT(n, l0reg, l1reg, yreg) \ vmovdqu (n * 32)(%rdx), yreg; \ vpxor (l0reg), %xmm14, %xmm15; \ vpxor (l1reg), %xmm15, %xmm14; \ vinserti128 $1, %xmm14, %ymm15, %ymm15; \ vpxor yreg, %ymm15, yreg; \ vmovdqu %ymm15, (n * 32)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq (2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, %r11, %ymm0); vmovdqu %ymm0, (15 * 32)(%rax); OCB_INPUT(1, %r12, %r13, %ymm0); vmovdqu %ymm0, (14 * 32)(%rax); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(2, %r10, %r11, %ymm13); OCB_INPUT(3, %r12, %r13, %ymm12); movq (8 * 8)(%r9), %r10; movq (9 * 8)(%r9), %r11; movq (10 * 8)(%r9), %r12; movq (11 * 8)(%r9), %r13; OCB_INPUT(4, %r10, %r11, %ymm11); OCB_INPUT(5, %r12, %r13, %ymm10); movq (12 * 8)(%r9), %r10; movq (13 * 8)(%r9), %r11; movq (14 * 8)(%r9), %r12; movq (15 * 8)(%r9), %r13; OCB_INPUT(6, %r10, %r11, %ymm9); OCB_INPUT(7, %r12, %r13, %ymm8); movq (16 * 8)(%r9), %r10; movq (17 * 8)(%r9), %r11; movq (18 * 8)(%r9), %r12; movq (19 * 8)(%r9), %r13; OCB_INPUT(8, %r10, %r11, %ymm7); OCB_INPUT(9, %r12, %r13, %ymm6); movq (20 * 8)(%r9), %r10; movq (21 * 
8)(%r9), %r11; movq (22 * 8)(%r9), %r12; movq (23 * 8)(%r9), %r13; OCB_INPUT(10, %r10, %r11, %ymm5); OCB_INPUT(11, %r12, %r13, %ymm4); movq (24 * 8)(%r9), %r10; movq (25 * 8)(%r9), %r11; movq (26 * 8)(%r9), %r12; movq (27 * 8)(%r9), %r13; OCB_INPUT(12, %r10, %r11, %ymm3); OCB_INPUT(13, %r12, %r13, %ymm2); movq (28 * 8)(%r9), %r10; movq (29 * 8)(%r9), %r11; movq (30 * 8)(%r9), %r12; movq (31 * 8)(%r9), %r13; OCB_INPUT(14, %r10, %r11, %ymm1); OCB_INPUT(15, %r12, %r13, %ymm0); #undef OCB_INPUT vmovdqu %xmm14, (%rcx); movq %r8, %r10; cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %r9d; cmovel %r9d, %r8d; /* max */ - /* inpack16_pre: */ + /* inpack32_pre: */ vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15; vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; vpxor %ymm0, %ymm15, %ymm0; vpxor %ymm1, %ymm15, %ymm1; vpxor %ymm2, %ymm15, %ymm2; vpxor %ymm3, %ymm15, %ymm3; vpxor %ymm4, %ymm15, %ymm4; vpxor %ymm5, %ymm15, %ymm5; vpxor %ymm6, %ymm15, %ymm6; vpxor %ymm7, %ymm15, %ymm7; vpxor %ymm8, %ymm15, %ymm8; vpxor %ymm9, %ymm15, %ymm9; vpxor %ymm10, %ymm15, %ymm10; vpxor %ymm11, %ymm15, %ymm11; vpxor %ymm12, %ymm15, %ymm12; vpxor %ymm13, %ymm15, %ymm13; vpxor 14 * 32(%rax), %ymm15, %ymm14; vpxor 15 * 32(%rax), %ymm15, %ymm15; - call __camellia_dec_blk32; + call FUNC_NAME(dec_blk32); vpxor 0 * 32(%rsi), %ymm7, %ymm7; vpxor 1 * 32(%rsi), %ymm6, %ymm6; vpxor 2 * 32(%rsi), %ymm5, %ymm5; vpxor 3 * 32(%rsi), %ymm4, %ymm4; vpxor 4 * 32(%rsi), %ymm3, %ymm3; vpxor 5 * 32(%rsi), %ymm2, %ymm2; vpxor 6 * 32(%rsi), %ymm1, %ymm1; vpxor 7 * 32(%rsi), %ymm0, %ymm0; vmovdqu %ymm7, (7 * 32)(%rax); vmovdqu %ymm6, (6 * 32)(%rax); vpxor 8 * 32(%rsi), %ymm15, %ymm15; vpxor 9 * 32(%rsi), %ymm14, %ymm14; vpxor 10 * 32(%rsi), %ymm13, %ymm13; vpxor 11 * 32(%rsi), %ymm12, %ymm12; vpxor 12 * 32(%rsi), %ymm11, %ymm11; vpxor 13 * 32(%rsi), %ymm10, %ymm10; vpxor 14 * 32(%rsi), %ymm9, %ymm9; vpxor 15 * 32(%rsi), %ymm8, %ymm8; /* Checksum_i = Checksum_{i-1} xor P_i */ vpxor %ymm5, %ymm7, %ymm7; vpxor %ymm4, %ymm6, %ymm6; vpxor %ymm3, %ymm7, %ymm7; vpxor %ymm2, %ymm6, %ymm6; vpxor %ymm1, %ymm7, %ymm7; vpxor %ymm0, %ymm6, %ymm6; vpxor %ymm15, %ymm7, %ymm7; vpxor %ymm14, %ymm6, %ymm6; vpxor %ymm13, %ymm7, %ymm7; vpxor %ymm12, %ymm6, %ymm6; vpxor %ymm11, %ymm7, %ymm7; vpxor %ymm10, %ymm6, %ymm6; vpxor %ymm9, %ymm7, %ymm7; vpxor %ymm8, %ymm6, %ymm6; vpxor %ymm7, %ymm6, %ymm7; vextracti128 $1, %ymm7, %xmm6; vpxor %xmm6, %xmm7, %xmm7; vpxor (%r10), %xmm7, %xmm7; vmovdqu %xmm7, (%r10); vmovdqu 7 * 32(%rax), %ymm7; vmovdqu 6 * 32(%rax), %ymm6; write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, %ymm8, %rsi); vzeroall; movq (16 * 32 + 0 * 8)(%rsp), %r10; movq (16 * 32 + 1 * 8)(%rsp), %r11; movq (16 * 32 + 2 * 8)(%rsp), %r12; movq (16 * 32 + 3 * 8)(%rsp), %r13; CFI_RESTORE(%r10); CFI_RESTORE(%r11); CFI_RESTORE(%r12); CFI_RESTORE(%r13); leave; CFI_LEAVE(); ret_spec_stop; CFI_ENDPROC(); ELF(.size FUNC_NAME(ocb_dec),.-FUNC_NAME(ocb_dec);) .align 8 .globl FUNC_NAME(ocb_auth) ELF(.type FUNC_NAME(ocb_auth),@function;) FUNC_NAME(ocb_auth): /* input: * %rdi: ctx, CTX * %rsi: abuf (16 blocks) * %rdx: offset * %rcx: checksum * %r8 : L pointers (void *L[16]) */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); subq $(16 * 32 + 4 * 8), %rsp; andq $~63, %rsp; movq %rsp, %rax; movq %r10, (16 * 32 + 0 * 8)(%rsp); movq %r11, (16 * 32 + 1 * 8)(%rsp); movq %r12, (16 * 32 + 2 * 8)(%rsp); movq %r13, (16 * 32 + 3 * 8)(%rsp); CFI_REG_ON_STACK(r10, 16 * 
32 + 0 * 8); CFI_REG_ON_STACK(r11, 16 * 32 + 1 * 8); CFI_REG_ON_STACK(r12, 16 * 32 + 2 * 8); CFI_REG_ON_STACK(r13, 16 * 32 + 3 * 8); vmovdqu (%rdx), %xmm14; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Checksum_i = Checksum_{i-1} xor P_i */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ #define OCB_INPUT(n, l0reg, l1reg, yreg) \ vmovdqu (n * 32)(%rsi), yreg; \ vpxor (l0reg), %xmm14, %xmm15; \ vpxor (l1reg), %xmm15, %xmm14; \ vinserti128 $1, %xmm14, %ymm15, %ymm15; \ vpxor yreg, %ymm15, yreg; movq (0 * 8)(%r8), %r10; movq (1 * 8)(%r8), %r11; movq (2 * 8)(%r8), %r12; movq (3 * 8)(%r8), %r13; OCB_INPUT(0, %r10, %r11, %ymm0); vmovdqu %ymm0, (15 * 32)(%rax); OCB_INPUT(1, %r12, %r13, %ymm0); vmovdqu %ymm0, (14 * 32)(%rax); movq (4 * 8)(%r8), %r10; movq (5 * 8)(%r8), %r11; movq (6 * 8)(%r8), %r12; movq (7 * 8)(%r8), %r13; OCB_INPUT(2, %r10, %r11, %ymm13); OCB_INPUT(3, %r12, %r13, %ymm12); movq (8 * 8)(%r8), %r10; movq (9 * 8)(%r8), %r11; movq (10 * 8)(%r8), %r12; movq (11 * 8)(%r8), %r13; OCB_INPUT(4, %r10, %r11, %ymm11); OCB_INPUT(5, %r12, %r13, %ymm10); movq (12 * 8)(%r8), %r10; movq (13 * 8)(%r8), %r11; movq (14 * 8)(%r8), %r12; movq (15 * 8)(%r8), %r13; OCB_INPUT(6, %r10, %r11, %ymm9); OCB_INPUT(7, %r12, %r13, %ymm8); movq (16 * 8)(%r8), %r10; movq (17 * 8)(%r8), %r11; movq (18 * 8)(%r8), %r12; movq (19 * 8)(%r8), %r13; OCB_INPUT(8, %r10, %r11, %ymm7); OCB_INPUT(9, %r12, %r13, %ymm6); movq (20 * 8)(%r8), %r10; movq (21 * 8)(%r8), %r11; movq (22 * 8)(%r8), %r12; movq (23 * 8)(%r8), %r13; OCB_INPUT(10, %r10, %r11, %ymm5); OCB_INPUT(11, %r12, %r13, %ymm4); movq (24 * 8)(%r8), %r10; movq (25 * 8)(%r8), %r11; movq (26 * 8)(%r8), %r12; movq (27 * 8)(%r8), %r13; OCB_INPUT(12, %r10, %r11, %ymm3); OCB_INPUT(13, %r12, %r13, %ymm2); movq (28 * 8)(%r8), %r10; movq (29 * 8)(%r8), %r11; movq (30 * 8)(%r8), %r12; movq (31 * 8)(%r8), %r13; OCB_INPUT(14, %r10, %r11, %ymm1); OCB_INPUT(15, %r12, %r13, %ymm0); #undef OCB_INPUT vmovdqu %xmm14, (%rdx); cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %r10d; cmovel %r10d, %r8d; /* max */ movq %rcx, %r10; - /* inpack16_pre: */ + /* inpack32_pre: */ vpbroadcastq (key_table)(CTX), %ymm15; vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; vpxor %ymm0, %ymm15, %ymm0; vpxor %ymm1, %ymm15, %ymm1; vpxor %ymm2, %ymm15, %ymm2; vpxor %ymm3, %ymm15, %ymm3; vpxor %ymm4, %ymm15, %ymm4; vpxor %ymm5, %ymm15, %ymm5; vpxor %ymm6, %ymm15, %ymm6; vpxor %ymm7, %ymm15, %ymm7; vpxor %ymm8, %ymm15, %ymm8; vpxor %ymm9, %ymm15, %ymm9; vpxor %ymm10, %ymm15, %ymm10; vpxor %ymm11, %ymm15, %ymm11; vpxor %ymm12, %ymm15, %ymm12; vpxor %ymm13, %ymm15, %ymm13; vpxor 14 * 32(%rax), %ymm15, %ymm14; vpxor 15 * 32(%rax), %ymm15, %ymm15; - call __camellia_enc_blk32; + call FUNC_NAME(enc_blk32); vpxor %ymm7, %ymm6, %ymm6; vpxor %ymm5, %ymm4, %ymm4; vpxor %ymm3, %ymm2, %ymm2; vpxor %ymm1, %ymm0, %ymm0; vpxor %ymm15, %ymm14, %ymm14; vpxor %ymm13, %ymm12, %ymm12; vpxor %ymm11, %ymm10, %ymm10; vpxor %ymm9, %ymm8, %ymm8; vpxor %ymm6, %ymm4, %ymm4; vpxor %ymm2, %ymm0, %ymm0; vpxor %ymm14, %ymm12, %ymm12; vpxor %ymm10, %ymm8, %ymm8; vpxor %ymm4, %ymm0, %ymm0; vpxor %ymm12, %ymm8, %ymm8; vpxor %ymm0, %ymm8, %ymm0; vextracti128 $1, %ymm0, %xmm1; vpxor (%r10), %xmm0, %xmm0; vpxor %xmm0, %xmm1, %xmm0; vmovdqu %xmm0, (%r10); vzeroall; movq (16 * 32 + 0 * 8)(%rsp), %r10; movq (16 * 32 + 1 * 8)(%rsp), %r11; movq (16 * 32 + 2 * 8)(%rsp), %r12; movq (16 * 32 + 3 * 8)(%rsp), %r13; CFI_RESTORE(%r10); CFI_RESTORE(%r11); CFI_RESTORE(%r12); CFI_RESTORE(%r13); leave; CFI_LEAVE(); ret_spec_stop; CFI_ENDPROC(); 
ELF(.size FUNC_NAME(ocb_auth),.-FUNC_NAME(ocb_auth);) .align 8 .globl FUNC_NAME(enc_blk1_32) ELF(.type FUNC_NAME(enc_blk1_32),@function;) FUNC_NAME(enc_blk1_32): /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) * %rdx: src (32 blocks) * %ecx: nblocks (1 to 32) */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); movl %ecx, %r9d; cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %eax; cmovel %eax, %r8d; /* max */ subq $(16 * 32), %rsp; andq $~63, %rsp; movq %rsp, %rax; cmpl $31, %ecx; vpxor %xmm0, %xmm0, %xmm0; ja 1f; jb 2f; vmovdqu 15 * 32(%rdx), %xmm0; jmp 2f; 1: vmovdqu 15 * 32(%rdx), %ymm0; 2: vmovdqu %ymm0, (%rax); vpbroadcastq (key_table)(CTX), %ymm0; vpshufb .Lpack_bswap rRIP, %ymm0, %ymm0; #define LOAD_INPUT(offset, ymm) \ cmpl $(1 + 2 * (offset)), %ecx; \ jb 2f; \ ja 1f; \ vmovdqu (offset) * 32(%rdx), %ymm##_x; \ vpxor %ymm0, %ymm, %ymm; \ jmp 2f; \ 1: \ vpxor (offset) * 32(%rdx), %ymm0, %ymm; LOAD_INPUT(0, ymm15); LOAD_INPUT(1, ymm14); LOAD_INPUT(2, ymm13); LOAD_INPUT(3, ymm12); LOAD_INPUT(4, ymm11); LOAD_INPUT(5, ymm10); LOAD_INPUT(6, ymm9); LOAD_INPUT(7, ymm8); LOAD_INPUT(8, ymm7); LOAD_INPUT(9, ymm6); LOAD_INPUT(10, ymm5); LOAD_INPUT(11, ymm4); LOAD_INPUT(12, ymm3); LOAD_INPUT(13, ymm2); LOAD_INPUT(14, ymm1); vpxor (%rax), %ymm0, %ymm0; 2: - call __camellia_enc_blk32; + call FUNC_NAME(enc_blk32); #define STORE_OUTPUT(ymm, offset) \ cmpl $(1 + 2 * (offset)), %r9d; \ jb 2f; \ ja 1f; \ vmovdqu %ymm##_x, (offset) * 32(%rsi); \ jmp 2f; \ 1: \ vmovdqu %ymm, (offset) * 32(%rsi); STORE_OUTPUT(ymm7, 0); STORE_OUTPUT(ymm6, 1); STORE_OUTPUT(ymm5, 2); STORE_OUTPUT(ymm4, 3); STORE_OUTPUT(ymm3, 4); STORE_OUTPUT(ymm2, 5); STORE_OUTPUT(ymm1, 6); STORE_OUTPUT(ymm0, 7); STORE_OUTPUT(ymm15, 8); STORE_OUTPUT(ymm14, 9); STORE_OUTPUT(ymm13, 10); STORE_OUTPUT(ymm12, 11); STORE_OUTPUT(ymm11, 12); STORE_OUTPUT(ymm10, 13); STORE_OUTPUT(ymm9, 14); STORE_OUTPUT(ymm8, 15); 2: vzeroall; leave; CFI_LEAVE(); ret_spec_stop; CFI_ENDPROC(); ELF(.size FUNC_NAME(enc_blk1_32),.-FUNC_NAME(enc_blk1_32);) .align 8 .globl FUNC_NAME(dec_blk1_32) ELF(.type FUNC_NAME(dec_blk1_32),@function;) FUNC_NAME(dec_blk1_32): /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) * %rdx: src (32 blocks) * %ecx: nblocks (1 to 32) */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); movl %ecx, %r9d; cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %eax; cmovel %eax, %r8d; /* max */ subq $(16 * 32), %rsp; andq $~63, %rsp; movq %rsp, %rax; cmpl $31, %ecx; vpxor %xmm0, %xmm0, %xmm0; ja 1f; jb 2f; vmovdqu 15 * 32(%rdx), %xmm0; jmp 2f; 1: vmovdqu 15 * 32(%rdx), %ymm0; 2: vmovdqu %ymm0, (%rax); vpbroadcastq (key_table)(CTX, %r8, 8), %ymm0; vpshufb .Lpack_bswap rRIP, %ymm0, %ymm0; LOAD_INPUT(0, ymm15); LOAD_INPUT(1, ymm14); LOAD_INPUT(2, ymm13); LOAD_INPUT(3, ymm12); LOAD_INPUT(4, ymm11); LOAD_INPUT(5, ymm10); LOAD_INPUT(6, ymm9); LOAD_INPUT(7, ymm8); LOAD_INPUT(8, ymm7); LOAD_INPUT(9, ymm6); LOAD_INPUT(10, ymm5); LOAD_INPUT(11, ymm4); LOAD_INPUT(12, ymm3); LOAD_INPUT(13, ymm2); LOAD_INPUT(14, ymm1); vpxor (%rax), %ymm0, %ymm0; 2: - call __camellia_dec_blk32; + call FUNC_NAME(dec_blk32); STORE_OUTPUT(ymm7, 0); STORE_OUTPUT(ymm6, 1); STORE_OUTPUT(ymm5, 2); STORE_OUTPUT(ymm4, 3); STORE_OUTPUT(ymm3, 4); STORE_OUTPUT(ymm2, 5); STORE_OUTPUT(ymm1, 6); STORE_OUTPUT(ymm0, 7); STORE_OUTPUT(ymm15, 8); STORE_OUTPUT(ymm14, 9); STORE_OUTPUT(ymm13, 10); STORE_OUTPUT(ymm12, 11); STORE_OUTPUT(ymm11, 12); STORE_OUTPUT(ymm10, 13); STORE_OUTPUT(ymm9, 14); 
	STORE_OUTPUT(ymm8, 15);

2:
	vzeroall;

	leave;
	CFI_LEAVE();
	ret_spec_stop;
	CFI_ENDPROC();
ELF(.size FUNC_NAME(dec_blk1_32),.-FUNC_NAME(dec_blk1_32);)

#endif /* GCRY_CAMELLIA_AESNI_AVX2_AMD64_H */
diff --git a/cipher/camellia-gfni-avx512-amd64.S b/cipher/camellia-gfni-avx512-amd64.S
new file mode 100644
index 00000000..70e10460
--- /dev/null
+++ b/cipher/camellia-gfni-avx512-amd64.S
@@ -0,0 +1,1566 @@
+/* camellia-gfni-avx512-amd64.S - GFNI/AVX512 implementation of Camellia
+ *
+ * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#ifdef __x86_64
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+    defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX512_SUPPORT)
+
+#include "asm-common-amd64.h"
+
+#define CAMELLIA_TABLE_BYTE_LEN 272
+
+/* struct CAMELLIA_context: */
+#define key_table 0
+#define key_bitlength CAMELLIA_TABLE_BYTE_LEN
+
+/* register macros */
+#define CTX %rdi
+#define RIO %r8
+
+/**********************************************************************
+  helper macros
+ **********************************************************************/
+
+#define zmm0_x xmm0
+#define zmm1_x xmm1
+#define zmm2_x xmm2
+#define zmm3_x xmm3
+#define zmm4_x xmm4
+#define zmm5_x xmm5
+#define zmm6_x xmm6
+#define zmm7_x xmm7
+#define zmm8_x xmm8
+#define zmm9_x xmm9
+#define zmm10_x xmm10
+#define zmm11_x xmm11
+#define zmm12_x xmm12
+#define zmm13_x xmm13
+#define zmm14_x xmm14
+#define zmm15_x xmm15
+
+#define zmm0_y ymm0
+#define zmm1_y ymm1
+#define zmm2_y ymm2
+#define zmm3_y ymm3
+#define zmm4_y ymm4
+#define zmm5_y ymm5
+#define zmm6_y ymm6
+#define zmm7_y ymm7
+#define zmm8_y ymm8
+#define zmm9_y ymm9
+#define zmm10_y ymm10
+#define zmm11_y ymm11
+#define zmm12_y ymm12
+#define zmm13_y ymm13
+#define zmm14_y ymm14
+#define zmm15_y ymm15
+
+#define mem_ab_0 %zmm16
+#define mem_ab_1 %zmm17
+#define mem_ab_2 %zmm31
+#define mem_ab_3 %zmm18
+#define mem_ab_4 %zmm19
+#define mem_ab_5 %zmm20
+#define mem_ab_6 %zmm21
+#define mem_ab_7 %zmm22
+#define mem_cd_0 %zmm23
+#define mem_cd_1 %zmm24
+#define mem_cd_2 %zmm30
+#define mem_cd_3 %zmm25
+#define mem_cd_4 %zmm26
+#define mem_cd_5 %zmm27
+#define mem_cd_6 %zmm28
+#define mem_cd_7 %zmm29
+
+#define clear_vec4(v0,v1,v2,v3) \
+	vpxord v0, v0, v0; \
+	vpxord v1, v1, v1; \
+	vpxord v2, v2, v2; \
+	vpxord v3, v3, v3
+
+#define clear_zmm16_zmm31() \
+	clear_vec4(%xmm16, %xmm20, %xmm24, %xmm28); \
+	clear_vec4(%xmm17, %xmm21, %xmm25, %xmm29); \
+	clear_vec4(%xmm18, %xmm22, %xmm26, %xmm30); \
+	clear_vec4(%xmm19, %xmm23, %xmm27, %xmm31)
+
+#define clear_regs() \
+	kxorq %k1, %k1, %k1; \
+	vzeroall; \
+	clear_zmm16_zmm31()
+
+/**********************************************************************
+  GFNI helper macros and constants
+ **********************************************************************/
+
+#define 
BV8(a0,a1,a2,a3,a4,a5,a6,a7) \ + ( (((a0) & 1) << 0) | \ + (((a1) & 1) << 1) | \ + (((a2) & 1) << 2) | \ + (((a3) & 1) << 3) | \ + (((a4) & 1) << 4) | \ + (((a5) & 1) << 5) | \ + (((a6) & 1) << 6) | \ + (((a7) & 1) << 7) ) + +#define BM8X8(l0,l1,l2,l3,l4,l5,l6,l7) \ + ( ((l7) << (0 * 8)) | \ + ((l6) << (1 * 8)) | \ + ((l5) << (2 * 8)) | \ + ((l4) << (3 * 8)) | \ + ((l3) << (4 * 8)) | \ + ((l2) << (5 * 8)) | \ + ((l1) << (6 * 8)) | \ + ((l0) << (7 * 8)) ) + +/* Pre-filters and post-filters constants for Camellia sboxes s1, s2, s3 and s4. + * See http://urn.fi/URN:NBN:fi:oulu-201305311409, pages 43-48. + * + * Pre-filters are directly from above source, "θ₁"/"θ₄". Post-filters are + * combination of function "A" (AES SubBytes affine transformation) and + * "ψ₁"/"ψ₂"/"ψ₃". + */ + +/* Constant from "θ₁(x)" and "θ₄(x)" functions. */ +#define pre_filter_constant_s1234 BV8(1, 0, 1, 0, 0, 0, 1, 0) + +/* Constant from "ψ₁(A(x))" function: */ +#define post_filter_constant_s14 BV8(0, 1, 1, 1, 0, 1, 1, 0) + +/* Constant from "ψ₂(A(x))" function: */ +#define post_filter_constant_s2 BV8(0, 0, 1, 1, 1, 0, 1, 1) + +/* Constant from "ψ₃(A(x))" function: */ +#define post_filter_constant_s3 BV8(1, 1, 1, 0, 1, 1, 0, 0) + +/********************************************************************** + 64-way parallel camellia + **********************************************************************/ + +/* roundsm64 (GFNI/AVX512 version) + * IN: + * x0..x7: byte-sliced AB state + * mem_cd: register pointer storing CD state + * key: index for key material + * OUT: + * x0..x7: new byte-sliced CD state + */ +#define roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, \ + t6, t7, mem_cd, key) \ + /* \ + * S-function with AES subbytes \ + */ \ + vpbroadcastq .Lpre_filter_bitmatrix_s123 rRIP, t5; \ + vpbroadcastq .Lpre_filter_bitmatrix_s4 rRIP, t2; \ + vpbroadcastq .Lpost_filter_bitmatrix_s14 rRIP, t4; \ + vpbroadcastq .Lpost_filter_bitmatrix_s2 rRIP, t3; \ + vpbroadcastq .Lpost_filter_bitmatrix_s3 rRIP, t6; \ + vpxor t7##_x, t7##_x, t7##_x; \ + vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \ + \ + /* prefilter sboxes */ \ + vgf2p8affineqb $(pre_filter_constant_s1234), t5, x0, x0; \ + vgf2p8affineqb $(pre_filter_constant_s1234), t5, x7, x7; \ + vgf2p8affineqb $(pre_filter_constant_s1234), t2, x3, x3; \ + vgf2p8affineqb $(pre_filter_constant_s1234), t2, x6, x6; \ + vgf2p8affineqb $(pre_filter_constant_s1234), t5, x2, x2; \ + vgf2p8affineqb $(pre_filter_constant_s1234), t5, x5, x5; \ + vgf2p8affineqb $(pre_filter_constant_s1234), t5, x1, x1; \ + vgf2p8affineqb $(pre_filter_constant_s1234), t5, x4, x4; \ + \ + /* sbox GF8 inverse + postfilter sboxes 1 and 4 */ \ + vgf2p8affineinvqb $(post_filter_constant_s14), t4, x0, x0; \ + vgf2p8affineinvqb $(post_filter_constant_s14), t4, x7, x7; \ + vgf2p8affineinvqb $(post_filter_constant_s14), t4, x3, x3; \ + vgf2p8affineinvqb $(post_filter_constant_s14), t4, x6, x6; \ + \ + /* sbox GF8 inverse + postfilter sbox 3 */ \ + vgf2p8affineinvqb $(post_filter_constant_s3), t6, x2, x2; \ + vgf2p8affineinvqb $(post_filter_constant_s3), t6, x5, x5; \ + \ + /* sbox GF8 inverse + postfilter sbox 2 */ \ + vgf2p8affineinvqb $(post_filter_constant_s2), t3, x1, x1; \ + vgf2p8affineinvqb $(post_filter_constant_s2), t3, x4, x4; \ + \ + vpsrldq $1, t0, t1; \ + vpsrldq $2, t0, t2; \ + vpshufb t7, t1, t1; \ + vpsrldq $3, t0, t3; \ + \ + /* P-function */ \ + vpxorq x5, x0, x0; \ + vpxorq x6, x1, x1; \ + vpxorq x7, x2, x2; \ + vpxorq x4, x3, x3; \ + \ + vpshufb t7, t2, t2; \ + 
vpsrldq $4, t0, t4; \ + vpshufb t7, t3, t3; \ + vpsrldq $5, t0, t5; \ + vpshufb t7, t4, t4; \ + \ + vpxorq x2, x4, x4; \ + vpxorq x3, x5, x5; \ + vpxorq x0, x6, x6; \ + vpxorq x1, x7, x7; \ + \ + vpsrldq $6, t0, t6; \ + vpshufb t7, t5, t5; \ + vpshufb t7, t6, t6; \ + \ + vpxorq x7, x0, x0; \ + vpxorq x4, x1, x1; \ + vpxorq x5, x2, x2; \ + vpxorq x6, x3, x3; \ + \ + vpxorq x3, x4, x4; \ + vpxorq x0, x5, x5; \ + vpxorq x1, x6, x6; \ + vpxorq x2, x7, x7; /* note: high and low parts swapped */ \ + \ + /* Add key material and result to CD (x becomes new CD) */ \ + \ + vpternlogq $0x96, mem_cd##_5, t6, x1; \ + \ + vpsrldq $7, t0, t6; \ + vpshufb t7, t0, t0; \ + vpshufb t7, t6, t7; \ + \ + vpternlogq $0x96, mem_cd##_4, t7, x0; \ + vpternlogq $0x96, mem_cd##_6, t5, x2; \ + vpternlogq $0x96, mem_cd##_7, t4, x3; \ + vpternlogq $0x96, mem_cd##_0, t3, x4; \ + vpternlogq $0x96, mem_cd##_1, t2, x5; \ + vpternlogq $0x96, mem_cd##_2, t1, x6; \ + vpternlogq $0x96, mem_cd##_3, t0, x7; + +/* + * IN/OUT: + * x0..x7: byte-sliced AB state preloaded + * mem_ab: byte-sliced AB state in memory + * mem_cb: byte-sliced CD state in memory + */ +#define two_roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, i, dir, store_ab) \ + roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_cd, (key_table + (i) * 8)(CTX)); \ + \ + vmovdqu64 x0, mem_cd##_4; \ + vmovdqu64 x1, mem_cd##_5; \ + vmovdqu64 x2, mem_cd##_6; \ + vmovdqu64 x3, mem_cd##_7; \ + vmovdqu64 x4, mem_cd##_0; \ + vmovdqu64 x5, mem_cd##_1; \ + vmovdqu64 x6, mem_cd##_2; \ + vmovdqu64 x7, mem_cd##_3; \ + \ + roundsm64(x4, x5, x6, x7, x0, x1, x2, x3, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, (key_table + ((i) + (dir)) * 8)(CTX)); \ + \ + store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab); + +#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */ + +#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \ + /* Store new AB state */ \ + vmovdqu64 x4, mem_ab##_4; \ + vmovdqu64 x5, mem_ab##_5; \ + vmovdqu64 x6, mem_ab##_6; \ + vmovdqu64 x7, mem_ab##_7; \ + vmovdqu64 x0, mem_ab##_0; \ + vmovdqu64 x1, mem_ab##_1; \ + vmovdqu64 x2, mem_ab##_2; \ + vmovdqu64 x3, mem_ab##_3; + +#define enc_rounds64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, i) \ + two_roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \ + two_roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \ + two_roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store); + +#define dec_rounds64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, i) \ + two_roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \ + two_roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \ + two_roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store); + +/* + * IN: + * v0..3: byte-sliced 32-bit integers + * OUT: + * v0..3: (IN << 1) + * t0, t1, t2, zero: (IN >> 7) + */ +#define rol32_1_64(v0, v1, v2, v3, t0, t1, t2, zero, one) \ + vpcmpltb zero, v0, %k1; \ + vpaddb v0, v0, v0; \ + vpaddb one, zero, t0{%k1}{z}; \ + \ + vpcmpltb zero, v1, %k1; \ + vpaddb v1, v1, v1; \ 
+ vpaddb one, zero, t1{%k1}{z}; \ + \ + vpcmpltb zero, v2, %k1; \ + vpaddb v2, v2, v2; \ + vpaddb one, zero, t2{%k1}{z}; \ + \ + vpcmpltb zero, v3, %k1; \ + vpaddb v3, v3, v3; \ + vpaddb one, zero, zero{%k1}{z}; + +/* + * IN: + * r: byte-sliced AB state in memory + * l: byte-sliced CD state in memory + * OUT: + * x0..x7: new byte-sliced CD state + */ +#define fls64(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \ + tt1, tt2, tt3, kll, klr, krl, krr, tmp) \ + /* \ + * t0 = kll; \ + * t0 &= ll; \ + * lr ^= rol32(t0, 1); \ + */ \ + vpbroadcastd kll, t0; /* only lowest 32-bit used */ \ + vpbroadcastq .Lbyte_ones rRIP, tmp; \ + vpxor tt3##_x, tt3##_x, tt3##_x; \ + vpshufb tt3, t0, t3; \ + vpsrldq $1, t0, t0; \ + vpshufb tt3, t0, t2; \ + vpsrldq $1, t0, t0; \ + vpshufb tt3, t0, t1; \ + vpsrldq $1, t0, t0; \ + vpshufb tt3, t0, t0; \ + \ + vpandq l0, t0, t0; \ + vpandq l1, t1, t1; \ + vpandq l2, t2, t2; \ + vpandq l3, t3, t3; \ + \ + rol32_1_64(t3, t2, t1, t0, tt0, tt1, tt2, tt3, tmp); \ + \ + vpternlogq $0x96, tt2, t0, l4; \ + vpbroadcastd krr, t0; /* only lowest 32-bit used */ \ + vmovdqu64 l4, l##_4; \ + vpternlogq $0x96, tt1, t1, l5; \ + vmovdqu64 l5, l##_5; \ + vpternlogq $0x96, tt0, t2, l6; \ + vmovdqu64 l6, l##_6; \ + vpternlogq $0x96, tt3, t3, l7; \ + vmovdqu64 l7, l##_7; \ + vpxor tt3##_x, tt3##_x, tt3##_x; \ + \ + /* \ + * t2 = krr; \ + * t2 |= rr; \ + * rl ^= t2; \ + */ \ + \ + vpshufb tt3, t0, t3; \ + vpsrldq $1, t0, t0; \ + vpshufb tt3, t0, t2; \ + vpsrldq $1, t0, t0; \ + vpshufb tt3, t0, t1; \ + vpsrldq $1, t0, t0; \ + vpshufb tt3, t0, t0; \ + \ + vpternlogq $0x1e, r##_4, t0, r##_0; \ + vpbroadcastd krl, t0; /* only lowest 32-bit used */ \ + vpternlogq $0x1e, r##_5, t1, r##_1; \ + vpternlogq $0x1e, r##_6, t2, r##_2; \ + vpternlogq $0x1e, r##_7, t3, r##_3; \ + \ + /* \ + * t2 = krl; \ + * t2 &= rl; \ + * rr ^= rol32(t2, 1); \ + */ \ + vpshufb tt3, t0, t3; \ + vpsrldq $1, t0, t0; \ + vpshufb tt3, t0, t2; \ + vpsrldq $1, t0, t0; \ + vpshufb tt3, t0, t1; \ + vpsrldq $1, t0, t0; \ + vpshufb tt3, t0, t0; \ + \ + vpandq r##_0, t0, t0; \ + vpandq r##_1, t1, t1; \ + vpandq r##_2, t2, t2; \ + vpandq r##_3, t3, t3; \ + \ + rol32_1_64(t3, t2, t1, t0, tt0, tt1, tt2, tt3, tmp); \ + \ + vpternlogq $0x96, tt2, t0, r##_4; \ + vpbroadcastd klr, t0; /* only lowest 32-bit used */ \ + vpternlogq $0x96, tt1, t1, r##_5; \ + vpternlogq $0x96, tt0, t2, r##_6; \ + vpternlogq $0x96, tt3, t3, r##_7; \ + vpxor tt3##_x, tt3##_x, tt3##_x; \ + \ + /* \ + * t0 = klr; \ + * t0 |= lr; \ + * ll ^= t0; \ + */ \ + \ + vpshufb tt3, t0, t3; \ + vpsrldq $1, t0, t0; \ + vpshufb tt3, t0, t2; \ + vpsrldq $1, t0, t0; \ + vpshufb tt3, t0, t1; \ + vpsrldq $1, t0, t0; \ + vpshufb tt3, t0, t0; \ + \ + vpternlogq $0x1e, l4, t0, l0; \ + vmovdqu64 l0, l##_0; \ + vpternlogq $0x1e, l5, t1, l1; \ + vmovdqu64 l1, l##_1; \ + vpternlogq $0x1e, l6, t2, l2; \ + vmovdqu64 l2, l##_2; \ + vpternlogq $0x1e, l7, t3, l3; \ + vmovdqu64 l3, l##_3; + +#define transpose_4x4(x0, x1, x2, x3, t1, t2) \ + vpunpckhdq x1, x0, t2; \ + vpunpckldq x1, x0, x0; \ + \ + vpunpckldq x3, x2, t1; \ + vpunpckhdq x3, x2, x2; \ + \ + vpunpckhqdq t1, x0, x1; \ + vpunpcklqdq t1, x0, x0; \ + \ + vpunpckhqdq x2, t2, x3; \ + vpunpcklqdq x2, t2, x2; + +#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \ + a3, b3, c3, d3, st0, st1) \ + transpose_4x4(a0, a1, a2, a3, st0, st1); \ + transpose_4x4(b0, b1, b2, b3, st0, st1); \ + \ + transpose_4x4(c0, c1, c2, c3, st0, st1); \ + transpose_4x4(d0, d1, d2, d3, st0, st1); \ + \ + vbroadcasti64x2 
.Lshufb_16x16b rRIP, st0; \ + vpshufb st0, a0, a0; \ + vpshufb st0, a1, a1; \ + vpshufb st0, a2, a2; \ + vpshufb st0, a3, a3; \ + vpshufb st0, b0, b0; \ + vpshufb st0, b1, b1; \ + vpshufb st0, b2, b2; \ + vpshufb st0, b3, b3; \ + vpshufb st0, c0, c0; \ + vpshufb st0, c1, c1; \ + vpshufb st0, c2, c2; \ + vpshufb st0, c3, c3; \ + vpshufb st0, d0, d0; \ + vpshufb st0, d1, d1; \ + vpshufb st0, d2, d2; \ + vpshufb st0, d3, d3; \ + \ + transpose_4x4(a0, b0, c0, d0, st0, st1); \ + transpose_4x4(a1, b1, c1, d1, st0, st1); \ + \ + transpose_4x4(a2, b2, c2, d2, st0, st1); \ + transpose_4x4(a3, b3, c3, d3, st0, st1); \ + /* does not adjust output bytes inside vectors */ + +/* load blocks to registers and apply pre-whitening */ +#define inpack64_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, rio, key) \ + vpbroadcastq key, x0; \ + vpshufb .Lpack_bswap rRIP, x0, x0; \ + \ + vpxorq 0 * 64(rio), x0, y7; \ + vpxorq 1 * 64(rio), x0, y6; \ + vpxorq 2 * 64(rio), x0, y5; \ + vpxorq 3 * 64(rio), x0, y4; \ + vpxorq 4 * 64(rio), x0, y3; \ + vpxorq 5 * 64(rio), x0, y2; \ + vpxorq 6 * 64(rio), x0, y1; \ + vpxorq 7 * 64(rio), x0, y0; \ + vpxorq 8 * 64(rio), x0, x7; \ + vpxorq 9 * 64(rio), x0, x6; \ + vpxorq 10 * 64(rio), x0, x5; \ + vpxorq 11 * 64(rio), x0, x4; \ + vpxorq 12 * 64(rio), x0, x3; \ + vpxorq 13 * 64(rio), x0, x2; \ + vpxorq 14 * 64(rio), x0, x1; \ + vpxorq 15 * 64(rio), x0, x0; + +/* byteslice pre-whitened blocks and store to temporary memory */ +#define inpack64_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, tmp0, tmp1) \ + byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \ + y4, y5, y6, y7, tmp0, tmp1); \ + \ + vmovdqu64 x0, mem_ab##_0; \ + vmovdqu64 x1, mem_ab##_1; \ + vmovdqu64 x2, mem_ab##_2; \ + vmovdqu64 x3, mem_ab##_3; \ + vmovdqu64 x4, mem_ab##_4; \ + vmovdqu64 x5, mem_ab##_5; \ + vmovdqu64 x6, mem_ab##_6; \ + vmovdqu64 x7, mem_ab##_7; \ + vmovdqu64 y0, mem_cd##_0; \ + vmovdqu64 y1, mem_cd##_1; \ + vmovdqu64 y2, mem_cd##_2; \ + vmovdqu64 y3, mem_cd##_3; \ + vmovdqu64 y4, mem_cd##_4; \ + vmovdqu64 y5, mem_cd##_5; \ + vmovdqu64 y6, mem_cd##_6; \ + vmovdqu64 y7, mem_cd##_7; + +/* de-byteslice, apply post-whitening and store blocks */ +#define outunpack64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \ + y5, y6, y7, key, tmp0, tmp1) \ + byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \ + y3, y7, x3, x7, tmp0, tmp1); \ + \ + vpbroadcastq key, tmp0; \ + vpshufb .Lpack_bswap rRIP, tmp0, tmp0; \ + \ + vpxorq tmp0, y7, y7; \ + vpxorq tmp0, y6, y6; \ + vpxorq tmp0, y5, y5; \ + vpxorq tmp0, y4, y4; \ + vpxorq tmp0, y3, y3; \ + vpxorq tmp0, y2, y2; \ + vpxorq tmp0, y1, y1; \ + vpxorq tmp0, y0, y0; \ + vpxorq tmp0, x7, x7; \ + vpxorq tmp0, x6, x6; \ + vpxorq tmp0, x5, x5; \ + vpxorq tmp0, x4, x4; \ + vpxorq tmp0, x3, x3; \ + vpxorq tmp0, x2, x2; \ + vpxorq tmp0, x1, x1; \ + vpxorq tmp0, x0, x0; + +#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, rio) \ + vmovdqu64 x0, 0 * 64(rio); \ + vmovdqu64 x1, 1 * 64(rio); \ + vmovdqu64 x2, 2 * 64(rio); \ + vmovdqu64 x3, 3 * 64(rio); \ + vmovdqu64 x4, 4 * 64(rio); \ + vmovdqu64 x5, 5 * 64(rio); \ + vmovdqu64 x6, 6 * 64(rio); \ + vmovdqu64 x7, 7 * 64(rio); \ + vmovdqu64 y0, 8 * 64(rio); \ + vmovdqu64 y1, 9 * 64(rio); \ + vmovdqu64 y2, 10 * 64(rio); \ + vmovdqu64 y3, 11 * 64(rio); \ + vmovdqu64 y4, 12 * 64(rio); \ + vmovdqu64 y5, 13 * 64(rio); \ + vmovdqu64 y6, 14 * 64(rio); \ + vmovdqu64 y7, 15 * 64(rio); + +.text + 
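+/* Reference model of the GFNI affine step used by roundsm64 above (a rough,
+ * illustrative C sketch; the function name gf2p8affine_byte is hypothetical
+ * and not part of this file).  BV8() packs one matrix row LSB-first into a
+ * byte and BM8X8() places row l0 in the most-significant byte of the 64-bit
+ * matrix, matching how vgf2p8affineqb/vgf2p8affineinvqb interpret it:
+ *
+ *   uint8_t gf2p8affine_byte(uint64_t m, uint8_t x, uint8_t imm)
+ *   {
+ *     uint8_t out = 0;
+ *     int i;
+ *     for (i = 0; i < 8; i++)
+ *       {
+ *         uint8_t row = (uint8_t)(m >> (8 * (7 - i)));      // row l<i> of BM8X8
+ *         out |= (uint8_t)(__builtin_parity(row & x) << i); // GF(2) dot product
+ *       }
+ *     return out ^ imm;  // imm is the pre/post_filter_constant_* byte
+ *   }
+ *
+ * The *affineinvqb variants apply the GF(2^8) inverse (AES field, 0x11B) to x
+ * before this transform, which is how the sbox core is shared with AES.
+ */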
+#define SHUFB_BYTES(idx) \
+	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+
+_gcry_camellia_gfni_avx512__constants:
+ELF(.type _gcry_camellia_gfni_avx512__constants,@object;)
+
+.align 64
+.Lpack_bswap:
+	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
+	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
+	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
+	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
+
+.Lcounter0123_lo:
+	.quad 0, 0
+	.quad 1, 0
+	.quad 2, 0
+	.quad 3, 0
+
+.align 16
+.Lcounter4444_lo:
+	.quad 4, 0
+.Lcounter8888_lo:
+	.quad 8, 0
+.Lcounter16161616_lo:
+	.quad 16, 0
+.Lcounter1111_hi:
+	.quad 0, 1
+
+.Lshufb_16x16b:
+	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+.Lbyte_ones:
+	.byte 1, 1, 1, 1, 1, 1, 1, 1
+
+/* Pre-filters and post-filters bit-matrices for Camellia sboxes s1, s2, s3
+ * and s4.
+ * See http://urn.fi/URN:NBN:fi:oulu-201305311409, pages 43-48.
+ *
+ * Pre-filters are directly from above source, "θ₁"/"θ₄". Post-filters are
+ * a combination of function "A" (AES SubBytes affine transformation) and
+ * "ψ₁"/"ψ₂"/"ψ₃".
+ */
+
+/* Bit-matrix from "θ₁(x)" function: */
+.Lpre_filter_bitmatrix_s123:
+	.quad BM8X8(BV8(1, 1, 1, 0, 1, 1, 0, 1),
+		    BV8(0, 0, 1, 1, 0, 0, 1, 0),
+		    BV8(1, 1, 0, 1, 0, 0, 0, 0),
+		    BV8(1, 0, 1, 1, 0, 0, 1, 1),
+		    BV8(0, 0, 0, 0, 1, 1, 0, 0),
+		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
+		    BV8(0, 0, 1, 0, 1, 1, 0, 0),
+		    BV8(1, 0, 0, 0, 0, 1, 1, 0))
+
+/* Bit-matrix from "θ₄(x)" function: */
+.Lpre_filter_bitmatrix_s4:
+	.quad BM8X8(BV8(1, 1, 0, 1, 1, 0, 1, 1),
+		    BV8(0, 1, 1, 0, 0, 1, 0, 0),
+		    BV8(1, 0, 1, 0, 0, 0, 0, 1),
+		    BV8(0, 1, 1, 0, 0, 1, 1, 1),
+		    BV8(0, 0, 0, 1, 1, 0, 0, 0),
+		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
+		    BV8(0, 1, 0, 1, 1, 0, 0, 0),
+		    BV8(0, 0, 0, 0, 1, 1, 0, 1))
+
+/* Bit-matrix from "ψ₁(A(x))" function: */
+.Lpost_filter_bitmatrix_s14:
+	.quad BM8X8(BV8(0, 0, 0, 0, 0, 0, 0, 1),
+		    BV8(0, 1, 1, 0, 0, 1, 1, 0),
+		    BV8(1, 0, 1, 1, 1, 1, 1, 0),
+		    BV8(0, 0, 0, 1, 1, 0, 1, 1),
+		    BV8(1, 0, 0, 0, 1, 1, 1, 0),
+		    BV8(0, 1, 0, 1, 1, 1, 1, 0),
+		    BV8(0, 1, 1, 1, 1, 1, 1, 1),
+		    BV8(0, 0, 0, 1, 1, 1, 0, 0))
+
+/* Bit-matrix from "ψ₂(A(x))" function: */
+.Lpost_filter_bitmatrix_s2:
+	.quad BM8X8(BV8(0, 0, 0, 1, 1, 1, 0, 0),
+		    BV8(0, 0, 0, 0, 0, 0, 0, 1),
+		    BV8(0, 1, 1, 0, 0, 1, 1, 0),
+		    BV8(1, 0, 1, 1, 1, 1, 1, 0),
+		    BV8(0, 0, 0, 1, 1, 0, 1, 1),
+		    BV8(1, 0, 0, 0, 1, 1, 1, 0),
+		    BV8(0, 1, 0, 1, 1, 1, 1, 0),
+		    BV8(0, 1, 1, 1, 1, 1, 1, 1))
+
+/* Bit-matrix from "ψ₃(A(x))" function: */
+.Lpost_filter_bitmatrix_s3:
+	.quad BM8X8(BV8(0, 1, 1, 0, 0, 1, 1, 0),
+		    BV8(1, 0, 1, 1, 1, 1, 1, 0),
+		    BV8(0, 0, 0, 1, 1, 0, 1, 1),
+		    BV8(1, 0, 0, 0, 1, 1, 1, 0),
+		    BV8(0, 1, 0, 1, 1, 1, 1, 0),
+		    BV8(0, 1, 1, 1, 1, 1, 1, 1),
+		    BV8(0, 0, 0, 1, 1, 1, 0, 0),
+		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
+
+ELF(.size _gcry_camellia_gfni_avx512__constants,.-_gcry_camellia_gfni_avx512__constants;)
+
+.align 8
+ELF(.type __camellia_gfni_avx512_enc_blk64,@function;)
+
+__camellia_gfni_avx512_enc_blk64:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%r8d: 24 for 16 byte key, 32 for larger
+	 *	%zmm0..%zmm15: 64 plaintext blocks
+	 * output:
+	 *	%zmm0..%zmm15: 64 encrypted blocks, order swapped:
+	 *	 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+	 */
+
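+	/* Key schedule walk of the .Lenc_loop below, roughly (an illustrative
+	 * C sketch; six_rounds, fl_layer and output_whitening are hypothetical
+	 * helper names, k is a uint64_t pointer to key_table and r8 is 24 or
+	 * 32 as documented above):
+	 *
+	 *   uint64_t *end = k + (r8 - 8);   // start of the last 6-round block
+	 *   six_rounds(k + 2);              // enc_rounds64: keys k[2]..k[7]
+	 *   while (k != end)
+	 *     {
+	 *       k += 8;                     // 8 qwords of key per iteration
+	 *       fl_layer(k);                // fls64: kll/klr/krl/krr at k[0..1]
+	 *       six_rounds(k + 2);
+	 *     }
+	 *   output_whitening(k + 8);        // outunpack64: final whitening key
+	 */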
CFI_STARTPROC(); + + leaq (-8 * 8)(CTX, %r8, 8), %r8; + + inpack64_post(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7, + %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14, + %zmm15, mem_ab, mem_cd, %zmm30, %zmm31); + +.align 8 +.Lenc_loop: + enc_rounds64(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7, + %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14, + %zmm15, mem_ab, mem_cd, 0); + + cmpq %r8, CTX; + je .Lenc_done; + leaq (8 * 8)(CTX), CTX; + + fls64(mem_ab, %zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7, + mem_cd, %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14, + %zmm15, + ((key_table) + 0)(CTX), + ((key_table) + 4)(CTX), + ((key_table) + 8)(CTX), + ((key_table) + 12)(CTX), + %zmm31); + jmp .Lenc_loop; + +.align 8 +.Lenc_done: + /* load CD for output */ + vmovdqu64 mem_cd_0, %zmm8; + vmovdqu64 mem_cd_1, %zmm9; + vmovdqu64 mem_cd_2, %zmm10; + vmovdqu64 mem_cd_3, %zmm11; + vmovdqu64 mem_cd_4, %zmm12; + vmovdqu64 mem_cd_5, %zmm13; + vmovdqu64 mem_cd_6, %zmm14; + vmovdqu64 mem_cd_7, %zmm15; + + outunpack64(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7, + %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14, + %zmm15, ((key_table) + 8 * 8)(%r8), %zmm30, %zmm31); + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size __camellia_gfni_avx512_enc_blk64,.-__camellia_gfni_avx512_enc_blk64;) + +.align 8 +ELF(.type __camellia_gfni_avx512_dec_blk64,@function;) + +__camellia_gfni_avx512_dec_blk64: + /* input: + * %rdi: ctx, CTX + * %r8d: 24 for 16 byte key, 32 for larger + * %zmm0..%zmm15: 64 encrypted blocks + * output: + * %zmm0..%zmm15: 64 plaintext blocks, order swapped: + * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 + */ + CFI_STARTPROC(); + + movq %r8, %rcx; + movq CTX, %r8 + leaq (-8 * 8)(CTX, %rcx, 8), CTX; + + inpack64_post(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7, + %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14, + %zmm15, mem_ab, mem_cd, %zmm30, %zmm31); + +.align 8 +.Ldec_loop: + dec_rounds64(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7, + %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14, + %zmm15, mem_ab, mem_cd, 0); + + cmpq %r8, CTX; + je .Ldec_done; + + fls64(mem_ab, %zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7, + mem_cd, %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14, + %zmm15, + ((key_table) + 8)(CTX), + ((key_table) + 12)(CTX), + ((key_table) + 0)(CTX), + ((key_table) + 4)(CTX), + %zmm31); + + leaq (-8 * 8)(CTX), CTX; + jmp .Ldec_loop; + +.align 8 +.Ldec_done: + /* load CD for output */ + vmovdqu64 mem_cd_0, %zmm8; + vmovdqu64 mem_cd_1, %zmm9; + vmovdqu64 mem_cd_2, %zmm10; + vmovdqu64 mem_cd_3, %zmm11; + vmovdqu64 mem_cd_4, %zmm12; + vmovdqu64 mem_cd_5, %zmm13; + vmovdqu64 mem_cd_6, %zmm14; + vmovdqu64 mem_cd_7, %zmm15; + + outunpack64(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7, + %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14, + %zmm15, (key_table)(CTX), %zmm30, %zmm31); + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size __camellia_gfni_avx512_dec_blk64,.-__camellia_gfni_avx512_dec_blk64;) + +#define add_le128(out, in, lo_counter, hi_counter1) \ + vpaddq lo_counter, in, out; \ + vpcmpuq $1, lo_counter, out, %k1; \ + kaddb %k1, %k1, %k1; \ + vpaddq hi_counter1, out, out{%k1}; + +.align 8 +.globl _gcry_camellia_gfni_avx512_ctr_enc +ELF(.type _gcry_camellia_gfni_avx512_ctr_enc,@function;) + +_gcry_camellia_gfni_avx512_ctr_enc: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (64 blocks) + * %rdx: src (64 blocks) + * %rcx: iv (big endian, 128bit) + */ + CFI_STARTPROC(); + 
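+	/* Counter construction, roughly (illustrative; lo/hi denote the two
+	 * 64-bit halves of one little-endian 128-bit lane, inc the per-lane
+	 * increment loaded from the .Lcounter* constants):
+	 *
+	 *   out.lo = in.lo + inc.lo;
+	 *   carry  = (out.lo < inc.lo);  // vpcmpuq $1 = unsigned less-than
+	 *   out.hi = in.hi + carry;      // inc.hi is 0 here; the carry mask is
+	 *                                // moved to the hi-qword lane position
+	 *                                // by kaddb %k1, %k1, %k1
+	 *
+	 * The IV is byte-swapped to little endian first so that the fast path
+	 * below can use plain vpaddq; the add_le128() form above is only needed
+	 * when the low 64 bits may wrap within the next 64 blocks.
+	 */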
vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + + vbroadcasti64x2 .Lbswap128_mask rRIP, %zmm19; + vmovdqa64 .Lcounter0123_lo rRIP, %zmm21; + vbroadcasti64x2 .Lcounter4444_lo rRIP, %zmm22; + vbroadcasti64x2 .Lcounter8888_lo rRIP, %zmm23; + vbroadcasti64x2 .Lcounter16161616_lo rRIP, %zmm24; + vbroadcasti64x2 .Lcounter1111_hi rRIP, %zmm25; + + /* load IV and byteswap */ + movq 8(%rcx), %r11; + movq (%rcx), %r10; + bswapq %r11; + bswapq %r10; + vbroadcasti64x2 (%rcx), %zmm0; + vpshufb %zmm19, %zmm0, %zmm0; + + cmpl $128, key_bitlength(CTX); + movl $32, %r8d; + movl $24, %eax; + cmovel %eax, %r8d; /* max */ + + /* check need for handling 64-bit overflow and carry */ + cmpq $(0xffffffffffffffff - 64), %r11; + ja .Lload_ctr_carry; + + /* construct IVs */ + vpaddq %zmm21, %zmm0, %zmm15; /* +0:+1:+2:+3 */ + vpaddq %zmm22, %zmm15, %zmm14; /* +4:+5:+6:+7 */ + vpaddq %zmm23, %zmm15, %zmm13; /* +8:+9:+10:+11 */ + vpaddq %zmm23, %zmm14, %zmm12; /* +12:+13:+14:+15 */ + vpaddq %zmm24, %zmm15, %zmm11; /* +16... */ + vpaddq %zmm24, %zmm14, %zmm10; /* +20... */ + vpaddq %zmm24, %zmm13, %zmm9; /* +24... */ + vpaddq %zmm24, %zmm12, %zmm8; /* +28... */ + vpaddq %zmm24, %zmm11, %zmm7; /* +32... */ + vpaddq %zmm24, %zmm10, %zmm6; /* +36... */ + vpaddq %zmm24, %zmm9, %zmm5; /* +40... */ + vpaddq %zmm24, %zmm8, %zmm4; /* +44... */ + vpaddq %zmm24, %zmm7, %zmm3; /* +48... */ + vpaddq %zmm24, %zmm6, %zmm2; /* +52... */ + vpaddq %zmm24, %zmm5, %zmm1; /* +56... */ + vpaddq %zmm24, %zmm4, %zmm0; /* +60... */ + jmp .Lload_ctr_done; + +.align 4 +.Lload_ctr_carry: + /* construct IVs */ + add_le128(%zmm15, %zmm0, %zmm21, %zmm25); /* +0:+1:+2:+3 */ + add_le128(%zmm14, %zmm15, %zmm22, %zmm25); /* +4:+5:+6:+7 */ + add_le128(%zmm13, %zmm15, %zmm23, %zmm25); /* +8:+9:+10:+11 */ + add_le128(%zmm12, %zmm14, %zmm23, %zmm25); /* +12:+13:+14:+15 */ + add_le128(%zmm11, %zmm15, %zmm24, %zmm25); /* +16... */ + add_le128(%zmm10, %zmm14, %zmm24, %zmm25); /* +20... */ + add_le128(%zmm9, %zmm13, %zmm24, %zmm25); /* +24... */ + add_le128(%zmm8, %zmm12, %zmm24, %zmm25); /* +28... */ + add_le128(%zmm7, %zmm11, %zmm24, %zmm25); /* +32... */ + add_le128(%zmm6, %zmm10, %zmm24, %zmm25); /* +36... */ + add_le128(%zmm5, %zmm9, %zmm24, %zmm25); /* +40... */ + add_le128(%zmm4, %zmm8, %zmm24, %zmm25); /* +44... */ + add_le128(%zmm3, %zmm7, %zmm24, %zmm25); /* +48... */ + add_le128(%zmm2, %zmm6, %zmm24, %zmm25); /* +52... */ + add_le128(%zmm1, %zmm5, %zmm24, %zmm25); /* +56... */ + add_le128(%zmm0, %zmm4, %zmm24, %zmm25); /* +60... */ + +.align 4 +.Lload_ctr_done: + vpbroadcastq (key_table)(CTX), %zmm16; + vpshufb .Lpack_bswap rRIP, %zmm16, %zmm16; + + /* Byte-swap IVs and update counter. 
*/ + addq $64, %r11; + adcq $0, %r10; + vpshufb %zmm19, %zmm15, %zmm15; + vpshufb %zmm19, %zmm14, %zmm14; + vpshufb %zmm19, %zmm13, %zmm13; + vpshufb %zmm19, %zmm12, %zmm12; + vpshufb %zmm19, %zmm11, %zmm11; + vpshufb %zmm19, %zmm10, %zmm10; + vpshufb %zmm19, %zmm9, %zmm9; + vpshufb %zmm19, %zmm8, %zmm8; + bswapq %r11; + bswapq %r10; + vpshufb %zmm19, %zmm7, %zmm7; + vpshufb %zmm19, %zmm6, %zmm6; + vpshufb %zmm19, %zmm5, %zmm5; + vpshufb %zmm19, %zmm4, %zmm4; + vpshufb %zmm19, %zmm3, %zmm3; + vpshufb %zmm19, %zmm2, %zmm2; + vpshufb %zmm19, %zmm1, %zmm1; + vpshufb %zmm19, %zmm0, %zmm0; + movq %r11, 8(%rcx); + movq %r10, (%rcx); + + /* inpack64_pre: */ + vpxorq %zmm0, %zmm16, %zmm0; + vpxorq %zmm1, %zmm16, %zmm1; + vpxorq %zmm2, %zmm16, %zmm2; + vpxorq %zmm3, %zmm16, %zmm3; + vpxorq %zmm4, %zmm16, %zmm4; + vpxorq %zmm5, %zmm16, %zmm5; + vpxorq %zmm6, %zmm16, %zmm6; + vpxorq %zmm7, %zmm16, %zmm7; + vpxorq %zmm8, %zmm16, %zmm8; + vpxorq %zmm9, %zmm16, %zmm9; + vpxorq %zmm10, %zmm16, %zmm10; + vpxorq %zmm11, %zmm16, %zmm11; + vpxorq %zmm12, %zmm16, %zmm12; + vpxorq %zmm13, %zmm16, %zmm13; + vpxorq %zmm14, %zmm16, %zmm14; + vpxorq %zmm15, %zmm16, %zmm15; + + call __camellia_gfni_avx512_enc_blk64; + + vpxorq 0 * 64(%rdx), %zmm7, %zmm7; + vpxorq 1 * 64(%rdx), %zmm6, %zmm6; + vpxorq 2 * 64(%rdx), %zmm5, %zmm5; + vpxorq 3 * 64(%rdx), %zmm4, %zmm4; + vpxorq 4 * 64(%rdx), %zmm3, %zmm3; + vpxorq 5 * 64(%rdx), %zmm2, %zmm2; + vpxorq 6 * 64(%rdx), %zmm1, %zmm1; + vpxorq 7 * 64(%rdx), %zmm0, %zmm0; + vpxorq 8 * 64(%rdx), %zmm15, %zmm15; + vpxorq 9 * 64(%rdx), %zmm14, %zmm14; + vpxorq 10 * 64(%rdx), %zmm13, %zmm13; + vpxorq 11 * 64(%rdx), %zmm12, %zmm12; + vpxorq 12 * 64(%rdx), %zmm11, %zmm11; + vpxorq 13 * 64(%rdx), %zmm10, %zmm10; + vpxorq 14 * 64(%rdx), %zmm9, %zmm9; + vpxorq 15 * 64(%rdx), %zmm8, %zmm8; + + write_output(%zmm7, %zmm6, %zmm5, %zmm4, %zmm3, %zmm2, %zmm1, %zmm0, + %zmm15, %zmm14, %zmm13, %zmm12, %zmm11, %zmm10, %zmm9, + %zmm8, %rsi); + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_camellia_gfni_avx512_ctr_enc,.-_gcry_camellia_gfni_avx512_ctr_enc;) + +.align 8 +.globl _gcry_camellia_gfni_avx512_cbc_dec +ELF(.type _gcry_camellia_gfni_avx512_cbc_dec,@function;) + +_gcry_camellia_gfni_avx512_cbc_dec: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (64 blocks) + * %rdx: src (64 blocks) + * %rcx: iv + */ + CFI_STARTPROC(); + vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + + movq %rcx, %r9; + + cmpl $128, key_bitlength(CTX); + movl $32, %r8d; + movl $24, %eax; + cmovel %eax, %r8d; /* max */ + + inpack64_pre(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7, + %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14, + %zmm15, %rdx, (key_table)(CTX, %r8, 8)); + + call __camellia_gfni_avx512_dec_blk64; + + /* XOR output with IV */ + vmovdqu64 (%r9), %xmm16; + vinserti64x2 $1, (0 * 16)(%rdx), %ymm16, %ymm16; + vinserti64x4 $1, (1 * 16)(%rdx), %zmm16, %zmm16; + vpxorq %zmm16, %zmm7, %zmm7; + vpxorq (0 * 64 + 48)(%rdx), %zmm6, %zmm6; + vpxorq (1 * 64 + 48)(%rdx), %zmm5, %zmm5; + vpxorq (2 * 64 + 48)(%rdx), %zmm4, %zmm4; + vpxorq (3 * 64 + 48)(%rdx), %zmm3, %zmm3; + vpxorq (4 * 64 + 48)(%rdx), %zmm2, %zmm2; + vpxorq (5 * 64 + 48)(%rdx), %zmm1, %zmm1; + vpxorq (6 * 64 + 48)(%rdx), %zmm0, %zmm0; + vpxorq (7 * 64 + 48)(%rdx), %zmm15, %zmm15; + vpxorq (8 * 64 + 48)(%rdx), %zmm14, %zmm14; + vpxorq (9 * 64 + 48)(%rdx), %zmm13, %zmm13; + vpxorq (10 * 64 + 48)(%rdx), %zmm12, %zmm12; + vpxorq (11 * 64 + 48)(%rdx), %zmm11, %zmm11; + vpxorq (12 * 64 + 48)(%rdx), %zmm10, %zmm10; + vpxorq (13 * 64 
+ 48)(%rdx), %zmm9, %zmm9; + vpxorq (14 * 64 + 48)(%rdx), %zmm8, %zmm8; + vmovdqu64 (15 * 64 + 48)(%rdx), %xmm16; + + write_output(%zmm7, %zmm6, %zmm5, %zmm4, %zmm3, %zmm2, %zmm1, %zmm0, + %zmm15, %zmm14, %zmm13, %zmm12, %zmm11, %zmm10, %zmm9, + %zmm8, %rsi); + + /* store new IV */ + vmovdqu64 %xmm16, (0)(%r9); + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_camellia_gfni_avx512_cbc_dec,.-_gcry_camellia_gfni_avx512_cbc_dec;) + +.align 8 +.globl _gcry_camellia_gfni_avx512_cfb_dec +ELF(.type _gcry_camellia_gfni_avx512_cfb_dec,@function;) + +_gcry_camellia_gfni_avx512_cfb_dec: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (32 blocks) + * %rdx: src (32 blocks) + * %rcx: iv + */ + CFI_STARTPROC(); + vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + + cmpl $128, key_bitlength(CTX); + movl $32, %r8d; + movl $24, %eax; + cmovel %eax, %r8d; /* max */ + + /* inpack64_pre: */ + vpbroadcastq (key_table)(CTX), %zmm0; + vpshufb .Lpack_bswap rRIP, %zmm0, %zmm0; + vmovdqu64 (%rcx), %xmm15; + vinserti64x2 $1, (%rdx), %ymm15, %ymm15; + vinserti64x4 $1, 16(%rdx), %zmm15, %zmm15; + vpxorq %zmm15, %zmm0, %zmm15; + vpxorq (0 * 64 + 48)(%rdx), %zmm0, %zmm14; + vpxorq (1 * 64 + 48)(%rdx), %zmm0, %zmm13; + vpxorq (2 * 64 + 48)(%rdx), %zmm0, %zmm12; + vpxorq (3 * 64 + 48)(%rdx), %zmm0, %zmm11; + vpxorq (4 * 64 + 48)(%rdx), %zmm0, %zmm10; + vpxorq (5 * 64 + 48)(%rdx), %zmm0, %zmm9; + vpxorq (6 * 64 + 48)(%rdx), %zmm0, %zmm8; + vpxorq (7 * 64 + 48)(%rdx), %zmm0, %zmm7; + vpxorq (8 * 64 + 48)(%rdx), %zmm0, %zmm6; + vpxorq (9 * 64 + 48)(%rdx), %zmm0, %zmm5; + vpxorq (10 * 64 + 48)(%rdx), %zmm0, %zmm4; + vpxorq (11 * 64 + 48)(%rdx), %zmm0, %zmm3; + vpxorq (12 * 64 + 48)(%rdx), %zmm0, %zmm2; + vpxorq (13 * 64 + 48)(%rdx), %zmm0, %zmm1; + vpxorq (14 * 64 + 48)(%rdx), %zmm0, %zmm0; + vmovdqu64 (15 * 64 + 48)(%rdx), %xmm16; + vmovdqu64 %xmm16, (%rcx); /* store new IV */ + + call __camellia_gfni_avx512_enc_blk64; + + vpxorq 0 * 64(%rdx), %zmm7, %zmm7; + vpxorq 1 * 64(%rdx), %zmm6, %zmm6; + vpxorq 2 * 64(%rdx), %zmm5, %zmm5; + vpxorq 3 * 64(%rdx), %zmm4, %zmm4; + vpxorq 4 * 64(%rdx), %zmm3, %zmm3; + vpxorq 5 * 64(%rdx), %zmm2, %zmm2; + vpxorq 6 * 64(%rdx), %zmm1, %zmm1; + vpxorq 7 * 64(%rdx), %zmm0, %zmm0; + vpxorq 8 * 64(%rdx), %zmm15, %zmm15; + vpxorq 9 * 64(%rdx), %zmm14, %zmm14; + vpxorq 10 * 64(%rdx), %zmm13, %zmm13; + vpxorq 11 * 64(%rdx), %zmm12, %zmm12; + vpxorq 12 * 64(%rdx), %zmm11, %zmm11; + vpxorq 13 * 64(%rdx), %zmm10, %zmm10; + vpxorq 14 * 64(%rdx), %zmm9, %zmm9; + vpxorq 15 * 64(%rdx), %zmm8, %zmm8; + + write_output(%zmm7, %zmm6, %zmm5, %zmm4, %zmm3, %zmm2, %zmm1, %zmm0, + %zmm15, %zmm14, %zmm13, %zmm12, %zmm11, %zmm10, %zmm9, + %zmm8, %rsi); + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_camellia_gfni_avx512_cfb_dec,.-_gcry_camellia_gfni_avx512_cfb_dec;) + +.align 8 +.globl _gcry_camellia_gfni_avx512_ocb_enc +ELF(.type _gcry_camellia_gfni_avx512_ocb_enc,@function;) + +_gcry_camellia_gfni_avx512_ocb_enc: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (64 blocks) + * %rdx: src (64 blocks) + * %rcx: offset + * %r8 : checksum + * %r9 : L pointers (void *L[64]) + */ + CFI_STARTPROC(); + vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + + pushq %r12; + CFI_PUSH(%r12); + pushq %r13; + CFI_PUSH(%r13); + pushq %r14; + CFI_PUSH(%r14); + pushq %r15; + CFI_PUSH(%r15); + pushq %rbx; + CFI_PUSH(%rbx); + + vmovdqu64 (%rcx), %xmm30; + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* Checksum_i = Checksum_{i-1} xor P_i */ + /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ + +#define 
OCB_INPUT(n, l0reg, l1reg, l2reg, l3reg, zreg, zplain) \ + vmovdqu64 (n * 64)(%rdx), zplain; \ + vpxorq (l0reg), %xmm30, %xmm16; \ + vpxorq (l1reg), %xmm16, %xmm30; \ + vinserti64x2 $1, %xmm30, %ymm16, %ymm16; \ + vpxorq (l2reg), %xmm30, %xmm30; \ + vinserti64x2 $2, %xmm30, %zmm16, %zmm16; \ + vpxorq (l3reg), %xmm30, %xmm30; \ + vinserti64x2 $3, %xmm30, %zmm16, %zmm16; \ + vpxorq zplain, %zmm31, %zmm31; \ + vpxorq zplain, %zmm16, zreg; \ + vmovdqu64 %zmm16, (n * 64)(%rsi); + +#define OCB_LOAD_PTRS(n) \ + movq ((n * 4 * 8) + (0 * 8))(%r9), %r10; \ + movq ((n * 4 * 8) + (1 * 8))(%r9), %r11; \ + movq ((n * 4 * 8) + (2 * 8))(%r9), %r12; \ + movq ((n * 4 * 8) + (3 * 8))(%r9), %r13; \ + movq ((n * 4 * 8) + (4 * 8))(%r9), %r14; \ + movq ((n * 4 * 8) + (5 * 8))(%r9), %r15; \ + movq ((n * 4 * 8) + (6 * 8))(%r9), %rax; \ + movq ((n * 4 * 8) + (7 * 8))(%r9), %rbx; + + OCB_LOAD_PTRS(0); + OCB_INPUT(0, %r10, %r11, %r12, %r13, %zmm15, %zmm20); + OCB_INPUT(1, %r14, %r15, %rax, %rbx, %zmm14, %zmm21); + OCB_LOAD_PTRS(2); + OCB_INPUT(2, %r10, %r11, %r12, %r13, %zmm13, %zmm22); + vpternlogq $0x96, %zmm20, %zmm21, %zmm22; + OCB_INPUT(3, %r14, %r15, %rax, %rbx, %zmm12, %zmm23); + OCB_LOAD_PTRS(4); + OCB_INPUT(4, %r10, %r11, %r12, %r13, %zmm11, %zmm24); + OCB_INPUT(5, %r14, %r15, %rax, %rbx, %zmm10, %zmm25); + vpternlogq $0x96, %zmm23, %zmm24, %zmm25; + OCB_LOAD_PTRS(6); + OCB_INPUT(6, %r10, %r11, %r12, %r13, %zmm9, %zmm20); + OCB_INPUT(7, %r14, %r15, %rax, %rbx, %zmm8, %zmm21); + OCB_LOAD_PTRS(8); + OCB_INPUT(8, %r10, %r11, %r12, %r13, %zmm7, %zmm26); + vpternlogq $0x96, %zmm20, %zmm21, %zmm26; + OCB_INPUT(9, %r14, %r15, %rax, %rbx, %zmm6, %zmm23); + OCB_LOAD_PTRS(10); + OCB_INPUT(10, %r10, %r11, %r12, %r13, %zmm5, %zmm24); + OCB_INPUT(11, %r14, %r15, %rax, %rbx, %zmm4, %zmm27); + vpternlogq $0x96, %zmm23, %zmm24, %zmm27; + OCB_LOAD_PTRS(12); + OCB_INPUT(12, %r10, %r11, %r12, %r13, %zmm3, %zmm20); + OCB_INPUT(13, %r14, %r15, %rax, %rbx, %zmm2, %zmm21); + OCB_LOAD_PTRS(14); + OCB_INPUT(14, %r10, %r11, %r12, %r13, %zmm1, %zmm23); + vpternlogq $0x96, %zmm20, %zmm21, %zmm23; + OCB_INPUT(15, %r14, %r15, %rax, %rbx, %zmm0, %zmm24); +#undef OCB_LOAD_PTRS +#undef OCB_INPUT + + vpbroadcastq (key_table)(CTX), %zmm16; + vpshufb .Lpack_bswap rRIP, %zmm16, %zmm16; + + vpternlogq $0x96, %zmm24, %zmm22, %zmm25; + vpternlogq $0x96, %zmm26, %zmm27, %zmm23; + vpxorq %zmm25, %zmm23, %zmm20; + vextracti64x4 $1, %zmm20, %ymm21; + vpxorq %ymm21, %ymm20, %ymm20; + vextracti64x2 $1, %ymm20, %xmm21; + vpternlogq $0x96, (%r8), %xmm21, %xmm20; + vmovdqu64 %xmm30, (%rcx); + vmovdqu64 %xmm20, (%r8); + + cmpl $128, key_bitlength(CTX); + movl $32, %r8d; + movl $24, %eax; + cmovel %eax, %r8d; /* max */ + + /* inpack64_pre: */ + vpxorq %zmm0, %zmm16, %zmm0; + vpxorq %zmm1, %zmm16, %zmm1; + vpxorq %zmm2, %zmm16, %zmm2; + vpxorq %zmm3, %zmm16, %zmm3; + vpxorq %zmm4, %zmm16, %zmm4; + vpxorq %zmm5, %zmm16, %zmm5; + vpxorq %zmm6, %zmm16, %zmm6; + vpxorq %zmm7, %zmm16, %zmm7; + vpxorq %zmm8, %zmm16, %zmm8; + vpxorq %zmm9, %zmm16, %zmm9; + vpxorq %zmm10, %zmm16, %zmm10; + vpxorq %zmm11, %zmm16, %zmm11; + vpxorq %zmm12, %zmm16, %zmm12; + vpxorq %zmm13, %zmm16, %zmm13; + vpxorq %zmm14, %zmm16, %zmm14; + vpxorq %zmm15, %zmm16, %zmm15; + + call __camellia_gfni_avx512_enc_blk64; + + vpxorq 0 * 64(%rsi), %zmm7, %zmm7; + vpxorq 1 * 64(%rsi), %zmm6, %zmm6; + vpxorq 2 * 64(%rsi), %zmm5, %zmm5; + vpxorq 3 * 64(%rsi), %zmm4, %zmm4; + vpxorq 4 * 64(%rsi), %zmm3, %zmm3; + vpxorq 5 * 64(%rsi), %zmm2, %zmm2; + vpxorq 6 * 64(%rsi), %zmm1, %zmm1; + vpxorq 7 * 
64(%rsi), %zmm0, %zmm0; + vpxorq 8 * 64(%rsi), %zmm15, %zmm15; + vpxorq 9 * 64(%rsi), %zmm14, %zmm14; + vpxorq 10 * 64(%rsi), %zmm13, %zmm13; + vpxorq 11 * 64(%rsi), %zmm12, %zmm12; + vpxorq 12 * 64(%rsi), %zmm11, %zmm11; + vpxorq 13 * 64(%rsi), %zmm10, %zmm10; + vpxorq 14 * 64(%rsi), %zmm9, %zmm9; + vpxorq 15 * 64(%rsi), %zmm8, %zmm8; + + write_output(%zmm7, %zmm6, %zmm5, %zmm4, %zmm3, %zmm2, %zmm1, %zmm0, + %zmm15, %zmm14, %zmm13, %zmm12, %zmm11, %zmm10, %zmm9, + %zmm8, %rsi); + + popq %rbx; + CFI_RESTORE(%rbx); + popq %r15; + CFI_RESTORE(%r15); + popq %r14; + CFI_RESTORE(%r14); + popq %r13; + CFI_RESTORE(%r12); + popq %r12; + CFI_RESTORE(%r13); + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_camellia_gfni_avx512_ocb_enc,.-_gcry_camellia_gfni_avx512_ocb_enc;) + +.align 8 +.globl _gcry_camellia_gfni_avx512_ocb_dec +ELF(.type _gcry_camellia_gfni_avx512_ocb_dec,@function;) + +_gcry_camellia_gfni_avx512_ocb_dec: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (64 blocks) + * %rdx: src (64 blocks) + * %rcx: offset + * %r8 : checksum + * %r9 : L pointers (void *L[64]) + */ + CFI_STARTPROC(); + vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + + pushq %r12; + CFI_PUSH(%r12); + pushq %r13; + CFI_PUSH(%r13); + pushq %r14; + CFI_PUSH(%r14); + pushq %r15; + CFI_PUSH(%r15); + pushq %rbx; + CFI_PUSH(%rbx); + pushq %r8; + CFI_PUSH(%r8); + + vmovdqu64 (%rcx), %xmm30; + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* C_i = Offset_i xor DECIPHER(K, P_i xor Offset_i) */ + +#define OCB_INPUT(n, l0reg, l1reg, l2reg, l3reg, zreg) \ + vpxorq (l0reg), %xmm30, %xmm16; \ + vpxorq (l1reg), %xmm16, %xmm30; \ + vinserti64x2 $1, %xmm30, %ymm16, %ymm16; \ + vpxorq (l2reg), %xmm30, %xmm30; \ + vinserti64x2 $2, %xmm30, %zmm16, %zmm16; \ + vpxorq (l3reg), %xmm30, %xmm30; \ + vinserti64x2 $3, %xmm30, %zmm16, %zmm16; \ + vpxorq (n * 64)(%rdx), %zmm16, zreg; \ + vmovdqu64 %zmm16, (n * 64)(%rsi); + +#define OCB_LOAD_PTRS(n) \ + movq ((n * 4 * 8) + (0 * 8))(%r9), %r10; \ + movq ((n * 4 * 8) + (1 * 8))(%r9), %r11; \ + movq ((n * 4 * 8) + (2 * 8))(%r9), %r12; \ + movq ((n * 4 * 8) + (3 * 8))(%r9), %r13; \ + movq ((n * 4 * 8) + (4 * 8))(%r9), %r14; \ + movq ((n * 4 * 8) + (5 * 8))(%r9), %r15; \ + movq ((n * 4 * 8) + (6 * 8))(%r9), %rax; \ + movq ((n * 4 * 8) + (7 * 8))(%r9), %rbx; + + OCB_LOAD_PTRS(0); + OCB_INPUT(0, %r10, %r11, %r12, %r13, %zmm15); + OCB_INPUT(1, %r14, %r15, %rax, %rbx, %zmm14); + OCB_LOAD_PTRS(2); + OCB_INPUT(2, %r10, %r11, %r12, %r13, %zmm13); + OCB_INPUT(3, %r14, %r15, %rax, %rbx, %zmm12); + OCB_LOAD_PTRS(4); + OCB_INPUT(4, %r10, %r11, %r12, %r13, %zmm11); + OCB_INPUT(5, %r14, %r15, %rax, %rbx, %zmm10); + OCB_LOAD_PTRS(6); + OCB_INPUT(6, %r10, %r11, %r12, %r13, %zmm9); + OCB_INPUT(7, %r14, %r15, %rax, %rbx, %zmm8); + OCB_LOAD_PTRS(8); + OCB_INPUT(8, %r10, %r11, %r12, %r13, %zmm7); + OCB_INPUT(9, %r14, %r15, %rax, %rbx, %zmm6); + OCB_LOAD_PTRS(10); + OCB_INPUT(10, %r10, %r11, %r12, %r13, %zmm5); + OCB_INPUT(11, %r14, %r15, %rax, %rbx, %zmm4); + OCB_LOAD_PTRS(12); + OCB_INPUT(12, %r10, %r11, %r12, %r13, %zmm3); + OCB_INPUT(13, %r14, %r15, %rax, %rbx, %zmm2); + OCB_LOAD_PTRS(14); + OCB_INPUT(14, %r10, %r11, %r12, %r13, %zmm1); + OCB_INPUT(15, %r14, %r15, %rax, %rbx, %zmm0); +#undef OCB_LOAD_PTRS +#undef OCB_INPUT + + vmovdqu64 %xmm30, (%rcx); + + cmpl $128, key_bitlength(CTX); + movl $32, %r8d; + movl $24, %eax; + cmovel %eax, %r8d; /* max */ + + vpbroadcastq (key_table)(CTX, %r8, 8), %zmm16; + vpshufb .Lpack_bswap rRIP, %zmm16, %zmm16; + + /* inpack64_pre: */ + vpxorq %zmm0, %zmm16, %zmm0; + 
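+	/* The whitening key above was broadcast from the end of the key
+	 * schedule, (key_table)(CTX, %r8, 8), because decryption walks the
+	 * subkeys in reverse; the remaining registers below receive the same
+	 * pre-whitening before __camellia_gfni_avx512_dec_blk64 is called. */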
vpxorq %zmm1, %zmm16, %zmm1; + vpxorq %zmm2, %zmm16, %zmm2; + vpxorq %zmm3, %zmm16, %zmm3; + vpxorq %zmm4, %zmm16, %zmm4; + vpxorq %zmm5, %zmm16, %zmm5; + vpxorq %zmm6, %zmm16, %zmm6; + vpxorq %zmm7, %zmm16, %zmm7; + vpxorq %zmm8, %zmm16, %zmm8; + vpxorq %zmm9, %zmm16, %zmm9; + vpxorq %zmm10, %zmm16, %zmm10; + vpxorq %zmm11, %zmm16, %zmm11; + vpxorq %zmm12, %zmm16, %zmm12; + vpxorq %zmm13, %zmm16, %zmm13; + vpxorq %zmm14, %zmm16, %zmm14; + vpxorq %zmm15, %zmm16, %zmm15; + + call __camellia_gfni_avx512_dec_blk64; + + vpxorq 0 * 64(%rsi), %zmm7, %zmm7; + vpxorq 1 * 64(%rsi), %zmm6, %zmm6; + vpxorq 2 * 64(%rsi), %zmm5, %zmm5; + vpxorq 3 * 64(%rsi), %zmm4, %zmm4; + vpxorq 4 * 64(%rsi), %zmm3, %zmm3; + vpxorq 5 * 64(%rsi), %zmm2, %zmm2; + vpxorq 6 * 64(%rsi), %zmm1, %zmm1; + vpxorq 7 * 64(%rsi), %zmm0, %zmm0; + vpxorq 8 * 64(%rsi), %zmm15, %zmm15; + vpxorq 9 * 64(%rsi), %zmm14, %zmm14; + vpxorq 10 * 64(%rsi), %zmm13, %zmm13; + vpxorq 11 * 64(%rsi), %zmm12, %zmm12; + vpxorq 12 * 64(%rsi), %zmm11, %zmm11; + vpxorq 13 * 64(%rsi), %zmm10, %zmm10; + vpxorq 14 * 64(%rsi), %zmm9, %zmm9; + vpxorq 15 * 64(%rsi), %zmm8, %zmm8; + + write_output(%zmm7, %zmm6, %zmm5, %zmm4, %zmm3, %zmm2, %zmm1, %zmm0, + %zmm15, %zmm14, %zmm13, %zmm12, %zmm11, %zmm10, %zmm9, + %zmm8, %rsi); + + popq %r8; + CFI_RESTORE(%r8); + + /* Checksum_i = Checksum_{i-1} xor C_i */ + vpternlogq $0x96, %zmm7, %zmm6, %zmm5; + vpternlogq $0x96, %zmm4, %zmm3, %zmm2; + vpternlogq $0x96, %zmm1, %zmm0, %zmm15; + vpternlogq $0x96, %zmm14, %zmm13, %zmm12; + vpternlogq $0x96, %zmm11, %zmm10, %zmm9; + vpternlogq $0x96, %zmm5, %zmm2, %zmm15; + vpternlogq $0x96, %zmm12, %zmm9, %zmm8; + vpxorq %zmm15, %zmm8, %zmm8; + + vextracti64x4 $1, %zmm8, %ymm0; + vpxor %ymm0, %ymm8, %ymm8; + vextracti128 $1, %ymm8, %xmm0; + vpternlogq $0x96, (%r8), %xmm0, %xmm8; + vmovdqu64 %xmm8, (%r8); + + popq %rbx; + CFI_RESTORE(%rbx); + popq %r15; + CFI_RESTORE(%r15); + popq %r14; + CFI_RESTORE(%r14); + popq %r13; + CFI_RESTORE(%r13); + popq %r12; + CFI_RESTORE(%r12); + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_camellia_gfni_avx512_ocb_dec,.-_gcry_camellia_gfni_avx512_ocb_dec;) + +.align 8 +.globl _gcry_camellia_gfni_avx512_enc_blk64 +ELF(.type _gcry_camellia_gfni_avx512_enc_blk64,@function;) + +_gcry_camellia_gfni_avx512_enc_blk64: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (64 blocks) + * %rdx: src (64 blocks) + */ + CFI_STARTPROC(); + vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + + cmpl $128, key_bitlength(CTX); + movl $32, %r8d; + movl $24, %eax; + cmovel %eax, %r8d; /* max */ + xorl %eax, %eax; + + vpbroadcastq (key_table)(CTX), %zmm0; + vpshufb .Lpack_bswap rRIP, %zmm0, %zmm0; + + vpxorq (0) * 64(%rdx), %zmm0, %zmm15; + vpxorq (1) * 64(%rdx), %zmm0, %zmm14; + vpxorq (2) * 64(%rdx), %zmm0, %zmm13; + vpxorq (3) * 64(%rdx), %zmm0, %zmm12; + vpxorq (4) * 64(%rdx), %zmm0, %zmm11; + vpxorq (5) * 64(%rdx), %zmm0, %zmm10; + vpxorq (6) * 64(%rdx), %zmm0, %zmm9; + vpxorq (7) * 64(%rdx), %zmm0, %zmm8; + vpxorq (8) * 64(%rdx), %zmm0, %zmm7; + vpxorq (9) * 64(%rdx), %zmm0, %zmm6; + vpxorq (10) * 64(%rdx), %zmm0, %zmm5; + vpxorq (11) * 64(%rdx), %zmm0, %zmm4; + vpxorq (12) * 64(%rdx), %zmm0, %zmm3; + vpxorq (13) * 64(%rdx), %zmm0, %zmm2; + vpxorq (14) * 64(%rdx), %zmm0, %zmm1; + vpxorq (15) * 64(%rdx), %zmm0, %zmm0; + + call __camellia_gfni_avx512_enc_blk64; + + vmovdqu64 %zmm7, (0) * 64(%rsi); + vmovdqu64 %zmm6, (1) * 64(%rsi); + vmovdqu64 %zmm5, (2) * 64(%rsi); + vmovdqu64 %zmm4, (3) * 64(%rsi); + vmovdqu64 %zmm3, (4) * 64(%rsi); + vmovdqu64 %zmm2, (5) *
64(%rsi); + vmovdqu64 %zmm1, (6) * 64(%rsi); + vmovdqu64 %zmm0, (7) * 64(%rsi); + vmovdqu64 %zmm15, (8) * 64(%rsi); + vmovdqu64 %zmm14, (9) * 64(%rsi); + vmovdqu64 %zmm13, (10) * 64(%rsi); + vmovdqu64 %zmm12, (11) * 64(%rsi); + vmovdqu64 %zmm11, (12) * 64(%rsi); + vmovdqu64 %zmm10, (13) * 64(%rsi); + vmovdqu64 %zmm9, (14) * 64(%rsi); + vmovdqu64 %zmm8, (15) * 64(%rsi); + + clear_regs(); + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_camellia_gfni_avx512_enc_blk64,.-_gcry_camellia_gfni_avx512_enc_blk64;) + +.align 8 +.globl _gcry_camellia_gfni_avx512_dec_blk64 +ELF(.type _gcry_camellia_gfni_avx512_dec_blk64,@function;) + +_gcry_camellia_gfni_avx512_dec_blk64: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (64 blocks) + * %rdx: src (64 blocks) + */ + CFI_STARTPROC(); + vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + + cmpl $128, key_bitlength(CTX); + movl $32, %r8d; + movl $24, %eax; + cmovel %eax, %r8d; /* max */ + xorl %eax, %eax; + + vpbroadcastq (key_table)(CTX, %r8, 8), %zmm0; + vpshufb .Lpack_bswap rRIP, %zmm0, %zmm0; + + vpxorq (0) * 64(%rdx), %zmm0, %zmm15; + vpxorq (1) * 64(%rdx), %zmm0, %zmm14; + vpxorq (2) * 64(%rdx), %zmm0, %zmm13; + vpxorq (3) * 64(%rdx), %zmm0, %zmm12; + vpxorq (4) * 64(%rdx), %zmm0, %zmm11; + vpxorq (5) * 64(%rdx), %zmm0, %zmm10; + vpxorq (6) * 64(%rdx), %zmm0, %zmm9; + vpxorq (7) * 64(%rdx), %zmm0, %zmm8; + vpxorq (8) * 64(%rdx), %zmm0, %zmm7; + vpxorq (9) * 64(%rdx), %zmm0, %zmm6; + vpxorq (10) * 64(%rdx), %zmm0, %zmm5; + vpxorq (11) * 64(%rdx), %zmm0, %zmm4; + vpxorq (12) * 64(%rdx), %zmm0, %zmm3; + vpxorq (13) * 64(%rdx), %zmm0, %zmm2; + vpxorq (14) * 64(%rdx), %zmm0, %zmm1; + vpxorq (15) * 64(%rdx), %zmm0, %zmm0; + + call __camellia_gfni_avx512_dec_blk64; + + vmovdqu64 %zmm7, (0) * 64(%rsi); + vmovdqu64 %zmm6, (1) * 64(%rsi); + vmovdqu64 %zmm5, (2) * 64(%rsi); + vmovdqu64 %zmm4, (3) * 64(%rsi); + vmovdqu64 %zmm3, (4) * 64(%rsi); + vmovdqu64 %zmm2, (5) * 64(%rsi); + vmovdqu64 %zmm1, (6) * 64(%rsi); + vmovdqu64 %zmm0, (7) * 64(%rsi); + vmovdqu64 %zmm15, (8) * 64(%rsi); + vmovdqu64 %zmm14, (9) * 64(%rsi); + vmovdqu64 %zmm13, (10) * 64(%rsi); + vmovdqu64 %zmm12, (11) * 64(%rsi); + vmovdqu64 %zmm11, (12) * 64(%rsi); + vmovdqu64 %zmm10, (13) * 64(%rsi); + vmovdqu64 %zmm9, (14) * 64(%rsi); + vmovdqu64 %zmm8, (15) * 64(%rsi); + + clear_regs(); + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_camellia_gfni_avx512_dec_blk64,.-_gcry_camellia_gfni_avx512_dec_blk64;) + +#endif /* defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX512_SUPPORT) */ +#endif /* __x86_64 */ diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c index 00e23750..a854b82d 100644 --- a/cipher/camellia-glue.c +++ b/cipher/camellia-glue.c @@ -1,1395 +1,1628 @@ /* camellia-glue.c - Glue for the Camellia cipher * Copyright (C) 2007 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA * 02110-1301, USA. */ /* I put all the libgcrypt-specific stuff in this file to keep the camellia.c/camellia.h files exactly as provided by NTT. If they update their code, this should make it easier to bring the changes in. - dshaw There is one small change which needs to be done: Include the following code at the top of camellia.h: */ #if 0 /* To use Camellia with libraries it is often useful to keep the name * space of the library clean. The following macro is thus useful: * * #define CAMELLIA_EXT_SYM_PREFIX foo_ * * This prefixes all external symbols with "foo_". */ #ifdef HAVE_CONFIG_H #include #endif #ifdef CAMELLIA_EXT_SYM_PREFIX #define CAMELLIA_PREFIX1(x,y) x ## y #define CAMELLIA_PREFIX2(x,y) CAMELLIA_PREFIX1(x,y) #define CAMELLIA_PREFIX(x) CAMELLIA_PREFIX2(CAMELLIA_EXT_SYM_PREFIX,x) #define Camellia_Ekeygen CAMELLIA_PREFIX(Camellia_Ekeygen) #define Camellia_EncryptBlock CAMELLIA_PREFIX(Camellia_EncryptBlock) #define Camellia_DecryptBlock CAMELLIA_PREFIX(Camellia_DecryptBlock) #define camellia_decrypt128 CAMELLIA_PREFIX(camellia_decrypt128) #define camellia_decrypt256 CAMELLIA_PREFIX(camellia_decrypt256) #define camellia_encrypt128 CAMELLIA_PREFIX(camellia_encrypt128) #define camellia_encrypt256 CAMELLIA_PREFIX(camellia_encrypt256) #define camellia_setup128 CAMELLIA_PREFIX(camellia_setup128) #define camellia_setup192 CAMELLIA_PREFIX(camellia_setup192) #define camellia_setup256 CAMELLIA_PREFIX(camellia_setup256) #endif /*CAMELLIA_EXT_SYM_PREFIX*/ #endif /* Code sample. */ #include #include "types.h" #include "g10lib.h" #include "cipher.h" #include "camellia.h" #include "bufhelp.h" #include "cipher-internal.h" #include "cipher-selftest.h" #include "bulkhelp.h" /* Helper macro to force alignment to 16 bytes. */ #ifdef HAVE_GCC_ATTRIBUTE_ALIGNED # define ATTR_ALIGNED_16 __attribute__ ((aligned (16))) #else # define ATTR_ALIGNED_16 #endif /* USE_AESNI inidicates whether to compile with Intel AES-NI/AVX code. */ #undef USE_AESNI_AVX #if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT) # if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_AESNI_AVX 1 # endif #endif /* USE_AESNI_AVX2 inidicates whether to compile with Intel AES-NI/AVX2 code. */ #undef USE_AESNI_AVX2 #if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT) # if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_AESNI_AVX2 1 # endif #endif /* USE_VAES_AVX2 inidicates whether to compile with Intel VAES/AVX2 code. */ #undef USE_VAES_AVX2 #if defined(USE_AESNI_AVX2) && defined(HAVE_GCC_INLINE_ASM_VAES_VPCLMUL) # define USE_VAES_AVX2 1 #endif /* USE_GFNI_AVX2 inidicates whether to compile with Intel GFNI/AVX2 code. */ #undef USE_GFNI_AVX2 #if defined(USE_AESNI_AVX2) && defined(ENABLE_GFNI_SUPPORT) # define USE_GFNI_AVX2 1 #endif +/* USE_GFNI_AVX512 inidicates whether to compile with Intel GFNI/AVX512 code. */ +#undef USE_GFNI_AVX512 +#if defined(USE_GFNI_AVX2) && defined(ENABLE_AVX512_SUPPORT) +# define USE_GFNI_AVX512 1 +#endif + typedef struct { KEY_TABLE_TYPE keytable; int keybitlength; #ifdef USE_AESNI_AVX unsigned int use_aesni_avx:1; /* AES-NI/AVX implementation shall be used. 
*/ #endif /*USE_AESNI_AVX*/ #ifdef USE_AESNI_AVX2 unsigned int use_aesni_avx2:1;/* AES-NI/AVX2 implementation shall be used. */ unsigned int use_vaes_avx2:1; /* VAES/AVX2 implementation shall be used. */ unsigned int use_gfni_avx2:1; /* GFNI/AVX2 implementation shall be used. */ + unsigned int use_gfni_avx512:1; /* GFNI/AVX512 implementation shall be used. */ #endif /*USE_AESNI_AVX2*/ } CAMELLIA_context; /* Assembly implementations use SystemV ABI, ABI conversion and additional * stack to store XMM6-XMM15 needed on Win64. */ #undef ASM_FUNC_ABI #undef ASM_EXTRA_STACK #if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) # ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS # define ASM_FUNC_ABI __attribute__((sysv_abi)) # define ASM_EXTRA_STACK (10 * 16) # else # define ASM_FUNC_ABI # define ASM_EXTRA_STACK 0 # endif #endif #ifdef USE_AESNI_AVX /* Assembler implementations of Camellia using AES-NI and AVX. Process data - in 16 block same time. + in 16 blocks same time. */ extern void _gcry_camellia_aesni_avx_ctr_enc(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *ctr) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx_cbc_dec(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *iv) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx_cfb_dec(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *iv) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx_ocb_enc(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *offset, unsigned char *checksum, const u64 Ls[16]) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx_ocb_dec(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *offset, unsigned char *checksum, const u64 Ls[16]) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx_ocb_auth(CAMELLIA_context *ctx, const unsigned char *abuf, unsigned char *offset, unsigned char *checksum, const u64 Ls[16]) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx_keygen(CAMELLIA_context *ctx, const unsigned char *key, unsigned int keylen) ASM_FUNC_ABI; static const int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 16 + 2 * sizeof(void *) + ASM_EXTRA_STACK; #endif #ifdef USE_AESNI_AVX2 /* Assembler implementations of Camellia using AES-NI and AVX2. Process data - in 32 block same time. + in 32 blocks same time. 
*/ extern void _gcry_camellia_aesni_avx2_ctr_enc(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *ctr) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx2_cbc_dec(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *iv) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx2_cfb_dec(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *iv) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx2_ocb_enc(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *offset, unsigned char *checksum, const u64 Ls[32]) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx2_ocb_dec(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *offset, unsigned char *checksum, const u64 Ls[32]) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx2_ocb_auth(CAMELLIA_context *ctx, const unsigned char *abuf, unsigned char *offset, unsigned char *checksum, const u64 Ls[32]) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx2_enc_blk1_32(const CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned int nblocks) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx2_dec_blk1_32(const CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned int nblocks) ASM_FUNC_ABI; static const int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 + 2 * sizeof(void *) + ASM_EXTRA_STACK; #endif #ifdef USE_VAES_AVX2 /* Assembler implementations of Camellia using VAES and AVX2. Process data - in 32 block same time. + in 32 blocks same time. */ extern void _gcry_camellia_vaes_avx2_ctr_enc(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *ctr) ASM_FUNC_ABI; extern void _gcry_camellia_vaes_avx2_cbc_dec(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *iv) ASM_FUNC_ABI; extern void _gcry_camellia_vaes_avx2_cfb_dec(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *iv) ASM_FUNC_ABI; extern void _gcry_camellia_vaes_avx2_ocb_enc(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *offset, unsigned char *checksum, const u64 Ls[32]) ASM_FUNC_ABI; extern void _gcry_camellia_vaes_avx2_ocb_dec(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *offset, unsigned char *checksum, const u64 Ls[32]) ASM_FUNC_ABI; extern void _gcry_camellia_vaes_avx2_ocb_auth(CAMELLIA_context *ctx, const unsigned char *abuf, unsigned char *offset, unsigned char *checksum, const u64 Ls[32]) ASM_FUNC_ABI; extern void _gcry_camellia_vaes_avx2_enc_blk1_32(const CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned int nblocks) ASM_FUNC_ABI; extern void _gcry_camellia_vaes_avx2_dec_blk1_32(const CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned int nblocks) ASM_FUNC_ABI; #endif #ifdef USE_GFNI_AVX2 /* Assembler implementations of Camellia using GFNI and AVX2. Process data - in 32 block same time. + in 32 blocks same time. 
*/ extern void _gcry_camellia_gfni_avx2_ctr_enc(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *ctr) ASM_FUNC_ABI; extern void _gcry_camellia_gfni_avx2_cbc_dec(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *iv) ASM_FUNC_ABI; extern void _gcry_camellia_gfni_avx2_cfb_dec(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *iv) ASM_FUNC_ABI; extern void _gcry_camellia_gfni_avx2_ocb_enc(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *offset, unsigned char *checksum, const u64 Ls[32]) ASM_FUNC_ABI; extern void _gcry_camellia_gfni_avx2_ocb_dec(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *offset, unsigned char *checksum, const u64 Ls[32]) ASM_FUNC_ABI; extern void _gcry_camellia_gfni_avx2_ocb_auth(CAMELLIA_context *ctx, const unsigned char *abuf, unsigned char *offset, unsigned char *checksum, const u64 Ls[32]) ASM_FUNC_ABI; extern void _gcry_camellia_gfni_avx2_enc_blk1_32(const CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned int nblocks) ASM_FUNC_ABI; extern void _gcry_camellia_gfni_avx2_dec_blk1_32(const CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned int nblocks) ASM_FUNC_ABI; #endif +#ifdef USE_GFNI_AVX512 +/* Assembler implementations of Camellia using GFNI and AVX512. Process data + in 64 blocks same time. + */ +extern void _gcry_camellia_gfni_avx512_ctr_enc(CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *ctr) ASM_FUNC_ABI; + +extern void _gcry_camellia_gfni_avx512_cbc_dec(CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *iv) ASM_FUNC_ABI; + +extern void _gcry_camellia_gfni_avx512_cfb_dec(CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *iv) ASM_FUNC_ABI; + +extern void _gcry_camellia_gfni_avx512_ocb_enc(CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *offset, + unsigned char *checksum, + const u64 Ls[32]) ASM_FUNC_ABI; + +extern void _gcry_camellia_gfni_avx512_ocb_dec(CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *offset, + unsigned char *checksum, + const u64 Ls[32]) ASM_FUNC_ABI; + +extern void _gcry_camellia_gfni_avx512_enc_blk64(const CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in) + ASM_FUNC_ABI; + +extern void _gcry_camellia_gfni_avx512_dec_blk64(const CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in) + ASM_FUNC_ABI; + +/* Stack not used by AVX512 implementation. 
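The AVX512 entry points declared above always process exactly 64 blocks and, per the comment, burn no stack, so a caller only needs a whole-chunk loop plus a fallback for the tail; camellia_encrypt_blk1_64 further below does exactly that. A minimal, illustrative sketch, not taken from the patch:

/* Assumes ctx->use_gfni_avx512 was set by camellia_setkey(). */
static void
encrypt_bulk_sketch (const CAMELLIA_context *ctx, unsigned char *out,
                     const unsigned char *in, size_t nblocks)
{
  while (nblocks >= 64)
    {
      _gcry_camellia_gfni_avx512_enc_blk64 (ctx, out, in);
      out += 64 * CAMELLIA_BLOCK_SIZE;
      in += 64 * CAMELLIA_BLOCK_SIZE;
      nblocks -= 64;
    }
  /* Any remaining 1..63 blocks would go through the 32-block AVX2 paths
   * or the generic C code, as camellia_encrypt_blk1_64 below shows. */
}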
*/ +static const int avx512_burn_stack_depth = 0; +#endif + static const char *selftest(void); static void _gcry_camellia_ctr_enc (void *context, unsigned char *ctr, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); static void _gcry_camellia_cbc_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); static void _gcry_camellia_cfb_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); static void _gcry_camellia_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); static size_t _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); static size_t _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks); static gcry_err_code_t camellia_setkey(void *c, const byte *key, unsigned keylen, cipher_bulk_ops_t *bulk_ops) { CAMELLIA_context *ctx=c; static int initialized=0; static const char *selftest_failed=NULL; #if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) \ || defined(USE_VAES_AVX2) || defined(USE_GFNI_AVX2) unsigned int hwf = _gcry_get_hw_features (); #endif if(keylen!=16 && keylen!=24 && keylen!=32) return GPG_ERR_INV_KEYLEN; if(!initialized) { initialized=1; selftest_failed=selftest(); if(selftest_failed) log_error("%s\n",selftest_failed); } if(selftest_failed) return GPG_ERR_SELFTEST_FAILED; #ifdef USE_AESNI_AVX ctx->use_aesni_avx = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX); #endif #ifdef USE_AESNI_AVX2 ctx->use_aesni_avx2 = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX2); ctx->use_vaes_avx2 = 0; ctx->use_gfni_avx2 = 0; + ctx->use_gfni_avx512 = 0; #endif #ifdef USE_VAES_AVX2 ctx->use_vaes_avx2 = (hwf & HWF_INTEL_VAES_VPCLMUL) && (hwf & HWF_INTEL_AVX2); #endif #ifdef USE_GFNI_AVX2 ctx->use_gfni_avx2 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX2); #endif +#ifdef USE_GFNI_AVX512 + ctx->use_gfni_avx512 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX512); +#endif ctx->keybitlength=keylen*8; /* Setup bulk encryption routines. */ memset (bulk_ops, 0, sizeof(*bulk_ops)); bulk_ops->cbc_dec = _gcry_camellia_cbc_dec; bulk_ops->cfb_dec = _gcry_camellia_cfb_dec; bulk_ops->ctr_enc = _gcry_camellia_ctr_enc; bulk_ops->ocb_crypt = _gcry_camellia_ocb_crypt; bulk_ops->ocb_auth = _gcry_camellia_ocb_auth; #ifdef USE_AESNI_AVX2 if (ctx->use_aesni_avx2 || ctx->use_vaes_avx2 || ctx->use_gfni_avx2) bulk_ops->xts_crypt = _gcry_camellia_xts_crypt; #endif if (0) { } #ifdef USE_AESNI_AVX else if (ctx->use_aesni_avx) _gcry_camellia_aesni_avx_keygen(ctx, key, keylen); else #endif { Camellia_Ekeygen(ctx->keybitlength,key,ctx->keytable); _gcry_burn_stack ((19+34+34)*sizeof(u32)+2*sizeof(void*) /* camellia_setup256 */ +(4+32)*sizeof(u32)+2*sizeof(void*) /* camellia_setup192 */ +0+sizeof(int)+2*sizeof(void*) /* Camellia_Ekeygen */ +3*2*sizeof(void*) /* Function calls. */ ); } #ifdef USE_GFNI_AVX2 if (ctx->use_gfni_avx2) { /* Disable AESNI & VAES implementations when GFNI implementation is * enabled. */ #ifdef USE_AESNI_AVX ctx->use_aesni_avx = 0; #endif #ifdef USE_AESNI_AVX2 ctx->use_aesni_avx2 = 0; #endif #ifdef USE_VAES_AVX2 ctx->use_vaes_avx2 = 0; #endif } #endif return 0; } #ifdef USE_ARM_ASM /* Assembly implementations of Camellia. 
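To summarize the flag handling in camellia_setkey above: GFNI/AVX2 disables the AES-NI and VAES variants, while GFNI/AVX512 is an additional fast path layered on top of GFNI/AVX2 rather than a replacement. A hedged, editorial sketch of the resulting preference order (not code from the patch):

static const char *
camellia_bulk_choice_sketch (const CAMELLIA_context *ctx)
{
#ifdef USE_GFNI_AVX512
  if (ctx->use_gfni_avx512)
    return "GFNI/AVX512 for 64-block chunks, GFNI/AVX2 below that";
#endif
#ifdef USE_GFNI_AVX2
  if (ctx->use_gfni_avx2)
    return "GFNI/AVX2 for 32-block chunks";
#endif
#ifdef USE_VAES_AVX2
  if (ctx->use_vaes_avx2)
    return "VAES/AVX2 for 32-block chunks";
#endif
#ifdef USE_AESNI_AVX2
  if (ctx->use_aesni_avx2)
    return "AES-NI/AVX2 for 32-block chunks";
#endif
#ifdef USE_AESNI_AVX
  if (ctx->use_aesni_avx)
    return "AES-NI/AVX for 16-block chunks";
#endif
  return "generic C implementation";
}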
*/ extern void _gcry_camellia_arm_encrypt_block(const KEY_TABLE_TYPE keyTable, byte *outbuf, const byte *inbuf, const int keybits); extern void _gcry_camellia_arm_decrypt_block(const KEY_TABLE_TYPE keyTable, byte *outbuf, const byte *inbuf, const int keybits); static void Camellia_EncryptBlock(const int keyBitLength, const unsigned char *plaintext, const KEY_TABLE_TYPE keyTable, unsigned char *cipherText) { _gcry_camellia_arm_encrypt_block(keyTable, cipherText, plaintext, keyBitLength); } static void Camellia_DecryptBlock(const int keyBitLength, const unsigned char *cipherText, const KEY_TABLE_TYPE keyTable, unsigned char *plaintext) { _gcry_camellia_arm_decrypt_block(keyTable, plaintext, cipherText, keyBitLength); } #ifdef __aarch64__ # define CAMELLIA_encrypt_stack_burn_size (0) # define CAMELLIA_decrypt_stack_burn_size (0) #else # define CAMELLIA_encrypt_stack_burn_size (15*4) # define CAMELLIA_decrypt_stack_burn_size (15*4) #endif static unsigned int camellia_encrypt(void *c, byte *outbuf, const byte *inbuf) { CAMELLIA_context *ctx = c; Camellia_EncryptBlock(ctx->keybitlength,inbuf,ctx->keytable,outbuf); return /*burn_stack*/ (CAMELLIA_encrypt_stack_burn_size); } static unsigned int camellia_decrypt(void *c, byte *outbuf, const byte *inbuf) { CAMELLIA_context *ctx=c; Camellia_DecryptBlock(ctx->keybitlength,inbuf,ctx->keytable,outbuf); return /*burn_stack*/ (CAMELLIA_decrypt_stack_burn_size); } #else /*USE_ARM_ASM*/ static unsigned int camellia_encrypt(void *c, byte *outbuf, const byte *inbuf) { CAMELLIA_context *ctx=c; Camellia_EncryptBlock(ctx->keybitlength,inbuf,ctx->keytable,outbuf); #define CAMELLIA_encrypt_stack_burn_size \ (sizeof(int)+2*sizeof(unsigned char *)+sizeof(void*/*KEY_TABLE_TYPE*/) \ +4*sizeof(u32)+4*sizeof(u32) \ +2*sizeof(u32*)+4*sizeof(u32) \ +2*2*sizeof(void*) /* Function calls. */ \ ) return /*burn_stack*/ (CAMELLIA_encrypt_stack_burn_size); } static unsigned int camellia_decrypt(void *c, byte *outbuf, const byte *inbuf) { CAMELLIA_context *ctx=c; Camellia_DecryptBlock(ctx->keybitlength,inbuf,ctx->keytable,outbuf); #define CAMELLIA_decrypt_stack_burn_size \ (sizeof(int)+2*sizeof(unsigned char *)+sizeof(void*/*KEY_TABLE_TYPE*/) \ +4*sizeof(u32)+4*sizeof(u32) \ +2*sizeof(u32*)+4*sizeof(u32) \ +2*2*sizeof(void*) /* Function calls. */ \ ) return /*burn_stack*/ (CAMELLIA_decrypt_stack_burn_size); } #endif /*!USE_ARM_ASM*/ static unsigned int camellia_encrypt_blk1_32 (const void *priv, byte *outbuf, const byte *inbuf, unsigned int num_blks) { const CAMELLIA_context *ctx = priv; unsigned int stack_burn_size = 0; gcry_assert (num_blks <= 32); #ifdef USE_GFNI_AVX2 if (ctx->use_gfni_avx2 && num_blks >= 3) { /* 3 or more parallel block GFNI processing is faster than * generic C implementation. */ _gcry_camellia_gfni_avx2_enc_blk1_32 (ctx, outbuf, inbuf, num_blks); return avx2_burn_stack_depth; } #endif #ifdef USE_VAES_AVX2 if (ctx->use_vaes_avx2 && num_blks >= 6) { /* 6 or more parallel block VAES processing is faster than * generic C implementation. */ _gcry_camellia_vaes_avx2_enc_blk1_32 (ctx, outbuf, inbuf, num_blks); return avx2_burn_stack_depth; } #endif #ifdef USE_AESNI_AVX2 if (ctx->use_aesni_avx2 && num_blks >= 6) { /* 6 or more parallel block AESNI processing is faster than * generic C implementation. 
*/ _gcry_camellia_aesni_avx2_enc_blk1_32 (ctx, outbuf, inbuf, num_blks); return avx2_burn_stack_depth; } #endif while (num_blks) { stack_burn_size = camellia_encrypt((void *)ctx, outbuf, inbuf); outbuf += CAMELLIA_BLOCK_SIZE; inbuf += CAMELLIA_BLOCK_SIZE; num_blks--; } return stack_burn_size; } +static unsigned int +camellia_encrypt_blk1_64 (const void *priv, byte *outbuf, const byte *inbuf, + unsigned int num_blks) +{ + const CAMELLIA_context *ctx = priv; + unsigned int stack_burn_size = 0; + unsigned int nburn; + + gcry_assert (num_blks <= 64); + +#ifdef USE_GFNI_AVX512 + if (num_blks == 64 && ctx->use_gfni_avx512) + { + _gcry_camellia_gfni_avx512_enc_blk64 (ctx, outbuf, inbuf); + return avx512_burn_stack_depth; + } +#endif + + do + { + unsigned int curr_blks = num_blks > 32 ? 32 : num_blks; + nburn = camellia_encrypt_blk1_32 (ctx, outbuf, inbuf, curr_blks); + stack_burn_size = nburn > stack_burn_size ? nburn : stack_burn_size; + outbuf += curr_blks * 16; + inbuf += curr_blks * 16; + num_blks -= curr_blks; + } + while (num_blks > 0); + + return stack_burn_size; +} static unsigned int camellia_decrypt_blk1_32 (const void *priv, byte *outbuf, const byte *inbuf, unsigned int num_blks) { const CAMELLIA_context *ctx = priv; unsigned int stack_burn_size = 0; gcry_assert (num_blks <= 32); #ifdef USE_GFNI_AVX2 if (ctx->use_gfni_avx2 && num_blks >= 3) { /* 3 or more parallel block GFNI processing is faster than * generic C implementation. */ _gcry_camellia_gfni_avx2_dec_blk1_32 (ctx, outbuf, inbuf, num_blks); return avx2_burn_stack_depth; } #endif #ifdef USE_VAES_AVX2 if (ctx->use_vaes_avx2 && num_blks >= 6) { /* 6 or more parallel block VAES processing is faster than * generic C implementation. */ _gcry_camellia_vaes_avx2_dec_blk1_32 (ctx, outbuf, inbuf, num_blks); return avx2_burn_stack_depth; } #endif #ifdef USE_AESNI_AVX2 if (ctx->use_aesni_avx2 && num_blks >= 6) { /* 6 or more parallel block AESNI processing is faster than * generic C implementation. */ _gcry_camellia_aesni_avx2_dec_blk1_32 (ctx, outbuf, inbuf, num_blks); return avx2_burn_stack_depth; } #endif while (num_blks) { stack_burn_size = camellia_decrypt((void *)ctx, outbuf, inbuf); outbuf += CAMELLIA_BLOCK_SIZE; inbuf += CAMELLIA_BLOCK_SIZE; num_blks--; } return stack_burn_size; } +static unsigned int +camellia_decrypt_blk1_64 (const void *priv, byte *outbuf, const byte *inbuf, + unsigned int num_blks) +{ + const CAMELLIA_context *ctx = priv; + unsigned int stack_burn_size = 0; + unsigned int nburn; + + gcry_assert (num_blks <= 64); + +#ifdef USE_GFNI_AVX512 + if (num_blks == 64 && ctx->use_gfni_avx512) + { + _gcry_camellia_gfni_avx512_dec_blk64 (ctx, outbuf, inbuf); + return avx512_burn_stack_depth; + } +#endif + + do + { + unsigned int curr_blks = num_blks > 32 ? 32 : num_blks; + nburn = camellia_decrypt_blk1_32 (ctx, outbuf, inbuf, curr_blks); + stack_burn_size = nburn > stack_burn_size ? nburn : stack_burn_size; + outbuf += curr_blks * 16; + inbuf += curr_blks * 16; + num_blks -= curr_blks; + } + while (num_blks > 0); + + return stack_burn_size; +} + /* Bulk encryption of complete blocks in CTR mode. This function is only intended for the bulk encryption feature of cipher.c. CTR is expected to be of size CAMELLIA_BLOCK_SIZE. 
*/ static void _gcry_camellia_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { CAMELLIA_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; int burn_stack_depth = 0; +#ifdef USE_GFNI_AVX512 + if (ctx->use_gfni_avx512) + { + int did_use_gfni_avx512 = 0; + + /* Process data in 64 block chunks. */ + while (nblocks >= 64) + { + _gcry_camellia_gfni_avx512_ctr_enc (ctx, outbuf, inbuf, ctr); + nblocks -= 64; + outbuf += 64 * CAMELLIA_BLOCK_SIZE; + inbuf += 64 * CAMELLIA_BLOCK_SIZE; + did_use_gfni_avx512 = 1; + } + + if (did_use_gfni_avx512) + { + if (burn_stack_depth < avx512_burn_stack_depth) + burn_stack_depth = avx512_burn_stack_depth; + } + + /* Use generic code to handle smaller chunks... */ + } +#endif + #ifdef USE_AESNI_AVX2 if (ctx->use_aesni_avx2) { int did_use_aesni_avx2 = 0; typeof (&_gcry_camellia_aesni_avx2_ctr_enc) bulk_ctr_fn = _gcry_camellia_aesni_avx2_ctr_enc; #ifdef USE_VAES_AVX2 if (ctx->use_vaes_avx2) bulk_ctr_fn =_gcry_camellia_vaes_avx2_ctr_enc; #endif #ifdef USE_GFNI_AVX2 if (ctx->use_gfni_avx2) bulk_ctr_fn =_gcry_camellia_gfni_avx2_ctr_enc; #endif /* Process data in 32 block chunks. */ while (nblocks >= 32) { bulk_ctr_fn (ctx, outbuf, inbuf, ctr); nblocks -= 32; outbuf += 32 * CAMELLIA_BLOCK_SIZE; inbuf += 32 * CAMELLIA_BLOCK_SIZE; did_use_aesni_avx2 = 1; } if (did_use_aesni_avx2) { if (burn_stack_depth < avx2_burn_stack_depth) burn_stack_depth = avx2_burn_stack_depth; } /* Use generic code to handle smaller chunks... */ - /* TODO: use caching instead? */ } #endif #ifdef USE_AESNI_AVX if (ctx->use_aesni_avx) { int did_use_aesni_avx = 0; /* Process data in 16 block chunks. */ while (nblocks >= 16) { _gcry_camellia_aesni_avx_ctr_enc(ctx, outbuf, inbuf, ctr); nblocks -= 16; outbuf += 16 * CAMELLIA_BLOCK_SIZE; inbuf += 16 * CAMELLIA_BLOCK_SIZE; did_use_aesni_avx = 1; } if (did_use_aesni_avx) { if (burn_stack_depth < avx_burn_stack_depth) burn_stack_depth = avx_burn_stack_depth; } /* Use generic code to handle smaller chunks... */ - /* TODO: use caching instead? */ } #endif /* Process remaining blocks. */ if (nblocks) { byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32]; unsigned int tmp_used = CAMELLIA_BLOCK_SIZE; size_t nburn; nburn = bulk_ctr_enc_128(ctx, camellia_encrypt_blk1_32, outbuf, inbuf, nblocks, ctr, tmpbuf, sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE, &tmp_used); burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; wipememory(tmpbuf, tmp_used); } if (burn_stack_depth) _gcry_burn_stack(burn_stack_depth); } /* Bulk decryption of complete blocks in CBC mode. This function is only intended for the bulk encryption feature of cipher.c. */ static void _gcry_camellia_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { CAMELLIA_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; int burn_stack_depth = 0; +#ifdef USE_GFNI_AVX512 + if (ctx->use_gfni_avx512) + { + int did_use_gfni_avx512 = 0; + + /* Process data in 64 block chunks. */ + while (nblocks >= 64) + { + _gcry_camellia_gfni_avx512_cbc_dec (ctx, outbuf, inbuf, iv); + nblocks -= 64; + outbuf += 64 * CAMELLIA_BLOCK_SIZE; + inbuf += 64 * CAMELLIA_BLOCK_SIZE; + did_use_gfni_avx512 = 1; + } + + if (did_use_gfni_avx512) + { + if (burn_stack_depth < avx512_burn_stack_depth) + burn_stack_depth = avx512_burn_stack_depth; + } + + /* Use generic code to handle smaller chunks... 
*/ + } +#endif + #ifdef USE_AESNI_AVX2 if (ctx->use_aesni_avx2) { int did_use_aesni_avx2 = 0; typeof (&_gcry_camellia_aesni_avx2_cbc_dec) bulk_cbc_fn = _gcry_camellia_aesni_avx2_cbc_dec; #ifdef USE_VAES_AVX2 if (ctx->use_vaes_avx2) bulk_cbc_fn =_gcry_camellia_vaes_avx2_cbc_dec; #endif #ifdef USE_GFNI_AVX2 if (ctx->use_gfni_avx2) bulk_cbc_fn =_gcry_camellia_gfni_avx2_cbc_dec; #endif /* Process data in 32 block chunks. */ while (nblocks >= 32) { bulk_cbc_fn (ctx, outbuf, inbuf, iv); nblocks -= 32; outbuf += 32 * CAMELLIA_BLOCK_SIZE; inbuf += 32 * CAMELLIA_BLOCK_SIZE; did_use_aesni_avx2 = 1; } if (did_use_aesni_avx2) { if (burn_stack_depth < avx2_burn_stack_depth) burn_stack_depth = avx2_burn_stack_depth; } /* Use generic code to handle smaller chunks... */ } #endif #ifdef USE_AESNI_AVX if (ctx->use_aesni_avx) { int did_use_aesni_avx = 0; /* Process data in 16 block chunks. */ while (nblocks >= 16) { _gcry_camellia_aesni_avx_cbc_dec(ctx, outbuf, inbuf, iv); nblocks -= 16; outbuf += 16 * CAMELLIA_BLOCK_SIZE; inbuf += 16 * CAMELLIA_BLOCK_SIZE; did_use_aesni_avx = 1; } if (did_use_aesni_avx) { if (burn_stack_depth < avx_burn_stack_depth) burn_stack_depth = avx_burn_stack_depth; } /* Use generic code to handle smaller chunks... */ } #endif /* Process remaining blocks. */ if (nblocks) { byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32]; unsigned int tmp_used = CAMELLIA_BLOCK_SIZE; size_t nburn; nburn = bulk_cbc_dec_128(ctx, camellia_decrypt_blk1_32, outbuf, inbuf, nblocks, iv, tmpbuf, sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE, &tmp_used); burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; wipememory(tmpbuf, tmp_used); } if (burn_stack_depth) _gcry_burn_stack(burn_stack_depth); } /* Bulk decryption of complete blocks in CFB mode. This function is only intended for the bulk encryption feature of cipher.c. */ static void _gcry_camellia_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { CAMELLIA_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; int burn_stack_depth = 0; +#ifdef USE_GFNI_AVX512 + if (ctx->use_gfni_avx512) + { + int did_use_gfni_avx512 = 0; + + /* Process data in 64 block chunks. */ + while (nblocks >= 64) + { + _gcry_camellia_gfni_avx512_cfb_dec (ctx, outbuf, inbuf, iv); + nblocks -= 64; + outbuf += 64 * CAMELLIA_BLOCK_SIZE; + inbuf += 64 * CAMELLIA_BLOCK_SIZE; + did_use_gfni_avx512 = 1; + } + + if (did_use_gfni_avx512) + { + if (burn_stack_depth < avx512_burn_stack_depth) + burn_stack_depth = avx512_burn_stack_depth; + } + + /* Use generic code to handle smaller chunks... */ + } +#endif + #ifdef USE_AESNI_AVX2 if (ctx->use_aesni_avx2) { int did_use_aesni_avx2 = 0; typeof (&_gcry_camellia_aesni_avx2_cfb_dec) bulk_cfb_fn = _gcry_camellia_aesni_avx2_cfb_dec; #ifdef USE_VAES_AVX2 if (ctx->use_vaes_avx2) bulk_cfb_fn =_gcry_camellia_vaes_avx2_cfb_dec; #endif #ifdef USE_GFNI_AVX2 if (ctx->use_gfni_avx2) bulk_cfb_fn =_gcry_camellia_gfni_avx2_cfb_dec; #endif /* Process data in 32 block chunks. */ while (nblocks >= 32) { bulk_cfb_fn (ctx, outbuf, inbuf, iv); nblocks -= 32; outbuf += 32 * CAMELLIA_BLOCK_SIZE; inbuf += 32 * CAMELLIA_BLOCK_SIZE; did_use_aesni_avx2 = 1; } if (did_use_aesni_avx2) { if (burn_stack_depth < avx2_burn_stack_depth) burn_stack_depth = avx2_burn_stack_depth; } /* Use generic code to handle smaller chunks... */ } #endif #ifdef USE_AESNI_AVX if (ctx->use_aesni_avx) { int did_use_aesni_avx = 0; /* Process data in 16 block chunks. 
*/ while (nblocks >= 16) { _gcry_camellia_aesni_avx_cfb_dec(ctx, outbuf, inbuf, iv); nblocks -= 16; outbuf += 16 * CAMELLIA_BLOCK_SIZE; inbuf += 16 * CAMELLIA_BLOCK_SIZE; did_use_aesni_avx = 1; } if (did_use_aesni_avx) { if (burn_stack_depth < avx_burn_stack_depth) burn_stack_depth = avx_burn_stack_depth; } /* Use generic code to handle smaller chunks... */ } #endif /* Process remaining blocks. */ if (nblocks) { byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32]; unsigned int tmp_used = CAMELLIA_BLOCK_SIZE; size_t nburn; nburn = bulk_cfb_dec_128(ctx, camellia_encrypt_blk1_32, outbuf, inbuf, nblocks, iv, tmpbuf, sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE, &tmp_used); burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; wipememory(tmpbuf, tmp_used); } if (burn_stack_depth) _gcry_burn_stack(burn_stack_depth); } /* Bulk encryption/decryption of complete blocks in XTS mode. */ static void _gcry_camellia_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt) { CAMELLIA_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; int burn_stack_depth = 0; /* Process remaining blocks. */ if (nblocks) { - byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32]; + byte tmpbuf[CAMELLIA_BLOCK_SIZE * 64]; unsigned int tmp_used = CAMELLIA_BLOCK_SIZE; size_t nburn; - nburn = bulk_xts_crypt_128(ctx, encrypt ? camellia_encrypt_blk1_32 - : camellia_decrypt_blk1_32, + nburn = bulk_xts_crypt_128(ctx, encrypt ? camellia_encrypt_blk1_64 + : camellia_decrypt_blk1_64, outbuf, inbuf, nblocks, tweak, tmpbuf, sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE, &tmp_used); burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; wipememory(tmpbuf, tmp_used); } if (burn_stack_depth) _gcry_burn_stack(burn_stack_depth); } /* Bulk encryption/decryption of complete blocks in OCB mode. */ static size_t _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt) { #if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) CAMELLIA_context *ctx = (void *)&c->context.c; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; int burn_stack_depth = 0; u64 blkn = c->u_mode.ocb.data_nblocks; #else (void)c; (void)outbuf_arg; (void)inbuf_arg; (void)encrypt; #endif +#ifdef USE_GFNI_AVX512 + if (ctx->use_gfni_avx512) + { + int did_use_gfni_avx512 = 0; + u64 Ls[64]; + u64 *l; + + if (nblocks >= 64) + { + typeof (&_gcry_camellia_gfni_avx512_ocb_dec) bulk_ocb_fn = + encrypt ? _gcry_camellia_gfni_avx512_ocb_enc + : _gcry_camellia_gfni_avx512_ocb_dec; + l = bulk_ocb_prepare_L_pointers_array_blk64 (c, Ls, blkn); + + /* Process data in 64 block chunks. */ + while (nblocks >= 64) + { + blkn += 64; + *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 64); + + bulk_ocb_fn (ctx, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, Ls); + + nblocks -= 64; + outbuf += 64 * CAMELLIA_BLOCK_SIZE; + inbuf += 64 * CAMELLIA_BLOCK_SIZE; + did_use_gfni_avx512 = 1; + } + } + + if (did_use_gfni_avx512) + { + if (burn_stack_depth < avx2_burn_stack_depth) + burn_stack_depth = avx2_burn_stack_depth; + } + + /* Use generic code to handle smaller chunks... */ + } +#endif + #ifdef USE_AESNI_AVX2 if (ctx->use_aesni_avx2) { int did_use_aesni_avx2 = 0; u64 Ls[32]; u64 *l; if (nblocks >= 32) { typeof (&_gcry_camellia_aesni_avx2_ocb_dec) bulk_ocb_fn = encrypt ? _gcry_camellia_aesni_avx2_ocb_enc : _gcry_camellia_aesni_avx2_ocb_dec; #ifdef USE_VAES_AVX2 if (ctx->use_vaes_avx2) bulk_ocb_fn = encrypt ? 
_gcry_camellia_vaes_avx2_ocb_enc : _gcry_camellia_vaes_avx2_ocb_dec; #endif #ifdef USE_GFNI_AVX2 if (ctx->use_gfni_avx2) bulk_ocb_fn = encrypt ? _gcry_camellia_gfni_avx2_ocb_enc : _gcry_camellia_gfni_avx2_ocb_dec; #endif l = bulk_ocb_prepare_L_pointers_array_blk32 (c, Ls, blkn); /* Process data in 32 block chunks. */ while (nblocks >= 32) { blkn += 32; *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 32); bulk_ocb_fn (ctx, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, Ls); nblocks -= 32; outbuf += 32 * CAMELLIA_BLOCK_SIZE; inbuf += 32 * CAMELLIA_BLOCK_SIZE; did_use_aesni_avx2 = 1; } } if (did_use_aesni_avx2) { if (burn_stack_depth < avx2_burn_stack_depth) burn_stack_depth = avx2_burn_stack_depth; } /* Use generic code to handle smaller chunks... */ } #endif #ifdef USE_AESNI_AVX if (ctx->use_aesni_avx) { int did_use_aesni_avx = 0; u64 Ls[16]; u64 *l; if (nblocks >= 16) { l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn); /* Process data in 16 block chunks. */ while (nblocks >= 16) { blkn += 16; *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16); if (encrypt) _gcry_camellia_aesni_avx_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, Ls); else _gcry_camellia_aesni_avx_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, Ls); nblocks -= 16; outbuf += 16 * CAMELLIA_BLOCK_SIZE; inbuf += 16 * CAMELLIA_BLOCK_SIZE; did_use_aesni_avx = 1; } } if (did_use_aesni_avx) { if (burn_stack_depth < avx_burn_stack_depth) burn_stack_depth = avx_burn_stack_depth; } /* Use generic code to handle smaller chunks... */ } #endif #if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) /* Process remaining blocks. */ if (nblocks) { byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32]; unsigned int tmp_used = CAMELLIA_BLOCK_SIZE; size_t nburn; nburn = bulk_ocb_crypt_128 (c, ctx, encrypt ? camellia_encrypt_blk1_32 : camellia_decrypt_blk1_32, outbuf, inbuf, nblocks, &blkn, encrypt, tmpbuf, sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE, &tmp_used); burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; wipememory(tmpbuf, tmp_used); nblocks = 0; } c->u_mode.ocb.data_nblocks = blkn; if (burn_stack_depth) _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *)); #endif return nblocks; } /* Bulk authentication of complete blocks in OCB mode. */ static size_t _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks) { #if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) CAMELLIA_context *ctx = (void *)&c->context.c; const unsigned char *abuf = abuf_arg; int burn_stack_depth = 0; u64 blkn = c->u_mode.ocb.aad_nblocks; #else (void)c; (void)abuf_arg; #endif #ifdef USE_AESNI_AVX2 if (ctx->use_aesni_avx2) { int did_use_aesni_avx2 = 0; u64 Ls[32]; u64 *l; if (nblocks >= 32) { typeof (&_gcry_camellia_aesni_avx2_ocb_auth) bulk_auth_fn = _gcry_camellia_aesni_avx2_ocb_auth; #ifdef USE_VAES_AVX2 if (ctx->use_vaes_avx2) bulk_auth_fn = _gcry_camellia_vaes_avx2_ocb_auth; #endif #ifdef USE_GFNI_AVX2 if (ctx->use_gfni_avx2) bulk_auth_fn = _gcry_camellia_gfni_avx2_ocb_auth; #endif l = bulk_ocb_prepare_L_pointers_array_blk32 (c, Ls, blkn); /* Process data in 32 block chunks. */ while (nblocks >= 32) { blkn += 32; *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 32); bulk_auth_fn (ctx, abuf, c->u_mode.ocb.aad_offset, c->u_mode.ocb.aad_sum, Ls); nblocks -= 32; abuf += 32 * CAMELLIA_BLOCK_SIZE; did_use_aesni_avx2 = 1; } } if (did_use_aesni_avx2) { if (burn_stack_depth < avx2_burn_stack_depth) burn_stack_depth = avx2_burn_stack_depth; } /* Use generic code to handle smaller chunks... 
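The per-chunk `*l = ocb_get_l(...)` refresh seen in the OCB loops works because OCB selects L[ntz(i)] for block number i, where ntz is the count of trailing zero bits; the low-ntz entries repeat inside every chunk and can be set up once by the bulk_ocb_prepare_L_pointers_array_* helpers, while the one entry tied to the absolute block counter is re-fetched each iteration. A small illustrative helper (ntz_sketch is a made-up name, not part of bulkhelp.h):

#include <stdint.h>

/* ntz(i) for i > 0: index into the OCB L table for block number i. */
static unsigned int
ntz_sketch (uint64_t i)
{
  unsigned int n = 0;

  while ((i & 1) == 0)
    {
      n++;
      i >>= 1;
    }
  return n;
}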
*/ } #endif #ifdef USE_AESNI_AVX if (ctx->use_aesni_avx) { int did_use_aesni_avx = 0; u64 Ls[16]; u64 *l; if (nblocks >= 16) { l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn); /* Process data in 16 block chunks. */ while (nblocks >= 16) { blkn += 16; *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16); _gcry_camellia_aesni_avx_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset, c->u_mode.ocb.aad_sum, Ls); nblocks -= 16; abuf += 16 * CAMELLIA_BLOCK_SIZE; did_use_aesni_avx = 1; } } if (did_use_aesni_avx) { if (burn_stack_depth < avx_burn_stack_depth) burn_stack_depth = avx_burn_stack_depth; } /* Use generic code to handle smaller chunks... */ } #endif #if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) /* Process remaining blocks. */ if (nblocks) { byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32]; unsigned int tmp_used = CAMELLIA_BLOCK_SIZE; size_t nburn; nburn = bulk_ocb_auth_128 (c, ctx, camellia_encrypt_blk1_32, abuf, nblocks, &blkn, tmpbuf, sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE, &tmp_used); burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; wipememory(tmpbuf, tmp_used); nblocks = 0; } c->u_mode.ocb.aad_nblocks = blkn; if (burn_stack_depth) _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *)); #endif return nblocks; } /* Run the self-tests for CAMELLIA-CTR-128, tests IV increment of bulk CTR encryption. Returns NULL on success. */ static const char* selftest_ctr_128 (void) { - const int nblocks = 32+16+1; + const int nblocks = 64+32+16+1; const int blocksize = CAMELLIA_BLOCK_SIZE; const int context_size = sizeof(CAMELLIA_context); return _gcry_selftest_helper_ctr("CAMELLIA", &camellia_setkey, &camellia_encrypt, nblocks, blocksize, context_size); } /* Run the self-tests for CAMELLIA-CBC-128, tests bulk CBC decryption. Returns NULL on success. */ static const char* selftest_cbc_128 (void) { - const int nblocks = 32+16+2; + const int nblocks = 64+32+16+2; const int blocksize = CAMELLIA_BLOCK_SIZE; const int context_size = sizeof(CAMELLIA_context); return _gcry_selftest_helper_cbc("CAMELLIA", &camellia_setkey, &camellia_encrypt, nblocks, blocksize, context_size); } /* Run the self-tests for CAMELLIA-CFB-128, tests bulk CFB decryption. Returns NULL on success. 
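The enlarged self-test lengths above are easiest to read as a sum of chunk sizes; one pass now reaches the new 64-block path and still leaves enough data for whichever smaller bulk paths the host CPU enables, plus a one- or two-block tail (editorial breakdown):

  CTR:      64 + 32 + 16 + 1 = 113 blocks
  CBC/CFB:  64 + 32 + 16 + 2 = 114 blocks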
*/ static const char* selftest_cfb_128 (void) { - const int nblocks = 32+16+2; + const int nblocks = 64+32+16+2; const int blocksize = CAMELLIA_BLOCK_SIZE; const int context_size = sizeof(CAMELLIA_context); return _gcry_selftest_helper_cfb("CAMELLIA", &camellia_setkey, &camellia_encrypt, nblocks, blocksize, context_size); } static const char * selftest(void) { CAMELLIA_context ctx; byte scratch[16]; cipher_bulk_ops_t bulk_ops; const char *r; /* These test vectors are from RFC-3713 */ static const byte plaintext[]= { 0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef, 0xfe,0xdc,0xba,0x98,0x76,0x54,0x32,0x10 }; static const byte key_128[]= { 0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef, 0xfe,0xdc,0xba,0x98,0x76,0x54,0x32,0x10 }; static const byte ciphertext_128[]= { 0x67,0x67,0x31,0x38,0x54,0x96,0x69,0x73, 0x08,0x57,0x06,0x56,0x48,0xea,0xbe,0x43 }; static const byte key_192[]= { 0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef,0xfe,0xdc,0xba,0x98, 0x76,0x54,0x32,0x10,0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77 }; static const byte ciphertext_192[]= { 0xb4,0x99,0x34,0x01,0xb3,0xe9,0x96,0xf8, 0x4e,0xe5,0xce,0xe7,0xd7,0x9b,0x09,0xb9 }; static const byte key_256[]= { 0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef,0xfe,0xdc,0xba, 0x98,0x76,0x54,0x32,0x10,0x00,0x11,0x22,0x33,0x44,0x55, 0x66,0x77,0x88,0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff }; static const byte ciphertext_256[]= { 0x9a,0xcc,0x23,0x7d,0xff,0x16,0xd7,0x6c, 0x20,0xef,0x7c,0x91,0x9e,0x3a,0x75,0x09 }; camellia_setkey(&ctx,key_128,sizeof(key_128),&bulk_ops); camellia_encrypt(&ctx,scratch,plaintext); if(memcmp(scratch,ciphertext_128,sizeof(ciphertext_128))!=0) return "CAMELLIA-128 test encryption failed."; camellia_decrypt(&ctx,scratch,scratch); if(memcmp(scratch,plaintext,sizeof(plaintext))!=0) return "CAMELLIA-128 test decryption failed."; camellia_setkey(&ctx,key_192,sizeof(key_192),&bulk_ops); camellia_encrypt(&ctx,scratch,plaintext); if(memcmp(scratch,ciphertext_192,sizeof(ciphertext_192))!=0) return "CAMELLIA-192 test encryption failed."; camellia_decrypt(&ctx,scratch,scratch); if(memcmp(scratch,plaintext,sizeof(plaintext))!=0) return "CAMELLIA-192 test decryption failed."; camellia_setkey(&ctx,key_256,sizeof(key_256),&bulk_ops); camellia_encrypt(&ctx,scratch,plaintext); if(memcmp(scratch,ciphertext_256,sizeof(ciphertext_256))!=0) return "CAMELLIA-256 test encryption failed."; camellia_decrypt(&ctx,scratch,scratch); if(memcmp(scratch,plaintext,sizeof(plaintext))!=0) return "CAMELLIA-256 test decryption failed."; if ( (r = selftest_ctr_128 ()) ) return r; if ( (r = selftest_cbc_128 ()) ) return r; if ( (r = selftest_cfb_128 ()) ) return r; return NULL; } /* These oids are from , retrieved May 1, 2007. 
*/ static const gcry_cipher_oid_spec_t camellia128_oids[] = { {"1.2.392.200011.61.1.1.1.2", GCRY_CIPHER_MODE_CBC}, {"0.3.4401.5.3.1.9.1", GCRY_CIPHER_MODE_ECB}, {"0.3.4401.5.3.1.9.3", GCRY_CIPHER_MODE_OFB}, {"0.3.4401.5.3.1.9.4", GCRY_CIPHER_MODE_CFB}, { NULL } }; static const gcry_cipher_oid_spec_t camellia192_oids[] = { {"1.2.392.200011.61.1.1.1.3", GCRY_CIPHER_MODE_CBC}, {"0.3.4401.5.3.1.9.21", GCRY_CIPHER_MODE_ECB}, {"0.3.4401.5.3.1.9.23", GCRY_CIPHER_MODE_OFB}, {"0.3.4401.5.3.1.9.24", GCRY_CIPHER_MODE_CFB}, { NULL } }; static const gcry_cipher_oid_spec_t camellia256_oids[] = { {"1.2.392.200011.61.1.1.1.4", GCRY_CIPHER_MODE_CBC}, {"0.3.4401.5.3.1.9.41", GCRY_CIPHER_MODE_ECB}, {"0.3.4401.5.3.1.9.43", GCRY_CIPHER_MODE_OFB}, {"0.3.4401.5.3.1.9.44", GCRY_CIPHER_MODE_CFB}, { NULL } }; gcry_cipher_spec_t _gcry_cipher_spec_camellia128 = { GCRY_CIPHER_CAMELLIA128, {0, 0}, "CAMELLIA128",NULL,camellia128_oids,CAMELLIA_BLOCK_SIZE,128, sizeof(CAMELLIA_context),camellia_setkey,camellia_encrypt,camellia_decrypt }; gcry_cipher_spec_t _gcry_cipher_spec_camellia192 = { GCRY_CIPHER_CAMELLIA192, {0, 0}, "CAMELLIA192",NULL,camellia192_oids,CAMELLIA_BLOCK_SIZE,192, sizeof(CAMELLIA_context),camellia_setkey,camellia_encrypt,camellia_decrypt }; gcry_cipher_spec_t _gcry_cipher_spec_camellia256 = { GCRY_CIPHER_CAMELLIA256, {0, 0}, "CAMELLIA256",NULL,camellia256_oids,CAMELLIA_BLOCK_SIZE,256, sizeof(CAMELLIA_context),camellia_setkey,camellia_encrypt,camellia_decrypt }; diff --git a/cipher/chacha20-amd64-avx512.S b/cipher/chacha20-amd64-avx512.S index da24286e..8b4d7499 100644 --- a/cipher/chacha20-amd64-avx512.S +++ b/cipher/chacha20-amd64-avx512.S @@ -1,300 +1,300 @@ /* chacha20-amd64-avx512.S - AVX512 implementation of ChaCha20 cipher * * Copyright (C) 2022 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ /* * Based on D. J. Bernstein reference implementation at * http://cr.yp.to/chacha.html: * * chacha-regs.c version 20080118 * D. J. Bernstein * Public domain. 
*/ #ifdef __x86_64 #include #if defined(HAVE_GCC_INLINE_ASM_AVX512) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) .text #include "asm-common-amd64.h" /* register macros */ #define INPUT %rdi #define DST %rsi #define SRC %rdx #define NBLKS %rcx #define ROUND %eax /* vector registers */ #define X0 %zmm0 #define X1 %zmm1 #define X2 %zmm2 #define X3 %zmm3 #define X4 %zmm4 #define X5 %zmm5 #define X6 %zmm6 #define X7 %zmm7 #define X8 %zmm8 #define X9 %zmm9 #define X10 %zmm10 #define X11 %zmm11 #define X12 %zmm12 #define X13 %zmm13 #define X14 %zmm14 #define X15 %zmm15 #define TMP0 %zmm16 #define TMP1 %zmm17 #define COUNTER_ADD %zmm18 #define X12_SAVE %zmm19 #define X13_SAVE %zmm20 #define S0 %zmm21 #define S1 %zmm22 #define S2 %zmm23 #define S3 %zmm24 #define S4 %zmm25 #define S5 %zmm26 #define S6 %zmm27 #define S7 %zmm28 #define S8 %zmm29 #define S14 %zmm30 #define S15 %zmm31 /********************************************************************** helper macros **********************************************************************/ /* 4x4 32-bit integer matrix transpose */ #define transpose_4x4(x0,x1,x2,x3,t1,t2) \ vpunpckhdq x1, x0, t2; \ vpunpckldq x1, x0, x0; \ \ vpunpckldq x3, x2, t1; \ vpunpckhdq x3, x2, x2; \ \ vpunpckhqdq t1, x0, x1; \ vpunpcklqdq t1, x0, x0; \ \ vpunpckhqdq x2, t2, x3; \ vpunpcklqdq x2, t2, x2; /* 4x4 128-bit matrix transpose */ #define transpose_16byte_4x4(x0,x1,x2,x3,t1,t2) \ vshufi32x4 $0xee, x1, x0, t2; \ vshufi32x4 $0x44, x1, x0, x0; \ \ vshufi32x4 $0x44, x3, x2, t1; \ vshufi32x4 $0xee, x3, x2, x2; \ \ vshufi32x4 $0xdd, t1, x0, x1; \ vshufi32x4 $0x88, t1, x0, x0; \ \ vshufi32x4 $0xdd, x2, t2, x3; \ vshufi32x4 $0x88, x2, t2, x2; #define xor_src_dst_4x4(dst, src, offset, add, x0, x4, x8, x12) \ vpxord (offset + 0 * (add))(src), x0, x0; \ vpxord (offset + 1 * (add))(src), x4, x4; \ vpxord (offset + 2 * (add))(src), x8, x8; \ vpxord (offset + 3 * (add))(src), x12, x12; \ vmovdqu32 x0, (offset + 0 * (add))(dst); \ vmovdqu32 x4, (offset + 1 * (add))(dst); \ vmovdqu32 x8, (offset + 2 * (add))(dst); \ vmovdqu32 x12, (offset + 3 * (add))(dst); #define xor_src_dst(dst, src, offset, xreg) \ vpxord offset(src), xreg, xreg; \ vmovdqu32 xreg, offset(dst); #define clear_vec4(v0,v1,v2,v3) \ vpxord v0, v0, v0; \ vpxord v1, v1, v1; \ vpxord v2, v2, v2; \ vpxord v3, v3, v3; #define clear_zmm16_zmm31() \ clear_vec4(%xmm16, %xmm20, %xmm24, %xmm28); \ clear_vec4(%xmm17, %xmm21, %xmm25, %xmm29); \ clear_vec4(%xmm18, %xmm22, %xmm26, %xmm30); \ clear_vec4(%xmm19, %xmm23, %xmm27, %xmm31); /********************************************************************** 16-way chacha20 **********************************************************************/ #define ROTATE2(v1,v2,c) \ vprold $(c), v1, v1; \ vprold $(c), v2, v2; #define XOR(ds,s) \ vpxord s, ds, ds; #define PLUS(ds,s) \ vpaddd s, ds, ds; #define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2) \ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ ROTATE2(d1, d2, 16); \ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ ROTATE2(b1, b2, 12); \ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ ROTATE2(d1, d2, 8); \ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ ROTATE2(b1, b2, 7); .align 64 ELF(.type _gcry_chacha20_amd64_avx512_data,@object;) _gcry_chacha20_amd64_avx512_data: .Linc_counter: .byte 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 .Lone: .long 1,0,0,0 ELF(.size _gcry_chacha20_amd64_avx512_data,.-_gcry_chacha20_amd64_avx512_data) .align 16 .globl 
_gcry_chacha20_amd64_avx512_blocks16 ELF(.type _gcry_chacha20_amd64_avx512_blocks16,@function;) _gcry_chacha20_amd64_avx512_blocks16: /* input: * %rdi: input * %rsi: dst * %rdx: src * %rcx: nblks (multiple of 16) */ CFI_STARTPROC(); vpxord %xmm16, %xmm16, %xmm16; vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ vpmovzxbd .Linc_counter rRIP, COUNTER_ADD; /* Preload state */ vpbroadcastd (0 * 4)(INPUT), S0; vpbroadcastd (1 * 4)(INPUT), S1; vpbroadcastd (2 * 4)(INPUT), S2; vpbroadcastd (3 * 4)(INPUT), S3; vpbroadcastd (4 * 4)(INPUT), S4; vpbroadcastd (5 * 4)(INPUT), S5; vpbroadcastd (6 * 4)(INPUT), S6; vpbroadcastd (7 * 4)(INPUT), S7; vpbroadcastd (8 * 4)(INPUT), S8; vpbroadcastd (14 * 4)(INPUT), S14; vpbroadcastd (15 * 4)(INPUT), S15; .align 16 .Loop16: movl $20, ROUND; /* Construct counter vectors X12 and X13 */ vpbroadcastd (12 * 4)(INPUT), X12; vpbroadcastd (13 * 4)(INPUT), X13; vpaddd COUNTER_ADD, X12, X12; vpcmpud $6, X12, COUNTER_ADD, %k2; vpaddd .Lone rRIP {1to16}, X13, X13{%k2}; vmovdqa32 X12, X12_SAVE; vmovdqa32 X13, X13_SAVE; /* Load vectors */ vmovdqa32 S0, X0; vmovdqa32 S4, X4; vmovdqa32 S8, X8; vmovdqa32 S1, X1; vmovdqa32 S5, X5; vpbroadcastd (9 * 4)(INPUT), X9; QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13) vmovdqa32 S2, X2; vmovdqa32 S6, X6; vpbroadcastd (10 * 4)(INPUT), X10; vmovdqa32 S14, X14; vmovdqa32 S3, X3; vmovdqa32 S7, X7; vpbroadcastd (11 * 4)(INPUT), X11; vmovdqa32 S15, X15; /* Update counter */ addq $16, (12 * 4)(INPUT); jmp .Lround2_entry; .align 16 .Lround2: QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14) QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13) .Lround2_entry: subl $2, ROUND; QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15) QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12) jnz .Lround2; .Lround2_end: PLUS(X0, S0); PLUS(X1, S1); PLUS(X5, S5); PLUS(X6, S6); PLUS(X10, (10 * 4)(INPUT){1to16}); PLUS(X11, (11 * 4)(INPUT){1to16}); PLUS(X15, S15); PLUS(X12, X12_SAVE); QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14) PLUS(X2, S2); PLUS(X3, S3); PLUS(X4, S4); PLUS(X7, S7); transpose_4x4(X0, X1, X2, X3, TMP0, TMP1); transpose_4x4(X4, X5, X6, X7, TMP0, TMP1); PLUS(X8, S8); PLUS(X9, (9 * 4)(INPUT){1to16}); PLUS(X13, X13_SAVE); PLUS(X14, S14); transpose_4x4(X8, X9, X10, X11, TMP0, TMP1); transpose_4x4(X12, X13, X14, X15, TMP0, TMP1); transpose_16byte_4x4(X0, X4, X8, X12, TMP0, TMP1); xor_src_dst_4x4(DST, SRC, (64 * 0), (64 * 4), X0, X4, X8, X12); transpose_16byte_4x4(X1, X5, X9, X13, TMP0, TMP1); xor_src_dst_4x4(DST, SRC, (64 * 1), (64 * 4), X1, X5, X9, X13); transpose_16byte_4x4(X2, X6, X10, X14, TMP0, TMP1); xor_src_dst_4x4(DST, SRC, (64 * 2), (64 * 4), X2, X6, X10, X14); transpose_16byte_4x4(X3, X7, X11, X15, TMP0, TMP1); xor_src_dst_4x4(DST, SRC, (64 * 3), (64 * 4), X3, X7, X11, X15); subq $16, NBLKS; leaq (16 * 64)(SRC), SRC; leaq (16 * 64)(DST), DST; jnz .Loop16; /* clear the used vector registers */ clear_zmm16_zmm31(); - kmovd %eax, %k2; + kxord %k2, %k2, %k2; vzeroall; /* clears ZMM0-ZMM15 */ /* eax zeroed by round loop. 
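The counter-vector construction in .Loop16 above corresponds to the following scalar logic, where the vpcmpud $6 (unsigned greater-than) mask detects 32-bit wrap-around so a carry can propagate into state word 13. A hedged C sketch with made-up names:

#include <stdint.h>

/* Per-lane block counters for 16 parallel ChaCha20 states, mirroring
 * vpaddd COUNTER_ADD + vpcmpud $6 + the masked vpaddd of .Lone. */
static void
chacha_counter_lanes_sketch (const uint32_t state[16],
                             uint32_t x12[16], uint32_t x13[16])
{
  unsigned int i;

  for (i = 0; i < 16; i++)
    {
      x12[i] = state[12] + i;            /* may wrap modulo 2^32 */
      x13[i] = state[13] + (x12[i] < i); /* add the carry if it did */
    }
}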
*/ ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_chacha20_amd64_avx512_blocks16, .-_gcry_chacha20_amd64_avx512_blocks16;) #endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/ #endif /*__x86_64*/ diff --git a/cipher/poly1305-amd64-avx512.S b/cipher/poly1305-amd64-avx512.S index 48892777..72303e1e 100644 --- a/cipher/poly1305-amd64-avx512.S +++ b/cipher/poly1305-amd64-avx512.S @@ -1,1625 +1,1625 @@ /* ;; ;; Copyright (c) 2021-2022, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;; */ /* * From: * https://github.com/intel/intel-ipsec-mb/blob/f0cad21a644231c0f5d4af51f56061a5796343fb/lib/avx512/poly_fma_avx512.asm * * Conversion to GAS assembly and integration to libgcrypt * by Jussi Kivilinna */ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AVX512) #include "asm-common-amd64.h" .intel_syntax noprefix .text ELF(.type _gcry_poly1305_avx512_consts,@object) _gcry_poly1305_avx512_consts: .align 64 .Lmask_44: .quad 0xfffffffffff, 0xfffffffffff, 0xfffffffffff, 0xfffffffffff .quad 0xfffffffffff, 0xfffffffffff, 0xfffffffffff, 0xfffffffffff .align 64 .Lmask_42: .quad 0x3ffffffffff, 0x3ffffffffff, 0x3ffffffffff, 0x3ffffffffff .quad 0x3ffffffffff, 0x3ffffffffff, 0x3ffffffffff, 0x3ffffffffff .align 64 .Lhigh_bit: .quad 0x10000000000, 0x10000000000, 0x10000000000, 0x10000000000 .quad 0x10000000000, 0x10000000000, 0x10000000000, 0x10000000000 .Lbyte_len_to_mask_table: .short 0x0000, 0x0001, 0x0003, 0x0007 .short 0x000f, 0x001f, 0x003f, 0x007f .short 0x00ff, 0x01ff, 0x03ff, 0x07ff .short 0x0fff, 0x1fff, 0x3fff, 0x7fff .short 0xffff .align 64 .Lbyte64_len_to_mask_table: .quad 0x0000000000000000, 0x0000000000000001 .quad 0x0000000000000003, 0x0000000000000007 .quad 0x000000000000000f, 0x000000000000001f .quad 0x000000000000003f, 0x000000000000007f .quad 0x00000000000000ff, 0x00000000000001ff .quad 0x00000000000003ff, 0x00000000000007ff .quad 0x0000000000000fff, 0x0000000000001fff .quad 0x0000000000003fff, 0x0000000000007fff .quad 0x000000000000ffff, 0x000000000001ffff .quad 0x000000000003ffff, 0x000000000007ffff .quad 0x00000000000fffff, 0x00000000001fffff .quad 0x00000000003fffff, 0x00000000007fffff .quad 0x0000000000ffffff, 0x0000000001ffffff .quad 0x0000000003ffffff, 0x0000000007ffffff .quad 0x000000000fffffff, 0x000000001fffffff .quad 0x000000003fffffff, 0x000000007fffffff .quad 0x00000000ffffffff, 0x00000001ffffffff .quad 0x00000003ffffffff, 0x00000007ffffffff .quad 0x0000000fffffffff, 0x0000001fffffffff .quad 0x0000003fffffffff, 0x0000007fffffffff .quad 0x000000ffffffffff, 0x000001ffffffffff .quad 0x000003ffffffffff, 0x000007ffffffffff .quad 0x00000fffffffffff, 0x00001fffffffffff .quad 0x00003fffffffffff, 0x00007fffffffffff .quad 0x0000ffffffffffff, 0x0001ffffffffffff .quad 0x0003ffffffffffff, 0x0007ffffffffffff .quad 0x000fffffffffffff, 0x001fffffffffffff .quad 0x003fffffffffffff, 0x007fffffffffffff .quad 0x00ffffffffffffff, 0x01ffffffffffffff .quad 0x03ffffffffffffff, 0x07ffffffffffffff .quad 0x0fffffffffffffff, 0x1fffffffffffffff .quad 0x3fffffffffffffff, 0x7fffffffffffffff .quad 0xffffffffffffffff .Lqword_high_bit_mask: .short 0, 0x1, 0x5, 0x15, 0x55, 0x57, 0x5f, 0x7f, 0xff ELF(.size _gcry_poly1305_avx512_consts,.-_gcry_poly1305_avx512_consts) #define raxd eax #define rbxd ebx #define rcxd ecx #define rdxd edx #define rsid esi #define rdid edi #define rbpd ebp #define rspd esp #define __DWORD(X) X##d #define DWORD(R) __DWORD(R) #define arg1 rdi #define arg2 rsi #define arg3 rdx #define arg4 rcx #define job arg1 #define gp1 rsi #define gp2 rcx /* ;; don't use rdx and rax - they are needed for multiply operation */ #define gp3 rbp #define gp4 r8 #define gp5 r9 #define gp6 r10 #define gp7 r11 #define gp8 r12 #define gp9 r13 #define gp10 r14 #define gp11 r15 #define len gp11 #define msg gp10 #define POLY1305_BLOCK_SIZE 16 #define STACK_r_save 0 #define STACK_r_save_size (6 * 64) #define STACK_gpr_save (STACK_r_save + 
STACK_r_save_size) #define STACK_gpr_save_size (8 * 8) #define STACK_rsp_save (STACK_gpr_save + STACK_gpr_save_size) #define STACK_rsp_save_size (1 * 8) #define STACK_SIZE (STACK_rsp_save + STACK_rsp_save_size) #define A2_ZERO(...) /**/ #define A2_ZERO_INVERT(...) __VA_ARGS__ #define A2_NOT_ZERO(...) __VA_ARGS__ #define A2_NOT_ZERO_INVERT(...) /**/ #define clear_zmm(vec) vpxord vec, vec, vec /* ;; ============================================================================= ;; ============================================================================= ;; Computes hash for message length being multiple of block size ;; ============================================================================= ;; Combining 64-bit x 64-bit multiplication with reduction steps ;; ;; NOTES: ;; 1) A2 here is only two bits so anything above is subject of reduction. ;; Constant C1 = R1 + (R1 >> 2) simplifies multiply with less operations ;; 2) Magic 5x comes from mod 2^130-5 property and incorporating ;; reduction into multiply phase. ;; See "Cheating at modular arithmetic" and "Poly1305's prime: 2^130 - 5" ;; paragraphs at https://loup-vaillant.fr/tutorials/poly1305-design for more details. ;; ;; Flow of the code below is as follows: ;; ;; A2 A1 A0 ;; x R1 R0 ;; ----------------------------- ;; A2×R0 A1×R0 A0×R0 ;; + A0×R1 ;; + 5xA2xR1 5xA1xR1 ;; ----------------------------- ;; [0|L2L] [L1H|L1L] [L0H|L0L] ;; ;; Registers: T3:T2 T1:A0 ;; ;; Completing the multiply and adding (with carry) 3x128-bit limbs into ;; 192-bits again (3x64-bits): ;; A0 = L0L ;; A1 = L0H + L1L ;; T3 = L1H + L2L ; A0 [in/out] GPR with accumulator bits 63:0 ; A1 [in/out] GPR with accumulator bits 127:64 ; A2 [in/out] GPR with accumulator bits 195:128 ; R0 [in] GPR with R constant bits 63:0 ; R1 [in] GPR with R constant bits 127:64 ; C1 [in] C1 = R1 + (R1 >> 2) ; T1 [clobbered] GPR register ; T2 [clobbered] GPR register ; T3 [clobbered] GPR register ; GP_RAX [clobbered] RAX register ; GP_RDX [clobbered] RDX register ; IF_A2 [in] Used if input A2 is not 0 */ #define POLY1305_MUL_REDUCE(A0, A1, A2, R0, R1, C1, T1, T2, T3, GP_RAX, GP_RDX, IF_A2) \ /* T3:T2 = (A0 * R1) */ \ mov GP_RAX, R1; \ mul A0; \ mov T2, GP_RAX; \ mov GP_RAX, R0; \ mov T3, GP_RDX; \ \ /* T1:A0 = (A0 * R0) */ \ mul A0; \ mov A0, GP_RAX; /* A0 not used in other operations */ \ mov GP_RAX, R0; \ mov T1, GP_RDX; \ \ /* T3:T2 += (A1 * R0) */ \ mul A1; \ add T2, GP_RAX; \ mov GP_RAX, C1; \ adc T3, GP_RDX; \ \ /* T1:A0 += (A1 * R1x5) */ \ mul A1; \ IF_A2(mov A1, A2); /* use A1 for A2 */ \ add A0, GP_RAX; \ adc T1, GP_RDX; \ \ /* NOTE: A2 is clamped to 2-bits, */ \ /* R1/R0 is clamped to 60-bits, */ \ /* their product is less than 2^64. */ \ \ IF_A2(/* T3:T2 += (A2 * R1x5) */); \ IF_A2(imul A1, C1); \ IF_A2(add T2, A1); \ IF_A2(mov A1, T1); /* T1:A0 => A1:A0 */ \ IF_A2(adc T3, 0); \ \ IF_A2(/* T3:A1 += (A2 * R0) */); \ IF_A2(imul A2, R0); \ IF_A2(add A1, T2); \ IF_A2(adc T3, A2); \ \ IF_A2##_INVERT(/* If A2 == 0, just move and add T1-T2 to A1 */); \ IF_A2##_INVERT(mov A1, T1); \ IF_A2##_INVERT(add A1, T2); \ IF_A2##_INVERT(adc T3, 0); \ \ /* At this point, 3 64-bit limbs are in T3:A1:A0 */ \ /* T3 can span over more than 2 bits so final partial reduction step is needed. 
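   Why the step below suffices: the value is T3*2^128 + A1*2^64 + A0 and    \
   2^130 == 5 (mod 2^130 - 5), so T3*2^128 == (T3 & 3)*2^128 + 5*(T3 >> 2). \
   Since 5*(T3 >> 2) = (T3 & ~3) + (T3 >> 2), that sum is the k added below. \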
*/ \ \ /* Partial reduction (just to fit into 130 bits) */ \ /* A2 = T3 & 3 */ \ /* k = (T3 & ~3) + (T3 >> 2) */ \ /* Y x4 + Y x1 */ \ /* A2:A1:A0 += k */ \ \ /* Result will be in A2:A1:A0 */ \ mov T1, T3; \ mov DWORD(A2), DWORD(T3); \ and T1, ~3; \ shr T3, 2; \ and DWORD(A2), 3; \ add T1, T3; \ \ /* A2:A1:A0 += k (kept in T1) */ \ add A0, T1; \ adc A1, 0; \ adc DWORD(A2), 0 /* ;; ============================================================================= ;; ============================================================================= ;; Computes hash for 8 16-byte message blocks, ;; and adds new message blocks to accumulator. ;; ;; It first multiplies all 8 blocks with powers of R: ;; ;; a2 a1 a0 ;; × b2 b1 b0 ;; --------------------------------------- ;; a2×b0 a1×b0 a0×b0 ;; + a1×b1 a0×b1 5×a2×b1 ;; + a0×b2 5×a2×b2 5×a1×b2 ;; --------------------------------------- ;; p2 p1 p0 ;; ;; Then, it propagates the carry (higher bits after bit 43) from lower limbs into higher limbs, ;; multiplying by 5 in case of the carry of p2. ;; ;A0 [in/out] ZMM register containing 1st 44-bit limb of the 8 blocks ;A1 [in/out] ZMM register containing 2nd 44-bit limb of the 8 blocks ;A2 [in/out] ZMM register containing 3rd 44-bit limb of the 8 blocks ;R0 [in] ZMM register (R0) to include the 1st limb of R ;R1 [in] ZMM register (R1) to include the 2nd limb of R ;R2 [in] ZMM register (R2) to include the 3rd limb of R ;R1P [in] ZMM register (R1') to include the 2nd limb of R (multiplied by 5) ;R2P [in] ZMM register (R2') to include the 3rd limb of R (multiplied by 5) ;P0_L [clobbered] ZMM register to contain p[0] of the 8 blocks ;P0_H [clobbered] ZMM register to contain p[0] of the 8 blocks ;P1_L [clobbered] ZMM register to contain p[1] of the 8 blocks ;P1_H [clobbered] ZMM register to contain p[1] of the 8 blocks ;P2_L [clobbered] ZMM register to contain p[2] of the 8 blocks ;P2_H [clobbered] ZMM register to contain p[2] of the 8 blocks ;ZTMP1 [clobbered] Temporary ZMM register */ #define POLY1305_MUL_REDUCE_VEC(A0, A1, A2, R0, R1, R2, R1P, R2P, P0_L, P0_H, \ P1_L, P1_H, P2_L, P2_H, ZTMP1) \ /* ;; Reset accumulator */ \ vpxorq P0_L, P0_L, P0_L; \ vpxorq P0_H, P0_H, P0_H; \ vpxorq P1_L, P1_L, P1_L; \ vpxorq P1_H, P1_H, P1_H; \ vpxorq P2_L, P2_L, P2_L; \ vpxorq P2_H, P2_H, P2_H; \ \ /* ; Reset accumulator and calculate products */ \ vpmadd52luq P0_L, A2, R1P; \ vpmadd52huq P0_H, A2, R1P; \ vpmadd52luq P1_L, A2, R2P; \ vpmadd52huq P1_H, A2, R2P; \ vpmadd52luq P2_L, A2, R0; \ vpmadd52huq P2_H, A2, R0; \ \ vpmadd52luq P1_L, A0, R1; \ vpmadd52huq P1_H, A0, R1; \ vpmadd52luq P2_L, A0, R2; \ vpmadd52huq P2_H, A0, R2; \ vpmadd52luq P0_L, A0, R0; \ vpmadd52huq P0_H, A0, R0; \ \ vpmadd52luq P0_L, A1, R2P; \ vpmadd52huq P0_H, A1, R2P; \ vpmadd52luq P1_L, A1, R0; \ vpmadd52huq P1_H, A1, R0; \ vpmadd52luq P2_L, A1, R1; \ vpmadd52huq P2_H, A1, R1; \ \ /* ; Carry propagation (first pass) */ \ vpsrlq ZTMP1, P0_L, 44; \ vpandq A0, P0_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \ vpsllq P0_H, P0_H, 8; \ vpaddq P0_H, P0_H, ZTMP1; \ vpaddq P1_L, P1_L, P0_H; \ vpandq A1, P1_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \ vpsrlq ZTMP1, P1_L, 44; \ vpsllq P1_H, P1_H, 8; \ vpaddq P1_H, P1_H, ZTMP1; \ vpaddq P2_L, P2_L, P1_H; \ vpandq A2, P2_L, [.Lmask_42 ADD_RIP]; /* ; Clear top 22 bits */ \ vpsrlq ZTMP1, P2_L, 42; \ vpsllq P2_H, P2_H, 10; \ vpaddq P2_H, P2_H, ZTMP1; \ \ /* ; Carry propagation (second pass) */ \ \ /* ; Multiply by 5 the highest bits (above 130 bits) */ \ vpaddq A0, A0, P2_H; \ vpsllq P2_H, P2_H, 2; \ vpaddq 
A0, A0, P2_H; \ vpsrlq ZTMP1, A0, 44; \ vpandq A0, A0, [.Lmask_44 ADD_RIP]; \ vpaddq A1, A1, ZTMP1; /* ;; ============================================================================= ;; ============================================================================= ;; Computes hash for 16 16-byte message blocks, ;; and adds new message blocks to accumulator, ;; interleaving this computation with the loading and splatting ;; of new data. ;; ;; It first multiplies all 16 blocks with powers of R (8 blocks from A0-A2 ;; and 8 blocks from B0-B2, multiplied by R0-R2) ;; ;; a2 a1 a0 ;; × b2 b1 b0 ;; --------------------------------------- ;; a2×b0 a1×b0 a0×b0 ;; + a1×b1 a0×b1 5×a2×b1 ;; + a0×b2 5×a2×b2 5×a1×b2 ;; --------------------------------------- ;; p2 p1 p0 ;; ;; Then, it propagates the carry (higher bits after bit 43) ;; from lower limbs into higher limbs, ;; multiplying by 5 in case of the carry of p2, and adds ;; the results to A0-A2 and B0-B2. ;; ;; ============================================================================= ;A0 [in/out] ZMM register containing 1st 44-bit limb of blocks 1-8 ;A1 [in/out] ZMM register containing 2nd 44-bit limb of blocks 1-8 ;A2 [in/out] ZMM register containing 3rd 44-bit limb of blocks 1-8 ;B0 [in/out] ZMM register containing 1st 44-bit limb of blocks 9-16 ;B1 [in/out] ZMM register containing 2nd 44-bit limb of blocks 9-16 ;B2 [in/out] ZMM register containing 3rd 44-bit limb of blocks 9-16 ;R0 [in] ZMM register (R0) to include the 1st limb of R ;R1 [in] ZMM register (R1) to include the 2nd limb of R ;R2 [in] ZMM register (R2) to include the 3rd limb of R ;R1P [in] ZMM register (R1') to include the 2nd limb of R (multiplied by 5) ;R2P [in] ZMM register (R2') to include the 3rd limb of R (multiplied by 5) ;P0_L [clobbered] ZMM register to contain p[0] of the 8 blocks 1-8 ;P0_H [clobbered] ZMM register to contain p[0] of the 8 blocks 1-8 ;P1_L [clobbered] ZMM register to contain p[1] of the 8 blocks 1-8 ;P1_H [clobbered] ZMM register to contain p[1] of the 8 blocks 1-8 ;P2_L [clobbered] ZMM register to contain p[2] of the 8 blocks 1-8 ;P2_H [clobbered] ZMM register to contain p[2] of the 8 blocks 1-8 ;Q0_L [clobbered] ZMM register to contain p[0] of the 8 blocks 9-16 ;Q0_H [clobbered] ZMM register to contain p[0] of the 8 blocks 9-16 ;Q1_L [clobbered] ZMM register to contain p[1] of the 8 blocks 9-16 ;Q1_H [clobbered] ZMM register to contain p[1] of the 8 blocks 9-16 ;Q2_L [clobbered] ZMM register to contain p[2] of the 8 blocks 9-16 ;Q2_H [clobbered] ZMM register to contain p[2] of the 8 blocks 9-16 ;ZTMP1 [clobbered] Temporary ZMM register ;ZTMP2 [clobbered] Temporary ZMM register ;ZTMP3 [clobbered] Temporary ZMM register ;ZTMP4 [clobbered] Temporary ZMM register ;ZTMP5 [clobbered] Temporary ZMM register ;ZTMP6 [clobbered] Temporary ZMM register ;ZTMP7 [clobbered] Temporary ZMM register ;ZTMP8 [clobbered] Temporary ZMM register ;ZTMP9 [clobbered] Temporary ZMM register ;MSG [in/out] Pointer to message ;LEN [in/out] Length left of message */ #define POLY1305_MSG_MUL_REDUCE_VEC16(A0, A1, A2, B0, B1, B2, R0, R1, R2, R1P, \ R2P, P0_L, P0_H, P1_L, P1_H, P2_L, P2_H, \ Q0_L, Q0_H, Q1_L, Q1_H, Q2_L, Q2_H, \ ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, \ ZTMP6, ZTMP7, ZTMP8, ZTMP9, MSG, LEN) \ /* ;; Reset accumulator */ \ vpxorq P0_L, P0_L, P0_L; \ vpxorq P0_H, P0_H, P0_H; \ vpxorq P1_L, P1_L, P1_L; \ vpxorq P1_H, P1_H, P1_H; \ vpxorq P2_L, P2_L, P2_L; \ vpxorq P2_H, P2_H, P2_H; \ vpxorq Q0_L, Q0_L, Q0_L; \ vpxorq Q0_H, Q0_H, Q0_H; \ vpxorq Q1_L, Q1_L, Q1_L; \ vpxorq Q1_H, 
Q1_H, Q1_H; \ vpxorq Q2_L, Q2_L, Q2_L; \ vpxorq Q2_H, Q2_H, Q2_H; \ \ /* ;; This code interleaves hash computation with input loading/splatting */ \ \ /* ; Calculate products */ \ vpmadd52luq P0_L, A2, R1P; \ vpmadd52huq P0_H, A2, R1P; \ /* ;; input loading of new blocks */ \ add MSG, POLY1305_BLOCK_SIZE*16; \ sub LEN, POLY1305_BLOCK_SIZE*16; \ \ vpmadd52luq Q0_L, B2, R1P; \ vpmadd52huq Q0_H, B2, R1P; \ \ vpmadd52luq P1_L, A2, R2P; \ vpmadd52huq P1_H, A2, R2P; \ /* ; Load next block of data (128 bytes) */ \ vmovdqu64 ZTMP5, [MSG]; \ vmovdqu64 ZTMP2, [MSG + 64]; \ \ vpmadd52luq Q1_L, B2, R2P; \ vpmadd52huq Q1_H, B2, R2P; \ \ /* ; Interleave new blocks of data */ \ vpunpckhqdq ZTMP3, ZTMP5, ZTMP2; \ vpunpcklqdq ZTMP5, ZTMP5, ZTMP2; \ \ vpmadd52luq P0_L, A0, R0; \ vpmadd52huq P0_H, A0, R0; \ /* ; Highest 42-bit limbs of new blocks */ \ vpsrlq ZTMP6, ZTMP3, 24; \ vporq ZTMP6, ZTMP6, [.Lhigh_bit ADD_RIP]; /* ; Add 2^128 to all 8 final qwords of the message */ \ \ vpmadd52luq Q0_L, B0, R0; \ vpmadd52huq Q0_H, B0, R0; \ \ /* ; Middle 44-bit limbs of new blocks */ \ vpsrlq ZTMP2, ZTMP5, 44; \ vpsllq ZTMP4, ZTMP3, 20; \ \ vpmadd52luq P2_L, A2, R0; \ vpmadd52huq P2_H, A2, R0; \ vpternlogq ZTMP2, ZTMP4, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \ \ /* ; Lowest 44-bit limbs of new blocks */ \ vpandq ZTMP5, ZTMP5, [.Lmask_44 ADD_RIP]; \ \ vpmadd52luq Q2_L, B2, R0; \ vpmadd52huq Q2_H, B2, R0; \ \ /* ; Load next block of data (128 bytes) */ \ vmovdqu64 ZTMP8, [MSG + 64*2]; \ vmovdqu64 ZTMP9, [MSG + 64*3]; \ \ vpmadd52luq P1_L, A0, R1; \ vpmadd52huq P1_H, A0, R1; \ /* ; Interleave new blocks of data */ \ vpunpckhqdq ZTMP3, ZTMP8, ZTMP9; \ vpunpcklqdq ZTMP8, ZTMP8, ZTMP9; \ \ vpmadd52luq Q1_L, B0, R1; \ vpmadd52huq Q1_H, B0, R1; \ \ /* ; Highest 42-bit limbs of new blocks */ \ vpsrlq ZTMP7, ZTMP3, 24; \ vporq ZTMP7, ZTMP7, [.Lhigh_bit ADD_RIP]; /* ; Add 2^128 to all 8 final qwords of the message */ \ \ vpmadd52luq P0_L, A1, R2P; \ vpmadd52huq P0_H, A1, R2P; \ \ /* ; Middle 44-bit limbs of new blocks */ \ vpsrlq ZTMP9, ZTMP8, 44; \ vpsllq ZTMP4, ZTMP3, 20; \ \ vpmadd52luq Q0_L, B1, R2P; \ vpmadd52huq Q0_H, B1, R2P; \ \ vpternlogq ZTMP9, ZTMP4, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \ \ /* ; Lowest 44-bit limbs of new blocks */ \ vpandq ZTMP8, ZTMP8, [.Lmask_44 ADD_RIP]; \ \ vpmadd52luq P2_L, A0, R2; \ vpmadd52huq P2_H, A0, R2; \ /* ; Carry propagation (first pass) */ \ vpsrlq ZTMP1, P0_L, 44; \ vpsllq P0_H, P0_H, 8; \ vpmadd52luq Q2_L, B0, R2; \ vpmadd52huq Q2_H, B0, R2; \ \ vpsrlq ZTMP3, Q0_L, 44; \ vpsllq Q0_H, Q0_H, 8; \ \ vpmadd52luq P1_L, A1, R0; \ vpmadd52huq P1_H, A1, R0; \ /* ; Carry propagation (first pass) - continue */ \ vpandq A0, P0_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \ vpaddq P0_H, P0_H, ZTMP1; \ vpmadd52luq Q1_L, B1, R0; \ vpmadd52huq Q1_H, B1, R0; \ \ vpandq B0, Q0_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \ vpaddq Q0_H, Q0_H, ZTMP3; \ \ vpmadd52luq P2_L, A1, R1; \ vpmadd52huq P2_H, A1, R1; \ /* ; Carry propagation (first pass) - continue */ \ vpaddq P1_L, P1_L, P0_H; \ vpsllq P1_H, P1_H, 8; \ vpsrlq ZTMP1, P1_L, 44; \ vpmadd52luq Q2_L, B1, R1; \ vpmadd52huq Q2_H, B1, R1; \ \ vpandq A1, P1_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \ vpaddq Q1_L, Q1_L, Q0_H; \ vpsllq Q1_H, Q1_H, 8; \ vpsrlq ZTMP3, Q1_L, 44; \ vpandq B1, Q1_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \ \ vpaddq P2_L, P2_L, P1_H; /* ; P2_L += P1_H + P1_L[63:44] */ \ vpaddq P2_L, P2_L, ZTMP1; \ vpandq A2, P2_L, [.Lmask_42 ADD_RIP]; /* ; Clear top 22 bits */ \ vpaddq A2, A2, 
ZTMP6; /* ; Add highest bits from new blocks to accumulator */ \ vpsrlq ZTMP1, P2_L, 42; \ vpsllq P2_H, P2_H, 10; \ vpaddq P2_H, P2_H, ZTMP1; \ \ vpaddq Q2_L, Q2_L, Q1_H; /* ; Q2_L += P1_H + P1_L[63:44] */ \ vpaddq Q2_L, Q2_L, ZTMP3; \ vpandq B2, Q2_L, [.Lmask_42 ADD_RIP]; /* ; Clear top 22 bits */ \ vpaddq B2, B2, ZTMP7; /* ; Add highest bits from new blocks to accumulator */ \ vpsrlq ZTMP3, Q2_L, 42; \ vpsllq Q2_H, Q2_H, 10; \ vpaddq Q2_H, Q2_H, ZTMP3; \ \ /* ; Carry propagation (second pass) */ \ /* ; Multiply by 5 the highest bits (above 130 bits) */ \ vpaddq A0, A0, P2_H; \ vpsllq P2_H, P2_H, 2; \ vpaddq A0, A0, P2_H; \ vpaddq B0, B0, Q2_H; \ vpsllq Q2_H, Q2_H, 2; \ vpaddq B0, B0, Q2_H; \ \ vpsrlq ZTMP1, A0, 44; \ vpandq A0, A0, [.Lmask_44 ADD_RIP]; \ vpaddq A0, A0, ZTMP5; /* ; Add low 42-bit bits from new blocks to accumulator */ \ vpaddq A1, A1, ZTMP2; /* ; Add medium 42-bit bits from new blocks to accumulator */ \ vpaddq A1, A1, ZTMP1; \ vpsrlq ZTMP3, B0, 44; \ vpandq B0, B0, [.Lmask_44 ADD_RIP]; \ vpaddq B0, B0, ZTMP8; /* ; Add low 42-bit bits from new blocks to accumulator */ \ vpaddq B1, B1, ZTMP9; /* ; Add medium 42-bit bits from new blocks to accumulator */ \ vpaddq B1, B1, ZTMP3 /* ;; ============================================================================= ;; ============================================================================= ;; Computes hash for 16 16-byte message blocks. ;; ;; It first multiplies all 16 blocks with powers of R (8 blocks from A0-A2 ;; and 8 blocks from B0-B2, multiplied by R0-R2 and S0-S2) ;; ;; ;; a2 a1 a0 ;; × b2 b1 b0 ;; --------------------------------------- ;; a2×b0 a1×b0 a0×b0 ;; + a1×b1 a0×b1 5×a2×b1 ;; + a0×b2 5×a2×b2 5×a1×b2 ;; --------------------------------------- ;; p2 p1 p0 ;; ;; Then, it propagates the carry (higher bits after bit 43) from lower limbs into higher limbs, ;; multiplying by 5 in case of the carry of p2. 
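;; Note on the primed constants: the limbs sit at bit positions 0, 44 and 88, so a
;; wrapped cross product such as a2*b1 has weight 2^(88+44) = 2^132 = 4*2^130 == 4*5
;; (mod 2^130-5); this is why R1P/R2P (and S1P/S2P) hold 4*5*R1 and 4*5*R2.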
;; ;; ============================================================================= ;A0 [in/out] ZMM register containing 1st 44-bit limb of the 8 blocks ;A1 [in/out] ZMM register containing 2nd 44-bit limb of the 8 blocks ;A2 [in/out] ZMM register containing 3rd 44-bit limb of the 8 blocks ;B0 [in/out] ZMM register containing 1st 44-bit limb of the 8 blocks ;B1 [in/out] ZMM register containing 2nd 44-bit limb of the 8 blocks ;B2 [in/out] ZMM register containing 3rd 44-bit limb of the 8 blocks ;R0 [in] ZMM register (R0) to include the 1st limb in IDX ;R1 [in] ZMM register (R1) to include the 2nd limb in IDX ;R2 [in] ZMM register (R2) to include the 3rd limb in IDX ;R1P [in] ZMM register (R1') to include the 2nd limb (multiplied by 5) in IDX ;R2P [in] ZMM register (R2') to include the 3rd limb (multiplied by 5) in IDX ;S0 [in] ZMM register (R0) to include the 1st limb in IDX ;S1 [in] ZMM register (R1) to include the 2nd limb in IDX ;S2 [in] ZMM register (R2) to include the 3rd limb in IDX ;S1P [in] ZMM register (R1') to include the 2nd limb (multiplied by 5) in IDX ;S2P [in] ZMM register (R2') to include the 3rd limb (multiplied by 5) in IDX ;P0_L [clobbered] ZMM register to contain p[0] of the 8 blocks ;P0_H [clobbered] ZMM register to contain p[0] of the 8 blocks ;P1_L [clobbered] ZMM register to contain p[1] of the 8 blocks ;P1_H [clobbered] ZMM register to contain p[1] of the 8 blocks ;P2_L [clobbered] ZMM register to contain p[2] of the 8 blocks ;P2_H [clobbered] ZMM register to contain p[2] of the 8 blocks ;Q0_L [clobbered] ZMM register to contain p[0] of the 8 blocks ;Q0_H [clobbered] ZMM register to contain p[0] of the 8 blocks ;Q1_L [clobbered] ZMM register to contain p[1] of the 8 blocks ;Q1_H [clobbered] ZMM register to contain p[1] of the 8 blocks ;Q2_L [clobbered] ZMM register to contain p[2] of the 8 blocks ;Q2_H [clobbered] ZMM register to contain p[2] of the 8 blocks ;ZTMP1 [clobbered] Temporary ZMM register ;ZTMP2 [clobbered] Temporary ZMM register */ #define POLY1305_MUL_REDUCE_VEC16(A0, A1, A2, B0, B1, B2, R0, R1, R2, R1P, R2P,\ S0, S1, S2, S1P, S2P, P0_L, P0_H, P1_L, P1_H,\ P2_L, P2_H, Q0_L, Q0_H, Q1_L, Q1_H, Q2_L,\ Q2_H, ZTMP1, ZTMP2) \ /* ;; Reset accumulator */ \ vpxorq P0_L, P0_L, P0_L; \ vpxorq P0_H, P0_H, P0_H; \ vpxorq P1_L, P1_L, P1_L; \ vpxorq P1_H, P1_H, P1_H; \ vpxorq P2_L, P2_L, P2_L; \ vpxorq P2_H, P2_H, P2_H; \ vpxorq Q0_L, Q0_L, Q0_L; \ vpxorq Q0_H, Q0_H, Q0_H; \ vpxorq Q1_L, Q1_L, Q1_L; \ vpxorq Q1_H, Q1_H, Q1_H; \ vpxorq Q2_L, Q2_L, Q2_L; \ vpxorq Q2_H, Q2_H, Q2_H; \ \ /* ;; This code interleaves hash computation with input loading/splatting */ \ \ /* ; Calculate products */ \ vpmadd52luq P0_L, A2, R1P; \ vpmadd52huq P0_H, A2, R1P; \ \ vpmadd52luq Q0_L, B2, S1P; \ vpmadd52huq Q0_H, B2, S1P; \ \ vpmadd52luq P1_L, A2, R2P; \ vpmadd52huq P1_H, A2, R2P; \ \ vpmadd52luq Q1_L, B2, S2P; \ vpmadd52huq Q1_H, B2, S2P; \ \ vpmadd52luq P0_L, A0, R0; \ vpmadd52huq P0_H, A0, R0; \ \ vpmadd52luq Q0_L, B0, S0; \ vpmadd52huq Q0_H, B0, S0; \ \ vpmadd52luq P2_L, A2, R0; \ vpmadd52huq P2_H, A2, R0; \ vpmadd52luq Q2_L, B2, S0; \ vpmadd52huq Q2_H, B2, S0; \ \ vpmadd52luq P1_L, A0, R1; \ vpmadd52huq P1_H, A0, R1; \ vpmadd52luq Q1_L, B0, S1; \ vpmadd52huq Q1_H, B0, S1; \ \ vpmadd52luq P0_L, A1, R2P; \ vpmadd52huq P0_H, A1, R2P; \ \ vpmadd52luq Q0_L, B1, S2P; \ vpmadd52huq Q0_H, B1, S2P; \ \ vpmadd52luq P2_L, A0, R2; \ vpmadd52huq P2_H, A0, R2; \ \ vpmadd52luq Q2_L, B0, S2; \ vpmadd52huq Q2_H, B0, S2; \ \ /* ; Carry propagation (first pass) */ \ vpsrlq ZTMP1, P0_L, 44; \ vpsllq 
P0_H, P0_H, 8; \ vpsrlq ZTMP2, Q0_L, 44; \ vpsllq Q0_H, Q0_H, 8; \ \ vpmadd52luq P1_L, A1, R0; \ vpmadd52huq P1_H, A1, R0; \ vpmadd52luq Q1_L, B1, S0; \ vpmadd52huq Q1_H, B1, S0; \ \ /* ; Carry propagation (first pass) - continue */ \ vpandq A0, P0_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \ vpaddq P0_H, P0_H, ZTMP1; \ vpandq B0, Q0_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \ vpaddq Q0_H, Q0_H, ZTMP2; \ \ vpmadd52luq P2_L, A1, R1; \ vpmadd52huq P2_H, A1, R1; \ vpmadd52luq Q2_L, B1, S1; \ vpmadd52huq Q2_H, B1, S1; \ \ /* ; Carry propagation (first pass) - continue */ \ vpaddq P1_L, P1_L, P0_H; \ vpsllq P1_H, P1_H, 8; \ vpsrlq ZTMP1, P1_L, 44; \ vpandq A1, P1_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \ vpaddq Q1_L, Q1_L, Q0_H; \ vpsllq Q1_H, Q1_H, 8; \ vpsrlq ZTMP2, Q1_L, 44; \ vpandq B1, Q1_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \ \ vpaddq P2_L, P2_L, P1_H; /* ; P2_L += P1_H + P1_L[63:44] */ \ vpaddq P2_L, P2_L, ZTMP1; \ vpandq A2, P2_L, [.Lmask_42 ADD_RIP]; /* ; Clear top 22 bits */ \ vpsrlq ZTMP1, P2_L, 42; \ vpsllq P2_H, P2_H, 10; \ vpaddq P2_H, P2_H, ZTMP1; \ \ vpaddq Q2_L, Q2_L, Q1_H; /* ; Q2_L += P1_H + P1_L[63:44] */ \ vpaddq Q2_L, Q2_L, ZTMP2; \ vpandq B2, Q2_L, [.Lmask_42 ADD_RIP]; /* ; Clear top 22 bits */ \ vpsrlq ZTMP2, Q2_L, 42; \ vpsllq Q2_H, Q2_H, 10; \ vpaddq Q2_H, Q2_H, ZTMP2; \ \ /* ; Carry propagation (second pass) */ \ /* ; Multiply by 5 the highest bits (above 130 bits) */ \ vpaddq A0, A0, P2_H; \ vpsllq P2_H, P2_H, 2; \ vpaddq A0, A0, P2_H; \ vpaddq B0, B0, Q2_H; \ vpsllq Q2_H, Q2_H, 2; \ vpaddq B0, B0, Q2_H; \ \ vpsrlq ZTMP1, A0, 44; \ vpandq A0, A0, [.Lmask_44 ADD_RIP]; \ vpaddq A1, A1, ZTMP1; \ vpsrlq ZTMP2, B0, 44; \ vpandq B0, B0, [.Lmask_44 ADD_RIP]; \ vpaddq B1, B1, ZTMP2; /* ;; ============================================================================= ;; ============================================================================= ;; Shuffle data blocks, so they match the right power of R. ;; Powers of R are in this order: R^8 R^4 R^7 R^3 R^6 R^2 R^5 R ;; Data blocks are coming in this order: A0 A4 A1 A5 A2 A6 A3 A7 ;; Generally the computation is: A0*R^8 + A1*R^7 + A2*R^6 + A3*R^5 + ;; A4*R^4 + A5*R^3 + A6*R^2 + A7*R ;; When there are less data blocks, less powers of R are used, so data needs to ;; be shuffled. 
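;; (SHUFFLE_DATA_BLOCKS_GENERIC below swaps the two qwords inside the 128-bit lanes
;; selected by the k-mask, using vpshufd with imm 0x4E, and then rotates whole
;; 128-bit lanes with vshufi64x2, so each remaining block meets its matching power.)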
Example: if 4 blocks are left, only A0-A3 are available and only ;; R-R^4 are used (A0*R^4 + A1*R^3 + A2*R^2 + A3*R), so A0-A3 need to be shifted ;; ============================================================================= ;A_L [in/out] 0-43 bits of input data ;A_M [in/out] 44-87 bits of input data ;A_H [in/out] 88-129 bits of input data ;TMP [clobbered] Temporary GP register ;N_BLOCKS [in] Number of remaining input blocks */ #define SHUFFLE_DATA_SMASK_1 0x39 #define SHUFFLE_DATA_KMASK_1 0xffff #define SHUFFLE_DATA_SMASK_2 0x4E #define SHUFFLE_DATA_KMASK_2 0xffff #define SHUFFLE_DATA_SMASK_3 0x93 #define SHUFFLE_DATA_KMASK_3 0xffff #define SHUFFLE_DATA_KMASK_4 0xffff #define SHUFFLE_DATA_SMASK_5 0x39 #define SHUFFLE_DATA_KMASK_5 0xfff0 #define SHUFFLE_DATA_SMASK_6 0x4E #define SHUFFLE_DATA_KMASK_6 0xff00 #define SHUFFLE_DATA_SMASK_7 0x93 #define SHUFFLE_DATA_KMASK_7 0xf000 #define SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, N_BLOCKS) \ mov TMP, SHUFFLE_DATA_KMASK_##N_BLOCKS; \ kmovq k1, TMP; \ vpshufd A_L{k1}, A_L, 0x4E; \ vpshufd A_M{k1}, A_M, 0x4E; \ vpshufd A_H{k1}, A_H, 0x4E; \ vshufi64x2 A_L, A_L, A_L, SHUFFLE_DATA_SMASK_##N_BLOCKS; \ vshufi64x2 A_M, A_M, A_M, SHUFFLE_DATA_SMASK_##N_BLOCKS; \ vshufi64x2 A_H, A_H, A_H, SHUFFLE_DATA_SMASK_##N_BLOCKS #define SHUFFLE_DATA_BLOCKS_1(A_L, A_M, A_H, TMP) \ SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, 1) #define SHUFFLE_DATA_BLOCKS_2(A_L, A_M, A_H, TMP) \ SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, 2) #define SHUFFLE_DATA_BLOCKS_3(A_L, A_M, A_H, TMP) \ SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, 3) #define SHUFFLE_DATA_BLOCKS_4(A_L, A_M, A_H, TMP) \ mov TMP, SHUFFLE_DATA_KMASK_4; \ kmovq k1, TMP; \ vpshufd A_L{k1}, A_L, 0x4E; \ vpshufd A_M{k1}, A_M, 0x4E; \ vpshufd A_H{k1}, A_H, 0x4E; #define SHUFFLE_DATA_BLOCKS_5(A_L, A_M, A_H, TMP) \ SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, 5) #define SHUFFLE_DATA_BLOCKS_6(A_L, A_M, A_H, TMP) \ SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, 6) #define SHUFFLE_DATA_BLOCKS_7(A_L, A_M, A_H, TMP) \ SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, 7) /* ;; ============================================================================= ;; ============================================================================= ;; Computes hash for message length being multiple of block size ;; ============================================================================= ;MSG [in/out] GPR pointer to input message (updated) ;LEN [in/out] GPR in: length in bytes / out: length mod 16 ;A0 [in/out] accumulator bits 63..0 ;A1 [in/out] accumulator bits 127..64 ;A2 [in/out] accumulator bits 195..128 ;R0 [in] R constant bits 63..0 ;R1 [in] R constant bits 127..64 ;T0 [clobbered] GPR register ;T1 [clobbered] GPR register ;T2 [clobbered] GPR register ;T3 [clobbered] GPR register ;GP_RAX [clobbered] RAX register ;GP_RDX [clobbered] RDX register */ #define POLY1305_BLOCKS(MSG, LEN, A0, A1, A2, R0, R1, T0, T1, T2, T3, \ GP_RAX, GP_RDX) \ /* ; Minimum of 256 bytes to run vectorized code */ \ cmp LEN, POLY1305_BLOCK_SIZE*16; \ jb .L_final_loop; \ \ /* ; Spread accumulator into 44-bit limbs in quadwords */ \ mov T0, A0; \ and T0, [.Lmask_44 ADD_RIP]; /* ;; First limb (A[43:0]) */ \ vmovq xmm5, T0; \ \ mov T0, A1; \ shrd A0, T0, 44; \ and A0, [.Lmask_44 ADD_RIP]; /* ;; Second limb (A[77:52]) */ \ vmovq xmm6, A0; \ \ shrd A1, A2, 24; \ and A1, [.Lmask_42 ADD_RIP]; /* ;; Third limb (A[129:88]) */ \ vmovq xmm7, A1; \ \ /* ; Load first block of data (128 bytes) */ \ vmovdqu64 zmm0, [MSG]; \ vmovdqu64 zmm1, [MSG + 64]; \ \ /* 
; Interleave the data to form 44-bit limbs */ \ /* ; */ \ /* ; zmm13 to have bits 0-43 of all 8 blocks in 8 qwords */ \ /* ; zmm14 to have bits 87-44 of all 8 blocks in 8 qwords */ \ /* ; zmm15 to have bits 127-88 of all 8 blocks in 8 qwords */ \ vpunpckhqdq zmm15, zmm0, zmm1; \ vpunpcklqdq zmm13, zmm0, zmm1; \ \ vpsrlq zmm14, zmm13, 44; \ vpsllq zmm18, zmm15, 20; \ vpternlogq zmm14, zmm18, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \ \ vpandq zmm13, zmm13, [.Lmask_44 ADD_RIP]; \ vpsrlq zmm15, zmm15, 24; \ \ /* ; Add 2^128 to all 8 final qwords of the message */ \ vporq zmm15, zmm15, [.Lhigh_bit ADD_RIP]; \ \ vpaddq zmm13, zmm13, zmm5; \ vpaddq zmm14, zmm14, zmm6; \ vpaddq zmm15, zmm15, zmm7; \ \ /* ; Load next blocks of data (128 bytes) */ \ vmovdqu64 zmm0, [MSG + 64*2]; \ vmovdqu64 zmm1, [MSG + 64*3]; \ \ /* ; Interleave the data to form 44-bit limbs */ \ /* ; */ \ /* ; zmm13 to have bits 0-43 of all 8 blocks in 8 qwords */ \ /* ; zmm14 to have bits 87-44 of all 8 blocks in 8 qwords */ \ /* ; zmm15 to have bits 127-88 of all 8 blocks in 8 qwords */ \ vpunpckhqdq zmm18, zmm0, zmm1; \ vpunpcklqdq zmm16, zmm0, zmm1; \ \ vpsrlq zmm17, zmm16, 44; \ vpsllq zmm19, zmm18, 20; \ vpternlogq zmm17, zmm19, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \ \ vpandq zmm16, zmm16, [.Lmask_44 ADD_RIP]; \ vpsrlq zmm18, zmm18, 24; \ \ /* ; Add 2^128 to all 8 final qwords of the message */ \ vporq zmm18, zmm18, [.Lhigh_bit ADD_RIP]; \ \ /* ; Use memory in stack to save powers of R, before loading them into ZMM registers */ \ /* ; The first 16*8 bytes will contain the 16 bytes of the 8 powers of R */ \ /* ; The last 64 bytes will contain the last 2 bits of powers of R, spread in 8 qwords, */ \ /* ; to be OR'd with the highest qwords (in zmm26) */ \ vmovq xmm3, R0; \ vpinsrq xmm3, xmm3, R1, 1; \ vinserti32x4 zmm1, zmm1, xmm3, 3; \ \ vpxorq zmm0, zmm0, zmm0; \ vpxorq zmm2, zmm2, zmm2; \ \ /* ; Calculate R^2 */ \ mov T0, R1; \ shr T0, 2; \ add T0, R1; /* ;; T0 = R1 + (R1 >> 2) */ \ \ mov A0, R0; \ mov A1, R1; \ \ POLY1305_MUL_REDUCE(A0, A1, A2, R0, R1, T0, T1, T2, T3, GP_RAX, GP_RDX, A2_ZERO); \ \ vmovq xmm3, A0; \ vpinsrq xmm3, xmm3, A1, 1; \ vinserti32x4 zmm1, zmm1, xmm3, 2; \ \ vmovq xmm4, A2; \ vinserti32x4 zmm2, zmm2, xmm4, 2; \ \ /* ; Calculate R^3 */ \ POLY1305_MUL_REDUCE(A0, A1, A2, R0, R1, T0, T1, T2, T3, GP_RAX, GP_RDX, A2_NOT_ZERO); \ \ vmovq xmm3, A0; \ vpinsrq xmm3, xmm3, A1, 1; \ vinserti32x4 zmm1, zmm1, xmm3, 1; \ \ vmovq xmm4, A2; \ vinserti32x4 zmm2, zmm2, xmm4, 1; \ \ /* ; Calculate R^4 */ \ POLY1305_MUL_REDUCE(A0, A1, A2, R0, R1, T0, T1, T2, T3, GP_RAX, GP_RDX, A2_NOT_ZERO); \ \ vmovq xmm3, A0; \ vpinsrq xmm3, xmm3, A1, 1; \ vinserti32x4 zmm1, zmm1, xmm3, 0; \ \ vmovq xmm4, A2; \ vinserti32x4 zmm2, zmm2, xmm4, 0; \ \ /* ; Move 2 MSbits to top 24 bits, to be OR'ed later */ \ vpsllq zmm2, zmm2, 40; \ \ vpunpckhqdq zmm21, zmm1, zmm0; \ vpunpcklqdq zmm19, zmm1, zmm0; \ \ vpsrlq zmm20, zmm19, 44; \ vpsllq zmm4, zmm21, 20; \ vpternlogq zmm20, zmm4, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \ \ vpandq zmm19, zmm19, [.Lmask_44 ADD_RIP]; \ vpsrlq zmm21, zmm21, 24; \ \ /* ; zmm2 contains the 2 highest bits of the powers of R */ \ vporq zmm21, zmm21, zmm2; \ \ /* ; Broadcast 44-bit limbs of R^4 */ \ mov T0, A0; \ and T0, [.Lmask_44 ADD_RIP]; /* ;; First limb (R^4[43:0]) */ \ vpbroadcastq zmm22, T0; \ \ mov T0, A1; \ shrd A0, T0, 44; \ and A0, [.Lmask_44 ADD_RIP]; /* ;; Second limb (R^4[87:44]) */ \ vpbroadcastq zmm23, A0; \ \ shrd A1, A2, 24; \ and A1, [.Lmask_42 ADD_RIP]; /* ;; Third 
limb (R^4[129:88]) */ \ vpbroadcastq zmm24, A1; \ \ /* ; Generate 4*5*R^4 */ \ vpsllq zmm25, zmm23, 2; \ vpsllq zmm26, zmm24, 2; \ \ /* ; 5*R^4 */ \ vpaddq zmm25, zmm25, zmm23; \ vpaddq zmm26, zmm26, zmm24; \ \ /* ; 4*5*R^4 */ \ vpsllq zmm25, zmm25, 2; \ vpsllq zmm26, zmm26, 2; \ \ vpslldq zmm29, zmm19, 8; \ vpslldq zmm30, zmm20, 8; \ vpslldq zmm31, zmm21, 8; \ \ /* ; Calculate R^8-R^5 */ \ POLY1305_MUL_REDUCE_VEC(zmm19, zmm20, zmm21, \ zmm22, zmm23, zmm24, \ zmm25, zmm26, \ zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ zmm11); \ \ /* ; Interleave powers of R: R^8 R^4 R^7 R^3 R^6 R^2 R^5 R */ \ vporq zmm19, zmm19, zmm29; \ vporq zmm20, zmm20, zmm30; \ vporq zmm21, zmm21, zmm31; \ \ /* ; Broadcast R^8 */ \ vpbroadcastq zmm22, xmm19; \ vpbroadcastq zmm23, xmm20; \ vpbroadcastq zmm24, xmm21; \ \ /* ; Generate 4*5*R^8 */ \ vpsllq zmm25, zmm23, 2; \ vpsllq zmm26, zmm24, 2; \ \ /* ; 5*R^8 */ \ vpaddq zmm25, zmm25, zmm23; \ vpaddq zmm26, zmm26, zmm24; \ \ /* ; 4*5*R^8 */ \ vpsllq zmm25, zmm25, 2; \ vpsllq zmm26, zmm26, 2; \ \ cmp LEN, POLY1305_BLOCK_SIZE*32; \ jb .L_len_256_511; \ \ /* ; Store R^8-R for later use */ \ vmovdqa64 [rsp + STACK_r_save], zmm19; \ vmovdqa64 [rsp + STACK_r_save + 64], zmm20; \ vmovdqa64 [rsp + STACK_r_save + 64*2], zmm21; \ \ /* ; Calculate R^16-R^9 */ \ POLY1305_MUL_REDUCE_VEC(zmm19, zmm20, zmm21, \ zmm22, zmm23, zmm24, \ zmm25, zmm26, \ zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ zmm11); \ \ /* ; Store R^16-R^9 for later use */ \ vmovdqa64 [rsp + STACK_r_save + 64*3], zmm19; \ vmovdqa64 [rsp + STACK_r_save + 64*4], zmm20; \ vmovdqa64 [rsp + STACK_r_save + 64*5], zmm21; \ \ /* ; Broadcast R^16 */ \ vpbroadcastq zmm22, xmm19; \ vpbroadcastq zmm23, xmm20; \ vpbroadcastq zmm24, xmm21; \ \ /* ; Generate 4*5*R^16 */ \ vpsllq zmm25, zmm23, 2; \ vpsllq zmm26, zmm24, 2; \ \ /* ; 5*R^16 */ \ vpaddq zmm25, zmm25, zmm23; \ vpaddq zmm26, zmm26, zmm24; \ \ /* ; 4*5*R^16 */ \ vpsllq zmm25, zmm25, 2; \ vpsllq zmm26, zmm26, 2; \ \ mov T0, LEN; \ and T0, 0xffffffffffffff00; /* ; multiple of 256 bytes */ \ \ .L_poly1305_blocks_loop: \ cmp T0, POLY1305_BLOCK_SIZE*16; \ jbe .L_poly1305_blocks_loop_end; \ \ /* ; zmm13-zmm18 contain the 16 blocks of message plus the previous accumulator */ \ /* ; zmm22-24 contain the 5x44-bit limbs of the powers of R */ \ /* ; zmm25-26 contain the 5x44-bit limbs of the powers of R' (5*4*R) */ \ POLY1305_MSG_MUL_REDUCE_VEC16(zmm13, zmm14, zmm15, zmm16, zmm17, zmm18, \ zmm22, zmm23, zmm24, zmm25, zmm26, \ zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ zmm19, zmm20, zmm21, zmm27, zmm28, zmm29, \ zmm30, zmm31, zmm11, zmm0, zmm1, \ zmm2, zmm3, zmm4, zmm12, MSG, T0); \ \ jmp .L_poly1305_blocks_loop; \ \ .L_poly1305_blocks_loop_end: \ \ /* ;; Need to multiply by r^16, r^15, r^14... 
r */ \ \ /* ; First multiply by r^16-r^9 */ \ \ /* ; Read R^16-R^9 */ \ vmovdqa64 zmm19, [rsp + STACK_r_save + 64*3]; \ vmovdqa64 zmm20, [rsp + STACK_r_save + 64*4]; \ vmovdqa64 zmm21, [rsp + STACK_r_save + 64*5]; \ /* ; Read R^8-R */ \ vmovdqa64 zmm22, [rsp + STACK_r_save]; \ vmovdqa64 zmm23, [rsp + STACK_r_save + 64]; \ vmovdqa64 zmm24, [rsp + STACK_r_save + 64*2]; \ \ /* ; zmm27 to have bits 87-44 of all 9-16th powers of R' in 8 qwords */ \ /* ; zmm28 to have bits 129-88 of all 9-16th powers of R' in 8 qwords */ \ vpsllq zmm0, zmm20, 2; \ vpaddq zmm27, zmm20, zmm0; /* ; R1' (R1*5) */ \ vpsllq zmm1, zmm21, 2; \ vpaddq zmm28, zmm21, zmm1; /* ; R2' (R2*5) */ \ \ /* ; 4*5*R */ \ vpsllq zmm27, zmm27, 2; \ vpsllq zmm28, zmm28, 2; \ \ /* ; Then multiply by r^8-r */ \ \ /* ; zmm25 to have bits 87-44 of all 1-8th powers of R' in 8 qwords */ \ /* ; zmm26 to have bits 129-88 of all 1-8th powers of R' in 8 qwords */ \ vpsllq zmm2, zmm23, 2; \ vpaddq zmm25, zmm23, zmm2; /* ; R1' (R1*5) */ \ vpsllq zmm3, zmm24, 2; \ vpaddq zmm26, zmm24, zmm3; /* ; R2' (R2*5) */ \ \ /* ; 4*5*R */ \ vpsllq zmm25, zmm25, 2; \ vpsllq zmm26, zmm26, 2; \ \ POLY1305_MUL_REDUCE_VEC16(zmm13, zmm14, zmm15, zmm16, zmm17, zmm18, \ zmm19, zmm20, zmm21, zmm27, zmm28, \ zmm22, zmm23, zmm24, zmm25, zmm26, \ zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, \ zmm7, zmm8, zmm9, zmm10, zmm11, zmm12, zmm29); \ \ /* ;; Add all blocks (horizontally) */ \ vpaddq zmm13, zmm13, zmm16; \ vpaddq zmm14, zmm14, zmm17; \ vpaddq zmm15, zmm15, zmm18; \ \ vextracti64x4 ymm0, zmm13, 1; \ vextracti64x4 ymm1, zmm14, 1; \ vextracti64x4 ymm2, zmm15, 1; \ \ vpaddq ymm13, ymm13, ymm0; \ vpaddq ymm14, ymm14, ymm1; \ vpaddq ymm15, ymm15, ymm2; \ \ vextracti32x4 xmm10, ymm13, 1; \ vextracti32x4 xmm11, ymm14, 1; \ vextracti32x4 xmm12, ymm15, 1; \ \ vpaddq xmm13, xmm13, xmm10; \ vpaddq xmm14, xmm14, xmm11; \ vpaddq xmm15, xmm15, xmm12; \ \ vpsrldq xmm10, xmm13, 8; \ vpsrldq xmm11, xmm14, 8; \ vpsrldq xmm12, xmm15, 8; \ \ /* ; Finish folding and clear second qword */ \ mov T0, 0xfd; \ kmovq k1, T0; \ vpaddq xmm13{k1}{z}, xmm13, xmm10; \ vpaddq xmm14{k1}{z}, xmm14, xmm11; \ vpaddq xmm15{k1}{z}, xmm15, xmm12; \ \ add MSG, POLY1305_BLOCK_SIZE*16; \ \ and LEN, (POLY1305_BLOCK_SIZE*16 - 1); /* ; Get remaining lengths (LEN < 256 bytes) */ \ \ .L_less_than_256: \ \ cmp LEN, POLY1305_BLOCK_SIZE*8; \ jb .L_less_than_128; \ \ /* ; Read next 128 bytes */ \ /* ; Load first block of data (128 bytes) */ \ vmovdqu64 zmm0, [MSG]; \ vmovdqu64 zmm1, [MSG + 64]; \ \ /* ; Interleave the data to form 44-bit limbs */ \ /* ; */ \ /* ; zmm13 to have bits 0-43 of all 8 blocks in 8 qwords */ \ /* ; zmm14 to have bits 87-44 of all 8 blocks in 8 qwords */ \ /* ; zmm15 to have bits 127-88 of all 8 blocks in 8 qwords */ \ vpunpckhqdq zmm5, zmm0, zmm1; \ vpunpcklqdq zmm3, zmm0, zmm1; \ \ vpsrlq zmm4, zmm3, 44; \ vpsllq zmm8, zmm5, 20; \ vpternlogq zmm4, zmm8, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \ \ vpandq zmm3, zmm3, [.Lmask_44 ADD_RIP]; \ vpsrlq zmm5, zmm5, 24; \ \ /* ; Add 2^128 to all 8 final qwords of the message */ \ vporq zmm5, zmm5, [.Lhigh_bit ADD_RIP]; \ \ vpaddq zmm13, zmm13, zmm3; \ vpaddq zmm14, zmm14, zmm4; \ vpaddq zmm15, zmm15, zmm5; \ \ add MSG, POLY1305_BLOCK_SIZE*8; \ sub LEN, POLY1305_BLOCK_SIZE*8; \ \ POLY1305_MUL_REDUCE_VEC(zmm13, zmm14, zmm15, \ zmm22, zmm23, zmm24, \ zmm25, zmm26, \ zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ zmm11); \ \ /* ;; Add all blocks (horizontally) */ \ vextracti64x4 ymm0, zmm13, 1; \ vextracti64x4 ymm1, zmm14, 1; \ vextracti64x4 ymm2, zmm15, 
1; \ \ vpaddq ymm13, ymm13, ymm0; \ vpaddq ymm14, ymm14, ymm1; \ vpaddq ymm15, ymm15, ymm2; \ \ vextracti32x4 xmm10, ymm13, 1; \ vextracti32x4 xmm11, ymm14, 1; \ vextracti32x4 xmm12, ymm15, 1; \ \ vpaddq xmm13, xmm13, xmm10; \ vpaddq xmm14, xmm14, xmm11; \ vpaddq xmm15, xmm15, xmm12; \ \ vpsrldq xmm10, xmm13, 8; \ vpsrldq xmm11, xmm14, 8; \ vpsrldq xmm12, xmm15, 8; \ \ /* ; Finish folding and clear second qword */ \ mov T0, 0xfd; \ kmovq k1, T0; \ vpaddq xmm13{k1}{z}, xmm13, xmm10; \ vpaddq xmm14{k1}{z}, xmm14, xmm11; \ vpaddq xmm15{k1}{z}, xmm15, xmm12; \ \ .L_less_than_128: \ cmp LEN, 32; /* ; If remaining bytes is <= 32, perform last blocks in scalar */ \ jbe .L_simd_to_gp; \ \ mov T0, LEN; \ and T0, 0x3f; \ lea T1, [.Lbyte64_len_to_mask_table ADD_RIP]; \ mov T1, [T1 + 8*T0]; \ \ /* ; Load default byte masks */ \ mov T2, 0xffffffffffffffff; \ xor T3, T3; \ \ cmp LEN, 64; \ cmovb T2, T1; /* ; Load mask for first 64 bytes */ \ cmovg T3, T1; /* ; Load mask for second 64 bytes */ \ \ kmovq k1, T2; \ kmovq k2, T3; \ vmovdqu8 zmm0{k1}{z}, [MSG]; \ vmovdqu8 zmm1{k2}{z}, [MSG + 64]; \ \ /* ; Pad last block message, if partial */ \ mov T0, LEN; \ and T0, 0x70; /* ; Multiple of 16 bytes */ \ /* ; Load last block of data (up to 112 bytes) */ \ shr T0, 3; /* ; Get number of full qwords */ \ \ /* ; Interleave the data to form 44-bit limbs */ \ /* ; */ \ /* ; zmm13 to have bits 0-43 of all 8 blocks in 8 qwords */ \ /* ; zmm14 to have bits 87-44 of all 8 blocks in 8 qwords */ \ /* ; zmm15 to have bits 127-88 of all 8 blocks in 8 qwords */ \ vpunpckhqdq zmm4, zmm0, zmm1; \ vpunpcklqdq zmm2, zmm0, zmm1; \ \ vpsrlq zmm3, zmm2, 44; \ vpsllq zmm28, zmm4, 20; \ vpternlogq zmm3, zmm28, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \ \ vpandq zmm2, zmm2, [.Lmask_44 ADD_RIP]; \ vpsrlq zmm4, zmm4, 24; \ \ lea T1, [.Lqword_high_bit_mask ADD_RIP]; \ kmovb k1, [T1 + T0]; \ /* ; Add 2^128 to final qwords of the message (all full blocks and partial block, */ \ /* ; if "pad_to_16" is selected) */ \ vporq zmm4{k1}, zmm4, [.Lhigh_bit ADD_RIP]; \ \ vpaddq zmm13, zmm13, zmm2; \ vpaddq zmm14, zmm14, zmm3; \ vpaddq zmm15, zmm15, zmm4; \ \ mov T0, LEN; \ add T0, 15; \ shr T0, 4; /* ; Get number of 16-byte blocks (including partial blocks) */ \ xor LEN, LEN; /* ; All length will be consumed */ \ \ /* ; No need to shuffle data blocks (data is in the right order) */ \ cmp T0, 8; \ je .L_end_shuffle; \ \ cmp T0, 4; \ je .L_shuffle_blocks_4; \ jb .L_shuffle_blocks_3; \ \ /* ; Number of 16-byte blocks > 4 */ \ cmp T0, 6; \ je .L_shuffle_blocks_6; \ ja .L_shuffle_blocks_7; \ jmp .L_shuffle_blocks_5; \ \ .L_shuffle_blocks_3: \ SHUFFLE_DATA_BLOCKS_3(zmm13, zmm14, zmm15, T1); \ jmp .L_end_shuffle; \ .L_shuffle_blocks_4: \ SHUFFLE_DATA_BLOCKS_4(zmm13, zmm14, zmm15, T1); \ jmp .L_end_shuffle; \ .L_shuffle_blocks_5: \ SHUFFLE_DATA_BLOCKS_5(zmm13, zmm14, zmm15, T1); \ jmp .L_end_shuffle; \ .L_shuffle_blocks_6: \ SHUFFLE_DATA_BLOCKS_6(zmm13, zmm14, zmm15, T1); \ jmp .L_end_shuffle; \ .L_shuffle_blocks_7: \ SHUFFLE_DATA_BLOCKS_7(zmm13, zmm14, zmm15, T1); \ \ .L_end_shuffle: \ \ /* ; zmm13-zmm15 contain the 8 blocks of message plus the previous accumulator */ \ /* ; zmm22-24 contain the 3x44-bit limbs of the powers of R */ \ /* ; zmm25-26 contain the 3x44-bit limbs of the powers of R' (5*4*R) */ \ POLY1305_MUL_REDUCE_VEC(zmm13, zmm14, zmm15, \ zmm22, zmm23, zmm24, \ zmm25, zmm26, \ zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ zmm11); \ \ /* ;; Add all blocks (horizontally) */ \ vextracti64x4 ymm0, zmm13, 1; \ vextracti64x4 ymm1, zmm14, 1; \ 
vextracti64x4 ymm2, zmm15, 1; \ \ vpaddq ymm13, ymm13, ymm0; \ vpaddq ymm14, ymm14, ymm1; \ vpaddq ymm15, ymm15, ymm2; \ \ vextracti32x4 xmm10, ymm13, 1; \ vextracti32x4 xmm11, ymm14, 1; \ vextracti32x4 xmm12, ymm15, 1; \ \ vpaddq xmm13, xmm13, xmm10; \ vpaddq xmm14, xmm14, xmm11; \ vpaddq xmm15, xmm15, xmm12; \ \ vpsrldq xmm10, xmm13, 8; \ vpsrldq xmm11, xmm14, 8; \ vpsrldq xmm12, xmm15, 8; \ \ vpaddq xmm13, xmm13, xmm10; \ vpaddq xmm14, xmm14, xmm11; \ vpaddq xmm15, xmm15, xmm12; \ \ .L_simd_to_gp: \ /* ; Carry propagation */ \ vpsrlq xmm0, xmm13, 44; \ vpandq xmm13, xmm13, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \ vpaddq xmm14, xmm14, xmm0; \ vpsrlq xmm0, xmm14, 44; \ vpandq xmm14, xmm14, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \ vpaddq xmm15, xmm15, xmm0; \ vpsrlq xmm0, xmm15, 42; \ vpandq xmm15, xmm15, [.Lmask_42 ADD_RIP]; /* ; Clear top 22 bits */ \ vpsllq xmm1, xmm0, 2; \ vpaddq xmm0, xmm0, xmm1; \ vpaddq xmm13, xmm13, xmm0; \ \ /* ; Put together A */ \ vmovq A0, xmm13; \ \ vmovq T0, xmm14; \ mov T1, T0; \ shl T1, 44; \ or A0, T1; \ \ shr T0, 20; \ vmovq A2, xmm15; \ mov A1, A2; \ shl A1, 24; \ or A1, T0; \ shr A2, 40; \ \ /* ; Clear powers of R */ \ vpxorq zmm0, zmm0, zmm0; \ vmovdqa64 [rsp + STACK_r_save], zmm0; \ vmovdqa64 [rsp + STACK_r_save + 64], zmm0; \ vmovdqa64 [rsp + STACK_r_save + 64*2], zmm0; \ vmovdqa64 [rsp + STACK_r_save + 64*3], zmm0; \ vmovdqa64 [rsp + STACK_r_save + 64*4], zmm0; \ vmovdqa64 [rsp + STACK_r_save + 64*5], zmm0; \ \ vzeroall; \ clear_zmm(xmm16); clear_zmm(xmm20); clear_zmm(xmm24); clear_zmm(xmm28); \ clear_zmm(xmm17); clear_zmm(xmm21); clear_zmm(xmm25); clear_zmm(xmm29); \ clear_zmm(xmm18); clear_zmm(xmm22); clear_zmm(xmm26); clear_zmm(xmm30); \ clear_zmm(xmm19); clear_zmm(xmm23); clear_zmm(xmm27); clear_zmm(xmm31); \ \ .L_final_loop: \ cmp LEN, POLY1305_BLOCK_SIZE; \ jb .L_poly1305_blocks_exit; \ \ /* ;; A += MSG[i] */ \ add A0, [MSG + 0]; \ adc A1, [MSG + 8]; \ adc A2, 1; /* ;; no padding bit */ \ \ mov T0, R1; \ shr T0, 2; \ add T0, R1; /* ;; T0 = R1 + (R1 >> 2) */ \ \ POLY1305_MUL_REDUCE(A0, A1, A2, R0, R1, \ T0, T1, T2, T3, GP_RAX, GP_RDX, A2_NOT_ZERO); \ \ add MSG, POLY1305_BLOCK_SIZE; \ sub LEN, POLY1305_BLOCK_SIZE; \ \ jmp .L_final_loop; \ \ .L_len_256_511: \ \ /* ; zmm13-zmm15 contain the 8 blocks of message plus the previous accumulator */ \ /* ; zmm22-24 contain the 3x44-bit limbs of the powers of R */ \ /* ; zmm25-26 contain the 3x44-bit limbs of the powers of R' (5*4*R) */ \ POLY1305_MUL_REDUCE_VEC(zmm13, zmm14, zmm15, \ zmm22, zmm23, zmm24, \ zmm25, zmm26, \ zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ zmm11); \ \ /* ; Then multiply by r^8-r */ \ \ /* ; zmm19-zmm21 contains R^8-R, need to move it to zmm22-24, */ \ /* ; as it might be used in other part of the code */ \ vmovdqa64 zmm22, zmm19; \ vmovdqa64 zmm23, zmm20; \ vmovdqa64 zmm24, zmm21; \ \ /* ; zmm25 to have bits 87-44 of all 8 powers of R' in 8 qwords */ \ /* ; zmm26 to have bits 129-88 of all 8 powers of R' in 8 qwords */ \ vpsllq zmm0, zmm23, 2; \ vpaddq zmm25, zmm23, zmm0; /* ; R1' (R1*5) */ \ vpsllq zmm1, zmm24, 2; \ vpaddq zmm26, zmm24, zmm1; /* ; R2' (R2*5) */ \ \ /* ; 4*5*R^8 */ \ vpsllq zmm25, zmm25, 2; \ vpsllq zmm26, zmm26, 2; \ \ vpaddq zmm13, zmm13, zmm16; \ vpaddq zmm14, zmm14, zmm17; \ vpaddq zmm15, zmm15, zmm18; \ \ /* ; zmm13-zmm15 contain the 8 blocks of message plus the previous accumulator */ \ /* ; zmm22-24 contain the 3x44-bit limbs of the powers of R */ \ /* ; zmm25-26 contain the 3x44-bit limbs of the powers of R' (5*4*R) */ \ 
POLY1305_MUL_REDUCE_VEC(zmm13, zmm14, zmm15, \ zmm22, zmm23, zmm24, \ zmm25, zmm26, \ zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ zmm11); \ \ /* ;; Add all blocks (horizontally) */ \ vextracti64x4 ymm0, zmm13, 1; \ vextracti64x4 ymm1, zmm14, 1; \ vextracti64x4 ymm2, zmm15, 1; \ \ vpaddq ymm13, ymm13, ymm0; \ vpaddq ymm14, ymm14, ymm1; \ vpaddq ymm15, ymm15, ymm2; \ \ vextracti32x4 xmm10, ymm13, 1; \ vextracti32x4 xmm11, ymm14, 1; \ vextracti32x4 xmm12, ymm15, 1; \ \ vpaddq xmm13, xmm13, xmm10; \ vpaddq xmm14, xmm14, xmm11; \ vpaddq xmm15, xmm15, xmm12; \ \ vpsrldq xmm10, xmm13, 8; \ vpsrldq xmm11, xmm14, 8; \ vpsrldq xmm12, xmm15, 8; \ \ /* ; Finish folding and clear second qword */ \ mov T0, 0xfd; \ kmovq k1, T0; \ vpaddq xmm13{k1}{z}, xmm13, xmm10; \ vpaddq xmm14{k1}{z}, xmm14, xmm11; \ vpaddq xmm15{k1}{z}, xmm15, xmm12; \ \ add MSG, POLY1305_BLOCK_SIZE*16; \ sub LEN, POLY1305_BLOCK_SIZE*16; \ \ jmp .L_less_than_256; \ .L_poly1305_blocks_exit: \ /* ;; ============================================================================= ;; ============================================================================= ;; Creates stack frame and saves registers ;; ============================================================================= */ #define FUNC_ENTRY() \ mov rax, rsp; \ CFI_DEF_CFA_REGISTER(rax); \ sub rsp, STACK_SIZE; \ and rsp, -64; \ \ mov [rsp + STACK_gpr_save + 8*0], rbx; \ mov [rsp + STACK_gpr_save + 8*1], rbp; \ mov [rsp + STACK_gpr_save + 8*2], r12; \ mov [rsp + STACK_gpr_save + 8*3], r13; \ mov [rsp + STACK_gpr_save + 8*4], r14; \ mov [rsp + STACK_gpr_save + 8*5], r15; \ mov [rsp + STACK_rsp_save], rax; \ CFI_CFA_ON_STACK(STACK_rsp_save, 0) /* ;; ============================================================================= ;; ============================================================================= ;; Restores registers and removes the stack frame ;; ============================================================================= */ #define FUNC_EXIT() \ mov rbx, [rsp + STACK_gpr_save + 8*0]; \ mov rbp, [rsp + STACK_gpr_save + 8*1]; \ mov r12, [rsp + STACK_gpr_save + 8*2]; \ mov r13, [rsp + STACK_gpr_save + 8*3]; \ mov r14, [rsp + STACK_gpr_save + 8*4]; \ mov r15, [rsp + STACK_gpr_save + 8*5]; \ mov rsp, [rsp + STACK_rsp_save]; \ CFI_DEF_CFA_REGISTER(rsp) /* ;; ============================================================================= ;; ============================================================================= ;; void poly1305_aead_update_fma_avx512(const void *msg, const uint64_t msg_len, ;; void *hash, const void *key) ;; arg1 - Input message ;; arg2 - Message length ;; arg3 - Input/output hash ;; arg4 - Poly1305 key */ .align 32 .globl _gcry_poly1305_amd64_avx512_blocks ELF(.type _gcry_poly1305_amd64_avx512_blocks,@function;) _gcry_poly1305_amd64_avx512_blocks: CFI_STARTPROC() vpxord xmm16, xmm16, xmm16; vpopcntb zmm16, zmm16; /* spec stop for old AVX512 CPUs */ FUNC_ENTRY() #define _a0 gp3 #define _a0 gp3 #define _a1 gp4 #define _a2 gp5 #define _r0 gp6 #define _r1 gp7 #define _len arg2 #define _arg3 arg4 /* ; use rcx, arg3 = rdx */ /* ;; load R */ mov _r0, [arg4 + 0 * 8] mov _r1, [arg4 + 1 * 8] /* ;; load accumulator / current hash value */ /* ;; note: arg4 can't be used beyond this point */ mov _arg3, arg3 /* ; note: _arg3 = arg4 (linux) */ mov _a0, [_arg3 + 0 * 8] mov _a1, [_arg3 + 1 * 8] mov DWORD(_a2), [_arg3 + 2 * 8] /* ; note: _a2 = arg4 (win) */ POLY1305_BLOCKS(arg1, _len, _a0, _a1, _a2, _r0, _r1, gp10, gp11, gp8, gp9, rax, rdx) /* ;; save accumulator back */ mov 
[_arg3 + 0 * 8], _a0 mov [_arg3 + 1 * 8], _a1 mov [_arg3 + 2 * 8], DWORD(_a2) FUNC_EXIT() xor eax, eax - kmovw k1, eax - kmovw k2, eax + kxorw k1, k1, k1 + kxorw k2, k2, k2 ret_spec_stop CFI_ENDPROC() ELF(.size _gcry_poly1305_amd64_avx512_blocks, .-_gcry_poly1305_amd64_avx512_blocks;) #endif #endif diff --git a/cipher/sha512-avx512-amd64.S b/cipher/sha512-avx512-amd64.S index c0fdbc33..0e3f44ab 100644 --- a/cipher/sha512-avx512-amd64.S +++ b/cipher/sha512-avx512-amd64.S @@ -1,461 +1,461 @@ /* sha512-avx512-amd64.c - amd64/AVX512 implementation of SHA-512 transform * Copyright (C) 2022 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ /* * Based on implementation from file "sha512-avx2-bmi2-amd64.S": ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright (c) 2012, Intel Corporation ; ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are ; met: ; ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in the ; documentation and/or other materials provided with the ; distribution. ; ; * Neither the name of the Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived from ; this software without specific prior written permission. ; ; ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR ; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; This code schedules 1 blocks at a time, with 4 lanes per block ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AVX512) && \ defined(USE_SHA512) #include "asm-common-amd64.h" .intel_syntax noprefix .text /* Virtual Registers */ #define Y_0 ymm0 #define Y_1 ymm1 #define Y_2 ymm2 #define Y_3 ymm3 #define YTMP0 ymm4 #define YTMP1 ymm5 #define YTMP2 ymm6 #define YTMP3 ymm7 #define YTMP4 ymm8 #define XFER YTMP0 #define BYTE_FLIP_MASK ymm9 #define PERM_VPALIGNR_8 ymm10 #define MASK_DC_00 k1 #define INP rdi /* 1st arg */ #define CTX rsi /* 2nd arg */ #define NUM_BLKS rdx /* 3rd arg */ #define SRND r8d #define RSP_SAVE r9 #define TBL rcx #define a xmm11 #define b xmm12 #define c xmm13 #define d xmm14 #define e xmm15 #define f xmm16 #define g xmm17 #define h xmm18 #define y0 xmm19 #define y1 xmm20 #define y2 xmm21 #define y3 xmm22 /* Local variables (stack frame) */ #define frame_XFER 0 #define frame_XFER_size (4*4*8) #define frame_size (frame_XFER + frame_XFER_size) #define clear_reg(x) vpxorq x,x,x /* addm [mem], reg */ /* Add reg to mem using reg-mem add and store */ #define addm(p1, p2) \ vmovq y0, p1; \ vpaddq p2, p2, y0; \ vmovq p1, p2; /* COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask */ /* Load ymm with mem and byte swap each dword */ #define COPY_YMM_AND_BSWAP(p1, p2, p3) \ vmovdqu p1, p2; \ vpshufb p1, p1, p3 /* %macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL */ /* YDST = {YSRC1, YSRC2} >> RVAL*8 */ #define MY_VPALIGNR(YDST_SRC1, YSRC2, RVAL) \ vpermt2q YDST_SRC1, PERM_VPALIGNR_##RVAL, YSRC2; #define ONE_ROUND_PART1(XFERIN, a, b, c, d, e, f, g, h) \ /* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]); \ * d += h; \ * h += Sum0 (a) + Maj (a, b, c); \ * \ * Ch(x, y, z) => ((x & y) + (~x & z)) \ * Maj(x, y, z) => ((x & y) + (z & (x ^ y))) \ */ \ \ vmovq y3, [XFERIN]; \ vmovdqa64 y2, e; \ vpaddq h, h, y3; \ vprorq y0, e, 41; \ vpternlogq y2, f, g, 0xca; /* Ch (e, f, g) */ \ vprorq y1, e, 18; \ vprorq y3, e, 14; \ vpaddq h, h, y2; \ vpternlogq y0, y1, y3, 0x96; /* Sum1 (e) */ \ vpaddq h, h, y0; /* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]) */ \ vpaddq d, d, h; /* d += h */ #define ONE_ROUND_PART2(a, b, c, d, e, f, g, h) \ vmovdqa64 y1, a; \ vprorq y0, a, 39; \ vpternlogq y1, b, c, 0xe8; /* Maj (a, b, c) */ \ vprorq y2, a, 34; \ vprorq y3, a, 28; \ vpternlogq y0, y2, y3, 0x96; /* Sum0 (a) */ \ vpaddq h, h, y1; \ vpaddq h, h, y0; /* h += Sum0 (a) + Maj (a, b, c) */ #define FOUR_ROUNDS_AND_SCHED(X, Y_0, Y_1, Y_2, Y_3, a, b, c, d, e, f, g, h) \ /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \ vmovdqa YTMP0, Y_3; \ vmovdqa YTMP1, Y_1; \ /* Extract w[t-7] */; \ vpermt2q YTMP0, PERM_VPALIGNR_8, Y_2 /* YTMP0 = W[-7] */; \ /* Calculate w[t-16] + w[t-7] */; \ vpaddq YTMP0, YTMP0, Y_0 /* YTMP0 = W[-7] + W[-16] */; \ /* Extract w[t-15] */; \ vpermt2q YTMP1, PERM_VPALIGNR_8, Y_0 /* YTMP1 = W[-15] */; \ ONE_ROUND_PART1(rsp+frame_XFER+0*8+X*32, a, b, c, d, e, f, g, h); \ \ /* Calculate sigma0 */; \ \ /* Calculate w[t-15] ror 1 */; \ vprorq YTMP3, YTMP1, 1; /* YTMP3 = W[-15] ror 1 */; \ /* Calculate w[t-15] shr 7 */; \ vpsrlq YTMP4, YTMP1, 7 /* YTMP4 = W[-15] >> 7 */; \ \ ONE_ROUND_PART2(a, b, c, d, e, f, g, h); \ \ /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \ /* Calculate w[t-15] ror 8 */; \ vprorq YTMP1, YTMP1, 8 /* YTMP1 = W[-15] ror 8 */; \ /* XOR the three components */; \ vpternlogq YTMP1, YTMP3, YTMP4, 0x96 /* YTMP1 = s0 = W[-15] ror 1 ^ W[-15] >> 7 ^ W[-15] ror 8 */; \ \ /* Add three components, w[t-16], w[t-7] and sigma0 */; \ vpaddq YTMP0, YTMP0, YTMP1 /* YTMP0 = W[-16] + W[-7] + s0 */; \ ONE_ROUND_PART1(rsp+frame_XFER+1*8+X*32, h, a, b, c, d, e, f, g); \ /* Move to appropriate lanes for calculating w[16] and w[17] */; \ vshufi64x2 Y_0, YTMP0, YTMP0, 0x0 /* Y_0 = W[-16] + W[-7] + s0 {BABA} */; \ \ /* Calculate w[16] and w[17] in both 128 bit lanes */; \ \ /* Calculate sigma1 for w[16] and w[17] on both 128 bit lanes */; \ vshufi64x2 YTMP2, Y_3, Y_3, 0b11 /* YTMP2 = W[-2] {BABA} */; \ vpsrlq YTMP4, YTMP2, 6 /* YTMP4 = W[-2] >> 6 {BABA} */; \ \ ONE_ROUND_PART2(h, a, b, c, d, e, f, g); \ \ /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \ vprorq YTMP3, YTMP2, 19 /* YTMP3 = W[-2] ror 19 {BABA} */; \ vprorq YTMP1, YTMP2, 61 /* YTMP3 = W[-2] ror 61 {BABA} */; \ vpternlogq YTMP4, YTMP3, YTMP1, 0x96 /* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} */; \ \ ONE_ROUND_PART1(rsp+frame_XFER+2*8+X*32, g, h, a, b, c, d, e, f); \ /* Add sigma1 to the other compunents to get w[16] and w[17] */; \ vpaddq Y_0, Y_0, YTMP4 /* Y_0 = {W[1], W[0], W[1], W[0]} */; \ \ /* Calculate sigma1 for w[18] and w[19] for upper 128 bit lane */; \ vpsrlq YTMP4, Y_0, 6 /* YTMP4 = W[-2] >> 6 {DC--} */; \ \ ONE_ROUND_PART2(g, h, a, b, c, d, e, f); \ \ /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \ vprorq YTMP3, Y_0, 19 /* YTMP3 = W[-2] ror 19 {DC--} */; \ vprorq YTMP1, Y_0, 61 /* YTMP1 = W[-2] ror 61 {DC--} */; \ vpternlogq YTMP4, YTMP3, YTMP1, 0x96 /* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {DC--} */; \ \ ONE_ROUND_PART1(rsp+frame_XFER+3*8+X*32, f, g, h, a, b, c, d, e); \ /* Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] to newly calculated sigma1 to get w[18] and w[19] */; \ /* Form w[19, w[18], w17], w[16] */; \ vpaddq Y_0{MASK_DC_00}, YTMP0, YTMP4 /* YTMP2 = {W[3], W[2], W[1], W[0]} */; \ \ vpaddq XFER, Y_0, [TBL + (4+X)*32]; \ vmovdqa [rsp + frame_XFER + X*32], XFER; \ ONE_ROUND_PART2(f, g, h, a, b, c, d, e) #define ONE_ROUND(XFERIN, a, b, c, d, e, f, g, h) \ ONE_ROUND_PART1(XFERIN, a, b, c, d, e, f, g, h); \ ONE_ROUND_PART2(a, b, c, d, e, f, g, h) #define DO_4ROUNDS(X, a, b, c, d, e, f, g, h) \ ONE_ROUND(rsp+frame_XFER+0*8+X*32, a, b, c, d, e, f, g, h); \ ONE_ROUND(rsp+frame_XFER+1*8+X*32, h, a, b, c, d, e, f, g); \ ONE_ROUND(rsp+frame_XFER+2*8+X*32, g, h, a, b, c, d, e, f); \ ONE_ROUND(rsp+frame_XFER+3*8+X*32, f, g, h, a, b, c, d, e) /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; void sha512_avx512(const void* M, void* D, uint64_t L); ; Purpose: Updates the SHA512 digest stored at D with the message stored in M. ; The size of the message pointed to by M must be an integer multiple of SHA512 ; message blocks. ; L is the message length in SHA512 blocks */ .globl _gcry_sha512_transform_amd64_avx512 ELF(.type _gcry_sha512_transform_amd64_avx512,@function;) .align 16 _gcry_sha512_transform_amd64_avx512: CFI_STARTPROC() xor eax, eax cmp rdx, 0 je .Lnowork /* Setup mask register for DC:BA merging. 
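   MASK_DC_00 (k1) is loaded with 0b1100, enabling only the two upper qword
   lanes of a ymm register; the masked vpaddq in FOUR_ROUNDS_AND_SCHED uses it
   to write the freshly scheduled w[18],w[19] into the DC half while the BA
   half keeps w[16],w[17].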
*/ mov eax, 0b1100 kmovd MASK_DC_00, eax /* Allocate Stack Space */ mov RSP_SAVE, rsp CFI_DEF_CFA_REGISTER(RSP_SAVE); sub rsp, frame_size and rsp, ~(0x40 - 1) /*; load initial digest */ vmovq a,[8*0 + CTX] vmovq b,[8*1 + CTX] vmovq c,[8*2 + CTX] vmovq d,[8*3 + CTX] vmovq e,[8*4 + CTX] vmovq f,[8*5 + CTX] vmovq g,[8*6 + CTX] vmovq h,[8*7 + CTX] vmovdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP] vpmovzxbq PERM_VPALIGNR_8, [.LPERM_VPALIGNR_8 ADD_RIP] lea TBL,[.LK512 ADD_RIP] /*; byte swap first 16 dwords */ COPY_YMM_AND_BSWAP(Y_0, [INP + 0*32], BYTE_FLIP_MASK) COPY_YMM_AND_BSWAP(Y_1, [INP + 1*32], BYTE_FLIP_MASK) COPY_YMM_AND_BSWAP(Y_2, [INP + 2*32], BYTE_FLIP_MASK) COPY_YMM_AND_BSWAP(Y_3, [INP + 3*32], BYTE_FLIP_MASK) lea INP, [INP + 128] vpaddq XFER, Y_0, [TBL + 0*32] vmovdqa [rsp + frame_XFER + 0*32], XFER vpaddq XFER, Y_1, [TBL + 1*32] vmovdqa [rsp + frame_XFER + 1*32], XFER vpaddq XFER, Y_2, [TBL + 2*32] vmovdqa [rsp + frame_XFER + 2*32], XFER vpaddq XFER, Y_3, [TBL + 3*32] vmovdqa [rsp + frame_XFER + 3*32], XFER /*; schedule 64 input dwords, by doing 12 rounds of 4 each */ mov SRND, 4 .align 16 .Loop0: FOUR_ROUNDS_AND_SCHED(0, Y_0, Y_1, Y_2, Y_3, a, b, c, d, e, f, g, h) FOUR_ROUNDS_AND_SCHED(1, Y_1, Y_2, Y_3, Y_0, e, f, g, h, a, b, c, d) FOUR_ROUNDS_AND_SCHED(2, Y_2, Y_3, Y_0, Y_1, a, b, c, d, e, f, g, h) FOUR_ROUNDS_AND_SCHED(3, Y_3, Y_0, Y_1, Y_2, e, f, g, h, a, b, c, d) lea TBL, [TBL + 4*32] sub SRND, 1 jne .Loop0 sub NUM_BLKS, 1 je .Ldone_hash lea TBL, [.LK512 ADD_RIP] /* load next block and byte swap */ COPY_YMM_AND_BSWAP(Y_0, [INP + 0*32], BYTE_FLIP_MASK) COPY_YMM_AND_BSWAP(Y_1, [INP + 1*32], BYTE_FLIP_MASK) COPY_YMM_AND_BSWAP(Y_2, [INP + 2*32], BYTE_FLIP_MASK) COPY_YMM_AND_BSWAP(Y_3, [INP + 3*32], BYTE_FLIP_MASK) lea INP, [INP + 128] DO_4ROUNDS(0, a, b, c, d, e, f, g, h) vpaddq XFER, Y_0, [TBL + 0*32] vmovdqa [rsp + frame_XFER + 0*32], XFER DO_4ROUNDS(1, e, f, g, h, a, b, c, d) vpaddq XFER, Y_1, [TBL + 1*32] vmovdqa [rsp + frame_XFER + 1*32], XFER DO_4ROUNDS(2, a, b, c, d, e, f, g, h) vpaddq XFER, Y_2, [TBL + 2*32] vmovdqa [rsp + frame_XFER + 2*32], XFER DO_4ROUNDS(3, e, f, g, h, a, b, c, d) vpaddq XFER, Y_3, [TBL + 3*32] vmovdqa [rsp + frame_XFER + 3*32], XFER addm([8*0 + CTX],a) addm([8*1 + CTX],b) addm([8*2 + CTX],c) addm([8*3 + CTX],d) addm([8*4 + CTX],e) addm([8*5 + CTX],f) addm([8*6 + CTX],g) addm([8*7 + CTX],h) /*; schedule 64 input dwords, by doing 12 rounds of 4 each */ mov SRND, 4 jmp .Loop0 .Ldone_hash: DO_4ROUNDS(0, a, b, c, d, e, f, g, h) DO_4ROUNDS(1, e, f, g, h, a, b, c, d) DO_4ROUNDS(2, a, b, c, d, e, f, g, h) DO_4ROUNDS(3, e, f, g, h, a, b, c, d) addm([8*0 + CTX],a) xor eax, eax /* burn stack */ addm([8*1 + CTX],b) addm([8*2 + CTX],c) addm([8*3 + CTX],d) addm([8*4 + CTX],e) addm([8*5 + CTX],f) addm([8*6 + CTX],g) addm([8*7 + CTX],h) - kmovd MASK_DC_00, eax + kxord MASK_DC_00, MASK_DC_00, MASK_DC_00 vzeroall vmovdqa [rsp + frame_XFER + 0*32], ymm0 /* burn stack */ vmovdqa [rsp + frame_XFER + 1*32], ymm0 /* burn stack */ vmovdqa [rsp + frame_XFER + 2*32], ymm0 /* burn stack */ vmovdqa [rsp + frame_XFER + 3*32], ymm0 /* burn stack */ clear_reg(xmm16); clear_reg(xmm17); clear_reg(xmm18); clear_reg(xmm19); clear_reg(xmm20); clear_reg(xmm21); clear_reg(xmm22); /* Restore Stack Pointer */ mov rsp, RSP_SAVE CFI_DEF_CFA_REGISTER(rsp) .Lnowork: ret_spec_stop CFI_ENDPROC() ELF(.size _gcry_sha512_transform_amd64_avx512,.-_gcry_sha512_transform_amd64_avx512) /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /*;; Binary Data */ ELF(.type 
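Two details of the transform body above, shown as scalar C for reference (names are illustrative): the vpshufb against .LPSHUFFLE_BYTE_FLIP_MASK converts each loaded message qword from little-endian byte order to the big-endian words SHA-512 operates on, and the eight addm() calls per block add the working variables back into the digest kept at CTX.

#include <stdint.h>
#include <stddef.h>

/* Big-endian load of one 64-bit message word: the scalar counterpart of the
   byte-flip shuffle, which swaps the bytes of four qwords at once. */
static uint64_t
load_be64 (const unsigned char *p)
{
  return ((uint64_t)p[0] << 56) | ((uint64_t)p[1] << 48)
       | ((uint64_t)p[2] << 40) | ((uint64_t)p[3] << 32)
       | ((uint64_t)p[4] << 24) | ((uint64_t)p[5] << 16)
       | ((uint64_t)p[6] <<  8) |  (uint64_t)p[7];
}

/* Per-block feed-forward: the scalar equivalent of the eight addm() calls,
   digest[i] += working_variable[i] after the 80 rounds of a 128-byte block. */
static void
sha512_feed_forward (uint64_t digest[8], const uint64_t s[8])
{
  size_t i;
  for (i = 0; i < 8; i++)
    digest[i] += s[i];
}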
_gcry_sha512_avx512_consts,@object) _gcry_sha512_avx512_consts: .align 64 /* K[t] used in SHA512 hashing */ .LK512: .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 /* Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. */ .align 32 .LPSHUFFLE_BYTE_FLIP_MASK: .octa 0x08090a0b0c0d0e0f0001020304050607 .octa 0x18191a1b1c1d1e1f1011121314151617 .align 4 .LPERM_VPALIGNR_8: .byte 5, 6, 7, 0 ELF(.size _gcry_sha512_avx512_consts,.-_gcry_sha512_avx512_consts) #endif #endif diff --git a/configure.ac b/configure.ac index e63a7d6d..a7482cf3 100644 --- a/configure.ac +++ b/configure.ac @@ -1,3397 +1,3400 @@ # Configure.ac script for Libgcrypt # Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2006, # 2007, 2008, 2009, 2011 Free Software Foundation, Inc. # Copyright (C) 2012-2021 g10 Code GmbH # # This file is part of Libgcrypt. # # Libgcrypt is free software; you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation; either version 2.1 of # the License, or (at your option) any later version. # # Libgcrypt is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this program; if not, see . # (Process this file with autoconf to produce a configure script.) AC_REVISION($Revision$) AC_PREREQ([2.69]) min_automake_version="1.14" # To build a release you need to create a tag with the version number # (git tag -s libgcrypt-n.m.k) and run "./autogen.sh --force". 
Please # bump the version number immediately after the release and do another # commit and push so that the git magic is able to work. See below # for the LT versions. m4_define([mym4_package],[libgcrypt]) m4_define([mym4_major], [1]) m4_define([mym4_minor], [11]) m4_define([mym4_micro], [0]) # Below is m4 magic to extract and compute the git revision number, # the decimalized short revision number, a beta version string and a # flag indicating a development version (mym4_isbeta). Note that the # m4 processing is done by autoconf and not during the configure run. m4_define([mym4_verslist], m4_split(m4_esyscmd([./autogen.sh --find-version] \ mym4_package mym4_major mym4_minor mym4_micro),[:])) m4_define([mym4_isbeta], m4_argn(2, mym4_verslist)) m4_define([mym4_version], m4_argn(4, mym4_verslist)) m4_define([mym4_revision], m4_argn(7, mym4_verslist)) m4_define([mym4_revision_dec], m4_argn(8, mym4_verslist)) m4_esyscmd([echo ]mym4_version[>VERSION]) AC_INIT([mym4_package],[mym4_version],[https://bugs.gnupg.org]) # LT Version numbers, remember to change them just *before* a release. # NOET NOTE - Already updated for a 1.11 series - NOTE NOTE # (Code changed: REVISION++) # (Interfaces added/removed/changed: CURRENT++, REVISION=0) # (Interfaces added: AGE++) # (Interfaces removed: AGE=0) # # (Interfaces removed: CURRENT++, AGE=0, REVISION=0) # (Interfaces added: CURRENT++, AGE++, REVISION=0) # (No interfaces changed: REVISION++) LIBGCRYPT_LT_CURRENT=25 LIBGCRYPT_LT_AGE=5 LIBGCRYPT_LT_REVISION=0 ################################################ AC_SUBST(LIBGCRYPT_LT_CURRENT) AC_SUBST(LIBGCRYPT_LT_AGE) AC_SUBST(LIBGCRYPT_LT_REVISION) # If the API is changed in an incompatible way: increment the next counter. # # 1.6: ABI and API change but the change is to most users irrelevant # and thus the API version number has not been incremented. LIBGCRYPT_CONFIG_API_VERSION=1 # If you change the required gpg-error version, please remove # unnecessary error code defines in src/gcrypt-int.h. NEED_GPG_ERROR_VERSION=1.27 AC_CONFIG_AUX_DIR([build-aux]) AC_CONFIG_SRCDIR([src/libgcrypt.vers]) AM_INIT_AUTOMAKE([serial-tests dist-bzip2]) AC_CONFIG_HEADERS([config.h]) AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_LIBOBJ_DIR([compat]) AC_CANONICAL_HOST AM_MAINTAINER_MODE AM_SILENT_RULES AC_ARG_VAR(SYSROOT,[locate config scripts also below that directory]) AH_TOP([ #ifndef _GCRYPT_CONFIG_H_INCLUDED #define _GCRYPT_CONFIG_H_INCLUDED /* Enable gpg-error's strerror macro for W32CE. */ #define GPG_ERR_ENABLE_ERRNO_MACROS 1 ]) AH_BOTTOM([ #define _GCRYPT_IN_LIBGCRYPT 1 /* Add .note.gnu.property section for Intel CET in assembler sources when CET is enabled. */ #if defined(__ASSEMBLER__) && defined(__CET__) # include #endif /* If the configure check for endianness has been disabled, get it from OS macros. This is intended for making fat binary builds on OS X. */ #ifdef DISABLED_ENDIAN_CHECK # if defined(__BIG_ENDIAN__) # define WORDS_BIGENDIAN 1 # elif defined(__LITTLE_ENDIAN__) # undef WORDS_BIGENDIAN # else # error "No endianness found" # endif #endif /*DISABLED_ENDIAN_CHECK*/ /* We basically use the original Camellia source. Make sure the symbols properly prefixed. */ #define CAMELLIA_EXT_SYM_PREFIX _gcry_ #endif /*_GCRYPT_CONFIG_H_INCLUDED*/ ]) AH_VERBATIM([_REENTRANT], [/* To allow the use of Libgcrypt in multithreaded programs we have to use special features from the library. */ #ifndef _REENTRANT # define _REENTRANT 1 #endif ]) ###################### ## Basic checks. ### (we need some results later on (e.g. 
$GCC) ###################### AC_PROG_MAKE_SET missing_dir=`cd $ac_aux_dir && pwd` AM_MISSING_PROG(ACLOCAL, aclocal, $missing_dir) AM_MISSING_PROG(AUTOCONF, autoconf, $missing_dir) AM_MISSING_PROG(AUTOMAKE, automake, $missing_dir) AM_MISSING_PROG(AUTOHEADER, autoheader, $missing_dir) # AM_MISSING_PROG(MAKEINFO, makeinfo, $missing_dir) AC_PROG_CC AC_PROG_CPP AM_PROG_CC_C_O AM_PROG_AS AC_SEARCH_LIBS([strerror],[cposix]) AC_PROG_INSTALL AC_PROG_AWK AC_USE_SYSTEM_EXTENSIONS # Taken from mpfr-4.0.1, then modified for LDADD_FOR_TESTS_KLUDGE dnl Under Linux, make sure that the old dtags are used if LD_LIBRARY_PATH dnl is defined. The issue is that with the new dtags, LD_LIBRARY_PATH has dnl the precedence over the run path, so that if a compatible MPFR library dnl is installed in some directory from $LD_LIBRARY_PATH, then the tested dnl MPFR library will be this library instead of the MPFR library from the dnl build tree. Other OS with the same issue might be added later. dnl dnl References: dnl https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=859732 dnl http://lists.gnu.org/archive/html/libtool/2017-05/msg00000.html dnl dnl We need to check whether --disable-new-dtags is supported as alternate dnl linkers may be used (e.g., with tcc: CC=tcc LD=tcc). dnl case $host in *-*-linux*) if test -n "$LD_LIBRARY_PATH"; then saved_LDFLAGS="$LDFLAGS" LDADD_FOR_TESTS_KLUDGE="-Wl,--disable-new-dtags" LDFLAGS="$LDFLAGS $LDADD_FOR_TESTS_KLUDGE" AC_MSG_CHECKING(whether --disable-new-dtags is supported by the linker) AC_LINK_IFELSE([AC_LANG_SOURCE([[ int main (void) { return 0; } ]])], [AC_MSG_RESULT(yes (use it since LD_LIBRARY_PATH is set))], [AC_MSG_RESULT(no) LDADD_FOR_TESTS_KLUDGE="" ]) LDFLAGS="$saved_LDFLAGS" fi ;; esac AC_SUBST([LDADD_FOR_TESTS_KLUDGE]) VERSION_NUMBER=m4_esyscmd(printf "0x%02x%02x%02x" mym4_major \ mym4_minor mym4_micro) AC_SUBST(VERSION_NUMBER) # We need to compile and run a program on the build machine. AX_CC_FOR_BUILD LT_PREREQ([2.2.6]) LT_INIT([win32-dll disable-static]) LT_LANG([Windows Resource]) ########################## ## General definitions. ## ########################## # Used by libgcrypt-config LIBGCRYPT_CONFIG_LIBS="-lgcrypt" LIBGCRYPT_CONFIG_CFLAGS="" LIBGCRYPT_CONFIG_HOST="$host" # Definitions for symmetric ciphers. available_ciphers="arcfour blowfish cast5 des aes twofish serpent rfc2268 seed" available_ciphers="$available_ciphers camellia idea salsa20 gost28147 chacha20" available_ciphers="$available_ciphers sm4" enabled_ciphers="" # Definitions for public-key ciphers. available_pubkey_ciphers="dsa elgamal rsa ecc" enabled_pubkey_ciphers="" # Definitions for message digests. available_digests="crc gostr3411-94 md2 md4 md5 rmd160 sha1 sha256 sha512" available_digests="$available_digests sha3 tiger whirlpool stribog blake2" available_digests="$available_digests sm3" enabled_digests="" # Definitions for kdfs (optional ones) available_kdfs="s2k pkdf2 scrypt" enabled_kdfs="" # Definitions for random modules. available_random_modules="getentropy linux egd unix" auto_random_modules="$available_random_modules" # Supported thread backends. LIBGCRYPT_THREAD_MODULES="" # Other definitions. have_w32_system=no have_w32ce_system=no have_pthread=no # Setup some stuff depending on host. 
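The VERSION_NUMBER substitution above packs major/minor/micro into two hex digits each, so 1.11.0 becomes 0x010b00 and releases compare as plain integers. A small C illustration of that packing; the MAKE_VERSION_NUMBER macro is hypothetical, not something configure generates.

#include <stdio.h>

/* Hypothetical helper: pack major/minor/micro the same way the printf above
   does, two hex digits per component. */
#define MAKE_VERSION_NUMBER(maj, min, mic) \
  ((unsigned int)(((maj) << 16) | ((min) << 8) | (mic)))

int
main (void)
{
  unsigned int v = MAKE_VERSION_NUMBER (1, 11, 0);
  printf ("0x%06x\n", v);   /* prints 0x010b00, matching VERSION_NUMBER for 1.11.0 */
  return v >= MAKE_VERSION_NUMBER (1, 10, 0) ? 0 : 1;
}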
case "${host}" in *-*-mingw32*) ac_cv_have_dev_random=no have_w32_system=yes case "${host}" in *-mingw32ce*) have_w32ce_system=yes available_random_modules="w32ce" ;; *) available_random_modules="w32" ;; esac AC_DEFINE(USE_ONLY_8DOT3,1, [set this to limit filenames to the 8.3 format]) AC_DEFINE(HAVE_DRIVE_LETTERS,1, [defined if we must run on a stupid file system]) AC_DEFINE(HAVE_DOSISH_SYSTEM,1, [defined if we run on some of the PCDOS like systems (DOS, Windoze. OS/2) with special properties like no file modes]) ;; i?86-emx-os2 | i?86-*-os2*emx) # OS/2 with the EMX environment ac_cv_have_dev_random=no AC_DEFINE(HAVE_DRIVE_LETTERS) AC_DEFINE(HAVE_DOSISH_SYSTEM) ;; i?86-*-msdosdjgpp*) # DOS with the DJGPP environment ac_cv_have_dev_random=no AC_DEFINE(HAVE_DRIVE_LETTERS) AC_DEFINE(HAVE_DOSISH_SYSTEM) ;; *-*-hpux*) if test -z "$GCC" ; then CFLAGS="$CFLAGS -Ae -D_HPUX_SOURCE" fi ;; *-dec-osf4*) if test -z "$GCC" ; then # Suppress all warnings # to get rid of the unsigned/signed char mismatch warnings. CFLAGS="$CFLAGS -w" fi ;; m68k-atari-mint) ;; *-apple-darwin*) AC_DEFINE(_DARWIN_C_SOURCE, 1, Expose all libc features (__DARWIN_C_FULL).) AC_DEFINE(USE_POSIX_SPAWN_FOR_TESTS, 1, [defined if we use posix_spawn in test program]) AC_CHECK_HEADERS(spawn.h) ;; *) ;; esac if test "$have_w32_system" = yes; then AC_DEFINE(HAVE_W32_SYSTEM,1, [Defined if we run on a W32 API based system]) if test "$have_w32ce_system" = yes; then AC_DEFINE(HAVE_W32CE_SYSTEM,1,[Defined if we run on WindowsCE]) fi fi AM_CONDITIONAL(HAVE_W32_SYSTEM, test "$have_w32_system" = yes) AM_CONDITIONAL(HAVE_W32CE_SYSTEM, test "$have_w32ce_system" = yes) # A printable OS Name is sometimes useful. case "${host}" in *-*-mingw32ce*) PRINTABLE_OS_NAME="W32CE" ;; *-*-mingw32*) PRINTABLE_OS_NAME="W32" ;; i?86-emx-os2 | i?86-*-os2*emx ) PRINTABLE_OS_NAME="OS/2" ;; i?86-*-msdosdjgpp*) PRINTABLE_OS_NAME="MSDOS/DJGPP" ;; *-linux*) PRINTABLE_OS_NAME="GNU/Linux" ;; *) PRINTABLE_OS_NAME=`uname -s || echo "Unknown"` ;; esac NAME_OF_DEV_RANDOM="/dev/random" NAME_OF_DEV_URANDOM="/dev/urandom" AC_ARG_ENABLE(endian-check, AS_HELP_STRING([--disable-endian-check], [disable the endian check and trust the OS provided macros]), endiancheck=$enableval,endiancheck=yes) if test x"$endiancheck" = xyes ; then AC_C_BIGENDIAN else AC_DEFINE(DISABLED_ENDIAN_CHECK,1,[configure did not test for endianness]) fi AC_CHECK_SIZEOF(unsigned short, 2) AC_CHECK_SIZEOF(unsigned int, 4) AC_CHECK_SIZEOF(unsigned long, 4) AC_CHECK_SIZEOF(unsigned long long, 0) AC_CHECK_SIZEOF(void *, 0) AC_TYPE_UINTPTR_T if test "$ac_cv_sizeof_unsigned_short" = "0" \ || test "$ac_cv_sizeof_unsigned_int" = "0" \ || test "$ac_cv_sizeof_unsigned_long" = "0"; then AC_MSG_WARN([Hmmm, something is wrong with the sizes - using defaults]); fi # Ensure that we have UINT64_C before we bother to check for uint64_t AC_CACHE_CHECK([for UINT64_C],[gnupg_cv_uint64_c_works], AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include ]], [[uint64_t foo=UINT64_C(42);]])], gnupg_cv_uint64_c_works=yes,gnupg_cv_uint64_c_works=no)) if test "$gnupg_cv_uint64_c_works" = "yes" ; then AC_CHECK_SIZEOF(uint64_t) fi # Do we have any 64-bit data types? if test "$ac_cv_sizeof_unsigned_int" != "8" \ && test "$ac_cv_sizeof_unsigned_long" != "8" \ && test "$ac_cv_sizeof_unsigned_long_long" != "8" \ && test "$ac_cv_sizeof_uint64_t" != "8"; then AC_MSG_ERROR([[ *** *** No 64-bit integer type available. *** It is not possible to build Libgcrypt on this platform. 
***]]) fi # If not specified otherwise, all available algorithms will be # included. default_ciphers="$available_ciphers" default_pubkey_ciphers="$available_pubkey_ciphers" default_digests="$available_digests" default_kdfs="$available_kdfs" # Blacklist MD2 by default default_digests=`echo $default_digests | sed -e 's/md2//g'` # Substitutions to set generated files in a Emacs buffer to read-only. AC_SUBST(emacs_local_vars_begin, ['Local Variables:']) AC_SUBST(emacs_local_vars_read_only, ['buffer-read-only: t']) AC_SUBST(emacs_local_vars_end, ['End:']) ############################ ## Command line switches. ## ############################ # Implementation of the --enable-ciphers switch. AC_ARG_ENABLE(ciphers, AS_HELP_STRING([--enable-ciphers=ciphers], [select the symmetric ciphers to include]), [enabled_ciphers=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`], [enabled_ciphers=""]) if test "x$enabled_ciphers" = "x" \ -o "$enabled_ciphers" = "yes" \ -o "$enabled_ciphers" = "no"; then enabled_ciphers=$default_ciphers fi AC_MSG_CHECKING([which symmetric ciphers to include]) for cipher in $enabled_ciphers; do LIST_MEMBER($cipher, $available_ciphers) if test "$found" = "0"; then AC_MSG_ERROR([unsupported cipher "$cipher" specified]) fi done AC_MSG_RESULT([$enabled_ciphers]) # Implementation of the --enable-pubkey-ciphers switch. AC_ARG_ENABLE(pubkey-ciphers, AS_HELP_STRING([--enable-pubkey-ciphers=ciphers], [select the public-key ciphers to include]), [enabled_pubkey_ciphers=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`], [enabled_pubkey_ciphers=""]) if test "x$enabled_pubkey_ciphers" = "x" \ -o "$enabled_pubkey_ciphers" = "yes" \ -o "$enabled_pubkey_ciphers" = "no"; then enabled_pubkey_ciphers=$default_pubkey_ciphers fi AC_MSG_CHECKING([which public-key ciphers to include]) for cipher in $enabled_pubkey_ciphers; do LIST_MEMBER($cipher, $available_pubkey_ciphers) if test "$found" = "0"; then AC_MSG_ERROR([unsupported public-key cipher specified]) fi done AC_MSG_RESULT([$enabled_pubkey_ciphers]) # Implementation of the --enable-digests switch. AC_ARG_ENABLE(digests, AS_HELP_STRING([--enable-digests=digests], [select the message digests to include]), [enabled_digests=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`], [enabled_digests=""]) if test "x$enabled_digests" = "x" \ -o "$enabled_digests" = "yes" \ -o "$enabled_digests" = "no"; then enabled_digests=$default_digests fi AC_MSG_CHECKING([which message digests to include]) for digest in $enabled_digests; do LIST_MEMBER($digest, $available_digests) if test "$found" = "0"; then AC_MSG_ERROR([unsupported message digest specified]) fi done AC_MSG_RESULT([$enabled_digests]) # Implementation of the --enable-kdfs switch. AC_ARG_ENABLE(kdfs, AS_HELP_STRING([--enable-kfds=kdfs], [select the KDFs to include]), [enabled_kdfs=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`], [enabled_kdfs=""]) if test "x$enabled_kdfs" = "x" \ -o "$enabled_kdfs" = "yes" \ -o "$enabled_kdfs" = "no"; then enabled_kdfs=$default_kdfs fi AC_MSG_CHECKING([which key derivation functions to include]) for kdf in $enabled_kdfs; do LIST_MEMBER($kdf, $available_kdfs) if test "$found" = "0"; then AC_MSG_ERROR([unsupported key derivation function specified]) fi done AC_MSG_RESULT([$enabled_kdfs]) # Implementation of the --enable-random switch. 
AC_ARG_ENABLE(random, AS_HELP_STRING([--enable-random=name], [select which random number generator to use]), [random=`echo $enableval | tr '[A-Z]' '[a-z]'`], []) if test "x$random" = "x" -o "$random" = "yes" -o "$random" = "no"; then random=default fi AC_MSG_CHECKING([which random module to use]) if test "$random" != "default" -a "$random" != "auto"; then LIST_MEMBER($random, $available_random_modules) if test "$found" = "0"; then AC_MSG_ERROR([unsupported random module specified]) fi fi AC_MSG_RESULT($random) # Implementation of the --disable-dev-random switch. AC_MSG_CHECKING([whether use of /dev/random is requested]) AC_ARG_ENABLE(dev-random, [ --disable-dev-random disable the use of dev random], try_dev_random=$enableval, try_dev_random=yes) AC_MSG_RESULT($try_dev_random) # Implementation of the --with-egd-socket switch. AC_ARG_WITH(egd-socket, [ --with-egd-socket=NAME Use NAME for the EGD socket)], egd_socket_name="$withval", egd_socket_name="" ) AC_DEFINE_UNQUOTED(EGD_SOCKET_NAME, "$egd_socket_name", [Define if you don't want the default EGD socket name. For details see cipher/rndegd.c]) # Implementation of --disable-asm. AC_MSG_CHECKING([whether MPI and cipher assembler modules are requested]) AC_ARG_ENABLE([asm], AS_HELP_STRING([--disable-asm], [Disable MPI and cipher assembler modules]), [try_asm_modules=$enableval], [try_asm_modules=yes]) AC_MSG_RESULT($try_asm_modules) if test "$try_asm_modules" != yes ; then AC_DEFINE(ASM_DISABLED,1,[Defined if --disable-asm was used to configure]) fi # Implementation of the --enable-large-data-tests switch. AC_MSG_CHECKING([whether to run large data tests]) AC_ARG_ENABLE(large-data-tests, AS_HELP_STRING([--enable-large-data-tests], [Enable the real long ruinning large data tests]), large_data_tests=$enableval,large_data_tests=no) AC_MSG_RESULT($large_data_tests) AC_SUBST(RUN_LARGE_DATA_TESTS, $large_data_tests) # Implementation of --enable-force-soft-hwfeatures AC_MSG_CHECKING([whether 'soft' HW feature bits are forced on]) AC_ARG_ENABLE([force-soft-hwfeatures], AS_HELP_STRING([--enable-force-soft-hwfeatures], [Enable forcing 'soft' HW feature bits on]), [force_soft_hwfeatures=$enableval], [force_soft_hwfeatures=no]) AC_MSG_RESULT($force_soft_hwfeatures) # Implementation of the --with-capabilities switch. # Check whether we want to use Linux capabilities AC_MSG_CHECKING([whether use of capabilities is requested]) AC_ARG_WITH(capabilities, AS_HELP_STRING([--with-capabilities], [Use linux capabilities [default=no]]), [use_capabilities="$withval"],[use_capabilities=no]) AC_MSG_RESULT($use_capabilities) # Implementation of the --enable-hmac-binary-check. AC_MSG_CHECKING([whether a HMAC binary check is requested]) AC_ARG_ENABLE(hmac-binary-check, AS_HELP_STRING([--enable-hmac-binary-check], [Enable library integrity check]), [use_hmac_binary_check="$enableval"], [use_hmac_binary_check=no]) AC_MSG_RESULT($use_hmac_binary_check) if test "$use_hmac_binary_check" = no ; then DEF_HMAC_BINARY_CHECK='' else AC_DEFINE(ENABLE_HMAC_BINARY_CHECK,1, [Define to support an HMAC based integrity check]) AC_CHECK_TOOL(OBJCOPY, [objcopy]) AC_CHECK_TOOL(READELF, [readelf]) if test "$use_hmac_binary_check" != yes ; then DEF_HMAC_BINARY_CHECK=-DKEY_FOR_BINARY_CHECK="'\"$use_hmac_binary_check\"'" fi fi AM_CONDITIONAL(USE_HMAC_BINARY_CHECK, test "x$use_hmac_binary_check" != xno) AC_SUBST(DEF_HMAC_BINARY_CHECK) # Implementation of the --with-fips-module-version. 
AC_ARG_WITH(fips-module-version, AS_HELP_STRING([--with-fips-module-version=VERSION], [Specify the FIPS module version for the build]), fips_module_version="$withval", fips_module_version="" ) AC_DEFINE_UNQUOTED(FIPS_MODULE_VERSION, "$fips_module_version", [Define FIPS module version for certification]) # Implementation of the --disable-jent-support switch. AC_MSG_CHECKING([whether jitter entropy support is requested]) AC_ARG_ENABLE(jent-support, AS_HELP_STRING([--disable-jent-support], [Disable support for the Jitter entropy collector]), jentsupport=$enableval,jentsupport=yes) AC_MSG_RESULT($jentsupport) # Implementation of the --disable-padlock-support switch. AC_MSG_CHECKING([whether padlock support is requested]) AC_ARG_ENABLE(padlock-support, AS_HELP_STRING([--disable-padlock-support], [Disable support for the PadLock Engine of VIA processors]), padlocksupport=$enableval,padlocksupport=yes) AC_MSG_RESULT($padlocksupport) # Implementation of the --disable-aesni-support switch. AC_MSG_CHECKING([whether AESNI support is requested]) AC_ARG_ENABLE(aesni-support, AS_HELP_STRING([--disable-aesni-support], [Disable support for the Intel AES-NI instructions]), aesnisupport=$enableval,aesnisupport=yes) AC_MSG_RESULT($aesnisupport) # Implementation of the --disable-shaext-support switch. AC_MSG_CHECKING([whether SHAEXT support is requested]) AC_ARG_ENABLE(shaext-support, AS_HELP_STRING([--disable-shaext-support], [Disable support for the Intel SHAEXT instructions]), shaextsupport=$enableval,shaextsupport=yes) AC_MSG_RESULT($shaextsupport) # Implementation of the --disable-pclmul-support switch. AC_MSG_CHECKING([whether PCLMUL support is requested]) AC_ARG_ENABLE(pclmul-support, AS_HELP_STRING([--disable-pclmul-support], [Disable support for the Intel PCLMUL instructions]), pclmulsupport=$enableval,pclmulsupport=yes) AC_MSG_RESULT($pclmulsupport) # Implementation of the --disable-sse41-support switch. AC_MSG_CHECKING([whether SSE4.1 support is requested]) AC_ARG_ENABLE(sse41-support, AS_HELP_STRING([--disable-sse41-support], [Disable support for the Intel SSE4.1 instructions]), sse41support=$enableval,sse41support=yes) AC_MSG_RESULT($sse41support) # Implementation of the --disable-drng-support switch. AC_MSG_CHECKING([whether DRNG support is requested]) AC_ARG_ENABLE(drng-support, AS_HELP_STRING([--disable-drng-support], [Disable support for the Intel DRNG (RDRAND instruction)]), drngsupport=$enableval,drngsupport=yes) AC_MSG_RESULT($drngsupport) # Implementation of the --disable-avx-support switch. AC_MSG_CHECKING([whether AVX support is requested]) AC_ARG_ENABLE(avx-support, AS_HELP_STRING([--disable-avx-support], [Disable support for the Intel AVX instructions]), avxsupport=$enableval,avxsupport=yes) AC_MSG_RESULT($avxsupport) # Implementation of the --disable-avx2-support switch. AC_MSG_CHECKING([whether AVX2 support is requested]) AC_ARG_ENABLE(avx2-support, AS_HELP_STRING([--disable-avx2-support], [Disable support for the Intel AVX2 instructions]), avx2support=$enableval,avx2support=yes) AC_MSG_RESULT($avx2support) # Implementation of the --disable-avx512-support switch. AC_MSG_CHECKING([whether AVX512 support is requested]) AC_ARG_ENABLE(avx512-support, AS_HELP_STRING([--disable-avx512-support], [Disable support for the Intel AVX512 instructions]), avx512support=$enableval,avx512support=yes) AC_MSG_RESULT($avx512support) # Implementation of the --disable-gfni-support switch. 
AC_MSG_CHECKING([whether GFNI support is requested]) AC_ARG_ENABLE(gfni-support, AS_HELP_STRING([--disable-gfni-support], [Disable support for the Intel GFNI instructions]), gfnisupport=$enableval,gfnisupport=yes) AC_MSG_RESULT($gfnisupport) # Implementation of the --disable-neon-support switch. AC_MSG_CHECKING([whether NEON support is requested]) AC_ARG_ENABLE(neon-support, AS_HELP_STRING([--disable-neon-support], [Disable support for the ARM NEON instructions]), neonsupport=$enableval,neonsupport=yes) AC_MSG_RESULT($neonsupport) # Implementation of the --disable-arm-crypto-support switch. AC_MSG_CHECKING([whether ARMv8 Crypto Extension support is requested]) AC_ARG_ENABLE(arm-crypto-support, AS_HELP_STRING([--disable-arm-crypto-support], [Disable support for the ARMv8 Crypto Extension instructions]), armcryptosupport=$enableval,armcryptosupport=yes) AC_MSG_RESULT($armcryptosupport) # Implementation of the --disable-ppc-crypto-support switch. AC_MSG_CHECKING([whether PPC crypto support is requested]) AC_ARG_ENABLE(ppc-crypto-support, AS_HELP_STRING([--disable-ppc-crypto-support], [Disable support for the PPC crypto instructions introduced in POWER 8 (PowerISA 2.07)]), ppccryptosupport=$enableval,ppccryptosupport=yes) AC_MSG_RESULT($ppccryptosupport) # Implementation of the --disable-O-flag-munging switch. AC_MSG_CHECKING([whether a -O flag munging is requested]) AC_ARG_ENABLE([O-flag-munging], AS_HELP_STRING([--disable-O-flag-munging], [Disable modification of the cc -O flag]), [enable_o_flag_munging=$enableval], [enable_o_flag_munging=yes]) AC_MSG_RESULT($enable_o_flag_munging) AM_CONDITIONAL(ENABLE_O_FLAG_MUNGING, test "$enable_o_flag_munging" = "yes") # Implementation of the --disable-instrumentation-munging switch. AC_MSG_CHECKING([whether a instrumentation (-fprofile, -fsanitize) munging is requested]) AC_ARG_ENABLE([instrumentation-munging], AS_HELP_STRING([--disable-instrumentation-munging], [Disable modification of the cc instrumentation options]), [enable_instrumentation_munging=$enableval], [enable_instrumentation_munging=yes]) AC_MSG_RESULT($enable_instrumentation_munging) AM_CONDITIONAL(ENABLE_INSTRUMENTATION_MUNGING, test "$enable_instrumentation_munging" = "yes") # Implementation of the --disable-amd64-as-feature-detection switch. AC_MSG_CHECKING([whether to enable AMD64 as(1) feature detection]) AC_ARG_ENABLE(amd64-as-feature-detection, AS_HELP_STRING([--disable-amd64-as-feature-detection], [Disable the auto-detection of AMD64 as(1) features]), amd64_as_feature_detection=$enableval, amd64_as_feature_detection=yes) AC_MSG_RESULT($amd64_as_feature_detection) AC_DEFINE_UNQUOTED(PRINTABLE_OS_NAME, "$PRINTABLE_OS_NAME", [A human readable text with the name of the OS]) # For some systems we know that we have ld_version scripts. # Use it then as default. have_ld_version_script=no case "${host}" in *-*-linux*) have_ld_version_script=yes ;; *-*-gnu*) have_ld_version_script=yes ;; esac AC_ARG_ENABLE([ld-version-script], AS_HELP_STRING([--enable-ld-version-script], [enable/disable use of linker version script. (default is system dependent)]), [have_ld_version_script=$enableval], [ : ] ) AM_CONDITIONAL(HAVE_LD_VERSION_SCRIPT, test "$have_ld_version_script" = "yes") AC_DEFINE_UNQUOTED(NAME_OF_DEV_RANDOM, "$NAME_OF_DEV_RANDOM", [defined to the name of the strong random device]) AC_DEFINE_UNQUOTED(NAME_OF_DEV_URANDOM, "$NAME_OF_DEV_URANDOM", [defined to the name of the weaker random device]) ############################### #### Checks for libraries. 
#### ############################### # # gpg-error is required. # AM_PATH_GPG_ERROR("$NEED_GPG_ERROR_VERSION") if test "x$GPG_ERROR_LIBS" = "x"; then AC_MSG_ERROR([libgpg-error is needed. See ftp://ftp.gnupg.org/gcrypt/libgpg-error/ .]) fi AC_DEFINE(GPG_ERR_SOURCE_DEFAULT, GPG_ERR_SOURCE_GCRYPT, [The default error source for libgcrypt.]) AM_CONDITIONAL(USE_GPGRT_CONFIG, [test -n "$GPGRT_CONFIG" \ -a "$ac_cv_path_GPG_ERROR_CONFIG" = no]) # # Check whether pthreads is available # if test "$have_w32_system" != yes; then AC_CHECK_LIB(pthread,pthread_create,have_pthread=yes) if test "$have_pthread" = yes; then AC_DEFINE(HAVE_PTHREAD, 1 ,[Define if we have pthread.]) fi fi # Solaris needs -lsocket and -lnsl. Unisys system includes # gethostbyname in libsocket but needs libnsl for socket. AC_SEARCH_LIBS(setsockopt, [socket], , [AC_SEARCH_LIBS(setsockopt, [socket], , , [-lnsl])]) AC_SEARCH_LIBS(setsockopt, [nsl]) ################################## #### Checks for header files. #### ################################## AC_CHECK_HEADERS(unistd.h sys/auxv.h sys/random.h) ########################################## #### Checks for typedefs, structures, #### #### and compiler characteristics. #### ########################################## AC_C_CONST AC_C_INLINE AC_TYPE_SIZE_T AC_TYPE_PID_T AC_CHECK_TYPES([byte, ushort, u16, u32, u64]) # # Check for __builtin_bswap32 intrinsic. # AC_CACHE_CHECK(for __builtin_bswap32, [gcry_cv_have_builtin_bswap32], [gcry_cv_have_builtin_bswap32=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [int x = 0; int y = __builtin_bswap32(x); return y;])], [gcry_cv_have_builtin_bswap32=yes])]) if test "$gcry_cv_have_builtin_bswap32" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_BSWAP32,1, [Defined if compiler has '__builtin_bswap32' intrinsic]) fi # # Check for __builtin_bswap64 intrinsic. # AC_CACHE_CHECK(for __builtin_bswap64, [gcry_cv_have_builtin_bswap64], [gcry_cv_have_builtin_bswap64=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [long long x = 0; long long y = __builtin_bswap64(x); return y;])], [gcry_cv_have_builtin_bswap64=yes])]) if test "$gcry_cv_have_builtin_bswap64" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_BSWAP64,1, [Defined if compiler has '__builtin_bswap64' intrinsic]) fi # # Check for __builtin_ctz intrinsic. # AC_CACHE_CHECK(for __builtin_ctz, [gcry_cv_have_builtin_ctz], [gcry_cv_have_builtin_ctz=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [unsigned int x = 0; int y = __builtin_ctz(x); return y;])], [gcry_cv_have_builtin_ctz=yes])]) if test "$gcry_cv_have_builtin_ctz" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_CTZ, 1, [Defined if compiler has '__builtin_ctz' intrinsic]) fi # # Check for __builtin_ctzl intrinsic. # AC_CACHE_CHECK(for __builtin_ctzl, [gcry_cv_have_builtin_ctzl], [gcry_cv_have_builtin_ctzl=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [unsigned long x = 0; long y = __builtin_ctzl(x); return y;])], [gcry_cv_have_builtin_ctzl=yes])]) if test "$gcry_cv_have_builtin_ctzl" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_CTZL, 1, [Defined if compiler has '__builtin_ctzl' intrinsic]) fi # # Check for __builtin_clz intrinsic. # AC_CACHE_CHECK(for __builtin_clz, [gcry_cv_have_builtin_clz], [gcry_cv_have_builtin_clz=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [unsigned int x = 0; int y = __builtin_clz(x); return y;])], [gcry_cv_have_builtin_clz=yes])]) if test "$gcry_cv_have_builtin_clz" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_CLZ, 1, [Defined if compiler has '__builtin_clz' intrinsic]) fi # # Check for __builtin_clzl intrinsic. 
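The __builtin_bswap32/__builtin_bswap64 and __builtin_ctz/__builtin_clz probes above only define the corresponding HAVE_BUILTIN_* macros; code consuming them typically picks the intrinsic when available and an equivalent portable fallback otherwise. A hedged sketch of that pattern (function names are illustrative, not the library's actual helpers):

#include <stdint.h>

/* Byte swap: use the intrinsic when the probe defined HAVE_BUILTIN_BSWAP32,
   otherwise fall back to shifts and masks. */
static inline uint32_t
my_bswap32 (uint32_t x)
{
#ifdef HAVE_BUILTIN_BSWAP32
  return __builtin_bswap32 (x);
#else
  return (x << 24) | ((x & 0xff00U) << 8) | ((x >> 8) & 0xff00U) | (x >> 24);
#endif
}

/* Count trailing zero bits; x must be non-zero for the builtin. */
static inline unsigned int
my_ctz32 (uint32_t x)
{
#ifdef HAVE_BUILTIN_CTZ
  return (unsigned int)__builtin_ctz (x);
#else
  unsigned int n = 0;
  while ((x & 1) == 0)
    {
      x >>= 1;
      n++;
    }
  return n;
#endif
}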
# AC_CACHE_CHECK(for __builtin_clzl, [gcry_cv_have_builtin_clzl], [gcry_cv_have_builtin_clzl=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [unsigned long x = 0; long y = __builtin_clzl(x); return y;])], [gcry_cv_have_builtin_clzl=yes])]) if test "$gcry_cv_have_builtin_clzl" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_CLZL, 1, [Defined if compiler has '__builtin_clzl' intrinsic]) fi # # Check for __sync_synchronize intrinsic. # AC_CACHE_CHECK(for __sync_synchronize, [gcry_cv_have_sync_synchronize], [gcry_cv_have_sync_synchronize=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [__sync_synchronize(); return 0;])], [gcry_cv_have_sync_synchronize=yes])]) if test "$gcry_cv_have_sync_synchronize" = "yes" ; then AC_DEFINE(HAVE_SYNC_SYNCHRONIZE, 1, [Defined if compiler has '__sync_synchronize' intrinsic]) fi # # Check for VLA support (variable length arrays). # AC_CACHE_CHECK(whether the variable length arrays are supported, [gcry_cv_have_vla], [gcry_cv_have_vla=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void f1(char *, int); char foo(int i) { char b[(i < 0 ? 0 : i) + 1]; f1(b, sizeof b); return b[0];}]])], [gcry_cv_have_vla=yes])]) if test "$gcry_cv_have_vla" = "yes" ; then AC_DEFINE(HAVE_VLA,1, [Defined if variable length arrays are supported]) fi # # Check for ELF visibility support. # AC_CACHE_CHECK(whether the visibility attribute is supported, gcry_cv_visibility_attribute, [gcry_cv_visibility_attribute=no AC_LANG_CONFTEST([AC_LANG_SOURCE( [[int foo __attribute__ ((visibility ("hidden"))) = 1; int bar __attribute__ ((visibility ("protected"))) = 1; ]])]) if ${CC-cc} -Werror -S conftest.c -o conftest.s \ 1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD ; then if grep '\.hidden.*foo' conftest.s >/dev/null 2>&1 ; then if grep '\.protected.*bar' conftest.s >/dev/null 2>&1; then gcry_cv_visibility_attribute=yes fi fi fi ]) if test "$gcry_cv_visibility_attribute" = "yes"; then AC_CACHE_CHECK(for broken visibility attribute, gcry_cv_broken_visibility_attribute, [gcry_cv_broken_visibility_attribute=yes AC_LANG_CONFTEST([AC_LANG_SOURCE( [[int foo (int x); int bar (int x) __asm__ ("foo") __attribute__ ((visibility ("hidden"))); int bar (int x) { return x; } ]])]) if ${CC-cc} -Werror -S conftest.c -o conftest.s \ 1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD ; then if grep '\.hidden@<:@ _@:>@foo' conftest.s >/dev/null 2>&1; then gcry_cv_broken_visibility_attribute=no fi fi ]) fi if test "$gcry_cv_visibility_attribute" = "yes"; then AC_CACHE_CHECK(for broken alias attribute, gcry_cv_broken_alias_attribute, [gcry_cv_broken_alias_attribute=yes AC_LANG_CONFTEST([AC_LANG_SOURCE( [[extern int foo (int x) __asm ("xyzzy"); int bar (int x) { return x; } extern __typeof (bar) foo __attribute ((weak, alias ("bar"))); extern int dfoo; extern __typeof (dfoo) dfoo __asm ("abccb"); int dfoo = 1; ]])]) if ${CC-cc} -Werror -S conftest.c -o conftest.s \ 1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD ; then if grep 'xyzzy' conftest.s >/dev/null 2>&1 && \ grep 'abccb' conftest.s >/dev/null 2>&1; then gcry_cv_broken_alias_attribute=no fi fi ]) fi if test "$gcry_cv_visibility_attribute" = "yes"; then AC_CACHE_CHECK(if gcc supports -fvisibility=hidden, gcry_cv_gcc_has_f_visibility, [gcry_cv_gcc_has_f_visibility=no _gcc_cflags_save=$CFLAGS CFLAGS="-fvisibility=hidden" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])], gcry_cv_gcc_has_f_visibility=yes) CFLAGS=$_gcc_cflags_save; ]) fi if test "$gcry_cv_visibility_attribute" = "yes" \ && test "$gcry_cv_broken_visibility_attribute" != "yes" \ && test "$gcry_cv_broken_alias_attribute" != "yes" \ && test 
"$gcry_cv_gcc_has_f_visibility" = "yes" then AC_DEFINE(GCRY_USE_VISIBILITY, 1, [Define to use the GNU C visibility attribute.]) CFLAGS="$CFLAGS -fvisibility=hidden" fi # Following attribute tests depend on warnings to cause compile to fail, # so set -Werror temporarily. _gcc_cflags_save=$CFLAGS CFLAGS="$CFLAGS -Werror" # # Check whether the compiler supports the GCC style aligned attribute # AC_CACHE_CHECK([whether the GCC style aligned attribute is supported], [gcry_cv_gcc_attribute_aligned], [gcry_cv_gcc_attribute_aligned=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[struct { int a; } foo __attribute__ ((aligned (16)));]])], [gcry_cv_gcc_attribute_aligned=yes])]) if test "$gcry_cv_gcc_attribute_aligned" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_ALIGNED,1, [Defined if a GCC style "__attribute__ ((aligned (n))" is supported]) fi # # Check whether the compiler supports the GCC style packed attribute # AC_CACHE_CHECK([whether the GCC style packed attribute is supported], [gcry_cv_gcc_attribute_packed], [gcry_cv_gcc_attribute_packed=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[struct foolong_s { long b; } __attribute__ ((packed)); struct foo_s { char a; struct foolong_s b; } __attribute__ ((packed)); enum bar { FOO = 1 / (sizeof(struct foo_s) == (sizeof(char) + sizeof(long))), };]])], [gcry_cv_gcc_attribute_packed=yes])]) if test "$gcry_cv_gcc_attribute_packed" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_PACKED,1, [Defined if a GCC style "__attribute__ ((packed))" is supported]) fi # # Check whether the compiler supports the GCC style may_alias attribute # AC_CACHE_CHECK([whether the GCC style may_alias attribute is supported], [gcry_cv_gcc_attribute_may_alias], [gcry_cv_gcc_attribute_may_alias=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[typedef struct foo_s { int a; } __attribute__ ((may_alias)) foo_t;]])], [gcry_cv_gcc_attribute_may_alias=yes])]) if test "$gcry_cv_gcc_attribute_may_alias" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_MAY_ALIAS,1, [Defined if a GCC style "__attribute__ ((may_alias))" is supported]) fi # Restore flags. CFLAGS=$_gcc_cflags_save; # # Check whether the compiler supports 'asm' or '__asm__' keyword for # assembler blocks. # AC_CACHE_CHECK([whether 'asm' assembler keyword is supported], [gcry_cv_have_asm], [gcry_cv_have_asm=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { asm("":::"memory"); }]])], [gcry_cv_have_asm=yes])]) AC_CACHE_CHECK([whether '__asm__' assembler keyword is supported], [gcry_cv_have___asm__], [gcry_cv_have___asm__=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { __asm__("":::"memory"); }]])], [gcry_cv_have___asm__=yes])]) if test "$gcry_cv_have_asm" = "no" ; then if test "$gcry_cv_have___asm__" = "yes" ; then AC_DEFINE(asm,__asm__, [Define to supported assembler block keyword, if plain 'asm' was not supported]) fi fi # # Check whether the compiler supports inline assembly memory barrier. 
# if test "$gcry_cv_have_asm" = "no" ; then if test "$gcry_cv_have___asm__" = "yes" ; then AC_CACHE_CHECK([whether inline assembly memory barrier is supported], [gcry_cv_have_asm_volatile_memory], [gcry_cv_have_asm_volatile_memory=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(int x) { __asm__ volatile("":::"memory"); __asm__ volatile("":"+r"(x)::"memory"); }]])], [gcry_cv_have_asm_volatile_memory=yes])]) fi else AC_CACHE_CHECK([whether inline assembly memory barrier is supported], [gcry_cv_have_asm_volatile_memory], [gcry_cv_have_asm_volatile_memory=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(int x) { asm volatile("":::"memory"); asm volatile("":"+r"(x)::"memory"); }]])], [gcry_cv_have_asm_volatile_memory=yes])]) fi if test "$gcry_cv_have_asm_volatile_memory" = "yes" ; then AC_DEFINE(HAVE_GCC_ASM_VOLATILE_MEMORY,1, [Define if inline asm memory barrier is supported]) fi # # Check whether GCC assembler supports features needed for our ARM # implementations. This needs to be done before setting up the # assembler stuff. # AC_CACHE_CHECK([whether GCC assembler is compatible for ARM assembly implementations], [gcry_cv_gcc_arm_platform_as_ok], [if test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_arm_platform_as_ok="n/a" else gcry_cv_gcc_arm_platform_as_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( /* Test if assembler supports UAL syntax. */ ".syntax unified\n\t" ".arm\n\t" /* our assembly code is in ARM mode */ ".text\n\t" /* Following causes error if assembler ignored '.syntax unified'. */ "asmfunc:\n\t" "add %r0, %r0, %r4, ror #12;\n\t" /* Test if '.type' and '.size' are supported. */ ".size asmfunc,.-asmfunc;\n\t" ".type asmfunc,%function;\n\t" );]], [ asmfunc(); ] )], [gcry_cv_gcc_arm_platform_as_ok=yes]) fi]) if test "$gcry_cv_gcc_arm_platform_as_ok" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS,1, [Defined if underlying assembler is compatible with ARM assembly implementations]) fi # # Check whether GCC assembler supports features needed for our ARMv8/Aarch64 # implementations. This needs to be done before setting up the # assembler stuff. # AC_CACHE_CHECK([whether GCC assembler is compatible for ARMv8/Aarch64 assembly implementations], [gcry_cv_gcc_aarch64_platform_as_ok], [if test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_aarch64_platform_as_ok="n/a" else gcry_cv_gcc_aarch64_platform_as_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".text\n\t" "asmfunc:\n\t" "eor x0, x0, x30, ror #12;\n\t" "add x0, x0, x30, asr #12;\n\t" "eor v0.16b, v0.16b, v31.16b;\n\t" );]], [ asmfunc(); ] )], [gcry_cv_gcc_aarch64_platform_as_ok=yes]) fi]) if test "$gcry_cv_gcc_aarch64_platform_as_ok" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS,1, [Defined if underlying assembler is compatible with ARMv8/Aarch64 assembly implementations]) fi # # Check whether GCC assembler supports for CFI directives. 
# AC_CACHE_CHECK([whether GCC assembler supports for CFI directives], [gcry_cv_gcc_asm_cfi_directives], [gcry_cv_gcc_asm_cfi_directives=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".text\n\t" "ac_test:\n\t" ".cfi_startproc\n\t" ".cfi_remember_state\n\t" ".cfi_adjust_cfa_offset 8\n\t" ".cfi_rel_offset 0, 8\n\t" ".cfi_def_cfa_register 1\n\t" ".cfi_register 2, 3\n\t" ".cfi_restore 2\n\t" ".cfi_escape 0x0f, 0x02, 0x11, 0x00\n\t" ".cfi_restore_state\n\t" ".long 0\n\t" ".cfi_endproc\n\t" );]])], [gcry_cv_gcc_asm_cfi_directives=yes])]) if test "$gcry_cv_gcc_asm_cfi_directives" = "yes" ; then AC_DEFINE(HAVE_GCC_ASM_CFI_DIRECTIVES,1, [Defined if underlying assembler supports for CFI directives]) fi # # Check whether GCC assembler supports for ELF directives. # AC_CACHE_CHECK([whether GCC assembler supports for ELF directives], [gcry_cv_gcc_asm_elf_directives], [gcry_cv_gcc_asm_elf_directives=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( /* Test if ELF directives '.type' and '.size' are supported. */ ".text\n\t" "asmfunc:\n\t" ".size asmfunc,.-asmfunc;\n\t" ".type asmfunc,STT_FUNC;\n\t" );]])], [gcry_cv_gcc_asm_elf_directives=yes])]) if test "$gcry_cv_gcc_asm_elf_directives" = "yes" ; then AC_DEFINE(HAVE_GCC_ASM_ELF_DIRECTIVES,1, [Defined if underlying assembler supports for ELF directives]) fi # # Check whether underscores in symbols are required. This needs to be # done before setting up the assembler stuff. # GNUPG_SYS_SYMBOL_UNDERSCORE() ################################# #### #### #### Setup assembler stuff. #### #### Define mpi_cpu_arch. #### #### #### ################################# AC_ARG_ENABLE(mpi-path, AS_HELP_STRING([--enable-mpi-path=EXTRA_PATH], [prepend EXTRA_PATH to list of CPU specific optimizations]), mpi_extra_path="$enableval",mpi_extra_path="") AC_MSG_CHECKING(architecture and mpi assembler functions) if test -f $srcdir/mpi/config.links ; then . $srcdir/mpi/config.links AC_CONFIG_LINKS("$mpi_ln_list") ac_cv_mpi_sflags="$mpi_sflags" AC_MSG_RESULT($mpi_cpu_arch) else AC_MSG_RESULT(failed) AC_MSG_ERROR([mpi/config.links missing!]) fi MPI_SFLAGS="$ac_cv_mpi_sflags" AC_SUBST(MPI_SFLAGS) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_ADD1, test "$mpi_mod_asm_mpih_add1" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_SUB1, test "$mpi_mod_asm_mpih_sub1" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_MUL1, test "$mpi_mod_asm_mpih_mul1" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_MUL2, test "$mpi_mod_asm_mpih_mul2" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_MUL3, test "$mpi_mod_asm_mpih_mul3" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_LSHIFT, test "$mpi_mod_asm_mpih_lshift" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_RSHIFT, test "$mpi_mod_asm_mpih_rshift" = yes) AM_CONDITIONAL(MPI_MOD_ASM_UDIV, test "$mpi_mod_asm_udiv" = yes) AM_CONDITIONAL(MPI_MOD_ASM_UDIV_QRNND, test "$mpi_mod_asm_udiv_qrnnd" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_ADD1, test "$mpi_mod_c_mpih_add1" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_SUB1, test "$mpi_mod_c_mpih_sub1" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_MUL1, test "$mpi_mod_c_mpih_mul1" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_MUL2, test "$mpi_mod_c_mpih_mul2" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_MUL3, test "$mpi_mod_c_mpih_mul3" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_LSHIFT, test "$mpi_mod_c_mpih_lshift" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_RSHIFT, test "$mpi_mod_c_mpih_rshift" = yes) AM_CONDITIONAL(MPI_MOD_C_UDIV, test "$mpi_mod_c_udiv" = yes) AM_CONDITIONAL(MPI_MOD_C_UDIV_QRNND, test "$mpi_mod_c_udiv_qrnnd" = yes) # Reset non applicable feature flags. 
if test "$mpi_cpu_arch" != "x86" ; then aesnisupport="n/a" shaextsupport="n/a" pclmulsupport="n/a" sse41support="n/a" avxsupport="n/a" avx2support="n/a" avx512support="n/a" gfnisupport="n/a" padlocksupport="n/a" drngsupport="n/a" fi if test "$mpi_cpu_arch" != "arm" ; then if test "$mpi_cpu_arch" != "aarch64" ; then neonsupport="n/a" armcryptosupport="n/a" fi fi if test "$mpi_cpu_arch" != "ppc"; then ppccryptosupport="n/a" fi ############################################# #### #### #### Platform specific compiler checks. #### #### #### ############################################# # Following tests depend on warnings to cause compile to fail, so set -Werror # temporarily. _gcc_cflags_save=$CFLAGS CFLAGS="$CFLAGS -Werror" # # Check whether compiler supports 'ms_abi' function attribute. # AC_CACHE_CHECK([whether compiler supports 'ms_abi' function attribute], [gcry_cv_gcc_attribute_ms_abi], [gcry_cv_gcc_attribute_ms_abi=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[int __attribute__ ((ms_abi)) proto(int);]])], [gcry_cv_gcc_attribute_ms_abi=yes])]) if test "$gcry_cv_gcc_attribute_ms_abi" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_MS_ABI,1, [Defined if compiler supports "__attribute__ ((ms_abi))" function attribute]) fi # # Check whether compiler supports 'sysv_abi' function attribute. # AC_CACHE_CHECK([whether compiler supports 'sysv_abi' function attribute], [gcry_cv_gcc_attribute_sysv_abi], [gcry_cv_gcc_attribute_sysv_abi=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[int __attribute__ ((sysv_abi)) proto(int);]])], [gcry_cv_gcc_attribute_sysv_abi=yes])]) if test "$gcry_cv_gcc_attribute_sysv_abi" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_SYSV_ABI,1, [Defined if compiler supports "__attribute__ ((sysv_abi))" function attribute]) fi # # Check whether default calling convention is 'ms_abi'. # if test "$gcry_cv_gcc_attribute_ms_abi" = "yes" ; then AC_CACHE_CHECK([whether default calling convention is 'ms_abi'], [gcry_cv_gcc_default_abi_is_ms_abi], [gcry_cv_gcc_default_abi_is_ms_abi=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void *test(void) { void *(*def_func)(void) = test; void *__attribute__((ms_abi))(*msabi_func)(void); /* warning on SysV abi targets, passes on Windows based targets */ msabi_func = def_func; return msabi_func; }]])], [gcry_cv_gcc_default_abi_is_ms_abi=yes])]) if test "$gcry_cv_gcc_default_abi_is_ms_abi" = "yes" ; then AC_DEFINE(HAVE_GCC_DEFAULT_ABI_IS_MS_ABI,1, [Defined if default calling convention is 'ms_abi']) fi fi # # Check whether default calling convention is 'sysv_abi'. # if test "$gcry_cv_gcc_attribute_sysv_abi" = "yes" ; then AC_CACHE_CHECK([whether default calling convention is 'sysv_abi'], [gcry_cv_gcc_default_abi_is_sysv_abi], [gcry_cv_gcc_default_abi_is_sysv_abi=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void *test(void) { void *(*def_func)(void) = test; void *__attribute__((sysv_abi))(*sysvabi_func)(void); /* warning on MS ABI targets, passes on SysV ABI targets */ sysvabi_func = def_func; return sysvabi_func; }]])], [gcry_cv_gcc_default_abi_is_sysv_abi=yes])]) if test "$gcry_cv_gcc_default_abi_is_sysv_abi" = "yes" ; then AC_DEFINE(HAVE_GCC_DEFAULT_ABI_IS_SYSV_ABI,1, [Defined if default calling convention is 'sysv_abi']) fi fi # Restore flags. CFLAGS=$_gcc_cflags_save; # # Check whether GCC inline assembler supports SSSE3 instructions # This is required for the AES-NI instructions. 
# AC_CACHE_CHECK([whether GCC inline assembler supports SSSE3 instructions], [gcry_cv_gcc_inline_asm_ssse3], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_ssse3="n/a" else gcry_cv_gcc_inline_asm_ssse3=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[static unsigned char be_mask[16] __attribute__ ((aligned (16))) = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; void a(void) { __asm__("pshufb %[mask], %%xmm2\n\t"::[mask]"m"(*be_mask):); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_ssse3=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_ssse3" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_SSSE3,1, [Defined if inline assembler supports SSSE3 instructions]) fi # # Check whether GCC inline assembler supports PCLMUL instructions. # AC_CACHE_CHECK([whether GCC inline assembler supports PCLMUL instructions], [gcry_cv_gcc_inline_asm_pclmul], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_pclmul="n/a" else gcry_cv_gcc_inline_asm_pclmul=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { __asm__("pclmulqdq \$0, %%xmm1, %%xmm3\n\t":::"cc"); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_pclmul=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_pclmul" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_PCLMUL,1, [Defined if inline assembler supports PCLMUL instructions]) fi # # Check whether GCC inline assembler supports SHA Extensions instructions. # AC_CACHE_CHECK([whether GCC inline assembler supports SHA Extensions instructions], [gcry_cv_gcc_inline_asm_shaext], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_shaext="n/a" else gcry_cv_gcc_inline_asm_shaext=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { __asm__("sha1rnds4 \$0, %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha1nexte %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha1msg1 %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha1msg2 %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha256rnds2 %%xmm0, %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha256msg1 %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha256msg2 %%xmm1, %%xmm3\n\t":::"cc"); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_shaext=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_shaext" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_SHAEXT,1, [Defined if inline assembler supports SHA Extensions instructions]) fi # # Check whether GCC inline assembler supports SSE4.1 instructions. 
# AC_CACHE_CHECK([whether GCC inline assembler supports SSE4.1 instructions], [gcry_cv_gcc_inline_asm_sse41], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_sse41="n/a" else gcry_cv_gcc_inline_asm_sse41=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { int i; __asm__("pextrd \$2, %%xmm0, %[out]\n\t" : [out] "=m" (i)); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_sse41=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_sse41" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_SSE41,1, [Defined if inline assembler supports SSE4.1 instructions]) fi # # Check whether GCC inline assembler supports AVX instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AVX instructions], [gcry_cv_gcc_inline_asm_avx], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_avx="n/a" else gcry_cv_gcc_inline_asm_avx=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { __asm__("xgetbv; vaesdeclast (%[mem]),%%xmm0,%%xmm7\n\t"::[mem]"r"(0):); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_avx=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_avx" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AVX,1, [Defined if inline assembler supports AVX instructions]) fi # # Check whether GCC inline assembler supports AVX2 instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AVX2 instructions], [gcry_cv_gcc_inline_asm_avx2], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_avx2="n/a" else gcry_cv_gcc_inline_asm_avx2=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { __asm__("xgetbv; vpbroadcastb %%xmm7,%%ymm1\n\t":::"cc"); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_avx2=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_avx2" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AVX2,1, [Defined if inline assembler supports AVX2 instructions]) fi # # Check whether GCC inline assembler supports AVX512 instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AVX512 instructions], [gcry_cv_gcc_inline_asm_avx512], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_avx512="n/a" else gcry_cv_gcc_inline_asm_avx512=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { __asm__("xgetbv; vpopcntq %%zmm7, %%zmm1%{%%k1%}%{z%};\n\t":::"cc"); __asm__("vpexpandb %%zmm3, %%zmm1;\n\t":::"cc"); __asm__("vpxorq %%xmm7, %%xmm7, %%xmm7;\n\t":::"cc"); __asm__("vpxorq %%ymm7, %%ymm7, %%ymm7;\n\t":::"cc"); __asm__("vpxorq (%%eax)%{1to8%}, %%zmm7, %%zmm7;\n\t":::"cc"); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_avx512=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_avx512" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AVX512,1, [Defined if inline assembler supports AVX512 instructions]) fi # # Check whether GCC inline assembler supports VAES and VPCLMUL instructions # AC_CACHE_CHECK([whether GCC inline assembler supports VAES and VPCLMUL instructions], [gcry_cv_gcc_inline_asm_vaes_vpclmul], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_vaes_vpclmul="n/a" else gcry_cv_gcc_inline_asm_vaes_vpclmul=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { __asm__("vaesenclast %%ymm7,%%ymm7,%%ymm1\n\t":::"cc");/*256-bit*/ __asm__("vaesenclast %%zmm7,%%zmm7,%%zmm1\n\t":::"cc");/*512-bit*/ __asm__("vpclmulqdq \$0,%%ymm7,%%ymm7,%%ymm1\n\t":::"cc");/*256-bit*/ __asm__("vpclmulqdq \$0,%%zmm7,%%zmm7,%%zmm1\n\t":::"cc");/*512-bit*/ }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_vaes_vpclmul=yes]) fi]) if test 
"$gcry_cv_gcc_inline_asm_vaes_vpclmul" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_VAES_VPCLMUL,1, [Defined if inline assembler supports VAES and VPCLMUL instructions]) fi # # Check whether GCC inline assembler supports GFNI instructions # AC_CACHE_CHECK([whether GCC inline assembler supports GFNI instructions], [gcry_cv_gcc_inline_asm_gfni], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_gfni="n/a" else gcry_cv_gcc_inline_asm_gfni=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { __asm__("gf2p8affineqb \$123, %%xmm0, %%xmm0;\n\t":::"cc"); /* SSE */ __asm__("vgf2p8affineinvqb \$234, %%ymm1, %%ymm1, %%ymm1;\n\t":::"cc"); /* AVX */ __asm__("vgf2p8mulb (%%eax), %%zmm2, %%zmm2;\n\t":::"cc"); /* AVX512 */ }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_gfni=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_gfni" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_GFNI,1, [Defined if inline assembler supports GFNI instructions]) fi # # Check whether GCC inline assembler supports BMI2 instructions # AC_CACHE_CHECK([whether GCC inline assembler supports BMI2 instructions], [gcry_cv_gcc_inline_asm_bmi2], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_bmi2="n/a" else gcry_cv_gcc_inline_asm_bmi2=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[unsigned int a(unsigned int x, unsigned int y) { unsigned int tmp1, tmp2; asm ("rorxl %2, %1, %0" : "=r" (tmp1) : "rm0" (x), "J" (32 - ((23) & 31))); asm ("andnl %2, %1, %0" : "=r" (tmp2) : "r0" (x), "rm" (y)); return tmp1 + tmp2; }]], [ a(1, 2); ] )], [gcry_cv_gcc_inline_asm_bmi2=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_bmi2" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_BMI2,1, [Defined if inline assembler supports BMI2 instructions]) fi # # Check whether GCC assembler needs "-Wa,--divide" to correctly handle # constant division # if test $amd64_as_feature_detection = yes; then AC_CACHE_CHECK([whether GCC assembler handles division correctly], [gcry_cv_gcc_as_const_division_ok], [gcry_cv_gcc_as_const_division_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__(".text\n\tfn:\n\t xorl \$(123456789/12345678), %ebp;\n\t");]], [fn();])], [gcry_cv_gcc_as_const_division_ok=yes])]) if test "$gcry_cv_gcc_as_const_division_ok" = "no" ; then # # Add '-Wa,--divide' to CPPFLAGS and try check again. # _gcc_cppflags_save="$CPPFLAGS" CPPFLAGS="$CPPFLAGS -Wa,--divide" AC_CACHE_CHECK([whether GCC assembler handles division correctly with "-Wa,--divide"], [gcry_cv_gcc_as_const_division_with_wadivide_ok], [gcry_cv_gcc_as_const_division_with_wadivide_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__(".text\n\tfn:\n\t xorl \$(123456789/12345678), %ebp;\n\t");]], [fn();])], [gcry_cv_gcc_as_const_division_with_wadivide_ok=yes])]) if test "$gcry_cv_gcc_as_const_division_with_wadivide_ok" = "no" ; then # '-Wa,--divide' did not work, restore old flags. CPPFLAGS="$_gcc_cppflags_save" fi fi fi # # Check whether GCC assembler supports features needed for our amd64 # implementations # if test $amd64_as_feature_detection = yes; then AC_CACHE_CHECK([whether GCC assembler is compatible for amd64 assembly implementations], [gcry_cv_gcc_amd64_platform_as_ok], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_amd64_platform_as_ok="n/a" else gcry_cv_gcc_amd64_platform_as_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( /* Test if '.type' and '.size' are supported. */ /* These work only on ELF targets. 
*/ ".text\n\t" "asmfunc:\n\t" ".size asmfunc,.-asmfunc;\n\t" ".type asmfunc,@function;\n\t" /* Test if assembler allows use of '/' for constant division * (Solaris/x86 issue). If previous constant division check * and "-Wa,--divide" workaround failed, this causes assembly * to be disable on this machine. */ "xorl \$(123456789/12345678), %ebp;\n\t" );]], [ asmfunc(); ])], [gcry_cv_gcc_amd64_platform_as_ok=yes]) fi]) if test "$gcry_cv_gcc_amd64_platform_as_ok" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS,1, [Defined if underlying assembler is compatible with amd64 assembly implementations]) fi if test "$gcry_cv_gcc_amd64_platform_as_ok" = "no" && test "$gcry_cv_gcc_attribute_sysv_abi" = "yes" && test "$gcry_cv_gcc_default_abi_is_ms_abi" = "yes"; then AC_CACHE_CHECK([whether GCC assembler is compatible for WIN64 assembly implementations], [gcry_cv_gcc_win64_platform_as_ok], [gcry_cv_gcc_win64_platform_as_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".text\n\t" ".globl asmfunc\n\t" "asmfunc:\n\t" "xorq \$(1234), %rbp;\n\t" );]], [ asmfunc(); ])], [gcry_cv_gcc_win64_platform_as_ok=yes])]) if test "$gcry_cv_gcc_win64_platform_as_ok" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS,1, [Defined if underlying assembler is compatible with WIN64 assembly implementations]) fi fi fi # # Check whether GCC assembler supports features needed for assembly # implementations that use Intel syntax # AC_CACHE_CHECK([whether GCC assembler is compatible for Intel syntax assembly implementations], [gcry_cv_gcc_platform_as_ok_for_intel_syntax], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_platform_as_ok_for_intel_syntax="n/a" else gcry_cv_gcc_platform_as_ok_for_intel_syntax=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".intel_syntax noprefix\n\t" ".text\n\t" "actest:\n\t" "pxor xmm1, xmm7;\n\t" "vperm2i128 ymm2, ymm3, ymm0, 1;\n\t" "add eax, ebp;\n\t" "rorx eax, ebp, 1;\n\t" "sub eax, [esp + 4];\n\t" "add dword ptr [esp + eax], 0b10101;\n\t" ".att_syntax prefix\n\t" );]], [ actest(); ])], [gcry_cv_gcc_platform_as_ok_for_intel_syntax=yes]) fi]) if test "$gcry_cv_gcc_platform_as_ok_for_intel_syntax" = "yes" ; then AC_DEFINE(HAVE_INTEL_SYNTAX_PLATFORM_AS,1, [Defined if underlying assembler is compatible with Intel syntax assembly implementations]) fi # # Check whether compiler is configured for ARMv6 or newer architecture # AC_CACHE_CHECK([whether compiler is configured for ARMv6 or newer architecture], [gcry_cv_cc_arm_arch_is_v6], [if test "$mpi_cpu_arch" != "arm" || test "$try_asm_modules" != "yes" ; then gcry_cv_cc_arm_arch_is_v6="n/a" else gcry_cv_cc_arm_arch_is_v6=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[ #if defined(__arm__) && \ ((defined(__ARM_ARCH) && __ARM_ARCH >= 6) \ || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \ || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) \ || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \ || defined(__ARM_ARCH_7EM__)) /* empty */ #else /* fail compile if not ARMv6. 
*/ not_armv6 not_armv6 = (not_armv6)not_armv6; #endif ]])], [gcry_cv_cc_arm_arch_is_v6=yes]) fi]) if test "$gcry_cv_cc_arm_arch_is_v6" = "yes" ; then AC_DEFINE(HAVE_ARM_ARCH_V6,1, [Defined if ARM architecture is v6 or newer]) fi # # Check whether GCC inline assembler supports NEON instructions # AC_CACHE_CHECK([whether GCC inline assembler supports NEON instructions], [gcry_cv_gcc_inline_asm_neon], [if test "$mpi_cpu_arch" != "arm" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_neon="n/a" else gcry_cv_gcc_inline_asm_neon=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".syntax unified\n\t" ".arm\n\t" ".fpu neon\n\t" ".text\n\t" "testfn:\n\t" "vld1.64 {%q0-%q1}, [%r0]!;\n\t" "vrev64.8 %q0, %q3;\n\t" "vadd.u64 %q0, %q1;\n\t" "vadd.s64 %d3, %d2, %d3;\n\t" ); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_neon=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_neon" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_NEON,1, [Defined if inline assembler supports NEON instructions]) fi # # Check whether GCC inline assembler supports AArch32 Crypto Extension instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AArch32 Crypto Extension instructions], [gcry_cv_gcc_inline_asm_aarch32_crypto], [if test "$mpi_cpu_arch" != "arm" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_aarch32_crypto="n/a" else gcry_cv_gcc_inline_asm_aarch32_crypto=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".syntax unified\n\t" ".arch armv8-a\n\t" ".arm\n\t" ".fpu crypto-neon-fp-armv8\n\t" ".text\n\t" "testfn:\n\t" "sha1h.32 q0, q0;\n\t" "sha1c.32 q0, q0, q0;\n\t" "sha1p.32 q0, q0, q0;\n\t" "sha1su0.32 q0, q0, q0;\n\t" "sha1su1.32 q0, q0;\n\t" "sha256h.32 q0, q0, q0;\n\t" "sha256h2.32 q0, q0, q0;\n\t" "sha1p.32 q0, q0, q0;\n\t" "sha256su0.32 q0, q0;\n\t" "sha256su1.32 q0, q0, q15;\n\t" "aese.8 q0, q0;\n\t" "aesd.8 q0, q0;\n\t" "aesmc.8 q0, q0;\n\t" "aesimc.8 q0, q0;\n\t" "vmull.p64 q0, d0, d0;\n\t" ); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_aarch32_crypto=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_aarch32_crypto" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO,1, [Defined if inline assembler supports AArch32 Crypto Extension instructions]) fi # # Check whether GCC inline assembler supports AArch64 NEON instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 NEON instructions], [gcry_cv_gcc_inline_asm_aarch64_neon], [if test "$mpi_cpu_arch" != "aarch64" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_aarch64_neon="n/a" else gcry_cv_gcc_inline_asm_aarch64_neon=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".cpu generic+simd\n\t" ".text\n\t" "testfn:\n\t" "mov w0, \#42;\n\t" "dup v0.8b, w0;\n\t" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b},[x0],\#32;\n\t" ); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_aarch64_neon=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_aarch64_neon" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH64_NEON,1, [Defined if inline assembler supports AArch64 NEON instructions]) fi # # Check whether GCC inline assembler supports AArch64 Crypto Extension instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 Crypto Extension instructions], [gcry_cv_gcc_inline_asm_aarch64_crypto], [if test "$mpi_cpu_arch" != "aarch64" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_aarch64_crypto="n/a" else gcry_cv_gcc_inline_asm_aarch64_crypto=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".cpu generic+simd+crypto\n\t" ".text\n\t" "testfn:\n\t" "mov w0, \#42;\n\t" "dup v0.8b, w0;\n\t" "ld4 
{v0.8b,v1.8b,v2.8b,v3.8b},[x0],\#32;\n\t" "sha1h s0, s0;\n\t" "sha1c q0, s0, v0.4s;\n\t" "sha1p q0, s0, v0.4s;\n\t" "sha1su0 v0.4s, v0.4s, v0.4s;\n\t" "sha1su1 v0.4s, v0.4s;\n\t" "sha256h q0, q0, v0.4s;\n\t" "sha256h2 q0, q0, v0.4s;\n\t" "sha1p q0, s0, v0.4s;\n\t" "sha256su0 v0.4s, v0.4s;\n\t" "sha256su1 v0.4s, v0.4s, v31.4s;\n\t" "aese v0.16b, v0.16b;\n\t" "aesd v0.16b, v0.16b;\n\t" "aesmc v0.16b, v0.16b;\n\t" "aesimc v0.16b, v0.16b;\n\t" "pmull v0.1q, v0.1d, v31.1d;\n\t" "pmull2 v0.1q, v0.2d, v31.2d;\n\t" ); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_aarch64_crypto=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_aarch64_crypto" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO,1, [Defined if inline assembler supports AArch64 Crypto Extension instructions]) fi # # Check whether PowerPC AltiVec/VSX intrinsics # AC_CACHE_CHECK([whether compiler supports PowerPC AltiVec/VSX/crypto intrinsics], [gcry_cv_cc_ppc_altivec], [if test "$mpi_cpu_arch" != "ppc" || test "$try_asm_modules" != "yes" ; then gcry_cv_cc_ppc_altivec="n/a" else gcry_cv_cc_ppc_altivec=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[#include typedef vector unsigned char block; typedef vector unsigned int vecu32; static inline __attribute__((always_inline)) vecu32 vec_sld_u32(vecu32 a, vecu32 b, unsigned int idx) { return vec_sld (a, b, (4 * idx) & 15); } block fn(block in) { block t = vec_perm (in, in, vec_vsx_ld (0, (unsigned char*)0)); vecu32 y = vec_vsx_ld (0, (unsigned int*)0); y = vec_sld_u32 (y, y, 3); return vec_cipher_be (t, in) ^ (block)y; } ]])], [gcry_cv_cc_ppc_altivec=yes]) fi]) if test "$gcry_cv_cc_ppc_altivec" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_CC_PPC_ALTIVEC,1, [Defined if underlying compiler supports PowerPC AltiVec/VSX/crypto intrinsics]) fi _gcc_cflags_save=$CFLAGS CFLAGS="$CFLAGS -O2 -maltivec -mvsx -mcrypto" if test "$gcry_cv_cc_ppc_altivec" = "no" && test "$mpi_cpu_arch" = "ppc" && test "$try_asm_modules" == "yes" ; then AC_CACHE_CHECK([whether compiler supports PowerPC AltiVec/VSX/crypto intrinsics with extra GCC flags], [gcry_cv_cc_ppc_altivec_cflags], [gcry_cv_cc_ppc_altivec_cflags=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[#include typedef vector unsigned char block; typedef vector unsigned int vecu32; static inline __attribute__((always_inline)) vecu32 vec_sld_u32(vecu32 a, vecu32 b, unsigned int idx) { return vec_sld (a, b, (4 * idx) & 15); } block fn(block in) { block t = vec_perm (in, in, vec_vsx_ld (0, (unsigned char*)0)); vecu32 y = vec_vsx_ld (0, (unsigned int*)0); y = vec_sld_u32 (y, y, 3); return vec_cipher_be (t, in) ^ (block)y; }]])], [gcry_cv_cc_ppc_altivec_cflags=yes])]) if test "$gcry_cv_cc_ppc_altivec_cflags" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_CC_PPC_ALTIVEC,1, [Defined if underlying compiler supports PowerPC AltiVec/VSX/crypto intrinsics]) AC_DEFINE(HAVE_COMPATIBLE_CC_PPC_ALTIVEC_WITH_CFLAGS,1, [Defined if underlying compiler supports PowerPC AltiVec/VSX/crypto intrinsics with extra GCC flags]) fi fi AM_CONDITIONAL(ENABLE_PPC_VCRYPTO_EXTRA_CFLAGS, test "$gcry_cv_cc_ppc_altivec_cflags" = "yes") # Restore flags. 
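The HAVE_COMPATIBLE_CC_PPC_ALTIVEC result above is typically consumed from config.h as a compile-time gate, while the *_WITH_CFLAGS variant and the ENABLE_PPC_VCRYPTO_EXTRA_CFLAGS conditional only route the extra "-O2 -maltivec -mvsx -mcrypto" flags through the build system (CFLAGS itself is restored just below). A minimal C sketch of that gate, assuming a configured tree; the helper name is hypothetical:

  #include <config.h>

  #ifdef HAVE_COMPATIBLE_CC_PPC_ALTIVEC
  # include <altivec.h>   /* AltiVec/VSX intrinsics */

  /* Hypothetical helper: reverse the bytes of a 16-byte block with
     vec_perm, one of the intrinsics exercised by the test program. */
  static vector unsigned char
  example_bswap128 (vector unsigned char in)
  {
    static const vector unsigned char rev =
      { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
    return vec_perm (in, in, rev);
  }
  #endif /* HAVE_COMPATIBLE_CC_PPC_ALTIVEC */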
CFLAGS=$_gcc_cflags_save; # # Check whether GCC inline assembler supports PowerPC AltiVec/VSX/crypto instructions # AC_CACHE_CHECK([whether GCC inline assembler supports PowerPC AltiVec/VSX/crypto instructions], [gcry_cv_gcc_inline_asm_ppc_altivec], [if test "$mpi_cpu_arch" != "ppc" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_ppc_altivec="n/a" else gcry_cv_gcc_inline_asm_ppc_altivec=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__(".globl testfn;\n" ".text\n\t" "testfn:\n" "stvx %v31,%r12,%r0;\n" "lvx %v20,%r12,%r0;\n" "vcipher %v0, %v1, %v22;\n" "lxvw4x %vs32, %r0, %r1;\n" "vadduwm %v0, %v1, %v22;\n" "vshasigmaw %v0, %v1, 0, 15;\n" "vshasigmad %v0, %v1, 0, 15;\n" "vpmsumd %v11, %v11, %v11;\n" ); ]], [ testfn(); ] )], [gcry_cv_gcc_inline_asm_ppc_altivec=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_ppc_altivec" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC,1, [Defined if inline assembler supports PowerPC AltiVec/VSX/crypto instructions]) fi # # Check whether GCC inline assembler supports PowerISA 3.00 instructions # AC_CACHE_CHECK([whether GCC inline assembler supports PowerISA 3.00 instructions], [gcry_cv_gcc_inline_asm_ppc_arch_3_00], [if test "$mpi_cpu_arch" != "ppc" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_ppc_arch_3_00="n/a" else gcry_cv_gcc_inline_asm_ppc_arch_3_00=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__(".text\n\t" ".globl testfn;\n" "testfn:\n" "stxvb16x %r1,%v12,%v30;\n" ); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_ppc_arch_3_00=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_ppc_arch_3_00" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_PPC_ARCH_3_00,1, [Defined if inline assembler supports PowerISA 3.00 instructions]) fi # # Check whether GCC inline assembler supports zSeries instructions # AC_CACHE_CHECK([whether GCC inline assembler supports zSeries instructions], [gcry_cv_gcc_inline_asm_s390x], [if test "$mpi_cpu_arch" != "s390x" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_s390x="n/a" else gcry_cv_gcc_inline_asm_s390x=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[typedef unsigned int u128_t __attribute__ ((mode (TI))); unsigned int testfunc(unsigned int x, void *y, unsigned int z) { unsigned long fac[8]; register unsigned long reg0 asm("0") = 0; register unsigned long reg1 asm("1") = x; u128_t r1 = ((u128_t)(unsigned long)y << 64) | (unsigned long)z; u128_t r2 = 0; u128_t r3 = 0; asm volatile (".insn rre,0xb92e << 16, %[r1], %[r2]\n\t" : [r1] "+a" (r1), [r2] "+a" (r2) : "r" (reg0), "r" (reg1) : "cc", "memory"); asm volatile (".insn rrf,0xb929 << 16, %[r1], %[r2], %[r3], 0\n\t" : [r1] "+a" (r1), [r2] "+a" (r2), [r3] "+a" (r3) : "r" (reg0), "r" (reg1) : "cc", "memory"); reg0 = 8 - 1; asm ("stfle %1\n\t" : "+d" (reg0), "=Q" (fac[0]) : : "cc", "memory"); asm volatile ("mvc 0(16, %0), 0(%1)\n\t" : : "a" (y), "a" (fac) : "memory"); asm volatile ("xc 0(16, %0), 0(%0)\n\t" : : "a" (fac) : "memory"); asm volatile ("risbgn %%r11, %%r11, 0, 129, 0\n\t" : : : "memory", "r11"); asm volatile ("algrk %%r14, %%r14, %%r14\n\t" : : : "memory", "r14"); return (unsigned int)r1 ^ reg0; } ]] , [ testfunc(0, 0, 0); ])], [gcry_cv_gcc_inline_asm_s390x=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_s390x" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_S390X,1, [Defined if inline assembler supports zSeries instructions]) fi # # Check whether GCC inline assembler supports zSeries vector instructions # AC_CACHE_CHECK([whether GCC inline assembler supports zSeries vector instructions], [gcry_cv_gcc_inline_asm_s390x_vx], [if test 
"$mpi_cpu_arch" != "s390x" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_s390x_vx="n/a" else gcry_cv_gcc_inline_asm_s390x_vx=no if test "$gcry_cv_gcc_inline_asm_s390x" = "yes" ; then AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void testfunc(void) { asm volatile (".machine \"z13+vx\"\n\t" "vx %%v0, %%v1, %%v31\n\t" "verllf %%v11, %%v11, (16)(0)\n\t" : : : "memory"); } ]], [ testfunc(); ])], [gcry_cv_gcc_inline_asm_s390x_vx=yes]) fi fi]) if test "$gcry_cv_gcc_inline_asm_s390x_vx" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_S390X_VX,1, [Defined if inline assembler supports zSeries vector instructions]) fi ####################################### #### Checks for library functions. #### ####################################### AC_FUNC_VPRINTF # We have replacements for these in src/missing-string.c AC_CHECK_FUNCS(stpcpy strcasecmp) # We have replacements for these in src/g10lib.h AC_CHECK_FUNCS(strtoul memmove stricmp atexit raise) # Other checks AC_CHECK_FUNCS(strerror rand mmap getpagesize sysconf waitpid wait4) AC_CHECK_FUNCS(gettimeofday getrusage gethrtime clock_gettime syslog) AC_CHECK_FUNCS(syscall fcntl ftruncate flockfile getauxval elf_aux_info) AC_CHECK_FUNCS(explicit_bzero explicit_memset getentropy) GNUPG_CHECK_MLOCK # # Replacement functions. # AC_REPLACE_FUNCS([getpid clock]) # # Check whether it is necessary to link against libdl. # DL_LIBS="" if test "$use_hmac_binary_check" != no ; then _gcry_save_libs="$LIBS" LIBS="" AC_SEARCH_LIBS(dlopen, c dl,,,) DL_LIBS=$LIBS LIBS="$_gcry_save_libs" fi AC_SUBST(DL_LIBS) # # Check whether we can use Linux capabilities as requested. # if test "$use_capabilities" = "yes" ; then use_capabilities=no AC_CHECK_HEADERS(sys/capability.h) if test "$ac_cv_header_sys_capability_h" = "yes" ; then AC_CHECK_LIB(cap, cap_init, ac_need_libcap=1) if test "$ac_cv_lib_cap_cap_init" = "yes"; then AC_DEFINE(USE_CAPABILITIES,1, [define if capabilities should be used]) LIBS="$LIBS -lcap" use_capabilities=yes fi fi if test "$use_capabilities" = "no" ; then AC_MSG_WARN([[ *** *** The use of capabilities on this system is not possible. *** You need a recent Linux kernel and some patches: *** fcaps-2.2.9-990610.patch (kernel patch for 2.2.9) *** fcap-module-990613.tar.gz (kernel module) *** libcap-1.92.tar.gz (user mode library and utilities) *** And you have to configure the kernel with CONFIG_VFS_CAP_PLUGIN *** set (filesystems menu). Be warned: This code is *really* ALPHA. ***]]) fi fi # Check whether a random device is available. if test "$try_dev_random" = yes ; then AC_CACHE_CHECK(for random device, ac_cv_have_dev_random, [if test -r "$NAME_OF_DEV_RANDOM" && test -r "$NAME_OF_DEV_URANDOM" ; then ac_cv_have_dev_random=yes; else ac_cv_have_dev_random=no; fi]) if test "$ac_cv_have_dev_random" = yes; then AC_DEFINE(HAVE_DEV_RANDOM,1, [defined if the system supports a random device] ) fi else AC_MSG_CHECKING(for random device) ac_cv_have_dev_random=no AC_MSG_RESULT(has been disabled) fi # Figure out the random modules for this configuration. if test "$random" = "default"; then # Select default value. if test "$ac_cv_func_getentropy" = yes; then random_modules="getentropy" elif test "$ac_cv_have_dev_random" = yes; then # Try Linuxish random device. random_modules="linux" else case "${host}" in *-*-mingw32ce*) # WindowsCE random device. random_modules="w32ce" ;; *-*-mingw32*|*-*-cygwin*) # Windows random device. random_modules="w32" ;; *) # Build everything, allow to select at runtime. 
random_modules="$auto_random_modules" ;; esac fi else if test "$random" = "auto"; then # Build everything, allow to select at runtime. random_modules="$auto_random_modules" else random_modules="$random" fi fi # # Other defines # if test mym4_isgit = "yes"; then AC_DEFINE(IS_DEVELOPMENT_VERSION,1, [Defined if this is not a regular release]) fi AM_CONDITIONAL(CROSS_COMPILING, test x$cross_compiling = xyes) # This is handy for debugging so the compiler doesn't rearrange # things and eliminate variables. AC_ARG_ENABLE(optimization, AS_HELP_STRING([--disable-optimization], [disable compiler optimization]), [if test $enableval = no ; then CFLAGS=`echo $CFLAGS | sed 's/-O[[0-9]]//'` fi]) AC_MSG_NOTICE([checking for cc features]) # CFLAGS mangling when using gcc. if test "$GCC" = yes; then AC_MSG_CHECKING([if gcc supports -fno-delete-null-pointer-checks]) _gcc_cflags_save=$CFLAGS CFLAGS="-fno-delete-null-pointer-checks" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])],_gcc_wopt=yes,_gcc_wopt=no) AC_MSG_RESULT($_gcc_wopt) CFLAGS=$_gcc_cflags_save; if test x"$_gcc_wopt" = xyes ; then CFLAGS="$CFLAGS -fno-delete-null-pointer-checks" fi CFLAGS="$CFLAGS -Wall" if test "$USE_MAINTAINER_MODE" = "yes"; then CFLAGS="$CFLAGS -Wcast-align -Wshadow -Wstrict-prototypes" CFLAGS="$CFLAGS -Wformat -Wno-format-y2k -Wformat-security" # If -Wno-missing-field-initializers is supported we can enable a # a bunch of really useful warnings. AC_MSG_CHECKING([if gcc supports -Wno-missing-field-initializers]) _gcc_cflags_save=$CFLAGS CFLAGS="-Wno-missing-field-initializers" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])],_gcc_wopt=yes,_gcc_wopt=no) AC_MSG_RESULT($_gcc_wopt) CFLAGS=$_gcc_cflags_save; if test x"$_gcc_wopt" = xyes ; then CFLAGS="$CFLAGS -W -Wextra -Wbad-function-cast" CFLAGS="$CFLAGS -Wwrite-strings" CFLAGS="$CFLAGS -Wdeclaration-after-statement" CFLAGS="$CFLAGS -Wno-missing-field-initializers" CFLAGS="$CFLAGS -Wno-sign-compare" fi AC_MSG_CHECKING([if gcc supports -Wpointer-arith]) _gcc_cflags_save=$CFLAGS CFLAGS="-Wpointer-arith" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])],_gcc_wopt=yes,_gcc_wopt=no) AC_MSG_RESULT($_gcc_wopt) CFLAGS=$_gcc_cflags_save; if test x"$_gcc_wopt" = xyes ; then CFLAGS="$CFLAGS -Wpointer-arith" fi fi fi # Check whether as(1) supports a noeexecstack feature. This test # includes an override option. CL_AS_NOEXECSTACK AC_SUBST(LIBGCRYPT_CONFIG_API_VERSION) AC_SUBST(LIBGCRYPT_CONFIG_LIBS) AC_SUBST(LIBGCRYPT_CONFIG_CFLAGS) AC_SUBST(LIBGCRYPT_CONFIG_HOST) AC_SUBST(LIBGCRYPT_THREAD_MODULES) AC_CONFIG_COMMANDS([gcrypt-conf],[[ chmod +x src/libgcrypt-config ]],[[ prefix=$prefix exec_prefix=$exec_prefix libdir=$libdir datadir=$datadir DATADIRNAME=$DATADIRNAME ]]) ##################### #### Conclusion. #### ##################### # Check that requested feature can actually be used and define # ENABLE_foo_SUPPORT macros. 
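The ENABLE_*_SUPPORT macros defined below only record that the compiler and assembler are able to build the accelerated code; whether the CPU the library eventually runs on actually has the feature remains a separate run-time decision. A minimal C sketch of that split, with hypothetical names for the feature bit and the detection variable:

  #include <config.h>

  #define EXAMPLE_HWF_AESNI (1u << 0)       /* hypothetical feature bit */

  static unsigned int example_hw_features;  /* filled in by CPU detection */

  static int
  example_may_use_aesni (void)
  {
  #ifdef ENABLE_AESNI_SUPPORT
    return (example_hw_features & EXAMPLE_HWF_AESNI) != 0;
  #else
    return 0;   /* the compiler could not build the AES-NI code at all */
  #endif
  }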
if test x"$aesnisupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_ssse3" != "yes" ; then aesnisupport="no (unsupported by compiler)" fi fi if test x"$shaextsupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_shaext" != "yes" ; then shaextsupport="no (unsupported by compiler)" fi fi if test x"$pclmulsupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_pclmul" != "yes" ; then pclmulsupport="no (unsupported by compiler)" fi fi if test x"$sse41support" = xyes ; then if test "$gcry_cv_gcc_inline_asm_sse41" != "yes" ; then sse41support="no (unsupported by compiler)" fi fi if test x"$avxsupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_avx" != "yes" ; then avxsupport="no (unsupported by compiler)" fi fi if test x"$avx2support" = xyes ; then if test "$gcry_cv_gcc_inline_asm_avx2" != "yes" ; then avx2support="no (unsupported by compiler)" fi fi if test x"$avx512support" = xyes ; then if test "$gcry_cv_gcc_inline_asm_avx512" != "yes" ; then avx512support="no (unsupported by compiler)" fi fi if test x"$gfnisupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_gfni" != "yes" ; then gfnisupport="no (unsupported by compiler)" fi fi if test x"$neonsupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_neon" != "yes" ; then if test "$gcry_cv_gcc_inline_asm_aarch64_neon" != "yes" ; then neonsupport="no (unsupported by compiler)" fi fi fi if test x"$armcryptosupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_aarch32_crypto" != "yes" ; then if test "$gcry_cv_gcc_inline_asm_aarch64_crypto" != "yes" ; then armcryptosupport="no (unsupported by compiler)" fi fi fi if test x"$aesnisupport" = xyes ; then AC_DEFINE(ENABLE_AESNI_SUPPORT, 1, [Enable support for Intel AES-NI instructions.]) fi if test x"$shaextsupport" = xyes ; then AC_DEFINE(ENABLE_SHAEXT_SUPPORT, 1, [Enable support for Intel SHAEXT instructions.]) fi if test x"$pclmulsupport" = xyes ; then AC_DEFINE(ENABLE_PCLMUL_SUPPORT, 1, [Enable support for Intel PCLMUL instructions.]) fi if test x"$sse41support" = xyes ; then AC_DEFINE(ENABLE_SSE41_SUPPORT, 1, [Enable support for Intel SSE4.1 instructions.]) fi if test x"$avxsupport" = xyes ; then AC_DEFINE(ENABLE_AVX_SUPPORT,1, [Enable support for Intel AVX instructions.]) fi if test x"$avx2support" = xyes ; then AC_DEFINE(ENABLE_AVX2_SUPPORT,1, [Enable support for Intel AVX2 instructions.]) fi if test x"$avx512support" = xyes ; then AC_DEFINE(ENABLE_AVX512_SUPPORT,1, [Enable support for Intel AVX512 instructions.]) fi if test x"$gfnisupport" = xyes ; then AC_DEFINE(ENABLE_GFNI_SUPPORT,1, [Enable support for Intel GFNI instructions.]) fi if test x"$neonsupport" = xyes ; then AC_DEFINE(ENABLE_NEON_SUPPORT,1, [Enable support for ARM NEON instructions.]) fi if test x"$armcryptosupport" = xyes ; then AC_DEFINE(ENABLE_ARM_CRYPTO_SUPPORT,1, [Enable support for ARMv8 Crypto Extension instructions.]) fi if test x"$ppccryptosupport" = xyes ; then AC_DEFINE(ENABLE_PPC_CRYPTO_SUPPORT,1, [Enable support for POWER 8 (PowerISA 2.07) crypto extension.]) fi if test x"$jentsupport" = xyes ; then AC_DEFINE(ENABLE_JENT_SUPPORT, 1, [Enable support for the jitter entropy collector.]) fi if test x"$padlocksupport" = xyes ; then AC_DEFINE(ENABLE_PADLOCK_SUPPORT, 1, [Enable support for the PadLock engine.]) fi if test x"$drngsupport" = xyes ; then AC_DEFINE(ENABLE_DRNG_SUPPORT, 1, [Enable support for Intel DRNG (RDRAND instruction).]) fi if test x"$force_soft_hwfeatures" = xyes ; then AC_DEFINE(ENABLE_FORCE_SOFT_HWFEATURES, 1, [Enable forcing 'soft' HW feature bits on (for testing).]) fi # Define 
conditional sources and config.h symbols depending on the # selected ciphers, pubkey-ciphers, digests, kdfs, and random modules. LIST_MEMBER(arcfour, $enabled_ciphers) if test "$found" = "1"; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS arcfour.lo" AC_DEFINE(USE_ARCFOUR, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS arcfour-amd64.lo" ;; esac fi LIST_MEMBER(blowfish, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS blowfish.lo" AC_DEFINE(USE_BLOWFISH, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS blowfish-amd64.lo" ;; arm*-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS blowfish-arm.lo" ;; esac fi LIST_MEMBER(cast5, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS cast5.lo" AC_DEFINE(USE_CAST5, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS cast5-amd64.lo" ;; arm*-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS cast5-arm.lo" ;; esac fi LIST_MEMBER(des, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS des.lo" AC_DEFINE(USE_DES, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS des-amd64.lo" ;; esac fi LIST_MEMBER(aes, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael.lo" AC_DEFINE(USE_AES, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-amd64.lo" # Build with the SSSE3 implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-ssse3-amd64.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-ssse3-amd64-asm.lo" # Build with the VAES/AVX2 implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-vaes.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-vaes-avx2-amd64.lo" ;; arm*-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-arm.lo" # Build with the ARMv8/AArch32 CE implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-armv8-ce.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-armv8-aarch32-ce.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-aarch64.lo" # Build with the ARMv8/AArch64 CE implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-armv8-ce.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-armv8-aarch64-ce.lo" ;; powerpc64le-*-*) # Build with the crypto extension implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-ppc.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-ppc9le.lo" if test "$gcry_cv_gcc_inline_asm_ppc_altivec" = "yes" && test "$gcry_cv_gcc_inline_asm_ppc_arch_3_00" = "yes" ; then # Build with AES-GCM bulk implementation for P10 GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-gcm-p10le.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-p10le.lo" fi ;; powerpc64-*-*) # Big-Endian. # Build with the crypto extension implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-ppc.lo" ;; powerpc-*-*) # Big-Endian. 
# Build with the crypto extension implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-ppc.lo" ;; s390x-*-*) # Big-Endian. # Build with the crypto extension implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-s390x.lo" ;; esac case "$mpi_cpu_arch" in x86) # Build with the AES-NI implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-aesni.lo" # Build with the Padlock implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-padlock.lo" ;; esac fi LIST_MEMBER(twofish, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish.lo" AC_DEFINE(USE_TWOFISH, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS twofish-amd64.lo" if test x"$avx2support" = xyes ; then # Build with the AVX2 implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS twofish-avx2-amd64.lo" fi ;; arm*-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS twofish-arm.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS twofish-aarch64.lo" ;; esac fi LIST_MEMBER(serpent, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS serpent.lo" AC_DEFINE(USE_SERPENT, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the SSE2 implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS serpent-sse2-amd64.lo" ;; esac if test x"$avx2support" = xyes ; then # Build with the AVX2 implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS serpent-avx2-amd64.lo" fi if test x"$neonsupport" = xyes ; then # Build with the NEON implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS serpent-armv7-neon.lo" fi fi LIST_MEMBER(rfc2268, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS rfc2268.lo" AC_DEFINE(USE_RFC2268, 1, [Defined if this module should be included]) fi LIST_MEMBER(seed, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS seed.lo" AC_DEFINE(USE_SEED, 1, [Defined if this module should be included]) fi LIST_MEMBER(camellia, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia.lo camellia-glue.lo" AC_DEFINE(USE_CAMELLIA, 1, [Defined if this module should be included]) case "${host}" in arm*-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-arm.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-aarch64.lo" ;; esac if test x"$avxsupport" = xyes ; then if test x"$aesnisupport" = xyes ; then # Build with the AES-NI/AVX implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-aesni-avx-amd64.lo" fi fi if test x"$avx2support" = xyes ; then if test x"$aesnisupport" = xyes ; then # Build with the AES-NI/AVX2 implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-aesni-avx2-amd64.lo" # Build with the VAES/AVX2 implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-vaes-avx2-amd64.lo" # Build with the GFNI/AVX2 implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-gfni-avx2-amd64.lo" + + # Build with the GFNI/AVX512 implementation + GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-gfni-avx512-amd64.lo" fi fi fi LIST_MEMBER(idea, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS idea.lo" AC_DEFINE(USE_IDEA, 1, [Defined if this module should be included]) fi LIST_MEMBER(salsa20, 
$enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS salsa20.lo" AC_DEFINE(USE_SALSA20, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS salsa20-amd64.lo" ;; esac if test x"$neonsupport" = xyes ; then # Build with the NEON implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS salsa20-armv7-neon.lo" fi fi LIST_MEMBER(gost28147, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS gost28147.lo" AC_DEFINE(USE_GOST28147, 1, [Defined if this module should be included]) fi LIST_MEMBER(chacha20, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20.lo" AC_DEFINE(USE_CHACHA20, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-amd64-ssse3.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-amd64-avx2.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-amd64-avx512.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-aarch64.lo" ;; powerpc64le-*-*) # Build with the ppc8 vector implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-ppc.lo" ;; powerpc64-*-*) # Build with the ppc8 vector implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-ppc.lo" ;; powerpc-*-*) # Build with the ppc8 vector implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-ppc.lo" ;; s390x-*-*) # Build with the s390x/zSeries vector implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-s390x.lo" ;; esac if test x"$neonsupport" = xyes ; then # Build with the NEON implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-armv7-neon.lo" fi fi LIST_MEMBER(sm4, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS sm4.lo" AC_DEFINE(USE_SM4, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-aesni-avx-amd64.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-aesni-avx2-amd64.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-gfni-avx2-amd64.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-aarch64.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-armv8-aarch64-ce.lo" esac fi LIST_MEMBER(dsa, $enabled_pubkey_ciphers) if test "$found" = "1" ; then GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS dsa.lo" AC_DEFINE(USE_DSA, 1, [Defined if this module should be included]) fi LIST_MEMBER(rsa, $enabled_pubkey_ciphers) if test "$found" = "1" ; then GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS rsa.lo" AC_DEFINE(USE_RSA, 1, [Defined if this module should be included]) fi LIST_MEMBER(elgamal, $enabled_pubkey_ciphers) if test "$found" = "1" ; then GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS elgamal.lo" AC_DEFINE(USE_ELGAMAL, 1, [Defined if this module should be included]) fi LIST_MEMBER(ecc, $enabled_pubkey_ciphers) if test "$found" = "1" ; then GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS \ ecc.lo ecc-curves.lo ecc-misc.lo \ ecc-ecdh.lo ecc-ecdsa.lo ecc-eddsa.lo ecc-gost.lo \ ecc-sm2.lo" AC_DEFINE(USE_ECC, 1, [Defined if this module should be included]) fi LIST_MEMBER(crc, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc.lo" AC_DEFINE(USE_CRC, 1, [Defined if this module should be included]) case "${host}" in i?86-*-* 
| x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS crc-intel-pclmul.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS crc-armv8-ce.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS crc-armv8-aarch64-ce.lo" ;; powerpc64le-*-*) GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS crc-ppc.lo" ;; powerpc64-*-*) GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS crc-ppc.lo" ;; powerpc-*-*) GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS crc-ppc.lo" ;; esac fi LIST_MEMBER(gostr3411-94, $enabled_digests) if test "$found" = "1" ; then # GOST R 34.11-94 internally uses GOST 28147-89 LIST_MEMBER(gost28147, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS gostr3411-94.lo" AC_DEFINE(USE_GOST_R_3411_94, 1, [Defined if this module should be included]) fi fi LIST_MEMBER(stribog, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS stribog.lo" AC_DEFINE(USE_GOST_R_3411_12, 1, [Defined if this module should be included]) fi LIST_MEMBER(md2, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS md2.lo" AC_DEFINE(USE_MD2, 1, [Defined if this module should be included]) fi LIST_MEMBER(md4, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS md4.lo" AC_DEFINE(USE_MD4, 1, [Defined if this module should be included]) fi LIST_MEMBER(md5, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS md5.lo" AC_DEFINE(USE_MD5, 1, [Defined if this module should be included]) fi LIST_MEMBER(rmd160, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS rmd160.lo" AC_DEFINE(USE_RMD160, 1, [Defined if this module should be included]) fi LIST_MEMBER(sha256, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256.lo" AC_DEFINE(USE_SHA256, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-ssse3-amd64.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-avx-amd64.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-avx2-bmi2-amd64.lo" ;; arm*-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-armv8-aarch32-ce.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-armv8-aarch64-ce.lo" ;; powerpc64le-*-*) # Build with the crypto extension implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-ppc.lo" ;; powerpc64-*-*) # Big-Endian. # Build with the crypto extension implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-ppc.lo" ;; powerpc-*-*) # Big-Endian. 
# Build with the crypto extension implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-ppc.lo" esac case "$mpi_cpu_arch" in x86) # Build with the SHAEXT implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-intel-shaext.lo" ;; esac fi LIST_MEMBER(sha512, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512.lo" AC_DEFINE(USE_SHA512, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-ssse3-amd64.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-avx-amd64.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-avx2-bmi2-amd64.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-avx512-amd64.lo" ;; i?86-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-ssse3-i386.lo" ;; arm*-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-arm.lo" ;; powerpc64le-*-*) # Build with the crypto extension implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-ppc.lo" ;; powerpc64-*-*) # Big-Endian. # Build with the crypto extension implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-ppc.lo" ;; powerpc-*-*) # Big-Endian. # Build with the crypto extension implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-ppc.lo" esac if test x"$neonsupport" = xyes ; then # Build with the NEON implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-armv7-neon.lo" fi fi LIST_MEMBER(sha3, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS keccak.lo" AC_DEFINE(USE_SHA3, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation : ;; esac if test x"$neonsupport" = xyes ; then # Build with the NEON implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS keccak-armv7-neon.lo" fi fi LIST_MEMBER(tiger, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS tiger.lo" AC_DEFINE(USE_TIGER, 1, [Defined if this module should be included]) fi LIST_MEMBER(whirlpool, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS whirlpool.lo" AC_DEFINE(USE_WHIRLPOOL, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS whirlpool-sse2-amd64.lo" ;; esac fi LIST_MEMBER(blake2, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS blake2.lo" AC_DEFINE(USE_BLAKE2, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS blake2b-amd64-avx2.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS blake2s-amd64-avx.lo" ;; esac fi LIST_MEMBER(sm3, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS sm3.lo" AC_DEFINE(USE_SM3, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sm3-avx-bmi2-amd64.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sm3-aarch64.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sm3-armv8-aarch64-ce.lo" ;; esac fi # SHA-1 needs to be included always for example because it is used by # random-csprng.c. 
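The USE_* symbols defined for the digests above are normally consumed as compile-time guards, so an algorithm that was not enabled at configure time costs nothing in the built library. A minimal C sketch with a hypothetical name table:

  #include <config.h>
  #include <stddef.h>

  static const char * const example_enabled_digests[] =
    {
  #ifdef USE_SHA256
      "sha256",
  #endif
  #ifdef USE_SHA512
      "sha512",
  #endif
  #ifdef USE_BLAKE2
      "blake2",
  #endif
      "sha1",   /* always built, as the comment above explains */
      NULL
    };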
GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1.lo" AC_DEFINE(USE_SHA1, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-ssse3-amd64.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-avx-amd64.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-avx-bmi2-amd64.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-avx2-bmi2-amd64.lo" ;; arm*-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-armv7-neon.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-armv8-aarch32-ce.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-armv8-aarch64-ce.lo" ;; esac case "$mpi_cpu_arch" in x86) # Build with the SHAEXT implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-intel-shaext.lo" ;; esac # Arch specific GCM implementations case "${host}" in i?86-*-* | x86_64-*-*) GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS cipher-gcm-intel-pclmul.lo" ;; arm*-*-*) GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS cipher-gcm-armv7-neon.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS cipher-gcm-armv8-aarch32-ce.lo" ;; aarch64-*-*) GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS cipher-gcm-armv8-aarch64-ce.lo" ;; powerpc64le-*-* | powerpc64-*-* | powerpc-*-*) GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS cipher-gcm-ppc.lo" ;; esac # Arch specific MAC implementations case "${host}" in s390x-*-*) GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS poly1305-s390x.lo" ;; x86_64-*-*) GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS poly1305-amd64-avx512.lo" ;; esac LIST_MEMBER(scrypt, $enabled_kdfs) if test "$found" = "1" ; then GCRYPT_KDFS="$GCRYPT_KDFS scrypt.lo" AC_DEFINE(USE_SCRYPT, 1, [Defined if this module should be included]) fi LIST_MEMBER(getentropy, $random_modules) if test "$found" = "1" ; then GCRYPT_RANDOM="$GCRYPT_RANDOM rndgetentropy.lo" AC_DEFINE(USE_RNDGETENTROPY, 1, [Defined if the getentropy RNG should be used.]) fi LIST_MEMBER(linux, $random_modules) if test "$found" = "1" ; then GCRYPT_RANDOM="$GCRYPT_RANDOM rndoldlinux.lo" AC_DEFINE(USE_RNDOLDLINUX, 1, [Defined if the /dev/random RNG should be used.]) fi LIST_MEMBER(unix, $random_modules) if test "$found" = "1" ; then GCRYPT_RANDOM="$GCRYPT_RANDOM rndunix.lo" AC_DEFINE(USE_RNDUNIX, 1, [Defined if the default Unix RNG should be used.]) fi LIST_MEMBER(egd, $random_modules) if test "$found" = "1" ; then GCRYPT_RANDOM="$GCRYPT_RANDOM rndegd.lo" AC_DEFINE(USE_RNDEGD, 1, [Defined if the EGD based RNG should be used.]) fi LIST_MEMBER(w32, $random_modules) if test "$found" = "1" ; then GCRYPT_RANDOM="$GCRYPT_RANDOM rndw32.lo" AC_DEFINE(USE_RNDW32, 1, [Defined if the Windows specific RNG should be used.]) fi LIST_MEMBER(w32ce, $random_modules) if test "$found" = "1" ; then GCRYPT_RANDOM="$GCRYPT_RANDOM rndw32ce.lo" AC_DEFINE(USE_RNDW32CE, 1, [Defined if the WindowsCE specific RNG should be used.]) fi if test "$try_asm_modules" = yes ; then # Build with assembly implementations GCRYPT_CIPHERS="$GCRYPT_CIPHERS $GCRYPT_ASM_CIPHERS" GCRYPT_DIGESTS="$GCRYPT_DIGESTS $GCRYPT_ASM_DIGESTS" fi AC_SUBST([GCRYPT_CIPHERS]) AC_SUBST([GCRYPT_PUBKEY_CIPHERS]) AC_SUBST([GCRYPT_DIGESTS]) AC_SUBST([GCRYPT_KDFS]) AC_SUBST([GCRYPT_RANDOM]) AC_SUBST(LIBGCRYPT_CIPHERS, $enabled_ciphers) AC_SUBST(LIBGCRYPT_PUBKEY_CIPHERS, $enabled_pubkey_ciphers) AC_SUBST(LIBGCRYPT_DIGESTS, $enabled_digests) # For printing the configuration we need a colon separated list of # algorithm names. 
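The colon-separated lists created just below become plain string macros in config.h, so printing the configured algorithm set is simply a matter of emitting them. A minimal C sketch (hypothetical function name):

  #include <config.h>
  #include <stdio.h>

  static void
  example_print_configuration (void)
  {
    printf ("ciphers: %s\n", LIBGCRYPT_CIPHERS);
    printf ("pubkeys: %s\n", LIBGCRYPT_PUBKEY_CIPHERS);
    printf ("digests: %s\n", LIBGCRYPT_DIGESTS);
    printf ("kdfs:    %s\n", LIBGCRYPT_KDFS);
  }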
tmp=`echo "$enabled_ciphers" | tr ' ' : ` AC_DEFINE_UNQUOTED(LIBGCRYPT_CIPHERS, "$tmp", [List of available cipher algorithms]) tmp=`echo "$enabled_pubkey_ciphers" | tr ' ' : ` AC_DEFINE_UNQUOTED(LIBGCRYPT_PUBKEY_CIPHERS, "$tmp", [List of available public key cipher algorithms]) tmp=`echo "$enabled_digests" | tr ' ' : ` AC_DEFINE_UNQUOTED(LIBGCRYPT_DIGESTS, "$tmp", [List of available digest algorithms]) tmp=`echo "$enabled_kdfs" | tr ' ' : ` AC_DEFINE_UNQUOTED(LIBGCRYPT_KDFS, "$tmp", [List of available KDF algorithms]) # # Define conditional sources depending on the used hardware platform. # Note that all possible modules must also be listed in # src/Makefile.am (EXTRA_libgcrypt_la_SOURCES). # GCRYPT_HWF_MODULES= case "$mpi_cpu_arch" in x86) AC_DEFINE(HAVE_CPU_ARCH_X86, 1, [Defined for the x86 platforms]) GCRYPT_HWF_MODULES="libgcrypt_la-hwf-x86.lo" ;; alpha) AC_DEFINE(HAVE_CPU_ARCH_ALPHA, 1, [Defined for Alpha platforms]) ;; sparc) AC_DEFINE(HAVE_CPU_ARCH_SPARC, 1, [Defined for SPARC platforms]) ;; mips) AC_DEFINE(HAVE_CPU_ARCH_MIPS, 1, [Defined for MIPS platforms]) ;; m68k) AC_DEFINE(HAVE_CPU_ARCH_M68K, 1, [Defined for M68k platforms]) ;; ppc) AC_DEFINE(HAVE_CPU_ARCH_PPC, 1, [Defined for PPC platforms]) GCRYPT_HWF_MODULES="libgcrypt_la-hwf-ppc.lo" ;; arm) AC_DEFINE(HAVE_CPU_ARCH_ARM, 1, [Defined for ARM platforms]) GCRYPT_HWF_MODULES="libgcrypt_la-hwf-arm.lo" ;; aarch64) AC_DEFINE(HAVE_CPU_ARCH_ARM, 1, [Defined for ARM AArch64 platforms]) GCRYPT_HWF_MODULES="libgcrypt_la-hwf-arm.lo" ;; s390x) AC_DEFINE(HAVE_CPU_ARCH_S390X, 1, [Defined for s390x/zSeries platforms]) GCRYPT_HWF_MODULES="libgcrypt_la-hwf-s390x.lo" ;; esac AC_SUBST([GCRYPT_HWF_MODULES]) # # Option to disable building of doc file # build_doc=yes AC_ARG_ENABLE([doc], AS_HELP_STRING([--disable-doc], [do not build the documentation]), build_doc=$enableval, build_doc=yes) AM_CONDITIONAL([BUILD_DOC], [test "x$build_doc" != xno]) # # Provide information about the build. # BUILD_REVISION="mym4_revision" AC_SUBST(BUILD_REVISION) AC_DEFINE_UNQUOTED(BUILD_REVISION, "$BUILD_REVISION", [GIT commit id revision used to build this package]) changequote(,)dnl BUILD_VERSION=`echo "$PACKAGE_VERSION" | sed 's/\([0-9.]*\).*/\1./'` changequote([,])dnl BUILD_VERSION="${BUILD_VERSION}mym4_revision_dec" BUILD_FILEVERSION=`echo "${BUILD_VERSION}" | tr . ,` AC_SUBST(BUILD_VERSION) AC_SUBST(BUILD_FILEVERSION) AC_ARG_ENABLE([build-timestamp], AS_HELP_STRING([--enable-build-timestamp], [set an explicit build timestamp for reproducibility. (default is the current time in ISO-8601 format)]), [if test "$enableval" = "yes"; then BUILD_TIMESTAMP=`date -u +%Y-%m-%dT%H:%M+0000 2>/dev/null || date` else BUILD_TIMESTAMP="$enableval" fi], [BUILD_TIMESTAMP=""]) AC_SUBST(BUILD_TIMESTAMP) AC_DEFINE_UNQUOTED(BUILD_TIMESTAMP, "$BUILD_TIMESTAMP", [The time this package was configured for a build]) # And create the files. 
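BUILD_REVISION and BUILD_TIMESTAMP defined above likewise end up as plain strings in config.h; passing an explicit value to --enable-build-timestamp keeps the second one stable for reproducible builds. A minimal C sketch (hypothetical function name):

  #include <config.h>
  #include <stdio.h>

  static void
  example_print_build_info (void)
  {
    printf ("revision %s, configured %s\n", BUILD_REVISION, BUILD_TIMESTAMP);
  }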
AC_CONFIG_FILES([
Makefile
m4/Makefile
compat/Makefile
mpi/Makefile
cipher/Makefile
random/Makefile
doc/Makefile
src/Makefile
src/gcrypt.h
src/libgcrypt-config
src/libgcrypt.pc
src/versioninfo.rc
tests/Makefile
])
AC_CONFIG_FILES([tests/hashtest-256g], [chmod +x tests/hashtest-256g])
AC_CONFIG_FILES([tests/basic-disable-all-hwf], [chmod +x tests/basic-disable-all-hwf])
AC_OUTPUT

detection_module="${GCRYPT_HWF_MODULES%.lo}"
test -n "$detection_module" || detection_module="none"

# Give some feedback
GCRY_MSG_SHOW([],[])
GCRY_MSG_SHOW([Libgcrypt],[v${VERSION} has been configured as follows:])
GCRY_MSG_SHOW([],[])
GCRY_MSG_SHOW([Platform: ],[$PRINTABLE_OS_NAME ($host)])
GCRY_MSG_SHOW([Hardware detection module:],[$detection_module])
GCRY_MSG_WRAP([Enabled cipher algorithms:],[$enabled_ciphers])
GCRY_MSG_WRAP([Enabled digest algorithms:],[$enabled_digests])
GCRY_MSG_WRAP([Enabled kdf algorithms: ],[$enabled_kdfs])
GCRY_MSG_WRAP([Enabled pubkey algorithms:],[$enabled_pubkey_ciphers])
GCRY_MSG_SHOW([Random number generator: ],[$random])
GCRY_MSG_SHOW([Try using jitter entropy: ],[$jentsupport])
GCRY_MSG_SHOW([Using linux capabilities: ],[$use_capabilities])
GCRY_MSG_SHOW([FIPS module version: ],[$fips_module_version])
GCRY_MSG_SHOW([Try using Padlock crypto: ],[$padlocksupport])
GCRY_MSG_SHOW([Try using AES-NI crypto: ],[$aesnisupport])
GCRY_MSG_SHOW([Try using Intel SHAEXT: ],[$shaextsupport])
GCRY_MSG_SHOW([Try using Intel PCLMUL: ],[$pclmulsupport])
GCRY_MSG_SHOW([Try using Intel SSE4.1: ],[$sse41support])
GCRY_MSG_SHOW([Try using DRNG (RDRAND): ],[$drngsupport])
GCRY_MSG_SHOW([Try using Intel AVX: ],[$avxsupport])
GCRY_MSG_SHOW([Try using Intel AVX2: ],[$avx2support])
GCRY_MSG_SHOW([Try using Intel AVX512: ],[$avx512support])
GCRY_MSG_SHOW([Try using Intel GFNI: ],[$gfnisupport])
GCRY_MSG_SHOW([Try using ARM NEON: ],[$neonsupport])
GCRY_MSG_SHOW([Try using ARMv8 crypto: ],[$armcryptosupport])
GCRY_MSG_SHOW([Try using PPC crypto: ],[$ppccryptosupport])
GCRY_MSG_SHOW([],[])

if test "x${gpg_config_script_warn}" != x; then
cat <