diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 55f96014..a6171bf5 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -1,278 +1,279 @@ # Makefile for cipher modules # Copyright (C) 1998, 1999, 2000, 2001, 2002, # 2003, 2009 Free Software Foundation, Inc. # # This file is part of Libgcrypt. # # Libgcrypt is free software; you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation; either version 2.1 of # the License, or (at your option) any later version. # # Libgcrypt is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this program; if not, see . # Process this file with automake to produce Makefile.in # Need to include ../src in addition to top_srcdir because gcrypt.h is # a built header. AM_CPPFLAGS = -I../src -I$(top_srcdir)/src -I../mpi -I$(top_srcdir)/mpi AM_CFLAGS = $(GPG_ERROR_CFLAGS) AM_CCASFLAGS = $(NOEXECSTACK_FLAGS) EXTRA_DIST = gost-s-box.c CLEANFILES = gost-s-box$(EXEEXT_FOR_BUILD) DISTCLEANFILES = gost-sb.h noinst_LTLIBRARIES = libcipher.la GCRYPT_MODULES = @GCRYPT_CIPHERS@ @GCRYPT_PUBKEY_CIPHERS@ \ @GCRYPT_DIGESTS@ @GCRYPT_KDFS@ libcipher_la_DEPENDENCIES = $(GCRYPT_MODULES) libcipher_la_LIBADD = $(GCRYPT_MODULES) libcipher_la_SOURCES = \ cipher.c cipher-internal.h \ cipher-cbc.c \ cipher-cfb.c \ cipher-ofb.c \ cipher-ctr.c \ cipher-aeswrap.c \ cipher-ccm.c \ cipher-cmac.c \ cipher-gcm.c \ cipher-poly1305.c \ cipher-ocb.c \ cipher-xts.c \ cipher-eax.c \ cipher-siv.c \ cipher-gcm-siv.c \ cipher-selftest.c cipher-selftest.h \ pubkey.c pubkey-internal.h pubkey-util.c \ md.c \ mac.c mac-internal.h \ mac-hmac.c mac-cmac.c mac-gmac.c mac-poly1305.c \ poly1305.c poly1305-internal.h \ kdf.c kdf-internal.h \ bithelp.h \ bufhelp.h \ primegen.c \ hash-common.c hash-common.h \ dsa-common.c rsa-common.c \ sha1.h EXTRA_libcipher_la_SOURCES = \ asm-common-aarch64.h \ asm-common-amd64.h \ asm-common-s390x.h \ asm-inline-s390x.h \ asm-poly1305-aarch64.h \ asm-poly1305-amd64.h \ asm-poly1305-s390x.h \ arcfour.c arcfour-amd64.S \ blowfish.c blowfish-amd64.S blowfish-arm.S \ cast5.c cast5-amd64.S cast5-arm.S \ chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S \ chacha20-amd64-avx512.S chacha20-armv7-neon.S chacha20-aarch64.S \ chacha20-ppc.c chacha20-s390x.S \ cipher-gcm-ppc.c cipher-gcm-intel-pclmul.c cipher-gcm-armv7-neon.S \ cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \ crc.c crc-intel-pclmul.c crc-armv8-ce.c \ crc-armv8-aarch64-ce.S \ crc-ppc.c \ des.c des-amd64.S \ dsa.c \ elgamal.c \ ecc.c ecc-curves.c ecc-misc.c ecc-common.h \ ecc-ecdh.c ecc-ecdsa.c ecc-eddsa.c ecc-gost.c ecc-sm2.c \ idea.c \ gost28147.c gost.h \ gostr3411-94.c \ md4.c \ md5.c \ poly1305-s390x.S poly1305-amd64-avx512.S \ rijndael.c rijndael-internal.h rijndael-tables.h \ rijndael-aesni.c rijndael-padlock.c \ rijndael-amd64.S rijndael-arm.S \ rijndael-ssse3-amd64.c rijndael-ssse3-amd64-asm.S \ rijndael-vaes.c rijndael-vaes-avx2-amd64.S \ rijndael-armv8-ce.c rijndael-armv8-aarch32-ce.S \ rijndael-armv8-aarch64-ce.S rijndael-aarch64.S \ rijndael-ppc.c rijndael-ppc9le.c \ rijndael-p10le.c rijndael-gcm-p10le.s \ rijndael-ppc-common.h rijndael-ppc-functions.h \ rijndael-s390x.c \ rmd160.c \ rsa.c \ salsa20.c salsa20-amd64.S 
	salsa20-armv7-neon.S \
	scrypt.c \
	seed.c \
	serpent.c serpent-sse2-amd64.S \
	sm4.c sm4-aesni-avx-amd64.S sm4-aesni-avx2-amd64.S sm4-aarch64.S \
	sm4-armv8-aarch64-ce.S sm4-gfni-avx2-amd64.S \
	serpent-avx2-amd64.S serpent-armv7-neon.S \
	sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \
	sha1-avx2-bmi2-amd64.S sha1-armv7-neon.S sha1-armv8-aarch32-ce.S \
	sha1-armv8-aarch64-ce.S sha1-intel-shaext.c \
	sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S \
	sha256-avx2-bmi2-amd64.S \
	sha256-armv8-aarch32-ce.S sha256-armv8-aarch64-ce.S \
	sha256-intel-shaext.c sha256-ppc.c \
	sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S \
	sha512-avx2-bmi2-amd64.S sha512-avx512-amd64.S \
	sha512-armv7-neon.S sha512-arm.S \
	sha512-ppc.c sha512-ssse3-i386.c \
	sm3.c sm3-avx-bmi2-amd64.S sm3-aarch64.S sm3-armv8-aarch64-ce.S \
	keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \
	stribog.c \
	tiger.c \
	whirlpool.c whirlpool-sse2-amd64.S \
	twofish.c twofish-amd64.S twofish-arm.S twofish-aarch64.S \
	twofish-avx2-amd64.S \
	rfc2268.c \
	camellia.c camellia.h camellia-glue.c camellia-aesni-avx-amd64.S \
-	camellia-aesni-avx2-amd64.h camellia-gfni-avx2-amd64.S \
+	camellia-aesni-avx2-amd64.h \
+	camellia-gfni-avx2-amd64.S camellia-gfni-avx512-amd64.S \
	camellia-vaes-avx2-amd64.S camellia-aesni-avx2-amd64.S \
	camellia-arm.S camellia-aarch64.S \
	blake2.c \
	blake2b-amd64-avx2.S blake2s-amd64-avx.S

gost28147.lo: gost-sb.h

gost-sb.h: gost-s-box$(EXEEXT_FOR_BUILD)
	./gost-s-box$(EXEEXT_FOR_BUILD) $@

gost-s-box$(EXEEXT_FOR_BUILD): gost-s-box.c
	$(CC_FOR_BUILD) $(CFLAGS_FOR_BUILD) $(LDFLAGS_FOR_BUILD) \
	    $(CPPFLAGS_FOR_BUILD) -o $@ $(srcdir)/gost-s-box.c

if ENABLE_O_FLAG_MUNGING
o_flag_munging = sed -e 's/-O\([2-9sg][2-9sg]*\)/-O1/' -e 's/-Ofast/-O1/g'
else
o_flag_munging = cat
endif

# We need to lower the optimization for this module.
tiger.o: $(srcdir)/tiger.c Makefile
	`echo $(COMPILE) -c $< | $(o_flag_munging) `
tiger.lo: $(srcdir)/tiger.c Makefile
	`echo $(LTCOMPILE) -c $< | $(o_flag_munging) `

# We need to disable instrumentation for these modules as they use cc as
# thin assembly front-end and do not tolerate in-between function calls
# inserted by compiler as those functions may clobber the XMM registers.
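# As a rough illustration of how these munging rules work (hypothetical
# compiler invocation; GNU sed assumed): the recipe under each target is a
# backquoted shell command, so the shell first echoes the full compile
# command, pipes it through the chosen sed filter, and then executes the
# rewritten command produced by the command substitution.  For the tiger.c
# rules above that means, for example:
#
#   $ echo 'gcc -O2 -Ofast -c tiger.c' | \
#       sed -e 's/-O\([2-9sg][2-9sg]*\)/-O1/' -e 's/-Ofast/-O1/g'
#   gcc -O1 -O1 -c tiger.c
#
# so the module is effectively built with -O1 regardless of the optimization
# level in CFLAGS.  The instrumentation_munging filter defined next applies
# the same backquote-plus-sed trick to strip -fsanitize*, -fprofile* and
# -fcoverage* options from the modules listed below.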
if ENABLE_INSTRUMENTATION_MUNGING
instrumentation_munging = sed \
	-e 's/-fsanitize[=,\-][=,a-z,A-Z,0-9,\,,\-]*//g' \
	-e 's/-fprofile[=,\-][=,a-z,A-Z,0-9,\,,\-]*//g' \
	-e 's/-fcoverage[=,\-][=,a-z,A-Z,0-9,\,,\-]*//g'
else
instrumentation_munging = cat
endif

rijndael-aesni.o: $(srcdir)/rijndael-aesni.c Makefile
	`echo $(COMPILE) -c $< | $(instrumentation_munging) `
rijndael-aesni.lo: $(srcdir)/rijndael-aesni.c Makefile
	`echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `

rijndael-ssse3-amd64.o: $(srcdir)/rijndael-ssse3-amd64.c Makefile
	`echo $(COMPILE) -c $< | $(instrumentation_munging) `
rijndael-ssse3-amd64.lo: $(srcdir)/rijndael-ssse3-amd64.c Makefile
	`echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `

cipher-gcm-intel-pclmul.o: $(srcdir)/cipher-gcm-intel-pclmul.c Makefile
	`echo $(COMPILE) -c $< | $(instrumentation_munging) `
cipher-gcm-intel-pclmul.lo: $(srcdir)/cipher-gcm-intel-pclmul.c Makefile
	`echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `

sha1-intel-shaext.o: $(srcdir)/sha1-intel-shaext.c Makefile
	`echo $(COMPILE) -c $< | $(instrumentation_munging) `
sha1-intel-shaext.lo: $(srcdir)/sha1-intel-shaext.c Makefile
	`echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `

sha256-intel-shaext.o: $(srcdir)/sha256-intel-shaext.c Makefile
	`echo $(COMPILE) -c $< | $(instrumentation_munging) `
sha256-intel-shaext.lo: $(srcdir)/sha256-intel-shaext.c Makefile
	`echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `

sha256-ssse3-i386.o: $(srcdir)/sha256-ssse3-i386.c Makefile
	`echo $(COMPILE) -c $< | $(instrumentation_munging) `
sha256-ssse3-i386.lo: $(srcdir)/sha256-ssse3-i386.c Makefile
	`echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `

crc-intel-pclmul.o: $(srcdir)/crc-intel-pclmul.c Makefile
	`echo $(COMPILE) -c $< | $(instrumentation_munging) `
crc-intel-pclmul.lo: $(srcdir)/crc-intel-pclmul.c Makefile
	`echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `

if ENABLE_PPC_VCRYPTO_EXTRA_CFLAGS
ppc_vcrypto_cflags = -O2 -maltivec -mvsx -mcrypto
else
ppc_vcrypto_cflags =
endif

rijndael-ppc.o: $(srcdir)/rijndael-ppc.c Makefile
	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
rijndael-ppc.lo: $(srcdir)/rijndael-ppc.c Makefile
	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `

rijndael-ppc9le.o: $(srcdir)/rijndael-ppc9le.c Makefile
	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
rijndael-ppc9le.lo: $(srcdir)/rijndael-ppc9le.c Makefile
	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `

rijndael-p10le.o: $(srcdir)/rijndael-p10le.c Makefile
	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
rijndael-p10le.lo: $(srcdir)/rijndael-p10le.c Makefile
	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `

sha256-ppc.o: $(srcdir)/sha256-ppc.c Makefile
	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
sha256-ppc.lo: $(srcdir)/sha256-ppc.c Makefile
	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `

sha512-ppc.o: $(srcdir)/sha512-ppc.c Makefile
	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
sha512-ppc.lo: $(srcdir)/sha512-ppc.c Makefile
	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `

chacha20-ppc.o: $(srcdir)/chacha20-ppc.c Makefile
	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
chacha20-ppc.lo: $(srcdir)/chacha20-ppc.c Makefile
	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
crc-ppc.o: $(srcdir)/crc-ppc.c Makefile
	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
crc-ppc.lo: $(srcdir)/crc-ppc.c Makefile
	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `

cipher-gcm-ppc.o: $(srcdir)/cipher-gcm-ppc.c Makefile
	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
cipher-gcm-ppc.lo: $(srcdir)/cipher-gcm-ppc.c Makefile
	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
diff --git a/cipher/bulkhelp.h b/cipher/bulkhelp.h
index b1b4b2e1..8c322ede 100644
--- a/cipher/bulkhelp.h
+++ b/cipher/bulkhelp.h
@@ -1,396 +1,425 @@
/* bulkhelp.h - Some bulk processing helpers
 * Copyright (C) 2022 Jussi Kivilinna
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see .
 */

#ifndef GCRYPT_BULKHELP_H
#define GCRYPT_BULKHELP_H

#include "g10lib.h"
#include "cipher-internal.h"


#ifdef __x86_64__
/* Use u64 to store pointers for x32 support (assembly function assumes
 * 64-bit pointers). */
typedef u64 ocb_L_uintptr_t;
#else
typedef uintptr_t ocb_L_uintptr_t;
#endif

typedef unsigned int (*bulk_crypt_fn_t) (const void *ctx, byte *out,
                                         const byte *in,
                                         unsigned int num_blks);


+static inline ocb_L_uintptr_t *
+bulk_ocb_prepare_L_pointers_array_blk64 (gcry_cipher_hd_t c,
+                                         ocb_L_uintptr_t Ls[64], u64 blkn)
+{
+  unsigned int n = 64 - (blkn % 64);
+  unsigned int i;
+
+  for (i = 0; i < 64; i += 8)
+    {
+      Ls[(i + 0 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+      Ls[(i + 1 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+      Ls[(i + 2 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+      Ls[(i + 3 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+      Ls[(i + 4 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+      Ls[(i + 5 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+      Ls[(i + 6 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+    }
+
+  Ls[(7 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+  Ls[(15 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[4];
+  Ls[(23 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+  Ls[(31 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[5];
+  Ls[(39 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+  Ls[(47 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[4];
+  Ls[(55 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+  return &Ls[(63 + n) % 64];
+}
+
+
static inline ocb_L_uintptr_t *
bulk_ocb_prepare_L_pointers_array_blk32 (gcry_cipher_hd_t c,
                                         ocb_L_uintptr_t Ls[32], u64 blkn)
{
  unsigned int n = 32 - (blkn % 32);
  unsigned int i;

  for (i = 0; i < 32; i += 8)
    {
      Ls[(i + 0 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
      Ls[(i + 1 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
      Ls[(i + 2 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
      Ls[(i + 3 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
      Ls[(i + 4 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
      Ls[(i + 5 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
      Ls[(i + 6 + n) % 32] = (uintptr_t)(void
*)c->u_mode.ocb.L[0]; } Ls[(7 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[3]; Ls[(15 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[4]; Ls[(23 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[3]; return &Ls[(31 + n) % 32]; } static inline ocb_L_uintptr_t * bulk_ocb_prepare_L_pointers_array_blk16 (gcry_cipher_hd_t c, ocb_L_uintptr_t Ls[16], u64 blkn) { unsigned int n = 16 - (blkn % 16); unsigned int i; for (i = 0; i < 16; i += 8) { Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0]; Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1]; Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0]; Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2]; Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0]; Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1]; Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0]; } Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3]; return &Ls[(15 + n) % 16]; } static inline ocb_L_uintptr_t * bulk_ocb_prepare_L_pointers_array_blk8 (gcry_cipher_hd_t c, ocb_L_uintptr_t Ls[8], u64 blkn) { unsigned int n = 8 - (blkn % 8); Ls[(0 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0]; Ls[(1 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1]; Ls[(2 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0]; Ls[(3 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[2]; Ls[(4 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0]; Ls[(5 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1]; Ls[(6 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0]; Ls[(7 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[3]; return &Ls[(7 + n) % 8]; } static inline unsigned int bulk_ctr_enc_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf, const byte *inbuf, size_t nblocks, byte *ctr, byte *tmpbuf, size_t tmpbuf_nblocks, unsigned int *num_used_tmpblocks) { unsigned int tmp_used = 16; unsigned int burn_depth = 0; unsigned int nburn; while (nblocks >= 1) { size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks; size_t i; if (curr_blks * 16 > tmp_used) tmp_used = curr_blks * 16; cipher_block_cpy (tmpbuf + 0 * 16, ctr, 16); for (i = 1; i < curr_blks; i++) { cipher_block_cpy (&tmpbuf[i * 16], ctr, 16); cipher_block_add (&tmpbuf[i * 16], i, 16); } cipher_block_add (ctr, curr_blks, 16); nburn = crypt_fn (priv, tmpbuf, tmpbuf, curr_blks); burn_depth = nburn > burn_depth ? nburn : burn_depth; for (i = 0; i < curr_blks; i++) { cipher_block_xor (outbuf, &tmpbuf[i * 16], inbuf, 16); outbuf += 16; inbuf += 16; } nblocks -= curr_blks; } *num_used_tmpblocks = tmp_used; return burn_depth; } static inline unsigned int bulk_cbc_dec_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf, const byte *inbuf, size_t nblocks, byte *iv, byte *tmpbuf, size_t tmpbuf_nblocks, unsigned int *num_used_tmpblocks) { unsigned int tmp_used = 16; unsigned int burn_depth = 0; unsigned int nburn; while (nblocks >= 1) { size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks; size_t i; if (curr_blks * 16 > tmp_used) tmp_used = curr_blks * 16; nburn = crypt_fn (priv, tmpbuf, inbuf, curr_blks); burn_depth = nburn > burn_depth ? 
nburn : burn_depth; for (i = 0; i < curr_blks; i++) { cipher_block_xor_n_copy_2(outbuf, &tmpbuf[i * 16], iv, inbuf, 16); outbuf += 16; inbuf += 16; } nblocks -= curr_blks; } *num_used_tmpblocks = tmp_used; return burn_depth; } static inline unsigned int bulk_cfb_dec_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf, const byte *inbuf, size_t nblocks, byte *iv, byte *tmpbuf, size_t tmpbuf_nblocks, unsigned int *num_used_tmpblocks) { unsigned int tmp_used = 16; unsigned int burn_depth = 0; unsigned int nburn; while (nblocks >= 1) { size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks; size_t i; if (curr_blks * 16 > tmp_used) tmp_used = curr_blks * 16; cipher_block_cpy (&tmpbuf[0 * 16], iv, 16); if (curr_blks > 1) memcpy (&tmpbuf[1 * 16], &inbuf[(1 - 1) * 16], 16 * curr_blks - 16); cipher_block_cpy (iv, &inbuf[(curr_blks - 1) * 16], 16); nburn = crypt_fn (priv, tmpbuf, tmpbuf, curr_blks); burn_depth = nburn > burn_depth ? nburn : burn_depth; for (i = 0; i < curr_blks; i++) { cipher_block_xor (outbuf, inbuf, &tmpbuf[i * 16], 16); outbuf += 16; inbuf += 16; } nblocks -= curr_blks; } *num_used_tmpblocks = tmp_used; return burn_depth; } static inline unsigned int bulk_ocb_crypt_128 (gcry_cipher_hd_t c, void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf, const byte *inbuf, size_t nblocks, u64 *blkn, int encrypt, byte *tmpbuf, size_t tmpbuf_nblocks, unsigned int *num_used_tmpblocks) { unsigned int tmp_used = 16; unsigned int burn_depth = 0; unsigned int nburn; while (nblocks >= 1) { size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks; size_t i; if (curr_blks * 16 > tmp_used) tmp_used = curr_blks * 16; for (i = 0; i < curr_blks; i++) { const unsigned char *l = ocb_get_l(c, ++*blkn); /* Checksum_i = Checksum_{i-1} xor P_i */ if (encrypt) cipher_block_xor_1(c->u_ctr.ctr, &inbuf[i * 16], 16); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ cipher_block_xor_2dst (&tmpbuf[i * 16], c->u_iv.iv, l, 16); cipher_block_xor (&outbuf[i * 16], &inbuf[i * 16], c->u_iv.iv, 16); } /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ nburn = crypt_fn (priv, outbuf, outbuf, curr_blks); burn_depth = nburn > burn_depth ? nburn : burn_depth; for (i = 0; i < curr_blks; i++) { cipher_block_xor_1 (&outbuf[i * 16], &tmpbuf[i * 16], 16); /* Checksum_i = Checksum_{i-1} xor P_i */ if (!encrypt) cipher_block_xor_1(c->u_ctr.ctr, &outbuf[i * 16], 16); } outbuf += curr_blks * 16; inbuf += curr_blks * 16; nblocks -= curr_blks; } *num_used_tmpblocks = tmp_used; return burn_depth; } static inline unsigned int bulk_ocb_auth_128 (gcry_cipher_hd_t c, void *priv, bulk_crypt_fn_t crypt_fn, const byte *abuf, size_t nblocks, u64 *blkn, byte *tmpbuf, size_t tmpbuf_nblocks, unsigned int *num_used_tmpblocks) { unsigned int tmp_used = 16; unsigned int burn_depth = 0; unsigned int nburn; while (nblocks >= 1) { size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks; size_t i; if (curr_blks * 16 > tmp_used) tmp_used = curr_blks * 16; for (i = 0; i < curr_blks; i++) { const unsigned char *l = ocb_get_l(c, ++*blkn); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ cipher_block_xor_2dst (&tmpbuf[i * 16], c->u_mode.ocb.aad_offset, l, 16); cipher_block_xor_1 (&tmpbuf[i * 16], &abuf[i * 16], 16); } /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ nburn = crypt_fn (priv, tmpbuf, tmpbuf, curr_blks); burn_depth = nburn > burn_depth ? 
nburn : burn_depth; for (i = 0; i < curr_blks; i++) { cipher_block_xor_1 (c->u_mode.ocb.aad_sum, &tmpbuf[i * 16], 16); } abuf += curr_blks * 16; nblocks -= curr_blks; } *num_used_tmpblocks = tmp_used; return burn_depth; } static inline unsigned int bulk_xts_crypt_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf, const byte *inbuf, size_t nblocks, byte *tweak, byte *tmpbuf, size_t tmpbuf_nblocks, unsigned int *num_used_tmpblocks) { u64 tweak_lo, tweak_hi, tweak_next_lo, tweak_next_hi, tmp_lo, tmp_hi, carry; unsigned int tmp_used = 16; unsigned int burn_depth = 0; unsigned int nburn; tweak_next_lo = buf_get_le64 (tweak + 0); tweak_next_hi = buf_get_le64 (tweak + 8); while (nblocks >= 1) { size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks; size_t i; if (curr_blks * 16 > tmp_used) tmp_used = curr_blks * 16; for (i = 0; i < curr_blks; i++) { tweak_lo = tweak_next_lo; tweak_hi = tweak_next_hi; /* Generate next tweak. */ carry = -(tweak_next_hi >> 63) & 0x87; tweak_next_hi = (tweak_next_hi << 1) + (tweak_next_lo >> 63); tweak_next_lo = (tweak_next_lo << 1) ^ carry; /* Xor-Encrypt/Decrypt-Xor block. */ tmp_lo = buf_get_le64 (inbuf + i * 16 + 0) ^ tweak_lo; tmp_hi = buf_get_le64 (inbuf + i * 16 + 8) ^ tweak_hi; buf_put_he64 (&tmpbuf[i * 16 + 0], tweak_lo); buf_put_he64 (&tmpbuf[i * 16 + 8], tweak_hi); buf_put_le64 (outbuf + i * 16 + 0, tmp_lo); buf_put_le64 (outbuf + i * 16 + 8, tmp_hi); } nburn = crypt_fn (priv, outbuf, outbuf, curr_blks); burn_depth = nburn > burn_depth ? nburn : burn_depth; for (i = 0; i < curr_blks; i++) { /* Xor-Encrypt/Decrypt-Xor block. */ tweak_lo = buf_get_he64 (&tmpbuf[i * 16 + 0]); tweak_hi = buf_get_he64 (&tmpbuf[i * 16 + 8]); tmp_lo = buf_get_le64 (outbuf + i * 16 + 0) ^ tweak_lo; tmp_hi = buf_get_le64 (outbuf + i * 16 + 8) ^ tweak_hi; buf_put_le64 (outbuf + i * 16 + 0, tmp_lo); buf_put_le64 (outbuf + i * 16 + 8, tmp_hi); } inbuf += curr_blks * 16; outbuf += curr_blks * 16; nblocks -= curr_blks; } buf_put_le64 (tweak + 0, tweak_next_lo); buf_put_le64 (tweak + 8, tweak_next_hi); *num_used_tmpblocks = tmp_used; return burn_depth; } #endif /*GCRYPT_BULKHELP_H*/ diff --git a/cipher/camellia-aesni-avx2-amd64.h b/cipher/camellia-aesni-avx2-amd64.h index 9cc5621e..411e790f 100644 --- a/cipher/camellia-aesni-avx2-amd64.h +++ b/cipher/camellia-aesni-avx2-amd64.h @@ -1,2220 +1,2218 @@ /* camellia-aesni-avx2-amd64.h - AES-NI/VAES/GFNI/AVX2 implementation of Camellia * * Copyright (C) 2013-2015,2020-2022 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
*/ #ifndef GCRY_CAMELLIA_AESNI_AVX2_AMD64_H #define GCRY_CAMELLIA_AESNI_AVX2_AMD64_H #include "asm-common-amd64.h" #define CAMELLIA_TABLE_BYTE_LEN 272 /* struct CAMELLIA_context: */ #define key_table 0 #define key_bitlength CAMELLIA_TABLE_BYTE_LEN /* register macros */ #define CTX %rdi #define RIO %r8 /********************************************************************** helper macros **********************************************************************/ #ifndef CAMELLIA_GFNI_BUILD #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ vpand x, mask4bit, tmp0; \ vpandn x, mask4bit, x; \ vpsrld $4, x, x; \ \ vpshufb tmp0, lo_t, tmp0; \ vpshufb x, hi_t, x; \ vpxor tmp0, x, x; #endif #define ymm0_x xmm0 #define ymm1_x xmm1 #define ymm2_x xmm2 #define ymm3_x xmm3 #define ymm4_x xmm4 #define ymm5_x xmm5 #define ymm6_x xmm6 #define ymm7_x xmm7 #define ymm8_x xmm8 #define ymm9_x xmm9 #define ymm10_x xmm10 #define ymm11_x xmm11 #define ymm12_x xmm12 #define ymm13_x xmm13 #define ymm14_x xmm14 #define ymm15_x xmm15 #ifdef CAMELLIA_VAES_BUILD # define IF_AESNI(...) # define IF_VAES(...) __VA_ARGS__ #else # define IF_AESNI(...) __VA_ARGS__ # define IF_VAES(...) #endif /********************************************************************** GFNI helper macros and constants **********************************************************************/ #ifdef CAMELLIA_GFNI_BUILD #define BV8(a0,a1,a2,a3,a4,a5,a6,a7) \ ( (((a0) & 1) << 0) | \ (((a1) & 1) << 1) | \ (((a2) & 1) << 2) | \ (((a3) & 1) << 3) | \ (((a4) & 1) << 4) | \ (((a5) & 1) << 5) | \ (((a6) & 1) << 6) | \ (((a7) & 1) << 7) ) #define BM8X8(l0,l1,l2,l3,l4,l5,l6,l7) \ ( ((l7) << (0 * 8)) | \ ((l6) << (1 * 8)) | \ ((l5) << (2 * 8)) | \ ((l4) << (3 * 8)) | \ ((l3) << (4 * 8)) | \ ((l2) << (5 * 8)) | \ ((l1) << (6 * 8)) | \ ((l0) << (7 * 8)) ) /* Pre-filters and post-filters constants for Camellia sboxes s1, s2, s3 and s4. * See http://urn.fi/URN:NBN:fi:oulu-201305311409, pages 43-48. * * Pre-filters are directly from above source, "θ₁"/"θ₄". Post-filters are * combination of function "A" (AES SubBytes affine transformation) and * "ψ₁"/"ψ₂"/"ψ₃". */ /* Constant from "θ₁(x)" and "θ₄(x)" functions. 
*/ #define pre_filter_constant_s1234 BV8(1, 0, 1, 0, 0, 0, 1, 0) /* Constant from "ψ₁(A(x))" function: */ #define post_filter_constant_s14 BV8(0, 1, 1, 1, 0, 1, 1, 0) /* Constant from "ψ₂(A(x))" function: */ #define post_filter_constant_s2 BV8(0, 0, 1, 1, 1, 0, 1, 1) /* Constant from "ψ₃(A(x))" function: */ #define post_filter_constant_s3 BV8(1, 1, 1, 0, 1, 1, 0, 0) #endif /* CAMELLIA_GFNI_BUILD */ /********************************************************************** 32-way camellia **********************************************************************/ #ifdef CAMELLIA_GFNI_BUILD /* roundsm32 (GFNI version) * IN: * x0..x7: byte-sliced AB state * mem_cd: register pointer storing CD state * key: index for key material * OUT: * x0..x7: new byte-sliced CD state */ #define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, \ t6, t7, mem_cd, key) \ /* \ * S-function with AES subbytes \ */ \ vpbroadcastq .Lpre_filter_bitmatrix_s123 rRIP, t5; \ vpbroadcastq .Lpre_filter_bitmatrix_s4 rRIP, t2; \ vpbroadcastq .Lpost_filter_bitmatrix_s14 rRIP, t4; \ vpbroadcastq .Lpost_filter_bitmatrix_s2 rRIP, t3; \ vpbroadcastq .Lpost_filter_bitmatrix_s3 rRIP, t6; \ vpxor t7##_x, t7##_x, t7##_x; \ vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \ \ /* prefilter sboxes */ \ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x0, x0; \ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x7, x7; \ vgf2p8affineqb $(pre_filter_constant_s1234), t2, x3, x3; \ vgf2p8affineqb $(pre_filter_constant_s1234), t2, x6, x6; \ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x2, x2; \ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x5, x5; \ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x1, x1; \ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x4, x4; \ \ /* sbox GF8 inverse + postfilter sboxes 1 and 4 */ \ vgf2p8affineinvqb $(post_filter_constant_s14), t4, x0, x0; \ vgf2p8affineinvqb $(post_filter_constant_s14), t4, x7, x7; \ vgf2p8affineinvqb $(post_filter_constant_s14), t4, x3, x3; \ vgf2p8affineinvqb $(post_filter_constant_s14), t4, x6, x6; \ \ /* sbox GF8 inverse + postfilter sbox 3 */ \ vgf2p8affineinvqb $(post_filter_constant_s3), t6, x2, x2; \ vgf2p8affineinvqb $(post_filter_constant_s3), t6, x5, x5; \ \ /* sbox GF8 inverse + postfilter sbox 2 */ \ vgf2p8affineinvqb $(post_filter_constant_s2), t3, x1, x1; \ vgf2p8affineinvqb $(post_filter_constant_s2), t3, x4, x4; \ \ vpsrldq $1, t0, t1; \ vpsrldq $2, t0, t2; \ vpshufb t7, t1, t1; \ vpsrldq $3, t0, t3; \ \ /* P-function */ \ vpxor x5, x0, x0; \ vpxor x6, x1, x1; \ vpxor x7, x2, x2; \ vpxor x4, x3, x3; \ \ vpshufb t7, t2, t2; \ vpsrldq $4, t0, t4; \ vpshufb t7, t3, t3; \ vpsrldq $5, t0, t5; \ vpshufb t7, t4, t4; \ \ vpxor x2, x4, x4; \ vpxor x3, x5, x5; \ vpxor x0, x6, x6; \ vpxor x1, x7, x7; \ \ vpsrldq $6, t0, t6; \ vpshufb t7, t5, t5; \ vpshufb t7, t6, t6; \ \ vpxor x7, x0, x0; \ vpxor x4, x1, x1; \ vpxor x5, x2, x2; \ vpxor x6, x3, x3; \ \ vpxor x3, x4, x4; \ vpxor x0, x5, x5; \ vpxor x1, x6, x6; \ vpxor x2, x7, x7; /* note: high and low parts swapped */ \ \ /* Add key material and result to CD (x becomes new CD) */ \ \ vpxor t6, x1, x1; \ vpxor 5 * 32(mem_cd), x1, x1; \ \ vpsrldq $7, t0, t6; \ vpshufb t7, t0, t0; \ vpshufb t7, t6, t7; \ \ vpxor t7, x0, x0; \ vpxor 4 * 32(mem_cd), x0, x0; \ \ vpxor t5, x2, x2; \ vpxor 6 * 32(mem_cd), x2, x2; \ \ vpxor t4, x3, x3; \ vpxor 7 * 32(mem_cd), x3, x3; \ \ vpxor t3, x4, x4; \ vpxor 0 * 32(mem_cd), x4, x4; \ \ vpxor t2, x5, x5; \ vpxor 1 * 32(mem_cd), x5, x5; \ \ vpxor t1, x6, x6; \ vpxor 2 * 
32(mem_cd), x6, x6; \ \ vpxor t0, x7, x7; \ vpxor 3 * 32(mem_cd), x7, x7; #else /* CAMELLIA_GFNI_BUILD */ /* roundsm32 (AES-NI / VAES version) * IN: * x0..x7: byte-sliced AB state * mem_cd: register pointer storing CD state * key: index for key material * OUT: * x0..x7: new byte-sliced CD state */ #define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, \ t6, t7, mem_cd, key) \ /* \ * S-function with AES subbytes \ */ \ vbroadcasti128 .Linv_shift_row rRIP, t4; \ vpbroadcastd .L0f0f0f0f rRIP, t7; \ vbroadcasti128 .Lpre_tf_lo_s1 rRIP, t5; \ vbroadcasti128 .Lpre_tf_hi_s1 rRIP, t6; \ vbroadcasti128 .Lpre_tf_lo_s4 rRIP, t2; \ vbroadcasti128 .Lpre_tf_hi_s4 rRIP, t3; \ \ /* AES inverse shift rows */ \ vpshufb t4, x0, x0; \ vpshufb t4, x7, x7; \ vpshufb t4, x3, x3; \ vpshufb t4, x6, x6; \ vpshufb t4, x2, x2; \ vpshufb t4, x5, x5; \ vpshufb t4, x1, x1; \ vpshufb t4, x4, x4; \ \ /* prefilter sboxes 1, 2 and 3 */ \ /* prefilter sbox 4 */ \ filter_8bit(x0, t5, t6, t7, t4); \ filter_8bit(x7, t5, t6, t7, t4); \ IF_AESNI(vextracti128 $1, x0, t0##_x); \ IF_AESNI(vextracti128 $1, x7, t1##_x); \ filter_8bit(x3, t2, t3, t7, t4); \ filter_8bit(x6, t2, t3, t7, t4); \ IF_AESNI(vextracti128 $1, x3, t3##_x); \ IF_AESNI(vextracti128 $1, x6, t2##_x); \ filter_8bit(x2, t5, t6, t7, t4); \ filter_8bit(x5, t5, t6, t7, t4); \ filter_8bit(x1, t5, t6, t7, t4); \ filter_8bit(x4, t5, t6, t7, t4); \ \ vpxor t4##_x, t4##_x, t4##_x; \ \ /* AES subbytes + AES shift rows */ \ IF_AESNI(vextracti128 $1, x2, t6##_x; \ vextracti128 $1, x5, t5##_x; \ vaesenclast t4##_x, x0##_x, x0##_x; \ vaesenclast t4##_x, t0##_x, t0##_x; \ vaesenclast t4##_x, x7##_x, x7##_x; \ vaesenclast t4##_x, t1##_x, t1##_x; \ vaesenclast t4##_x, x3##_x, x3##_x; \ vaesenclast t4##_x, t3##_x, t3##_x; \ vaesenclast t4##_x, x6##_x, x6##_x; \ vaesenclast t4##_x, t2##_x, t2##_x; \ vinserti128 $1, t0##_x, x0, x0; \ vinserti128 $1, t1##_x, x7, x7; \ vinserti128 $1, t3##_x, x3, x3; \ vinserti128 $1, t2##_x, x6, x6; \ vextracti128 $1, x1, t3##_x; \ vextracti128 $1, x4, t2##_x); \ vbroadcasti128 .Lpost_tf_lo_s1 rRIP, t0; \ vbroadcasti128 .Lpost_tf_hi_s1 rRIP, t1; \ IF_AESNI(vaesenclast t4##_x, x2##_x, x2##_x; \ vaesenclast t4##_x, t6##_x, t6##_x; \ vaesenclast t4##_x, x5##_x, x5##_x; \ vaesenclast t4##_x, t5##_x, t5##_x; \ vaesenclast t4##_x, x1##_x, x1##_x; \ vaesenclast t4##_x, t3##_x, t3##_x; \ vaesenclast t4##_x, x4##_x, x4##_x; \ vaesenclast t4##_x, t2##_x, t2##_x; \ vinserti128 $1, t6##_x, x2, x2; \ vinserti128 $1, t5##_x, x5, x5; \ vinserti128 $1, t3##_x, x1, x1; \ vinserti128 $1, t2##_x, x4, x4); \ IF_VAES(vaesenclast t4, x0, x0; \ vaesenclast t4, x7, x7; \ vaesenclast t4, x3, x3; \ vaesenclast t4, x6, x6; \ vaesenclast t4, x2, x2; \ vaesenclast t4, x5, x5; \ vaesenclast t4, x1, x1; \ vaesenclast t4, x4, x4); \ \ /* postfilter sboxes 1 and 4 */ \ vbroadcasti128 .Lpost_tf_lo_s3 rRIP, t2; \ vbroadcasti128 .Lpost_tf_hi_s3 rRIP, t3; \ filter_8bit(x0, t0, t1, t7, t4); \ filter_8bit(x7, t0, t1, t7, t4); \ filter_8bit(x3, t0, t1, t7, t6); \ filter_8bit(x6, t0, t1, t7, t6); \ \ /* postfilter sbox 3 */ \ vbroadcasti128 .Lpost_tf_lo_s2 rRIP, t4; \ vbroadcasti128 .Lpost_tf_hi_s2 rRIP, t5; \ filter_8bit(x2, t2, t3, t7, t6); \ filter_8bit(x5, t2, t3, t7, t6); \ \ vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \ \ /* postfilter sbox 2 */ \ filter_8bit(x1, t4, t5, t7, t2); \ filter_8bit(x4, t4, t5, t7, t2); \ vpxor t7##_x, t7##_x, t7##_x; \ \ vpsrldq $1, t0, t1; \ vpsrldq $2, t0, t2; \ vpshufb t7, t1, t1; \ vpsrldq $3, t0, t3; \ \ /* P-function */ 
\ vpxor x5, x0, x0; \ vpxor x6, x1, x1; \ vpxor x7, x2, x2; \ vpxor x4, x3, x3; \ \ vpshufb t7, t2, t2; \ vpsrldq $4, t0, t4; \ vpshufb t7, t3, t3; \ vpsrldq $5, t0, t5; \ vpshufb t7, t4, t4; \ \ vpxor x2, x4, x4; \ vpxor x3, x5, x5; \ vpxor x0, x6, x6; \ vpxor x1, x7, x7; \ \ vpsrldq $6, t0, t6; \ vpshufb t7, t5, t5; \ vpshufb t7, t6, t6; \ \ vpxor x7, x0, x0; \ vpxor x4, x1, x1; \ vpxor x5, x2, x2; \ vpxor x6, x3, x3; \ \ vpxor x3, x4, x4; \ vpxor x0, x5, x5; \ vpxor x1, x6, x6; \ vpxor x2, x7, x7; /* note: high and low parts swapped */ \ \ /* Add key material and result to CD (x becomes new CD) */ \ \ vpxor t6, x1, x1; \ vpxor 5 * 32(mem_cd), x1, x1; \ \ vpsrldq $7, t0, t6; \ vpshufb t7, t0, t0; \ vpshufb t7, t6, t7; \ \ vpxor t7, x0, x0; \ vpxor 4 * 32(mem_cd), x0, x0; \ \ vpxor t5, x2, x2; \ vpxor 6 * 32(mem_cd), x2, x2; \ \ vpxor t4, x3, x3; \ vpxor 7 * 32(mem_cd), x3, x3; \ \ vpxor t3, x4, x4; \ vpxor 0 * 32(mem_cd), x4, x4; \ \ vpxor t2, x5, x5; \ vpxor 1 * 32(mem_cd), x5, x5; \ \ vpxor t1, x6, x6; \ vpxor 2 * 32(mem_cd), x6, x6; \ \ vpxor t0, x7, x7; \ vpxor 3 * 32(mem_cd), x7, x7; #endif /* CAMELLIA_GFNI_BUILD */ /* * IN/OUT: * x0..x7: byte-sliced AB state preloaded * mem_ab: byte-sliced AB state in memory * mem_cb: byte-sliced CD state in memory */ #define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, i, dir, store_ab) \ roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_cd, (key_table + (i) * 8)(CTX)); \ \ vmovdqu x0, 4 * 32(mem_cd); \ vmovdqu x1, 5 * 32(mem_cd); \ vmovdqu x2, 6 * 32(mem_cd); \ vmovdqu x3, 7 * 32(mem_cd); \ vmovdqu x4, 0 * 32(mem_cd); \ vmovdqu x5, 1 * 32(mem_cd); \ vmovdqu x6, 2 * 32(mem_cd); \ vmovdqu x7, 3 * 32(mem_cd); \ \ roundsm32(x4, x5, x6, x7, x0, x1, x2, x3, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, (key_table + ((i) + (dir)) * 8)(CTX)); \ \ store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab); #define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */ #define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \ /* Store new AB state */ \ vmovdqu x4, 4 * 32(mem_ab); \ vmovdqu x5, 5 * 32(mem_ab); \ vmovdqu x6, 6 * 32(mem_ab); \ vmovdqu x7, 7 * 32(mem_ab); \ vmovdqu x0, 0 * 32(mem_ab); \ vmovdqu x1, 1 * 32(mem_ab); \ vmovdqu x2, 2 * 32(mem_ab); \ vmovdqu x3, 3 * 32(mem_ab); #define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, i) \ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store); #define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, i) \ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store); /* * IN: * v0..3: byte-sliced 32-bit integers * OUT: * v0..3: (IN <<< 1) */ #define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \ vpcmpgtb v0, zero, t0; \ vpaddb v0, v0, v0; \ vpabsb t0, t0; \ \ vpcmpgtb v1, zero, t1; \ vpaddb v1, v1, 
v1; \ vpabsb t1, t1; \ \ vpcmpgtb v2, zero, t2; \ vpaddb v2, v2, v2; \ vpabsb t2, t2; \ \ vpor t0, v1, v1; \ \ vpcmpgtb v3, zero, t0; \ vpaddb v3, v3, v3; \ vpabsb t0, t0; \ \ vpor t1, v2, v2; \ vpor t2, v3, v3; \ vpor t0, v0, v0; /* * IN: * r: byte-sliced AB state in memory * l: byte-sliced CD state in memory * OUT: * x0..x7: new byte-sliced CD state */ #define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \ tt1, tt2, tt3, kll, klr, krl, krr) \ /* \ * t0 = kll; \ * t0 &= ll; \ * lr ^= rol32(t0, 1); \ */ \ vpbroadcastd kll, t0; /* only lowest 32-bit used */ \ vpxor tt0, tt0, tt0; \ vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t0; \ \ vpand l0, t0, t0; \ vpand l1, t1, t1; \ vpand l2, t2, t2; \ vpand l3, t3, t3; \ \ rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ \ vpxor l4, t0, l4; \ vpbroadcastd krr, t0; /* only lowest 32-bit used */ \ vmovdqu l4, 4 * 32(l); \ vpxor l5, t1, l5; \ vmovdqu l5, 5 * 32(l); \ vpxor l6, t2, l6; \ vmovdqu l6, 6 * 32(l); \ vpxor l7, t3, l7; \ vmovdqu l7, 7 * 32(l); \ \ /* \ * t2 = krr; \ * t2 |= rr; \ * rl ^= t2; \ */ \ \ vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t0; \ \ vpor 4 * 32(r), t0, t0; \ vpor 5 * 32(r), t1, t1; \ vpor 6 * 32(r), t2, t2; \ vpor 7 * 32(r), t3, t3; \ \ vpxor 0 * 32(r), t0, t0; \ vpxor 1 * 32(r), t1, t1; \ vpxor 2 * 32(r), t2, t2; \ vpxor 3 * 32(r), t3, t3; \ vmovdqu t0, 0 * 32(r); \ vpbroadcastd krl, t0; /* only lowest 32-bit used */ \ vmovdqu t1, 1 * 32(r); \ vmovdqu t2, 2 * 32(r); \ vmovdqu t3, 3 * 32(r); \ \ /* \ * t2 = krl; \ * t2 &= rl; \ * rr ^= rol32(t2, 1); \ */ \ vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t0; \ \ vpand 0 * 32(r), t0, t0; \ vpand 1 * 32(r), t1, t1; \ vpand 2 * 32(r), t2, t2; \ vpand 3 * 32(r), t3, t3; \ \ rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ \ vpxor 4 * 32(r), t0, t0; \ vpxor 5 * 32(r), t1, t1; \ vpxor 6 * 32(r), t2, t2; \ vpxor 7 * 32(r), t3, t3; \ vmovdqu t0, 4 * 32(r); \ vpbroadcastd klr, t0; /* only lowest 32-bit used */ \ vmovdqu t1, 5 * 32(r); \ vmovdqu t2, 6 * 32(r); \ vmovdqu t3, 7 * 32(r); \ \ /* \ * t0 = klr; \ * t0 |= lr; \ * ll ^= t0; \ */ \ \ vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t0; \ \ vpor l4, t0, t0; \ vpor l5, t1, t1; \ vpor l6, t2, t2; \ vpor l7, t3, t3; \ \ vpxor l0, t0, l0; \ vmovdqu l0, 0 * 32(l); \ vpxor l1, t1, l1; \ vmovdqu l1, 1 * 32(l); \ vpxor l2, t2, l2; \ vmovdqu l2, 2 * 32(l); \ vpxor l3, t3, l3; \ vmovdqu l3, 3 * 32(l); #define transpose_4x4(x0, x1, x2, x3, t1, t2) \ vpunpckhdq x1, x0, t2; \ vpunpckldq x1, x0, x0; \ \ vpunpckldq x3, x2, t1; \ vpunpckhdq x3, x2, x2; \ \ vpunpckhqdq t1, x0, x1; \ vpunpcklqdq t1, x0, x0; \ \ vpunpckhqdq x2, t2, x3; \ vpunpcklqdq x2, t2, x2; #define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \ a3, b3, c3, d3, st0, st1) \ vmovdqu d2, st0; \ vmovdqu d3, st1; \ transpose_4x4(a0, a1, a2, a3, d2, d3); \ transpose_4x4(b0, b1, b2, b3, d2, d3); \ vmovdqu st0, d2; \ vmovdqu st1, d3; \ \ vmovdqu a0, st0; \ vmovdqu a1, st1; \ transpose_4x4(c0, c1, c2, c3, a0, a1); \ transpose_4x4(d0, d1, d2, d3, a0, a1); \ \ vbroadcasti128 .Lshufb_16x16b rRIP, a0; \ vmovdqu st1, a1; \ vpshufb 
a0, a2, a2; \ vpshufb a0, a3, a3; \ vpshufb a0, b0, b0; \ vpshufb a0, b1, b1; \ vpshufb a0, b2, b2; \ vpshufb a0, b3, b3; \ vpshufb a0, a1, a1; \ vpshufb a0, c0, c0; \ vpshufb a0, c1, c1; \ vpshufb a0, c2, c2; \ vpshufb a0, c3, c3; \ vpshufb a0, d0, d0; \ vpshufb a0, d1, d1; \ vpshufb a0, d2, d2; \ vpshufb a0, d3, d3; \ vmovdqu d3, st1; \ vmovdqu st0, d3; \ vpshufb a0, d3, a0; \ vmovdqu d2, st0; \ \ transpose_4x4(a0, b0, c0, d0, d2, d3); \ transpose_4x4(a1, b1, c1, d1, d2, d3); \ vmovdqu st0, d2; \ vmovdqu st1, d3; \ \ vmovdqu b0, st0; \ vmovdqu b1, st1; \ transpose_4x4(a2, b2, c2, d2, b0, b1); \ transpose_4x4(a3, b3, c3, d3, b0, b1); \ vmovdqu st0, b0; \ vmovdqu st1, b1; \ /* does not adjust output bytes inside vectors */ /* load blocks to registers and apply pre-whitening */ #define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, rio, key) \ vpbroadcastq key, x0; \ vpshufb .Lpack_bswap rRIP, x0, x0; \ \ vpxor 0 * 32(rio), x0, y7; \ vpxor 1 * 32(rio), x0, y6; \ vpxor 2 * 32(rio), x0, y5; \ vpxor 3 * 32(rio), x0, y4; \ vpxor 4 * 32(rio), x0, y3; \ vpxor 5 * 32(rio), x0, y2; \ vpxor 6 * 32(rio), x0, y1; \ vpxor 7 * 32(rio), x0, y0; \ vpxor 8 * 32(rio), x0, x7; \ vpxor 9 * 32(rio), x0, x6; \ vpxor 10 * 32(rio), x0, x5; \ vpxor 11 * 32(rio), x0, x4; \ vpxor 12 * 32(rio), x0, x3; \ vpxor 13 * 32(rio), x0, x2; \ vpxor 14 * 32(rio), x0, x1; \ vpxor 15 * 32(rio), x0, x0; /* byteslice pre-whitened blocks and store to temporary memory */ #define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd) \ byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \ y4, y5, y6, y7, (mem_ab), (mem_cd)); \ \ vmovdqu x0, 0 * 32(mem_ab); \ vmovdqu x1, 1 * 32(mem_ab); \ vmovdqu x2, 2 * 32(mem_ab); \ vmovdqu x3, 3 * 32(mem_ab); \ vmovdqu x4, 4 * 32(mem_ab); \ vmovdqu x5, 5 * 32(mem_ab); \ vmovdqu x6, 6 * 32(mem_ab); \ vmovdqu x7, 7 * 32(mem_ab); \ vmovdqu y0, 0 * 32(mem_cd); \ vmovdqu y1, 1 * 32(mem_cd); \ vmovdqu y2, 2 * 32(mem_cd); \ vmovdqu y3, 3 * 32(mem_cd); \ vmovdqu y4, 4 * 32(mem_cd); \ vmovdqu y5, 5 * 32(mem_cd); \ vmovdqu y6, 6 * 32(mem_cd); \ vmovdqu y7, 7 * 32(mem_cd); /* de-byteslice, apply post-whitening and store blocks */ #define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \ y5, y6, y7, key, stack_tmp0, stack_tmp1) \ byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \ y3, y7, x3, x7, stack_tmp0, stack_tmp1); \ \ vmovdqu x0, stack_tmp0; \ \ vpbroadcastq key, x0; \ vpshufb .Lpack_bswap rRIP, x0, x0; \ \ vpxor x0, y7, y7; \ vpxor x0, y6, y6; \ vpxor x0, y5, y5; \ vpxor x0, y4, y4; \ vpxor x0, y3, y3; \ vpxor x0, y2, y2; \ vpxor x0, y1, y1; \ vpxor x0, y0, y0; \ vpxor x0, x7, x7; \ vpxor x0, x6, x6; \ vpxor x0, x5, x5; \ vpxor x0, x4, x4; \ vpxor x0, x3, x3; \ vpxor x0, x2, x2; \ vpxor x0, x1, x1; \ vpxor stack_tmp0, x0, x0; #define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, rio) \ vmovdqu x0, 0 * 32(rio); \ vmovdqu x1, 1 * 32(rio); \ vmovdqu x2, 2 * 32(rio); \ vmovdqu x3, 3 * 32(rio); \ vmovdqu x4, 4 * 32(rio); \ vmovdqu x5, 5 * 32(rio); \ vmovdqu x6, 6 * 32(rio); \ vmovdqu x7, 7 * 32(rio); \ vmovdqu y0, 8 * 32(rio); \ vmovdqu y1, 9 * 32(rio); \ vmovdqu y2, 10 * 32(rio); \ vmovdqu y3, 11 * 32(rio); \ vmovdqu y4, 12 * 32(rio); \ vmovdqu y5, 13 * 32(rio); \ vmovdqu y6, 14 * 32(rio); \ vmovdqu y7, 15 * 32(rio); .text .align 32 #define SHUFB_BYTES(idx) \ 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) FUNC_NAME(_constants): ELF(.type 
FUNC_NAME(_constants),@object;) -.Lshufb_16x16b: - .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) - .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) - .Lpack_bswap: .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 +.Lshufb_16x16b: + .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) + /* For CTR-mode IV byteswap */ .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 #ifdef CAMELLIA_GFNI_BUILD /* Pre-filters and post-filters bit-matrixes for Camellia sboxes s1, s2, s3 * and s4. * See http://urn.fi/URN:NBN:fi:oulu-201305311409, pages 43-48. * * Pre-filters are directly from above source, "θ₁"/"θ₄". Post-filters are * combination of function "A" (AES SubBytes affine transformation) and * "ψ₁"/"ψ₂"/"ψ₃". */ /* Bit-matrix from "θ₁(x)" function: */ .Lpre_filter_bitmatrix_s123: .quad BM8X8(BV8(1, 1, 1, 0, 1, 1, 0, 1), BV8(0, 0, 1, 1, 0, 0, 1, 0), BV8(1, 1, 0, 1, 0, 0, 0, 0), BV8(1, 0, 1, 1, 0, 0, 1, 1), BV8(0, 0, 0, 0, 1, 1, 0, 0), BV8(1, 0, 1, 0, 0, 1, 0, 0), BV8(0, 0, 1, 0, 1, 1, 0, 0), BV8(1, 0, 0, 0, 0, 1, 1, 0)) /* Bit-matrix from "θ₄(x)" function: */ .Lpre_filter_bitmatrix_s4: .quad BM8X8(BV8(1, 1, 0, 1, 1, 0, 1, 1), BV8(0, 1, 1, 0, 0, 1, 0, 0), BV8(1, 0, 1, 0, 0, 0, 0, 1), BV8(0, 1, 1, 0, 0, 1, 1, 1), BV8(0, 0, 0, 1, 1, 0, 0, 0), BV8(0, 1, 0, 0, 1, 0, 0, 1), BV8(0, 1, 0, 1, 1, 0, 0, 0), BV8(0, 0, 0, 0, 1, 1, 0, 1)) /* Bit-matrix from "ψ₁(A(x))" function: */ .Lpost_filter_bitmatrix_s14: .quad BM8X8(BV8(0, 0, 0, 0, 0, 0, 0, 1), BV8(0, 1, 1, 0, 0, 1, 1, 0), BV8(1, 0, 1, 1, 1, 1, 1, 0), BV8(0, 0, 0, 1, 1, 0, 1, 1), BV8(1, 0, 0, 0, 1, 1, 1, 0), BV8(0, 1, 0, 1, 1, 1, 1, 0), BV8(0, 1, 1, 1, 1, 1, 1, 1), BV8(0, 0, 0, 1, 1, 1, 0, 0)) /* Bit-matrix from "ψ₂(A(x))" function: */ .Lpost_filter_bitmatrix_s2: .quad BM8X8(BV8(0, 0, 0, 1, 1, 1, 0, 0), BV8(0, 0, 0, 0, 0, 0, 0, 1), BV8(0, 1, 1, 0, 0, 1, 1, 0), BV8(1, 0, 1, 1, 1, 1, 1, 0), BV8(0, 0, 0, 1, 1, 0, 1, 1), BV8(1, 0, 0, 0, 1, 1, 1, 0), BV8(0, 1, 0, 1, 1, 1, 1, 0), BV8(0, 1, 1, 1, 1, 1, 1, 1)) /* Bit-matrix from "ψ₃(A(x))" function: */ .Lpost_filter_bitmatrix_s3: .quad BM8X8(BV8(0, 1, 1, 0, 0, 1, 1, 0), BV8(1, 0, 1, 1, 1, 1, 1, 0), BV8(0, 0, 0, 1, 1, 0, 1, 1), BV8(1, 0, 0, 0, 1, 1, 1, 0), BV8(0, 1, 0, 1, 1, 1, 1, 0), BV8(0, 1, 1, 1, 1, 1, 1, 1), BV8(0, 0, 0, 1, 1, 1, 0, 0), BV8(0, 0, 0, 0, 0, 0, 0, 1)) #else /* CAMELLIA_GFNI_BUILD */ /* * pre-SubByte transform * * pre-lookup for sbox1, sbox2, sbox3: * swap_bitendianness( * isom_map_camellia_to_aes( * camellia_f( * swap_bitendianess(in) * ) * ) * ) * * (note: '⊕ 0xc5' inside camellia_f()) */ .Lpre_tf_lo_s1: .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86 .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88 .Lpre_tf_hi_s1: .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23 /* * pre-SubByte transform * * pre-lookup for sbox4: * swap_bitendianness( * isom_map_camellia_to_aes( * camellia_f( * swap_bitendianess(in <<< 1) * ) * ) * ) * * (note: '⊕ 0xc5' inside camellia_f()) */ .Lpre_tf_lo_s4: .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25 .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74 .Lpre_tf_hi_s4: .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72 .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf /* * post-SubByte transform * * post-lookup for sbox1, sbox4: * swap_bitendianness( * camellia_h( * isom_map_aes_to_camellia( * swap_bitendianness( * aes_inverse_affine_transform(in) * ) * ) * ) * 
) * * (note: '⊕ 0x6e' inside camellia_h()) */ .Lpost_tf_lo_s1: .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31 .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1 .Lpost_tf_hi_s1: .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8 .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c /* * post-SubByte transform * * post-lookup for sbox2: * swap_bitendianness( * camellia_h( * isom_map_aes_to_camellia( * swap_bitendianness( * aes_inverse_affine_transform(in) * ) * ) * ) * ) <<< 1 * * (note: '⊕ 0x6e' inside camellia_h()) */ .Lpost_tf_lo_s2: .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62 .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3 .Lpost_tf_hi_s2: .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51 .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18 /* * post-SubByte transform * * post-lookup for sbox3: * swap_bitendianness( * camellia_h( * isom_map_aes_to_camellia( * swap_bitendianness( * aes_inverse_affine_transform(in) * ) * ) * ) * ) >>> 1 * * (note: '⊕ 0x6e' inside camellia_h()) */ .Lpost_tf_lo_s3: .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98 .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8 .Lpost_tf_hi_s3: .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54 .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06 /* For isolating SubBytes from AESENCLAST, inverse shift row */ .Linv_shift_row: .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 .align 4 /* 4-bit mask */ .L0f0f0f0f: .long 0x0f0f0f0f #endif /* CAMELLIA_GFNI_BUILD */ ELF(.size FUNC_NAME(_constants),.-FUNC_NAME(_constants);) .align 8 -ELF(.type __camellia_enc_blk32,@function;) +ELF(.type FUNC_NAME(enc_blk32),@function;) -__camellia_enc_blk32: +FUNC_NAME(enc_blk32): /* input: * %rdi: ctx, CTX * %rax: temporary storage, 512 bytes * %r8d: 24 for 16 byte key, 32 for larger * %ymm0..%ymm15: 32 plaintext blocks * output: * %ymm0..%ymm15: 32 encrypted blocks, order swapped: * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 */ CFI_STARTPROC(); leaq 8 * 32(%rax), %rcx; leaq (-8 * 8)(CTX, %r8, 8), %r8; inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rax, %rcx); .align 8 .Lenc_loop: enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rax, %rcx, 0); cmpq %r8, CTX; je .Lenc_done; leaq (8 * 8)(CTX), CTX; fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, ((key_table) + 0)(CTX), ((key_table) + 4)(CTX), ((key_table) + 8)(CTX), ((key_table) + 12)(CTX)); jmp .Lenc_loop; .align 8 .Lenc_done: /* load CD for output */ vmovdqu 0 * 32(%rcx), %ymm8; vmovdqu 1 * 32(%rcx), %ymm9; vmovdqu 2 * 32(%rcx), %ymm10; vmovdqu 3 * 32(%rcx), %ymm11; vmovdqu 4 * 32(%rcx), %ymm12; vmovdqu 5 * 32(%rcx), %ymm13; vmovdqu 6 * 32(%rcx), %ymm14; vmovdqu 7 * 32(%rcx), %ymm15; outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, ((key_table) + 8 * 8)(%r8), (%rax), 1 * 32(%rax)); ret_spec_stop; CFI_ENDPROC(); -ELF(.size __camellia_enc_blk32,.-__camellia_enc_blk32;) +ELF(.size FUNC_NAME(enc_blk32),.-FUNC_NAME(enc_blk32);) .align 8 -ELF(.type __camellia_dec_blk32,@function;) +ELF(.type FUNC_NAME(dec_blk32),@function;) -__camellia_dec_blk32: +FUNC_NAME(dec_blk32): /* input: * %rdi: ctx, CTX * %rax: temporary storage, 512 bytes * %r8d: 24 for 16 byte key, 32 for 
larger - * %ymm0..%ymm15: 16 encrypted blocks + * %ymm0..%ymm15: 32 encrypted blocks * output: - * %ymm0..%ymm15: 16 plaintext blocks, order swapped: + * %ymm0..%ymm15: 32 plaintext blocks, order swapped: * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 */ CFI_STARTPROC(); movq %r8, %rcx; movq CTX, %r8 leaq (-8 * 8)(CTX, %rcx, 8), CTX; leaq 8 * 32(%rax), %rcx; inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rax, %rcx); .align 8 .Ldec_loop: dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rax, %rcx, 0); cmpq %r8, CTX; je .Ldec_done; fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, ((key_table) + 8)(CTX), ((key_table) + 12)(CTX), ((key_table) + 0)(CTX), ((key_table) + 4)(CTX)); leaq (-8 * 8)(CTX), CTX; jmp .Ldec_loop; .align 8 .Ldec_done: /* load CD for output */ vmovdqu 0 * 32(%rcx), %ymm8; vmovdqu 1 * 32(%rcx), %ymm9; vmovdqu 2 * 32(%rcx), %ymm10; vmovdqu 3 * 32(%rcx), %ymm11; vmovdqu 4 * 32(%rcx), %ymm12; vmovdqu 5 * 32(%rcx), %ymm13; vmovdqu 6 * 32(%rcx), %ymm14; vmovdqu 7 * 32(%rcx), %ymm15; outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax)); ret_spec_stop; CFI_ENDPROC(); -ELF(.size __camellia_dec_blk32,.-__camellia_dec_blk32;) +ELF(.size FUNC_NAME(dec_blk32),.-FUNC_NAME(dec_blk32);) #define inc_le128(x, minus_one, tmp) \ vpcmpeqq minus_one, x, tmp; \ vpsubq minus_one, x, x; \ vpslldq $8, tmp, tmp; \ vpsubq tmp, x, x; .align 8 .globl FUNC_NAME(ctr_enc) ELF(.type FUNC_NAME(ctr_enc),@function;) FUNC_NAME(ctr_enc): /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) * %rdx: src (32 blocks) * %rcx: iv (big endian, 128bit) */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); movq 8(%rcx), %r11; bswapq %r11; cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %eax; cmovel %eax, %r8d; /* max */ subq $(16 * 32), %rsp; andq $~63, %rsp; movq %rsp, %rax; vpcmpeqd %ymm15, %ymm15, %ymm15; vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */ /* load IV and byteswap */ vmovdqu (%rcx), %xmm0; vpshufb .Lbswap128_mask rRIP, %xmm0, %xmm0; vmovdqa %xmm0, %xmm1; inc_le128(%xmm0, %xmm15, %xmm14); vbroadcasti128 .Lbswap128_mask rRIP, %ymm14; vinserti128 $1, %xmm0, %ymm1, %ymm0; vpshufb %ymm14, %ymm0, %ymm13; vmovdqu %ymm13, 15 * 32(%rax); /* check need for handling 64-bit overflow and carry */ cmpq $(0xffffffffffffffff - 32), %r11; ja .Lload_ctr_carry; /* construct IVs */ vpaddq %ymm15, %ymm15, %ymm15; /* ab: -2:0 ; cd: -2:0 */ vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm13; vmovdqu %ymm13, 14 * 32(%rax); vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm13; vmovdqu %ymm13, 13 * 32(%rax); vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm12; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm11; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm10; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm9; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm8; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm7; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm6; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm5; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm4; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm3; 
vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm2; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm1; vpsubq %ymm15, %ymm0, %ymm0; /* +30 ; +31 */ vpsubq %xmm15, %xmm0, %xmm13; /* +32 */ vpshufb %ymm14, %ymm0, %ymm0; vpshufb %xmm14, %xmm13, %xmm13; vmovdqu %xmm13, (%rcx); jmp .Lload_ctr_done; .align 4 .Lload_ctr_carry: /* construct IVs */ inc_le128(%ymm0, %ymm15, %ymm13); /* ab: le1 ; cd: le2 */ inc_le128(%ymm0, %ymm15, %ymm13); /* ab: le2 ; cd: le3 */ vpshufb %ymm14, %ymm0, %ymm13; vmovdqu %ymm13, 14 * 32(%rax); inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm13; vmovdqu %ymm13, 13 * 32(%rax); inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm12; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm11; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm10; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm9; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm8; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm7; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm6; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm5; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm4; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm3; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm2; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm1; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vextracti128 $1, %ymm0, %xmm13; vpshufb %ymm14, %ymm0, %ymm0; inc_le128(%xmm13, %xmm15, %xmm14); vpshufb .Lbswap128_mask rRIP, %xmm13, %xmm13; vmovdqu %xmm13, (%rcx); .align 4 .Lload_ctr_done: - /* inpack16_pre: */ + /* inpack32_pre: */ vpbroadcastq (key_table)(CTX), %ymm15; vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; vpxor %ymm0, %ymm15, %ymm0; vpxor %ymm1, %ymm15, %ymm1; vpxor %ymm2, %ymm15, %ymm2; vpxor %ymm3, %ymm15, %ymm3; vpxor %ymm4, %ymm15, %ymm4; vpxor %ymm5, %ymm15, %ymm5; vpxor %ymm6, %ymm15, %ymm6; vpxor %ymm7, %ymm15, %ymm7; vpxor %ymm8, %ymm15, %ymm8; vpxor %ymm9, %ymm15, %ymm9; vpxor %ymm10, %ymm15, %ymm10; vpxor %ymm11, %ymm15, %ymm11; vpxor %ymm12, %ymm15, %ymm12; vpxor 13 * 32(%rax), %ymm15, %ymm13; vpxor 14 * 32(%rax), %ymm15, %ymm14; vpxor 15 * 32(%rax), %ymm15, %ymm15; - call __camellia_enc_blk32; + call FUNC_NAME(enc_blk32); vpxor 0 * 32(%rdx), %ymm7, %ymm7; vpxor 1 * 32(%rdx), %ymm6, %ymm6; vpxor 2 * 32(%rdx), %ymm5, %ymm5; vpxor 3 * 32(%rdx), %ymm4, %ymm4; vpxor 4 * 32(%rdx), %ymm3, %ymm3; vpxor 5 * 32(%rdx), %ymm2, %ymm2; vpxor 6 * 32(%rdx), %ymm1, %ymm1; vpxor 7 * 32(%rdx), %ymm0, %ymm0; vpxor 8 * 32(%rdx), %ymm15, %ymm15; vpxor 9 * 32(%rdx), %ymm14, %ymm14; vpxor 10 * 32(%rdx), %ymm13, %ymm13; vpxor 11 * 32(%rdx), %ymm12, %ymm12; vpxor 12 * 32(%rdx), %ymm11, %ymm11; vpxor 13 * 32(%rdx), %ymm10, %ymm10; vpxor 14 * 32(%rdx), %ymm9, %ymm9; vpxor 15 * 32(%rdx), %ymm8, %ymm8; - leaq 32 * 16(%rdx), %rdx; write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, %ymm8, %rsi); vzeroall; leave; CFI_LEAVE(); ret_spec_stop; 
CFI_ENDPROC(); ELF(.size FUNC_NAME(ctr_enc),.-FUNC_NAME(ctr_enc);) .align 8 .globl FUNC_NAME(cbc_dec) ELF(.type FUNC_NAME(cbc_dec),@function;) FUNC_NAME(cbc_dec): /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) * %rdx: src (32 blocks) * %rcx: iv */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); movq %rcx, %r9; cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %eax; cmovel %eax, %r8d; /* max */ subq $(16 * 32), %rsp; andq $~63, %rsp; movq %rsp, %rax; inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rdx, (key_table)(CTX, %r8, 8)); - call __camellia_dec_blk32; + call FUNC_NAME(dec_blk32); /* XOR output with IV */ vmovdqu %ymm8, (%rax); vmovdqu (%r9), %xmm8; vinserti128 $1, (%rdx), %ymm8, %ymm8; vpxor %ymm8, %ymm7, %ymm7; vmovdqu (%rax), %ymm8; vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6; vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5; vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4; vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3; vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2; vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1; vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0; vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15; vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14; vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13; vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12; vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11; vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10; vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9; vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8; movq (15 * 32 + 16 + 0)(%rdx), %rax; movq (15 * 32 + 16 + 8)(%rdx), %rcx; write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, %ymm8, %rsi); /* store new IV */ movq %rax, (0)(%r9); movq %rcx, (8)(%r9); vzeroall; leave; CFI_LEAVE(); ret_spec_stop; CFI_ENDPROC(); ELF(.size FUNC_NAME(cbc_dec),.-FUNC_NAME(cbc_dec);) .align 8 .globl FUNC_NAME(cfb_dec) ELF(.type FUNC_NAME(cfb_dec),@function;) FUNC_NAME(cfb_dec): /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) * %rdx: src (32 blocks) * %rcx: iv */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %eax; cmovel %eax, %r8d; /* max */ subq $(16 * 32), %rsp; andq $~63, %rsp; movq %rsp, %rax; - /* inpack16_pre: */ + /* inpack32_pre: */ vpbroadcastq (key_table)(CTX), %ymm0; vpshufb .Lpack_bswap rRIP, %ymm0, %ymm0; vmovdqu (%rcx), %xmm15; vinserti128 $1, (%rdx), %ymm15, %ymm15; vpxor %ymm15, %ymm0, %ymm15; vmovdqu (15 * 32 + 16)(%rdx), %xmm1; vmovdqu %xmm1, (%rcx); /* store new IV */ vpxor (0 * 32 + 16)(%rdx), %ymm0, %ymm14; vpxor (1 * 32 + 16)(%rdx), %ymm0, %ymm13; vpxor (2 * 32 + 16)(%rdx), %ymm0, %ymm12; vpxor (3 * 32 + 16)(%rdx), %ymm0, %ymm11; vpxor (4 * 32 + 16)(%rdx), %ymm0, %ymm10; vpxor (5 * 32 + 16)(%rdx), %ymm0, %ymm9; vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm8; vpxor (7 * 32 + 16)(%rdx), %ymm0, %ymm7; vpxor (8 * 32 + 16)(%rdx), %ymm0, %ymm6; vpxor (9 * 32 + 16)(%rdx), %ymm0, %ymm5; vpxor (10 * 32 + 16)(%rdx), %ymm0, %ymm4; vpxor (11 * 32 + 16)(%rdx), %ymm0, %ymm3; vpxor (12 * 32 + 16)(%rdx), %ymm0, %ymm2; vpxor (13 * 32 + 16)(%rdx), %ymm0, %ymm1; vpxor (14 * 32 + 16)(%rdx), %ymm0, %ymm0; - call __camellia_enc_blk32; + call FUNC_NAME(enc_blk32); vpxor 0 * 32(%rdx), %ymm7, %ymm7; vpxor 1 * 32(%rdx), %ymm6, %ymm6; vpxor 2 * 32(%rdx), %ymm5, %ymm5; vpxor 3 * 32(%rdx), %ymm4, %ymm4; vpxor 4 * 32(%rdx), %ymm3, %ymm3; vpxor 5 * 32(%rdx), %ymm2, %ymm2; vpxor 6 * 32(%rdx), %ymm1, %ymm1; vpxor 7 * 
32(%rdx), %ymm0, %ymm0; vpxor 8 * 32(%rdx), %ymm15, %ymm15; vpxor 9 * 32(%rdx), %ymm14, %ymm14; vpxor 10 * 32(%rdx), %ymm13, %ymm13; vpxor 11 * 32(%rdx), %ymm12, %ymm12; vpxor 12 * 32(%rdx), %ymm11, %ymm11; vpxor 13 * 32(%rdx), %ymm10, %ymm10; vpxor 14 * 32(%rdx), %ymm9, %ymm9; vpxor 15 * 32(%rdx), %ymm8, %ymm8; write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, %ymm8, %rsi); vzeroall; leave; CFI_LEAVE(); ret_spec_stop; CFI_ENDPROC(); ELF(.size FUNC_NAME(cfb_dec),.-FUNC_NAME(cfb_dec);) .align 8 .globl FUNC_NAME(ocb_enc) ELF(.type FUNC_NAME(ocb_enc),@function;) FUNC_NAME(ocb_enc): /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) * %rdx: src (32 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[32]) */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); subq $(16 * 32 + 4 * 8), %rsp; andq $~63, %rsp; movq %rsp, %rax; movq %r10, (16 * 32 + 0 * 8)(%rsp); movq %r11, (16 * 32 + 1 * 8)(%rsp); movq %r12, (16 * 32 + 2 * 8)(%rsp); movq %r13, (16 * 32 + 3 * 8)(%rsp); CFI_REG_ON_STACK(r10, 16 * 32 + 0 * 8); CFI_REG_ON_STACK(r11, 16 * 32 + 1 * 8); CFI_REG_ON_STACK(r12, 16 * 32 + 2 * 8); CFI_REG_ON_STACK(r13, 16 * 32 + 3 * 8); vmovdqu (%rcx), %xmm14; vmovdqu (%r8), %xmm13; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Checksum_i = Checksum_{i-1} xor P_i */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ #define OCB_INPUT(n, l0reg, l1reg, yreg) \ vmovdqu (n * 32)(%rdx), yreg; \ vpxor (l0reg), %xmm14, %xmm15; \ vpxor (l1reg), %xmm15, %xmm14; \ vinserti128 $1, %xmm14, %ymm15, %ymm15; \ vpxor yreg, %ymm13, %ymm13; \ vpxor yreg, %ymm15, yreg; \ vmovdqu %ymm15, (n * 32)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq (2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, %r11, %ymm0); vmovdqu %ymm0, (15 * 32)(%rax); OCB_INPUT(1, %r12, %r13, %ymm0); vmovdqu %ymm0, (14 * 32)(%rax); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(2, %r10, %r11, %ymm0); vmovdqu %ymm0, (13 * 32)(%rax); OCB_INPUT(3, %r12, %r13, %ymm12); movq (8 * 8)(%r9), %r10; movq (9 * 8)(%r9), %r11; movq (10 * 8)(%r9), %r12; movq (11 * 8)(%r9), %r13; OCB_INPUT(4, %r10, %r11, %ymm11); OCB_INPUT(5, %r12, %r13, %ymm10); movq (12 * 8)(%r9), %r10; movq (13 * 8)(%r9), %r11; movq (14 * 8)(%r9), %r12; movq (15 * 8)(%r9), %r13; OCB_INPUT(6, %r10, %r11, %ymm9); OCB_INPUT(7, %r12, %r13, %ymm8); movq (16 * 8)(%r9), %r10; movq (17 * 8)(%r9), %r11; movq (18 * 8)(%r9), %r12; movq (19 * 8)(%r9), %r13; OCB_INPUT(8, %r10, %r11, %ymm7); OCB_INPUT(9, %r12, %r13, %ymm6); movq (20 * 8)(%r9), %r10; movq (21 * 8)(%r9), %r11; movq (22 * 8)(%r9), %r12; movq (23 * 8)(%r9), %r13; OCB_INPUT(10, %r10, %r11, %ymm5); OCB_INPUT(11, %r12, %r13, %ymm4); movq (24 * 8)(%r9), %r10; movq (25 * 8)(%r9), %r11; movq (26 * 8)(%r9), %r12; movq (27 * 8)(%r9), %r13; OCB_INPUT(12, %r10, %r11, %ymm3); OCB_INPUT(13, %r12, %r13, %ymm2); movq (28 * 8)(%r9), %r10; movq (29 * 8)(%r9), %r11; movq (30 * 8)(%r9), %r12; movq (31 * 8)(%r9), %r13; OCB_INPUT(14, %r10, %r11, %ymm1); OCB_INPUT(15, %r12, %r13, %ymm0); #undef OCB_INPUT vextracti128 $1, %ymm13, %xmm15; vmovdqu %xmm14, (%rcx); vpxor %xmm13, %xmm15, %xmm15; vmovdqu %xmm15, (%r8); cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %r10d; cmovel %r10d, %r8d; /* max */ - /* inpack16_pre: */ + /* inpack32_pre: */ vpbroadcastq (key_table)(CTX), %ymm15; vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; vpxor %ymm0, %ymm15, %ymm0; 
vpxor %ymm1, %ymm15, %ymm1; vpxor %ymm2, %ymm15, %ymm2; vpxor %ymm3, %ymm15, %ymm3; vpxor %ymm4, %ymm15, %ymm4; vpxor %ymm5, %ymm15, %ymm5; vpxor %ymm6, %ymm15, %ymm6; vpxor %ymm7, %ymm15, %ymm7; vpxor %ymm8, %ymm15, %ymm8; vpxor %ymm9, %ymm15, %ymm9; vpxor %ymm10, %ymm15, %ymm10; vpxor %ymm11, %ymm15, %ymm11; vpxor %ymm12, %ymm15, %ymm12; vpxor 13 * 32(%rax), %ymm15, %ymm13; vpxor 14 * 32(%rax), %ymm15, %ymm14; vpxor 15 * 32(%rax), %ymm15, %ymm15; - call __camellia_enc_blk32; + call FUNC_NAME(enc_blk32); vpxor 0 * 32(%rsi), %ymm7, %ymm7; vpxor 1 * 32(%rsi), %ymm6, %ymm6; vpxor 2 * 32(%rsi), %ymm5, %ymm5; vpxor 3 * 32(%rsi), %ymm4, %ymm4; vpxor 4 * 32(%rsi), %ymm3, %ymm3; vpxor 5 * 32(%rsi), %ymm2, %ymm2; vpxor 6 * 32(%rsi), %ymm1, %ymm1; vpxor 7 * 32(%rsi), %ymm0, %ymm0; vpxor 8 * 32(%rsi), %ymm15, %ymm15; vpxor 9 * 32(%rsi), %ymm14, %ymm14; vpxor 10 * 32(%rsi), %ymm13, %ymm13; vpxor 11 * 32(%rsi), %ymm12, %ymm12; vpxor 12 * 32(%rsi), %ymm11, %ymm11; vpxor 13 * 32(%rsi), %ymm10, %ymm10; vpxor 14 * 32(%rsi), %ymm9, %ymm9; vpxor 15 * 32(%rsi), %ymm8, %ymm8; write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, %ymm8, %rsi); vzeroall; movq (16 * 32 + 0 * 8)(%rsp), %r10; movq (16 * 32 + 1 * 8)(%rsp), %r11; movq (16 * 32 + 2 * 8)(%rsp), %r12; movq (16 * 32 + 3 * 8)(%rsp), %r13; CFI_RESTORE(%r10); CFI_RESTORE(%r11); CFI_RESTORE(%r12); CFI_RESTORE(%r13); leave; CFI_LEAVE(); ret_spec_stop; CFI_ENDPROC(); ELF(.size FUNC_NAME(ocb_enc),.-FUNC_NAME(ocb_enc);) .align 8 .globl FUNC_NAME(ocb_dec) ELF(.type FUNC_NAME(ocb_dec),@function;) FUNC_NAME(ocb_dec): /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) * %rdx: src (32 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[32]) */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); subq $(16 * 32 + 4 * 8), %rsp; andq $~63, %rsp; movq %rsp, %rax; movq %r10, (16 * 32 + 0 * 8)(%rsp); movq %r11, (16 * 32 + 1 * 8)(%rsp); movq %r12, (16 * 32 + 2 * 8)(%rsp); movq %r13, (16 * 32 + 3 * 8)(%rsp); CFI_REG_ON_STACK(r10, 16 * 32 + 0 * 8); CFI_REG_ON_STACK(r11, 16 * 32 + 1 * 8); CFI_REG_ON_STACK(r12, 16 * 32 + 2 * 8); CFI_REG_ON_STACK(r13, 16 * 32 + 3 * 8); vmovdqu (%rcx), %xmm14; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ #define OCB_INPUT(n, l0reg, l1reg, yreg) \ vmovdqu (n * 32)(%rdx), yreg; \ vpxor (l0reg), %xmm14, %xmm15; \ vpxor (l1reg), %xmm15, %xmm14; \ vinserti128 $1, %xmm14, %ymm15, %ymm15; \ vpxor yreg, %ymm15, yreg; \ vmovdqu %ymm15, (n * 32)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq (2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, %r11, %ymm0); vmovdqu %ymm0, (15 * 32)(%rax); OCB_INPUT(1, %r12, %r13, %ymm0); vmovdqu %ymm0, (14 * 32)(%rax); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(2, %r10, %r11, %ymm13); OCB_INPUT(3, %r12, %r13, %ymm12); movq (8 * 8)(%r9), %r10; movq (9 * 8)(%r9), %r11; movq (10 * 8)(%r9), %r12; movq (11 * 8)(%r9), %r13; OCB_INPUT(4, %r10, %r11, %ymm11); OCB_INPUT(5, %r12, %r13, %ymm10); movq (12 * 8)(%r9), %r10; movq (13 * 8)(%r9), %r11; movq (14 * 8)(%r9), %r12; movq (15 * 8)(%r9), %r13; OCB_INPUT(6, %r10, %r11, %ymm9); OCB_INPUT(7, %r12, %r13, %ymm8); movq (16 * 8)(%r9), %r10; movq (17 * 8)(%r9), %r11; movq (18 * 8)(%r9), %r12; movq (19 * 8)(%r9), %r13; OCB_INPUT(8, %r10, %r11, %ymm7); OCB_INPUT(9, %r12, %r13, %ymm6); movq (20 * 8)(%r9), %r10; movq (21 * 
8)(%r9), %r11; movq (22 * 8)(%r9), %r12; movq (23 * 8)(%r9), %r13; OCB_INPUT(10, %r10, %r11, %ymm5); OCB_INPUT(11, %r12, %r13, %ymm4); movq (24 * 8)(%r9), %r10; movq (25 * 8)(%r9), %r11; movq (26 * 8)(%r9), %r12; movq (27 * 8)(%r9), %r13; OCB_INPUT(12, %r10, %r11, %ymm3); OCB_INPUT(13, %r12, %r13, %ymm2); movq (28 * 8)(%r9), %r10; movq (29 * 8)(%r9), %r11; movq (30 * 8)(%r9), %r12; movq (31 * 8)(%r9), %r13; OCB_INPUT(14, %r10, %r11, %ymm1); OCB_INPUT(15, %r12, %r13, %ymm0); #undef OCB_INPUT vmovdqu %xmm14, (%rcx); movq %r8, %r10; cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %r9d; cmovel %r9d, %r8d; /* max */ - /* inpack16_pre: */ + /* inpack32_pre: */ vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15; vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; vpxor %ymm0, %ymm15, %ymm0; vpxor %ymm1, %ymm15, %ymm1; vpxor %ymm2, %ymm15, %ymm2; vpxor %ymm3, %ymm15, %ymm3; vpxor %ymm4, %ymm15, %ymm4; vpxor %ymm5, %ymm15, %ymm5; vpxor %ymm6, %ymm15, %ymm6; vpxor %ymm7, %ymm15, %ymm7; vpxor %ymm8, %ymm15, %ymm8; vpxor %ymm9, %ymm15, %ymm9; vpxor %ymm10, %ymm15, %ymm10; vpxor %ymm11, %ymm15, %ymm11; vpxor %ymm12, %ymm15, %ymm12; vpxor %ymm13, %ymm15, %ymm13; vpxor 14 * 32(%rax), %ymm15, %ymm14; vpxor 15 * 32(%rax), %ymm15, %ymm15; - call __camellia_dec_blk32; + call FUNC_NAME(dec_blk32); vpxor 0 * 32(%rsi), %ymm7, %ymm7; vpxor 1 * 32(%rsi), %ymm6, %ymm6; vpxor 2 * 32(%rsi), %ymm5, %ymm5; vpxor 3 * 32(%rsi), %ymm4, %ymm4; vpxor 4 * 32(%rsi), %ymm3, %ymm3; vpxor 5 * 32(%rsi), %ymm2, %ymm2; vpxor 6 * 32(%rsi), %ymm1, %ymm1; vpxor 7 * 32(%rsi), %ymm0, %ymm0; vmovdqu %ymm7, (7 * 32)(%rax); vmovdqu %ymm6, (6 * 32)(%rax); vpxor 8 * 32(%rsi), %ymm15, %ymm15; vpxor 9 * 32(%rsi), %ymm14, %ymm14; vpxor 10 * 32(%rsi), %ymm13, %ymm13; vpxor 11 * 32(%rsi), %ymm12, %ymm12; vpxor 12 * 32(%rsi), %ymm11, %ymm11; vpxor 13 * 32(%rsi), %ymm10, %ymm10; vpxor 14 * 32(%rsi), %ymm9, %ymm9; vpxor 15 * 32(%rsi), %ymm8, %ymm8; /* Checksum_i = Checksum_{i-1} xor P_i */ vpxor %ymm5, %ymm7, %ymm7; vpxor %ymm4, %ymm6, %ymm6; vpxor %ymm3, %ymm7, %ymm7; vpxor %ymm2, %ymm6, %ymm6; vpxor %ymm1, %ymm7, %ymm7; vpxor %ymm0, %ymm6, %ymm6; vpxor %ymm15, %ymm7, %ymm7; vpxor %ymm14, %ymm6, %ymm6; vpxor %ymm13, %ymm7, %ymm7; vpxor %ymm12, %ymm6, %ymm6; vpxor %ymm11, %ymm7, %ymm7; vpxor %ymm10, %ymm6, %ymm6; vpxor %ymm9, %ymm7, %ymm7; vpxor %ymm8, %ymm6, %ymm6; vpxor %ymm7, %ymm6, %ymm7; vextracti128 $1, %ymm7, %xmm6; vpxor %xmm6, %xmm7, %xmm7; vpxor (%r10), %xmm7, %xmm7; vmovdqu %xmm7, (%r10); vmovdqu 7 * 32(%rax), %ymm7; vmovdqu 6 * 32(%rax), %ymm6; write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, %ymm8, %rsi); vzeroall; movq (16 * 32 + 0 * 8)(%rsp), %r10; movq (16 * 32 + 1 * 8)(%rsp), %r11; movq (16 * 32 + 2 * 8)(%rsp), %r12; movq (16 * 32 + 3 * 8)(%rsp), %r13; CFI_RESTORE(%r10); CFI_RESTORE(%r11); CFI_RESTORE(%r12); CFI_RESTORE(%r13); leave; CFI_LEAVE(); ret_spec_stop; CFI_ENDPROC(); ELF(.size FUNC_NAME(ocb_dec),.-FUNC_NAME(ocb_dec);) .align 8 .globl FUNC_NAME(ocb_auth) ELF(.type FUNC_NAME(ocb_auth),@function;) FUNC_NAME(ocb_auth): /* input: * %rdi: ctx, CTX * %rsi: abuf (16 blocks) * %rdx: offset * %rcx: checksum * %r8 : L pointers (void *L[16]) */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); subq $(16 * 32 + 4 * 8), %rsp; andq $~63, %rsp; movq %rsp, %rax; movq %r10, (16 * 32 + 0 * 8)(%rsp); movq %r11, (16 * 32 + 1 * 8)(%rsp); movq %r12, (16 * 32 + 2 * 8)(%rsp); movq %r13, (16 * 32 + 3 * 8)(%rsp); CFI_REG_ON_STACK(r10, 16 * 
32 + 0 * 8); CFI_REG_ON_STACK(r11, 16 * 32 + 1 * 8); CFI_REG_ON_STACK(r12, 16 * 32 + 2 * 8); CFI_REG_ON_STACK(r13, 16 * 32 + 3 * 8); vmovdqu (%rdx), %xmm14; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Checksum_i = Checksum_{i-1} xor P_i */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ #define OCB_INPUT(n, l0reg, l1reg, yreg) \ vmovdqu (n * 32)(%rsi), yreg; \ vpxor (l0reg), %xmm14, %xmm15; \ vpxor (l1reg), %xmm15, %xmm14; \ vinserti128 $1, %xmm14, %ymm15, %ymm15; \ vpxor yreg, %ymm15, yreg; movq (0 * 8)(%r8), %r10; movq (1 * 8)(%r8), %r11; movq (2 * 8)(%r8), %r12; movq (3 * 8)(%r8), %r13; OCB_INPUT(0, %r10, %r11, %ymm0); vmovdqu %ymm0, (15 * 32)(%rax); OCB_INPUT(1, %r12, %r13, %ymm0); vmovdqu %ymm0, (14 * 32)(%rax); movq (4 * 8)(%r8), %r10; movq (5 * 8)(%r8), %r11; movq (6 * 8)(%r8), %r12; movq (7 * 8)(%r8), %r13; OCB_INPUT(2, %r10, %r11, %ymm13); OCB_INPUT(3, %r12, %r13, %ymm12); movq (8 * 8)(%r8), %r10; movq (9 * 8)(%r8), %r11; movq (10 * 8)(%r8), %r12; movq (11 * 8)(%r8), %r13; OCB_INPUT(4, %r10, %r11, %ymm11); OCB_INPUT(5, %r12, %r13, %ymm10); movq (12 * 8)(%r8), %r10; movq (13 * 8)(%r8), %r11; movq (14 * 8)(%r8), %r12; movq (15 * 8)(%r8), %r13; OCB_INPUT(6, %r10, %r11, %ymm9); OCB_INPUT(7, %r12, %r13, %ymm8); movq (16 * 8)(%r8), %r10; movq (17 * 8)(%r8), %r11; movq (18 * 8)(%r8), %r12; movq (19 * 8)(%r8), %r13; OCB_INPUT(8, %r10, %r11, %ymm7); OCB_INPUT(9, %r12, %r13, %ymm6); movq (20 * 8)(%r8), %r10; movq (21 * 8)(%r8), %r11; movq (22 * 8)(%r8), %r12; movq (23 * 8)(%r8), %r13; OCB_INPUT(10, %r10, %r11, %ymm5); OCB_INPUT(11, %r12, %r13, %ymm4); movq (24 * 8)(%r8), %r10; movq (25 * 8)(%r8), %r11; movq (26 * 8)(%r8), %r12; movq (27 * 8)(%r8), %r13; OCB_INPUT(12, %r10, %r11, %ymm3); OCB_INPUT(13, %r12, %r13, %ymm2); movq (28 * 8)(%r8), %r10; movq (29 * 8)(%r8), %r11; movq (30 * 8)(%r8), %r12; movq (31 * 8)(%r8), %r13; OCB_INPUT(14, %r10, %r11, %ymm1); OCB_INPUT(15, %r12, %r13, %ymm0); #undef OCB_INPUT vmovdqu %xmm14, (%rdx); cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %r10d; cmovel %r10d, %r8d; /* max */ movq %rcx, %r10; - /* inpack16_pre: */ + /* inpack32_pre: */ vpbroadcastq (key_table)(CTX), %ymm15; vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; vpxor %ymm0, %ymm15, %ymm0; vpxor %ymm1, %ymm15, %ymm1; vpxor %ymm2, %ymm15, %ymm2; vpxor %ymm3, %ymm15, %ymm3; vpxor %ymm4, %ymm15, %ymm4; vpxor %ymm5, %ymm15, %ymm5; vpxor %ymm6, %ymm15, %ymm6; vpxor %ymm7, %ymm15, %ymm7; vpxor %ymm8, %ymm15, %ymm8; vpxor %ymm9, %ymm15, %ymm9; vpxor %ymm10, %ymm15, %ymm10; vpxor %ymm11, %ymm15, %ymm11; vpxor %ymm12, %ymm15, %ymm12; vpxor %ymm13, %ymm15, %ymm13; vpxor 14 * 32(%rax), %ymm15, %ymm14; vpxor 15 * 32(%rax), %ymm15, %ymm15; - call __camellia_enc_blk32; + call FUNC_NAME(enc_blk32); vpxor %ymm7, %ymm6, %ymm6; vpxor %ymm5, %ymm4, %ymm4; vpxor %ymm3, %ymm2, %ymm2; vpxor %ymm1, %ymm0, %ymm0; vpxor %ymm15, %ymm14, %ymm14; vpxor %ymm13, %ymm12, %ymm12; vpxor %ymm11, %ymm10, %ymm10; vpxor %ymm9, %ymm8, %ymm8; vpxor %ymm6, %ymm4, %ymm4; vpxor %ymm2, %ymm0, %ymm0; vpxor %ymm14, %ymm12, %ymm12; vpxor %ymm10, %ymm8, %ymm8; vpxor %ymm4, %ymm0, %ymm0; vpxor %ymm12, %ymm8, %ymm8; vpxor %ymm0, %ymm8, %ymm0; vextracti128 $1, %ymm0, %xmm1; vpxor (%r10), %xmm0, %xmm0; vpxor %xmm0, %xmm1, %xmm0; vmovdqu %xmm0, (%r10); vzeroall; movq (16 * 32 + 0 * 8)(%rsp), %r10; movq (16 * 32 + 1 * 8)(%rsp), %r11; movq (16 * 32 + 2 * 8)(%rsp), %r12; movq (16 * 32 + 3 * 8)(%rsp), %r13; CFI_RESTORE(%r10); CFI_RESTORE(%r11); CFI_RESTORE(%r12); CFI_RESTORE(%r13); leave; CFI_LEAVE(); ret_spec_stop; CFI_ENDPROC(); 
ELF(.size FUNC_NAME(ocb_auth),.-FUNC_NAME(ocb_auth);) .align 8 .globl FUNC_NAME(enc_blk1_32) ELF(.type FUNC_NAME(enc_blk1_32),@function;) FUNC_NAME(enc_blk1_32): /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) * %rdx: src (32 blocks) * %ecx: nblocks (1 to 32) */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); movl %ecx, %r9d; cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %eax; cmovel %eax, %r8d; /* max */ subq $(16 * 32), %rsp; andq $~63, %rsp; movq %rsp, %rax; cmpl $31, %ecx; vpxor %xmm0, %xmm0, %xmm0; ja 1f; jb 2f; vmovdqu 15 * 32(%rdx), %xmm0; jmp 2f; 1: vmovdqu 15 * 32(%rdx), %ymm0; 2: vmovdqu %ymm0, (%rax); vpbroadcastq (key_table)(CTX), %ymm0; vpshufb .Lpack_bswap rRIP, %ymm0, %ymm0; #define LOAD_INPUT(offset, ymm) \ cmpl $(1 + 2 * (offset)), %ecx; \ jb 2f; \ ja 1f; \ vmovdqu (offset) * 32(%rdx), %ymm##_x; \ vpxor %ymm0, %ymm, %ymm; \ jmp 2f; \ 1: \ vpxor (offset) * 32(%rdx), %ymm0, %ymm; LOAD_INPUT(0, ymm15); LOAD_INPUT(1, ymm14); LOAD_INPUT(2, ymm13); LOAD_INPUT(3, ymm12); LOAD_INPUT(4, ymm11); LOAD_INPUT(5, ymm10); LOAD_INPUT(6, ymm9); LOAD_INPUT(7, ymm8); LOAD_INPUT(8, ymm7); LOAD_INPUT(9, ymm6); LOAD_INPUT(10, ymm5); LOAD_INPUT(11, ymm4); LOAD_INPUT(12, ymm3); LOAD_INPUT(13, ymm2); LOAD_INPUT(14, ymm1); vpxor (%rax), %ymm0, %ymm0; 2: - call __camellia_enc_blk32; + call FUNC_NAME(enc_blk32); #define STORE_OUTPUT(ymm, offset) \ cmpl $(1 + 2 * (offset)), %r9d; \ jb 2f; \ ja 1f; \ vmovdqu %ymm##_x, (offset) * 32(%rsi); \ jmp 2f; \ 1: \ vmovdqu %ymm, (offset) * 32(%rsi); STORE_OUTPUT(ymm7, 0); STORE_OUTPUT(ymm6, 1); STORE_OUTPUT(ymm5, 2); STORE_OUTPUT(ymm4, 3); STORE_OUTPUT(ymm3, 4); STORE_OUTPUT(ymm2, 5); STORE_OUTPUT(ymm1, 6); STORE_OUTPUT(ymm0, 7); STORE_OUTPUT(ymm15, 8); STORE_OUTPUT(ymm14, 9); STORE_OUTPUT(ymm13, 10); STORE_OUTPUT(ymm12, 11); STORE_OUTPUT(ymm11, 12); STORE_OUTPUT(ymm10, 13); STORE_OUTPUT(ymm9, 14); STORE_OUTPUT(ymm8, 15); 2: vzeroall; leave; CFI_LEAVE(); ret_spec_stop; CFI_ENDPROC(); ELF(.size FUNC_NAME(enc_blk1_32),.-FUNC_NAME(enc_blk1_32);) .align 8 .globl FUNC_NAME(dec_blk1_32) ELF(.type FUNC_NAME(dec_blk1_32),@function;) FUNC_NAME(dec_blk1_32): /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) * %rdx: src (32 blocks) * %ecx: nblocks (1 to 32) */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); movl %ecx, %r9d; cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %eax; cmovel %eax, %r8d; /* max */ subq $(16 * 32), %rsp; andq $~63, %rsp; movq %rsp, %rax; cmpl $31, %ecx; vpxor %xmm0, %xmm0, %xmm0; ja 1f; jb 2f; vmovdqu 15 * 32(%rdx), %xmm0; jmp 2f; 1: vmovdqu 15 * 32(%rdx), %ymm0; 2: vmovdqu %ymm0, (%rax); vpbroadcastq (key_table)(CTX, %r8, 8), %ymm0; vpshufb .Lpack_bswap rRIP, %ymm0, %ymm0; LOAD_INPUT(0, ymm15); LOAD_INPUT(1, ymm14); LOAD_INPUT(2, ymm13); LOAD_INPUT(3, ymm12); LOAD_INPUT(4, ymm11); LOAD_INPUT(5, ymm10); LOAD_INPUT(6, ymm9); LOAD_INPUT(7, ymm8); LOAD_INPUT(8, ymm7); LOAD_INPUT(9, ymm6); LOAD_INPUT(10, ymm5); LOAD_INPUT(11, ymm4); LOAD_INPUT(12, ymm3); LOAD_INPUT(13, ymm2); LOAD_INPUT(14, ymm1); vpxor (%rax), %ymm0, %ymm0; 2: - call __camellia_dec_blk32; + call FUNC_NAME(dec_blk32); STORE_OUTPUT(ymm7, 0); STORE_OUTPUT(ymm6, 1); STORE_OUTPUT(ymm5, 2); STORE_OUTPUT(ymm4, 3); STORE_OUTPUT(ymm3, 4); STORE_OUTPUT(ymm2, 5); STORE_OUTPUT(ymm1, 6); STORE_OUTPUT(ymm0, 7); STORE_OUTPUT(ymm15, 8); STORE_OUTPUT(ymm14, 9); STORE_OUTPUT(ymm13, 10); STORE_OUTPUT(ymm12, 11); STORE_OUTPUT(ymm11, 12); STORE_OUTPUT(ymm10, 13); STORE_OUTPUT(ymm9, 14); 
	STORE_OUTPUT(ymm8, 15);

2:
	vzeroall;

	leave;
	CFI_LEAVE();
	ret_spec_stop;
	CFI_ENDPROC();
ELF(.size FUNC_NAME(dec_blk1_32),.-FUNC_NAME(dec_blk1_32);)

#endif /* GCRY_CAMELLIA_AESNI_AVX2_AMD64_H */
diff --git a/cipher/camellia-gfni-avx512-amd64.S b/cipher/camellia-gfni-avx512-amd64.S
new file mode 100644
index 00000000..70e10460
--- /dev/null
+++ b/cipher/camellia-gfni-avx512-amd64.S
@@ -0,0 +1,1566 @@
+/* camellia-gfni-avx512-amd64.S - GFNI/AVX512 implementation of Camellia
+ *
+ * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#ifdef __x86_64
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+    defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX512_SUPPORT)
+
+#include "asm-common-amd64.h"
+
+#define CAMELLIA_TABLE_BYTE_LEN 272
+
+/* struct CAMELLIA_context: */
+#define key_table 0
+#define key_bitlength CAMELLIA_TABLE_BYTE_LEN
+
+/* register macros */
+#define CTX %rdi
+#define RIO %r8
+
+/**********************************************************************
+  helper macros
+ **********************************************************************/
+
+#define zmm0_x xmm0
+#define zmm1_x xmm1
+#define zmm2_x xmm2
+#define zmm3_x xmm3
+#define zmm4_x xmm4
+#define zmm5_x xmm5
+#define zmm6_x xmm6
+#define zmm7_x xmm7
+#define zmm8_x xmm8
+#define zmm9_x xmm9
+#define zmm10_x xmm10
+#define zmm11_x xmm11
+#define zmm12_x xmm12
+#define zmm13_x xmm13
+#define zmm14_x xmm14
+#define zmm15_x xmm15
+
+#define zmm0_y ymm0
+#define zmm1_y ymm1
+#define zmm2_y ymm2
+#define zmm3_y ymm3
+#define zmm4_y ymm4
+#define zmm5_y ymm5
+#define zmm6_y ymm6
+#define zmm7_y ymm7
+#define zmm8_y ymm8
+#define zmm9_y ymm9
+#define zmm10_y ymm10
+#define zmm11_y ymm11
+#define zmm12_y ymm12
+#define zmm13_y ymm13
+#define zmm14_y ymm14
+#define zmm15_y ymm15
+
+#define mem_ab_0 %zmm16
+#define mem_ab_1 %zmm17
+#define mem_ab_2 %zmm31
+#define mem_ab_3 %zmm18
+#define mem_ab_4 %zmm19
+#define mem_ab_5 %zmm20
+#define mem_ab_6 %zmm21
+#define mem_ab_7 %zmm22
+#define mem_cd_0 %zmm23
+#define mem_cd_1 %zmm24
+#define mem_cd_2 %zmm30
+#define mem_cd_3 %zmm25
+#define mem_cd_4 %zmm26
+#define mem_cd_5 %zmm27
+#define mem_cd_6 %zmm28
+#define mem_cd_7 %zmm29
+
+#define clear_vec4(v0,v1,v2,v3) \
+	vpxord v0, v0, v0; \
+	vpxord v1, v1, v1; \
+	vpxord v2, v2, v2; \
+	vpxord v3, v3, v3
+
+#define clear_zmm16_zmm31() \
+	clear_vec4(%xmm16, %xmm20, %xmm24, %xmm28); \
+	clear_vec4(%xmm17, %xmm21, %xmm25, %xmm29); \
+	clear_vec4(%xmm18, %xmm22, %xmm26, %xmm30); \
+	clear_vec4(%xmm19, %xmm23, %xmm27, %xmm31)
+
+#define clear_regs() \
+	kxorq %k1, %k1, %k1; \
+	vzeroall; \
+	clear_zmm16_zmm31()
+
+/**********************************************************************
+  GFNI helper macros and constants
+ **********************************************************************/
+
+#define 
BV8(a0,a1,a2,a3,a4,a5,a6,a7) \ + ( (((a0) & 1) << 0) | \ + (((a1) & 1) << 1) | \ + (((a2) & 1) << 2) | \ + (((a3) & 1) << 3) | \ + (((a4) & 1) << 4) | \ + (((a5) & 1) << 5) | \ + (((a6) & 1) << 6) | \ + (((a7) & 1) << 7) ) + +#define BM8X8(l0,l1,l2,l3,l4,l5,l6,l7) \ + ( ((l7) << (0 * 8)) | \ + ((l6) << (1 * 8)) | \ + ((l5) << (2 * 8)) | \ + ((l4) << (3 * 8)) | \ + ((l3) << (4 * 8)) | \ + ((l2) << (5 * 8)) | \ + ((l1) << (6 * 8)) | \ + ((l0) << (7 * 8)) ) + +/* Pre-filters and post-filters constants for Camellia sboxes s1, s2, s3 and s4. + * See http://urn.fi/URN:NBN:fi:oulu-201305311409, pages 43-48. + * + * Pre-filters are directly from above source, "θ₁"/"θ₄". Post-filters are + * combination of function "A" (AES SubBytes affine transformation) and + * "ψ₁"/"ψ₂"/"ψ₃". + */ + +/* Constant from "θ₁(x)" and "θ₄(x)" functions. */ +#define pre_filter_constant_s1234 BV8(1, 0, 1, 0, 0, 0, 1, 0) + +/* Constant from "ψ₁(A(x))" function: */ +#define post_filter_constant_s14 BV8(0, 1, 1, 1, 0, 1, 1, 0) + +/* Constant from "ψ₂(A(x))" function: */ +#define post_filter_constant_s2 BV8(0, 0, 1, 1, 1, 0, 1, 1) + +/* Constant from "ψ₃(A(x))" function: */ +#define post_filter_constant_s3 BV8(1, 1, 1, 0, 1, 1, 0, 0) + +/********************************************************************** + 64-way parallel camellia + **********************************************************************/ + +/* roundsm64 (GFNI/AVX512 version) + * IN: + * x0..x7: byte-sliced AB state + * mem_cd: register pointer storing CD state + * key: index for key material + * OUT: + * x0..x7: new byte-sliced CD state + */ +#define roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, \ + t6, t7, mem_cd, key) \ + /* \ + * S-function with AES subbytes \ + */ \ + vpbroadcastq .Lpre_filter_bitmatrix_s123 rRIP, t5; \ + vpbroadcastq .Lpre_filter_bitmatrix_s4 rRIP, t2; \ + vpbroadcastq .Lpost_filter_bitmatrix_s14 rRIP, t4; \ + vpbroadcastq .Lpost_filter_bitmatrix_s2 rRIP, t3; \ + vpbroadcastq .Lpost_filter_bitmatrix_s3 rRIP, t6; \ + vpxor t7##_x, t7##_x, t7##_x; \ + vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \ + \ + /* prefilter sboxes */ \ + vgf2p8affineqb $(pre_filter_constant_s1234), t5, x0, x0; \ + vgf2p8affineqb $(pre_filter_constant_s1234), t5, x7, x7; \ + vgf2p8affineqb $(pre_filter_constant_s1234), t2, x3, x3; \ + vgf2p8affineqb $(pre_filter_constant_s1234), t2, x6, x6; \ + vgf2p8affineqb $(pre_filter_constant_s1234), t5, x2, x2; \ + vgf2p8affineqb $(pre_filter_constant_s1234), t5, x5, x5; \ + vgf2p8affineqb $(pre_filter_constant_s1234), t5, x1, x1; \ + vgf2p8affineqb $(pre_filter_constant_s1234), t5, x4, x4; \ + \ + /* sbox GF8 inverse + postfilter sboxes 1 and 4 */ \ + vgf2p8affineinvqb $(post_filter_constant_s14), t4, x0, x0; \ + vgf2p8affineinvqb $(post_filter_constant_s14), t4, x7, x7; \ + vgf2p8affineinvqb $(post_filter_constant_s14), t4, x3, x3; \ + vgf2p8affineinvqb $(post_filter_constant_s14), t4, x6, x6; \ + \ + /* sbox GF8 inverse + postfilter sbox 3 */ \ + vgf2p8affineinvqb $(post_filter_constant_s3), t6, x2, x2; \ + vgf2p8affineinvqb $(post_filter_constant_s3), t6, x5, x5; \ + \ + /* sbox GF8 inverse + postfilter sbox 2 */ \ + vgf2p8affineinvqb $(post_filter_constant_s2), t3, x1, x1; \ + vgf2p8affineinvqb $(post_filter_constant_s2), t3, x4, x4; \ + \ + vpsrldq $1, t0, t1; \ + vpsrldq $2, t0, t2; \ + vpshufb t7, t1, t1; \ + vpsrldq $3, t0, t3; \ + \ + /* P-function */ \ + vpxorq x5, x0, x0; \ + vpxorq x6, x1, x1; \ + vpxorq x7, x2, x2; \ + vpxorq x4, x3, x3; \ + \ + vpshufb t7, t2, t2; \ + 
vpsrldq $4, t0, t4; \ + vpshufb t7, t3, t3; \ + vpsrldq $5, t0, t5; \ + vpshufb t7, t4, t4; \ + \ + vpxorq x2, x4, x4; \ + vpxorq x3, x5, x5; \ + vpxorq x0, x6, x6; \ + vpxorq x1, x7, x7; \ + \ + vpsrldq $6, t0, t6; \ + vpshufb t7, t5, t5; \ + vpshufb t7, t6, t6; \ + \ + vpxorq x7, x0, x0; \ + vpxorq x4, x1, x1; \ + vpxorq x5, x2, x2; \ + vpxorq x6, x3, x3; \ + \ + vpxorq x3, x4, x4; \ + vpxorq x0, x5, x5; \ + vpxorq x1, x6, x6; \ + vpxorq x2, x7, x7; /* note: high and low parts swapped */ \ + \ + /* Add key material and result to CD (x becomes new CD) */ \ + \ + vpternlogq $0x96, mem_cd##_5, t6, x1; \ + \ + vpsrldq $7, t0, t6; \ + vpshufb t7, t0, t0; \ + vpshufb t7, t6, t7; \ + \ + vpternlogq $0x96, mem_cd##_4, t7, x0; \ + vpternlogq $0x96, mem_cd##_6, t5, x2; \ + vpternlogq $0x96, mem_cd##_7, t4, x3; \ + vpternlogq $0x96, mem_cd##_0, t3, x4; \ + vpternlogq $0x96, mem_cd##_1, t2, x5; \ + vpternlogq $0x96, mem_cd##_2, t1, x6; \ + vpternlogq $0x96, mem_cd##_3, t0, x7; + +/* + * IN/OUT: + * x0..x7: byte-sliced AB state preloaded + * mem_ab: byte-sliced AB state in memory + * mem_cb: byte-sliced CD state in memory + */ +#define two_roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, i, dir, store_ab) \ + roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_cd, (key_table + (i) * 8)(CTX)); \ + \ + vmovdqu64 x0, mem_cd##_4; \ + vmovdqu64 x1, mem_cd##_5; \ + vmovdqu64 x2, mem_cd##_6; \ + vmovdqu64 x3, mem_cd##_7; \ + vmovdqu64 x4, mem_cd##_0; \ + vmovdqu64 x5, mem_cd##_1; \ + vmovdqu64 x6, mem_cd##_2; \ + vmovdqu64 x7, mem_cd##_3; \ + \ + roundsm64(x4, x5, x6, x7, x0, x1, x2, x3, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, (key_table + ((i) + (dir)) * 8)(CTX)); \ + \ + store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab); + +#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */ + +#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \ + /* Store new AB state */ \ + vmovdqu64 x4, mem_ab##_4; \ + vmovdqu64 x5, mem_ab##_5; \ + vmovdqu64 x6, mem_ab##_6; \ + vmovdqu64 x7, mem_ab##_7; \ + vmovdqu64 x0, mem_ab##_0; \ + vmovdqu64 x1, mem_ab##_1; \ + vmovdqu64 x2, mem_ab##_2; \ + vmovdqu64 x3, mem_ab##_3; + +#define enc_rounds64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, i) \ + two_roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \ + two_roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \ + two_roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store); + +#define dec_rounds64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, i) \ + two_roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \ + two_roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \ + two_roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store); + +/* + * IN: + * v0..3: byte-sliced 32-bit integers + * OUT: + * v0..3: (IN << 1) + * t0, t1, t2, zero: (IN >> 7) + */ +#define rol32_1_64(v0, v1, v2, v3, t0, t1, t2, zero, one) \ + vpcmpltb zero, v0, %k1; \ + vpaddb v0, v0, v0; \ + vpaddb one, zero, t0{%k1}{z}; \ + \ + vpcmpltb zero, v1, %k1; \ + vpaddb v1, v1, v1; \ 
+ vpaddb one, zero, t1{%k1}{z}; \ + \ + vpcmpltb zero, v2, %k1; \ + vpaddb v2, v2, v2; \ + vpaddb one, zero, t2{%k1}{z}; \ + \ + vpcmpltb zero, v3, %k1; \ + vpaddb v3, v3, v3; \ + vpaddb one, zero, zero{%k1}{z}; + +/* + * IN: + * r: byte-sliced AB state in memory + * l: byte-sliced CD state in memory + * OUT: + * x0..x7: new byte-sliced CD state + */ +#define fls64(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \ + tt1, tt2, tt3, kll, klr, krl, krr, tmp) \ + /* \ + * t0 = kll; \ + * t0 &= ll; \ + * lr ^= rol32(t0, 1); \ + */ \ + vpbroadcastd kll, t0; /* only lowest 32-bit used */ \ + vpbroadcastq .Lbyte_ones rRIP, tmp; \ + vpxor tt3##_x, tt3##_x, tt3##_x; \ + vpshufb tt3, t0, t3; \ + vpsrldq $1, t0, t0; \ + vpshufb tt3, t0, t2; \ + vpsrldq $1, t0, t0; \ + vpshufb tt3, t0, t1; \ + vpsrldq $1, t0, t0; \ + vpshufb tt3, t0, t0; \ + \ + vpandq l0, t0, t0; \ + vpandq l1, t1, t1; \ + vpandq l2, t2, t2; \ + vpandq l3, t3, t3; \ + \ + rol32_1_64(t3, t2, t1, t0, tt0, tt1, tt2, tt3, tmp); \ + \ + vpternlogq $0x96, tt2, t0, l4; \ + vpbroadcastd krr, t0; /* only lowest 32-bit used */ \ + vmovdqu64 l4, l##_4; \ + vpternlogq $0x96, tt1, t1, l5; \ + vmovdqu64 l5, l##_5; \ + vpternlogq $0x96, tt0, t2, l6; \ + vmovdqu64 l6, l##_6; \ + vpternlogq $0x96, tt3, t3, l7; \ + vmovdqu64 l7, l##_7; \ + vpxor tt3##_x, tt3##_x, tt3##_x; \ + \ + /* \ + * t2 = krr; \ + * t2 |= rr; \ + * rl ^= t2; \ + */ \ + \ + vpshufb tt3, t0, t3; \ + vpsrldq $1, t0, t0; \ + vpshufb tt3, t0, t2; \ + vpsrldq $1, t0, t0; \ + vpshufb tt3, t0, t1; \ + vpsrldq $1, t0, t0; \ + vpshufb tt3, t0, t0; \ + \ + vpternlogq $0x1e, r##_4, t0, r##_0; \ + vpbroadcastd krl, t0; /* only lowest 32-bit used */ \ + vpternlogq $0x1e, r##_5, t1, r##_1; \ + vpternlogq $0x1e, r##_6, t2, r##_2; \ + vpternlogq $0x1e, r##_7, t3, r##_3; \ + \ + /* \ + * t2 = krl; \ + * t2 &= rl; \ + * rr ^= rol32(t2, 1); \ + */ \ + vpshufb tt3, t0, t3; \ + vpsrldq $1, t0, t0; \ + vpshufb tt3, t0, t2; \ + vpsrldq $1, t0, t0; \ + vpshufb tt3, t0, t1; \ + vpsrldq $1, t0, t0; \ + vpshufb tt3, t0, t0; \ + \ + vpandq r##_0, t0, t0; \ + vpandq r##_1, t1, t1; \ + vpandq r##_2, t2, t2; \ + vpandq r##_3, t3, t3; \ + \ + rol32_1_64(t3, t2, t1, t0, tt0, tt1, tt2, tt3, tmp); \ + \ + vpternlogq $0x96, tt2, t0, r##_4; \ + vpbroadcastd klr, t0; /* only lowest 32-bit used */ \ + vpternlogq $0x96, tt1, t1, r##_5; \ + vpternlogq $0x96, tt0, t2, r##_6; \ + vpternlogq $0x96, tt3, t3, r##_7; \ + vpxor tt3##_x, tt3##_x, tt3##_x; \ + \ + /* \ + * t0 = klr; \ + * t0 |= lr; \ + * ll ^= t0; \ + */ \ + \ + vpshufb tt3, t0, t3; \ + vpsrldq $1, t0, t0; \ + vpshufb tt3, t0, t2; \ + vpsrldq $1, t0, t0; \ + vpshufb tt3, t0, t1; \ + vpsrldq $1, t0, t0; \ + vpshufb tt3, t0, t0; \ + \ + vpternlogq $0x1e, l4, t0, l0; \ + vmovdqu64 l0, l##_0; \ + vpternlogq $0x1e, l5, t1, l1; \ + vmovdqu64 l1, l##_1; \ + vpternlogq $0x1e, l6, t2, l2; \ + vmovdqu64 l2, l##_2; \ + vpternlogq $0x1e, l7, t3, l3; \ + vmovdqu64 l3, l##_3; + +#define transpose_4x4(x0, x1, x2, x3, t1, t2) \ + vpunpckhdq x1, x0, t2; \ + vpunpckldq x1, x0, x0; \ + \ + vpunpckldq x3, x2, t1; \ + vpunpckhdq x3, x2, x2; \ + \ + vpunpckhqdq t1, x0, x1; \ + vpunpcklqdq t1, x0, x0; \ + \ + vpunpckhqdq x2, t2, x3; \ + vpunpcklqdq x2, t2, x2; + +#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \ + a3, b3, c3, d3, st0, st1) \ + transpose_4x4(a0, a1, a2, a3, st0, st1); \ + transpose_4x4(b0, b1, b2, b3, st0, st1); \ + \ + transpose_4x4(c0, c1, c2, c3, st0, st1); \ + transpose_4x4(d0, d1, d2, d3, st0, st1); \ + \ + vbroadcasti64x2 
.Lshufb_16x16b rRIP, st0; \ + vpshufb st0, a0, a0; \ + vpshufb st0, a1, a1; \ + vpshufb st0, a2, a2; \ + vpshufb st0, a3, a3; \ + vpshufb st0, b0, b0; \ + vpshufb st0, b1, b1; \ + vpshufb st0, b2, b2; \ + vpshufb st0, b3, b3; \ + vpshufb st0, c0, c0; \ + vpshufb st0, c1, c1; \ + vpshufb st0, c2, c2; \ + vpshufb st0, c3, c3; \ + vpshufb st0, d0, d0; \ + vpshufb st0, d1, d1; \ + vpshufb st0, d2, d2; \ + vpshufb st0, d3, d3; \ + \ + transpose_4x4(a0, b0, c0, d0, st0, st1); \ + transpose_4x4(a1, b1, c1, d1, st0, st1); \ + \ + transpose_4x4(a2, b2, c2, d2, st0, st1); \ + transpose_4x4(a3, b3, c3, d3, st0, st1); \ + /* does not adjust output bytes inside vectors */ + +/* load blocks to registers and apply pre-whitening */ +#define inpack64_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, rio, key) \ + vpbroadcastq key, x0; \ + vpshufb .Lpack_bswap rRIP, x0, x0; \ + \ + vpxorq 0 * 64(rio), x0, y7; \ + vpxorq 1 * 64(rio), x0, y6; \ + vpxorq 2 * 64(rio), x0, y5; \ + vpxorq 3 * 64(rio), x0, y4; \ + vpxorq 4 * 64(rio), x0, y3; \ + vpxorq 5 * 64(rio), x0, y2; \ + vpxorq 6 * 64(rio), x0, y1; \ + vpxorq 7 * 64(rio), x0, y0; \ + vpxorq 8 * 64(rio), x0, x7; \ + vpxorq 9 * 64(rio), x0, x6; \ + vpxorq 10 * 64(rio), x0, x5; \ + vpxorq 11 * 64(rio), x0, x4; \ + vpxorq 12 * 64(rio), x0, x3; \ + vpxorq 13 * 64(rio), x0, x2; \ + vpxorq 14 * 64(rio), x0, x1; \ + vpxorq 15 * 64(rio), x0, x0; + +/* byteslice pre-whitened blocks and store to temporary memory */ +#define inpack64_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, tmp0, tmp1) \ + byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \ + y4, y5, y6, y7, tmp0, tmp1); \ + \ + vmovdqu64 x0, mem_ab##_0; \ + vmovdqu64 x1, mem_ab##_1; \ + vmovdqu64 x2, mem_ab##_2; \ + vmovdqu64 x3, mem_ab##_3; \ + vmovdqu64 x4, mem_ab##_4; \ + vmovdqu64 x5, mem_ab##_5; \ + vmovdqu64 x6, mem_ab##_6; \ + vmovdqu64 x7, mem_ab##_7; \ + vmovdqu64 y0, mem_cd##_0; \ + vmovdqu64 y1, mem_cd##_1; \ + vmovdqu64 y2, mem_cd##_2; \ + vmovdqu64 y3, mem_cd##_3; \ + vmovdqu64 y4, mem_cd##_4; \ + vmovdqu64 y5, mem_cd##_5; \ + vmovdqu64 y6, mem_cd##_6; \ + vmovdqu64 y7, mem_cd##_7; + +/* de-byteslice, apply post-whitening and store blocks */ +#define outunpack64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \ + y5, y6, y7, key, tmp0, tmp1) \ + byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \ + y3, y7, x3, x7, tmp0, tmp1); \ + \ + vpbroadcastq key, tmp0; \ + vpshufb .Lpack_bswap rRIP, tmp0, tmp0; \ + \ + vpxorq tmp0, y7, y7; \ + vpxorq tmp0, y6, y6; \ + vpxorq tmp0, y5, y5; \ + vpxorq tmp0, y4, y4; \ + vpxorq tmp0, y3, y3; \ + vpxorq tmp0, y2, y2; \ + vpxorq tmp0, y1, y1; \ + vpxorq tmp0, y0, y0; \ + vpxorq tmp0, x7, x7; \ + vpxorq tmp0, x6, x6; \ + vpxorq tmp0, x5, x5; \ + vpxorq tmp0, x4, x4; \ + vpxorq tmp0, x3, x3; \ + vpxorq tmp0, x2, x2; \ + vpxorq tmp0, x1, x1; \ + vpxorq tmp0, x0, x0; + +#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, rio) \ + vmovdqu64 x0, 0 * 64(rio); \ + vmovdqu64 x1, 1 * 64(rio); \ + vmovdqu64 x2, 2 * 64(rio); \ + vmovdqu64 x3, 3 * 64(rio); \ + vmovdqu64 x4, 4 * 64(rio); \ + vmovdqu64 x5, 5 * 64(rio); \ + vmovdqu64 x6, 6 * 64(rio); \ + vmovdqu64 x7, 7 * 64(rio); \ + vmovdqu64 y0, 8 * 64(rio); \ + vmovdqu64 y1, 9 * 64(rio); \ + vmovdqu64 y2, 10 * 64(rio); \ + vmovdqu64 y3, 11 * 64(rio); \ + vmovdqu64 y4, 12 * 64(rio); \ + vmovdqu64 y5, 13 * 64(rio); \ + vmovdqu64 y6, 14 * 64(rio); \ + vmovdqu64 y7, 15 * 64(rio); + +.text + 
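+/* Reference model of the GFNI affine step used by roundsm64 above (a rough,
+ * illustrative C sketch; the function name gf2p8affine_byte is hypothetical
+ * and not part of this file).  BV8() packs one matrix row LSB-first into a
+ * byte and BM8X8() places row l0 in the most-significant byte of the 64-bit
+ * matrix, matching how vgf2p8affineqb/vgf2p8affineinvqb interpret it:
+ *
+ *   uint8_t gf2p8affine_byte(uint64_t m, uint8_t x, uint8_t imm)
+ *   {
+ *     uint8_t out = 0;
+ *     int i;
+ *     for (i = 0; i < 8; i++)
+ *       {
+ *         uint8_t row = (uint8_t)(m >> (8 * (7 - i)));      // row l<i> of BM8X8
+ *         out |= (uint8_t)(__builtin_parity(row & x) << i); // GF(2) dot product
+ *       }
+ *     return out ^ imm;  // imm is the pre/post_filter_constant_* byte
+ *   }
+ *
+ * The *affineinvqb variants apply the GF(2^8) inverse (AES field, 0x11B) to x
+ * before this transform, which is how the sbox core is shared with AES.
+ */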
+#define SHUFB_BYTES(idx) \
+	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+
+_gcry_camellia_gfni_avx512__constants:
+ELF(.type _gcry_camellia_gfni_avx512__constants,@object;)
+
+.align 64
+.Lpack_bswap:
+	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
+	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
+	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
+	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
+
+.Lcounter0123_lo:
+	.quad 0, 0
+	.quad 1, 0
+	.quad 2, 0
+	.quad 3, 0
+
+.align 16
+.Lcounter4444_lo:
+	.quad 4, 0
+.Lcounter8888_lo:
+	.quad 8, 0
+.Lcounter16161616_lo:
+	.quad 16, 0
+.Lcounter1111_hi:
+	.quad 0, 1
+
+.Lshufb_16x16b:
+	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+.Lbyte_ones:
+	.byte 1, 1, 1, 1, 1, 1, 1, 1
+
+/* Pre-filters and post-filters bit-matrices for Camellia sboxes s1, s2, s3
+ * and s4.
+ * See http://urn.fi/URN:NBN:fi:oulu-201305311409, pages 43-48.
+ *
+ * Pre-filters are directly from above source, "θ₁"/"θ₄". Post-filters are
+ * a combination of function "A" (AES SubBytes affine transformation) and
+ * "ψ₁"/"ψ₂"/"ψ₃".
+ */
+
+/* Bit-matrix from "θ₁(x)" function: */
+.Lpre_filter_bitmatrix_s123:
+	.quad BM8X8(BV8(1, 1, 1, 0, 1, 1, 0, 1),
+		    BV8(0, 0, 1, 1, 0, 0, 1, 0),
+		    BV8(1, 1, 0, 1, 0, 0, 0, 0),
+		    BV8(1, 0, 1, 1, 0, 0, 1, 1),
+		    BV8(0, 0, 0, 0, 1, 1, 0, 0),
+		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
+		    BV8(0, 0, 1, 0, 1, 1, 0, 0),
+		    BV8(1, 0, 0, 0, 0, 1, 1, 0))
+
+/* Bit-matrix from "θ₄(x)" function: */
+.Lpre_filter_bitmatrix_s4:
+	.quad BM8X8(BV8(1, 1, 0, 1, 1, 0, 1, 1),
+		    BV8(0, 1, 1, 0, 0, 1, 0, 0),
+		    BV8(1, 0, 1, 0, 0, 0, 0, 1),
+		    BV8(0, 1, 1, 0, 0, 1, 1, 1),
+		    BV8(0, 0, 0, 1, 1, 0, 0, 0),
+		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
+		    BV8(0, 1, 0, 1, 1, 0, 0, 0),
+		    BV8(0, 0, 0, 0, 1, 1, 0, 1))
+
+/* Bit-matrix from "ψ₁(A(x))" function: */
+.Lpost_filter_bitmatrix_s14:
+	.quad BM8X8(BV8(0, 0, 0, 0, 0, 0, 0, 1),
+		    BV8(0, 1, 1, 0, 0, 1, 1, 0),
+		    BV8(1, 0, 1, 1, 1, 1, 1, 0),
+		    BV8(0, 0, 0, 1, 1, 0, 1, 1),
+		    BV8(1, 0, 0, 0, 1, 1, 1, 0),
+		    BV8(0, 1, 0, 1, 1, 1, 1, 0),
+		    BV8(0, 1, 1, 1, 1, 1, 1, 1),
+		    BV8(0, 0, 0, 1, 1, 1, 0, 0))
+
+/* Bit-matrix from "ψ₂(A(x))" function: */
+.Lpost_filter_bitmatrix_s2:
+	.quad BM8X8(BV8(0, 0, 0, 1, 1, 1, 0, 0),
+		    BV8(0, 0, 0, 0, 0, 0, 0, 1),
+		    BV8(0, 1, 1, 0, 0, 1, 1, 0),
+		    BV8(1, 0, 1, 1, 1, 1, 1, 0),
+		    BV8(0, 0, 0, 1, 1, 0, 1, 1),
+		    BV8(1, 0, 0, 0, 1, 1, 1, 0),
+		    BV8(0, 1, 0, 1, 1, 1, 1, 0),
+		    BV8(0, 1, 1, 1, 1, 1, 1, 1))
+
+/* Bit-matrix from "ψ₃(A(x))" function: */
+.Lpost_filter_bitmatrix_s3:
+	.quad BM8X8(BV8(0, 1, 1, 0, 0, 1, 1, 0),
+		    BV8(1, 0, 1, 1, 1, 1, 1, 0),
+		    BV8(0, 0, 0, 1, 1, 0, 1, 1),
+		    BV8(1, 0, 0, 0, 1, 1, 1, 0),
+		    BV8(0, 1, 0, 1, 1, 1, 1, 0),
+		    BV8(0, 1, 1, 1, 1, 1, 1, 1),
+		    BV8(0, 0, 0, 1, 1, 1, 0, 0),
+		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
+
+ELF(.size _gcry_camellia_gfni_avx512__constants,.-_gcry_camellia_gfni_avx512__constants;)
+
+.align 8
+ELF(.type __camellia_gfni_avx512_enc_blk64,@function;)
+
+__camellia_gfni_avx512_enc_blk64:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%r8d: 24 for 16 byte key, 32 for larger
+	 *	%zmm0..%zmm15: 64 plaintext blocks
+	 * output:
+	 *	%zmm0..%zmm15: 64 encrypted blocks, order swapped:
+	 *	 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+	 */
+
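+	/* Key schedule walk of the .Lenc_loop below, roughly (an illustrative
+	 * C sketch; six_rounds, fl_layer and output_whitening are hypothetical
+	 * helper names, k is a uint64_t pointer to key_table and r8 is 24 or
+	 * 32 as documented above):
+	 *
+	 *   uint64_t *end = k + (r8 - 8);   // start of the last 6-round block
+	 *   six_rounds(k + 2);              // enc_rounds64: keys k[2]..k[7]
+	 *   while (k != end)
+	 *     {
+	 *       k += 8;                     // 8 qwords of key per iteration
+	 *       fl_layer(k);                // fls64: kll/klr/krl/krr at k[0..1]
+	 *       six_rounds(k + 2);
+	 *     }
+	 *   output_whitening(k + 8);        // outunpack64: final whitening key
+	 */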
CFI_STARTPROC(); + + leaq (-8 * 8)(CTX, %r8, 8), %r8; + + inpack64_post(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7, + %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14, + %zmm15, mem_ab, mem_cd, %zmm30, %zmm31); + +.align 8 +.Lenc_loop: + enc_rounds64(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7, + %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14, + %zmm15, mem_ab, mem_cd, 0); + + cmpq %r8, CTX; + je .Lenc_done; + leaq (8 * 8)(CTX), CTX; + + fls64(mem_ab, %zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7, + mem_cd, %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14, + %zmm15, + ((key_table) + 0)(CTX), + ((key_table) + 4)(CTX), + ((key_table) + 8)(CTX), + ((key_table) + 12)(CTX), + %zmm31); + jmp .Lenc_loop; + +.align 8 +.Lenc_done: + /* load CD for output */ + vmovdqu64 mem_cd_0, %zmm8; + vmovdqu64 mem_cd_1, %zmm9; + vmovdqu64 mem_cd_2, %zmm10; + vmovdqu64 mem_cd_3, %zmm11; + vmovdqu64 mem_cd_4, %zmm12; + vmovdqu64 mem_cd_5, %zmm13; + vmovdqu64 mem_cd_6, %zmm14; + vmovdqu64 mem_cd_7, %zmm15; + + outunpack64(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7, + %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14, + %zmm15, ((key_table) + 8 * 8)(%r8), %zmm30, %zmm31); + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size __camellia_gfni_avx512_enc_blk64,.-__camellia_gfni_avx512_enc_blk64;) + +.align 8 +ELF(.type __camellia_gfni_avx512_dec_blk64,@function;) + +__camellia_gfni_avx512_dec_blk64: + /* input: + * %rdi: ctx, CTX + * %r8d: 24 for 16 byte key, 32 for larger + * %zmm0..%zmm15: 64 encrypted blocks + * output: + * %zmm0..%zmm15: 64 plaintext blocks, order swapped: + * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 + */ + CFI_STARTPROC(); + + movq %r8, %rcx; + movq CTX, %r8 + leaq (-8 * 8)(CTX, %rcx, 8), CTX; + + inpack64_post(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7, + %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14, + %zmm15, mem_ab, mem_cd, %zmm30, %zmm31); + +.align 8 +.Ldec_loop: + dec_rounds64(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7, + %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14, + %zmm15, mem_ab, mem_cd, 0); + + cmpq %r8, CTX; + je .Ldec_done; + + fls64(mem_ab, %zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7, + mem_cd, %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14, + %zmm15, + ((key_table) + 8)(CTX), + ((key_table) + 12)(CTX), + ((key_table) + 0)(CTX), + ((key_table) + 4)(CTX), + %zmm31); + + leaq (-8 * 8)(CTX), CTX; + jmp .Ldec_loop; + +.align 8 +.Ldec_done: + /* load CD for output */ + vmovdqu64 mem_cd_0, %zmm8; + vmovdqu64 mem_cd_1, %zmm9; + vmovdqu64 mem_cd_2, %zmm10; + vmovdqu64 mem_cd_3, %zmm11; + vmovdqu64 mem_cd_4, %zmm12; + vmovdqu64 mem_cd_5, %zmm13; + vmovdqu64 mem_cd_6, %zmm14; + vmovdqu64 mem_cd_7, %zmm15; + + outunpack64(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7, + %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14, + %zmm15, (key_table)(CTX), %zmm30, %zmm31); + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size __camellia_gfni_avx512_dec_blk64,.-__camellia_gfni_avx512_dec_blk64;) + +#define add_le128(out, in, lo_counter, hi_counter1) \ + vpaddq lo_counter, in, out; \ + vpcmpuq $1, lo_counter, out, %k1; \ + kaddb %k1, %k1, %k1; \ + vpaddq hi_counter1, out, out{%k1}; + +.align 8 +.globl _gcry_camellia_gfni_avx512_ctr_enc +ELF(.type _gcry_camellia_gfni_avx512_ctr_enc,@function;) + +_gcry_camellia_gfni_avx512_ctr_enc: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (64 blocks) + * %rdx: src (64 blocks) + * %rcx: iv (big endian, 128bit) + */ + CFI_STARTPROC(); + 
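+	/* Counter construction, roughly (illustrative; lo/hi denote the two
+	 * 64-bit halves of one little-endian 128-bit lane, inc the per-lane
+	 * increment loaded from the .Lcounter* constants):
+	 *
+	 *   out.lo = in.lo + inc.lo;
+	 *   carry  = (out.lo < inc.lo);  // vpcmpuq $1 = unsigned less-than
+	 *   out.hi = in.hi + carry;      // inc.hi is 0 here; the carry mask is
+	 *                                // moved to the hi-qword lane position
+	 *                                // by kaddb %k1, %k1, %k1
+	 *
+	 * The IV is byte-swapped to little endian first so that the fast path
+	 * below can use plain vpaddq; the add_le128() form above is only needed
+	 * when the low 64 bits may wrap within the next 64 blocks.
+	 */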
vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + + vbroadcasti64x2 .Lbswap128_mask rRIP, %zmm19; + vmovdqa64 .Lcounter0123_lo rRIP, %zmm21; + vbroadcasti64x2 .Lcounter4444_lo rRIP, %zmm22; + vbroadcasti64x2 .Lcounter8888_lo rRIP, %zmm23; + vbroadcasti64x2 .Lcounter16161616_lo rRIP, %zmm24; + vbroadcasti64x2 .Lcounter1111_hi rRIP, %zmm25; + + /* load IV and byteswap */ + movq 8(%rcx), %r11; + movq (%rcx), %r10; + bswapq %r11; + bswapq %r10; + vbroadcasti64x2 (%rcx), %zmm0; + vpshufb %zmm19, %zmm0, %zmm0; + + cmpl $128, key_bitlength(CTX); + movl $32, %r8d; + movl $24, %eax; + cmovel %eax, %r8d; /* max */ + + /* check need for handling 64-bit overflow and carry */ + cmpq $(0xffffffffffffffff - 64), %r11; + ja .Lload_ctr_carry; + + /* construct IVs */ + vpaddq %zmm21, %zmm0, %zmm15; /* +0:+1:+2:+3 */ + vpaddq %zmm22, %zmm15, %zmm14; /* +4:+5:+6:+7 */ + vpaddq %zmm23, %zmm15, %zmm13; /* +8:+9:+10:+11 */ + vpaddq %zmm23, %zmm14, %zmm12; /* +12:+13:+14:+15 */ + vpaddq %zmm24, %zmm15, %zmm11; /* +16... */ + vpaddq %zmm24, %zmm14, %zmm10; /* +20... */ + vpaddq %zmm24, %zmm13, %zmm9; /* +24... */ + vpaddq %zmm24, %zmm12, %zmm8; /* +28... */ + vpaddq %zmm24, %zmm11, %zmm7; /* +32... */ + vpaddq %zmm24, %zmm10, %zmm6; /* +36... */ + vpaddq %zmm24, %zmm9, %zmm5; /* +40... */ + vpaddq %zmm24, %zmm8, %zmm4; /* +44... */ + vpaddq %zmm24, %zmm7, %zmm3; /* +48... */ + vpaddq %zmm24, %zmm6, %zmm2; /* +52... */ + vpaddq %zmm24, %zmm5, %zmm1; /* +56... */ + vpaddq %zmm24, %zmm4, %zmm0; /* +60... */ + jmp .Lload_ctr_done; + +.align 4 +.Lload_ctr_carry: + /* construct IVs */ + add_le128(%zmm15, %zmm0, %zmm21, %zmm25); /* +0:+1:+2:+3 */ + add_le128(%zmm14, %zmm15, %zmm22, %zmm25); /* +4:+5:+6:+7 */ + add_le128(%zmm13, %zmm15, %zmm23, %zmm25); /* +8:+9:+10:+11 */ + add_le128(%zmm12, %zmm14, %zmm23, %zmm25); /* +12:+13:+14:+15 */ + add_le128(%zmm11, %zmm15, %zmm24, %zmm25); /* +16... */ + add_le128(%zmm10, %zmm14, %zmm24, %zmm25); /* +20... */ + add_le128(%zmm9, %zmm13, %zmm24, %zmm25); /* +24... */ + add_le128(%zmm8, %zmm12, %zmm24, %zmm25); /* +28... */ + add_le128(%zmm7, %zmm11, %zmm24, %zmm25); /* +32... */ + add_le128(%zmm6, %zmm10, %zmm24, %zmm25); /* +36... */ + add_le128(%zmm5, %zmm9, %zmm24, %zmm25); /* +40... */ + add_le128(%zmm4, %zmm8, %zmm24, %zmm25); /* +44... */ + add_le128(%zmm3, %zmm7, %zmm24, %zmm25); /* +48... */ + add_le128(%zmm2, %zmm6, %zmm24, %zmm25); /* +52... */ + add_le128(%zmm1, %zmm5, %zmm24, %zmm25); /* +56... */ + add_le128(%zmm0, %zmm4, %zmm24, %zmm25); /* +60... */ + +.align 4 +.Lload_ctr_done: + vpbroadcastq (key_table)(CTX), %zmm16; + vpshufb .Lpack_bswap rRIP, %zmm16, %zmm16; + + /* Byte-swap IVs and update counter. 
*/ + addq $64, %r11; + adcq $0, %r10; + vpshufb %zmm19, %zmm15, %zmm15; + vpshufb %zmm19, %zmm14, %zmm14; + vpshufb %zmm19, %zmm13, %zmm13; + vpshufb %zmm19, %zmm12, %zmm12; + vpshufb %zmm19, %zmm11, %zmm11; + vpshufb %zmm19, %zmm10, %zmm10; + vpshufb %zmm19, %zmm9, %zmm9; + vpshufb %zmm19, %zmm8, %zmm8; + bswapq %r11; + bswapq %r10; + vpshufb %zmm19, %zmm7, %zmm7; + vpshufb %zmm19, %zmm6, %zmm6; + vpshufb %zmm19, %zmm5, %zmm5; + vpshufb %zmm19, %zmm4, %zmm4; + vpshufb %zmm19, %zmm3, %zmm3; + vpshufb %zmm19, %zmm2, %zmm2; + vpshufb %zmm19, %zmm1, %zmm1; + vpshufb %zmm19, %zmm0, %zmm0; + movq %r11, 8(%rcx); + movq %r10, (%rcx); + + /* inpack64_pre: */ + vpxorq %zmm0, %zmm16, %zmm0; + vpxorq %zmm1, %zmm16, %zmm1; + vpxorq %zmm2, %zmm16, %zmm2; + vpxorq %zmm3, %zmm16, %zmm3; + vpxorq %zmm4, %zmm16, %zmm4; + vpxorq %zmm5, %zmm16, %zmm5; + vpxorq %zmm6, %zmm16, %zmm6; + vpxorq %zmm7, %zmm16, %zmm7; + vpxorq %zmm8, %zmm16, %zmm8; + vpxorq %zmm9, %zmm16, %zmm9; + vpxorq %zmm10, %zmm16, %zmm10; + vpxorq %zmm11, %zmm16, %zmm11; + vpxorq %zmm12, %zmm16, %zmm12; + vpxorq %zmm13, %zmm16, %zmm13; + vpxorq %zmm14, %zmm16, %zmm14; + vpxorq %zmm15, %zmm16, %zmm15; + + call __camellia_gfni_avx512_enc_blk64; + + vpxorq 0 * 64(%rdx), %zmm7, %zmm7; + vpxorq 1 * 64(%rdx), %zmm6, %zmm6; + vpxorq 2 * 64(%rdx), %zmm5, %zmm5; + vpxorq 3 * 64(%rdx), %zmm4, %zmm4; + vpxorq 4 * 64(%rdx), %zmm3, %zmm3; + vpxorq 5 * 64(%rdx), %zmm2, %zmm2; + vpxorq 6 * 64(%rdx), %zmm1, %zmm1; + vpxorq 7 * 64(%rdx), %zmm0, %zmm0; + vpxorq 8 * 64(%rdx), %zmm15, %zmm15; + vpxorq 9 * 64(%rdx), %zmm14, %zmm14; + vpxorq 10 * 64(%rdx), %zmm13, %zmm13; + vpxorq 11 * 64(%rdx), %zmm12, %zmm12; + vpxorq 12 * 64(%rdx), %zmm11, %zmm11; + vpxorq 13 * 64(%rdx), %zmm10, %zmm10; + vpxorq 14 * 64(%rdx), %zmm9, %zmm9; + vpxorq 15 * 64(%rdx), %zmm8, %zmm8; + + write_output(%zmm7, %zmm6, %zmm5, %zmm4, %zmm3, %zmm2, %zmm1, %zmm0, + %zmm15, %zmm14, %zmm13, %zmm12, %zmm11, %zmm10, %zmm9, + %zmm8, %rsi); + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_camellia_gfni_avx512_ctr_enc,.-_gcry_camellia_gfni_avx512_ctr_enc;) + +.align 8 +.globl _gcry_camellia_gfni_avx512_cbc_dec +ELF(.type _gcry_camellia_gfni_avx512_cbc_dec,@function;) + +_gcry_camellia_gfni_avx512_cbc_dec: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (64 blocks) + * %rdx: src (64 blocks) + * %rcx: iv + */ + CFI_STARTPROC(); + vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + + movq %rcx, %r9; + + cmpl $128, key_bitlength(CTX); + movl $32, %r8d; + movl $24, %eax; + cmovel %eax, %r8d; /* max */ + + inpack64_pre(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7, + %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14, + %zmm15, %rdx, (key_table)(CTX, %r8, 8)); + + call __camellia_gfni_avx512_dec_blk64; + + /* XOR output with IV */ + vmovdqu64 (%r9), %xmm16; + vinserti64x2 $1, (0 * 16)(%rdx), %ymm16, %ymm16; + vinserti64x4 $1, (1 * 16)(%rdx), %zmm16, %zmm16; + vpxorq %zmm16, %zmm7, %zmm7; + vpxorq (0 * 64 + 48)(%rdx), %zmm6, %zmm6; + vpxorq (1 * 64 + 48)(%rdx), %zmm5, %zmm5; + vpxorq (2 * 64 + 48)(%rdx), %zmm4, %zmm4; + vpxorq (3 * 64 + 48)(%rdx), %zmm3, %zmm3; + vpxorq (4 * 64 + 48)(%rdx), %zmm2, %zmm2; + vpxorq (5 * 64 + 48)(%rdx), %zmm1, %zmm1; + vpxorq (6 * 64 + 48)(%rdx), %zmm0, %zmm0; + vpxorq (7 * 64 + 48)(%rdx), %zmm15, %zmm15; + vpxorq (8 * 64 + 48)(%rdx), %zmm14, %zmm14; + vpxorq (9 * 64 + 48)(%rdx), %zmm13, %zmm13; + vpxorq (10 * 64 + 48)(%rdx), %zmm12, %zmm12; + vpxorq (11 * 64 + 48)(%rdx), %zmm11, %zmm11; + vpxorq (12 * 64 + 48)(%rdx), %zmm10, %zmm10; + vpxorq (13 * 64 
+ 48)(%rdx), %zmm9, %zmm9; + vpxorq (14 * 64 + 48)(%rdx), %zmm8, %zmm8; + vmovdqu64 (15 * 64 + 48)(%rdx), %xmm16; + + write_output(%zmm7, %zmm6, %zmm5, %zmm4, %zmm3, %zmm2, %zmm1, %zmm0, + %zmm15, %zmm14, %zmm13, %zmm12, %zmm11, %zmm10, %zmm9, + %zmm8, %rsi); + + /* store new IV */ + vmovdqu64 %xmm16, (0)(%r9); + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_camellia_gfni_avx512_cbc_dec,.-_gcry_camellia_gfni_avx512_cbc_dec;) + +.align 8 +.globl _gcry_camellia_gfni_avx512_cfb_dec +ELF(.type _gcry_camellia_gfni_avx512_cfb_dec,@function;) + +_gcry_camellia_gfni_avx512_cfb_dec: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (32 blocks) + * %rdx: src (32 blocks) + * %rcx: iv + */ + CFI_STARTPROC(); + vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + + cmpl $128, key_bitlength(CTX); + movl $32, %r8d; + movl $24, %eax; + cmovel %eax, %r8d; /* max */ + + /* inpack64_pre: */ + vpbroadcastq (key_table)(CTX), %zmm0; + vpshufb .Lpack_bswap rRIP, %zmm0, %zmm0; + vmovdqu64 (%rcx), %xmm15; + vinserti64x2 $1, (%rdx), %ymm15, %ymm15; + vinserti64x4 $1, 16(%rdx), %zmm15, %zmm15; + vpxorq %zmm15, %zmm0, %zmm15; + vpxorq (0 * 64 + 48)(%rdx), %zmm0, %zmm14; + vpxorq (1 * 64 + 48)(%rdx), %zmm0, %zmm13; + vpxorq (2 * 64 + 48)(%rdx), %zmm0, %zmm12; + vpxorq (3 * 64 + 48)(%rdx), %zmm0, %zmm11; + vpxorq (4 * 64 + 48)(%rdx), %zmm0, %zmm10; + vpxorq (5 * 64 + 48)(%rdx), %zmm0, %zmm9; + vpxorq (6 * 64 + 48)(%rdx), %zmm0, %zmm8; + vpxorq (7 * 64 + 48)(%rdx), %zmm0, %zmm7; + vpxorq (8 * 64 + 48)(%rdx), %zmm0, %zmm6; + vpxorq (9 * 64 + 48)(%rdx), %zmm0, %zmm5; + vpxorq (10 * 64 + 48)(%rdx), %zmm0, %zmm4; + vpxorq (11 * 64 + 48)(%rdx), %zmm0, %zmm3; + vpxorq (12 * 64 + 48)(%rdx), %zmm0, %zmm2; + vpxorq (13 * 64 + 48)(%rdx), %zmm0, %zmm1; + vpxorq (14 * 64 + 48)(%rdx), %zmm0, %zmm0; + vmovdqu64 (15 * 64 + 48)(%rdx), %xmm16; + vmovdqu64 %xmm16, (%rcx); /* store new IV */ + + call __camellia_gfni_avx512_enc_blk64; + + vpxorq 0 * 64(%rdx), %zmm7, %zmm7; + vpxorq 1 * 64(%rdx), %zmm6, %zmm6; + vpxorq 2 * 64(%rdx), %zmm5, %zmm5; + vpxorq 3 * 64(%rdx), %zmm4, %zmm4; + vpxorq 4 * 64(%rdx), %zmm3, %zmm3; + vpxorq 5 * 64(%rdx), %zmm2, %zmm2; + vpxorq 6 * 64(%rdx), %zmm1, %zmm1; + vpxorq 7 * 64(%rdx), %zmm0, %zmm0; + vpxorq 8 * 64(%rdx), %zmm15, %zmm15; + vpxorq 9 * 64(%rdx), %zmm14, %zmm14; + vpxorq 10 * 64(%rdx), %zmm13, %zmm13; + vpxorq 11 * 64(%rdx), %zmm12, %zmm12; + vpxorq 12 * 64(%rdx), %zmm11, %zmm11; + vpxorq 13 * 64(%rdx), %zmm10, %zmm10; + vpxorq 14 * 64(%rdx), %zmm9, %zmm9; + vpxorq 15 * 64(%rdx), %zmm8, %zmm8; + + write_output(%zmm7, %zmm6, %zmm5, %zmm4, %zmm3, %zmm2, %zmm1, %zmm0, + %zmm15, %zmm14, %zmm13, %zmm12, %zmm11, %zmm10, %zmm9, + %zmm8, %rsi); + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_camellia_gfni_avx512_cfb_dec,.-_gcry_camellia_gfni_avx512_cfb_dec;) + +.align 8 +.globl _gcry_camellia_gfni_avx512_ocb_enc +ELF(.type _gcry_camellia_gfni_avx512_ocb_enc,@function;) + +_gcry_camellia_gfni_avx512_ocb_enc: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (64 blocks) + * %rdx: src (64 blocks) + * %rcx: offset + * %r8 : checksum + * %r9 : L pointers (void *L[64]) + */ + CFI_STARTPROC(); + vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + + pushq %r12; + CFI_PUSH(%r12); + pushq %r13; + CFI_PUSH(%r13); + pushq %r14; + CFI_PUSH(%r14); + pushq %r15; + CFI_PUSH(%r15); + pushq %rbx; + CFI_PUSH(%rbx); + + vmovdqu64 (%rcx), %xmm30; + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* Checksum_i = Checksum_{i-1} xor P_i */ + /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ + +#define 
OCB_INPUT(n, l0reg, l1reg, l2reg, l3reg, zreg, zplain) \ + vmovdqu64 (n * 64)(%rdx), zplain; \ + vpxorq (l0reg), %xmm30, %xmm16; \ + vpxorq (l1reg), %xmm16, %xmm30; \ + vinserti64x2 $1, %xmm30, %ymm16, %ymm16; \ + vpxorq (l2reg), %xmm30, %xmm30; \ + vinserti64x2 $2, %xmm30, %zmm16, %zmm16; \ + vpxorq (l3reg), %xmm30, %xmm30; \ + vinserti64x2 $3, %xmm30, %zmm16, %zmm16; \ + vpxorq zplain, %zmm31, %zmm31; \ + vpxorq zplain, %zmm16, zreg; \ + vmovdqu64 %zmm16, (n * 64)(%rsi); + +#define OCB_LOAD_PTRS(n) \ + movq ((n * 4 * 8) + (0 * 8))(%r9), %r10; \ + movq ((n * 4 * 8) + (1 * 8))(%r9), %r11; \ + movq ((n * 4 * 8) + (2 * 8))(%r9), %r12; \ + movq ((n * 4 * 8) + (3 * 8))(%r9), %r13; \ + movq ((n * 4 * 8) + (4 * 8))(%r9), %r14; \ + movq ((n * 4 * 8) + (5 * 8))(%r9), %r15; \ + movq ((n * 4 * 8) + (6 * 8))(%r9), %rax; \ + movq ((n * 4 * 8) + (7 * 8))(%r9), %rbx; + + OCB_LOAD_PTRS(0); + OCB_INPUT(0, %r10, %r11, %r12, %r13, %zmm15, %zmm20); + OCB_INPUT(1, %r14, %r15, %rax, %rbx, %zmm14, %zmm21); + OCB_LOAD_PTRS(2); + OCB_INPUT(2, %r10, %r11, %r12, %r13, %zmm13, %zmm22); + vpternlogq $0x96, %zmm20, %zmm21, %zmm22; + OCB_INPUT(3, %r14, %r15, %rax, %rbx, %zmm12, %zmm23); + OCB_LOAD_PTRS(4); + OCB_INPUT(4, %r10, %r11, %r12, %r13, %zmm11, %zmm24); + OCB_INPUT(5, %r14, %r15, %rax, %rbx, %zmm10, %zmm25); + vpternlogq $0x96, %zmm23, %zmm24, %zmm25; + OCB_LOAD_PTRS(6); + OCB_INPUT(6, %r10, %r11, %r12, %r13, %zmm9, %zmm20); + OCB_INPUT(7, %r14, %r15, %rax, %rbx, %zmm8, %zmm21); + OCB_LOAD_PTRS(8); + OCB_INPUT(8, %r10, %r11, %r12, %r13, %zmm7, %zmm26); + vpternlogq $0x96, %zmm20, %zmm21, %zmm26; + OCB_INPUT(9, %r14, %r15, %rax, %rbx, %zmm6, %zmm23); + OCB_LOAD_PTRS(10); + OCB_INPUT(10, %r10, %r11, %r12, %r13, %zmm5, %zmm24); + OCB_INPUT(11, %r14, %r15, %rax, %rbx, %zmm4, %zmm27); + vpternlogq $0x96, %zmm23, %zmm24, %zmm27; + OCB_LOAD_PTRS(12); + OCB_INPUT(12, %r10, %r11, %r12, %r13, %zmm3, %zmm20); + OCB_INPUT(13, %r14, %r15, %rax, %rbx, %zmm2, %zmm21); + OCB_LOAD_PTRS(14); + OCB_INPUT(14, %r10, %r11, %r12, %r13, %zmm1, %zmm23); + vpternlogq $0x96, %zmm20, %zmm21, %zmm23; + OCB_INPUT(15, %r14, %r15, %rax, %rbx, %zmm0, %zmm24); +#undef OCB_LOAD_PTRS +#undef OCB_INPUT + + vpbroadcastq (key_table)(CTX), %zmm16; + vpshufb .Lpack_bswap rRIP, %zmm16, %zmm16; + + vpternlogq $0x96, %zmm24, %zmm22, %zmm25; + vpternlogq $0x96, %zmm26, %zmm27, %zmm23; + vpxorq %zmm25, %zmm23, %zmm20; + vextracti64x4 $1, %zmm20, %ymm21; + vpxorq %ymm21, %ymm20, %ymm20; + vextracti64x2 $1, %ymm20, %xmm21; + vpternlogq $0x96, (%r8), %xmm21, %xmm20; + vmovdqu64 %xmm30, (%rcx); + vmovdqu64 %xmm20, (%r8); + + cmpl $128, key_bitlength(CTX); + movl $32, %r8d; + movl $24, %eax; + cmovel %eax, %r8d; /* max */ + + /* inpack64_pre: */ + vpxorq %zmm0, %zmm16, %zmm0; + vpxorq %zmm1, %zmm16, %zmm1; + vpxorq %zmm2, %zmm16, %zmm2; + vpxorq %zmm3, %zmm16, %zmm3; + vpxorq %zmm4, %zmm16, %zmm4; + vpxorq %zmm5, %zmm16, %zmm5; + vpxorq %zmm6, %zmm16, %zmm6; + vpxorq %zmm7, %zmm16, %zmm7; + vpxorq %zmm8, %zmm16, %zmm8; + vpxorq %zmm9, %zmm16, %zmm9; + vpxorq %zmm10, %zmm16, %zmm10; + vpxorq %zmm11, %zmm16, %zmm11; + vpxorq %zmm12, %zmm16, %zmm12; + vpxorq %zmm13, %zmm16, %zmm13; + vpxorq %zmm14, %zmm16, %zmm14; + vpxorq %zmm15, %zmm16, %zmm15; + + call __camellia_gfni_avx512_enc_blk64; + + vpxorq 0 * 64(%rsi), %zmm7, %zmm7; + vpxorq 1 * 64(%rsi), %zmm6, %zmm6; + vpxorq 2 * 64(%rsi), %zmm5, %zmm5; + vpxorq 3 * 64(%rsi), %zmm4, %zmm4; + vpxorq 4 * 64(%rsi), %zmm3, %zmm3; + vpxorq 5 * 64(%rsi), %zmm2, %zmm2; + vpxorq 6 * 64(%rsi), %zmm1, %zmm1; + vpxorq 7 * 
64(%rsi), %zmm0, %zmm0; + vpxorq 8 * 64(%rsi), %zmm15, %zmm15; + vpxorq 9 * 64(%rsi), %zmm14, %zmm14; + vpxorq 10 * 64(%rsi), %zmm13, %zmm13; + vpxorq 11 * 64(%rsi), %zmm12, %zmm12; + vpxorq 12 * 64(%rsi), %zmm11, %zmm11; + vpxorq 13 * 64(%rsi), %zmm10, %zmm10; + vpxorq 14 * 64(%rsi), %zmm9, %zmm9; + vpxorq 15 * 64(%rsi), %zmm8, %zmm8; + + write_output(%zmm7, %zmm6, %zmm5, %zmm4, %zmm3, %zmm2, %zmm1, %zmm0, + %zmm15, %zmm14, %zmm13, %zmm12, %zmm11, %zmm10, %zmm9, + %zmm8, %rsi); + + popq %rbx; + CFI_RESTORE(%rbx); + popq %r15; + CFI_RESTORE(%r15); + popq %r14; + CFI_RESTORE(%r14); + popq %r13; + CFI_RESTORE(%r12); + popq %r12; + CFI_RESTORE(%r13); + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_camellia_gfni_avx512_ocb_enc,.-_gcry_camellia_gfni_avx512_ocb_enc;) + +.align 8 +.globl _gcry_camellia_gfni_avx512_ocb_dec +ELF(.type _gcry_camellia_gfni_avx512_ocb_dec,@function;) + +_gcry_camellia_gfni_avx512_ocb_dec: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (64 blocks) + * %rdx: src (64 blocks) + * %rcx: offset + * %r8 : checksum + * %r9 : L pointers (void *L[64]) + */ + CFI_STARTPROC(); + vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + + pushq %r12; + CFI_PUSH(%r12); + pushq %r13; + CFI_PUSH(%r13); + pushq %r14; + CFI_PUSH(%r14); + pushq %r15; + CFI_PUSH(%r15); + pushq %rbx; + CFI_PUSH(%rbx); + pushq %r8; + CFI_PUSH(%r8); + + vmovdqu64 (%rcx), %xmm30; + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* C_i = Offset_i xor DECIPHER(K, P_i xor Offset_i) */ + +#define OCB_INPUT(n, l0reg, l1reg, l2reg, l3reg, zreg) \ + vpxorq (l0reg), %xmm30, %xmm16; \ + vpxorq (l1reg), %xmm16, %xmm30; \ + vinserti64x2 $1, %xmm30, %ymm16, %ymm16; \ + vpxorq (l2reg), %xmm30, %xmm30; \ + vinserti64x2 $2, %xmm30, %zmm16, %zmm16; \ + vpxorq (l3reg), %xmm30, %xmm30; \ + vinserti64x2 $3, %xmm30, %zmm16, %zmm16; \ + vpxorq (n * 64)(%rdx), %zmm16, zreg; \ + vmovdqu64 %zmm16, (n * 64)(%rsi); + +#define OCB_LOAD_PTRS(n) \ + movq ((n * 4 * 8) + (0 * 8))(%r9), %r10; \ + movq ((n * 4 * 8) + (1 * 8))(%r9), %r11; \ + movq ((n * 4 * 8) + (2 * 8))(%r9), %r12; \ + movq ((n * 4 * 8) + (3 * 8))(%r9), %r13; \ + movq ((n * 4 * 8) + (4 * 8))(%r9), %r14; \ + movq ((n * 4 * 8) + (5 * 8))(%r9), %r15; \ + movq ((n * 4 * 8) + (6 * 8))(%r9), %rax; \ + movq ((n * 4 * 8) + (7 * 8))(%r9), %rbx; + + OCB_LOAD_PTRS(0); + OCB_INPUT(0, %r10, %r11, %r12, %r13, %zmm15); + OCB_INPUT(1, %r14, %r15, %rax, %rbx, %zmm14); + OCB_LOAD_PTRS(2); + OCB_INPUT(2, %r10, %r11, %r12, %r13, %zmm13); + OCB_INPUT(3, %r14, %r15, %rax, %rbx, %zmm12); + OCB_LOAD_PTRS(4); + OCB_INPUT(4, %r10, %r11, %r12, %r13, %zmm11); + OCB_INPUT(5, %r14, %r15, %rax, %rbx, %zmm10); + OCB_LOAD_PTRS(6); + OCB_INPUT(6, %r10, %r11, %r12, %r13, %zmm9); + OCB_INPUT(7, %r14, %r15, %rax, %rbx, %zmm8); + OCB_LOAD_PTRS(8); + OCB_INPUT(8, %r10, %r11, %r12, %r13, %zmm7); + OCB_INPUT(9, %r14, %r15, %rax, %rbx, %zmm6); + OCB_LOAD_PTRS(10); + OCB_INPUT(10, %r10, %r11, %r12, %r13, %zmm5); + OCB_INPUT(11, %r14, %r15, %rax, %rbx, %zmm4); + OCB_LOAD_PTRS(12); + OCB_INPUT(12, %r10, %r11, %r12, %r13, %zmm3); + OCB_INPUT(13, %r14, %r15, %rax, %rbx, %zmm2); + OCB_LOAD_PTRS(14); + OCB_INPUT(14, %r10, %r11, %r12, %r13, %zmm1); + OCB_INPUT(15, %r14, %r15, %rax, %rbx, %zmm0); +#undef OCB_LOAD_PTRS +#undef OCB_INPUT + + vmovdqu64 %xmm30, (%rcx); + + cmpl $128, key_bitlength(CTX); + movl $32, %r8d; + movl $24, %eax; + cmovel %eax, %r8d; /* max */ + + vpbroadcastq (key_table)(CTX, %r8, 8), %zmm16; + vpshufb .Lpack_bswap rRIP, %zmm16, %zmm16; + + /* inpack64_pre: */ + vpxorq %zmm0, %zmm16, %zmm0; + 
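+	/* The whitening key above was broadcast from the end of the key
+	 * schedule, (key_table)(CTX, %r8, 8), because decryption walks the
+	 * subkeys in reverse; the remaining registers below receive the same
+	 * pre-whitening before __camellia_gfni_avx512_dec_blk64 is called. */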
vpxorq %zmm1, %zmm16, %zmm1; + vpxorq %zmm2, %zmm16, %zmm2; + vpxorq %zmm3, %zmm16, %zmm3; + vpxorq %zmm4, %zmm16, %zmm4; + vpxorq %zmm5, %zmm16, %zmm5; + vpxorq %zmm6, %zmm16, %zmm6; + vpxorq %zmm7, %zmm16, %zmm7; + vpxorq %zmm8, %zmm16, %zmm8; + vpxorq %zmm9, %zmm16, %zmm9; + vpxorq %zmm10, %zmm16, %zmm10; + vpxorq %zmm11, %zmm16, %zmm11; + vpxorq %zmm12, %zmm16, %zmm12; + vpxorq %zmm13, %zmm16, %zmm13; + vpxorq %zmm14, %zmm16, %zmm14; + vpxorq %zmm15, %zmm16, %zmm15; + + call __camellia_gfni_avx512_dec_blk64; + + vpxorq 0 * 64(%rsi), %zmm7, %zmm7; + vpxorq 1 * 64(%rsi), %zmm6, %zmm6; + vpxorq 2 * 64(%rsi), %zmm5, %zmm5; + vpxorq 3 * 64(%rsi), %zmm4, %zmm4; + vpxorq 4 * 64(%rsi), %zmm3, %zmm3; + vpxorq 5 * 64(%rsi), %zmm2, %zmm2; + vpxorq 6 * 64(%rsi), %zmm1, %zmm1; + vpxorq 7 * 64(%rsi), %zmm0, %zmm0; + vpxorq 8 * 64(%rsi), %zmm15, %zmm15; + vpxorq 9 * 64(%rsi), %zmm14, %zmm14; + vpxorq 10 * 64(%rsi), %zmm13, %zmm13; + vpxorq 11 * 64(%rsi), %zmm12, %zmm12; + vpxorq 12 * 64(%rsi), %zmm11, %zmm11; + vpxorq 13 * 64(%rsi), %zmm10, %zmm10; + vpxorq 14 * 64(%rsi), %zmm9, %zmm9; + vpxorq 15 * 64(%rsi), %zmm8, %zmm8; + + write_output(%zmm7, %zmm6, %zmm5, %zmm4, %zmm3, %zmm2, %zmm1, %zmm0, + %zmm15, %zmm14, %zmm13, %zmm12, %zmm11, %zmm10, %zmm9, + %zmm8, %rsi); + + popq %r8; + CFI_RESTORE(%r8); + + /* Checksum_i = Checksum_{i-1} xor C_i */ + vpternlogq $0x96, %zmm7, %zmm6, %zmm5; + vpternlogq $0x96, %zmm4, %zmm3, %zmm2; + vpternlogq $0x96, %zmm1, %zmm0, %zmm15; + vpternlogq $0x96, %zmm14, %zmm13, %zmm12; + vpternlogq $0x96, %zmm11, %zmm10, %zmm9; + vpternlogq $0x96, %zmm5, %zmm2, %zmm15; + vpternlogq $0x96, %zmm12, %zmm9, %zmm8; + vpxorq %zmm15, %zmm8, %zmm8; + + vextracti64x4 $1, %zmm8, %ymm0; + vpxor %ymm0, %ymm8, %ymm8; + vextracti128 $1, %ymm8, %xmm0; + vpternlogq $0x96, (%r8), %xmm0, %xmm8; + vmovdqu64 %xmm8, (%r8); + + popq %rbx; + CFI_RESTORE(%rbx); + popq %r15; + CFI_RESTORE(%r15); + popq %r14; + CFI_RESTORE(%r14); + popq %r13; + CFI_RESTORE(%r13); + popq %r12; + CFI_RESTORE(%r12); + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_camellia_gfni_avx512_ocb_dec,.-_gcry_camellia_gfni_avx512_ocb_dec;) + +.align 8 +.globl _gcry_camellia_gfni_avx512_enc_blk64 +ELF(.type _gcry_camellia_gfni_avx512_enc_blk64,@function;) + +_gcry_camellia_gfni_avx512_enc_blk64: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (64 blocks) + * %rdx: src (64 blocks) + */ + CFI_STARTPROC(); + vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + + cmpl $128, key_bitlength(CTX); + movl $32, %r8d; + movl $24, %eax; + cmovel %eax, %r8d; /* max */ + xorl %eax, %eax; + + vpbroadcastq (key_table)(CTX), %zmm0; + vpshufb .Lpack_bswap rRIP, %zmm0, %zmm0; + + vpxorq (0) * 64(%rdx), %zmm0, %zmm15; + vpxorq (1) * 64(%rdx), %zmm0, %zmm14; + vpxorq (2) * 64(%rdx), %zmm0, %zmm13; + vpxorq (3) * 64(%rdx), %zmm0, %zmm12; + vpxorq (4) * 64(%rdx), %zmm0, %zmm11; + vpxorq (5) * 64(%rdx), %zmm0, %zmm10; + vpxorq (6) * 64(%rdx), %zmm0, %zmm9; + vpxorq (7) * 64(%rdx), %zmm0, %zmm8; + vpxorq (8) * 64(%rdx), %zmm0, %zmm7; + vpxorq (9) * 64(%rdx), %zmm0, %zmm6; + vpxorq (10) * 64(%rdx), %zmm0, %zmm5; + vpxorq (11) * 64(%rdx), %zmm0, %zmm4; + vpxorq (12) * 64(%rdx), %zmm0, %zmm3; + vpxorq (13) * 64(%rdx), %zmm0, %zmm2; + vpxorq (14) * 64(%rdx), %zmm0, %zmm1; + vpxorq (15) * 64(%rdx), %zmm0, %zmm0; + + call __camellia_gfni_avx512_enc_blk64; + + vmovdqu64 %zmm7, (0) * 64(%rsi); + vmovdqu64 %zmm6, (1) * 64(%rsi); + vmovdqu64 %zmm5, (2) * 64(%rsi); + vmovdqu64 %zmm4, (3) * 64(%rsi); + vmovdqu64 %zmm3, (4) * 64(%rsi); + vmovdqu64 %zmm2, (5) *
64(%rsi); + vmovdqu64 %zmm1, (6) * 64(%rsi); + vmovdqu64 %zmm0, (7) * 64(%rsi); + vmovdqu64 %zmm15, (8) * 64(%rsi); + vmovdqu64 %zmm14, (9) * 64(%rsi); + vmovdqu64 %zmm13, (10) * 64(%rsi); + vmovdqu64 %zmm12, (11) * 64(%rsi); + vmovdqu64 %zmm11, (12) * 64(%rsi); + vmovdqu64 %zmm10, (13) * 64(%rsi); + vmovdqu64 %zmm9, (14) * 64(%rsi); + vmovdqu64 %zmm8, (15) * 64(%rsi); + + clear_regs(); + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_camellia_gfni_avx512_enc_blk64,.-_gcry_camellia_gfni_avx512_enc_blk64;) + +.align 8 +.globl _gcry_camellia_gfni_avx512_dec_blk64 +ELF(.type _gcry_camellia_gfni_avx512_dec_blk64,@function;) + +_gcry_camellia_gfni_avx512_dec_blk64: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (64 blocks) + * %rdx: src (64 blocks) + */ + CFI_STARTPROC(); + vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + + cmpl $128, key_bitlength(CTX); + movl $32, %r8d; + movl $24, %eax; + cmovel %eax, %r8d; /* max */ + xorl %eax, %eax; + + vpbroadcastq (key_table)(CTX, %r8, 8), %zmm0; + vpshufb .Lpack_bswap rRIP, %zmm0, %zmm0; + + vpxorq (0) * 64(%rdx), %zmm0, %zmm15; + vpxorq (1) * 64(%rdx), %zmm0, %zmm14; + vpxorq (2) * 64(%rdx), %zmm0, %zmm13; + vpxorq (3) * 64(%rdx), %zmm0, %zmm12; + vpxorq (4) * 64(%rdx), %zmm0, %zmm11; + vpxorq (5) * 64(%rdx), %zmm0, %zmm10; + vpxorq (6) * 64(%rdx), %zmm0, %zmm9; + vpxorq (7) * 64(%rdx), %zmm0, %zmm8; + vpxorq (8) * 64(%rdx), %zmm0, %zmm7; + vpxorq (9) * 64(%rdx), %zmm0, %zmm6; + vpxorq (10) * 64(%rdx), %zmm0, %zmm5; + vpxorq (11) * 64(%rdx), %zmm0, %zmm4; + vpxorq (12) * 64(%rdx), %zmm0, %zmm3; + vpxorq (13) * 64(%rdx), %zmm0, %zmm2; + vpxorq (14) * 64(%rdx), %zmm0, %zmm1; + vpxorq (15) * 64(%rdx), %zmm0, %zmm0; + + call __camellia_gfni_avx512_dec_blk64; + + vmovdqu64 %zmm7, (0) * 64(%rsi); + vmovdqu64 %zmm6, (1) * 64(%rsi); + vmovdqu64 %zmm5, (2) * 64(%rsi); + vmovdqu64 %zmm4, (3) * 64(%rsi); + vmovdqu64 %zmm3, (4) * 64(%rsi); + vmovdqu64 %zmm2, (5) * 64(%rsi); + vmovdqu64 %zmm1, (6) * 64(%rsi); + vmovdqu64 %zmm0, (7) * 64(%rsi); + vmovdqu64 %zmm15, (8) * 64(%rsi); + vmovdqu64 %zmm14, (9) * 64(%rsi); + vmovdqu64 %zmm13, (10) * 64(%rsi); + vmovdqu64 %zmm12, (11) * 64(%rsi); + vmovdqu64 %zmm11, (12) * 64(%rsi); + vmovdqu64 %zmm10, (13) * 64(%rsi); + vmovdqu64 %zmm9, (14) * 64(%rsi); + vmovdqu64 %zmm8, (15) * 64(%rsi); + + clear_regs(); + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_camellia_gfni_avx512_dec_blk64,.-_gcry_camellia_gfni_avx512_dec_blk64;) + +#endif /* defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX512_SUPPORT) */ +#endif /* __x86_64 */ diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c index 00e23750..a854b82d 100644 --- a/cipher/camellia-glue.c +++ b/cipher/camellia-glue.c @@ -1,1395 +1,1628 @@ /* camellia-glue.c - Glue for the Camellia cipher * Copyright (C) 2007 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA * 02110-1301, USA. */ /* I put all the libgcrypt-specific stuff in this file to keep the camellia.c/camellia.h files exactly as provided by NTT. If they update their code, this should make it easier to bring the changes in. - dshaw There is one small change which needs to be done: Include the following code at the top of camellia.h: */ #if 0 /* To use Camellia with libraries it is often useful to keep the name * space of the library clean. The following macro is thus useful: * * #define CAMELLIA_EXT_SYM_PREFIX foo_ * * This prefixes all external symbols with "foo_". */ #ifdef HAVE_CONFIG_H #include #endif #ifdef CAMELLIA_EXT_SYM_PREFIX #define CAMELLIA_PREFIX1(x,y) x ## y #define CAMELLIA_PREFIX2(x,y) CAMELLIA_PREFIX1(x,y) #define CAMELLIA_PREFIX(x) CAMELLIA_PREFIX2(CAMELLIA_EXT_SYM_PREFIX,x) #define Camellia_Ekeygen CAMELLIA_PREFIX(Camellia_Ekeygen) #define Camellia_EncryptBlock CAMELLIA_PREFIX(Camellia_EncryptBlock) #define Camellia_DecryptBlock CAMELLIA_PREFIX(Camellia_DecryptBlock) #define camellia_decrypt128 CAMELLIA_PREFIX(camellia_decrypt128) #define camellia_decrypt256 CAMELLIA_PREFIX(camellia_decrypt256) #define camellia_encrypt128 CAMELLIA_PREFIX(camellia_encrypt128) #define camellia_encrypt256 CAMELLIA_PREFIX(camellia_encrypt256) #define camellia_setup128 CAMELLIA_PREFIX(camellia_setup128) #define camellia_setup192 CAMELLIA_PREFIX(camellia_setup192) #define camellia_setup256 CAMELLIA_PREFIX(camellia_setup256) #endif /*CAMELLIA_EXT_SYM_PREFIX*/ #endif /* Code sample. */ #include #include "types.h" #include "g10lib.h" #include "cipher.h" #include "camellia.h" #include "bufhelp.h" #include "cipher-internal.h" #include "cipher-selftest.h" #include "bulkhelp.h" /* Helper macro to force alignment to 16 bytes. */ #ifdef HAVE_GCC_ATTRIBUTE_ALIGNED # define ATTR_ALIGNED_16 __attribute__ ((aligned (16))) #else # define ATTR_ALIGNED_16 #endif /* USE_AESNI inidicates whether to compile with Intel AES-NI/AVX code. */ #undef USE_AESNI_AVX #if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT) # if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_AESNI_AVX 1 # endif #endif /* USE_AESNI_AVX2 inidicates whether to compile with Intel AES-NI/AVX2 code. */ #undef USE_AESNI_AVX2 #if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT) # if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_AESNI_AVX2 1 # endif #endif /* USE_VAES_AVX2 inidicates whether to compile with Intel VAES/AVX2 code. */ #undef USE_VAES_AVX2 #if defined(USE_AESNI_AVX2) && defined(HAVE_GCC_INLINE_ASM_VAES_VPCLMUL) # define USE_VAES_AVX2 1 #endif /* USE_GFNI_AVX2 inidicates whether to compile with Intel GFNI/AVX2 code. */ #undef USE_GFNI_AVX2 #if defined(USE_AESNI_AVX2) && defined(ENABLE_GFNI_SUPPORT) # define USE_GFNI_AVX2 1 #endif +/* USE_GFNI_AVX512 inidicates whether to compile with Intel GFNI/AVX512 code. */ +#undef USE_GFNI_AVX512 +#if defined(USE_GFNI_AVX2) && defined(ENABLE_AVX512_SUPPORT) +# define USE_GFNI_AVX512 1 +#endif + typedef struct { KEY_TABLE_TYPE keytable; int keybitlength; #ifdef USE_AESNI_AVX unsigned int use_aesni_avx:1; /* AES-NI/AVX implementation shall be used. 
*/ #endif /*USE_AESNI_AVX*/ #ifdef USE_AESNI_AVX2 unsigned int use_aesni_avx2:1;/* AES-NI/AVX2 implementation shall be used. */ unsigned int use_vaes_avx2:1; /* VAES/AVX2 implementation shall be used. */ unsigned int use_gfni_avx2:1; /* GFNI/AVX2 implementation shall be used. */ + unsigned int use_gfni_avx512:1; /* GFNI/AVX512 implementation shall be used. */ #endif /*USE_AESNI_AVX2*/ } CAMELLIA_context; /* Assembly implementations use SystemV ABI, ABI conversion and additional * stack to store XMM6-XMM15 needed on Win64. */ #undef ASM_FUNC_ABI #undef ASM_EXTRA_STACK #if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) # ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS # define ASM_FUNC_ABI __attribute__((sysv_abi)) # define ASM_EXTRA_STACK (10 * 16) # else # define ASM_FUNC_ABI # define ASM_EXTRA_STACK 0 # endif #endif #ifdef USE_AESNI_AVX /* Assembler implementations of Camellia using AES-NI and AVX. Process data - in 16 block same time. + in 16 blocks same time. */ extern void _gcry_camellia_aesni_avx_ctr_enc(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *ctr) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx_cbc_dec(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *iv) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx_cfb_dec(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *iv) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx_ocb_enc(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *offset, unsigned char *checksum, const u64 Ls[16]) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx_ocb_dec(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *offset, unsigned char *checksum, const u64 Ls[16]) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx_ocb_auth(CAMELLIA_context *ctx, const unsigned char *abuf, unsigned char *offset, unsigned char *checksum, const u64 Ls[16]) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx_keygen(CAMELLIA_context *ctx, const unsigned char *key, unsigned int keylen) ASM_FUNC_ABI; static const int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 16 + 2 * sizeof(void *) + ASM_EXTRA_STACK; #endif #ifdef USE_AESNI_AVX2 /* Assembler implementations of Camellia using AES-NI and AVX2. Process data - in 32 block same time. + in 32 blocks same time. 
*/ extern void _gcry_camellia_aesni_avx2_ctr_enc(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *ctr) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx2_cbc_dec(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *iv) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx2_cfb_dec(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *iv) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx2_ocb_enc(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *offset, unsigned char *checksum, const u64 Ls[32]) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx2_ocb_dec(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *offset, unsigned char *checksum, const u64 Ls[32]) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx2_ocb_auth(CAMELLIA_context *ctx, const unsigned char *abuf, unsigned char *offset, unsigned char *checksum, const u64 Ls[32]) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx2_enc_blk1_32(const CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned int nblocks) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx2_dec_blk1_32(const CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned int nblocks) ASM_FUNC_ABI; static const int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 + 2 * sizeof(void *) + ASM_EXTRA_STACK; #endif #ifdef USE_VAES_AVX2 /* Assembler implementations of Camellia using VAES and AVX2. Process data - in 32 block same time. + in 32 blocks same time. */ extern void _gcry_camellia_vaes_avx2_ctr_enc(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *ctr) ASM_FUNC_ABI; extern void _gcry_camellia_vaes_avx2_cbc_dec(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *iv) ASM_FUNC_ABI; extern void _gcry_camellia_vaes_avx2_cfb_dec(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *iv) ASM_FUNC_ABI; extern void _gcry_camellia_vaes_avx2_ocb_enc(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *offset, unsigned char *checksum, const u64 Ls[32]) ASM_FUNC_ABI; extern void _gcry_camellia_vaes_avx2_ocb_dec(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *offset, unsigned char *checksum, const u64 Ls[32]) ASM_FUNC_ABI; extern void _gcry_camellia_vaes_avx2_ocb_auth(CAMELLIA_context *ctx, const unsigned char *abuf, unsigned char *offset, unsigned char *checksum, const u64 Ls[32]) ASM_FUNC_ABI; extern void _gcry_camellia_vaes_avx2_enc_blk1_32(const CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned int nblocks) ASM_FUNC_ABI; extern void _gcry_camellia_vaes_avx2_dec_blk1_32(const CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned int nblocks) ASM_FUNC_ABI; #endif #ifdef USE_GFNI_AVX2 /* Assembler implementations of Camellia using GFNI and AVX2. Process data - in 32 block same time. + in 32 blocks same time. 
*/ extern void _gcry_camellia_gfni_avx2_ctr_enc(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *ctr) ASM_FUNC_ABI; extern void _gcry_camellia_gfni_avx2_cbc_dec(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *iv) ASM_FUNC_ABI; extern void _gcry_camellia_gfni_avx2_cfb_dec(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *iv) ASM_FUNC_ABI; extern void _gcry_camellia_gfni_avx2_ocb_enc(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *offset, unsigned char *checksum, const u64 Ls[32]) ASM_FUNC_ABI; extern void _gcry_camellia_gfni_avx2_ocb_dec(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *offset, unsigned char *checksum, const u64 Ls[32]) ASM_FUNC_ABI; extern void _gcry_camellia_gfni_avx2_ocb_auth(CAMELLIA_context *ctx, const unsigned char *abuf, unsigned char *offset, unsigned char *checksum, const u64 Ls[32]) ASM_FUNC_ABI; extern void _gcry_camellia_gfni_avx2_enc_blk1_32(const CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned int nblocks) ASM_FUNC_ABI; extern void _gcry_camellia_gfni_avx2_dec_blk1_32(const CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned int nblocks) ASM_FUNC_ABI; #endif +#ifdef USE_GFNI_AVX512 +/* Assembler implementations of Camellia using GFNI and AVX512. Process data + in 64 blocks same time. + */ +extern void _gcry_camellia_gfni_avx512_ctr_enc(CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *ctr) ASM_FUNC_ABI; + +extern void _gcry_camellia_gfni_avx512_cbc_dec(CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *iv) ASM_FUNC_ABI; + +extern void _gcry_camellia_gfni_avx512_cfb_dec(CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *iv) ASM_FUNC_ABI; + +extern void _gcry_camellia_gfni_avx512_ocb_enc(CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *offset, + unsigned char *checksum, + const u64 Ls[32]) ASM_FUNC_ABI; + +extern void _gcry_camellia_gfni_avx512_ocb_dec(CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *offset, + unsigned char *checksum, + const u64 Ls[32]) ASM_FUNC_ABI; + +extern void _gcry_camellia_gfni_avx512_enc_blk64(const CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in) + ASM_FUNC_ABI; + +extern void _gcry_camellia_gfni_avx512_dec_blk64(const CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in) + ASM_FUNC_ABI; + +/* Stack not used by AVX512 implementation. 
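The AVX512 entry points declared above always process exactly 64 blocks and, per the comment, burn no stack, so a caller only needs a whole-chunk loop plus a fallback for the tail; camellia_encrypt_blk1_64 further below does exactly that. A minimal, illustrative sketch, not taken from the patch:

/* Assumes ctx->use_gfni_avx512 was set by camellia_setkey(). */
static void
encrypt_bulk_sketch (const CAMELLIA_context *ctx, unsigned char *out,
                     const unsigned char *in, size_t nblocks)
{
  while (nblocks >= 64)
    {
      _gcry_camellia_gfni_avx512_enc_blk64 (ctx, out, in);
      out += 64 * CAMELLIA_BLOCK_SIZE;
      in += 64 * CAMELLIA_BLOCK_SIZE;
      nblocks -= 64;
    }
  /* Any remaining 1..63 blocks would go through the 32-block AVX2 paths
   * or the generic C code, as camellia_encrypt_blk1_64 below shows. */
}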
*/ +static const int avx512_burn_stack_depth = 0; +#endif + static const char *selftest(void); static void _gcry_camellia_ctr_enc (void *context, unsigned char *ctr, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); static void _gcry_camellia_cbc_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); static void _gcry_camellia_cfb_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); static void _gcry_camellia_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); static size_t _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); static size_t _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks); static gcry_err_code_t camellia_setkey(void *c, const byte *key, unsigned keylen, cipher_bulk_ops_t *bulk_ops) { CAMELLIA_context *ctx=c; static int initialized=0; static const char *selftest_failed=NULL; #if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) \ || defined(USE_VAES_AVX2) || defined(USE_GFNI_AVX2) unsigned int hwf = _gcry_get_hw_features (); #endif if(keylen!=16 && keylen!=24 && keylen!=32) return GPG_ERR_INV_KEYLEN; if(!initialized) { initialized=1; selftest_failed=selftest(); if(selftest_failed) log_error("%s\n",selftest_failed); } if(selftest_failed) return GPG_ERR_SELFTEST_FAILED; #ifdef USE_AESNI_AVX ctx->use_aesni_avx = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX); #endif #ifdef USE_AESNI_AVX2 ctx->use_aesni_avx2 = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX2); ctx->use_vaes_avx2 = 0; ctx->use_gfni_avx2 = 0; + ctx->use_gfni_avx512 = 0; #endif #ifdef USE_VAES_AVX2 ctx->use_vaes_avx2 = (hwf & HWF_INTEL_VAES_VPCLMUL) && (hwf & HWF_INTEL_AVX2); #endif #ifdef USE_GFNI_AVX2 ctx->use_gfni_avx2 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX2); #endif +#ifdef USE_GFNI_AVX512 + ctx->use_gfni_avx512 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX512); +#endif ctx->keybitlength=keylen*8; /* Setup bulk encryption routines. */ memset (bulk_ops, 0, sizeof(*bulk_ops)); bulk_ops->cbc_dec = _gcry_camellia_cbc_dec; bulk_ops->cfb_dec = _gcry_camellia_cfb_dec; bulk_ops->ctr_enc = _gcry_camellia_ctr_enc; bulk_ops->ocb_crypt = _gcry_camellia_ocb_crypt; bulk_ops->ocb_auth = _gcry_camellia_ocb_auth; #ifdef USE_AESNI_AVX2 if (ctx->use_aesni_avx2 || ctx->use_vaes_avx2 || ctx->use_gfni_avx2) bulk_ops->xts_crypt = _gcry_camellia_xts_crypt; #endif if (0) { } #ifdef USE_AESNI_AVX else if (ctx->use_aesni_avx) _gcry_camellia_aesni_avx_keygen(ctx, key, keylen); else #endif { Camellia_Ekeygen(ctx->keybitlength,key,ctx->keytable); _gcry_burn_stack ((19+34+34)*sizeof(u32)+2*sizeof(void*) /* camellia_setup256 */ +(4+32)*sizeof(u32)+2*sizeof(void*) /* camellia_setup192 */ +0+sizeof(int)+2*sizeof(void*) /* Camellia_Ekeygen */ +3*2*sizeof(void*) /* Function calls. */ ); } #ifdef USE_GFNI_AVX2 if (ctx->use_gfni_avx2) { /* Disable AESNI & VAES implementations when GFNI implementation is * enabled. */ #ifdef USE_AESNI_AVX ctx->use_aesni_avx = 0; #endif #ifdef USE_AESNI_AVX2 ctx->use_aesni_avx2 = 0; #endif #ifdef USE_VAES_AVX2 ctx->use_vaes_avx2 = 0; #endif } #endif return 0; } #ifdef USE_ARM_ASM /* Assembly implementations of Camellia. 
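To summarize the flag handling in camellia_setkey above: GFNI/AVX2 disables the AES-NI and VAES variants, while GFNI/AVX512 is an additional fast path layered on top of GFNI/AVX2 rather than a replacement. A hedged, editorial sketch of the resulting preference order (not code from the patch):

static const char *
camellia_bulk_choice_sketch (const CAMELLIA_context *ctx)
{
#ifdef USE_GFNI_AVX512
  if (ctx->use_gfni_avx512)
    return "GFNI/AVX512 for 64-block chunks, GFNI/AVX2 below that";
#endif
#ifdef USE_GFNI_AVX2
  if (ctx->use_gfni_avx2)
    return "GFNI/AVX2 for 32-block chunks";
#endif
#ifdef USE_VAES_AVX2
  if (ctx->use_vaes_avx2)
    return "VAES/AVX2 for 32-block chunks";
#endif
#ifdef USE_AESNI_AVX2
  if (ctx->use_aesni_avx2)
    return "AES-NI/AVX2 for 32-block chunks";
#endif
#ifdef USE_AESNI_AVX
  if (ctx->use_aesni_avx)
    return "AES-NI/AVX for 16-block chunks";
#endif
  return "generic C implementation";
}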
*/ extern void _gcry_camellia_arm_encrypt_block(const KEY_TABLE_TYPE keyTable, byte *outbuf, const byte *inbuf, const int keybits); extern void _gcry_camellia_arm_decrypt_block(const KEY_TABLE_TYPE keyTable, byte *outbuf, const byte *inbuf, const int keybits); static void Camellia_EncryptBlock(const int keyBitLength, const unsigned char *plaintext, const KEY_TABLE_TYPE keyTable, unsigned char *cipherText) { _gcry_camellia_arm_encrypt_block(keyTable, cipherText, plaintext, keyBitLength); } static void Camellia_DecryptBlock(const int keyBitLength, const unsigned char *cipherText, const KEY_TABLE_TYPE keyTable, unsigned char *plaintext) { _gcry_camellia_arm_decrypt_block(keyTable, plaintext, cipherText, keyBitLength); } #ifdef __aarch64__ # define CAMELLIA_encrypt_stack_burn_size (0) # define CAMELLIA_decrypt_stack_burn_size (0) #else # define CAMELLIA_encrypt_stack_burn_size (15*4) # define CAMELLIA_decrypt_stack_burn_size (15*4) #endif static unsigned int camellia_encrypt(void *c, byte *outbuf, const byte *inbuf) { CAMELLIA_context *ctx = c; Camellia_EncryptBlock(ctx->keybitlength,inbuf,ctx->keytable,outbuf); return /*burn_stack*/ (CAMELLIA_encrypt_stack_burn_size); } static unsigned int camellia_decrypt(void *c, byte *outbuf, const byte *inbuf) { CAMELLIA_context *ctx=c; Camellia_DecryptBlock(ctx->keybitlength,inbuf,ctx->keytable,outbuf); return /*burn_stack*/ (CAMELLIA_decrypt_stack_burn_size); } #else /*USE_ARM_ASM*/ static unsigned int camellia_encrypt(void *c, byte *outbuf, const byte *inbuf) { CAMELLIA_context *ctx=c; Camellia_EncryptBlock(ctx->keybitlength,inbuf,ctx->keytable,outbuf); #define CAMELLIA_encrypt_stack_burn_size \ (sizeof(int)+2*sizeof(unsigned char *)+sizeof(void*/*KEY_TABLE_TYPE*/) \ +4*sizeof(u32)+4*sizeof(u32) \ +2*sizeof(u32*)+4*sizeof(u32) \ +2*2*sizeof(void*) /* Function calls. */ \ ) return /*burn_stack*/ (CAMELLIA_encrypt_stack_burn_size); } static unsigned int camellia_decrypt(void *c, byte *outbuf, const byte *inbuf) { CAMELLIA_context *ctx=c; Camellia_DecryptBlock(ctx->keybitlength,inbuf,ctx->keytable,outbuf); #define CAMELLIA_decrypt_stack_burn_size \ (sizeof(int)+2*sizeof(unsigned char *)+sizeof(void*/*KEY_TABLE_TYPE*/) \ +4*sizeof(u32)+4*sizeof(u32) \ +2*sizeof(u32*)+4*sizeof(u32) \ +2*2*sizeof(void*) /* Function calls. */ \ ) return /*burn_stack*/ (CAMELLIA_decrypt_stack_burn_size); } #endif /*!USE_ARM_ASM*/ static unsigned int camellia_encrypt_blk1_32 (const void *priv, byte *outbuf, const byte *inbuf, unsigned int num_blks) { const CAMELLIA_context *ctx = priv; unsigned int stack_burn_size = 0; gcry_assert (num_blks <= 32); #ifdef USE_GFNI_AVX2 if (ctx->use_gfni_avx2 && num_blks >= 3) { /* 3 or more parallel block GFNI processing is faster than * generic C implementation. */ _gcry_camellia_gfni_avx2_enc_blk1_32 (ctx, outbuf, inbuf, num_blks); return avx2_burn_stack_depth; } #endif #ifdef USE_VAES_AVX2 if (ctx->use_vaes_avx2 && num_blks >= 6) { /* 6 or more parallel block VAES processing is faster than * generic C implementation. */ _gcry_camellia_vaes_avx2_enc_blk1_32 (ctx, outbuf, inbuf, num_blks); return avx2_burn_stack_depth; } #endif #ifdef USE_AESNI_AVX2 if (ctx->use_aesni_avx2 && num_blks >= 6) { /* 6 or more parallel block AESNI processing is faster than * generic C implementation. 
*/ _gcry_camellia_aesni_avx2_enc_blk1_32 (ctx, outbuf, inbuf, num_blks); return avx2_burn_stack_depth; } #endif while (num_blks) { stack_burn_size = camellia_encrypt((void *)ctx, outbuf, inbuf); outbuf += CAMELLIA_BLOCK_SIZE; inbuf += CAMELLIA_BLOCK_SIZE; num_blks--; } return stack_burn_size; } +static unsigned int +camellia_encrypt_blk1_64 (const void *priv, byte *outbuf, const byte *inbuf, + unsigned int num_blks) +{ + const CAMELLIA_context *ctx = priv; + unsigned int stack_burn_size = 0; + unsigned int nburn; + + gcry_assert (num_blks <= 64); + +#ifdef USE_GFNI_AVX512 + if (num_blks == 64 && ctx->use_gfni_avx512) + { + _gcry_camellia_gfni_avx512_enc_blk64 (ctx, outbuf, inbuf); + return avx512_burn_stack_depth; + } +#endif + + do + { + unsigned int curr_blks = num_blks > 32 ? 32 : num_blks; + nburn = camellia_encrypt_blk1_32 (ctx, outbuf, inbuf, curr_blks); + stack_burn_size = nburn > stack_burn_size ? nburn : stack_burn_size; + outbuf += curr_blks * 16; + inbuf += curr_blks * 16; + num_blks -= curr_blks; + } + while (num_blks > 0); + + return stack_burn_size; +} static unsigned int camellia_decrypt_blk1_32 (const void *priv, byte *outbuf, const byte *inbuf, unsigned int num_blks) { const CAMELLIA_context *ctx = priv; unsigned int stack_burn_size = 0; gcry_assert (num_blks <= 32); #ifdef USE_GFNI_AVX2 if (ctx->use_gfni_avx2 && num_blks >= 3) { /* 3 or more parallel block GFNI processing is faster than * generic C implementation. */ _gcry_camellia_gfni_avx2_dec_blk1_32 (ctx, outbuf, inbuf, num_blks); return avx2_burn_stack_depth; } #endif #ifdef USE_VAES_AVX2 if (ctx->use_vaes_avx2 && num_blks >= 6) { /* 6 or more parallel block VAES processing is faster than * generic C implementation. */ _gcry_camellia_vaes_avx2_dec_blk1_32 (ctx, outbuf, inbuf, num_blks); return avx2_burn_stack_depth; } #endif #ifdef USE_AESNI_AVX2 if (ctx->use_aesni_avx2 && num_blks >= 6) { /* 6 or more parallel block AESNI processing is faster than * generic C implementation. */ _gcry_camellia_aesni_avx2_dec_blk1_32 (ctx, outbuf, inbuf, num_blks); return avx2_burn_stack_depth; } #endif while (num_blks) { stack_burn_size = camellia_decrypt((void *)ctx, outbuf, inbuf); outbuf += CAMELLIA_BLOCK_SIZE; inbuf += CAMELLIA_BLOCK_SIZE; num_blks--; } return stack_burn_size; } +static unsigned int +camellia_decrypt_blk1_64 (const void *priv, byte *outbuf, const byte *inbuf, + unsigned int num_blks) +{ + const CAMELLIA_context *ctx = priv; + unsigned int stack_burn_size = 0; + unsigned int nburn; + + gcry_assert (num_blks <= 64); + +#ifdef USE_GFNI_AVX512 + if (num_blks == 64 && ctx->use_gfni_avx512) + { + _gcry_camellia_gfni_avx512_dec_blk64 (ctx, outbuf, inbuf); + return avx512_burn_stack_depth; + } +#endif + + do + { + unsigned int curr_blks = num_blks > 32 ? 32 : num_blks; + nburn = camellia_decrypt_blk1_32 (ctx, outbuf, inbuf, curr_blks); + stack_burn_size = nburn > stack_burn_size ? nburn : stack_burn_size; + outbuf += curr_blks * 16; + inbuf += curr_blks * 16; + num_blks -= curr_blks; + } + while (num_blks > 0); + + return stack_burn_size; +} + /* Bulk encryption of complete blocks in CTR mode. This function is only intended for the bulk encryption feature of cipher.c. CTR is expected to be of size CAMELLIA_BLOCK_SIZE. 
*/ static void _gcry_camellia_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { CAMELLIA_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; int burn_stack_depth = 0; +#ifdef USE_GFNI_AVX512 + if (ctx->use_gfni_avx512) + { + int did_use_gfni_avx512 = 0; + + /* Process data in 64 block chunks. */ + while (nblocks >= 64) + { + _gcry_camellia_gfni_avx512_ctr_enc (ctx, outbuf, inbuf, ctr); + nblocks -= 64; + outbuf += 64 * CAMELLIA_BLOCK_SIZE; + inbuf += 64 * CAMELLIA_BLOCK_SIZE; + did_use_gfni_avx512 = 1; + } + + if (did_use_gfni_avx512) + { + if (burn_stack_depth < avx512_burn_stack_depth) + burn_stack_depth = avx512_burn_stack_depth; + } + + /* Use generic code to handle smaller chunks... */ + } +#endif + #ifdef USE_AESNI_AVX2 if (ctx->use_aesni_avx2) { int did_use_aesni_avx2 = 0; typeof (&_gcry_camellia_aesni_avx2_ctr_enc) bulk_ctr_fn = _gcry_camellia_aesni_avx2_ctr_enc; #ifdef USE_VAES_AVX2 if (ctx->use_vaes_avx2) bulk_ctr_fn =_gcry_camellia_vaes_avx2_ctr_enc; #endif #ifdef USE_GFNI_AVX2 if (ctx->use_gfni_avx2) bulk_ctr_fn =_gcry_camellia_gfni_avx2_ctr_enc; #endif /* Process data in 32 block chunks. */ while (nblocks >= 32) { bulk_ctr_fn (ctx, outbuf, inbuf, ctr); nblocks -= 32; outbuf += 32 * CAMELLIA_BLOCK_SIZE; inbuf += 32 * CAMELLIA_BLOCK_SIZE; did_use_aesni_avx2 = 1; } if (did_use_aesni_avx2) { if (burn_stack_depth < avx2_burn_stack_depth) burn_stack_depth = avx2_burn_stack_depth; } /* Use generic code to handle smaller chunks... */ - /* TODO: use caching instead? */ } #endif #ifdef USE_AESNI_AVX if (ctx->use_aesni_avx) { int did_use_aesni_avx = 0; /* Process data in 16 block chunks. */ while (nblocks >= 16) { _gcry_camellia_aesni_avx_ctr_enc(ctx, outbuf, inbuf, ctr); nblocks -= 16; outbuf += 16 * CAMELLIA_BLOCK_SIZE; inbuf += 16 * CAMELLIA_BLOCK_SIZE; did_use_aesni_avx = 1; } if (did_use_aesni_avx) { if (burn_stack_depth < avx_burn_stack_depth) burn_stack_depth = avx_burn_stack_depth; } /* Use generic code to handle smaller chunks... */ - /* TODO: use caching instead? */ } #endif /* Process remaining blocks. */ if (nblocks) { byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32]; unsigned int tmp_used = CAMELLIA_BLOCK_SIZE; size_t nburn; nburn = bulk_ctr_enc_128(ctx, camellia_encrypt_blk1_32, outbuf, inbuf, nblocks, ctr, tmpbuf, sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE, &tmp_used); burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; wipememory(tmpbuf, tmp_used); } if (burn_stack_depth) _gcry_burn_stack(burn_stack_depth); } /* Bulk decryption of complete blocks in CBC mode. This function is only intended for the bulk encryption feature of cipher.c. */ static void _gcry_camellia_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { CAMELLIA_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; int burn_stack_depth = 0; +#ifdef USE_GFNI_AVX512 + if (ctx->use_gfni_avx512) + { + int did_use_gfni_avx512 = 0; + + /* Process data in 64 block chunks. */ + while (nblocks >= 64) + { + _gcry_camellia_gfni_avx512_cbc_dec (ctx, outbuf, inbuf, iv); + nblocks -= 64; + outbuf += 64 * CAMELLIA_BLOCK_SIZE; + inbuf += 64 * CAMELLIA_BLOCK_SIZE; + did_use_gfni_avx512 = 1; + } + + if (did_use_gfni_avx512) + { + if (burn_stack_depth < avx512_burn_stack_depth) + burn_stack_depth = avx512_burn_stack_depth; + } + + /* Use generic code to handle smaller chunks... 
*/ + } +#endif + #ifdef USE_AESNI_AVX2 if (ctx->use_aesni_avx2) { int did_use_aesni_avx2 = 0; typeof (&_gcry_camellia_aesni_avx2_cbc_dec) bulk_cbc_fn = _gcry_camellia_aesni_avx2_cbc_dec; #ifdef USE_VAES_AVX2 if (ctx->use_vaes_avx2) bulk_cbc_fn =_gcry_camellia_vaes_avx2_cbc_dec; #endif #ifdef USE_GFNI_AVX2 if (ctx->use_gfni_avx2) bulk_cbc_fn =_gcry_camellia_gfni_avx2_cbc_dec; #endif /* Process data in 32 block chunks. */ while (nblocks >= 32) { bulk_cbc_fn (ctx, outbuf, inbuf, iv); nblocks -= 32; outbuf += 32 * CAMELLIA_BLOCK_SIZE; inbuf += 32 * CAMELLIA_BLOCK_SIZE; did_use_aesni_avx2 = 1; } if (did_use_aesni_avx2) { if (burn_stack_depth < avx2_burn_stack_depth) burn_stack_depth = avx2_burn_stack_depth; } /* Use generic code to handle smaller chunks... */ } #endif #ifdef USE_AESNI_AVX if (ctx->use_aesni_avx) { int did_use_aesni_avx = 0; /* Process data in 16 block chunks. */ while (nblocks >= 16) { _gcry_camellia_aesni_avx_cbc_dec(ctx, outbuf, inbuf, iv); nblocks -= 16; outbuf += 16 * CAMELLIA_BLOCK_SIZE; inbuf += 16 * CAMELLIA_BLOCK_SIZE; did_use_aesni_avx = 1; } if (did_use_aesni_avx) { if (burn_stack_depth < avx_burn_stack_depth) burn_stack_depth = avx_burn_stack_depth; } /* Use generic code to handle smaller chunks... */ } #endif /* Process remaining blocks. */ if (nblocks) { byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32]; unsigned int tmp_used = CAMELLIA_BLOCK_SIZE; size_t nburn; nburn = bulk_cbc_dec_128(ctx, camellia_decrypt_blk1_32, outbuf, inbuf, nblocks, iv, tmpbuf, sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE, &tmp_used); burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; wipememory(tmpbuf, tmp_used); } if (burn_stack_depth) _gcry_burn_stack(burn_stack_depth); } /* Bulk decryption of complete blocks in CFB mode. This function is only intended for the bulk encryption feature of cipher.c. */ static void _gcry_camellia_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { CAMELLIA_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; int burn_stack_depth = 0; +#ifdef USE_GFNI_AVX512 + if (ctx->use_gfni_avx512) + { + int did_use_gfni_avx512 = 0; + + /* Process data in 64 block chunks. */ + while (nblocks >= 64) + { + _gcry_camellia_gfni_avx512_cfb_dec (ctx, outbuf, inbuf, iv); + nblocks -= 64; + outbuf += 64 * CAMELLIA_BLOCK_SIZE; + inbuf += 64 * CAMELLIA_BLOCK_SIZE; + did_use_gfni_avx512 = 1; + } + + if (did_use_gfni_avx512) + { + if (burn_stack_depth < avx512_burn_stack_depth) + burn_stack_depth = avx512_burn_stack_depth; + } + + /* Use generic code to handle smaller chunks... */ + } +#endif + #ifdef USE_AESNI_AVX2 if (ctx->use_aesni_avx2) { int did_use_aesni_avx2 = 0; typeof (&_gcry_camellia_aesni_avx2_cfb_dec) bulk_cfb_fn = _gcry_camellia_aesni_avx2_cfb_dec; #ifdef USE_VAES_AVX2 if (ctx->use_vaes_avx2) bulk_cfb_fn =_gcry_camellia_vaes_avx2_cfb_dec; #endif #ifdef USE_GFNI_AVX2 if (ctx->use_gfni_avx2) bulk_cfb_fn =_gcry_camellia_gfni_avx2_cfb_dec; #endif /* Process data in 32 block chunks. */ while (nblocks >= 32) { bulk_cfb_fn (ctx, outbuf, inbuf, iv); nblocks -= 32; outbuf += 32 * CAMELLIA_BLOCK_SIZE; inbuf += 32 * CAMELLIA_BLOCK_SIZE; did_use_aesni_avx2 = 1; } if (did_use_aesni_avx2) { if (burn_stack_depth < avx2_burn_stack_depth) burn_stack_depth = avx2_burn_stack_depth; } /* Use generic code to handle smaller chunks... */ } #endif #ifdef USE_AESNI_AVX if (ctx->use_aesni_avx) { int did_use_aesni_avx = 0; /* Process data in 16 block chunks. 
*/ while (nblocks >= 16) { _gcry_camellia_aesni_avx_cfb_dec(ctx, outbuf, inbuf, iv); nblocks -= 16; outbuf += 16 * CAMELLIA_BLOCK_SIZE; inbuf += 16 * CAMELLIA_BLOCK_SIZE; did_use_aesni_avx = 1; } if (did_use_aesni_avx) { if (burn_stack_depth < avx_burn_stack_depth) burn_stack_depth = avx_burn_stack_depth; } /* Use generic code to handle smaller chunks... */ } #endif /* Process remaining blocks. */ if (nblocks) { byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32]; unsigned int tmp_used = CAMELLIA_BLOCK_SIZE; size_t nburn; nburn = bulk_cfb_dec_128(ctx, camellia_encrypt_blk1_32, outbuf, inbuf, nblocks, iv, tmpbuf, sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE, &tmp_used); burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; wipememory(tmpbuf, tmp_used); } if (burn_stack_depth) _gcry_burn_stack(burn_stack_depth); } /* Bulk encryption/decryption of complete blocks in XTS mode. */ static void _gcry_camellia_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt) { CAMELLIA_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; int burn_stack_depth = 0; /* Process remaining blocks. */ if (nblocks) { - byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32]; + byte tmpbuf[CAMELLIA_BLOCK_SIZE * 64]; unsigned int tmp_used = CAMELLIA_BLOCK_SIZE; size_t nburn; - nburn = bulk_xts_crypt_128(ctx, encrypt ? camellia_encrypt_blk1_32 - : camellia_decrypt_blk1_32, + nburn = bulk_xts_crypt_128(ctx, encrypt ? camellia_encrypt_blk1_64 + : camellia_decrypt_blk1_64, outbuf, inbuf, nblocks, tweak, tmpbuf, sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE, &tmp_used); burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; wipememory(tmpbuf, tmp_used); } if (burn_stack_depth) _gcry_burn_stack(burn_stack_depth); } /* Bulk encryption/decryption of complete blocks in OCB mode. */ static size_t _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt) { #if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) CAMELLIA_context *ctx = (void *)&c->context.c; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; int burn_stack_depth = 0; u64 blkn = c->u_mode.ocb.data_nblocks; #else (void)c; (void)outbuf_arg; (void)inbuf_arg; (void)encrypt; #endif +#ifdef USE_GFNI_AVX512 + if (ctx->use_gfni_avx512) + { + int did_use_gfni_avx512 = 0; + u64 Ls[64]; + u64 *l; + + if (nblocks >= 64) + { + typeof (&_gcry_camellia_gfni_avx512_ocb_dec) bulk_ocb_fn = + encrypt ? _gcry_camellia_gfni_avx512_ocb_enc + : _gcry_camellia_gfni_avx512_ocb_dec; + l = bulk_ocb_prepare_L_pointers_array_blk64 (c, Ls, blkn); + + /* Process data in 64 block chunks. */ + while (nblocks >= 64) + { + blkn += 64; + *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 64); + + bulk_ocb_fn (ctx, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, Ls); + + nblocks -= 64; + outbuf += 64 * CAMELLIA_BLOCK_SIZE; + inbuf += 64 * CAMELLIA_BLOCK_SIZE; + did_use_gfni_avx512 = 1; + } + } + + if (did_use_gfni_avx512) + { + if (burn_stack_depth < avx2_burn_stack_depth) + burn_stack_depth = avx2_burn_stack_depth; + } + + /* Use generic code to handle smaller chunks... */ + } +#endif + #ifdef USE_AESNI_AVX2 if (ctx->use_aesni_avx2) { int did_use_aesni_avx2 = 0; u64 Ls[32]; u64 *l; if (nblocks >= 32) { typeof (&_gcry_camellia_aesni_avx2_ocb_dec) bulk_ocb_fn = encrypt ? _gcry_camellia_aesni_avx2_ocb_enc : _gcry_camellia_aesni_avx2_ocb_dec; #ifdef USE_VAES_AVX2 if (ctx->use_vaes_avx2) bulk_ocb_fn = encrypt ? 
_gcry_camellia_vaes_avx2_ocb_enc : _gcry_camellia_vaes_avx2_ocb_dec; #endif #ifdef USE_GFNI_AVX2 if (ctx->use_gfni_avx2) bulk_ocb_fn = encrypt ? _gcry_camellia_gfni_avx2_ocb_enc : _gcry_camellia_gfni_avx2_ocb_dec; #endif l = bulk_ocb_prepare_L_pointers_array_blk32 (c, Ls, blkn); /* Process data in 32 block chunks. */ while (nblocks >= 32) { blkn += 32; *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 32); bulk_ocb_fn (ctx, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, Ls); nblocks -= 32; outbuf += 32 * CAMELLIA_BLOCK_SIZE; inbuf += 32 * CAMELLIA_BLOCK_SIZE; did_use_aesni_avx2 = 1; } } if (did_use_aesni_avx2) { if (burn_stack_depth < avx2_burn_stack_depth) burn_stack_depth = avx2_burn_stack_depth; } /* Use generic code to handle smaller chunks... */ } #endif #ifdef USE_AESNI_AVX if (ctx->use_aesni_avx) { int did_use_aesni_avx = 0; u64 Ls[16]; u64 *l; if (nblocks >= 16) { l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn); /* Process data in 16 block chunks. */ while (nblocks >= 16) { blkn += 16; *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16); if (encrypt) _gcry_camellia_aesni_avx_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, Ls); else _gcry_camellia_aesni_avx_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, Ls); nblocks -= 16; outbuf += 16 * CAMELLIA_BLOCK_SIZE; inbuf += 16 * CAMELLIA_BLOCK_SIZE; did_use_aesni_avx = 1; } } if (did_use_aesni_avx) { if (burn_stack_depth < avx_burn_stack_depth) burn_stack_depth = avx_burn_stack_depth; } /* Use generic code to handle smaller chunks... */ } #endif #if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) /* Process remaining blocks. */ if (nblocks) { byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32]; unsigned int tmp_used = CAMELLIA_BLOCK_SIZE; size_t nburn; nburn = bulk_ocb_crypt_128 (c, ctx, encrypt ? camellia_encrypt_blk1_32 : camellia_decrypt_blk1_32, outbuf, inbuf, nblocks, &blkn, encrypt, tmpbuf, sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE, &tmp_used); burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; wipememory(tmpbuf, tmp_used); nblocks = 0; } c->u_mode.ocb.data_nblocks = blkn; if (burn_stack_depth) _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *)); #endif return nblocks; } /* Bulk authentication of complete blocks in OCB mode. */ static size_t _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks) { #if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) CAMELLIA_context *ctx = (void *)&c->context.c; const unsigned char *abuf = abuf_arg; int burn_stack_depth = 0; u64 blkn = c->u_mode.ocb.aad_nblocks; #else (void)c; (void)abuf_arg; #endif #ifdef USE_AESNI_AVX2 if (ctx->use_aesni_avx2) { int did_use_aesni_avx2 = 0; u64 Ls[32]; u64 *l; if (nblocks >= 32) { typeof (&_gcry_camellia_aesni_avx2_ocb_auth) bulk_auth_fn = _gcry_camellia_aesni_avx2_ocb_auth; #ifdef USE_VAES_AVX2 if (ctx->use_vaes_avx2) bulk_auth_fn = _gcry_camellia_vaes_avx2_ocb_auth; #endif #ifdef USE_GFNI_AVX2 if (ctx->use_gfni_avx2) bulk_auth_fn = _gcry_camellia_gfni_avx2_ocb_auth; #endif l = bulk_ocb_prepare_L_pointers_array_blk32 (c, Ls, blkn); /* Process data in 32 block chunks. */ while (nblocks >= 32) { blkn += 32; *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 32); bulk_auth_fn (ctx, abuf, c->u_mode.ocb.aad_offset, c->u_mode.ocb.aad_sum, Ls); nblocks -= 32; abuf += 32 * CAMELLIA_BLOCK_SIZE; did_use_aesni_avx2 = 1; } } if (did_use_aesni_avx2) { if (burn_stack_depth < avx2_burn_stack_depth) burn_stack_depth = avx2_burn_stack_depth; } /* Use generic code to handle smaller chunks... 
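The per-chunk `*l = ocb_get_l(...)` refresh seen in the OCB loops works because OCB selects L[ntz(i)] for block number i, where ntz is the count of trailing zero bits; the low-ntz entries repeat inside every chunk and can be set up once by the bulk_ocb_prepare_L_pointers_array_* helpers, while the one entry tied to the absolute block counter is re-fetched each iteration. A small illustrative helper (ntz_sketch is a made-up name, not part of bulkhelp.h):

#include <stdint.h>

/* ntz(i) for i > 0: index into the OCB L table for block number i. */
static unsigned int
ntz_sketch (uint64_t i)
{
  unsigned int n = 0;

  while ((i & 1) == 0)
    {
      n++;
      i >>= 1;
    }
  return n;
}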
*/ } #endif #ifdef USE_AESNI_AVX if (ctx->use_aesni_avx) { int did_use_aesni_avx = 0; u64 Ls[16]; u64 *l; if (nblocks >= 16) { l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn); /* Process data in 16 block chunks. */ while (nblocks >= 16) { blkn += 16; *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16); _gcry_camellia_aesni_avx_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset, c->u_mode.ocb.aad_sum, Ls); nblocks -= 16; abuf += 16 * CAMELLIA_BLOCK_SIZE; did_use_aesni_avx = 1; } } if (did_use_aesni_avx) { if (burn_stack_depth < avx_burn_stack_depth) burn_stack_depth = avx_burn_stack_depth; } /* Use generic code to handle smaller chunks... */ } #endif #if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) /* Process remaining blocks. */ if (nblocks) { byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32]; unsigned int tmp_used = CAMELLIA_BLOCK_SIZE; size_t nburn; nburn = bulk_ocb_auth_128 (c, ctx, camellia_encrypt_blk1_32, abuf, nblocks, &blkn, tmpbuf, sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE, &tmp_used); burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; wipememory(tmpbuf, tmp_used); nblocks = 0; } c->u_mode.ocb.aad_nblocks = blkn; if (burn_stack_depth) _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *)); #endif return nblocks; } /* Run the self-tests for CAMELLIA-CTR-128, tests IV increment of bulk CTR encryption. Returns NULL on success. */ static const char* selftest_ctr_128 (void) { - const int nblocks = 32+16+1; + const int nblocks = 64+32+16+1; const int blocksize = CAMELLIA_BLOCK_SIZE; const int context_size = sizeof(CAMELLIA_context); return _gcry_selftest_helper_ctr("CAMELLIA", &camellia_setkey, &camellia_encrypt, nblocks, blocksize, context_size); } /* Run the self-tests for CAMELLIA-CBC-128, tests bulk CBC decryption. Returns NULL on success. */ static const char* selftest_cbc_128 (void) { - const int nblocks = 32+16+2; + const int nblocks = 64+32+16+2; const int blocksize = CAMELLIA_BLOCK_SIZE; const int context_size = sizeof(CAMELLIA_context); return _gcry_selftest_helper_cbc("CAMELLIA", &camellia_setkey, &camellia_encrypt, nblocks, blocksize, context_size); } /* Run the self-tests for CAMELLIA-CFB-128, tests bulk CFB decryption. Returns NULL on success. 
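The enlarged self-test lengths above are easiest to read as a sum of chunk sizes; one pass now reaches the new 64-block path and still leaves enough data for whichever smaller bulk paths the host CPU enables, plus a one- or two-block tail (editorial breakdown):

  CTR:      64 + 32 + 16 + 1 = 113 blocks
  CBC/CFB:  64 + 32 + 16 + 2 = 114 blocks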
*/ static const char* selftest_cfb_128 (void) { - const int nblocks = 32+16+2; + const int nblocks = 64+32+16+2; const int blocksize = CAMELLIA_BLOCK_SIZE; const int context_size = sizeof(CAMELLIA_context); return _gcry_selftest_helper_cfb("CAMELLIA", &camellia_setkey, &camellia_encrypt, nblocks, blocksize, context_size); } static const char * selftest(void) { CAMELLIA_context ctx; byte scratch[16]; cipher_bulk_ops_t bulk_ops; const char *r; /* These test vectors are from RFC-3713 */ static const byte plaintext[]= { 0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef, 0xfe,0xdc,0xba,0x98,0x76,0x54,0x32,0x10 }; static const byte key_128[]= { 0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef, 0xfe,0xdc,0xba,0x98,0x76,0x54,0x32,0x10 }; static const byte ciphertext_128[]= { 0x67,0x67,0x31,0x38,0x54,0x96,0x69,0x73, 0x08,0x57,0x06,0x56,0x48,0xea,0xbe,0x43 }; static const byte key_192[]= { 0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef,0xfe,0xdc,0xba,0x98, 0x76,0x54,0x32,0x10,0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77 }; static const byte ciphertext_192[]= { 0xb4,0x99,0x34,0x01,0xb3,0xe9,0x96,0xf8, 0x4e,0xe5,0xce,0xe7,0xd7,0x9b,0x09,0xb9 }; static const byte key_256[]= { 0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef,0xfe,0xdc,0xba, 0x98,0x76,0x54,0x32,0x10,0x00,0x11,0x22,0x33,0x44,0x55, 0x66,0x77,0x88,0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff }; static const byte ciphertext_256[]= { 0x9a,0xcc,0x23,0x7d,0xff,0x16,0xd7,0x6c, 0x20,0xef,0x7c,0x91,0x9e,0x3a,0x75,0x09 }; camellia_setkey(&ctx,key_128,sizeof(key_128),&bulk_ops); camellia_encrypt(&ctx,scratch,plaintext); if(memcmp(scratch,ciphertext_128,sizeof(ciphertext_128))!=0) return "CAMELLIA-128 test encryption failed."; camellia_decrypt(&ctx,scratch,scratch); if(memcmp(scratch,plaintext,sizeof(plaintext))!=0) return "CAMELLIA-128 test decryption failed."; camellia_setkey(&ctx,key_192,sizeof(key_192),&bulk_ops); camellia_encrypt(&ctx,scratch,plaintext); if(memcmp(scratch,ciphertext_192,sizeof(ciphertext_192))!=0) return "CAMELLIA-192 test encryption failed."; camellia_decrypt(&ctx,scratch,scratch); if(memcmp(scratch,plaintext,sizeof(plaintext))!=0) return "CAMELLIA-192 test decryption failed."; camellia_setkey(&ctx,key_256,sizeof(key_256),&bulk_ops); camellia_encrypt(&ctx,scratch,plaintext); if(memcmp(scratch,ciphertext_256,sizeof(ciphertext_256))!=0) return "CAMELLIA-256 test encryption failed."; camellia_decrypt(&ctx,scratch,scratch); if(memcmp(scratch,plaintext,sizeof(plaintext))!=0) return "CAMELLIA-256 test decryption failed."; if ( (r = selftest_ctr_128 ()) ) return r; if ( (r = selftest_cbc_128 ()) ) return r; if ( (r = selftest_cfb_128 ()) ) return r; return NULL; } /* These oids are from , retrieved May 1, 2007. 
*/ static const gcry_cipher_oid_spec_t camellia128_oids[] = { {"1.2.392.200011.61.1.1.1.2", GCRY_CIPHER_MODE_CBC}, {"0.3.4401.5.3.1.9.1", GCRY_CIPHER_MODE_ECB}, {"0.3.4401.5.3.1.9.3", GCRY_CIPHER_MODE_OFB}, {"0.3.4401.5.3.1.9.4", GCRY_CIPHER_MODE_CFB}, { NULL } }; static const gcry_cipher_oid_spec_t camellia192_oids[] = { {"1.2.392.200011.61.1.1.1.3", GCRY_CIPHER_MODE_CBC}, {"0.3.4401.5.3.1.9.21", GCRY_CIPHER_MODE_ECB}, {"0.3.4401.5.3.1.9.23", GCRY_CIPHER_MODE_OFB}, {"0.3.4401.5.3.1.9.24", GCRY_CIPHER_MODE_CFB}, { NULL } }; static const gcry_cipher_oid_spec_t camellia256_oids[] = { {"1.2.392.200011.61.1.1.1.4", GCRY_CIPHER_MODE_CBC}, {"0.3.4401.5.3.1.9.41", GCRY_CIPHER_MODE_ECB}, {"0.3.4401.5.3.1.9.43", GCRY_CIPHER_MODE_OFB}, {"0.3.4401.5.3.1.9.44", GCRY_CIPHER_MODE_CFB}, { NULL } }; gcry_cipher_spec_t _gcry_cipher_spec_camellia128 = { GCRY_CIPHER_CAMELLIA128, {0, 0}, "CAMELLIA128",NULL,camellia128_oids,CAMELLIA_BLOCK_SIZE,128, sizeof(CAMELLIA_context),camellia_setkey,camellia_encrypt,camellia_decrypt }; gcry_cipher_spec_t _gcry_cipher_spec_camellia192 = { GCRY_CIPHER_CAMELLIA192, {0, 0}, "CAMELLIA192",NULL,camellia192_oids,CAMELLIA_BLOCK_SIZE,192, sizeof(CAMELLIA_context),camellia_setkey,camellia_encrypt,camellia_decrypt }; gcry_cipher_spec_t _gcry_cipher_spec_camellia256 = { GCRY_CIPHER_CAMELLIA256, {0, 0}, "CAMELLIA256",NULL,camellia256_oids,CAMELLIA_BLOCK_SIZE,256, sizeof(CAMELLIA_context),camellia_setkey,camellia_encrypt,camellia_decrypt }; diff --git a/cipher/chacha20-amd64-avx512.S b/cipher/chacha20-amd64-avx512.S index da24286e..8b4d7499 100644 --- a/cipher/chacha20-amd64-avx512.S +++ b/cipher/chacha20-amd64-avx512.S @@ -1,300 +1,300 @@ /* chacha20-amd64-avx512.S - AVX512 implementation of ChaCha20 cipher * * Copyright (C) 2022 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ /* * Based on D. J. Bernstein reference implementation at * http://cr.yp.to/chacha.html: * * chacha-regs.c version 20080118 * D. J. Bernstein * Public domain. 
*/ #ifdef __x86_64 #include #if defined(HAVE_GCC_INLINE_ASM_AVX512) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) .text #include "asm-common-amd64.h" /* register macros */ #define INPUT %rdi #define DST %rsi #define SRC %rdx #define NBLKS %rcx #define ROUND %eax /* vector registers */ #define X0 %zmm0 #define X1 %zmm1 #define X2 %zmm2 #define X3 %zmm3 #define X4 %zmm4 #define X5 %zmm5 #define X6 %zmm6 #define X7 %zmm7 #define X8 %zmm8 #define X9 %zmm9 #define X10 %zmm10 #define X11 %zmm11 #define X12 %zmm12 #define X13 %zmm13 #define X14 %zmm14 #define X15 %zmm15 #define TMP0 %zmm16 #define TMP1 %zmm17 #define COUNTER_ADD %zmm18 #define X12_SAVE %zmm19 #define X13_SAVE %zmm20 #define S0 %zmm21 #define S1 %zmm22 #define S2 %zmm23 #define S3 %zmm24 #define S4 %zmm25 #define S5 %zmm26 #define S6 %zmm27 #define S7 %zmm28 #define S8 %zmm29 #define S14 %zmm30 #define S15 %zmm31 /********************************************************************** helper macros **********************************************************************/ /* 4x4 32-bit integer matrix transpose */ #define transpose_4x4(x0,x1,x2,x3,t1,t2) \ vpunpckhdq x1, x0, t2; \ vpunpckldq x1, x0, x0; \ \ vpunpckldq x3, x2, t1; \ vpunpckhdq x3, x2, x2; \ \ vpunpckhqdq t1, x0, x1; \ vpunpcklqdq t1, x0, x0; \ \ vpunpckhqdq x2, t2, x3; \ vpunpcklqdq x2, t2, x2; /* 4x4 128-bit matrix transpose */ #define transpose_16byte_4x4(x0,x1,x2,x3,t1,t2) \ vshufi32x4 $0xee, x1, x0, t2; \ vshufi32x4 $0x44, x1, x0, x0; \ \ vshufi32x4 $0x44, x3, x2, t1; \ vshufi32x4 $0xee, x3, x2, x2; \ \ vshufi32x4 $0xdd, t1, x0, x1; \ vshufi32x4 $0x88, t1, x0, x0; \ \ vshufi32x4 $0xdd, x2, t2, x3; \ vshufi32x4 $0x88, x2, t2, x2; #define xor_src_dst_4x4(dst, src, offset, add, x0, x4, x8, x12) \ vpxord (offset + 0 * (add))(src), x0, x0; \ vpxord (offset + 1 * (add))(src), x4, x4; \ vpxord (offset + 2 * (add))(src), x8, x8; \ vpxord (offset + 3 * (add))(src), x12, x12; \ vmovdqu32 x0, (offset + 0 * (add))(dst); \ vmovdqu32 x4, (offset + 1 * (add))(dst); \ vmovdqu32 x8, (offset + 2 * (add))(dst); \ vmovdqu32 x12, (offset + 3 * (add))(dst); #define xor_src_dst(dst, src, offset, xreg) \ vpxord offset(src), xreg, xreg; \ vmovdqu32 xreg, offset(dst); #define clear_vec4(v0,v1,v2,v3) \ vpxord v0, v0, v0; \ vpxord v1, v1, v1; \ vpxord v2, v2, v2; \ vpxord v3, v3, v3; #define clear_zmm16_zmm31() \ clear_vec4(%xmm16, %xmm20, %xmm24, %xmm28); \ clear_vec4(%xmm17, %xmm21, %xmm25, %xmm29); \ clear_vec4(%xmm18, %xmm22, %xmm26, %xmm30); \ clear_vec4(%xmm19, %xmm23, %xmm27, %xmm31); /********************************************************************** 16-way chacha20 **********************************************************************/ #define ROTATE2(v1,v2,c) \ vprold $(c), v1, v1; \ vprold $(c), v2, v2; #define XOR(ds,s) \ vpxord s, ds, ds; #define PLUS(ds,s) \ vpaddd s, ds, ds; #define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2) \ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ ROTATE2(d1, d2, 16); \ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ ROTATE2(b1, b2, 12); \ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ ROTATE2(d1, d2, 8); \ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ ROTATE2(b1, b2, 7); .align 64 ELF(.type _gcry_chacha20_amd64_avx512_data,@object;) _gcry_chacha20_amd64_avx512_data: .Linc_counter: .byte 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 .Lone: .long 1,0,0,0 ELF(.size _gcry_chacha20_amd64_avx512_data,.-_gcry_chacha20_amd64_avx512_data) .align 16 .globl 
_gcry_chacha20_amd64_avx512_blocks16 ELF(.type _gcry_chacha20_amd64_avx512_blocks16,@function;) _gcry_chacha20_amd64_avx512_blocks16: /* input: * %rdi: input * %rsi: dst * %rdx: src * %rcx: nblks (multiple of 16) */ CFI_STARTPROC(); vpxord %xmm16, %xmm16, %xmm16; vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ vpmovzxbd .Linc_counter rRIP, COUNTER_ADD; /* Preload state */ vpbroadcastd (0 * 4)(INPUT), S0; vpbroadcastd (1 * 4)(INPUT), S1; vpbroadcastd (2 * 4)(INPUT), S2; vpbroadcastd (3 * 4)(INPUT), S3; vpbroadcastd (4 * 4)(INPUT), S4; vpbroadcastd (5 * 4)(INPUT), S5; vpbroadcastd (6 * 4)(INPUT), S6; vpbroadcastd (7 * 4)(INPUT), S7; vpbroadcastd (8 * 4)(INPUT), S8; vpbroadcastd (14 * 4)(INPUT), S14; vpbroadcastd (15 * 4)(INPUT), S15; .align 16 .Loop16: movl $20, ROUND; /* Construct counter vectors X12 and X13 */ vpbroadcastd (12 * 4)(INPUT), X12; vpbroadcastd (13 * 4)(INPUT), X13; vpaddd COUNTER_ADD, X12, X12; vpcmpud $6, X12, COUNTER_ADD, %k2; vpaddd .Lone rRIP {1to16}, X13, X13{%k2}; vmovdqa32 X12, X12_SAVE; vmovdqa32 X13, X13_SAVE; /* Load vectors */ vmovdqa32 S0, X0; vmovdqa32 S4, X4; vmovdqa32 S8, X8; vmovdqa32 S1, X1; vmovdqa32 S5, X5; vpbroadcastd (9 * 4)(INPUT), X9; QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13) vmovdqa32 S2, X2; vmovdqa32 S6, X6; vpbroadcastd (10 * 4)(INPUT), X10; vmovdqa32 S14, X14; vmovdqa32 S3, X3; vmovdqa32 S7, X7; vpbroadcastd (11 * 4)(INPUT), X11; vmovdqa32 S15, X15; /* Update counter */ addq $16, (12 * 4)(INPUT); jmp .Lround2_entry; .align 16 .Lround2: QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14) QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13) .Lround2_entry: subl $2, ROUND; QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15) QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12) jnz .Lround2; .Lround2_end: PLUS(X0, S0); PLUS(X1, S1); PLUS(X5, S5); PLUS(X6, S6); PLUS(X10, (10 * 4)(INPUT){1to16}); PLUS(X11, (11 * 4)(INPUT){1to16}); PLUS(X15, S15); PLUS(X12, X12_SAVE); QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14) PLUS(X2, S2); PLUS(X3, S3); PLUS(X4, S4); PLUS(X7, S7); transpose_4x4(X0, X1, X2, X3, TMP0, TMP1); transpose_4x4(X4, X5, X6, X7, TMP0, TMP1); PLUS(X8, S8); PLUS(X9, (9 * 4)(INPUT){1to16}); PLUS(X13, X13_SAVE); PLUS(X14, S14); transpose_4x4(X8, X9, X10, X11, TMP0, TMP1); transpose_4x4(X12, X13, X14, X15, TMP0, TMP1); transpose_16byte_4x4(X0, X4, X8, X12, TMP0, TMP1); xor_src_dst_4x4(DST, SRC, (64 * 0), (64 * 4), X0, X4, X8, X12); transpose_16byte_4x4(X1, X5, X9, X13, TMP0, TMP1); xor_src_dst_4x4(DST, SRC, (64 * 1), (64 * 4), X1, X5, X9, X13); transpose_16byte_4x4(X2, X6, X10, X14, TMP0, TMP1); xor_src_dst_4x4(DST, SRC, (64 * 2), (64 * 4), X2, X6, X10, X14); transpose_16byte_4x4(X3, X7, X11, X15, TMP0, TMP1); xor_src_dst_4x4(DST, SRC, (64 * 3), (64 * 4), X3, X7, X11, X15); subq $16, NBLKS; leaq (16 * 64)(SRC), SRC; leaq (16 * 64)(DST), DST; jnz .Loop16; /* clear the used vector registers */ clear_zmm16_zmm31(); - kmovd %eax, %k2; + kxord %k2, %k2, %k2; vzeroall; /* clears ZMM0-ZMM15 */ /* eax zeroed by round loop. 
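The counter-vector construction in .Loop16 above corresponds to the following scalar logic, where the vpcmpud $6 (unsigned greater-than) mask detects 32-bit wrap-around so a carry can propagate into state word 13. A hedged C sketch with made-up names:

#include <stdint.h>

/* Per-lane block counters for 16 parallel ChaCha20 states, mirroring
 * vpaddd COUNTER_ADD + vpcmpud $6 + the masked vpaddd of .Lone. */
static void
chacha_counter_lanes_sketch (const uint32_t state[16],
                             uint32_t x12[16], uint32_t x13[16])
{
  unsigned int i;

  for (i = 0; i < 16; i++)
    {
      x12[i] = state[12] + i;            /* may wrap modulo 2^32 */
      x13[i] = state[13] + (x12[i] < i); /* add the carry if it did */
    }
}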
*/ ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_chacha20_amd64_avx512_blocks16, .-_gcry_chacha20_amd64_avx512_blocks16;) #endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/ #endif /*__x86_64*/ diff --git a/cipher/poly1305-amd64-avx512.S b/cipher/poly1305-amd64-avx512.S index 48892777..72303e1e 100644 --- a/cipher/poly1305-amd64-avx512.S +++ b/cipher/poly1305-amd64-avx512.S @@ -1,1625 +1,1625 @@ /* ;; ;; Copyright (c) 2021-2022, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;; */ /* * From: * https://github.com/intel/intel-ipsec-mb/blob/f0cad21a644231c0f5d4af51f56061a5796343fb/lib/avx512/poly_fma_avx512.asm * * Conversion to GAS assembly and integration to libgcrypt * by Jussi Kivilinna */ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AVX512) #include "asm-common-amd64.h" .intel_syntax noprefix .text ELF(.type _gcry_poly1305_avx512_consts,@object) _gcry_poly1305_avx512_consts: .align 64 .Lmask_44: .quad 0xfffffffffff, 0xfffffffffff, 0xfffffffffff, 0xfffffffffff .quad 0xfffffffffff, 0xfffffffffff, 0xfffffffffff, 0xfffffffffff .align 64 .Lmask_42: .quad 0x3ffffffffff, 0x3ffffffffff, 0x3ffffffffff, 0x3ffffffffff .quad 0x3ffffffffff, 0x3ffffffffff, 0x3ffffffffff, 0x3ffffffffff .align 64 .Lhigh_bit: .quad 0x10000000000, 0x10000000000, 0x10000000000, 0x10000000000 .quad 0x10000000000, 0x10000000000, 0x10000000000, 0x10000000000 .Lbyte_len_to_mask_table: .short 0x0000, 0x0001, 0x0003, 0x0007 .short 0x000f, 0x001f, 0x003f, 0x007f .short 0x00ff, 0x01ff, 0x03ff, 0x07ff .short 0x0fff, 0x1fff, 0x3fff, 0x7fff .short 0xffff .align 64 .Lbyte64_len_to_mask_table: .quad 0x0000000000000000, 0x0000000000000001 .quad 0x0000000000000003, 0x0000000000000007 .quad 0x000000000000000f, 0x000000000000001f .quad 0x000000000000003f, 0x000000000000007f .quad 0x00000000000000ff, 0x00000000000001ff .quad 0x00000000000003ff, 0x00000000000007ff .quad 0x0000000000000fff, 0x0000000000001fff .quad 0x0000000000003fff, 0x0000000000007fff .quad 0x000000000000ffff, 0x000000000001ffff .quad 0x000000000003ffff, 0x000000000007ffff .quad 0x00000000000fffff, 0x00000000001fffff .quad 0x00000000003fffff, 0x00000000007fffff .quad 0x0000000000ffffff, 0x0000000001ffffff .quad 0x0000000003ffffff, 0x0000000007ffffff .quad 0x000000000fffffff, 0x000000001fffffff .quad 0x000000003fffffff, 0x000000007fffffff .quad 0x00000000ffffffff, 0x00000001ffffffff .quad 0x00000003ffffffff, 0x00000007ffffffff .quad 0x0000000fffffffff, 0x0000001fffffffff .quad 0x0000003fffffffff, 0x0000007fffffffff .quad 0x000000ffffffffff, 0x000001ffffffffff .quad 0x000003ffffffffff, 0x000007ffffffffff .quad 0x00000fffffffffff, 0x00001fffffffffff .quad 0x00003fffffffffff, 0x00007fffffffffff .quad 0x0000ffffffffffff, 0x0001ffffffffffff .quad 0x0003ffffffffffff, 0x0007ffffffffffff .quad 0x000fffffffffffff, 0x001fffffffffffff .quad 0x003fffffffffffff, 0x007fffffffffffff .quad 0x00ffffffffffffff, 0x01ffffffffffffff .quad 0x03ffffffffffffff, 0x07ffffffffffffff .quad 0x0fffffffffffffff, 0x1fffffffffffffff .quad 0x3fffffffffffffff, 0x7fffffffffffffff .quad 0xffffffffffffffff .Lqword_high_bit_mask: .short 0, 0x1, 0x5, 0x15, 0x55, 0x57, 0x5f, 0x7f, 0xff ELF(.size _gcry_poly1305_avx512_consts,.-_gcry_poly1305_avx512_consts) #define raxd eax #define rbxd ebx #define rcxd ecx #define rdxd edx #define rsid esi #define rdid edi #define rbpd ebp #define rspd esp #define __DWORD(X) X##d #define DWORD(R) __DWORD(R) #define arg1 rdi #define arg2 rsi #define arg3 rdx #define arg4 rcx #define job arg1 #define gp1 rsi #define gp2 rcx /* ;; don't use rdx and rax - they are needed for multiply operation */ #define gp3 rbp #define gp4 r8 #define gp5 r9 #define gp6 r10 #define gp7 r11 #define gp8 r12 #define gp9 r13 #define gp10 r14 #define gp11 r15 #define len gp11 #define msg gp10 #define POLY1305_BLOCK_SIZE 16 #define STACK_r_save 0 #define STACK_r_save_size (6 * 64) #define STACK_gpr_save (STACK_r_save + 
STACK_r_save_size) #define STACK_gpr_save_size (8 * 8) #define STACK_rsp_save (STACK_gpr_save + STACK_gpr_save_size) #define STACK_rsp_save_size (1 * 8) #define STACK_SIZE (STACK_rsp_save + STACK_rsp_save_size) #define A2_ZERO(...) /**/ #define A2_ZERO_INVERT(...) __VA_ARGS__ #define A2_NOT_ZERO(...) __VA_ARGS__ #define A2_NOT_ZERO_INVERT(...) /**/ #define clear_zmm(vec) vpxord vec, vec, vec /* ;; ============================================================================= ;; ============================================================================= ;; Computes hash for message length being multiple of block size ;; ============================================================================= ;; Combining 64-bit x 64-bit multiplication with reduction steps ;; ;; NOTES: ;; 1) A2 here is only two bits so anything above is subject of reduction. ;; Constant C1 = R1 + (R1 >> 2) simplifies multiply with less operations ;; 2) Magic 5x comes from mod 2^130-5 property and incorporating ;; reduction into multiply phase. ;; See "Cheating at modular arithmetic" and "Poly1305's prime: 2^130 - 5" ;; paragraphs at https://loup-vaillant.fr/tutorials/poly1305-design for more details. ;; ;; Flow of the code below is as follows: ;; ;; A2 A1 A0 ;; x R1 R0 ;; ----------------------------- ;; A2×R0 A1×R0 A0×R0 ;; + A0×R1 ;; + 5xA2xR1 5xA1xR1 ;; ----------------------------- ;; [0|L2L] [L1H|L1L] [L0H|L0L] ;; ;; Registers: T3:T2 T1:A0 ;; ;; Completing the multiply and adding (with carry) 3x128-bit limbs into ;; 192-bits again (3x64-bits): ;; A0 = L0L ;; A1 = L0H + L1L ;; T3 = L1H + L2L ; A0 [in/out] GPR with accumulator bits 63:0 ; A1 [in/out] GPR with accumulator bits 127:64 ; A2 [in/out] GPR with accumulator bits 195:128 ; R0 [in] GPR with R constant bits 63:0 ; R1 [in] GPR with R constant bits 127:64 ; C1 [in] C1 = R1 + (R1 >> 2) ; T1 [clobbered] GPR register ; T2 [clobbered] GPR register ; T3 [clobbered] GPR register ; GP_RAX [clobbered] RAX register ; GP_RDX [clobbered] RDX register ; IF_A2 [in] Used if input A2 is not 0 */ #define POLY1305_MUL_REDUCE(A0, A1, A2, R0, R1, C1, T1, T2, T3, GP_RAX, GP_RDX, IF_A2) \ /* T3:T2 = (A0 * R1) */ \ mov GP_RAX, R1; \ mul A0; \ mov T2, GP_RAX; \ mov GP_RAX, R0; \ mov T3, GP_RDX; \ \ /* T1:A0 = (A0 * R0) */ \ mul A0; \ mov A0, GP_RAX; /* A0 not used in other operations */ \ mov GP_RAX, R0; \ mov T1, GP_RDX; \ \ /* T3:T2 += (A1 * R0) */ \ mul A1; \ add T2, GP_RAX; \ mov GP_RAX, C1; \ adc T3, GP_RDX; \ \ /* T1:A0 += (A1 * R1x5) */ \ mul A1; \ IF_A2(mov A1, A2); /* use A1 for A2 */ \ add A0, GP_RAX; \ adc T1, GP_RDX; \ \ /* NOTE: A2 is clamped to 2-bits, */ \ /* R1/R0 is clamped to 60-bits, */ \ /* their product is less than 2^64. */ \ \ IF_A2(/* T3:T2 += (A2 * R1x5) */); \ IF_A2(imul A1, C1); \ IF_A2(add T2, A1); \ IF_A2(mov A1, T1); /* T1:A0 => A1:A0 */ \ IF_A2(adc T3, 0); \ \ IF_A2(/* T3:A1 += (A2 * R0) */); \ IF_A2(imul A2, R0); \ IF_A2(add A1, T2); \ IF_A2(adc T3, A2); \ \ IF_A2##_INVERT(/* If A2 == 0, just move and add T1-T2 to A1 */); \ IF_A2##_INVERT(mov A1, T1); \ IF_A2##_INVERT(add A1, T2); \ IF_A2##_INVERT(adc T3, 0); \ \ /* At this point, 3 64-bit limbs are in T3:A1:A0 */ \ /* T3 can span over more than 2 bits so final partial reduction step is needed. 
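   Why the step below suffices: the value is T3*2^128 + A1*2^64 + A0 and    \
   2^130 == 5 (mod 2^130 - 5), so T3*2^128 == (T3 & 3)*2^128 + 5*(T3 >> 2). \
   Since 5*(T3 >> 2) = (T3 & ~3) + (T3 >> 2), that sum is the k added below. \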
*/ \ \ /* Partial reduction (just to fit into 130 bits) */ \ /* A2 = T3 & 3 */ \ /* k = (T3 & ~3) + (T3 >> 2) */ \ /* Y x4 + Y x1 */ \ /* A2:A1:A0 += k */ \ \ /* Result will be in A2:A1:A0 */ \ mov T1, T3; \ mov DWORD(A2), DWORD(T3); \ and T1, ~3; \ shr T3, 2; \ and DWORD(A2), 3; \ add T1, T3; \ \ /* A2:A1:A0 += k (kept in T1) */ \ add A0, T1; \ adc A1, 0; \ adc DWORD(A2), 0 /* ;; ============================================================================= ;; ============================================================================= ;; Computes hash for 8 16-byte message blocks, ;; and adds new message blocks to accumulator. ;; ;; It first multiplies all 8 blocks with powers of R: ;; ;; a2 a1 a0 ;; × b2 b1 b0 ;; --------------------------------------- ;; a2×b0 a1×b0 a0×b0 ;; + a1×b1 a0×b1 5×a2×b1 ;; + a0×b2 5×a2×b2 5×a1×b2 ;; --------------------------------------- ;; p2 p1 p0 ;; ;; Then, it propagates the carry (higher bits after bit 43) from lower limbs into higher limbs, ;; multiplying by 5 in case of the carry of p2. ;; ;A0 [in/out] ZMM register containing 1st 44-bit limb of the 8 blocks ;A1 [in/out] ZMM register containing 2nd 44-bit limb of the 8 blocks ;A2 [in/out] ZMM register containing 3rd 44-bit limb of the 8 blocks ;R0 [in] ZMM register (R0) to include the 1st limb of R ;R1 [in] ZMM register (R1) to include the 2nd limb of R ;R2 [in] ZMM register (R2) to include the 3rd limb of R ;R1P [in] ZMM register (R1') to include the 2nd limb of R (multiplied by 5) ;R2P [in] ZMM register (R2') to include the 3rd limb of R (multiplied by 5) ;P0_L [clobbered] ZMM register to contain p[0] of the 8 blocks ;P0_H [clobbered] ZMM register to contain p[0] of the 8 blocks ;P1_L [clobbered] ZMM register to contain p[1] of the 8 blocks ;P1_H [clobbered] ZMM register to contain p[1] of the 8 blocks ;P2_L [clobbered] ZMM register to contain p[2] of the 8 blocks ;P2_H [clobbered] ZMM register to contain p[2] of the 8 blocks ;ZTMP1 [clobbered] Temporary ZMM register */ #define POLY1305_MUL_REDUCE_VEC(A0, A1, A2, R0, R1, R2, R1P, R2P, P0_L, P0_H, \ P1_L, P1_H, P2_L, P2_H, ZTMP1) \ /* ;; Reset accumulator */ \ vpxorq P0_L, P0_L, P0_L; \ vpxorq P0_H, P0_H, P0_H; \ vpxorq P1_L, P1_L, P1_L; \ vpxorq P1_H, P1_H, P1_H; \ vpxorq P2_L, P2_L, P2_L; \ vpxorq P2_H, P2_H, P2_H; \ \ /* ; Reset accumulator and calculate products */ \ vpmadd52luq P0_L, A2, R1P; \ vpmadd52huq P0_H, A2, R1P; \ vpmadd52luq P1_L, A2, R2P; \ vpmadd52huq P1_H, A2, R2P; \ vpmadd52luq P2_L, A2, R0; \ vpmadd52huq P2_H, A2, R0; \ \ vpmadd52luq P1_L, A0, R1; \ vpmadd52huq P1_H, A0, R1; \ vpmadd52luq P2_L, A0, R2; \ vpmadd52huq P2_H, A0, R2; \ vpmadd52luq P0_L, A0, R0; \ vpmadd52huq P0_H, A0, R0; \ \ vpmadd52luq P0_L, A1, R2P; \ vpmadd52huq P0_H, A1, R2P; \ vpmadd52luq P1_L, A1, R0; \ vpmadd52huq P1_H, A1, R0; \ vpmadd52luq P2_L, A1, R1; \ vpmadd52huq P2_H, A1, R1; \ \ /* ; Carry propagation (first pass) */ \ vpsrlq ZTMP1, P0_L, 44; \ vpandq A0, P0_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \ vpsllq P0_H, P0_H, 8; \ vpaddq P0_H, P0_H, ZTMP1; \ vpaddq P1_L, P1_L, P0_H; \ vpandq A1, P1_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \ vpsrlq ZTMP1, P1_L, 44; \ vpsllq P1_H, P1_H, 8; \ vpaddq P1_H, P1_H, ZTMP1; \ vpaddq P2_L, P2_L, P1_H; \ vpandq A2, P2_L, [.Lmask_42 ADD_RIP]; /* ; Clear top 22 bits */ \ vpsrlq ZTMP1, P2_L, 42; \ vpsllq P2_H, P2_H, 10; \ vpaddq P2_H, P2_H, ZTMP1; \ \ /* ; Carry propagation (second pass) */ \ \ /* ; Multiply by 5 the highest bits (above 130 bits) */ \ vpaddq A0, A0, P2_H; \ vpsllq P2_H, P2_H, 2; \ vpaddq 
A0, A0, P2_H; \ vpsrlq ZTMP1, A0, 44; \ vpandq A0, A0, [.Lmask_44 ADD_RIP]; \ vpaddq A1, A1, ZTMP1; /* ;; ============================================================================= ;; ============================================================================= ;; Computes hash for 16 16-byte message blocks, ;; and adds new message blocks to accumulator, ;; interleaving this computation with the loading and splatting ;; of new data. ;; ;; It first multiplies all 16 blocks with powers of R (8 blocks from A0-A2 ;; and 8 blocks from B0-B2, multiplied by R0-R2) ;; ;; a2 a1 a0 ;; × b2 b1 b0 ;; --------------------------------------- ;; a2×b0 a1×b0 a0×b0 ;; + a1×b1 a0×b1 5×a2×b1 ;; + a0×b2 5×a2×b2 5×a1×b2 ;; --------------------------------------- ;; p2 p1 p0 ;; ;; Then, it propagates the carry (higher bits after bit 43) ;; from lower limbs into higher limbs, ;; multiplying by 5 in case of the carry of p2, and adds ;; the results to A0-A2 and B0-B2. ;; ;; ============================================================================= ;A0 [in/out] ZMM register containing 1st 44-bit limb of blocks 1-8 ;A1 [in/out] ZMM register containing 2nd 44-bit limb of blocks 1-8 ;A2 [in/out] ZMM register containing 3rd 44-bit limb of blocks 1-8 ;B0 [in/out] ZMM register containing 1st 44-bit limb of blocks 9-16 ;B1 [in/out] ZMM register containing 2nd 44-bit limb of blocks 9-16 ;B2 [in/out] ZMM register containing 3rd 44-bit limb of blocks 9-16 ;R0 [in] ZMM register (R0) to include the 1st limb of R ;R1 [in] ZMM register (R1) to include the 2nd limb of R ;R2 [in] ZMM register (R2) to include the 3rd limb of R ;R1P [in] ZMM register (R1') to include the 2nd limb of R (multiplied by 5) ;R2P [in] ZMM register (R2') to include the 3rd limb of R (multiplied by 5) ;P0_L [clobbered] ZMM register to contain p[0] of the 8 blocks 1-8 ;P0_H [clobbered] ZMM register to contain p[0] of the 8 blocks 1-8 ;P1_L [clobbered] ZMM register to contain p[1] of the 8 blocks 1-8 ;P1_H [clobbered] ZMM register to contain p[1] of the 8 blocks 1-8 ;P2_L [clobbered] ZMM register to contain p[2] of the 8 blocks 1-8 ;P2_H [clobbered] ZMM register to contain p[2] of the 8 blocks 1-8 ;Q0_L [clobbered] ZMM register to contain p[0] of the 8 blocks 9-16 ;Q0_H [clobbered] ZMM register to contain p[0] of the 8 blocks 9-16 ;Q1_L [clobbered] ZMM register to contain p[1] of the 8 blocks 9-16 ;Q1_H [clobbered] ZMM register to contain p[1] of the 8 blocks 9-16 ;Q2_L [clobbered] ZMM register to contain p[2] of the 8 blocks 9-16 ;Q2_H [clobbered] ZMM register to contain p[2] of the 8 blocks 9-16 ;ZTMP1 [clobbered] Temporary ZMM register ;ZTMP2 [clobbered] Temporary ZMM register ;ZTMP3 [clobbered] Temporary ZMM register ;ZTMP4 [clobbered] Temporary ZMM register ;ZTMP5 [clobbered] Temporary ZMM register ;ZTMP6 [clobbered] Temporary ZMM register ;ZTMP7 [clobbered] Temporary ZMM register ;ZTMP8 [clobbered] Temporary ZMM register ;ZTMP9 [clobbered] Temporary ZMM register ;MSG [in/out] Pointer to message ;LEN [in/out] Length left of message */ #define POLY1305_MSG_MUL_REDUCE_VEC16(A0, A1, A2, B0, B1, B2, R0, R1, R2, R1P, \ R2P, P0_L, P0_H, P1_L, P1_H, P2_L, P2_H, \ Q0_L, Q0_H, Q1_L, Q1_H, Q2_L, Q2_H, \ ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, \ ZTMP6, ZTMP7, ZTMP8, ZTMP9, MSG, LEN) \ /* ;; Reset accumulator */ \ vpxorq P0_L, P0_L, P0_L; \ vpxorq P0_H, P0_H, P0_H; \ vpxorq P1_L, P1_L, P1_L; \ vpxorq P1_H, P1_H, P1_H; \ vpxorq P2_L, P2_L, P2_L; \ vpxorq P2_H, P2_H, P2_H; \ vpxorq Q0_L, Q0_L, Q0_L; \ vpxorq Q0_H, Q0_H, Q0_H; \ vpxorq Q1_L, Q1_L, Q1_L; \ vpxorq Q1_H, 
Q1_H, Q1_H; \ vpxorq Q2_L, Q2_L, Q2_L; \ vpxorq Q2_H, Q2_H, Q2_H; \ \ /* ;; This code interleaves hash computation with input loading/splatting */ \ \ /* ; Calculate products */ \ vpmadd52luq P0_L, A2, R1P; \ vpmadd52huq P0_H, A2, R1P; \ /* ;; input loading of new blocks */ \ add MSG, POLY1305_BLOCK_SIZE*16; \ sub LEN, POLY1305_BLOCK_SIZE*16; \ \ vpmadd52luq Q0_L, B2, R1P; \ vpmadd52huq Q0_H, B2, R1P; \ \ vpmadd52luq P1_L, A2, R2P; \ vpmadd52huq P1_H, A2, R2P; \ /* ; Load next block of data (128 bytes) */ \ vmovdqu64 ZTMP5, [MSG]; \ vmovdqu64 ZTMP2, [MSG + 64]; \ \ vpmadd52luq Q1_L, B2, R2P; \ vpmadd52huq Q1_H, B2, R2P; \ \ /* ; Interleave new blocks of data */ \ vpunpckhqdq ZTMP3, ZTMP5, ZTMP2; \ vpunpcklqdq ZTMP5, ZTMP5, ZTMP2; \ \ vpmadd52luq P0_L, A0, R0; \ vpmadd52huq P0_H, A0, R0; \ /* ; Highest 42-bit limbs of new blocks */ \ vpsrlq ZTMP6, ZTMP3, 24; \ vporq ZTMP6, ZTMP6, [.Lhigh_bit ADD_RIP]; /* ; Add 2^128 to all 8 final qwords of the message */ \ \ vpmadd52luq Q0_L, B0, R0; \ vpmadd52huq Q0_H, B0, R0; \ \ /* ; Middle 44-bit limbs of new blocks */ \ vpsrlq ZTMP2, ZTMP5, 44; \ vpsllq ZTMP4, ZTMP3, 20; \ \ vpmadd52luq P2_L, A2, R0; \ vpmadd52huq P2_H, A2, R0; \ vpternlogq ZTMP2, ZTMP4, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \ \ /* ; Lowest 44-bit limbs of new blocks */ \ vpandq ZTMP5, ZTMP5, [.Lmask_44 ADD_RIP]; \ \ vpmadd52luq Q2_L, B2, R0; \ vpmadd52huq Q2_H, B2, R0; \ \ /* ; Load next block of data (128 bytes) */ \ vmovdqu64 ZTMP8, [MSG + 64*2]; \ vmovdqu64 ZTMP9, [MSG + 64*3]; \ \ vpmadd52luq P1_L, A0, R1; \ vpmadd52huq P1_H, A0, R1; \ /* ; Interleave new blocks of data */ \ vpunpckhqdq ZTMP3, ZTMP8, ZTMP9; \ vpunpcklqdq ZTMP8, ZTMP8, ZTMP9; \ \ vpmadd52luq Q1_L, B0, R1; \ vpmadd52huq Q1_H, B0, R1; \ \ /* ; Highest 42-bit limbs of new blocks */ \ vpsrlq ZTMP7, ZTMP3, 24; \ vporq ZTMP7, ZTMP7, [.Lhigh_bit ADD_RIP]; /* ; Add 2^128 to all 8 final qwords of the message */ \ \ vpmadd52luq P0_L, A1, R2P; \ vpmadd52huq P0_H, A1, R2P; \ \ /* ; Middle 44-bit limbs of new blocks */ \ vpsrlq ZTMP9, ZTMP8, 44; \ vpsllq ZTMP4, ZTMP3, 20; \ \ vpmadd52luq Q0_L, B1, R2P; \ vpmadd52huq Q0_H, B1, R2P; \ \ vpternlogq ZTMP9, ZTMP4, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \ \ /* ; Lowest 44-bit limbs of new blocks */ \ vpandq ZTMP8, ZTMP8, [.Lmask_44 ADD_RIP]; \ \ vpmadd52luq P2_L, A0, R2; \ vpmadd52huq P2_H, A0, R2; \ /* ; Carry propagation (first pass) */ \ vpsrlq ZTMP1, P0_L, 44; \ vpsllq P0_H, P0_H, 8; \ vpmadd52luq Q2_L, B0, R2; \ vpmadd52huq Q2_H, B0, R2; \ \ vpsrlq ZTMP3, Q0_L, 44; \ vpsllq Q0_H, Q0_H, 8; \ \ vpmadd52luq P1_L, A1, R0; \ vpmadd52huq P1_H, A1, R0; \ /* ; Carry propagation (first pass) - continue */ \ vpandq A0, P0_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \ vpaddq P0_H, P0_H, ZTMP1; \ vpmadd52luq Q1_L, B1, R0; \ vpmadd52huq Q1_H, B1, R0; \ \ vpandq B0, Q0_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \ vpaddq Q0_H, Q0_H, ZTMP3; \ \ vpmadd52luq P2_L, A1, R1; \ vpmadd52huq P2_H, A1, R1; \ /* ; Carry propagation (first pass) - continue */ \ vpaddq P1_L, P1_L, P0_H; \ vpsllq P1_H, P1_H, 8; \ vpsrlq ZTMP1, P1_L, 44; \ vpmadd52luq Q2_L, B1, R1; \ vpmadd52huq Q2_H, B1, R1; \ \ vpandq A1, P1_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \ vpaddq Q1_L, Q1_L, Q0_H; \ vpsllq Q1_H, Q1_H, 8; \ vpsrlq ZTMP3, Q1_L, 44; \ vpandq B1, Q1_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \ \ vpaddq P2_L, P2_L, P1_H; /* ; P2_L += P1_H + P1_L[63:44] */ \ vpaddq P2_L, P2_L, ZTMP1; \ vpandq A2, P2_L, [.Lmask_42 ADD_RIP]; /* ; Clear top 22 bits */ \ vpaddq A2, A2, 
ZTMP6; /* ; Add highest bits from new blocks to accumulator */ \ vpsrlq ZTMP1, P2_L, 42; \ vpsllq P2_H, P2_H, 10; \ vpaddq P2_H, P2_H, ZTMP1; \ \ vpaddq Q2_L, Q2_L, Q1_H; /* ; Q2_L += P1_H + P1_L[63:44] */ \ vpaddq Q2_L, Q2_L, ZTMP3; \ vpandq B2, Q2_L, [.Lmask_42 ADD_RIP]; /* ; Clear top 22 bits */ \ vpaddq B2, B2, ZTMP7; /* ; Add highest bits from new blocks to accumulator */ \ vpsrlq ZTMP3, Q2_L, 42; \ vpsllq Q2_H, Q2_H, 10; \ vpaddq Q2_H, Q2_H, ZTMP3; \ \ /* ; Carry propagation (second pass) */ \ /* ; Multiply by 5 the highest bits (above 130 bits) */ \ vpaddq A0, A0, P2_H; \ vpsllq P2_H, P2_H, 2; \ vpaddq A0, A0, P2_H; \ vpaddq B0, B0, Q2_H; \ vpsllq Q2_H, Q2_H, 2; \ vpaddq B0, B0, Q2_H; \ \ vpsrlq ZTMP1, A0, 44; \ vpandq A0, A0, [.Lmask_44 ADD_RIP]; \ vpaddq A0, A0, ZTMP5; /* ; Add low 42-bit bits from new blocks to accumulator */ \ vpaddq A1, A1, ZTMP2; /* ; Add medium 42-bit bits from new blocks to accumulator */ \ vpaddq A1, A1, ZTMP1; \ vpsrlq ZTMP3, B0, 44; \ vpandq B0, B0, [.Lmask_44 ADD_RIP]; \ vpaddq B0, B0, ZTMP8; /* ; Add low 42-bit bits from new blocks to accumulator */ \ vpaddq B1, B1, ZTMP9; /* ; Add medium 42-bit bits from new blocks to accumulator */ \ vpaddq B1, B1, ZTMP3 /* ;; ============================================================================= ;; ============================================================================= ;; Computes hash for 16 16-byte message blocks. ;; ;; It first multiplies all 16 blocks with powers of R (8 blocks from A0-A2 ;; and 8 blocks from B0-B2, multiplied by R0-R2 and S0-S2) ;; ;; ;; a2 a1 a0 ;; × b2 b1 b0 ;; --------------------------------------- ;; a2×b0 a1×b0 a0×b0 ;; + a1×b1 a0×b1 5×a2×b1 ;; + a0×b2 5×a2×b2 5×a1×b2 ;; --------------------------------------- ;; p2 p1 p0 ;; ;; Then, it propagates the carry (higher bits after bit 43) from lower limbs into higher limbs, ;; multiplying by 5 in case of the carry of p2. 
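;; Note on the primed constants: the limbs sit at bit positions 0, 44 and 88, so a
;; wrapped cross product such as a2*b1 has weight 2^(88+44) = 2^132 = 4*2^130 == 4*5
;; (mod 2^130-5); this is why R1P/R2P (and S1P/S2P) hold 4*5*R1 and 4*5*R2.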
;; ;; ============================================================================= ;A0 [in/out] ZMM register containing 1st 44-bit limb of the 8 blocks ;A1 [in/out] ZMM register containing 2nd 44-bit limb of the 8 blocks ;A2 [in/out] ZMM register containing 3rd 44-bit limb of the 8 blocks ;B0 [in/out] ZMM register containing 1st 44-bit limb of the 8 blocks ;B1 [in/out] ZMM register containing 2nd 44-bit limb of the 8 blocks ;B2 [in/out] ZMM register containing 3rd 44-bit limb of the 8 blocks ;R0 [in] ZMM register (R0) to include the 1st limb in IDX ;R1 [in] ZMM register (R1) to include the 2nd limb in IDX ;R2 [in] ZMM register (R2) to include the 3rd limb in IDX ;R1P [in] ZMM register (R1') to include the 2nd limb (multiplied by 5) in IDX ;R2P [in] ZMM register (R2') to include the 3rd limb (multiplied by 5) in IDX ;S0 [in] ZMM register (R0) to include the 1st limb in IDX ;S1 [in] ZMM register (R1) to include the 2nd limb in IDX ;S2 [in] ZMM register (R2) to include the 3rd limb in IDX ;S1P [in] ZMM register (R1') to include the 2nd limb (multiplied by 5) in IDX ;S2P [in] ZMM register (R2') to include the 3rd limb (multiplied by 5) in IDX ;P0_L [clobbered] ZMM register to contain p[0] of the 8 blocks ;P0_H [clobbered] ZMM register to contain p[0] of the 8 blocks ;P1_L [clobbered] ZMM register to contain p[1] of the 8 blocks ;P1_H [clobbered] ZMM register to contain p[1] of the 8 blocks ;P2_L [clobbered] ZMM register to contain p[2] of the 8 blocks ;P2_H [clobbered] ZMM register to contain p[2] of the 8 blocks ;Q0_L [clobbered] ZMM register to contain p[0] of the 8 blocks ;Q0_H [clobbered] ZMM register to contain p[0] of the 8 blocks ;Q1_L [clobbered] ZMM register to contain p[1] of the 8 blocks ;Q1_H [clobbered] ZMM register to contain p[1] of the 8 blocks ;Q2_L [clobbered] ZMM register to contain p[2] of the 8 blocks ;Q2_H [clobbered] ZMM register to contain p[2] of the 8 blocks ;ZTMP1 [clobbered] Temporary ZMM register ;ZTMP2 [clobbered] Temporary ZMM register */ #define POLY1305_MUL_REDUCE_VEC16(A0, A1, A2, B0, B1, B2, R0, R1, R2, R1P, R2P,\ S0, S1, S2, S1P, S2P, P0_L, P0_H, P1_L, P1_H,\ P2_L, P2_H, Q0_L, Q0_H, Q1_L, Q1_H, Q2_L,\ Q2_H, ZTMP1, ZTMP2) \ /* ;; Reset accumulator */ \ vpxorq P0_L, P0_L, P0_L; \ vpxorq P0_H, P0_H, P0_H; \ vpxorq P1_L, P1_L, P1_L; \ vpxorq P1_H, P1_H, P1_H; \ vpxorq P2_L, P2_L, P2_L; \ vpxorq P2_H, P2_H, P2_H; \ vpxorq Q0_L, Q0_L, Q0_L; \ vpxorq Q0_H, Q0_H, Q0_H; \ vpxorq Q1_L, Q1_L, Q1_L; \ vpxorq Q1_H, Q1_H, Q1_H; \ vpxorq Q2_L, Q2_L, Q2_L; \ vpxorq Q2_H, Q2_H, Q2_H; \ \ /* ;; This code interleaves hash computation with input loading/splatting */ \ \ /* ; Calculate products */ \ vpmadd52luq P0_L, A2, R1P; \ vpmadd52huq P0_H, A2, R1P; \ \ vpmadd52luq Q0_L, B2, S1P; \ vpmadd52huq Q0_H, B2, S1P; \ \ vpmadd52luq P1_L, A2, R2P; \ vpmadd52huq P1_H, A2, R2P; \ \ vpmadd52luq Q1_L, B2, S2P; \ vpmadd52huq Q1_H, B2, S2P; \ \ vpmadd52luq P0_L, A0, R0; \ vpmadd52huq P0_H, A0, R0; \ \ vpmadd52luq Q0_L, B0, S0; \ vpmadd52huq Q0_H, B0, S0; \ \ vpmadd52luq P2_L, A2, R0; \ vpmadd52huq P2_H, A2, R0; \ vpmadd52luq Q2_L, B2, S0; \ vpmadd52huq Q2_H, B2, S0; \ \ vpmadd52luq P1_L, A0, R1; \ vpmadd52huq P1_H, A0, R1; \ vpmadd52luq Q1_L, B0, S1; \ vpmadd52huq Q1_H, B0, S1; \ \ vpmadd52luq P0_L, A1, R2P; \ vpmadd52huq P0_H, A1, R2P; \ \ vpmadd52luq Q0_L, B1, S2P; \ vpmadd52huq Q0_H, B1, S2P; \ \ vpmadd52luq P2_L, A0, R2; \ vpmadd52huq P2_H, A0, R2; \ \ vpmadd52luq Q2_L, B0, S2; \ vpmadd52huq Q2_H, B0, S2; \ \ /* ; Carry propagation (first pass) */ \ vpsrlq ZTMP1, P0_L, 44; \ vpsllq 
P0_H, P0_H, 8; \ vpsrlq ZTMP2, Q0_L, 44; \ vpsllq Q0_H, Q0_H, 8; \ \ vpmadd52luq P1_L, A1, R0; \ vpmadd52huq P1_H, A1, R0; \ vpmadd52luq Q1_L, B1, S0; \ vpmadd52huq Q1_H, B1, S0; \ \ /* ; Carry propagation (first pass) - continue */ \ vpandq A0, P0_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \ vpaddq P0_H, P0_H, ZTMP1; \ vpandq B0, Q0_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \ vpaddq Q0_H, Q0_H, ZTMP2; \ \ vpmadd52luq P2_L, A1, R1; \ vpmadd52huq P2_H, A1, R1; \ vpmadd52luq Q2_L, B1, S1; \ vpmadd52huq Q2_H, B1, S1; \ \ /* ; Carry propagation (first pass) - continue */ \ vpaddq P1_L, P1_L, P0_H; \ vpsllq P1_H, P1_H, 8; \ vpsrlq ZTMP1, P1_L, 44; \ vpandq A1, P1_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \ vpaddq Q1_L, Q1_L, Q0_H; \ vpsllq Q1_H, Q1_H, 8; \ vpsrlq ZTMP2, Q1_L, 44; \ vpandq B1, Q1_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \ \ vpaddq P2_L, P2_L, P1_H; /* ; P2_L += P1_H + P1_L[63:44] */ \ vpaddq P2_L, P2_L, ZTMP1; \ vpandq A2, P2_L, [.Lmask_42 ADD_RIP]; /* ; Clear top 22 bits */ \ vpsrlq ZTMP1, P2_L, 42; \ vpsllq P2_H, P2_H, 10; \ vpaddq P2_H, P2_H, ZTMP1; \ \ vpaddq Q2_L, Q2_L, Q1_H; /* ; Q2_L += P1_H + P1_L[63:44] */ \ vpaddq Q2_L, Q2_L, ZTMP2; \ vpandq B2, Q2_L, [.Lmask_42 ADD_RIP]; /* ; Clear top 22 bits */ \ vpsrlq ZTMP2, Q2_L, 42; \ vpsllq Q2_H, Q2_H, 10; \ vpaddq Q2_H, Q2_H, ZTMP2; \ \ /* ; Carry propagation (second pass) */ \ /* ; Multiply by 5 the highest bits (above 130 bits) */ \ vpaddq A0, A0, P2_H; \ vpsllq P2_H, P2_H, 2; \ vpaddq A0, A0, P2_H; \ vpaddq B0, B0, Q2_H; \ vpsllq Q2_H, Q2_H, 2; \ vpaddq B0, B0, Q2_H; \ \ vpsrlq ZTMP1, A0, 44; \ vpandq A0, A0, [.Lmask_44 ADD_RIP]; \ vpaddq A1, A1, ZTMP1; \ vpsrlq ZTMP2, B0, 44; \ vpandq B0, B0, [.Lmask_44 ADD_RIP]; \ vpaddq B1, B1, ZTMP2; /* ;; ============================================================================= ;; ============================================================================= ;; Shuffle data blocks, so they match the right power of R. ;; Powers of R are in this order: R^8 R^4 R^7 R^3 R^6 R^2 R^5 R ;; Data blocks are coming in this order: A0 A4 A1 A5 A2 A6 A3 A7 ;; Generally the computation is: A0*R^8 + A1*R^7 + A2*R^6 + A3*R^5 + ;; A4*R^4 + A5*R^3 + A6*R^2 + A7*R ;; When there are less data blocks, less powers of R are used, so data needs to ;; be shuffled. 
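;; (SHUFFLE_DATA_BLOCKS_GENERIC below swaps the two qwords inside the 128-bit lanes
;; selected by the k-mask, using vpshufd with imm 0x4E, and then rotates whole
;; 128-bit lanes with vshufi64x2, so each remaining block meets its matching power.)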
Example: if 4 blocks are left, only A0-A3 are available and only ;; R-R^4 are used (A0*R^4 + A1*R^3 + A2*R^2 + A3*R), so A0-A3 need to be shifted ;; ============================================================================= ;A_L [in/out] 0-43 bits of input data ;A_M [in/out] 44-87 bits of input data ;A_H [in/out] 88-129 bits of input data ;TMP [clobbered] Temporary GP register ;N_BLOCKS [in] Number of remaining input blocks */ #define SHUFFLE_DATA_SMASK_1 0x39 #define SHUFFLE_DATA_KMASK_1 0xffff #define SHUFFLE_DATA_SMASK_2 0x4E #define SHUFFLE_DATA_KMASK_2 0xffff #define SHUFFLE_DATA_SMASK_3 0x93 #define SHUFFLE_DATA_KMASK_3 0xffff #define SHUFFLE_DATA_KMASK_4 0xffff #define SHUFFLE_DATA_SMASK_5 0x39 #define SHUFFLE_DATA_KMASK_5 0xfff0 #define SHUFFLE_DATA_SMASK_6 0x4E #define SHUFFLE_DATA_KMASK_6 0xff00 #define SHUFFLE_DATA_SMASK_7 0x93 #define SHUFFLE_DATA_KMASK_7 0xf000 #define SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, N_BLOCKS) \ mov TMP, SHUFFLE_DATA_KMASK_##N_BLOCKS; \ kmovq k1, TMP; \ vpshufd A_L{k1}, A_L, 0x4E; \ vpshufd A_M{k1}, A_M, 0x4E; \ vpshufd A_H{k1}, A_H, 0x4E; \ vshufi64x2 A_L, A_L, A_L, SHUFFLE_DATA_SMASK_##N_BLOCKS; \ vshufi64x2 A_M, A_M, A_M, SHUFFLE_DATA_SMASK_##N_BLOCKS; \ vshufi64x2 A_H, A_H, A_H, SHUFFLE_DATA_SMASK_##N_BLOCKS #define SHUFFLE_DATA_BLOCKS_1(A_L, A_M, A_H, TMP) \ SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, 1) #define SHUFFLE_DATA_BLOCKS_2(A_L, A_M, A_H, TMP) \ SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, 2) #define SHUFFLE_DATA_BLOCKS_3(A_L, A_M, A_H, TMP) \ SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, 3) #define SHUFFLE_DATA_BLOCKS_4(A_L, A_M, A_H, TMP) \ mov TMP, SHUFFLE_DATA_KMASK_4; \ kmovq k1, TMP; \ vpshufd A_L{k1}, A_L, 0x4E; \ vpshufd A_M{k1}, A_M, 0x4E; \ vpshufd A_H{k1}, A_H, 0x4E; #define SHUFFLE_DATA_BLOCKS_5(A_L, A_M, A_H, TMP) \ SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, 5) #define SHUFFLE_DATA_BLOCKS_6(A_L, A_M, A_H, TMP) \ SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, 6) #define SHUFFLE_DATA_BLOCKS_7(A_L, A_M, A_H, TMP) \ SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, 7) /* ;; ============================================================================= ;; ============================================================================= ;; Computes hash for message length being multiple of block size ;; ============================================================================= ;MSG [in/out] GPR pointer to input message (updated) ;LEN [in/out] GPR in: length in bytes / out: length mod 16 ;A0 [in/out] accumulator bits 63..0 ;A1 [in/out] accumulator bits 127..64 ;A2 [in/out] accumulator bits 195..128 ;R0 [in] R constant bits 63..0 ;R1 [in] R constant bits 127..64 ;T0 [clobbered] GPR register ;T1 [clobbered] GPR register ;T2 [clobbered] GPR register ;T3 [clobbered] GPR register ;GP_RAX [clobbered] RAX register ;GP_RDX [clobbered] RDX register */ #define POLY1305_BLOCKS(MSG, LEN, A0, A1, A2, R0, R1, T0, T1, T2, T3, \ GP_RAX, GP_RDX) \ /* ; Minimum of 256 bytes to run vectorized code */ \ cmp LEN, POLY1305_BLOCK_SIZE*16; \ jb .L_final_loop; \ \ /* ; Spread accumulator into 44-bit limbs in quadwords */ \ mov T0, A0; \ and T0, [.Lmask_44 ADD_RIP]; /* ;; First limb (A[43:0]) */ \ vmovq xmm5, T0; \ \ mov T0, A1; \ shrd A0, T0, 44; \ and A0, [.Lmask_44 ADD_RIP]; /* ;; Second limb (A[77:52]) */ \ vmovq xmm6, A0; \ \ shrd A1, A2, 24; \ and A1, [.Lmask_42 ADD_RIP]; /* ;; Third limb (A[129:88]) */ \ vmovq xmm7, A1; \ \ /* ; Load first block of data (128 bytes) */ \ vmovdqu64 zmm0, [MSG]; \ vmovdqu64 zmm1, [MSG + 64]; \ \ /* 
; Interleave the data to form 44-bit limbs */ \ /* ; */ \ /* ; zmm13 to have bits 0-43 of all 8 blocks in 8 qwords */ \ /* ; zmm14 to have bits 87-44 of all 8 blocks in 8 qwords */ \ /* ; zmm15 to have bits 127-88 of all 8 blocks in 8 qwords */ \ vpunpckhqdq zmm15, zmm0, zmm1; \ vpunpcklqdq zmm13, zmm0, zmm1; \ \ vpsrlq zmm14, zmm13, 44; \ vpsllq zmm18, zmm15, 20; \ vpternlogq zmm14, zmm18, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \ \ vpandq zmm13, zmm13, [.Lmask_44 ADD_RIP]; \ vpsrlq zmm15, zmm15, 24; \ \ /* ; Add 2^128 to all 8 final qwords of the message */ \ vporq zmm15, zmm15, [.Lhigh_bit ADD_RIP]; \ \ vpaddq zmm13, zmm13, zmm5; \ vpaddq zmm14, zmm14, zmm6; \ vpaddq zmm15, zmm15, zmm7; \ \ /* ; Load next blocks of data (128 bytes) */ \ vmovdqu64 zmm0, [MSG + 64*2]; \ vmovdqu64 zmm1, [MSG + 64*3]; \ \ /* ; Interleave the data to form 44-bit limbs */ \ /* ; */ \ /* ; zmm13 to have bits 0-43 of all 8 blocks in 8 qwords */ \ /* ; zmm14 to have bits 87-44 of all 8 blocks in 8 qwords */ \ /* ; zmm15 to have bits 127-88 of all 8 blocks in 8 qwords */ \ vpunpckhqdq zmm18, zmm0, zmm1; \ vpunpcklqdq zmm16, zmm0, zmm1; \ \ vpsrlq zmm17, zmm16, 44; \ vpsllq zmm19, zmm18, 20; \ vpternlogq zmm17, zmm19, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \ \ vpandq zmm16, zmm16, [.Lmask_44 ADD_RIP]; \ vpsrlq zmm18, zmm18, 24; \ \ /* ; Add 2^128 to all 8 final qwords of the message */ \ vporq zmm18, zmm18, [.Lhigh_bit ADD_RIP]; \ \ /* ; Use memory in stack to save powers of R, before loading them into ZMM registers */ \ /* ; The first 16*8 bytes will contain the 16 bytes of the 8 powers of R */ \ /* ; The last 64 bytes will contain the last 2 bits of powers of R, spread in 8 qwords, */ \ /* ; to be OR'd with the highest qwords (in zmm26) */ \ vmovq xmm3, R0; \ vpinsrq xmm3, xmm3, R1, 1; \ vinserti32x4 zmm1, zmm1, xmm3, 3; \ \ vpxorq zmm0, zmm0, zmm0; \ vpxorq zmm2, zmm2, zmm2; \ \ /* ; Calculate R^2 */ \ mov T0, R1; \ shr T0, 2; \ add T0, R1; /* ;; T0 = R1 + (R1 >> 2) */ \ \ mov A0, R0; \ mov A1, R1; \ \ POLY1305_MUL_REDUCE(A0, A1, A2, R0, R1, T0, T1, T2, T3, GP_RAX, GP_RDX, A2_ZERO); \ \ vmovq xmm3, A0; \ vpinsrq xmm3, xmm3, A1, 1; \ vinserti32x4 zmm1, zmm1, xmm3, 2; \ \ vmovq xmm4, A2; \ vinserti32x4 zmm2, zmm2, xmm4, 2; \ \ /* ; Calculate R^3 */ \ POLY1305_MUL_REDUCE(A0, A1, A2, R0, R1, T0, T1, T2, T3, GP_RAX, GP_RDX, A2_NOT_ZERO); \ \ vmovq xmm3, A0; \ vpinsrq xmm3, xmm3, A1, 1; \ vinserti32x4 zmm1, zmm1, xmm3, 1; \ \ vmovq xmm4, A2; \ vinserti32x4 zmm2, zmm2, xmm4, 1; \ \ /* ; Calculate R^4 */ \ POLY1305_MUL_REDUCE(A0, A1, A2, R0, R1, T0, T1, T2, T3, GP_RAX, GP_RDX, A2_NOT_ZERO); \ \ vmovq xmm3, A0; \ vpinsrq xmm3, xmm3, A1, 1; \ vinserti32x4 zmm1, zmm1, xmm3, 0; \ \ vmovq xmm4, A2; \ vinserti32x4 zmm2, zmm2, xmm4, 0; \ \ /* ; Move 2 MSbits to top 24 bits, to be OR'ed later */ \ vpsllq zmm2, zmm2, 40; \ \ vpunpckhqdq zmm21, zmm1, zmm0; \ vpunpcklqdq zmm19, zmm1, zmm0; \ \ vpsrlq zmm20, zmm19, 44; \ vpsllq zmm4, zmm21, 20; \ vpternlogq zmm20, zmm4, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \ \ vpandq zmm19, zmm19, [.Lmask_44 ADD_RIP]; \ vpsrlq zmm21, zmm21, 24; \ \ /* ; zmm2 contains the 2 highest bits of the powers of R */ \ vporq zmm21, zmm21, zmm2; \ \ /* ; Broadcast 44-bit limbs of R^4 */ \ mov T0, A0; \ and T0, [.Lmask_44 ADD_RIP]; /* ;; First limb (R^4[43:0]) */ \ vpbroadcastq zmm22, T0; \ \ mov T0, A1; \ shrd A0, T0, 44; \ and A0, [.Lmask_44 ADD_RIP]; /* ;; Second limb (R^4[87:44]) */ \ vpbroadcastq zmm23, A0; \ \ shrd A1, A2, 24; \ and A1, [.Lmask_42 ADD_RIP]; /* ;; Third 
limb (R^4[129:88]) */ \ vpbroadcastq zmm24, A1; \ \ /* ; Generate 4*5*R^4 */ \ vpsllq zmm25, zmm23, 2; \ vpsllq zmm26, zmm24, 2; \ \ /* ; 5*R^4 */ \ vpaddq zmm25, zmm25, zmm23; \ vpaddq zmm26, zmm26, zmm24; \ \ /* ; 4*5*R^4 */ \ vpsllq zmm25, zmm25, 2; \ vpsllq zmm26, zmm26, 2; \ \ vpslldq zmm29, zmm19, 8; \ vpslldq zmm30, zmm20, 8; \ vpslldq zmm31, zmm21, 8; \ \ /* ; Calculate R^8-R^5 */ \ POLY1305_MUL_REDUCE_VEC(zmm19, zmm20, zmm21, \ zmm22, zmm23, zmm24, \ zmm25, zmm26, \ zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ zmm11); \ \ /* ; Interleave powers of R: R^8 R^4 R^7 R^3 R^6 R^2 R^5 R */ \ vporq zmm19, zmm19, zmm29; \ vporq zmm20, zmm20, zmm30; \ vporq zmm21, zmm21, zmm31; \ \ /* ; Broadcast R^8 */ \ vpbroadcastq zmm22, xmm19; \ vpbroadcastq zmm23, xmm20; \ vpbroadcastq zmm24, xmm21; \ \ /* ; Generate 4*5*R^8 */ \ vpsllq zmm25, zmm23, 2; \ vpsllq zmm26, zmm24, 2; \ \ /* ; 5*R^8 */ \ vpaddq zmm25, zmm25, zmm23; \ vpaddq zmm26, zmm26, zmm24; \ \ /* ; 4*5*R^8 */ \ vpsllq zmm25, zmm25, 2; \ vpsllq zmm26, zmm26, 2; \ \ cmp LEN, POLY1305_BLOCK_SIZE*32; \ jb .L_len_256_511; \ \ /* ; Store R^8-R for later use */ \ vmovdqa64 [rsp + STACK_r_save], zmm19; \ vmovdqa64 [rsp + STACK_r_save + 64], zmm20; \ vmovdqa64 [rsp + STACK_r_save + 64*2], zmm21; \ \ /* ; Calculate R^16-R^9 */ \ POLY1305_MUL_REDUCE_VEC(zmm19, zmm20, zmm21, \ zmm22, zmm23, zmm24, \ zmm25, zmm26, \ zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ zmm11); \ \ /* ; Store R^16-R^9 for later use */ \ vmovdqa64 [rsp + STACK_r_save + 64*3], zmm19; \ vmovdqa64 [rsp + STACK_r_save + 64*4], zmm20; \ vmovdqa64 [rsp + STACK_r_save + 64*5], zmm21; \ \ /* ; Broadcast R^16 */ \ vpbroadcastq zmm22, xmm19; \ vpbroadcastq zmm23, xmm20; \ vpbroadcastq zmm24, xmm21; \ \ /* ; Generate 4*5*R^16 */ \ vpsllq zmm25, zmm23, 2; \ vpsllq zmm26, zmm24, 2; \ \ /* ; 5*R^16 */ \ vpaddq zmm25, zmm25, zmm23; \ vpaddq zmm26, zmm26, zmm24; \ \ /* ; 4*5*R^16 */ \ vpsllq zmm25, zmm25, 2; \ vpsllq zmm26, zmm26, 2; \ \ mov T0, LEN; \ and T0, 0xffffffffffffff00; /* ; multiple of 256 bytes */ \ \ .L_poly1305_blocks_loop: \ cmp T0, POLY1305_BLOCK_SIZE*16; \ jbe .L_poly1305_blocks_loop_end; \ \ /* ; zmm13-zmm18 contain the 16 blocks of message plus the previous accumulator */ \ /* ; zmm22-24 contain the 5x44-bit limbs of the powers of R */ \ /* ; zmm25-26 contain the 5x44-bit limbs of the powers of R' (5*4*R) */ \ POLY1305_MSG_MUL_REDUCE_VEC16(zmm13, zmm14, zmm15, zmm16, zmm17, zmm18, \ zmm22, zmm23, zmm24, zmm25, zmm26, \ zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ zmm19, zmm20, zmm21, zmm27, zmm28, zmm29, \ zmm30, zmm31, zmm11, zmm0, zmm1, \ zmm2, zmm3, zmm4, zmm12, MSG, T0); \ \ jmp .L_poly1305_blocks_loop; \ \ .L_poly1305_blocks_loop_end: \ \ /* ;; Need to multiply by r^16, r^15, r^14... 
r */ \ \ /* ; First multiply by r^16-r^9 */ \ \ /* ; Read R^16-R^9 */ \ vmovdqa64 zmm19, [rsp + STACK_r_save + 64*3]; \ vmovdqa64 zmm20, [rsp + STACK_r_save + 64*4]; \ vmovdqa64 zmm21, [rsp + STACK_r_save + 64*5]; \ /* ; Read R^8-R */ \ vmovdqa64 zmm22, [rsp + STACK_r_save]; \ vmovdqa64 zmm23, [rsp + STACK_r_save + 64]; \ vmovdqa64 zmm24, [rsp + STACK_r_save + 64*2]; \ \ /* ; zmm27 to have bits 87-44 of all 9-16th powers of R' in 8 qwords */ \ /* ; zmm28 to have bits 129-88 of all 9-16th powers of R' in 8 qwords */ \ vpsllq zmm0, zmm20, 2; \ vpaddq zmm27, zmm20, zmm0; /* ; R1' (R1*5) */ \ vpsllq zmm1, zmm21, 2; \ vpaddq zmm28, zmm21, zmm1; /* ; R2' (R2*5) */ \ \ /* ; 4*5*R */ \ vpsllq zmm27, zmm27, 2; \ vpsllq zmm28, zmm28, 2; \ \ /* ; Then multiply by r^8-r */ \ \ /* ; zmm25 to have bits 87-44 of all 1-8th powers of R' in 8 qwords */ \ /* ; zmm26 to have bits 129-88 of all 1-8th powers of R' in 8 qwords */ \ vpsllq zmm2, zmm23, 2; \ vpaddq zmm25, zmm23, zmm2; /* ; R1' (R1*5) */ \ vpsllq zmm3, zmm24, 2; \ vpaddq zmm26, zmm24, zmm3; /* ; R2' (R2*5) */ \ \ /* ; 4*5*R */ \ vpsllq zmm25, zmm25, 2; \ vpsllq zmm26, zmm26, 2; \ \ POLY1305_MUL_REDUCE_VEC16(zmm13, zmm14, zmm15, zmm16, zmm17, zmm18, \ zmm19, zmm20, zmm21, zmm27, zmm28, \ zmm22, zmm23, zmm24, zmm25, zmm26, \ zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, \ zmm7, zmm8, zmm9, zmm10, zmm11, zmm12, zmm29); \ \ /* ;; Add all blocks (horizontally) */ \ vpaddq zmm13, zmm13, zmm16; \ vpaddq zmm14, zmm14, zmm17; \ vpaddq zmm15, zmm15, zmm18; \ \ vextracti64x4 ymm0, zmm13, 1; \ vextracti64x4 ymm1, zmm14, 1; \ vextracti64x4 ymm2, zmm15, 1; \ \ vpaddq ymm13, ymm13, ymm0; \ vpaddq ymm14, ymm14, ymm1; \ vpaddq ymm15, ymm15, ymm2; \ \ vextracti32x4 xmm10, ymm13, 1; \ vextracti32x4 xmm11, ymm14, 1; \ vextracti32x4 xmm12, ymm15, 1; \ \ vpaddq xmm13, xmm13, xmm10; \ vpaddq xmm14, xmm14, xmm11; \ vpaddq xmm15, xmm15, xmm12; \ \ vpsrldq xmm10, xmm13, 8; \ vpsrldq xmm11, xmm14, 8; \ vpsrldq xmm12, xmm15, 8; \ \ /* ; Finish folding and clear second qword */ \ mov T0, 0xfd; \ kmovq k1, T0; \ vpaddq xmm13{k1}{z}, xmm13, xmm10; \ vpaddq xmm14{k1}{z}, xmm14, xmm11; \ vpaddq xmm15{k1}{z}, xmm15, xmm12; \ \ add MSG, POLY1305_BLOCK_SIZE*16; \ \ and LEN, (POLY1305_BLOCK_SIZE*16 - 1); /* ; Get remaining lengths (LEN < 256 bytes) */ \ \ .L_less_than_256: \ \ cmp LEN, POLY1305_BLOCK_SIZE*8; \ jb .L_less_than_128; \ \ /* ; Read next 128 bytes */ \ /* ; Load first block of data (128 bytes) */ \ vmovdqu64 zmm0, [MSG]; \ vmovdqu64 zmm1, [MSG + 64]; \ \ /* ; Interleave the data to form 44-bit limbs */ \ /* ; */ \ /* ; zmm13 to have bits 0-43 of all 8 blocks in 8 qwords */ \ /* ; zmm14 to have bits 87-44 of all 8 blocks in 8 qwords */ \ /* ; zmm15 to have bits 127-88 of all 8 blocks in 8 qwords */ \ vpunpckhqdq zmm5, zmm0, zmm1; \ vpunpcklqdq zmm3, zmm0, zmm1; \ \ vpsrlq zmm4, zmm3, 44; \ vpsllq zmm8, zmm5, 20; \ vpternlogq zmm4, zmm8, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \ \ vpandq zmm3, zmm3, [.Lmask_44 ADD_RIP]; \ vpsrlq zmm5, zmm5, 24; \ \ /* ; Add 2^128 to all 8 final qwords of the message */ \ vporq zmm5, zmm5, [.Lhigh_bit ADD_RIP]; \ \ vpaddq zmm13, zmm13, zmm3; \ vpaddq zmm14, zmm14, zmm4; \ vpaddq zmm15, zmm15, zmm5; \ \ add MSG, POLY1305_BLOCK_SIZE*8; \ sub LEN, POLY1305_BLOCK_SIZE*8; \ \ POLY1305_MUL_REDUCE_VEC(zmm13, zmm14, zmm15, \ zmm22, zmm23, zmm24, \ zmm25, zmm26, \ zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ zmm11); \ \ /* ;; Add all blocks (horizontally) */ \ vextracti64x4 ymm0, zmm13, 1; \ vextracti64x4 ymm1, zmm14, 1; \ vextracti64x4 ymm2, zmm15, 
1; \ \ vpaddq ymm13, ymm13, ymm0; \ vpaddq ymm14, ymm14, ymm1; \ vpaddq ymm15, ymm15, ymm2; \ \ vextracti32x4 xmm10, ymm13, 1; \ vextracti32x4 xmm11, ymm14, 1; \ vextracti32x4 xmm12, ymm15, 1; \ \ vpaddq xmm13, xmm13, xmm10; \ vpaddq xmm14, xmm14, xmm11; \ vpaddq xmm15, xmm15, xmm12; \ \ vpsrldq xmm10, xmm13, 8; \ vpsrldq xmm11, xmm14, 8; \ vpsrldq xmm12, xmm15, 8; \ \ /* ; Finish folding and clear second qword */ \ mov T0, 0xfd; \ kmovq k1, T0; \ vpaddq xmm13{k1}{z}, xmm13, xmm10; \ vpaddq xmm14{k1}{z}, xmm14, xmm11; \ vpaddq xmm15{k1}{z}, xmm15, xmm12; \ \ .L_less_than_128: \ cmp LEN, 32; /* ; If remaining bytes is <= 32, perform last blocks in scalar */ \ jbe .L_simd_to_gp; \ \ mov T0, LEN; \ and T0, 0x3f; \ lea T1, [.Lbyte64_len_to_mask_table ADD_RIP]; \ mov T1, [T1 + 8*T0]; \ \ /* ; Load default byte masks */ \ mov T2, 0xffffffffffffffff; \ xor T3, T3; \ \ cmp LEN, 64; \ cmovb T2, T1; /* ; Load mask for first 64 bytes */ \ cmovg T3, T1; /* ; Load mask for second 64 bytes */ \ \ kmovq k1, T2; \ kmovq k2, T3; \ vmovdqu8 zmm0{k1}{z}, [MSG]; \ vmovdqu8 zmm1{k2}{z}, [MSG + 64]; \ \ /* ; Pad last block message, if partial */ \ mov T0, LEN; \ and T0, 0x70; /* ; Multiple of 16 bytes */ \ /* ; Load last block of data (up to 112 bytes) */ \ shr T0, 3; /* ; Get number of full qwords */ \ \ /* ; Interleave the data to form 44-bit limbs */ \ /* ; */ \ /* ; zmm13 to have bits 0-43 of all 8 blocks in 8 qwords */ \ /* ; zmm14 to have bits 87-44 of all 8 blocks in 8 qwords */ \ /* ; zmm15 to have bits 127-88 of all 8 blocks in 8 qwords */ \ vpunpckhqdq zmm4, zmm0, zmm1; \ vpunpcklqdq zmm2, zmm0, zmm1; \ \ vpsrlq zmm3, zmm2, 44; \ vpsllq zmm28, zmm4, 20; \ vpternlogq zmm3, zmm28, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \ \ vpandq zmm2, zmm2, [.Lmask_44 ADD_RIP]; \ vpsrlq zmm4, zmm4, 24; \ \ lea T1, [.Lqword_high_bit_mask ADD_RIP]; \ kmovb k1, [T1 + T0]; \ /* ; Add 2^128 to final qwords of the message (all full blocks and partial block, */ \ /* ; if "pad_to_16" is selected) */ \ vporq zmm4{k1}, zmm4, [.Lhigh_bit ADD_RIP]; \ \ vpaddq zmm13, zmm13, zmm2; \ vpaddq zmm14, zmm14, zmm3; \ vpaddq zmm15, zmm15, zmm4; \ \ mov T0, LEN; \ add T0, 15; \ shr T0, 4; /* ; Get number of 16-byte blocks (including partial blocks) */ \ xor LEN, LEN; /* ; All length will be consumed */ \ \ /* ; No need to shuffle data blocks (data is in the right order) */ \ cmp T0, 8; \ je .L_end_shuffle; \ \ cmp T0, 4; \ je .L_shuffle_blocks_4; \ jb .L_shuffle_blocks_3; \ \ /* ; Number of 16-byte blocks > 4 */ \ cmp T0, 6; \ je .L_shuffle_blocks_6; \ ja .L_shuffle_blocks_7; \ jmp .L_shuffle_blocks_5; \ \ .L_shuffle_blocks_3: \ SHUFFLE_DATA_BLOCKS_3(zmm13, zmm14, zmm15, T1); \ jmp .L_end_shuffle; \ .L_shuffle_blocks_4: \ SHUFFLE_DATA_BLOCKS_4(zmm13, zmm14, zmm15, T1); \ jmp .L_end_shuffle; \ .L_shuffle_blocks_5: \ SHUFFLE_DATA_BLOCKS_5(zmm13, zmm14, zmm15, T1); \ jmp .L_end_shuffle; \ .L_shuffle_blocks_6: \ SHUFFLE_DATA_BLOCKS_6(zmm13, zmm14, zmm15, T1); \ jmp .L_end_shuffle; \ .L_shuffle_blocks_7: \ SHUFFLE_DATA_BLOCKS_7(zmm13, zmm14, zmm15, T1); \ \ .L_end_shuffle: \ \ /* ; zmm13-zmm15 contain the 8 blocks of message plus the previous accumulator */ \ /* ; zmm22-24 contain the 3x44-bit limbs of the powers of R */ \ /* ; zmm25-26 contain the 3x44-bit limbs of the powers of R' (5*4*R) */ \ POLY1305_MUL_REDUCE_VEC(zmm13, zmm14, zmm15, \ zmm22, zmm23, zmm24, \ zmm25, zmm26, \ zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ zmm11); \ \ /* ;; Add all blocks (horizontally) */ \ vextracti64x4 ymm0, zmm13, 1; \ vextracti64x4 ymm1, zmm14, 1; \ 
vextracti64x4 ymm2, zmm15, 1; \ \ vpaddq ymm13, ymm13, ymm0; \ vpaddq ymm14, ymm14, ymm1; \ vpaddq ymm15, ymm15, ymm2; \ \ vextracti32x4 xmm10, ymm13, 1; \ vextracti32x4 xmm11, ymm14, 1; \ vextracti32x4 xmm12, ymm15, 1; \ \ vpaddq xmm13, xmm13, xmm10; \ vpaddq xmm14, xmm14, xmm11; \ vpaddq xmm15, xmm15, xmm12; \ \ vpsrldq xmm10, xmm13, 8; \ vpsrldq xmm11, xmm14, 8; \ vpsrldq xmm12, xmm15, 8; \ \ vpaddq xmm13, xmm13, xmm10; \ vpaddq xmm14, xmm14, xmm11; \ vpaddq xmm15, xmm15, xmm12; \ \ .L_simd_to_gp: \ /* ; Carry propagation */ \ vpsrlq xmm0, xmm13, 44; \ vpandq xmm13, xmm13, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \ vpaddq xmm14, xmm14, xmm0; \ vpsrlq xmm0, xmm14, 44; \ vpandq xmm14, xmm14, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \ vpaddq xmm15, xmm15, xmm0; \ vpsrlq xmm0, xmm15, 42; \ vpandq xmm15, xmm15, [.Lmask_42 ADD_RIP]; /* ; Clear top 22 bits */ \ vpsllq xmm1, xmm0, 2; \ vpaddq xmm0, xmm0, xmm1; \ vpaddq xmm13, xmm13, xmm0; \ \ /* ; Put together A */ \ vmovq A0, xmm13; \ \ vmovq T0, xmm14; \ mov T1, T0; \ shl T1, 44; \ or A0, T1; \ \ shr T0, 20; \ vmovq A2, xmm15; \ mov A1, A2; \ shl A1, 24; \ or A1, T0; \ shr A2, 40; \ \ /* ; Clear powers of R */ \ vpxorq zmm0, zmm0, zmm0; \ vmovdqa64 [rsp + STACK_r_save], zmm0; \ vmovdqa64 [rsp + STACK_r_save + 64], zmm0; \ vmovdqa64 [rsp + STACK_r_save + 64*2], zmm0; \ vmovdqa64 [rsp + STACK_r_save + 64*3], zmm0; \ vmovdqa64 [rsp + STACK_r_save + 64*4], zmm0; \ vmovdqa64 [rsp + STACK_r_save + 64*5], zmm0; \ \ vzeroall; \ clear_zmm(xmm16); clear_zmm(xmm20); clear_zmm(xmm24); clear_zmm(xmm28); \ clear_zmm(xmm17); clear_zmm(xmm21); clear_zmm(xmm25); clear_zmm(xmm29); \ clear_zmm(xmm18); clear_zmm(xmm22); clear_zmm(xmm26); clear_zmm(xmm30); \ clear_zmm(xmm19); clear_zmm(xmm23); clear_zmm(xmm27); clear_zmm(xmm31); \ \ .L_final_loop: \ cmp LEN, POLY1305_BLOCK_SIZE; \ jb .L_poly1305_blocks_exit; \ \ /* ;; A += MSG[i] */ \ add A0, [MSG + 0]; \ adc A1, [MSG + 8]; \ adc A2, 1; /* ;; no padding bit */ \ \ mov T0, R1; \ shr T0, 2; \ add T0, R1; /* ;; T0 = R1 + (R1 >> 2) */ \ \ POLY1305_MUL_REDUCE(A0, A1, A2, R0, R1, \ T0, T1, T2, T3, GP_RAX, GP_RDX, A2_NOT_ZERO); \ \ add MSG, POLY1305_BLOCK_SIZE; \ sub LEN, POLY1305_BLOCK_SIZE; \ \ jmp .L_final_loop; \ \ .L_len_256_511: \ \ /* ; zmm13-zmm15 contain the 8 blocks of message plus the previous accumulator */ \ /* ; zmm22-24 contain the 3x44-bit limbs of the powers of R */ \ /* ; zmm25-26 contain the 3x44-bit limbs of the powers of R' (5*4*R) */ \ POLY1305_MUL_REDUCE_VEC(zmm13, zmm14, zmm15, \ zmm22, zmm23, zmm24, \ zmm25, zmm26, \ zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ zmm11); \ \ /* ; Then multiply by r^8-r */ \ \ /* ; zmm19-zmm21 contains R^8-R, need to move it to zmm22-24, */ \ /* ; as it might be used in other part of the code */ \ vmovdqa64 zmm22, zmm19; \ vmovdqa64 zmm23, zmm20; \ vmovdqa64 zmm24, zmm21; \ \ /* ; zmm25 to have bits 87-44 of all 8 powers of R' in 8 qwords */ \ /* ; zmm26 to have bits 129-88 of all 8 powers of R' in 8 qwords */ \ vpsllq zmm0, zmm23, 2; \ vpaddq zmm25, zmm23, zmm0; /* ; R1' (R1*5) */ \ vpsllq zmm1, zmm24, 2; \ vpaddq zmm26, zmm24, zmm1; /* ; R2' (R2*5) */ \ \ /* ; 4*5*R^8 */ \ vpsllq zmm25, zmm25, 2; \ vpsllq zmm26, zmm26, 2; \ \ vpaddq zmm13, zmm13, zmm16; \ vpaddq zmm14, zmm14, zmm17; \ vpaddq zmm15, zmm15, zmm18; \ \ /* ; zmm13-zmm15 contain the 8 blocks of message plus the previous accumulator */ \ /* ; zmm22-24 contain the 3x44-bit limbs of the powers of R */ \ /* ; zmm25-26 contain the 3x44-bit limbs of the powers of R' (5*4*R) */ \ 
POLY1305_MUL_REDUCE_VEC(zmm13, zmm14, zmm15, \ zmm22, zmm23, zmm24, \ zmm25, zmm26, \ zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ zmm11); \ \ /* ;; Add all blocks (horizontally) */ \ vextracti64x4 ymm0, zmm13, 1; \ vextracti64x4 ymm1, zmm14, 1; \ vextracti64x4 ymm2, zmm15, 1; \ \ vpaddq ymm13, ymm13, ymm0; \ vpaddq ymm14, ymm14, ymm1; \ vpaddq ymm15, ymm15, ymm2; \ \ vextracti32x4 xmm10, ymm13, 1; \ vextracti32x4 xmm11, ymm14, 1; \ vextracti32x4 xmm12, ymm15, 1; \ \ vpaddq xmm13, xmm13, xmm10; \ vpaddq xmm14, xmm14, xmm11; \ vpaddq xmm15, xmm15, xmm12; \ \ vpsrldq xmm10, xmm13, 8; \ vpsrldq xmm11, xmm14, 8; \ vpsrldq xmm12, xmm15, 8; \ \ /* ; Finish folding and clear second qword */ \ mov T0, 0xfd; \ kmovq k1, T0; \ vpaddq xmm13{k1}{z}, xmm13, xmm10; \ vpaddq xmm14{k1}{z}, xmm14, xmm11; \ vpaddq xmm15{k1}{z}, xmm15, xmm12; \ \ add MSG, POLY1305_BLOCK_SIZE*16; \ sub LEN, POLY1305_BLOCK_SIZE*16; \ \ jmp .L_less_than_256; \ .L_poly1305_blocks_exit: \ /* ;; ============================================================================= ;; ============================================================================= ;; Creates stack frame and saves registers ;; ============================================================================= */ #define FUNC_ENTRY() \ mov rax, rsp; \ CFI_DEF_CFA_REGISTER(rax); \ sub rsp, STACK_SIZE; \ and rsp, -64; \ \ mov [rsp + STACK_gpr_save + 8*0], rbx; \ mov [rsp + STACK_gpr_save + 8*1], rbp; \ mov [rsp + STACK_gpr_save + 8*2], r12; \ mov [rsp + STACK_gpr_save + 8*3], r13; \ mov [rsp + STACK_gpr_save + 8*4], r14; \ mov [rsp + STACK_gpr_save + 8*5], r15; \ mov [rsp + STACK_rsp_save], rax; \ CFI_CFA_ON_STACK(STACK_rsp_save, 0) /* ;; ============================================================================= ;; ============================================================================= ;; Restores registers and removes the stack frame ;; ============================================================================= */ #define FUNC_EXIT() \ mov rbx, [rsp + STACK_gpr_save + 8*0]; \ mov rbp, [rsp + STACK_gpr_save + 8*1]; \ mov r12, [rsp + STACK_gpr_save + 8*2]; \ mov r13, [rsp + STACK_gpr_save + 8*3]; \ mov r14, [rsp + STACK_gpr_save + 8*4]; \ mov r15, [rsp + STACK_gpr_save + 8*5]; \ mov rsp, [rsp + STACK_rsp_save]; \ CFI_DEF_CFA_REGISTER(rsp) /* ;; ============================================================================= ;; ============================================================================= ;; void poly1305_aead_update_fma_avx512(const void *msg, const uint64_t msg_len, ;; void *hash, const void *key) ;; arg1 - Input message ;; arg2 - Message length ;; arg3 - Input/output hash ;; arg4 - Poly1305 key */ .align 32 .globl _gcry_poly1305_amd64_avx512_blocks ELF(.type _gcry_poly1305_amd64_avx512_blocks,@function;) _gcry_poly1305_amd64_avx512_blocks: CFI_STARTPROC() vpxord xmm16, xmm16, xmm16; vpopcntb zmm16, zmm16; /* spec stop for old AVX512 CPUs */ FUNC_ENTRY() #define _a0 gp3 #define _a0 gp3 #define _a1 gp4 #define _a2 gp5 #define _r0 gp6 #define _r1 gp7 #define _len arg2 #define _arg3 arg4 /* ; use rcx, arg3 = rdx */ /* ;; load R */ mov _r0, [arg4 + 0 * 8] mov _r1, [arg4 + 1 * 8] /* ;; load accumulator / current hash value */ /* ;; note: arg4 can't be used beyond this point */ mov _arg3, arg3 /* ; note: _arg3 = arg4 (linux) */ mov _a0, [_arg3 + 0 * 8] mov _a1, [_arg3 + 1 * 8] mov DWORD(_a2), [_arg3 + 2 * 8] /* ; note: _a2 = arg4 (win) */ POLY1305_BLOCKS(arg1, _len, _a0, _a1, _a2, _r0, _r1, gp10, gp11, gp8, gp9, rax, rdx) /* ;; save accumulator back */ mov 
[_arg3 + 0 * 8], _a0 mov [_arg3 + 1 * 8], _a1 mov [_arg3 + 2 * 8], DWORD(_a2) FUNC_EXIT() xor eax, eax - kmovw k1, eax - kmovw k2, eax + kxorw k1, k1, k1 + kxorw k2, k2, k2 ret_spec_stop CFI_ENDPROC() ELF(.size _gcry_poly1305_amd64_avx512_blocks, .-_gcry_poly1305_amd64_avx512_blocks;) #endif #endif diff --git a/cipher/sha512-avx512-amd64.S b/cipher/sha512-avx512-amd64.S index c0fdbc33..0e3f44ab 100644 --- a/cipher/sha512-avx512-amd64.S +++ b/cipher/sha512-avx512-amd64.S @@ -1,461 +1,461 @@ /* sha512-avx512-amd64.c - amd64/AVX512 implementation of SHA-512 transform * Copyright (C) 2022 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ /* * Based on implementation from file "sha512-avx2-bmi2-amd64.S": ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright (c) 2012, Intel Corporation ; ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are ; met: ; ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in the ; documentation and/or other materials provided with the ; distribution. ; ; * Neither the name of the Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived from ; this software without specific prior written permission. ; ; ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR ; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; This code schedules 1 blocks at a time, with 4 lanes per block ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AVX512) && \ defined(USE_SHA512) #include "asm-common-amd64.h" .intel_syntax noprefix .text /* Virtual Registers */ #define Y_0 ymm0 #define Y_1 ymm1 #define Y_2 ymm2 #define Y_3 ymm3 #define YTMP0 ymm4 #define YTMP1 ymm5 #define YTMP2 ymm6 #define YTMP3 ymm7 #define YTMP4 ymm8 #define XFER YTMP0 #define BYTE_FLIP_MASK ymm9 #define PERM_VPALIGNR_8 ymm10 #define MASK_DC_00 k1 #define INP rdi /* 1st arg */ #define CTX rsi /* 2nd arg */ #define NUM_BLKS rdx /* 3rd arg */ #define SRND r8d #define RSP_SAVE r9 #define TBL rcx #define a xmm11 #define b xmm12 #define c xmm13 #define d xmm14 #define e xmm15 #define f xmm16 #define g xmm17 #define h xmm18 #define y0 xmm19 #define y1 xmm20 #define y2 xmm21 #define y3 xmm22 /* Local variables (stack frame) */ #define frame_XFER 0 #define frame_XFER_size (4*4*8) #define frame_size (frame_XFER + frame_XFER_size) #define clear_reg(x) vpxorq x,x,x /* addm [mem], reg */ /* Add reg to mem using reg-mem add and store */ #define addm(p1, p2) \ vmovq y0, p1; \ vpaddq p2, p2, y0; \ vmovq p1, p2; /* COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask */ /* Load ymm with mem and byte swap each dword */ #define COPY_YMM_AND_BSWAP(p1, p2, p3) \ vmovdqu p1, p2; \ vpshufb p1, p1, p3 /* %macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL */ /* YDST = {YSRC1, YSRC2} >> RVAL*8 */ #define MY_VPALIGNR(YDST_SRC1, YSRC2, RVAL) \ vpermt2q YDST_SRC1, PERM_VPALIGNR_##RVAL, YSRC2; #define ONE_ROUND_PART1(XFERIN, a, b, c, d, e, f, g, h) \ /* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]); \ * d += h; \ * h += Sum0 (a) + Maj (a, b, c); \ * \ * Ch(x, y, z) => ((x & y) + (~x & z)) \ * Maj(x, y, z) => ((x & y) + (z & (x ^ y))) \ */ \ \ vmovq y3, [XFERIN]; \ vmovdqa64 y2, e; \ vpaddq h, h, y3; \ vprorq y0, e, 41; \ vpternlogq y2, f, g, 0xca; /* Ch (e, f, g) */ \ vprorq y1, e, 18; \ vprorq y3, e, 14; \ vpaddq h, h, y2; \ vpternlogq y0, y1, y3, 0x96; /* Sum1 (e) */ \ vpaddq h, h, y0; /* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]) */ \ vpaddq d, d, h; /* d += h */ #define ONE_ROUND_PART2(a, b, c, d, e, f, g, h) \ vmovdqa64 y1, a; \ vprorq y0, a, 39; \ vpternlogq y1, b, c, 0xe8; /* Maj (a, b, c) */ \ vprorq y2, a, 34; \ vprorq y3, a, 28; \ vpternlogq y0, y2, y3, 0x96; /* Sum0 (a) */ \ vpaddq h, h, y1; \ vpaddq h, h, y0; /* h += Sum0 (a) + Maj (a, b, c) */ #define FOUR_ROUNDS_AND_SCHED(X, Y_0, Y_1, Y_2, Y_3, a, b, c, d, e, f, g, h) \ /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \ vmovdqa YTMP0, Y_3; \ vmovdqa YTMP1, Y_1; \ /* Extract w[t-7] */; \ vpermt2q YTMP0, PERM_VPALIGNR_8, Y_2 /* YTMP0 = W[-7] */; \ /* Calculate w[t-16] + w[t-7] */; \ vpaddq YTMP0, YTMP0, Y_0 /* YTMP0 = W[-7] + W[-16] */; \ /* Extract w[t-15] */; \ vpermt2q YTMP1, PERM_VPALIGNR_8, Y_0 /* YTMP1 = W[-15] */; \ ONE_ROUND_PART1(rsp+frame_XFER+0*8+X*32, a, b, c, d, e, f, g, h); \ \ /* Calculate sigma0 */; \ \ /* Calculate w[t-15] ror 1 */; \ vprorq YTMP3, YTMP1, 1; /* YTMP3 = W[-15] ror 1 */; \ /* Calculate w[t-15] shr 7 */; \ vpsrlq YTMP4, YTMP1, 7 /* YTMP4 = W[-15] >> 7 */; \ \ ONE_ROUND_PART2(a, b, c, d, e, f, g, h); \ \ /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \ /* Calculate w[t-15] ror 8 */; \ vprorq YTMP1, YTMP1, 8 /* YTMP1 = W[-15] ror 8 */; \ /* XOR the three components */; \ vpternlogq YTMP1, YTMP3, YTMP4, 0x96 /* YTMP1 = s0 = W[-15] ror 1 ^ W[-15] >> 7 ^ W[-15] ror 8 */; \ \ /* Add three components, w[t-16], w[t-7] and sigma0 */; \ vpaddq YTMP0, YTMP0, YTMP1 /* YTMP0 = W[-16] + W[-7] + s0 */; \ ONE_ROUND_PART1(rsp+frame_XFER+1*8+X*32, h, a, b, c, d, e, f, g); \ /* Move to appropriate lanes for calculating w[16] and w[17] */; \ vshufi64x2 Y_0, YTMP0, YTMP0, 0x0 /* Y_0 = W[-16] + W[-7] + s0 {BABA} */; \ \ /* Calculate w[16] and w[17] in both 128 bit lanes */; \ \ /* Calculate sigma1 for w[16] and w[17] on both 128 bit lanes */; \ vshufi64x2 YTMP2, Y_3, Y_3, 0b11 /* YTMP2 = W[-2] {BABA} */; \ vpsrlq YTMP4, YTMP2, 6 /* YTMP4 = W[-2] >> 6 {BABA} */; \ \ ONE_ROUND_PART2(h, a, b, c, d, e, f, g); \ \ /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \ vprorq YTMP3, YTMP2, 19 /* YTMP3 = W[-2] ror 19 {BABA} */; \ vprorq YTMP1, YTMP2, 61 /* YTMP3 = W[-2] ror 61 {BABA} */; \ vpternlogq YTMP4, YTMP3, YTMP1, 0x96 /* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} */; \ \ ONE_ROUND_PART1(rsp+frame_XFER+2*8+X*32, g, h, a, b, c, d, e, f); \ /* Add sigma1 to the other compunents to get w[16] and w[17] */; \ vpaddq Y_0, Y_0, YTMP4 /* Y_0 = {W[1], W[0], W[1], W[0]} */; \ \ /* Calculate sigma1 for w[18] and w[19] for upper 128 bit lane */; \ vpsrlq YTMP4, Y_0, 6 /* YTMP4 = W[-2] >> 6 {DC--} */; \ \ ONE_ROUND_PART2(g, h, a, b, c, d, e, f); \ \ /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \ vprorq YTMP3, Y_0, 19 /* YTMP3 = W[-2] ror 19 {DC--} */; \ vprorq YTMP1, Y_0, 61 /* YTMP1 = W[-2] ror 61 {DC--} */; \ vpternlogq YTMP4, YTMP3, YTMP1, 0x96 /* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {DC--} */; \ \ ONE_ROUND_PART1(rsp+frame_XFER+3*8+X*32, f, g, h, a, b, c, d, e); \ /* Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] to newly calculated sigma1 to get w[18] and w[19] */; \ /* Form w[19, w[18], w17], w[16] */; \ vpaddq Y_0{MASK_DC_00}, YTMP0, YTMP4 /* YTMP2 = {W[3], W[2], W[1], W[0]} */; \ \ vpaddq XFER, Y_0, [TBL + (4+X)*32]; \ vmovdqa [rsp + frame_XFER + X*32], XFER; \ ONE_ROUND_PART2(f, g, h, a, b, c, d, e) #define ONE_ROUND(XFERIN, a, b, c, d, e, f, g, h) \ ONE_ROUND_PART1(XFERIN, a, b, c, d, e, f, g, h); \ ONE_ROUND_PART2(a, b, c, d, e, f, g, h) #define DO_4ROUNDS(X, a, b, c, d, e, f, g, h) \ ONE_ROUND(rsp+frame_XFER+0*8+X*32, a, b, c, d, e, f, g, h); \ ONE_ROUND(rsp+frame_XFER+1*8+X*32, h, a, b, c, d, e, f, g); \ ONE_ROUND(rsp+frame_XFER+2*8+X*32, g, h, a, b, c, d, e, f); \ ONE_ROUND(rsp+frame_XFER+3*8+X*32, f, g, h, a, b, c, d, e) /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; void sha512_avx512(const void* M, void* D, uint64_t L); ; Purpose: Updates the SHA512 digest stored at D with the message stored in M. ; The size of the message pointed to by M must be an integer multiple of SHA512 ; message blocks. ; L is the message length in SHA512 blocks */ .globl _gcry_sha512_transform_amd64_avx512 ELF(.type _gcry_sha512_transform_amd64_avx512,@function;) .align 16 _gcry_sha512_transform_amd64_avx512: CFI_STARTPROC() xor eax, eax cmp rdx, 0 je .Lnowork /* Setup mask register for DC:BA merging. 
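   MASK_DC_00 (k1) is loaded with 0b1100, enabling only the two upper qword
   lanes of a ymm register; the masked vpaddq in FOUR_ROUNDS_AND_SCHED uses it
   to write the freshly scheduled w[18],w[19] into the DC half while the BA
   half keeps w[16],w[17].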
*/ mov eax, 0b1100 kmovd MASK_DC_00, eax /* Allocate Stack Space */ mov RSP_SAVE, rsp CFI_DEF_CFA_REGISTER(RSP_SAVE); sub rsp, frame_size and rsp, ~(0x40 - 1) /*; load initial digest */ vmovq a,[8*0 + CTX] vmovq b,[8*1 + CTX] vmovq c,[8*2 + CTX] vmovq d,[8*3 + CTX] vmovq e,[8*4 + CTX] vmovq f,[8*5 + CTX] vmovq g,[8*6 + CTX] vmovq h,[8*7 + CTX] vmovdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP] vpmovzxbq PERM_VPALIGNR_8, [.LPERM_VPALIGNR_8 ADD_RIP] lea TBL,[.LK512 ADD_RIP] /*; byte swap first 16 dwords */ COPY_YMM_AND_BSWAP(Y_0, [INP + 0*32], BYTE_FLIP_MASK) COPY_YMM_AND_BSWAP(Y_1, [INP + 1*32], BYTE_FLIP_MASK) COPY_YMM_AND_BSWAP(Y_2, [INP + 2*32], BYTE_FLIP_MASK) COPY_YMM_AND_BSWAP(Y_3, [INP + 3*32], BYTE_FLIP_MASK) lea INP, [INP + 128] vpaddq XFER, Y_0, [TBL + 0*32] vmovdqa [rsp + frame_XFER + 0*32], XFER vpaddq XFER, Y_1, [TBL + 1*32] vmovdqa [rsp + frame_XFER + 1*32], XFER vpaddq XFER, Y_2, [TBL + 2*32] vmovdqa [rsp + frame_XFER + 2*32], XFER vpaddq XFER, Y_3, [TBL + 3*32] vmovdqa [rsp + frame_XFER + 3*32], XFER /*; schedule 64 input dwords, by doing 12 rounds of 4 each */ mov SRND, 4 .align 16 .Loop0: FOUR_ROUNDS_AND_SCHED(0, Y_0, Y_1, Y_2, Y_3, a, b, c, d, e, f, g, h) FOUR_ROUNDS_AND_SCHED(1, Y_1, Y_2, Y_3, Y_0, e, f, g, h, a, b, c, d) FOUR_ROUNDS_AND_SCHED(2, Y_2, Y_3, Y_0, Y_1, a, b, c, d, e, f, g, h) FOUR_ROUNDS_AND_SCHED(3, Y_3, Y_0, Y_1, Y_2, e, f, g, h, a, b, c, d) lea TBL, [TBL + 4*32] sub SRND, 1 jne .Loop0 sub NUM_BLKS, 1 je .Ldone_hash lea TBL, [.LK512 ADD_RIP] /* load next block and byte swap */ COPY_YMM_AND_BSWAP(Y_0, [INP + 0*32], BYTE_FLIP_MASK) COPY_YMM_AND_BSWAP(Y_1, [INP + 1*32], BYTE_FLIP_MASK) COPY_YMM_AND_BSWAP(Y_2, [INP + 2*32], BYTE_FLIP_MASK) COPY_YMM_AND_BSWAP(Y_3, [INP + 3*32], BYTE_FLIP_MASK) lea INP, [INP + 128] DO_4ROUNDS(0, a, b, c, d, e, f, g, h) vpaddq XFER, Y_0, [TBL + 0*32] vmovdqa [rsp + frame_XFER + 0*32], XFER DO_4ROUNDS(1, e, f, g, h, a, b, c, d) vpaddq XFER, Y_1, [TBL + 1*32] vmovdqa [rsp + frame_XFER + 1*32], XFER DO_4ROUNDS(2, a, b, c, d, e, f, g, h) vpaddq XFER, Y_2, [TBL + 2*32] vmovdqa [rsp + frame_XFER + 2*32], XFER DO_4ROUNDS(3, e, f, g, h, a, b, c, d) vpaddq XFER, Y_3, [TBL + 3*32] vmovdqa [rsp + frame_XFER + 3*32], XFER addm([8*0 + CTX],a) addm([8*1 + CTX],b) addm([8*2 + CTX],c) addm([8*3 + CTX],d) addm([8*4 + CTX],e) addm([8*5 + CTX],f) addm([8*6 + CTX],g) addm([8*7 + CTX],h) /*; schedule 64 input dwords, by doing 12 rounds of 4 each */ mov SRND, 4 jmp .Loop0 .Ldone_hash: DO_4ROUNDS(0, a, b, c, d, e, f, g, h) DO_4ROUNDS(1, e, f, g, h, a, b, c, d) DO_4ROUNDS(2, a, b, c, d, e, f, g, h) DO_4ROUNDS(3, e, f, g, h, a, b, c, d) addm([8*0 + CTX],a) xor eax, eax /* burn stack */ addm([8*1 + CTX],b) addm([8*2 + CTX],c) addm([8*3 + CTX],d) addm([8*4 + CTX],e) addm([8*5 + CTX],f) addm([8*6 + CTX],g) addm([8*7 + CTX],h) - kmovd MASK_DC_00, eax + kxord MASK_DC_00, MASK_DC_00, MASK_DC_00 vzeroall vmovdqa [rsp + frame_XFER + 0*32], ymm0 /* burn stack */ vmovdqa [rsp + frame_XFER + 1*32], ymm0 /* burn stack */ vmovdqa [rsp + frame_XFER + 2*32], ymm0 /* burn stack */ vmovdqa [rsp + frame_XFER + 3*32], ymm0 /* burn stack */ clear_reg(xmm16); clear_reg(xmm17); clear_reg(xmm18); clear_reg(xmm19); clear_reg(xmm20); clear_reg(xmm21); clear_reg(xmm22); /* Restore Stack Pointer */ mov rsp, RSP_SAVE CFI_DEF_CFA_REGISTER(rsp) .Lnowork: ret_spec_stop CFI_ENDPROC() ELF(.size _gcry_sha512_transform_amd64_avx512,.-_gcry_sha512_transform_amd64_avx512) /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /*;; Binary Data */ ELF(.type 
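Two details of the transform body above, shown as scalar C for reference (names are illustrative): the vpshufb against .LPSHUFFLE_BYTE_FLIP_MASK converts each loaded message qword from little-endian byte order to the big-endian words SHA-512 operates on, and the eight addm() calls per block add the working variables back into the digest kept at CTX.

#include <stdint.h>
#include <stddef.h>

/* Big-endian load of one 64-bit message word: the scalar counterpart of the
   byte-flip shuffle, which swaps the bytes of four qwords at once. */
static uint64_t
load_be64 (const unsigned char *p)
{
  return ((uint64_t)p[0] << 56) | ((uint64_t)p[1] << 48)
       | ((uint64_t)p[2] << 40) | ((uint64_t)p[3] << 32)
       | ((uint64_t)p[4] << 24) | ((uint64_t)p[5] << 16)
       | ((uint64_t)p[6] <<  8) |  (uint64_t)p[7];
}

/* Per-block feed-forward: the scalar equivalent of the eight addm() calls,
   digest[i] += working_variable[i] after the 80 rounds of a 128-byte block. */
static void
sha512_feed_forward (uint64_t digest[8], const uint64_t s[8])
{
  size_t i;
  for (i = 0; i < 8; i++)
    digest[i] += s[i];
}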
_gcry_sha512_avx512_consts,@object) _gcry_sha512_avx512_consts: .align 64 /* K[t] used in SHA512 hashing */ .LK512: .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 /* Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. */ .align 32 .LPSHUFFLE_BYTE_FLIP_MASK: .octa 0x08090a0b0c0d0e0f0001020304050607 .octa 0x18191a1b1c1d1e1f1011121314151617 .align 4 .LPERM_VPALIGNR_8: .byte 5, 6, 7, 0 ELF(.size _gcry_sha512_avx512_consts,.-_gcry_sha512_avx512_consts) #endif #endif diff --git a/configure.ac b/configure.ac index e63a7d6d..a7482cf3 100644 --- a/configure.ac +++ b/configure.ac @@ -1,3397 +1,3400 @@ # Configure.ac script for Libgcrypt # Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2006, # 2007, 2008, 2009, 2011 Free Software Foundation, Inc. # Copyright (C) 2012-2021 g10 Code GmbH # # This file is part of Libgcrypt. # # Libgcrypt is free software; you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation; either version 2.1 of # the License, or (at your option) any later version. # # Libgcrypt is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this program; if not, see . # (Process this file with autoconf to produce a configure script.) AC_REVISION($Revision$) AC_PREREQ([2.69]) min_automake_version="1.14" # To build a release you need to create a tag with the version number # (git tag -s libgcrypt-n.m.k) and run "./autogen.sh --force". 
Please # bump the version number immediately after the release and do another # commit and push so that the git magic is able to work. See below # for the LT versions. m4_define([mym4_package],[libgcrypt]) m4_define([mym4_major], [1]) m4_define([mym4_minor], [11]) m4_define([mym4_micro], [0]) # Below is m4 magic to extract and compute the git revision number, # the decimalized short revision number, a beta version string and a # flag indicating a development version (mym4_isbeta). Note that the # m4 processing is done by autoconf and not during the configure run. m4_define([mym4_verslist], m4_split(m4_esyscmd([./autogen.sh --find-version] \ mym4_package mym4_major mym4_minor mym4_micro),[:])) m4_define([mym4_isbeta], m4_argn(2, mym4_verslist)) m4_define([mym4_version], m4_argn(4, mym4_verslist)) m4_define([mym4_revision], m4_argn(7, mym4_verslist)) m4_define([mym4_revision_dec], m4_argn(8, mym4_verslist)) m4_esyscmd([echo ]mym4_version[>VERSION]) AC_INIT([mym4_package],[mym4_version],[https://bugs.gnupg.org]) # LT Version numbers, remember to change them just *before* a release. # NOET NOTE - Already updated for a 1.11 series - NOTE NOTE # (Code changed: REVISION++) # (Interfaces added/removed/changed: CURRENT++, REVISION=0) # (Interfaces added: AGE++) # (Interfaces removed: AGE=0) # # (Interfaces removed: CURRENT++, AGE=0, REVISION=0) # (Interfaces added: CURRENT++, AGE++, REVISION=0) # (No interfaces changed: REVISION++) LIBGCRYPT_LT_CURRENT=25 LIBGCRYPT_LT_AGE=5 LIBGCRYPT_LT_REVISION=0 ################################################ AC_SUBST(LIBGCRYPT_LT_CURRENT) AC_SUBST(LIBGCRYPT_LT_AGE) AC_SUBST(LIBGCRYPT_LT_REVISION) # If the API is changed in an incompatible way: increment the next counter. # # 1.6: ABI and API change but the change is to most users irrelevant # and thus the API version number has not been incremented. LIBGCRYPT_CONFIG_API_VERSION=1 # If you change the required gpg-error version, please remove # unnecessary error code defines in src/gcrypt-int.h. NEED_GPG_ERROR_VERSION=1.27 AC_CONFIG_AUX_DIR([build-aux]) AC_CONFIG_SRCDIR([src/libgcrypt.vers]) AM_INIT_AUTOMAKE([serial-tests dist-bzip2]) AC_CONFIG_HEADERS([config.h]) AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_LIBOBJ_DIR([compat]) AC_CANONICAL_HOST AM_MAINTAINER_MODE AM_SILENT_RULES AC_ARG_VAR(SYSROOT,[locate config scripts also below that directory]) AH_TOP([ #ifndef _GCRYPT_CONFIG_H_INCLUDED #define _GCRYPT_CONFIG_H_INCLUDED /* Enable gpg-error's strerror macro for W32CE. */ #define GPG_ERR_ENABLE_ERRNO_MACROS 1 ]) AH_BOTTOM([ #define _GCRYPT_IN_LIBGCRYPT 1 /* Add .note.gnu.property section for Intel CET in assembler sources when CET is enabled. */ #if defined(__ASSEMBLER__) && defined(__CET__) # include #endif /* If the configure check for endianness has been disabled, get it from OS macros. This is intended for making fat binary builds on OS X. */ #ifdef DISABLED_ENDIAN_CHECK # if defined(__BIG_ENDIAN__) # define WORDS_BIGENDIAN 1 # elif defined(__LITTLE_ENDIAN__) # undef WORDS_BIGENDIAN # else # error "No endianness found" # endif #endif /*DISABLED_ENDIAN_CHECK*/ /* We basically use the original Camellia source. Make sure the symbols properly prefixed. */ #define CAMELLIA_EXT_SYM_PREFIX _gcry_ #endif /*_GCRYPT_CONFIG_H_INCLUDED*/ ]) AH_VERBATIM([_REENTRANT], [/* To allow the use of Libgcrypt in multithreaded programs we have to use special features from the library. */ #ifndef _REENTRANT # define _REENTRANT 1 #endif ]) ###################### ## Basic checks. ### (we need some results later on (e.g. 
$GCC) ###################### AC_PROG_MAKE_SET missing_dir=`cd $ac_aux_dir && pwd` AM_MISSING_PROG(ACLOCAL, aclocal, $missing_dir) AM_MISSING_PROG(AUTOCONF, autoconf, $missing_dir) AM_MISSING_PROG(AUTOMAKE, automake, $missing_dir) AM_MISSING_PROG(AUTOHEADER, autoheader, $missing_dir) # AM_MISSING_PROG(MAKEINFO, makeinfo, $missing_dir) AC_PROG_CC AC_PROG_CPP AM_PROG_CC_C_O AM_PROG_AS AC_SEARCH_LIBS([strerror],[cposix]) AC_PROG_INSTALL AC_PROG_AWK AC_USE_SYSTEM_EXTENSIONS # Taken from mpfr-4.0.1, then modified for LDADD_FOR_TESTS_KLUDGE dnl Under Linux, make sure that the old dtags are used if LD_LIBRARY_PATH dnl is defined. The issue is that with the new dtags, LD_LIBRARY_PATH has dnl the precedence over the run path, so that if a compatible MPFR library dnl is installed in some directory from $LD_LIBRARY_PATH, then the tested dnl MPFR library will be this library instead of the MPFR library from the dnl build tree. Other OS with the same issue might be added later. dnl dnl References: dnl https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=859732 dnl http://lists.gnu.org/archive/html/libtool/2017-05/msg00000.html dnl dnl We need to check whether --disable-new-dtags is supported as alternate dnl linkers may be used (e.g., with tcc: CC=tcc LD=tcc). dnl case $host in *-*-linux*) if test -n "$LD_LIBRARY_PATH"; then saved_LDFLAGS="$LDFLAGS" LDADD_FOR_TESTS_KLUDGE="-Wl,--disable-new-dtags" LDFLAGS="$LDFLAGS $LDADD_FOR_TESTS_KLUDGE" AC_MSG_CHECKING(whether --disable-new-dtags is supported by the linker) AC_LINK_IFELSE([AC_LANG_SOURCE([[ int main (void) { return 0; } ]])], [AC_MSG_RESULT(yes (use it since LD_LIBRARY_PATH is set))], [AC_MSG_RESULT(no) LDADD_FOR_TESTS_KLUDGE="" ]) LDFLAGS="$saved_LDFLAGS" fi ;; esac AC_SUBST([LDADD_FOR_TESTS_KLUDGE]) VERSION_NUMBER=m4_esyscmd(printf "0x%02x%02x%02x" mym4_major \ mym4_minor mym4_micro) AC_SUBST(VERSION_NUMBER) # We need to compile and run a program on the build machine. AX_CC_FOR_BUILD LT_PREREQ([2.2.6]) LT_INIT([win32-dll disable-static]) LT_LANG([Windows Resource]) ########################## ## General definitions. ## ########################## # Used by libgcrypt-config LIBGCRYPT_CONFIG_LIBS="-lgcrypt" LIBGCRYPT_CONFIG_CFLAGS="" LIBGCRYPT_CONFIG_HOST="$host" # Definitions for symmetric ciphers. available_ciphers="arcfour blowfish cast5 des aes twofish serpent rfc2268 seed" available_ciphers="$available_ciphers camellia idea salsa20 gost28147 chacha20" available_ciphers="$available_ciphers sm4" enabled_ciphers="" # Definitions for public-key ciphers. available_pubkey_ciphers="dsa elgamal rsa ecc" enabled_pubkey_ciphers="" # Definitions for message digests. available_digests="crc gostr3411-94 md2 md4 md5 rmd160 sha1 sha256 sha512" available_digests="$available_digests sha3 tiger whirlpool stribog blake2" available_digests="$available_digests sm3" enabled_digests="" # Definitions for kdfs (optional ones) available_kdfs="s2k pkdf2 scrypt" enabled_kdfs="" # Definitions for random modules. available_random_modules="getentropy linux egd unix" auto_random_modules="$available_random_modules" # Supported thread backends. LIBGCRYPT_THREAD_MODULES="" # Other definitions. have_w32_system=no have_w32ce_system=no have_pthread=no # Setup some stuff depending on host. 
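The VERSION_NUMBER substitution above packs major/minor/micro into two hex digits each, so 1.11.0 becomes 0x010b00 and releases compare as plain integers. A small C illustration of that packing; the MAKE_VERSION_NUMBER macro is hypothetical, not something configure generates.

#include <stdio.h>

/* Hypothetical helper: pack major/minor/micro the same way the printf above
   does, two hex digits per component. */
#define MAKE_VERSION_NUMBER(maj, min, mic) \
  ((unsigned int)(((maj) << 16) | ((min) << 8) | (mic)))

int
main (void)
{
  unsigned int v = MAKE_VERSION_NUMBER (1, 11, 0);
  printf ("0x%06x\n", v);   /* prints 0x010b00, matching VERSION_NUMBER for 1.11.0 */
  return v >= MAKE_VERSION_NUMBER (1, 10, 0) ? 0 : 1;
}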
case "${host}" in *-*-mingw32*) ac_cv_have_dev_random=no have_w32_system=yes case "${host}" in *-mingw32ce*) have_w32ce_system=yes available_random_modules="w32ce" ;; *) available_random_modules="w32" ;; esac AC_DEFINE(USE_ONLY_8DOT3,1, [set this to limit filenames to the 8.3 format]) AC_DEFINE(HAVE_DRIVE_LETTERS,1, [defined if we must run on a stupid file system]) AC_DEFINE(HAVE_DOSISH_SYSTEM,1, [defined if we run on some of the PCDOS like systems (DOS, Windoze. OS/2) with special properties like no file modes]) ;; i?86-emx-os2 | i?86-*-os2*emx) # OS/2 with the EMX environment ac_cv_have_dev_random=no AC_DEFINE(HAVE_DRIVE_LETTERS) AC_DEFINE(HAVE_DOSISH_SYSTEM) ;; i?86-*-msdosdjgpp*) # DOS with the DJGPP environment ac_cv_have_dev_random=no AC_DEFINE(HAVE_DRIVE_LETTERS) AC_DEFINE(HAVE_DOSISH_SYSTEM) ;; *-*-hpux*) if test -z "$GCC" ; then CFLAGS="$CFLAGS -Ae -D_HPUX_SOURCE" fi ;; *-dec-osf4*) if test -z "$GCC" ; then # Suppress all warnings # to get rid of the unsigned/signed char mismatch warnings. CFLAGS="$CFLAGS -w" fi ;; m68k-atari-mint) ;; *-apple-darwin*) AC_DEFINE(_DARWIN_C_SOURCE, 1, Expose all libc features (__DARWIN_C_FULL).) AC_DEFINE(USE_POSIX_SPAWN_FOR_TESTS, 1, [defined if we use posix_spawn in test program]) AC_CHECK_HEADERS(spawn.h) ;; *) ;; esac if test "$have_w32_system" = yes; then AC_DEFINE(HAVE_W32_SYSTEM,1, [Defined if we run on a W32 API based system]) if test "$have_w32ce_system" = yes; then AC_DEFINE(HAVE_W32CE_SYSTEM,1,[Defined if we run on WindowsCE]) fi fi AM_CONDITIONAL(HAVE_W32_SYSTEM, test "$have_w32_system" = yes) AM_CONDITIONAL(HAVE_W32CE_SYSTEM, test "$have_w32ce_system" = yes) # A printable OS Name is sometimes useful. case "${host}" in *-*-mingw32ce*) PRINTABLE_OS_NAME="W32CE" ;; *-*-mingw32*) PRINTABLE_OS_NAME="W32" ;; i?86-emx-os2 | i?86-*-os2*emx ) PRINTABLE_OS_NAME="OS/2" ;; i?86-*-msdosdjgpp*) PRINTABLE_OS_NAME="MSDOS/DJGPP" ;; *-linux*) PRINTABLE_OS_NAME="GNU/Linux" ;; *) PRINTABLE_OS_NAME=`uname -s || echo "Unknown"` ;; esac NAME_OF_DEV_RANDOM="/dev/random" NAME_OF_DEV_URANDOM="/dev/urandom" AC_ARG_ENABLE(endian-check, AS_HELP_STRING([--disable-endian-check], [disable the endian check and trust the OS provided macros]), endiancheck=$enableval,endiancheck=yes) if test x"$endiancheck" = xyes ; then AC_C_BIGENDIAN else AC_DEFINE(DISABLED_ENDIAN_CHECK,1,[configure did not test for endianness]) fi AC_CHECK_SIZEOF(unsigned short, 2) AC_CHECK_SIZEOF(unsigned int, 4) AC_CHECK_SIZEOF(unsigned long, 4) AC_CHECK_SIZEOF(unsigned long long, 0) AC_CHECK_SIZEOF(void *, 0) AC_TYPE_UINTPTR_T if test "$ac_cv_sizeof_unsigned_short" = "0" \ || test "$ac_cv_sizeof_unsigned_int" = "0" \ || test "$ac_cv_sizeof_unsigned_long" = "0"; then AC_MSG_WARN([Hmmm, something is wrong with the sizes - using defaults]); fi # Ensure that we have UINT64_C before we bother to check for uint64_t AC_CACHE_CHECK([for UINT64_C],[gnupg_cv_uint64_c_works], AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include ]], [[uint64_t foo=UINT64_C(42);]])], gnupg_cv_uint64_c_works=yes,gnupg_cv_uint64_c_works=no)) if test "$gnupg_cv_uint64_c_works" = "yes" ; then AC_CHECK_SIZEOF(uint64_t) fi # Do we have any 64-bit data types? if test "$ac_cv_sizeof_unsigned_int" != "8" \ && test "$ac_cv_sizeof_unsigned_long" != "8" \ && test "$ac_cv_sizeof_unsigned_long_long" != "8" \ && test "$ac_cv_sizeof_uint64_t" != "8"; then AC_MSG_ERROR([[ *** *** No 64-bit integer type available. *** It is not possible to build Libgcrypt on this platform. 
***]]) fi # If not specified otherwise, all available algorithms will be # included. default_ciphers="$available_ciphers" default_pubkey_ciphers="$available_pubkey_ciphers" default_digests="$available_digests" default_kdfs="$available_kdfs" # Blacklist MD2 by default default_digests=`echo $default_digests | sed -e 's/md2//g'` # Substitutions to set generated files in a Emacs buffer to read-only. AC_SUBST(emacs_local_vars_begin, ['Local Variables:']) AC_SUBST(emacs_local_vars_read_only, ['buffer-read-only: t']) AC_SUBST(emacs_local_vars_end, ['End:']) ############################ ## Command line switches. ## ############################ # Implementation of the --enable-ciphers switch. AC_ARG_ENABLE(ciphers, AS_HELP_STRING([--enable-ciphers=ciphers], [select the symmetric ciphers to include]), [enabled_ciphers=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`], [enabled_ciphers=""]) if test "x$enabled_ciphers" = "x" \ -o "$enabled_ciphers" = "yes" \ -o "$enabled_ciphers" = "no"; then enabled_ciphers=$default_ciphers fi AC_MSG_CHECKING([which symmetric ciphers to include]) for cipher in $enabled_ciphers; do LIST_MEMBER($cipher, $available_ciphers) if test "$found" = "0"; then AC_MSG_ERROR([unsupported cipher "$cipher" specified]) fi done AC_MSG_RESULT([$enabled_ciphers]) # Implementation of the --enable-pubkey-ciphers switch. AC_ARG_ENABLE(pubkey-ciphers, AS_HELP_STRING([--enable-pubkey-ciphers=ciphers], [select the public-key ciphers to include]), [enabled_pubkey_ciphers=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`], [enabled_pubkey_ciphers=""]) if test "x$enabled_pubkey_ciphers" = "x" \ -o "$enabled_pubkey_ciphers" = "yes" \ -o "$enabled_pubkey_ciphers" = "no"; then enabled_pubkey_ciphers=$default_pubkey_ciphers fi AC_MSG_CHECKING([which public-key ciphers to include]) for cipher in $enabled_pubkey_ciphers; do LIST_MEMBER($cipher, $available_pubkey_ciphers) if test "$found" = "0"; then AC_MSG_ERROR([unsupported public-key cipher specified]) fi done AC_MSG_RESULT([$enabled_pubkey_ciphers]) # Implementation of the --enable-digests switch. AC_ARG_ENABLE(digests, AS_HELP_STRING([--enable-digests=digests], [select the message digests to include]), [enabled_digests=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`], [enabled_digests=""]) if test "x$enabled_digests" = "x" \ -o "$enabled_digests" = "yes" \ -o "$enabled_digests" = "no"; then enabled_digests=$default_digests fi AC_MSG_CHECKING([which message digests to include]) for digest in $enabled_digests; do LIST_MEMBER($digest, $available_digests) if test "$found" = "0"; then AC_MSG_ERROR([unsupported message digest specified]) fi done AC_MSG_RESULT([$enabled_digests]) # Implementation of the --enable-kdfs switch. AC_ARG_ENABLE(kdfs, AS_HELP_STRING([--enable-kfds=kdfs], [select the KDFs to include]), [enabled_kdfs=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`], [enabled_kdfs=""]) if test "x$enabled_kdfs" = "x" \ -o "$enabled_kdfs" = "yes" \ -o "$enabled_kdfs" = "no"; then enabled_kdfs=$default_kdfs fi AC_MSG_CHECKING([which key derivation functions to include]) for kdf in $enabled_kdfs; do LIST_MEMBER($kdf, $available_kdfs) if test "$found" = "0"; then AC_MSG_ERROR([unsupported key derivation function specified]) fi done AC_MSG_RESULT([$enabled_kdfs]) # Implementation of the --enable-random switch. 
AC_ARG_ENABLE(random, AS_HELP_STRING([--enable-random=name], [select which random number generator to use]), [random=`echo $enableval | tr '[A-Z]' '[a-z]'`], []) if test "x$random" = "x" -o "$random" = "yes" -o "$random" = "no"; then random=default fi AC_MSG_CHECKING([which random module to use]) if test "$random" != "default" -a "$random" != "auto"; then LIST_MEMBER($random, $available_random_modules) if test "$found" = "0"; then AC_MSG_ERROR([unsupported random module specified]) fi fi AC_MSG_RESULT($random) # Implementation of the --disable-dev-random switch. AC_MSG_CHECKING([whether use of /dev/random is requested]) AC_ARG_ENABLE(dev-random, [ --disable-dev-random disable the use of dev random], try_dev_random=$enableval, try_dev_random=yes) AC_MSG_RESULT($try_dev_random) # Implementation of the --with-egd-socket switch. AC_ARG_WITH(egd-socket, [ --with-egd-socket=NAME Use NAME for the EGD socket)], egd_socket_name="$withval", egd_socket_name="" ) AC_DEFINE_UNQUOTED(EGD_SOCKET_NAME, "$egd_socket_name", [Define if you don't want the default EGD socket name. For details see cipher/rndegd.c]) # Implementation of --disable-asm. AC_MSG_CHECKING([whether MPI and cipher assembler modules are requested]) AC_ARG_ENABLE([asm], AS_HELP_STRING([--disable-asm], [Disable MPI and cipher assembler modules]), [try_asm_modules=$enableval], [try_asm_modules=yes]) AC_MSG_RESULT($try_asm_modules) if test "$try_asm_modules" != yes ; then AC_DEFINE(ASM_DISABLED,1,[Defined if --disable-asm was used to configure]) fi # Implementation of the --enable-large-data-tests switch. AC_MSG_CHECKING([whether to run large data tests]) AC_ARG_ENABLE(large-data-tests, AS_HELP_STRING([--enable-large-data-tests], [Enable the real long ruinning large data tests]), large_data_tests=$enableval,large_data_tests=no) AC_MSG_RESULT($large_data_tests) AC_SUBST(RUN_LARGE_DATA_TESTS, $large_data_tests) # Implementation of --enable-force-soft-hwfeatures AC_MSG_CHECKING([whether 'soft' HW feature bits are forced on]) AC_ARG_ENABLE([force-soft-hwfeatures], AS_HELP_STRING([--enable-force-soft-hwfeatures], [Enable forcing 'soft' HW feature bits on]), [force_soft_hwfeatures=$enableval], [force_soft_hwfeatures=no]) AC_MSG_RESULT($force_soft_hwfeatures) # Implementation of the --with-capabilities switch. # Check whether we want to use Linux capabilities AC_MSG_CHECKING([whether use of capabilities is requested]) AC_ARG_WITH(capabilities, AS_HELP_STRING([--with-capabilities], [Use linux capabilities [default=no]]), [use_capabilities="$withval"],[use_capabilities=no]) AC_MSG_RESULT($use_capabilities) # Implementation of the --enable-hmac-binary-check. AC_MSG_CHECKING([whether a HMAC binary check is requested]) AC_ARG_ENABLE(hmac-binary-check, AS_HELP_STRING([--enable-hmac-binary-check], [Enable library integrity check]), [use_hmac_binary_check="$enableval"], [use_hmac_binary_check=no]) AC_MSG_RESULT($use_hmac_binary_check) if test "$use_hmac_binary_check" = no ; then DEF_HMAC_BINARY_CHECK='' else AC_DEFINE(ENABLE_HMAC_BINARY_CHECK,1, [Define to support an HMAC based integrity check]) AC_CHECK_TOOL(OBJCOPY, [objcopy]) AC_CHECK_TOOL(READELF, [readelf]) if test "$use_hmac_binary_check" != yes ; then DEF_HMAC_BINARY_CHECK=-DKEY_FOR_BINARY_CHECK="'\"$use_hmac_binary_check\"'" fi fi AM_CONDITIONAL(USE_HMAC_BINARY_CHECK, test "x$use_hmac_binary_check" != xno) AC_SUBST(DEF_HMAC_BINARY_CHECK) # Implementation of the --with-fips-module-version. 
AC_ARG_WITH(fips-module-version, AS_HELP_STRING([--with-fips-module-version=VERSION], [Specify the FIPS module version for the build]), fips_module_version="$withval", fips_module_version="" ) AC_DEFINE_UNQUOTED(FIPS_MODULE_VERSION, "$fips_module_version", [Define FIPS module version for certification]) # Implementation of the --disable-jent-support switch. AC_MSG_CHECKING([whether jitter entropy support is requested]) AC_ARG_ENABLE(jent-support, AS_HELP_STRING([--disable-jent-support], [Disable support for the Jitter entropy collector]), jentsupport=$enableval,jentsupport=yes) AC_MSG_RESULT($jentsupport) # Implementation of the --disable-padlock-support switch. AC_MSG_CHECKING([whether padlock support is requested]) AC_ARG_ENABLE(padlock-support, AS_HELP_STRING([--disable-padlock-support], [Disable support for the PadLock Engine of VIA processors]), padlocksupport=$enableval,padlocksupport=yes) AC_MSG_RESULT($padlocksupport) # Implementation of the --disable-aesni-support switch. AC_MSG_CHECKING([whether AESNI support is requested]) AC_ARG_ENABLE(aesni-support, AS_HELP_STRING([--disable-aesni-support], [Disable support for the Intel AES-NI instructions]), aesnisupport=$enableval,aesnisupport=yes) AC_MSG_RESULT($aesnisupport) # Implementation of the --disable-shaext-support switch. AC_MSG_CHECKING([whether SHAEXT support is requested]) AC_ARG_ENABLE(shaext-support, AS_HELP_STRING([--disable-shaext-support], [Disable support for the Intel SHAEXT instructions]), shaextsupport=$enableval,shaextsupport=yes) AC_MSG_RESULT($shaextsupport) # Implementation of the --disable-pclmul-support switch. AC_MSG_CHECKING([whether PCLMUL support is requested]) AC_ARG_ENABLE(pclmul-support, AS_HELP_STRING([--disable-pclmul-support], [Disable support for the Intel PCLMUL instructions]), pclmulsupport=$enableval,pclmulsupport=yes) AC_MSG_RESULT($pclmulsupport) # Implementation of the --disable-sse41-support switch. AC_MSG_CHECKING([whether SSE4.1 support is requested]) AC_ARG_ENABLE(sse41-support, AS_HELP_STRING([--disable-sse41-support], [Disable support for the Intel SSE4.1 instructions]), sse41support=$enableval,sse41support=yes) AC_MSG_RESULT($sse41support) # Implementation of the --disable-drng-support switch. AC_MSG_CHECKING([whether DRNG support is requested]) AC_ARG_ENABLE(drng-support, AS_HELP_STRING([--disable-drng-support], [Disable support for the Intel DRNG (RDRAND instruction)]), drngsupport=$enableval,drngsupport=yes) AC_MSG_RESULT($drngsupport) # Implementation of the --disable-avx-support switch. AC_MSG_CHECKING([whether AVX support is requested]) AC_ARG_ENABLE(avx-support, AS_HELP_STRING([--disable-avx-support], [Disable support for the Intel AVX instructions]), avxsupport=$enableval,avxsupport=yes) AC_MSG_RESULT($avxsupport) # Implementation of the --disable-avx2-support switch. AC_MSG_CHECKING([whether AVX2 support is requested]) AC_ARG_ENABLE(avx2-support, AS_HELP_STRING([--disable-avx2-support], [Disable support for the Intel AVX2 instructions]), avx2support=$enableval,avx2support=yes) AC_MSG_RESULT($avx2support) # Implementation of the --disable-avx512-support switch. AC_MSG_CHECKING([whether AVX512 support is requested]) AC_ARG_ENABLE(avx512-support, AS_HELP_STRING([--disable-avx512-support], [Disable support for the Intel AVX512 instructions]), avx512support=$enableval,avx512support=yes) AC_MSG_RESULT($avx512support) # Implementation of the --disable-gfni-support switch. 
AC_MSG_CHECKING([whether GFNI support is requested]) AC_ARG_ENABLE(gfni-support, AS_HELP_STRING([--disable-gfni-support], [Disable support for the Intel GFNI instructions]), gfnisupport=$enableval,gfnisupport=yes) AC_MSG_RESULT($gfnisupport) # Implementation of the --disable-neon-support switch. AC_MSG_CHECKING([whether NEON support is requested]) AC_ARG_ENABLE(neon-support, AS_HELP_STRING([--disable-neon-support], [Disable support for the ARM NEON instructions]), neonsupport=$enableval,neonsupport=yes) AC_MSG_RESULT($neonsupport) # Implementation of the --disable-arm-crypto-support switch. AC_MSG_CHECKING([whether ARMv8 Crypto Extension support is requested]) AC_ARG_ENABLE(arm-crypto-support, AS_HELP_STRING([--disable-arm-crypto-support], [Disable support for the ARMv8 Crypto Extension instructions]), armcryptosupport=$enableval,armcryptosupport=yes) AC_MSG_RESULT($armcryptosupport) # Implementation of the --disable-ppc-crypto-support switch. AC_MSG_CHECKING([whether PPC crypto support is requested]) AC_ARG_ENABLE(ppc-crypto-support, AS_HELP_STRING([--disable-ppc-crypto-support], [Disable support for the PPC crypto instructions introduced in POWER 8 (PowerISA 2.07)]), ppccryptosupport=$enableval,ppccryptosupport=yes) AC_MSG_RESULT($ppccryptosupport) # Implementation of the --disable-O-flag-munging switch. AC_MSG_CHECKING([whether a -O flag munging is requested]) AC_ARG_ENABLE([O-flag-munging], AS_HELP_STRING([--disable-O-flag-munging], [Disable modification of the cc -O flag]), [enable_o_flag_munging=$enableval], [enable_o_flag_munging=yes]) AC_MSG_RESULT($enable_o_flag_munging) AM_CONDITIONAL(ENABLE_O_FLAG_MUNGING, test "$enable_o_flag_munging" = "yes") # Implementation of the --disable-instrumentation-munging switch. AC_MSG_CHECKING([whether a instrumentation (-fprofile, -fsanitize) munging is requested]) AC_ARG_ENABLE([instrumentation-munging], AS_HELP_STRING([--disable-instrumentation-munging], [Disable modification of the cc instrumentation options]), [enable_instrumentation_munging=$enableval], [enable_instrumentation_munging=yes]) AC_MSG_RESULT($enable_instrumentation_munging) AM_CONDITIONAL(ENABLE_INSTRUMENTATION_MUNGING, test "$enable_instrumentation_munging" = "yes") # Implementation of the --disable-amd64-as-feature-detection switch. AC_MSG_CHECKING([whether to enable AMD64 as(1) feature detection]) AC_ARG_ENABLE(amd64-as-feature-detection, AS_HELP_STRING([--disable-amd64-as-feature-detection], [Disable the auto-detection of AMD64 as(1) features]), amd64_as_feature_detection=$enableval, amd64_as_feature_detection=yes) AC_MSG_RESULT($amd64_as_feature_detection) AC_DEFINE_UNQUOTED(PRINTABLE_OS_NAME, "$PRINTABLE_OS_NAME", [A human readable text with the name of the OS]) # For some systems we know that we have ld_version scripts. # Use it then as default. have_ld_version_script=no case "${host}" in *-*-linux*) have_ld_version_script=yes ;; *-*-gnu*) have_ld_version_script=yes ;; esac AC_ARG_ENABLE([ld-version-script], AS_HELP_STRING([--enable-ld-version-script], [enable/disable use of linker version script. (default is system dependent)]), [have_ld_version_script=$enableval], [ : ] ) AM_CONDITIONAL(HAVE_LD_VERSION_SCRIPT, test "$have_ld_version_script" = "yes") AC_DEFINE_UNQUOTED(NAME_OF_DEV_RANDOM, "$NAME_OF_DEV_RANDOM", [defined to the name of the strong random device]) AC_DEFINE_UNQUOTED(NAME_OF_DEV_URANDOM, "$NAME_OF_DEV_URANDOM", [defined to the name of the weaker random device]) ############################### #### Checks for libraries. 
#### ############################### # # gpg-error is required. # AM_PATH_GPG_ERROR("$NEED_GPG_ERROR_VERSION") if test "x$GPG_ERROR_LIBS" = "x"; then AC_MSG_ERROR([libgpg-error is needed. See ftp://ftp.gnupg.org/gcrypt/libgpg-error/ .]) fi AC_DEFINE(GPG_ERR_SOURCE_DEFAULT, GPG_ERR_SOURCE_GCRYPT, [The default error source for libgcrypt.]) AM_CONDITIONAL(USE_GPGRT_CONFIG, [test -n "$GPGRT_CONFIG" \ -a "$ac_cv_path_GPG_ERROR_CONFIG" = no]) # # Check whether pthreads is available # if test "$have_w32_system" != yes; then AC_CHECK_LIB(pthread,pthread_create,have_pthread=yes) if test "$have_pthread" = yes; then AC_DEFINE(HAVE_PTHREAD, 1 ,[Define if we have pthread.]) fi fi # Solaris needs -lsocket and -lnsl. Unisys system includes # gethostbyname in libsocket but needs libnsl for socket. AC_SEARCH_LIBS(setsockopt, [socket], , [AC_SEARCH_LIBS(setsockopt, [socket], , , [-lnsl])]) AC_SEARCH_LIBS(setsockopt, [nsl]) ################################## #### Checks for header files. #### ################################## AC_CHECK_HEADERS(unistd.h sys/auxv.h sys/random.h) ########################################## #### Checks for typedefs, structures, #### #### and compiler characteristics. #### ########################################## AC_C_CONST AC_C_INLINE AC_TYPE_SIZE_T AC_TYPE_PID_T AC_CHECK_TYPES([byte, ushort, u16, u32, u64]) # # Check for __builtin_bswap32 intrinsic. # AC_CACHE_CHECK(for __builtin_bswap32, [gcry_cv_have_builtin_bswap32], [gcry_cv_have_builtin_bswap32=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [int x = 0; int y = __builtin_bswap32(x); return y;])], [gcry_cv_have_builtin_bswap32=yes])]) if test "$gcry_cv_have_builtin_bswap32" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_BSWAP32,1, [Defined if compiler has '__builtin_bswap32' intrinsic]) fi # # Check for __builtin_bswap64 intrinsic. # AC_CACHE_CHECK(for __builtin_bswap64, [gcry_cv_have_builtin_bswap64], [gcry_cv_have_builtin_bswap64=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [long long x = 0; long long y = __builtin_bswap64(x); return y;])], [gcry_cv_have_builtin_bswap64=yes])]) if test "$gcry_cv_have_builtin_bswap64" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_BSWAP64,1, [Defined if compiler has '__builtin_bswap64' intrinsic]) fi # # Check for __builtin_ctz intrinsic. # AC_CACHE_CHECK(for __builtin_ctz, [gcry_cv_have_builtin_ctz], [gcry_cv_have_builtin_ctz=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [unsigned int x = 0; int y = __builtin_ctz(x); return y;])], [gcry_cv_have_builtin_ctz=yes])]) if test "$gcry_cv_have_builtin_ctz" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_CTZ, 1, [Defined if compiler has '__builtin_ctz' intrinsic]) fi # # Check for __builtin_ctzl intrinsic. # AC_CACHE_CHECK(for __builtin_ctzl, [gcry_cv_have_builtin_ctzl], [gcry_cv_have_builtin_ctzl=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [unsigned long x = 0; long y = __builtin_ctzl(x); return y;])], [gcry_cv_have_builtin_ctzl=yes])]) if test "$gcry_cv_have_builtin_ctzl" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_CTZL, 1, [Defined if compiler has '__builtin_ctzl' intrinsic]) fi # # Check for __builtin_clz intrinsic. # AC_CACHE_CHECK(for __builtin_clz, [gcry_cv_have_builtin_clz], [gcry_cv_have_builtin_clz=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [unsigned int x = 0; int y = __builtin_clz(x); return y;])], [gcry_cv_have_builtin_clz=yes])]) if test "$gcry_cv_have_builtin_clz" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_CLZ, 1, [Defined if compiler has '__builtin_clz' intrinsic]) fi # # Check for __builtin_clzl intrinsic. 
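The __builtin_bswap32/__builtin_bswap64 and __builtin_ctz/__builtin_clz probes above only define the corresponding HAVE_BUILTIN_* macros; code consuming them typically picks the intrinsic when available and an equivalent portable fallback otherwise. A hedged sketch of that pattern (function names are illustrative, not the library's actual helpers):

#include <stdint.h>

/* Byte swap: use the intrinsic when the probe defined HAVE_BUILTIN_BSWAP32,
   otherwise fall back to shifts and masks. */
static inline uint32_t
my_bswap32 (uint32_t x)
{
#ifdef HAVE_BUILTIN_BSWAP32
  return __builtin_bswap32 (x);
#else
  return (x << 24) | ((x & 0xff00U) << 8) | ((x >> 8) & 0xff00U) | (x >> 24);
#endif
}

/* Count trailing zero bits; x must be non-zero for the builtin. */
static inline unsigned int
my_ctz32 (uint32_t x)
{
#ifdef HAVE_BUILTIN_CTZ
  return (unsigned int)__builtin_ctz (x);
#else
  unsigned int n = 0;
  while ((x & 1) == 0)
    {
      x >>= 1;
      n++;
    }
  return n;
#endif
}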
# AC_CACHE_CHECK(for __builtin_clzl, [gcry_cv_have_builtin_clzl], [gcry_cv_have_builtin_clzl=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [unsigned long x = 0; long y = __builtin_clzl(x); return y;])], [gcry_cv_have_builtin_clzl=yes])]) if test "$gcry_cv_have_builtin_clzl" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_CLZL, 1, [Defined if compiler has '__builtin_clzl' intrinsic]) fi # # Check for __sync_synchronize intrinsic. # AC_CACHE_CHECK(for __sync_synchronize, [gcry_cv_have_sync_synchronize], [gcry_cv_have_sync_synchronize=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [__sync_synchronize(); return 0;])], [gcry_cv_have_sync_synchronize=yes])]) if test "$gcry_cv_have_sync_synchronize" = "yes" ; then AC_DEFINE(HAVE_SYNC_SYNCHRONIZE, 1, [Defined if compiler has '__sync_synchronize' intrinsic]) fi # # Check for VLA support (variable length arrays). # AC_CACHE_CHECK(whether the variable length arrays are supported, [gcry_cv_have_vla], [gcry_cv_have_vla=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void f1(char *, int); char foo(int i) { char b[(i < 0 ? 0 : i) + 1]; f1(b, sizeof b); return b[0];}]])], [gcry_cv_have_vla=yes])]) if test "$gcry_cv_have_vla" = "yes" ; then AC_DEFINE(HAVE_VLA,1, [Defined if variable length arrays are supported]) fi # # Check for ELF visibility support. # AC_CACHE_CHECK(whether the visibility attribute is supported, gcry_cv_visibility_attribute, [gcry_cv_visibility_attribute=no AC_LANG_CONFTEST([AC_LANG_SOURCE( [[int foo __attribute__ ((visibility ("hidden"))) = 1; int bar __attribute__ ((visibility ("protected"))) = 1; ]])]) if ${CC-cc} -Werror -S conftest.c -o conftest.s \ 1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD ; then if grep '\.hidden.*foo' conftest.s >/dev/null 2>&1 ; then if grep '\.protected.*bar' conftest.s >/dev/null 2>&1; then gcry_cv_visibility_attribute=yes fi fi fi ]) if test "$gcry_cv_visibility_attribute" = "yes"; then AC_CACHE_CHECK(for broken visibility attribute, gcry_cv_broken_visibility_attribute, [gcry_cv_broken_visibility_attribute=yes AC_LANG_CONFTEST([AC_LANG_SOURCE( [[int foo (int x); int bar (int x) __asm__ ("foo") __attribute__ ((visibility ("hidden"))); int bar (int x) { return x; } ]])]) if ${CC-cc} -Werror -S conftest.c -o conftest.s \ 1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD ; then if grep '\.hidden@<:@ _@:>@foo' conftest.s >/dev/null 2>&1; then gcry_cv_broken_visibility_attribute=no fi fi ]) fi if test "$gcry_cv_visibility_attribute" = "yes"; then AC_CACHE_CHECK(for broken alias attribute, gcry_cv_broken_alias_attribute, [gcry_cv_broken_alias_attribute=yes AC_LANG_CONFTEST([AC_LANG_SOURCE( [[extern int foo (int x) __asm ("xyzzy"); int bar (int x) { return x; } extern __typeof (bar) foo __attribute ((weak, alias ("bar"))); extern int dfoo; extern __typeof (dfoo) dfoo __asm ("abccb"); int dfoo = 1; ]])]) if ${CC-cc} -Werror -S conftest.c -o conftest.s \ 1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD ; then if grep 'xyzzy' conftest.s >/dev/null 2>&1 && \ grep 'abccb' conftest.s >/dev/null 2>&1; then gcry_cv_broken_alias_attribute=no fi fi ]) fi if test "$gcry_cv_visibility_attribute" = "yes"; then AC_CACHE_CHECK(if gcc supports -fvisibility=hidden, gcry_cv_gcc_has_f_visibility, [gcry_cv_gcc_has_f_visibility=no _gcc_cflags_save=$CFLAGS CFLAGS="-fvisibility=hidden" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])], gcry_cv_gcc_has_f_visibility=yes) CFLAGS=$_gcc_cflags_save; ]) fi if test "$gcry_cv_visibility_attribute" = "yes" \ && test "$gcry_cv_broken_visibility_attribute" != "yes" \ && test "$gcry_cv_broken_alias_attribute" != "yes" \ && test 
"$gcry_cv_gcc_has_f_visibility" = "yes" then AC_DEFINE(GCRY_USE_VISIBILITY, 1, [Define to use the GNU C visibility attribute.]) CFLAGS="$CFLAGS -fvisibility=hidden" fi # Following attribute tests depend on warnings to cause compile to fail, # so set -Werror temporarily. _gcc_cflags_save=$CFLAGS CFLAGS="$CFLAGS -Werror" # # Check whether the compiler supports the GCC style aligned attribute # AC_CACHE_CHECK([whether the GCC style aligned attribute is supported], [gcry_cv_gcc_attribute_aligned], [gcry_cv_gcc_attribute_aligned=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[struct { int a; } foo __attribute__ ((aligned (16)));]])], [gcry_cv_gcc_attribute_aligned=yes])]) if test "$gcry_cv_gcc_attribute_aligned" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_ALIGNED,1, [Defined if a GCC style "__attribute__ ((aligned (n))" is supported]) fi # # Check whether the compiler supports the GCC style packed attribute # AC_CACHE_CHECK([whether the GCC style packed attribute is supported], [gcry_cv_gcc_attribute_packed], [gcry_cv_gcc_attribute_packed=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[struct foolong_s { long b; } __attribute__ ((packed)); struct foo_s { char a; struct foolong_s b; } __attribute__ ((packed)); enum bar { FOO = 1 / (sizeof(struct foo_s) == (sizeof(char) + sizeof(long))), };]])], [gcry_cv_gcc_attribute_packed=yes])]) if test "$gcry_cv_gcc_attribute_packed" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_PACKED,1, [Defined if a GCC style "__attribute__ ((packed))" is supported]) fi # # Check whether the compiler supports the GCC style may_alias attribute # AC_CACHE_CHECK([whether the GCC style may_alias attribute is supported], [gcry_cv_gcc_attribute_may_alias], [gcry_cv_gcc_attribute_may_alias=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[typedef struct foo_s { int a; } __attribute__ ((may_alias)) foo_t;]])], [gcry_cv_gcc_attribute_may_alias=yes])]) if test "$gcry_cv_gcc_attribute_may_alias" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_MAY_ALIAS,1, [Defined if a GCC style "__attribute__ ((may_alias))" is supported]) fi # Restore flags. CFLAGS=$_gcc_cflags_save; # # Check whether the compiler supports 'asm' or '__asm__' keyword for # assembler blocks. # AC_CACHE_CHECK([whether 'asm' assembler keyword is supported], [gcry_cv_have_asm], [gcry_cv_have_asm=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { asm("":::"memory"); }]])], [gcry_cv_have_asm=yes])]) AC_CACHE_CHECK([whether '__asm__' assembler keyword is supported], [gcry_cv_have___asm__], [gcry_cv_have___asm__=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { __asm__("":::"memory"); }]])], [gcry_cv_have___asm__=yes])]) if test "$gcry_cv_have_asm" = "no" ; then if test "$gcry_cv_have___asm__" = "yes" ; then AC_DEFINE(asm,__asm__, [Define to supported assembler block keyword, if plain 'asm' was not supported]) fi fi # # Check whether the compiler supports inline assembly memory barrier. 
# if test "$gcry_cv_have_asm" = "no" ; then if test "$gcry_cv_have___asm__" = "yes" ; then AC_CACHE_CHECK([whether inline assembly memory barrier is supported], [gcry_cv_have_asm_volatile_memory], [gcry_cv_have_asm_volatile_memory=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(int x) { __asm__ volatile("":::"memory"); __asm__ volatile("":"+r"(x)::"memory"); }]])], [gcry_cv_have_asm_volatile_memory=yes])]) fi else AC_CACHE_CHECK([whether inline assembly memory barrier is supported], [gcry_cv_have_asm_volatile_memory], [gcry_cv_have_asm_volatile_memory=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(int x) { asm volatile("":::"memory"); asm volatile("":"+r"(x)::"memory"); }]])], [gcry_cv_have_asm_volatile_memory=yes])]) fi if test "$gcry_cv_have_asm_volatile_memory" = "yes" ; then AC_DEFINE(HAVE_GCC_ASM_VOLATILE_MEMORY,1, [Define if inline asm memory barrier is supported]) fi # # Check whether GCC assembler supports features needed for our ARM # implementations. This needs to be done before setting up the # assembler stuff. # AC_CACHE_CHECK([whether GCC assembler is compatible for ARM assembly implementations], [gcry_cv_gcc_arm_platform_as_ok], [if test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_arm_platform_as_ok="n/a" else gcry_cv_gcc_arm_platform_as_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( /* Test if assembler supports UAL syntax. */ ".syntax unified\n\t" ".arm\n\t" /* our assembly code is in ARM mode */ ".text\n\t" /* Following causes error if assembler ignored '.syntax unified'. */ "asmfunc:\n\t" "add %r0, %r0, %r4, ror #12;\n\t" /* Test if '.type' and '.size' are supported. */ ".size asmfunc,.-asmfunc;\n\t" ".type asmfunc,%function;\n\t" );]], [ asmfunc(); ] )], [gcry_cv_gcc_arm_platform_as_ok=yes]) fi]) if test "$gcry_cv_gcc_arm_platform_as_ok" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS,1, [Defined if underlying assembler is compatible with ARM assembly implementations]) fi # # Check whether GCC assembler supports features needed for our ARMv8/Aarch64 # implementations. This needs to be done before setting up the # assembler stuff. # AC_CACHE_CHECK([whether GCC assembler is compatible for ARMv8/Aarch64 assembly implementations], [gcry_cv_gcc_aarch64_platform_as_ok], [if test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_aarch64_platform_as_ok="n/a" else gcry_cv_gcc_aarch64_platform_as_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".text\n\t" "asmfunc:\n\t" "eor x0, x0, x30, ror #12;\n\t" "add x0, x0, x30, asr #12;\n\t" "eor v0.16b, v0.16b, v31.16b;\n\t" );]], [ asmfunc(); ] )], [gcry_cv_gcc_aarch64_platform_as_ok=yes]) fi]) if test "$gcry_cv_gcc_aarch64_platform_as_ok" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS,1, [Defined if underlying assembler is compatible with ARMv8/Aarch64 assembly implementations]) fi # # Check whether GCC assembler supports for CFI directives. 
# AC_CACHE_CHECK([whether GCC assembler supports for CFI directives], [gcry_cv_gcc_asm_cfi_directives], [gcry_cv_gcc_asm_cfi_directives=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".text\n\t" "ac_test:\n\t" ".cfi_startproc\n\t" ".cfi_remember_state\n\t" ".cfi_adjust_cfa_offset 8\n\t" ".cfi_rel_offset 0, 8\n\t" ".cfi_def_cfa_register 1\n\t" ".cfi_register 2, 3\n\t" ".cfi_restore 2\n\t" ".cfi_escape 0x0f, 0x02, 0x11, 0x00\n\t" ".cfi_restore_state\n\t" ".long 0\n\t" ".cfi_endproc\n\t" );]])], [gcry_cv_gcc_asm_cfi_directives=yes])]) if test "$gcry_cv_gcc_asm_cfi_directives" = "yes" ; then AC_DEFINE(HAVE_GCC_ASM_CFI_DIRECTIVES,1, [Defined if underlying assembler supports for CFI directives]) fi # # Check whether GCC assembler supports for ELF directives. # AC_CACHE_CHECK([whether GCC assembler supports for ELF directives], [gcry_cv_gcc_asm_elf_directives], [gcry_cv_gcc_asm_elf_directives=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( /* Test if ELF directives '.type' and '.size' are supported. */ ".text\n\t" "asmfunc:\n\t" ".size asmfunc,.-asmfunc;\n\t" ".type asmfunc,STT_FUNC;\n\t" );]])], [gcry_cv_gcc_asm_elf_directives=yes])]) if test "$gcry_cv_gcc_asm_elf_directives" = "yes" ; then AC_DEFINE(HAVE_GCC_ASM_ELF_DIRECTIVES,1, [Defined if underlying assembler supports for ELF directives]) fi # # Check whether underscores in symbols are required. This needs to be # done before setting up the assembler stuff. # GNUPG_SYS_SYMBOL_UNDERSCORE() ################################# #### #### #### Setup assembler stuff. #### #### Define mpi_cpu_arch. #### #### #### ################################# AC_ARG_ENABLE(mpi-path, AS_HELP_STRING([--enable-mpi-path=EXTRA_PATH], [prepend EXTRA_PATH to list of CPU specific optimizations]), mpi_extra_path="$enableval",mpi_extra_path="") AC_MSG_CHECKING(architecture and mpi assembler functions) if test -f $srcdir/mpi/config.links ; then . $srcdir/mpi/config.links AC_CONFIG_LINKS("$mpi_ln_list") ac_cv_mpi_sflags="$mpi_sflags" AC_MSG_RESULT($mpi_cpu_arch) else AC_MSG_RESULT(failed) AC_MSG_ERROR([mpi/config.links missing!]) fi MPI_SFLAGS="$ac_cv_mpi_sflags" AC_SUBST(MPI_SFLAGS) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_ADD1, test "$mpi_mod_asm_mpih_add1" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_SUB1, test "$mpi_mod_asm_mpih_sub1" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_MUL1, test "$mpi_mod_asm_mpih_mul1" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_MUL2, test "$mpi_mod_asm_mpih_mul2" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_MUL3, test "$mpi_mod_asm_mpih_mul3" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_LSHIFT, test "$mpi_mod_asm_mpih_lshift" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_RSHIFT, test "$mpi_mod_asm_mpih_rshift" = yes) AM_CONDITIONAL(MPI_MOD_ASM_UDIV, test "$mpi_mod_asm_udiv" = yes) AM_CONDITIONAL(MPI_MOD_ASM_UDIV_QRNND, test "$mpi_mod_asm_udiv_qrnnd" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_ADD1, test "$mpi_mod_c_mpih_add1" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_SUB1, test "$mpi_mod_c_mpih_sub1" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_MUL1, test "$mpi_mod_c_mpih_mul1" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_MUL2, test "$mpi_mod_c_mpih_mul2" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_MUL3, test "$mpi_mod_c_mpih_mul3" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_LSHIFT, test "$mpi_mod_c_mpih_lshift" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_RSHIFT, test "$mpi_mod_c_mpih_rshift" = yes) AM_CONDITIONAL(MPI_MOD_C_UDIV, test "$mpi_mod_c_udiv" = yes) AM_CONDITIONAL(MPI_MOD_C_UDIV_QRNND, test "$mpi_mod_c_udiv_qrnnd" = yes) # Reset non applicable feature flags. 
if test "$mpi_cpu_arch" != "x86" ; then aesnisupport="n/a" shaextsupport="n/a" pclmulsupport="n/a" sse41support="n/a" avxsupport="n/a" avx2support="n/a" avx512support="n/a" gfnisupport="n/a" padlocksupport="n/a" drngsupport="n/a" fi if test "$mpi_cpu_arch" != "arm" ; then if test "$mpi_cpu_arch" != "aarch64" ; then neonsupport="n/a" armcryptosupport="n/a" fi fi if test "$mpi_cpu_arch" != "ppc"; then ppccryptosupport="n/a" fi ############################################# #### #### #### Platform specific compiler checks. #### #### #### ############################################# # Following tests depend on warnings to cause compile to fail, so set -Werror # temporarily. _gcc_cflags_save=$CFLAGS CFLAGS="$CFLAGS -Werror" # # Check whether compiler supports 'ms_abi' function attribute. # AC_CACHE_CHECK([whether compiler supports 'ms_abi' function attribute], [gcry_cv_gcc_attribute_ms_abi], [gcry_cv_gcc_attribute_ms_abi=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[int __attribute__ ((ms_abi)) proto(int);]])], [gcry_cv_gcc_attribute_ms_abi=yes])]) if test "$gcry_cv_gcc_attribute_ms_abi" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_MS_ABI,1, [Defined if compiler supports "__attribute__ ((ms_abi))" function attribute]) fi # # Check whether compiler supports 'sysv_abi' function attribute. # AC_CACHE_CHECK([whether compiler supports 'sysv_abi' function attribute], [gcry_cv_gcc_attribute_sysv_abi], [gcry_cv_gcc_attribute_sysv_abi=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[int __attribute__ ((sysv_abi)) proto(int);]])], [gcry_cv_gcc_attribute_sysv_abi=yes])]) if test "$gcry_cv_gcc_attribute_sysv_abi" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_SYSV_ABI,1, [Defined if compiler supports "__attribute__ ((sysv_abi))" function attribute]) fi # # Check whether default calling convention is 'ms_abi'. # if test "$gcry_cv_gcc_attribute_ms_abi" = "yes" ; then AC_CACHE_CHECK([whether default calling convention is 'ms_abi'], [gcry_cv_gcc_default_abi_is_ms_abi], [gcry_cv_gcc_default_abi_is_ms_abi=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void *test(void) { void *(*def_func)(void) = test; void *__attribute__((ms_abi))(*msabi_func)(void); /* warning on SysV abi targets, passes on Windows based targets */ msabi_func = def_func; return msabi_func; }]])], [gcry_cv_gcc_default_abi_is_ms_abi=yes])]) if test "$gcry_cv_gcc_default_abi_is_ms_abi" = "yes" ; then AC_DEFINE(HAVE_GCC_DEFAULT_ABI_IS_MS_ABI,1, [Defined if default calling convention is 'ms_abi']) fi fi # # Check whether default calling convention is 'sysv_abi'. # if test "$gcry_cv_gcc_attribute_sysv_abi" = "yes" ; then AC_CACHE_CHECK([whether default calling convention is 'sysv_abi'], [gcry_cv_gcc_default_abi_is_sysv_abi], [gcry_cv_gcc_default_abi_is_sysv_abi=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void *test(void) { void *(*def_func)(void) = test; void *__attribute__((sysv_abi))(*sysvabi_func)(void); /* warning on MS ABI targets, passes on SysV ABI targets */ sysvabi_func = def_func; return sysvabi_func; }]])], [gcry_cv_gcc_default_abi_is_sysv_abi=yes])]) if test "$gcry_cv_gcc_default_abi_is_sysv_abi" = "yes" ; then AC_DEFINE(HAVE_GCC_DEFAULT_ABI_IS_SYSV_ABI,1, [Defined if default calling convention is 'sysv_abi']) fi fi # Restore flags. CFLAGS=$_gcc_cflags_save; # # Check whether GCC inline assembler supports SSSE3 instructions # This is required for the AES-NI instructions. 
# AC_CACHE_CHECK([whether GCC inline assembler supports SSSE3 instructions], [gcry_cv_gcc_inline_asm_ssse3], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_ssse3="n/a" else gcry_cv_gcc_inline_asm_ssse3=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[static unsigned char be_mask[16] __attribute__ ((aligned (16))) = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; void a(void) { __asm__("pshufb %[mask], %%xmm2\n\t"::[mask]"m"(*be_mask):); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_ssse3=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_ssse3" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_SSSE3,1, [Defined if inline assembler supports SSSE3 instructions]) fi # # Check whether GCC inline assembler supports PCLMUL instructions. # AC_CACHE_CHECK([whether GCC inline assembler supports PCLMUL instructions], [gcry_cv_gcc_inline_asm_pclmul], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_pclmul="n/a" else gcry_cv_gcc_inline_asm_pclmul=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { __asm__("pclmulqdq \$0, %%xmm1, %%xmm3\n\t":::"cc"); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_pclmul=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_pclmul" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_PCLMUL,1, [Defined if inline assembler supports PCLMUL instructions]) fi # # Check whether GCC inline assembler supports SHA Extensions instructions. # AC_CACHE_CHECK([whether GCC inline assembler supports SHA Extensions instructions], [gcry_cv_gcc_inline_asm_shaext], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_shaext="n/a" else gcry_cv_gcc_inline_asm_shaext=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { __asm__("sha1rnds4 \$0, %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha1nexte %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha1msg1 %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha1msg2 %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha256rnds2 %%xmm0, %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha256msg1 %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha256msg2 %%xmm1, %%xmm3\n\t":::"cc"); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_shaext=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_shaext" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_SHAEXT,1, [Defined if inline assembler supports SHA Extensions instructions]) fi # # Check whether GCC inline assembler supports SSE4.1 instructions. 
# AC_CACHE_CHECK([whether GCC inline assembler supports SSE4.1 instructions], [gcry_cv_gcc_inline_asm_sse41], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_sse41="n/a" else gcry_cv_gcc_inline_asm_sse41=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { int i; __asm__("pextrd \$2, %%xmm0, %[out]\n\t" : [out] "=m" (i)); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_sse41=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_sse41" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_SSE41,1, [Defined if inline assembler supports SSE4.1 instructions]) fi # # Check whether GCC inline assembler supports AVX instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AVX instructions], [gcry_cv_gcc_inline_asm_avx], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_avx="n/a" else gcry_cv_gcc_inline_asm_avx=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { __asm__("xgetbv; vaesdeclast (%[mem]),%%xmm0,%%xmm7\n\t"::[mem]"r"(0):); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_avx=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_avx" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AVX,1, [Defined if inline assembler supports AVX instructions]) fi # # Check whether GCC inline assembler supports AVX2 instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AVX2 instructions], [gcry_cv_gcc_inline_asm_avx2], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_avx2="n/a" else gcry_cv_gcc_inline_asm_avx2=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { __asm__("xgetbv; vpbroadcastb %%xmm7,%%ymm1\n\t":::"cc"); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_avx2=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_avx2" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AVX2,1, [Defined if inline assembler supports AVX2 instructions]) fi # # Check whether GCC inline assembler supports AVX512 instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AVX512 instructions], [gcry_cv_gcc_inline_asm_avx512], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_avx512="n/a" else gcry_cv_gcc_inline_asm_avx512=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { __asm__("xgetbv; vpopcntq %%zmm7, %%zmm1%{%%k1%}%{z%};\n\t":::"cc"); __asm__("vpexpandb %%zmm3, %%zmm1;\n\t":::"cc"); __asm__("vpxorq %%xmm7, %%xmm7, %%xmm7;\n\t":::"cc"); __asm__("vpxorq %%ymm7, %%ymm7, %%ymm7;\n\t":::"cc"); __asm__("vpxorq (%%eax)%{1to8%}, %%zmm7, %%zmm7;\n\t":::"cc"); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_avx512=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_avx512" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AVX512,1, [Defined if inline assembler supports AVX512 instructions]) fi # # Check whether GCC inline assembler supports VAES and VPCLMUL instructions # AC_CACHE_CHECK([whether GCC inline assembler supports VAES and VPCLMUL instructions], [gcry_cv_gcc_inline_asm_vaes_vpclmul], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_vaes_vpclmul="n/a" else gcry_cv_gcc_inline_asm_vaes_vpclmul=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { __asm__("vaesenclast %%ymm7,%%ymm7,%%ymm1\n\t":::"cc");/*256-bit*/ __asm__("vaesenclast %%zmm7,%%zmm7,%%zmm1\n\t":::"cc");/*512-bit*/ __asm__("vpclmulqdq \$0,%%ymm7,%%ymm7,%%ymm1\n\t":::"cc");/*256-bit*/ __asm__("vpclmulqdq \$0,%%zmm7,%%zmm7,%%zmm1\n\t":::"cc");/*512-bit*/ }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_vaes_vpclmul=yes]) fi]) if test 
"$gcry_cv_gcc_inline_asm_vaes_vpclmul" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_VAES_VPCLMUL,1, [Defined if inline assembler supports VAES and VPCLMUL instructions]) fi # # Check whether GCC inline assembler supports GFNI instructions # AC_CACHE_CHECK([whether GCC inline assembler supports GFNI instructions], [gcry_cv_gcc_inline_asm_gfni], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_gfni="n/a" else gcry_cv_gcc_inline_asm_gfni=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { __asm__("gf2p8affineqb \$123, %%xmm0, %%xmm0;\n\t":::"cc"); /* SSE */ __asm__("vgf2p8affineinvqb \$234, %%ymm1, %%ymm1, %%ymm1;\n\t":::"cc"); /* AVX */ __asm__("vgf2p8mulb (%%eax), %%zmm2, %%zmm2;\n\t":::"cc"); /* AVX512 */ }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_gfni=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_gfni" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_GFNI,1, [Defined if inline assembler supports GFNI instructions]) fi # # Check whether GCC inline assembler supports BMI2 instructions # AC_CACHE_CHECK([whether GCC inline assembler supports BMI2 instructions], [gcry_cv_gcc_inline_asm_bmi2], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_bmi2="n/a" else gcry_cv_gcc_inline_asm_bmi2=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[unsigned int a(unsigned int x, unsigned int y) { unsigned int tmp1, tmp2; asm ("rorxl %2, %1, %0" : "=r" (tmp1) : "rm0" (x), "J" (32 - ((23) & 31))); asm ("andnl %2, %1, %0" : "=r" (tmp2) : "r0" (x), "rm" (y)); return tmp1 + tmp2; }]], [ a(1, 2); ] )], [gcry_cv_gcc_inline_asm_bmi2=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_bmi2" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_BMI2,1, [Defined if inline assembler supports BMI2 instructions]) fi # # Check whether GCC assembler needs "-Wa,--divide" to correctly handle # constant division # if test $amd64_as_feature_detection = yes; then AC_CACHE_CHECK([whether GCC assembler handles division correctly], [gcry_cv_gcc_as_const_division_ok], [gcry_cv_gcc_as_const_division_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__(".text\n\tfn:\n\t xorl \$(123456789/12345678), %ebp;\n\t");]], [fn();])], [gcry_cv_gcc_as_const_division_ok=yes])]) if test "$gcry_cv_gcc_as_const_division_ok" = "no" ; then # # Add '-Wa,--divide' to CPPFLAGS and try check again. # _gcc_cppflags_save="$CPPFLAGS" CPPFLAGS="$CPPFLAGS -Wa,--divide" AC_CACHE_CHECK([whether GCC assembler handles division correctly with "-Wa,--divide"], [gcry_cv_gcc_as_const_division_with_wadivide_ok], [gcry_cv_gcc_as_const_division_with_wadivide_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__(".text\n\tfn:\n\t xorl \$(123456789/12345678), %ebp;\n\t");]], [fn();])], [gcry_cv_gcc_as_const_division_with_wadivide_ok=yes])]) if test "$gcry_cv_gcc_as_const_division_with_wadivide_ok" = "no" ; then # '-Wa,--divide' did not work, restore old flags. CPPFLAGS="$_gcc_cppflags_save" fi fi fi # # Check whether GCC assembler supports features needed for our amd64 # implementations # if test $amd64_as_feature_detection = yes; then AC_CACHE_CHECK([whether GCC assembler is compatible for amd64 assembly implementations], [gcry_cv_gcc_amd64_platform_as_ok], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_amd64_platform_as_ok="n/a" else gcry_cv_gcc_amd64_platform_as_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( /* Test if '.type' and '.size' are supported. */ /* These work only on ELF targets. 
*/ ".text\n\t" "asmfunc:\n\t" ".size asmfunc,.-asmfunc;\n\t" ".type asmfunc,@function;\n\t" /* Test if assembler allows use of '/' for constant division * (Solaris/x86 issue). If previous constant division check * and "-Wa,--divide" workaround failed, this causes assembly * to be disable on this machine. */ "xorl \$(123456789/12345678), %ebp;\n\t" );]], [ asmfunc(); ])], [gcry_cv_gcc_amd64_platform_as_ok=yes]) fi]) if test "$gcry_cv_gcc_amd64_platform_as_ok" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS,1, [Defined if underlying assembler is compatible with amd64 assembly implementations]) fi if test "$gcry_cv_gcc_amd64_platform_as_ok" = "no" && test "$gcry_cv_gcc_attribute_sysv_abi" = "yes" && test "$gcry_cv_gcc_default_abi_is_ms_abi" = "yes"; then AC_CACHE_CHECK([whether GCC assembler is compatible for WIN64 assembly implementations], [gcry_cv_gcc_win64_platform_as_ok], [gcry_cv_gcc_win64_platform_as_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".text\n\t" ".globl asmfunc\n\t" "asmfunc:\n\t" "xorq \$(1234), %rbp;\n\t" );]], [ asmfunc(); ])], [gcry_cv_gcc_win64_platform_as_ok=yes])]) if test "$gcry_cv_gcc_win64_platform_as_ok" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS,1, [Defined if underlying assembler is compatible with WIN64 assembly implementations]) fi fi fi # # Check whether GCC assembler supports features needed for assembly # implementations that use Intel syntax # AC_CACHE_CHECK([whether GCC assembler is compatible for Intel syntax assembly implementations], [gcry_cv_gcc_platform_as_ok_for_intel_syntax], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_platform_as_ok_for_intel_syntax="n/a" else gcry_cv_gcc_platform_as_ok_for_intel_syntax=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".intel_syntax noprefix\n\t" ".text\n\t" "actest:\n\t" "pxor xmm1, xmm7;\n\t" "vperm2i128 ymm2, ymm3, ymm0, 1;\n\t" "add eax, ebp;\n\t" "rorx eax, ebp, 1;\n\t" "sub eax, [esp + 4];\n\t" "add dword ptr [esp + eax], 0b10101;\n\t" ".att_syntax prefix\n\t" );]], [ actest(); ])], [gcry_cv_gcc_platform_as_ok_for_intel_syntax=yes]) fi]) if test "$gcry_cv_gcc_platform_as_ok_for_intel_syntax" = "yes" ; then AC_DEFINE(HAVE_INTEL_SYNTAX_PLATFORM_AS,1, [Defined if underlying assembler is compatible with Intel syntax assembly implementations]) fi # # Check whether compiler is configured for ARMv6 or newer architecture # AC_CACHE_CHECK([whether compiler is configured for ARMv6 or newer architecture], [gcry_cv_cc_arm_arch_is_v6], [if test "$mpi_cpu_arch" != "arm" || test "$try_asm_modules" != "yes" ; then gcry_cv_cc_arm_arch_is_v6="n/a" else gcry_cv_cc_arm_arch_is_v6=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[ #if defined(__arm__) && \ ((defined(__ARM_ARCH) && __ARM_ARCH >= 6) \ || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \ || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) \ || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \ || defined(__ARM_ARCH_7EM__)) /* empty */ #else /* fail compile if not ARMv6. 
*/ not_armv6 not_armv6 = (not_armv6)not_armv6; #endif ]])], [gcry_cv_cc_arm_arch_is_v6=yes]) fi]) if test "$gcry_cv_cc_arm_arch_is_v6" = "yes" ; then AC_DEFINE(HAVE_ARM_ARCH_V6,1, [Defined if ARM architecture is v6 or newer]) fi # # Check whether GCC inline assembler supports NEON instructions # AC_CACHE_CHECK([whether GCC inline assembler supports NEON instructions], [gcry_cv_gcc_inline_asm_neon], [if test "$mpi_cpu_arch" != "arm" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_neon="n/a" else gcry_cv_gcc_inline_asm_neon=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".syntax unified\n\t" ".arm\n\t" ".fpu neon\n\t" ".text\n\t" "testfn:\n\t" "vld1.64 {%q0-%q1}, [%r0]!;\n\t" "vrev64.8 %q0, %q3;\n\t" "vadd.u64 %q0, %q1;\n\t" "vadd.s64 %d3, %d2, %d3;\n\t" ); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_neon=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_neon" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_NEON,1, [Defined if inline assembler supports NEON instructions]) fi # # Check whether GCC inline assembler supports AArch32 Crypto Extension instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AArch32 Crypto Extension instructions], [gcry_cv_gcc_inline_asm_aarch32_crypto], [if test "$mpi_cpu_arch" != "arm" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_aarch32_crypto="n/a" else gcry_cv_gcc_inline_asm_aarch32_crypto=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".syntax unified\n\t" ".arch armv8-a\n\t" ".arm\n\t" ".fpu crypto-neon-fp-armv8\n\t" ".text\n\t" "testfn:\n\t" "sha1h.32 q0, q0;\n\t" "sha1c.32 q0, q0, q0;\n\t" "sha1p.32 q0, q0, q0;\n\t" "sha1su0.32 q0, q0, q0;\n\t" "sha1su1.32 q0, q0;\n\t" "sha256h.32 q0, q0, q0;\n\t" "sha256h2.32 q0, q0, q0;\n\t" "sha1p.32 q0, q0, q0;\n\t" "sha256su0.32 q0, q0;\n\t" "sha256su1.32 q0, q0, q15;\n\t" "aese.8 q0, q0;\n\t" "aesd.8 q0, q0;\n\t" "aesmc.8 q0, q0;\n\t" "aesimc.8 q0, q0;\n\t" "vmull.p64 q0, d0, d0;\n\t" ); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_aarch32_crypto=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_aarch32_crypto" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO,1, [Defined if inline assembler supports AArch32 Crypto Extension instructions]) fi # # Check whether GCC inline assembler supports AArch64 NEON instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 NEON instructions], [gcry_cv_gcc_inline_asm_aarch64_neon], [if test "$mpi_cpu_arch" != "aarch64" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_aarch64_neon="n/a" else gcry_cv_gcc_inline_asm_aarch64_neon=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".cpu generic+simd\n\t" ".text\n\t" "testfn:\n\t" "mov w0, \#42;\n\t" "dup v0.8b, w0;\n\t" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b},[x0],\#32;\n\t" ); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_aarch64_neon=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_aarch64_neon" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH64_NEON,1, [Defined if inline assembler supports AArch64 NEON instructions]) fi # # Check whether GCC inline assembler supports AArch64 Crypto Extension instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 Crypto Extension instructions], [gcry_cv_gcc_inline_asm_aarch64_crypto], [if test "$mpi_cpu_arch" != "aarch64" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_aarch64_crypto="n/a" else gcry_cv_gcc_inline_asm_aarch64_crypto=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".cpu generic+simd+crypto\n\t" ".text\n\t" "testfn:\n\t" "mov w0, \#42;\n\t" "dup v0.8b, w0;\n\t" "ld4 
{v0.8b,v1.8b,v2.8b,v3.8b},[x0],\#32;\n\t" "sha1h s0, s0;\n\t" "sha1c q0, s0, v0.4s;\n\t" "sha1p q0, s0, v0.4s;\n\t" "sha1su0 v0.4s, v0.4s, v0.4s;\n\t" "sha1su1 v0.4s, v0.4s;\n\t" "sha256h q0, q0, v0.4s;\n\t" "sha256h2 q0, q0, v0.4s;\n\t" "sha1p q0, s0, v0.4s;\n\t" "sha256su0 v0.4s, v0.4s;\n\t" "sha256su1 v0.4s, v0.4s, v31.4s;\n\t" "aese v0.16b, v0.16b;\n\t" "aesd v0.16b, v0.16b;\n\t" "aesmc v0.16b, v0.16b;\n\t" "aesimc v0.16b, v0.16b;\n\t" "pmull v0.1q, v0.1d, v31.1d;\n\t" "pmull2 v0.1q, v0.2d, v31.2d;\n\t" ); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_aarch64_crypto=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_aarch64_crypto" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO,1, [Defined if inline assembler supports AArch64 Crypto Extension instructions]) fi # # Check whether PowerPC AltiVec/VSX intrinsics # AC_CACHE_CHECK([whether compiler supports PowerPC AltiVec/VSX/crypto intrinsics], [gcry_cv_cc_ppc_altivec], [if test "$mpi_cpu_arch" != "ppc" || test "$try_asm_modules" != "yes" ; then gcry_cv_cc_ppc_altivec="n/a" else gcry_cv_cc_ppc_altivec=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[#include typedef vector unsigned char block; typedef vector unsigned int vecu32; static inline __attribute__((always_inline)) vecu32 vec_sld_u32(vecu32 a, vecu32 b, unsigned int idx) { return vec_sld (a, b, (4 * idx) & 15); } block fn(block in) { block t = vec_perm (in, in, vec_vsx_ld (0, (unsigned char*)0)); vecu32 y = vec_vsx_ld (0, (unsigned int*)0); y = vec_sld_u32 (y, y, 3); return vec_cipher_be (t, in) ^ (block)y; } ]])], [gcry_cv_cc_ppc_altivec=yes]) fi]) if test "$gcry_cv_cc_ppc_altivec" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_CC_PPC_ALTIVEC,1, [Defined if underlying compiler supports PowerPC AltiVec/VSX/crypto intrinsics]) fi _gcc_cflags_save=$CFLAGS CFLAGS="$CFLAGS -O2 -maltivec -mvsx -mcrypto" if test "$gcry_cv_cc_ppc_altivec" = "no" && test "$mpi_cpu_arch" = "ppc" && test "$try_asm_modules" == "yes" ; then AC_CACHE_CHECK([whether compiler supports PowerPC AltiVec/VSX/crypto intrinsics with extra GCC flags], [gcry_cv_cc_ppc_altivec_cflags], [gcry_cv_cc_ppc_altivec_cflags=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[#include typedef vector unsigned char block; typedef vector unsigned int vecu32; static inline __attribute__((always_inline)) vecu32 vec_sld_u32(vecu32 a, vecu32 b, unsigned int idx) { return vec_sld (a, b, (4 * idx) & 15); } block fn(block in) { block t = vec_perm (in, in, vec_vsx_ld (0, (unsigned char*)0)); vecu32 y = vec_vsx_ld (0, (unsigned int*)0); y = vec_sld_u32 (y, y, 3); return vec_cipher_be (t, in) ^ (block)y; }]])], [gcry_cv_cc_ppc_altivec_cflags=yes])]) if test "$gcry_cv_cc_ppc_altivec_cflags" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_CC_PPC_ALTIVEC,1, [Defined if underlying compiler supports PowerPC AltiVec/VSX/crypto intrinsics]) AC_DEFINE(HAVE_COMPATIBLE_CC_PPC_ALTIVEC_WITH_CFLAGS,1, [Defined if underlying compiler supports PowerPC AltiVec/VSX/crypto intrinsics with extra GCC flags]) fi fi AM_CONDITIONAL(ENABLE_PPC_VCRYPTO_EXTRA_CFLAGS, test "$gcry_cv_cc_ppc_altivec_cflags" = "yes") # Restore flags. 
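The HAVE_COMPATIBLE_CC_PPC_ALTIVEC result above is typically consumed from config.h as a compile-time gate, while the *_WITH_CFLAGS variant and the ENABLE_PPC_VCRYPTO_EXTRA_CFLAGS conditional only route the extra "-O2 -maltivec -mvsx -mcrypto" flags through the build system (CFLAGS itself is restored just below). A minimal C sketch of that gate, assuming a configured tree; the helper name is hypothetical:

  #include <config.h>

  #ifdef HAVE_COMPATIBLE_CC_PPC_ALTIVEC
  # include <altivec.h>   /* AltiVec/VSX intrinsics */

  /* Hypothetical helper: reverse the bytes of a 16-byte block with
     vec_perm, one of the intrinsics exercised by the test program. */
  static vector unsigned char
  example_bswap128 (vector unsigned char in)
  {
    static const vector unsigned char rev =
      { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
    return vec_perm (in, in, rev);
  }
  #endif /* HAVE_COMPATIBLE_CC_PPC_ALTIVEC */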
CFLAGS=$_gcc_cflags_save; # # Check whether GCC inline assembler supports PowerPC AltiVec/VSX/crypto instructions # AC_CACHE_CHECK([whether GCC inline assembler supports PowerPC AltiVec/VSX/crypto instructions], [gcry_cv_gcc_inline_asm_ppc_altivec], [if test "$mpi_cpu_arch" != "ppc" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_ppc_altivec="n/a" else gcry_cv_gcc_inline_asm_ppc_altivec=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__(".globl testfn;\n" ".text\n\t" "testfn:\n" "stvx %v31,%r12,%r0;\n" "lvx %v20,%r12,%r0;\n" "vcipher %v0, %v1, %v22;\n" "lxvw4x %vs32, %r0, %r1;\n" "vadduwm %v0, %v1, %v22;\n" "vshasigmaw %v0, %v1, 0, 15;\n" "vshasigmad %v0, %v1, 0, 15;\n" "vpmsumd %v11, %v11, %v11;\n" ); ]], [ testfn(); ] )], [gcry_cv_gcc_inline_asm_ppc_altivec=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_ppc_altivec" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC,1, [Defined if inline assembler supports PowerPC AltiVec/VSX/crypto instructions]) fi # # Check whether GCC inline assembler supports PowerISA 3.00 instructions # AC_CACHE_CHECK([whether GCC inline assembler supports PowerISA 3.00 instructions], [gcry_cv_gcc_inline_asm_ppc_arch_3_00], [if test "$mpi_cpu_arch" != "ppc" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_ppc_arch_3_00="n/a" else gcry_cv_gcc_inline_asm_ppc_arch_3_00=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__(".text\n\t" ".globl testfn;\n" "testfn:\n" "stxvb16x %r1,%v12,%v30;\n" ); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_ppc_arch_3_00=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_ppc_arch_3_00" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_PPC_ARCH_3_00,1, [Defined if inline assembler supports PowerISA 3.00 instructions]) fi # # Check whether GCC inline assembler supports zSeries instructions # AC_CACHE_CHECK([whether GCC inline assembler supports zSeries instructions], [gcry_cv_gcc_inline_asm_s390x], [if test "$mpi_cpu_arch" != "s390x" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_s390x="n/a" else gcry_cv_gcc_inline_asm_s390x=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[typedef unsigned int u128_t __attribute__ ((mode (TI))); unsigned int testfunc(unsigned int x, void *y, unsigned int z) { unsigned long fac[8]; register unsigned long reg0 asm("0") = 0; register unsigned long reg1 asm("1") = x; u128_t r1 = ((u128_t)(unsigned long)y << 64) | (unsigned long)z; u128_t r2 = 0; u128_t r3 = 0; asm volatile (".insn rre,0xb92e << 16, %[r1], %[r2]\n\t" : [r1] "+a" (r1), [r2] "+a" (r2) : "r" (reg0), "r" (reg1) : "cc", "memory"); asm volatile (".insn rrf,0xb929 << 16, %[r1], %[r2], %[r3], 0\n\t" : [r1] "+a" (r1), [r2] "+a" (r2), [r3] "+a" (r3) : "r" (reg0), "r" (reg1) : "cc", "memory"); reg0 = 8 - 1; asm ("stfle %1\n\t" : "+d" (reg0), "=Q" (fac[0]) : : "cc", "memory"); asm volatile ("mvc 0(16, %0), 0(%1)\n\t" : : "a" (y), "a" (fac) : "memory"); asm volatile ("xc 0(16, %0), 0(%0)\n\t" : : "a" (fac) : "memory"); asm volatile ("risbgn %%r11, %%r11, 0, 129, 0\n\t" : : : "memory", "r11"); asm volatile ("algrk %%r14, %%r14, %%r14\n\t" : : : "memory", "r14"); return (unsigned int)r1 ^ reg0; } ]] , [ testfunc(0, 0, 0); ])], [gcry_cv_gcc_inline_asm_s390x=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_s390x" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_S390X,1, [Defined if inline assembler supports zSeries instructions]) fi # # Check whether GCC inline assembler supports zSeries vector instructions # AC_CACHE_CHECK([whether GCC inline assembler supports zSeries vector instructions], [gcry_cv_gcc_inline_asm_s390x_vx], [if test 
"$mpi_cpu_arch" != "s390x" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_s390x_vx="n/a" else gcry_cv_gcc_inline_asm_s390x_vx=no if test "$gcry_cv_gcc_inline_asm_s390x" = "yes" ; then AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void testfunc(void) { asm volatile (".machine \"z13+vx\"\n\t" "vx %%v0, %%v1, %%v31\n\t" "verllf %%v11, %%v11, (16)(0)\n\t" : : : "memory"); } ]], [ testfunc(); ])], [gcry_cv_gcc_inline_asm_s390x_vx=yes]) fi fi]) if test "$gcry_cv_gcc_inline_asm_s390x_vx" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_S390X_VX,1, [Defined if inline assembler supports zSeries vector instructions]) fi ####################################### #### Checks for library functions. #### ####################################### AC_FUNC_VPRINTF # We have replacements for these in src/missing-string.c AC_CHECK_FUNCS(stpcpy strcasecmp) # We have replacements for these in src/g10lib.h AC_CHECK_FUNCS(strtoul memmove stricmp atexit raise) # Other checks AC_CHECK_FUNCS(strerror rand mmap getpagesize sysconf waitpid wait4) AC_CHECK_FUNCS(gettimeofday getrusage gethrtime clock_gettime syslog) AC_CHECK_FUNCS(syscall fcntl ftruncate flockfile getauxval elf_aux_info) AC_CHECK_FUNCS(explicit_bzero explicit_memset getentropy) GNUPG_CHECK_MLOCK # # Replacement functions. # AC_REPLACE_FUNCS([getpid clock]) # # Check whether it is necessary to link against libdl. # DL_LIBS="" if test "$use_hmac_binary_check" != no ; then _gcry_save_libs="$LIBS" LIBS="" AC_SEARCH_LIBS(dlopen, c dl,,,) DL_LIBS=$LIBS LIBS="$_gcry_save_libs" fi AC_SUBST(DL_LIBS) # # Check whether we can use Linux capabilities as requested. # if test "$use_capabilities" = "yes" ; then use_capabilities=no AC_CHECK_HEADERS(sys/capability.h) if test "$ac_cv_header_sys_capability_h" = "yes" ; then AC_CHECK_LIB(cap, cap_init, ac_need_libcap=1) if test "$ac_cv_lib_cap_cap_init" = "yes"; then AC_DEFINE(USE_CAPABILITIES,1, [define if capabilities should be used]) LIBS="$LIBS -lcap" use_capabilities=yes fi fi if test "$use_capabilities" = "no" ; then AC_MSG_WARN([[ *** *** The use of capabilities on this system is not possible. *** You need a recent Linux kernel and some patches: *** fcaps-2.2.9-990610.patch (kernel patch for 2.2.9) *** fcap-module-990613.tar.gz (kernel module) *** libcap-1.92.tar.gz (user mode library and utilities) *** And you have to configure the kernel with CONFIG_VFS_CAP_PLUGIN *** set (filesystems menu). Be warned: This code is *really* ALPHA. ***]]) fi fi # Check whether a random device is available. if test "$try_dev_random" = yes ; then AC_CACHE_CHECK(for random device, ac_cv_have_dev_random, [if test -r "$NAME_OF_DEV_RANDOM" && test -r "$NAME_OF_DEV_URANDOM" ; then ac_cv_have_dev_random=yes; else ac_cv_have_dev_random=no; fi]) if test "$ac_cv_have_dev_random" = yes; then AC_DEFINE(HAVE_DEV_RANDOM,1, [defined if the system supports a random device] ) fi else AC_MSG_CHECKING(for random device) ac_cv_have_dev_random=no AC_MSG_RESULT(has been disabled) fi # Figure out the random modules for this configuration. if test "$random" = "default"; then # Select default value. if test "$ac_cv_func_getentropy" = yes; then random_modules="getentropy" elif test "$ac_cv_have_dev_random" = yes; then # Try Linuxish random device. random_modules="linux" else case "${host}" in *-*-mingw32ce*) # WindowsCE random device. random_modules="w32ce" ;; *-*-mingw32*|*-*-cygwin*) # Windows random device. random_modules="w32" ;; *) # Build everything, allow to select at runtime. 
random_modules="$auto_random_modules" ;; esac fi else if test "$random" = "auto"; then # Build everything, allow to select at runtime. random_modules="$auto_random_modules" else random_modules="$random" fi fi # # Other defines # if test mym4_isgit = "yes"; then AC_DEFINE(IS_DEVELOPMENT_VERSION,1, [Defined if this is not a regular release]) fi AM_CONDITIONAL(CROSS_COMPILING, test x$cross_compiling = xyes) # This is handy for debugging so the compiler doesn't rearrange # things and eliminate variables. AC_ARG_ENABLE(optimization, AS_HELP_STRING([--disable-optimization], [disable compiler optimization]), [if test $enableval = no ; then CFLAGS=`echo $CFLAGS | sed 's/-O[[0-9]]//'` fi]) AC_MSG_NOTICE([checking for cc features]) # CFLAGS mangling when using gcc. if test "$GCC" = yes; then AC_MSG_CHECKING([if gcc supports -fno-delete-null-pointer-checks]) _gcc_cflags_save=$CFLAGS CFLAGS="-fno-delete-null-pointer-checks" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])],_gcc_wopt=yes,_gcc_wopt=no) AC_MSG_RESULT($_gcc_wopt) CFLAGS=$_gcc_cflags_save; if test x"$_gcc_wopt" = xyes ; then CFLAGS="$CFLAGS -fno-delete-null-pointer-checks" fi CFLAGS="$CFLAGS -Wall" if test "$USE_MAINTAINER_MODE" = "yes"; then CFLAGS="$CFLAGS -Wcast-align -Wshadow -Wstrict-prototypes" CFLAGS="$CFLAGS -Wformat -Wno-format-y2k -Wformat-security" # If -Wno-missing-field-initializers is supported we can enable a # a bunch of really useful warnings. AC_MSG_CHECKING([if gcc supports -Wno-missing-field-initializers]) _gcc_cflags_save=$CFLAGS CFLAGS="-Wno-missing-field-initializers" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])],_gcc_wopt=yes,_gcc_wopt=no) AC_MSG_RESULT($_gcc_wopt) CFLAGS=$_gcc_cflags_save; if test x"$_gcc_wopt" = xyes ; then CFLAGS="$CFLAGS -W -Wextra -Wbad-function-cast" CFLAGS="$CFLAGS -Wwrite-strings" CFLAGS="$CFLAGS -Wdeclaration-after-statement" CFLAGS="$CFLAGS -Wno-missing-field-initializers" CFLAGS="$CFLAGS -Wno-sign-compare" fi AC_MSG_CHECKING([if gcc supports -Wpointer-arith]) _gcc_cflags_save=$CFLAGS CFLAGS="-Wpointer-arith" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])],_gcc_wopt=yes,_gcc_wopt=no) AC_MSG_RESULT($_gcc_wopt) CFLAGS=$_gcc_cflags_save; if test x"$_gcc_wopt" = xyes ; then CFLAGS="$CFLAGS -Wpointer-arith" fi fi fi # Check whether as(1) supports a noeexecstack feature. This test # includes an override option. CL_AS_NOEXECSTACK AC_SUBST(LIBGCRYPT_CONFIG_API_VERSION) AC_SUBST(LIBGCRYPT_CONFIG_LIBS) AC_SUBST(LIBGCRYPT_CONFIG_CFLAGS) AC_SUBST(LIBGCRYPT_CONFIG_HOST) AC_SUBST(LIBGCRYPT_THREAD_MODULES) AC_CONFIG_COMMANDS([gcrypt-conf],[[ chmod +x src/libgcrypt-config ]],[[ prefix=$prefix exec_prefix=$exec_prefix libdir=$libdir datadir=$datadir DATADIRNAME=$DATADIRNAME ]]) ##################### #### Conclusion. #### ##################### # Check that requested feature can actually be used and define # ENABLE_foo_SUPPORT macros. 
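The ENABLE_*_SUPPORT macros defined below only record that the compiler and assembler are able to build the accelerated code; whether the CPU the library eventually runs on actually has the feature remains a separate run-time decision. A minimal C sketch of that split, with hypothetical names for the feature bit and the detection variable:

  #include <config.h>

  #define EXAMPLE_HWF_AESNI (1u << 0)       /* hypothetical feature bit */

  static unsigned int example_hw_features;  /* filled in by CPU detection */

  static int
  example_may_use_aesni (void)
  {
  #ifdef ENABLE_AESNI_SUPPORT
    return (example_hw_features & EXAMPLE_HWF_AESNI) != 0;
  #else
    return 0;   /* the compiler could not build the AES-NI code at all */
  #endif
  }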
if test x"$aesnisupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_ssse3" != "yes" ; then aesnisupport="no (unsupported by compiler)" fi fi if test x"$shaextsupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_shaext" != "yes" ; then shaextsupport="no (unsupported by compiler)" fi fi if test x"$pclmulsupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_pclmul" != "yes" ; then pclmulsupport="no (unsupported by compiler)" fi fi if test x"$sse41support" = xyes ; then if test "$gcry_cv_gcc_inline_asm_sse41" != "yes" ; then sse41support="no (unsupported by compiler)" fi fi if test x"$avxsupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_avx" != "yes" ; then avxsupport="no (unsupported by compiler)" fi fi if test x"$avx2support" = xyes ; then if test "$gcry_cv_gcc_inline_asm_avx2" != "yes" ; then avx2support="no (unsupported by compiler)" fi fi if test x"$avx512support" = xyes ; then if test "$gcry_cv_gcc_inline_asm_avx512" != "yes" ; then avx512support="no (unsupported by compiler)" fi fi if test x"$gfnisupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_gfni" != "yes" ; then gfnisupport="no (unsupported by compiler)" fi fi if test x"$neonsupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_neon" != "yes" ; then if test "$gcry_cv_gcc_inline_asm_aarch64_neon" != "yes" ; then neonsupport="no (unsupported by compiler)" fi fi fi if test x"$armcryptosupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_aarch32_crypto" != "yes" ; then if test "$gcry_cv_gcc_inline_asm_aarch64_crypto" != "yes" ; then armcryptosupport="no (unsupported by compiler)" fi fi fi if test x"$aesnisupport" = xyes ; then AC_DEFINE(ENABLE_AESNI_SUPPORT, 1, [Enable support for Intel AES-NI instructions.]) fi if test x"$shaextsupport" = xyes ; then AC_DEFINE(ENABLE_SHAEXT_SUPPORT, 1, [Enable support for Intel SHAEXT instructions.]) fi if test x"$pclmulsupport" = xyes ; then AC_DEFINE(ENABLE_PCLMUL_SUPPORT, 1, [Enable support for Intel PCLMUL instructions.]) fi if test x"$sse41support" = xyes ; then AC_DEFINE(ENABLE_SSE41_SUPPORT, 1, [Enable support for Intel SSE4.1 instructions.]) fi if test x"$avxsupport" = xyes ; then AC_DEFINE(ENABLE_AVX_SUPPORT,1, [Enable support for Intel AVX instructions.]) fi if test x"$avx2support" = xyes ; then AC_DEFINE(ENABLE_AVX2_SUPPORT,1, [Enable support for Intel AVX2 instructions.]) fi if test x"$avx512support" = xyes ; then AC_DEFINE(ENABLE_AVX512_SUPPORT,1, [Enable support for Intel AVX512 instructions.]) fi if test x"$gfnisupport" = xyes ; then AC_DEFINE(ENABLE_GFNI_SUPPORT,1, [Enable support for Intel GFNI instructions.]) fi if test x"$neonsupport" = xyes ; then AC_DEFINE(ENABLE_NEON_SUPPORT,1, [Enable support for ARM NEON instructions.]) fi if test x"$armcryptosupport" = xyes ; then AC_DEFINE(ENABLE_ARM_CRYPTO_SUPPORT,1, [Enable support for ARMv8 Crypto Extension instructions.]) fi if test x"$ppccryptosupport" = xyes ; then AC_DEFINE(ENABLE_PPC_CRYPTO_SUPPORT,1, [Enable support for POWER 8 (PowerISA 2.07) crypto extension.]) fi if test x"$jentsupport" = xyes ; then AC_DEFINE(ENABLE_JENT_SUPPORT, 1, [Enable support for the jitter entropy collector.]) fi if test x"$padlocksupport" = xyes ; then AC_DEFINE(ENABLE_PADLOCK_SUPPORT, 1, [Enable support for the PadLock engine.]) fi if test x"$drngsupport" = xyes ; then AC_DEFINE(ENABLE_DRNG_SUPPORT, 1, [Enable support for Intel DRNG (RDRAND instruction).]) fi if test x"$force_soft_hwfeatures" = xyes ; then AC_DEFINE(ENABLE_FORCE_SOFT_HWFEATURES, 1, [Enable forcing 'soft' HW feature bits on (for testing).]) fi # Define 
conditional sources and config.h symbols depending on the # selected ciphers, pubkey-ciphers, digests, kdfs, and random modules. LIST_MEMBER(arcfour, $enabled_ciphers) if test "$found" = "1"; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS arcfour.lo" AC_DEFINE(USE_ARCFOUR, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS arcfour-amd64.lo" ;; esac fi LIST_MEMBER(blowfish, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS blowfish.lo" AC_DEFINE(USE_BLOWFISH, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS blowfish-amd64.lo" ;; arm*-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS blowfish-arm.lo" ;; esac fi LIST_MEMBER(cast5, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS cast5.lo" AC_DEFINE(USE_CAST5, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS cast5-amd64.lo" ;; arm*-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS cast5-arm.lo" ;; esac fi LIST_MEMBER(des, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS des.lo" AC_DEFINE(USE_DES, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS des-amd64.lo" ;; esac fi LIST_MEMBER(aes, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael.lo" AC_DEFINE(USE_AES, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-amd64.lo" # Build with the SSSE3 implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-ssse3-amd64.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-ssse3-amd64-asm.lo" # Build with the VAES/AVX2 implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-vaes.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-vaes-avx2-amd64.lo" ;; arm*-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-arm.lo" # Build with the ARMv8/AArch32 CE implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-armv8-ce.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-armv8-aarch32-ce.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-aarch64.lo" # Build with the ARMv8/AArch64 CE implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-armv8-ce.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-armv8-aarch64-ce.lo" ;; powerpc64le-*-*) # Build with the crypto extension implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-ppc.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-ppc9le.lo" if test "$gcry_cv_gcc_inline_asm_ppc_altivec" = "yes" && test "$gcry_cv_gcc_inline_asm_ppc_arch_3_00" = "yes" ; then # Build with AES-GCM bulk implementation for P10 GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-gcm-p10le.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-p10le.lo" fi ;; powerpc64-*-*) # Big-Endian. # Build with the crypto extension implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-ppc.lo" ;; powerpc-*-*) # Big-Endian. 
# Build with the crypto extension implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-ppc.lo" ;; s390x-*-*) # Big-Endian. # Build with the crypto extension implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-s390x.lo" ;; esac case "$mpi_cpu_arch" in x86) # Build with the AES-NI implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-aesni.lo" # Build with the Padlock implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-padlock.lo" ;; esac fi LIST_MEMBER(twofish, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish.lo" AC_DEFINE(USE_TWOFISH, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS twofish-amd64.lo" if test x"$avx2support" = xyes ; then # Build with the AVX2 implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS twofish-avx2-amd64.lo" fi ;; arm*-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS twofish-arm.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS twofish-aarch64.lo" ;; esac fi LIST_MEMBER(serpent, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS serpent.lo" AC_DEFINE(USE_SERPENT, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the SSE2 implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS serpent-sse2-amd64.lo" ;; esac if test x"$avx2support" = xyes ; then # Build with the AVX2 implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS serpent-avx2-amd64.lo" fi if test x"$neonsupport" = xyes ; then # Build with the NEON implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS serpent-armv7-neon.lo" fi fi LIST_MEMBER(rfc2268, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS rfc2268.lo" AC_DEFINE(USE_RFC2268, 1, [Defined if this module should be included]) fi LIST_MEMBER(seed, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS seed.lo" AC_DEFINE(USE_SEED, 1, [Defined if this module should be included]) fi LIST_MEMBER(camellia, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia.lo camellia-glue.lo" AC_DEFINE(USE_CAMELLIA, 1, [Defined if this module should be included]) case "${host}" in arm*-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-arm.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-aarch64.lo" ;; esac if test x"$avxsupport" = xyes ; then if test x"$aesnisupport" = xyes ; then # Build with the AES-NI/AVX implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-aesni-avx-amd64.lo" fi fi if test x"$avx2support" = xyes ; then if test x"$aesnisupport" = xyes ; then # Build with the AES-NI/AVX2 implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-aesni-avx2-amd64.lo" # Build with the VAES/AVX2 implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-vaes-avx2-amd64.lo" # Build with the GFNI/AVX2 implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-gfni-avx2-amd64.lo" + + # Build with the GFNI/AVX512 implementation + GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-gfni-avx512-amd64.lo" fi fi fi LIST_MEMBER(idea, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS idea.lo" AC_DEFINE(USE_IDEA, 1, [Defined if this module should be included]) fi LIST_MEMBER(salsa20, 
$enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS salsa20.lo" AC_DEFINE(USE_SALSA20, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS salsa20-amd64.lo" ;; esac if test x"$neonsupport" = xyes ; then # Build with the NEON implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS salsa20-armv7-neon.lo" fi fi LIST_MEMBER(gost28147, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS gost28147.lo" AC_DEFINE(USE_GOST28147, 1, [Defined if this module should be included]) fi LIST_MEMBER(chacha20, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20.lo" AC_DEFINE(USE_CHACHA20, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-amd64-ssse3.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-amd64-avx2.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-amd64-avx512.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-aarch64.lo" ;; powerpc64le-*-*) # Build with the ppc8 vector implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-ppc.lo" ;; powerpc64-*-*) # Build with the ppc8 vector implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-ppc.lo" ;; powerpc-*-*) # Build with the ppc8 vector implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-ppc.lo" ;; s390x-*-*) # Build with the s390x/zSeries vector implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-s390x.lo" ;; esac if test x"$neonsupport" = xyes ; then # Build with the NEON implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-armv7-neon.lo" fi fi LIST_MEMBER(sm4, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS sm4.lo" AC_DEFINE(USE_SM4, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-aesni-avx-amd64.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-aesni-avx2-amd64.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-gfni-avx2-amd64.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-aarch64.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-armv8-aarch64-ce.lo" esac fi LIST_MEMBER(dsa, $enabled_pubkey_ciphers) if test "$found" = "1" ; then GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS dsa.lo" AC_DEFINE(USE_DSA, 1, [Defined if this module should be included]) fi LIST_MEMBER(rsa, $enabled_pubkey_ciphers) if test "$found" = "1" ; then GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS rsa.lo" AC_DEFINE(USE_RSA, 1, [Defined if this module should be included]) fi LIST_MEMBER(elgamal, $enabled_pubkey_ciphers) if test "$found" = "1" ; then GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS elgamal.lo" AC_DEFINE(USE_ELGAMAL, 1, [Defined if this module should be included]) fi LIST_MEMBER(ecc, $enabled_pubkey_ciphers) if test "$found" = "1" ; then GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS \ ecc.lo ecc-curves.lo ecc-misc.lo \ ecc-ecdh.lo ecc-ecdsa.lo ecc-eddsa.lo ecc-gost.lo \ ecc-sm2.lo" AC_DEFINE(USE_ECC, 1, [Defined if this module should be included]) fi LIST_MEMBER(crc, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc.lo" AC_DEFINE(USE_CRC, 1, [Defined if this module should be included]) case "${host}" in i?86-*-* 
| x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS crc-intel-pclmul.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS crc-armv8-ce.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS crc-armv8-aarch64-ce.lo" ;; powerpc64le-*-*) GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS crc-ppc.lo" ;; powerpc64-*-*) GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS crc-ppc.lo" ;; powerpc-*-*) GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS crc-ppc.lo" ;; esac fi LIST_MEMBER(gostr3411-94, $enabled_digests) if test "$found" = "1" ; then # GOST R 34.11-94 internally uses GOST 28147-89 LIST_MEMBER(gost28147, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS gostr3411-94.lo" AC_DEFINE(USE_GOST_R_3411_94, 1, [Defined if this module should be included]) fi fi LIST_MEMBER(stribog, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS stribog.lo" AC_DEFINE(USE_GOST_R_3411_12, 1, [Defined if this module should be included]) fi LIST_MEMBER(md2, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS md2.lo" AC_DEFINE(USE_MD2, 1, [Defined if this module should be included]) fi LIST_MEMBER(md4, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS md4.lo" AC_DEFINE(USE_MD4, 1, [Defined if this module should be included]) fi LIST_MEMBER(md5, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS md5.lo" AC_DEFINE(USE_MD5, 1, [Defined if this module should be included]) fi LIST_MEMBER(rmd160, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS rmd160.lo" AC_DEFINE(USE_RMD160, 1, [Defined if this module should be included]) fi LIST_MEMBER(sha256, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256.lo" AC_DEFINE(USE_SHA256, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-ssse3-amd64.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-avx-amd64.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-avx2-bmi2-amd64.lo" ;; arm*-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-armv8-aarch32-ce.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-armv8-aarch64-ce.lo" ;; powerpc64le-*-*) # Build with the crypto extension implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-ppc.lo" ;; powerpc64-*-*) # Big-Endian. # Build with the crypto extension implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-ppc.lo" ;; powerpc-*-*) # Big-Endian. 
# Build with the crypto extension implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-ppc.lo" esac case "$mpi_cpu_arch" in x86) # Build with the SHAEXT implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-intel-shaext.lo" ;; esac fi LIST_MEMBER(sha512, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512.lo" AC_DEFINE(USE_SHA512, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-ssse3-amd64.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-avx-amd64.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-avx2-bmi2-amd64.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-avx512-amd64.lo" ;; i?86-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-ssse3-i386.lo" ;; arm*-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-arm.lo" ;; powerpc64le-*-*) # Build with the crypto extension implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-ppc.lo" ;; powerpc64-*-*) # Big-Endian. # Build with the crypto extension implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-ppc.lo" ;; powerpc-*-*) # Big-Endian. # Build with the crypto extension implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-ppc.lo" esac if test x"$neonsupport" = xyes ; then # Build with the NEON implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-armv7-neon.lo" fi fi LIST_MEMBER(sha3, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS keccak.lo" AC_DEFINE(USE_SHA3, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation : ;; esac if test x"$neonsupport" = xyes ; then # Build with the NEON implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS keccak-armv7-neon.lo" fi fi LIST_MEMBER(tiger, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS tiger.lo" AC_DEFINE(USE_TIGER, 1, [Defined if this module should be included]) fi LIST_MEMBER(whirlpool, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS whirlpool.lo" AC_DEFINE(USE_WHIRLPOOL, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS whirlpool-sse2-amd64.lo" ;; esac fi LIST_MEMBER(blake2, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS blake2.lo" AC_DEFINE(USE_BLAKE2, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS blake2b-amd64-avx2.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS blake2s-amd64-avx.lo" ;; esac fi LIST_MEMBER(sm3, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS sm3.lo" AC_DEFINE(USE_SM3, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sm3-avx-bmi2-amd64.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sm3-aarch64.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sm3-armv8-aarch64-ce.lo" ;; esac fi # SHA-1 needs to be included always for example because it is used by # random-csprng.c. 
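The USE_* symbols defined for the digests above are normally consumed as compile-time guards, so an algorithm that was not enabled at configure time costs nothing in the built library. A minimal C sketch with a hypothetical name table:

  #include <config.h>
  #include <stddef.h>

  static const char * const example_enabled_digests[] =
    {
  #ifdef USE_SHA256
      "sha256",
  #endif
  #ifdef USE_SHA512
      "sha512",
  #endif
  #ifdef USE_BLAKE2
      "blake2",
  #endif
      "sha1",   /* always built, as the comment above explains */
      NULL
    };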
GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1.lo" AC_DEFINE(USE_SHA1, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-ssse3-amd64.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-avx-amd64.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-avx-bmi2-amd64.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-avx2-bmi2-amd64.lo" ;; arm*-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-armv7-neon.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-armv8-aarch32-ce.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-armv8-aarch64-ce.lo" ;; esac case "$mpi_cpu_arch" in x86) # Build with the SHAEXT implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-intel-shaext.lo" ;; esac # Arch specific GCM implementations case "${host}" in i?86-*-* | x86_64-*-*) GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS cipher-gcm-intel-pclmul.lo" ;; arm*-*-*) GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS cipher-gcm-armv7-neon.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS cipher-gcm-armv8-aarch32-ce.lo" ;; aarch64-*-*) GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS cipher-gcm-armv8-aarch64-ce.lo" ;; powerpc64le-*-* | powerpc64-*-* | powerpc-*-*) GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS cipher-gcm-ppc.lo" ;; esac # Arch specific MAC implementations case "${host}" in s390x-*-*) GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS poly1305-s390x.lo" ;; x86_64-*-*) GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS poly1305-amd64-avx512.lo" ;; esac LIST_MEMBER(scrypt, $enabled_kdfs) if test "$found" = "1" ; then GCRYPT_KDFS="$GCRYPT_KDFS scrypt.lo" AC_DEFINE(USE_SCRYPT, 1, [Defined if this module should be included]) fi LIST_MEMBER(getentropy, $random_modules) if test "$found" = "1" ; then GCRYPT_RANDOM="$GCRYPT_RANDOM rndgetentropy.lo" AC_DEFINE(USE_RNDGETENTROPY, 1, [Defined if the getentropy RNG should be used.]) fi LIST_MEMBER(linux, $random_modules) if test "$found" = "1" ; then GCRYPT_RANDOM="$GCRYPT_RANDOM rndoldlinux.lo" AC_DEFINE(USE_RNDOLDLINUX, 1, [Defined if the /dev/random RNG should be used.]) fi LIST_MEMBER(unix, $random_modules) if test "$found" = "1" ; then GCRYPT_RANDOM="$GCRYPT_RANDOM rndunix.lo" AC_DEFINE(USE_RNDUNIX, 1, [Defined if the default Unix RNG should be used.]) fi LIST_MEMBER(egd, $random_modules) if test "$found" = "1" ; then GCRYPT_RANDOM="$GCRYPT_RANDOM rndegd.lo" AC_DEFINE(USE_RNDEGD, 1, [Defined if the EGD based RNG should be used.]) fi LIST_MEMBER(w32, $random_modules) if test "$found" = "1" ; then GCRYPT_RANDOM="$GCRYPT_RANDOM rndw32.lo" AC_DEFINE(USE_RNDW32, 1, [Defined if the Windows specific RNG should be used.]) fi LIST_MEMBER(w32ce, $random_modules) if test "$found" = "1" ; then GCRYPT_RANDOM="$GCRYPT_RANDOM rndw32ce.lo" AC_DEFINE(USE_RNDW32CE, 1, [Defined if the WindowsCE specific RNG should be used.]) fi if test "$try_asm_modules" = yes ; then # Build with assembly implementations GCRYPT_CIPHERS="$GCRYPT_CIPHERS $GCRYPT_ASM_CIPHERS" GCRYPT_DIGESTS="$GCRYPT_DIGESTS $GCRYPT_ASM_DIGESTS" fi AC_SUBST([GCRYPT_CIPHERS]) AC_SUBST([GCRYPT_PUBKEY_CIPHERS]) AC_SUBST([GCRYPT_DIGESTS]) AC_SUBST([GCRYPT_KDFS]) AC_SUBST([GCRYPT_RANDOM]) AC_SUBST(LIBGCRYPT_CIPHERS, $enabled_ciphers) AC_SUBST(LIBGCRYPT_PUBKEY_CIPHERS, $enabled_pubkey_ciphers) AC_SUBST(LIBGCRYPT_DIGESTS, $enabled_digests) # For printing the configuration we need a colon separated list of # algorithm names. 
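The colon-separated lists created just below become plain string macros in config.h, so printing the configured algorithm set is simply a matter of emitting them. A minimal C sketch (hypothetical function name):

  #include <config.h>
  #include <stdio.h>

  static void
  example_print_configuration (void)
  {
    printf ("ciphers: %s\n", LIBGCRYPT_CIPHERS);
    printf ("pubkeys: %s\n", LIBGCRYPT_PUBKEY_CIPHERS);
    printf ("digests: %s\n", LIBGCRYPT_DIGESTS);
    printf ("kdfs:    %s\n", LIBGCRYPT_KDFS);
  }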
tmp=`echo "$enabled_ciphers" | tr ' ' : ` AC_DEFINE_UNQUOTED(LIBGCRYPT_CIPHERS, "$tmp", [List of available cipher algorithms]) tmp=`echo "$enabled_pubkey_ciphers" | tr ' ' : ` AC_DEFINE_UNQUOTED(LIBGCRYPT_PUBKEY_CIPHERS, "$tmp", [List of available public key cipher algorithms]) tmp=`echo "$enabled_digests" | tr ' ' : ` AC_DEFINE_UNQUOTED(LIBGCRYPT_DIGESTS, "$tmp", [List of available digest algorithms]) tmp=`echo "$enabled_kdfs" | tr ' ' : ` AC_DEFINE_UNQUOTED(LIBGCRYPT_KDFS, "$tmp", [List of available KDF algorithms]) # # Define conditional sources depending on the used hardware platform. # Note that all possible modules must also be listed in # src/Makefile.am (EXTRA_libgcrypt_la_SOURCES). # GCRYPT_HWF_MODULES= case "$mpi_cpu_arch" in x86) AC_DEFINE(HAVE_CPU_ARCH_X86, 1, [Defined for the x86 platforms]) GCRYPT_HWF_MODULES="libgcrypt_la-hwf-x86.lo" ;; alpha) AC_DEFINE(HAVE_CPU_ARCH_ALPHA, 1, [Defined for Alpha platforms]) ;; sparc) AC_DEFINE(HAVE_CPU_ARCH_SPARC, 1, [Defined for SPARC platforms]) ;; mips) AC_DEFINE(HAVE_CPU_ARCH_MIPS, 1, [Defined for MIPS platforms]) ;; m68k) AC_DEFINE(HAVE_CPU_ARCH_M68K, 1, [Defined for M68k platforms]) ;; ppc) AC_DEFINE(HAVE_CPU_ARCH_PPC, 1, [Defined for PPC platforms]) GCRYPT_HWF_MODULES="libgcrypt_la-hwf-ppc.lo" ;; arm) AC_DEFINE(HAVE_CPU_ARCH_ARM, 1, [Defined for ARM platforms]) GCRYPT_HWF_MODULES="libgcrypt_la-hwf-arm.lo" ;; aarch64) AC_DEFINE(HAVE_CPU_ARCH_ARM, 1, [Defined for ARM AArch64 platforms]) GCRYPT_HWF_MODULES="libgcrypt_la-hwf-arm.lo" ;; s390x) AC_DEFINE(HAVE_CPU_ARCH_S390X, 1, [Defined for s390x/zSeries platforms]) GCRYPT_HWF_MODULES="libgcrypt_la-hwf-s390x.lo" ;; esac AC_SUBST([GCRYPT_HWF_MODULES]) # # Option to disable building of doc file # build_doc=yes AC_ARG_ENABLE([doc], AS_HELP_STRING([--disable-doc], [do not build the documentation]), build_doc=$enableval, build_doc=yes) AM_CONDITIONAL([BUILD_DOC], [test "x$build_doc" != xno]) # # Provide information about the build. # BUILD_REVISION="mym4_revision" AC_SUBST(BUILD_REVISION) AC_DEFINE_UNQUOTED(BUILD_REVISION, "$BUILD_REVISION", [GIT commit id revision used to build this package]) changequote(,)dnl BUILD_VERSION=`echo "$PACKAGE_VERSION" | sed 's/\([0-9.]*\).*/\1./'` changequote([,])dnl BUILD_VERSION="${BUILD_VERSION}mym4_revision_dec" BUILD_FILEVERSION=`echo "${BUILD_VERSION}" | tr . ,` AC_SUBST(BUILD_VERSION) AC_SUBST(BUILD_FILEVERSION) AC_ARG_ENABLE([build-timestamp], AS_HELP_STRING([--enable-build-timestamp], [set an explicit build timestamp for reproducibility. (default is the current time in ISO-8601 format)]), [if test "$enableval" = "yes"; then BUILD_TIMESTAMP=`date -u +%Y-%m-%dT%H:%M+0000 2>/dev/null || date` else BUILD_TIMESTAMP="$enableval" fi], [BUILD_TIMESTAMP=""]) AC_SUBST(BUILD_TIMESTAMP) AC_DEFINE_UNQUOTED(BUILD_TIMESTAMP, "$BUILD_TIMESTAMP", [The time this package was configured for a build]) # And create the files. 
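BUILD_REVISION and BUILD_TIMESTAMP defined above likewise end up as plain strings in config.h; passing an explicit value to --enable-build-timestamp keeps the second one stable for reproducible builds. A minimal C sketch (hypothetical function name):

  #include <config.h>
  #include <stdio.h>

  static void
  example_print_build_info (void)
  {
    printf ("revision %s, configured %s\n", BUILD_REVISION, BUILD_TIMESTAMP);
  }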
AC_CONFIG_FILES([
Makefile
m4/Makefile
compat/Makefile
mpi/Makefile
cipher/Makefile
random/Makefile
doc/Makefile
src/Makefile
src/gcrypt.h
src/libgcrypt-config
src/libgcrypt.pc
src/versioninfo.rc
tests/Makefile
])
AC_CONFIG_FILES([tests/hashtest-256g], [chmod +x tests/hashtest-256g])
AC_CONFIG_FILES([tests/basic-disable-all-hwf], [chmod +x tests/basic-disable-all-hwf])
AC_OUTPUT

detection_module="${GCRYPT_HWF_MODULES%.lo}"
test -n "$detection_module" || detection_module="none"

# Give some feedback
GCRY_MSG_SHOW([],[])
GCRY_MSG_SHOW([Libgcrypt],[v${VERSION} has been configured as follows:])
GCRY_MSG_SHOW([],[])
GCRY_MSG_SHOW([Platform: ],[$PRINTABLE_OS_NAME ($host)])
GCRY_MSG_SHOW([Hardware detection module:],[$detection_module])
GCRY_MSG_WRAP([Enabled cipher algorithms:],[$enabled_ciphers])
GCRY_MSG_WRAP([Enabled digest algorithms:],[$enabled_digests])
GCRY_MSG_WRAP([Enabled kdf algorithms: ],[$enabled_kdfs])
GCRY_MSG_WRAP([Enabled pubkey algorithms:],[$enabled_pubkey_ciphers])
GCRY_MSG_SHOW([Random number generator: ],[$random])
GCRY_MSG_SHOW([Try using jitter entropy: ],[$jentsupport])
GCRY_MSG_SHOW([Using linux capabilities: ],[$use_capabilities])
GCRY_MSG_SHOW([FIPS module version: ],[$fips_module_version])
GCRY_MSG_SHOW([Try using Padlock crypto: ],[$padlocksupport])
GCRY_MSG_SHOW([Try using AES-NI crypto: ],[$aesnisupport])
GCRY_MSG_SHOW([Try using Intel SHAEXT: ],[$shaextsupport])
GCRY_MSG_SHOW([Try using Intel PCLMUL: ],[$pclmulsupport])
GCRY_MSG_SHOW([Try using Intel SSE4.1: ],[$sse41support])
GCRY_MSG_SHOW([Try using DRNG (RDRAND): ],[$drngsupport])
GCRY_MSG_SHOW([Try using Intel AVX: ],[$avxsupport])
GCRY_MSG_SHOW([Try using Intel AVX2: ],[$avx2support])
GCRY_MSG_SHOW([Try using Intel AVX512: ],[$avx512support])
GCRY_MSG_SHOW([Try using Intel GFNI: ],[$gfnisupport])
GCRY_MSG_SHOW([Try using ARM NEON: ],[$neonsupport])
GCRY_MSG_SHOW([Try using ARMv8 crypto: ],[$armcryptosupport])
GCRY_MSG_SHOW([Try using PPC crypto: ],[$ppccryptosupport])
GCRY_MSG_SHOW([],[])

if test "x${gpg_config_script_warn}" != x; then
cat <