diff --git a/LICENSES b/LICENSES index 94499501..67b80e64 100644 --- a/LICENSES +++ b/LICENSES @@ -1,288 +1,318 @@ Additional license notices for Libgcrypt. -*- org -*- This file contains the copying permission notices for various files in the Libgcrypt distribution which are not covered by the GNU Lesser General Public License (LGPL) or the GNU General Public License (GPL). These notices all require that a copy of the notice be included in the accompanying documentation and be distributed with binary distributions of the code, so be sure to include this file along with any binary distributions derived from the GNU C Library. * BSD_3Clause For files: - cipher/sha256-avx-amd64.S - cipher/sha256-avx2-bmi2-amd64.S - cipher/sha256-ssse3-amd64.S - cipher/sha512-avx-amd64.S - cipher/sha512-avx2-bmi2-amd64.S - cipher/sha512-ssse3-amd64.S - cipher/sha512-ssse3-i386.c - cipher/sha512-avx512-amd64.S #+begin_quote Copyright (c) 2012, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL CORPORATION OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #+end_quote + For files: + - cipher/poly1305-amd64-avx512.S + +#+begin_quote + Copyright (c) 2021-2022, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#+end_quote + For files: - random/jitterentropy-base.c - random/jitterentropy-gcd.c - random/jitterentropy-gcd.h - random/jitterentropy-health.c - random/jitterentropy-health.h - random/jitterentropy-noise.c - random/jitterentropy-noise.h - random/jitterentropy-sha3.c - random/jitterentropy-sha3.h - random/jitterentropy-timer.c - random/jitterentropy-timer.h - random/jitterentropy.h - random/rndjent.c (plus common Libgcrypt copyright holders) #+begin_quote Copyright (C) 2017 - 2021, Stephan Mueller Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, and the entire permission notice in its entirety, including the disclaimer of warranties. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. ALTERNATIVELY, this product may be distributed under the terms of the GNU General Public License, in which case the provisions of the GPL2 are required INSTEAD OF the above restrictions. 
(This clause is necessary due to a potential bad interaction between the GPL and the restrictions contained in a BSD-style copyright.) THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #+end_quote For files: - cipher/cipher-gcm-ppc.c #+begin_quote Copyright (c) 2006, CRYPTOGAMS by All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain copyright notices, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the CRYPTOGAMS nor the names of its copyright holder and contributors may be used to endorse or promote products derived from this software without specific prior written permission. ALTERNATIVELY, provided that this notice is retained in full, this product may be distributed under the terms of the GNU General Public License (GPL), in which case the provisions of the GPL apply INSTEAD OF those given above. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #+end_quote * X License For files: - install.sh #+begin_quote Copyright (C) 1994 X Consortium Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNEC- TION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
Except as contained in this notice, the name of the X Consortium shall not be used in advertising or otherwise to promote the sale, use or other deal- ings in this Software without prior written authorization from the X Consor- tium. #+end_quote * Public domain For files: - cipher/arcfour-amd64.S #+begin_quote Author: Marc Bevand Licence: I hereby disclaim the copyright on this code and place it in the public domain. #+end_quote * OCB license 1 For files: - cipher/cipher-ocb.c #+begin_quote OCB is covered by several patents but may be used freely by most software. See http://web.cs.ucdavis.edu/~rogaway/ocb/license.htm . In particular license 1 is suitable for Libgcrypt: See http://web.cs.ucdavis.edu/~rogaway/ocb/license1.pdf for the full license document; it basically says: License 1 — License for Open-Source Software Implementations of OCB (Jan 9, 2013) Under this license, you are authorized to make, use, and distribute open-source software implementations of OCB. This license terminates for you if you sue someone over their open-source software implementation of OCB claiming that you have a patent covering their implementation. License for Open Source Software Implementations of OCB January 9, 2013 1 Definitions 1.1 “Licensor” means Phillip Rogaway. 1.2 “Licensed Patents” means any patent that claims priority to United States Patent Application No. 09/918,615 entitled “Method and Apparatus for Facilitating Efficient Authenticated Encryption,” and any utility, divisional, provisional, continuation, continuations-in-part, reexamination, reissue, or foreign counterpart patents that may issue with respect to the aforesaid patent application. This includes, but is not limited to, United States Patent No. 7,046,802; United States Patent No. 7,200,227; United States Patent No. 7,949,129; United States Patent No. 8,321,675 ; and any patent that issues out of United States Patent Application No. 13/669,114. 
1.3 “Use” means any practice of any invention claimed in the Licensed Patents. 1.4 “Software Implementation” means any practice of any invention claimed in the Licensed Patents that takes the form of software executing on a user-programmable, general-purpose computer or that takes the form of a computer-readable medium storing such software. Software Implementation does not include, for example, application-specific integrated circuits (ASICs), field-programmable gate arrays (FPGAs), embedded systems, or IP cores. 1.5 “Open Source Software” means software whose source code is published and made available for inspection and use by anyone because either (a) the source code is subject to a license that permits recipients to copy, modify, and distribute the source code without payment of fees or royalties, or (b) the source code is in the public domain, including code released for public use through a CC0 waiver. All licenses certified by the Open Source Initiative at opensource.org as of January 9, 2013 and all Creative Commons licenses identified on the creativecommons.org website as of January 9, 2013, including the Public License Fallback of the CC0 waiver, satisfy these requirements for the purposes of this license. 1.6 “Open Source Software Implementation” means a Software Implementation in which the software implicating the Licensed Patents is Open Source Software. Open Source Software Implementation does not include any Software Implementation in which the software implicating the Licensed Patents is combined, so as to form a larger program, with software that is not Open Source Software. 2 License Grant 2.1 License. 
Subject to your compliance with the term s of this license, including the restriction set forth in Section 2.2, Licensor hereby grants to you a perpetual, worldwide, non-exclusive, non-transferable, non-sublicenseable, no-charge, royalty-free, irrevocable license to practice any invention claimed in the Licensed Patents in any Open Source Software Implementation. 2.2 Restriction. If you or your affiliates institute patent litigation (including, but not limited to, a cross-claim or counterclaim in a lawsuit) against any entity alleging that any Use authorized by this license infringes another patent, then any rights granted to you under this license automatically terminate as of the date such litigation is filed. 3 Disclaimer YOUR USE OF THE LICENSED PATENTS IS AT YOUR OWN RISK AND UNLESS REQUIRED BY APPLICABLE LAW, LICENSOR MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE LICENSED PATENTS OR ANY PRODUCT EMBODYING ANY LICENSED PATENT, EXPRESS OR IMPLIED, STATUT ORY OR OTHERWISE, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTIBILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NONINFRINGEMENT. IN NO EVENT WILL LICENSOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING FROM OR RELATED TO ANY USE OF THE LICENSED PATENTS, INCLUDING, WITHOUT LIMITATION, DIRECT, INDIRECT, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR SPECIAL DAMAGES, EVEN IF LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES PRIOR TO SUCH AN OCCURRENCE. #+end_quote diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 30be9f98..582205a3 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -1,277 +1,277 @@ # Makefile for cipher modules # Copyright (C) 1998, 1999, 2000, 2001, 2002, # 2003, 2009 Free Software Foundation, Inc. # # This file is part of Libgcrypt. 
# # Libgcrypt is free software; you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation; either version 2.1 of # the License, or (at your option) any later version. # # Libgcrypt is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this program; if not, see . # Process this file with automake to produce Makefile.in # Need to include ../src in addition to top_srcdir because gcrypt.h is # a built header. AM_CPPFLAGS = -I../src -I$(top_srcdir)/src -I../mpi -I$(top_srcdir)/mpi AM_CFLAGS = $(GPG_ERROR_CFLAGS) AM_CCASFLAGS = $(NOEXECSTACK_FLAGS) EXTRA_DIST = gost-s-box.c CLEANFILES = gost-s-box$(EXEEXT_FOR_BUILD) DISTCLEANFILES = gost-sb.h noinst_LTLIBRARIES = libcipher.la GCRYPT_MODULES = @GCRYPT_CIPHERS@ @GCRYPT_PUBKEY_CIPHERS@ \ @GCRYPT_DIGESTS@ @GCRYPT_KDFS@ libcipher_la_DEPENDENCIES = $(GCRYPT_MODULES) libcipher_la_LIBADD = $(GCRYPT_MODULES) libcipher_la_SOURCES = \ cipher.c cipher-internal.h \ cipher-cbc.c \ cipher-cfb.c \ cipher-ofb.c \ cipher-ctr.c \ cipher-aeswrap.c \ cipher-ccm.c \ cipher-cmac.c \ cipher-gcm.c \ cipher-poly1305.c \ cipher-ocb.c \ cipher-xts.c \ cipher-eax.c \ cipher-siv.c \ cipher-gcm-siv.c \ cipher-selftest.c cipher-selftest.h \ pubkey.c pubkey-internal.h pubkey-util.c \ md.c \ mac.c mac-internal.h \ mac-hmac.c mac-cmac.c mac-gmac.c mac-poly1305.c \ poly1305.c poly1305-internal.h \ kdf.c kdf-internal.h \ bithelp.h \ bufhelp.h \ primegen.c \ hash-common.c hash-common.h \ dsa-common.c rsa-common.c \ sha1.h EXTRA_libcipher_la_SOURCES = \ asm-common-aarch64.h \ asm-common-amd64.h \ asm-common-s390x.h \ asm-inline-s390x.h \ asm-poly1305-aarch64.h \ asm-poly1305-amd64.h \ asm-poly1305-s390x.h \ 
arcfour.c arcfour-amd64.S \ blowfish.c blowfish-amd64.S blowfish-arm.S \ cast5.c cast5-amd64.S cast5-arm.S \ chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S \ chacha20-armv7-neon.S chacha20-aarch64.S \ chacha20-ppc.c chacha20-s390x.S \ cipher-gcm-ppc.c cipher-gcm-intel-pclmul.c cipher-gcm-armv7-neon.S \ cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \ crc.c crc-intel-pclmul.c crc-armv8-ce.c \ crc-armv8-aarch64-ce.S \ crc-ppc.c \ des.c des-amd64.S \ dsa.c \ elgamal.c \ ecc.c ecc-curves.c ecc-misc.c ecc-common.h \ ecc-ecdh.c ecc-ecdsa.c ecc-eddsa.c ecc-gost.c ecc-sm2.c \ idea.c \ gost28147.c gost.h \ gostr3411-94.c \ md4.c \ md5.c \ - poly1305-s390x.S \ + poly1305-s390x.S poly1305-amd64-avx512.S \ rijndael.c rijndael-internal.h rijndael-tables.h \ rijndael-aesni.c rijndael-padlock.c \ rijndael-amd64.S rijndael-arm.S \ rijndael-ssse3-amd64.c rijndael-ssse3-amd64-asm.S \ rijndael-vaes.c rijndael-vaes-avx2-amd64.S \ rijndael-armv8-ce.c rijndael-armv8-aarch32-ce.S \ rijndael-armv8-aarch64-ce.S rijndael-aarch64.S \ rijndael-ppc.c rijndael-ppc9le.c \ rijndael-p10le.c rijndael-gcm-p10le.s \ rijndael-ppc-common.h rijndael-ppc-functions.h \ rijndael-s390x.c \ rmd160.c \ rsa.c \ salsa20.c salsa20-amd64.S salsa20-armv7-neon.S \ scrypt.c \ seed.c \ serpent.c serpent-sse2-amd64.S \ sm4.c sm4-aesni-avx-amd64.S sm4-aesni-avx2-amd64.S sm4-aarch64.S \ sm4-armv8-aarch64-ce.S \ serpent-avx2-amd64.S serpent-armv7-neon.S \ sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \ sha1-avx2-bmi2-amd64.S sha1-armv7-neon.S sha1-armv8-aarch32-ce.S \ sha1-armv8-aarch64-ce.S sha1-intel-shaext.c \ sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S \ sha256-avx2-bmi2-amd64.S \ sha256-armv8-aarch32-ce.S sha256-armv8-aarch64-ce.S \ sha256-intel-shaext.c sha256-ppc.c \ sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S \ sha512-avx2-bmi2-amd64.S sha512-avx512-amd64.S \ sha512-armv7-neon.S sha512-arm.S \ sha512-ppc.c sha512-ssse3-i386.c \ sm3.c sm3-avx-bmi2-amd64.S 
sm3-aarch64.S sm3-armv8-aarch64-ce.S \ keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \ stribog.c \ tiger.c \ whirlpool.c whirlpool-sse2-amd64.S \ twofish.c twofish-amd64.S twofish-arm.S twofish-aarch64.S \ twofish-avx2-amd64.S \ rfc2268.c \ camellia.c camellia.h camellia-glue.c camellia-aesni-avx-amd64.S \ camellia-aesni-avx2-amd64.h camellia-vaes-avx2-amd64.S \ camellia-aesni-avx2-amd64.S camellia-arm.S camellia-aarch64.S \ blake2.c \ blake2b-amd64-avx2.S blake2s-amd64-avx.S gost28147.lo: gost-sb.h gost-sb.h: gost-s-box$(EXEEXT_FOR_BUILD) ./gost-s-box$(EXEEXT_FOR_BUILD) $@ gost-s-box$(EXEEXT_FOR_BUILD): gost-s-box.c $(CC_FOR_BUILD) $(CFLAGS_FOR_BUILD) $(LDFLAGS_FOR_BUILD) \ $(CPPFLAGS_FOR_BUILD) -o $@ $(srcdir)/gost-s-box.c if ENABLE_O_FLAG_MUNGING o_flag_munging = sed -e 's/-O\([2-9sg][2-9sg]*\)/-O1/' -e 's/-Ofast/-O1/g' else o_flag_munging = cat endif # We need to lower the optimization for this module. tiger.o: $(srcdir)/tiger.c Makefile `echo $(COMPILE) -c $< | $(o_flag_munging) ` tiger.lo: $(srcdir)/tiger.c Makefile `echo $(LTCOMPILE) -c $< | $(o_flag_munging) ` # We need to disable instrumentation for these modules as they use cc as # thin assembly front-end and do not tolerate in-between function calls # inserted by compiler as those functions may clobber the XMM registers. 
if ENABLE_INSTRUMENTATION_MUNGING instrumentation_munging = sed \ -e 's/-fsanitize[=,\-][=,a-z,A-Z,0-9,\,,\-]*//g' \ -e 's/-fprofile[=,\-][=,a-z,A-Z,0-9,\,,\-]*//g' \ -e 's/-fcoverage[=,\-][=,a-z,A-Z,0-9,\,,\-]*//g' else instrumentation_munging = cat endif rijndael-aesni.o: $(srcdir)/rijndael-aesni.c Makefile `echo $(COMPILE) -c $< | $(instrumentation_munging) ` rijndael-aesni.lo: $(srcdir)/rijndael-aesni.c Makefile `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) ` rijndael-ssse3-amd64.o: $(srcdir)/rijndael-ssse3-amd64.c Makefile `echo $(COMPILE) -c $< | $(instrumentation_munging) ` rijndael-ssse3-amd64.lo: $(srcdir)/rijndael-ssse3-amd64.c Makefile `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) ` cipher-gcm-intel-pclmul.o: $(srcdir)/cipher-gcm-intel-pclmul.c Makefile `echo $(COMPILE) -c $< | $(instrumentation_munging) ` cipher-gcm-intel-pclmul.lo: $(srcdir)/cipher-gcm-intel-pclmul.c Makefile `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) ` sha1-intel-shaext.o: $(srcdir)/sha1-intel-shaext.c Makefile `echo $(COMPILE) -c $< | $(instrumentation_munging) ` sha1-intel-shaext.lo: $(srcdir)/sha1-intel-shaext.c Makefile `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) ` sha256-intel-shaext.o: $(srcdir)/sha256-intel-shaext.c Makefile `echo $(COMPILE) -c $< | $(instrumentation_munging) ` sha256-intel-shaext.lo: $(srcdir)/sha256-intel-shaext.c Makefile `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) ` sha256-ssse3-i386.o: $(srcdir)/sha256-ssse3-i386.c Makefile `echo $(COMPILE) -c $< | $(instrumentation_munging) ` sha256-ssse3-i386.lo: $(srcdir)/sha256-ssse3-i386.c Makefile `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) ` crc-intel-pclmul.o: $(srcdir)/crc-intel-pclmul.c Makefile `echo $(COMPILE) -c $< | $(instrumentation_munging) ` crc-intel-pclmul.lo: $(srcdir)/crc-intel-pclmul.c Makefile `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) ` if ENABLE_PPC_VCRYPTO_EXTRA_CFLAGS ppc_vcrypto_cflags = -O2 -maltivec -mvsx -mcrypto else 
ppc_vcrypto_cflags = endif rijndael-ppc.o: $(srcdir)/rijndael-ppc.c Makefile `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` rijndael-ppc.lo: $(srcdir)/rijndael-ppc.c Makefile `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` rijndael-ppc9le.o: $(srcdir)/rijndael-ppc9le.c Makefile `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` rijndael-ppc9le.lo: $(srcdir)/rijndael-ppc9le.c Makefile `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` rijndael-p10le.o: $(srcdir)/rijndael-p10le.c Makefile `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` rijndael-p10le.lo: $(srcdir)/rijndael-p10le.c Makefile `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` sha256-ppc.o: $(srcdir)/sha256-ppc.c Makefile `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` sha256-ppc.lo: $(srcdir)/sha256-ppc.c Makefile `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` sha512-ppc.o: $(srcdir)/sha512-ppc.c Makefile `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` sha512-ppc.lo: $(srcdir)/sha512-ppc.c Makefile `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` chacha20-ppc.o: $(srcdir)/chacha20-ppc.c Makefile `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` chacha20-ppc.lo: $(srcdir)/chacha20-ppc.c Makefile `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` crc-ppc.o: $(srcdir)/crc-ppc.c Makefile `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` crc-ppc.lo: $(srcdir)/crc-ppc.c Makefile `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` cipher-gcm-ppc.o: $(srcdir)/cipher-gcm-ppc.c Makefile `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` cipher-gcm-ppc.lo: $(srcdir)/cipher-gcm-ppc.c Makefile `echo $(LTCOMPILE) 
$(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` diff --git a/cipher/poly1305-amd64-avx512.S b/cipher/poly1305-amd64-avx512.S new file mode 100644 index 00000000..48892777 --- /dev/null +++ b/cipher/poly1305-amd64-avx512.S @@ -0,0 +1,1625 @@ +/* +;; +;; Copyright (c) 2021-2022, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; +*/ +/* + * From: + * https://github.com/intel/intel-ipsec-mb/blob/f0cad21a644231c0f5d4af51f56061a5796343fb/lib/avx512/poly_fma_avx512.asm + * + * Conversion to GAS assembly and integration to libgcrypt + * by Jussi Kivilinna + */ + +#ifdef __x86_64 +#include +#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ + defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AVX512) +#include "asm-common-amd64.h" + +.intel_syntax noprefix + +.text + +ELF(.type _gcry_poly1305_avx512_consts,@object) +_gcry_poly1305_avx512_consts: + +.align 64 +.Lmask_44: + .quad 0xfffffffffff, 0xfffffffffff, 0xfffffffffff, 0xfffffffffff + .quad 0xfffffffffff, 0xfffffffffff, 0xfffffffffff, 0xfffffffffff + +.align 64 +.Lmask_42: + .quad 0x3ffffffffff, 0x3ffffffffff, 0x3ffffffffff, 0x3ffffffffff + .quad 0x3ffffffffff, 0x3ffffffffff, 0x3ffffffffff, 0x3ffffffffff + +.align 64 +.Lhigh_bit: + .quad 0x10000000000, 0x10000000000, 0x10000000000, 0x10000000000 + .quad 0x10000000000, 0x10000000000, 0x10000000000, 0x10000000000 + +.Lbyte_len_to_mask_table: + .short 0x0000, 0x0001, 0x0003, 0x0007 + .short 0x000f, 0x001f, 0x003f, 0x007f + .short 0x00ff, 0x01ff, 0x03ff, 0x07ff + .short 0x0fff, 0x1fff, 0x3fff, 0x7fff + .short 0xffff + +.align 64 +.Lbyte64_len_to_mask_table: + .quad 0x0000000000000000, 0x0000000000000001 + .quad 0x0000000000000003, 0x0000000000000007 + .quad 0x000000000000000f, 0x000000000000001f + .quad 0x000000000000003f, 0x000000000000007f + .quad 0x00000000000000ff, 0x00000000000001ff + .quad 0x00000000000003ff, 0x00000000000007ff + .quad 0x0000000000000fff, 0x0000000000001fff + .quad 0x0000000000003fff, 0x0000000000007fff + .quad 0x000000000000ffff, 0x000000000001ffff + .quad 0x000000000003ffff, 0x000000000007ffff + .quad 0x00000000000fffff, 0x00000000001fffff + .quad 0x00000000003fffff, 0x00000000007fffff + .quad 0x0000000000ffffff, 0x0000000001ffffff + .quad 0x0000000003ffffff, 0x0000000007ffffff + .quad 
0x000000000fffffff, 0x000000001fffffff + .quad 0x000000003fffffff, 0x000000007fffffff + .quad 0x00000000ffffffff, 0x00000001ffffffff + .quad 0x00000003ffffffff, 0x00000007ffffffff + .quad 0x0000000fffffffff, 0x0000001fffffffff + .quad 0x0000003fffffffff, 0x0000007fffffffff + .quad 0x000000ffffffffff, 0x000001ffffffffff + .quad 0x000003ffffffffff, 0x000007ffffffffff + .quad 0x00000fffffffffff, 0x00001fffffffffff + .quad 0x00003fffffffffff, 0x00007fffffffffff + .quad 0x0000ffffffffffff, 0x0001ffffffffffff + .quad 0x0003ffffffffffff, 0x0007ffffffffffff + .quad 0x000fffffffffffff, 0x001fffffffffffff + .quad 0x003fffffffffffff, 0x007fffffffffffff + .quad 0x00ffffffffffffff, 0x01ffffffffffffff + .quad 0x03ffffffffffffff, 0x07ffffffffffffff + .quad 0x0fffffffffffffff, 0x1fffffffffffffff + .quad 0x3fffffffffffffff, 0x7fffffffffffffff + .quad 0xffffffffffffffff + +.Lqword_high_bit_mask: + .short 0, 0x1, 0x5, 0x15, 0x55, 0x57, 0x5f, 0x7f, 0xff + +ELF(.size _gcry_poly1305_avx512_consts,.-_gcry_poly1305_avx512_consts) + +#define raxd eax +#define rbxd ebx +#define rcxd ecx +#define rdxd edx +#define rsid esi +#define rdid edi +#define rbpd ebp +#define rspd esp +#define __DWORD(X) X##d +#define DWORD(R) __DWORD(R) + +#define arg1 rdi +#define arg2 rsi +#define arg3 rdx +#define arg4 rcx + +#define job arg1 +#define gp1 rsi +#define gp2 rcx + +/* ;; don't use rdx and rax - they are needed for multiply operation */ +#define gp3 rbp +#define gp4 r8 +#define gp5 r9 +#define gp6 r10 +#define gp7 r11 +#define gp8 r12 +#define gp9 r13 +#define gp10 r14 +#define gp11 r15 + +#define len gp11 +#define msg gp10 + +#define POLY1305_BLOCK_SIZE 16 + +#define STACK_r_save 0 +#define STACK_r_save_size (6 * 64) +#define STACK_gpr_save (STACK_r_save + STACK_r_save_size) +#define STACK_gpr_save_size (8 * 8) +#define STACK_rsp_save (STACK_gpr_save + STACK_gpr_save_size) +#define STACK_rsp_save_size (1 * 8) +#define STACK_SIZE (STACK_rsp_save + STACK_rsp_save_size) + +#define A2_ZERO(...) 
/**/ +#define A2_ZERO_INVERT(...) __VA_ARGS__ +#define A2_NOT_ZERO(...) __VA_ARGS__ +#define A2_NOT_ZERO_INVERT(...) /**/ + +#define clear_zmm(vec) vpxord vec, vec, vec + +/* +;; ============================================================================= +;; ============================================================================= +;; Computes hash for message length being multiple of block size +;; ============================================================================= +;; Combining 64-bit x 64-bit multiplication with reduction steps +;; +;; NOTES: +;; 1) A2 here is only two bits so anything above is subject of reduction. +;; Constant C1 = R1 + (R1 >> 2) simplifies multiply with less operations +;; 2) Magic 5x comes from mod 2^130-5 property and incorporating +;; reduction into multiply phase. +;; See "Cheating at modular arithmetic" and "Poly1305's prime: 2^130 - 5" +;; paragraphs at https://loup-vaillant.fr/tutorials/poly1305-design for more details. +;; +;; Flow of the code below is as follows: +;; +;; A2 A1 A0 +;; x R1 R0 +;; ----------------------------- +;; A2×R0 A1×R0 A0×R0 +;; + A0×R1 +;; + 5xA2xR1 5xA1xR1 +;; ----------------------------- +;; [0|L2L] [L1H|L1L] [L0H|L0L] +;; +;; Registers: T3:T2 T1:A0 +;; +;; Completing the multiply and adding (with carry) 3x128-bit limbs into +;; 192-bits again (3x64-bits): +;; A0 = L0L +;; A1 = L0H + L1L +;; T3 = L1H + L2L +; A0 [in/out] GPR with accumulator bits 63:0 +; A1 [in/out] GPR with accumulator bits 127:64 +; A2 [in/out] GPR with accumulator bits 195:128 +; R0 [in] GPR with R constant bits 63:0 +; R1 [in] GPR with R constant bits 127:64 +; C1 [in] C1 = R1 + (R1 >> 2) +; T1 [clobbered] GPR register +; T2 [clobbered] GPR register +; T3 [clobbered] GPR register +; GP_RAX [clobbered] RAX register +; GP_RDX [clobbered] RDX register +; IF_A2 [in] Used if input A2 is not 0 +*/ +#define POLY1305_MUL_REDUCE(A0, A1, A2, R0, R1, C1, T1, T2, T3, GP_RAX, GP_RDX, IF_A2) \ + /* T3:T2 = (A0 * R1) */ \ + mov GP_RAX, 
R1; \ + mul A0; \ + mov T2, GP_RAX; \ + mov GP_RAX, R0; \ + mov T3, GP_RDX; \ + \ + /* T1:A0 = (A0 * R0) */ \ + mul A0; \ + mov A0, GP_RAX; /* A0 not used in other operations */ \ + mov GP_RAX, R0; \ + mov T1, GP_RDX; \ + \ + /* T3:T2 += (A1 * R0) */ \ + mul A1; \ + add T2, GP_RAX; \ + mov GP_RAX, C1; \ + adc T3, GP_RDX; \ + \ + /* T1:A0 += (A1 * R1x5) */ \ + mul A1; \ + IF_A2(mov A1, A2); /* use A1 for A2 */ \ + add A0, GP_RAX; \ + adc T1, GP_RDX; \ + \ + /* NOTE: A2 is clamped to 2-bits, */ \ + /* R1/R0 is clamped to 60-bits, */ \ + /* their product is less than 2^64. */ \ + \ + IF_A2(/* T3:T2 += (A2 * R1x5) */); \ + IF_A2(imul A1, C1); \ + IF_A2(add T2, A1); \ + IF_A2(mov A1, T1); /* T1:A0 => A1:A0 */ \ + IF_A2(adc T3, 0); \ + \ + IF_A2(/* T3:A1 += (A2 * R0) */); \ + IF_A2(imul A2, R0); \ + IF_A2(add A1, T2); \ + IF_A2(adc T3, A2); \ + \ + IF_A2##_INVERT(/* If A2 == 0, just move and add T1-T2 to A1 */); \ + IF_A2##_INVERT(mov A1, T1); \ + IF_A2##_INVERT(add A1, T2); \ + IF_A2##_INVERT(adc T3, 0); \ + \ + /* At this point, 3 64-bit limbs are in T3:A1:A0 */ \ + /* T3 can span over more than 2 bits so final partial reduction step is needed. */ \ + \ + /* Partial reduction (just to fit into 130 bits) */ \ + /* A2 = T3 & 3 */ \ + /* k = (T3 & ~3) + (T3 >> 2) */ \ + /* Y x4 + Y x1 */ \ + /* A2:A1:A0 += k */ \ + \ + /* Result will be in A2:A1:A0 */ \ + mov T1, T3; \ + mov DWORD(A2), DWORD(T3); \ + and T1, ~3; \ + shr T3, 2; \ + and DWORD(A2), 3; \ + add T1, T3; \ + \ + /* A2:A1:A0 += k (kept in T1) */ \ + add A0, T1; \ + adc A1, 0; \ + adc DWORD(A2), 0 + +/* +;; ============================================================================= +;; ============================================================================= +;; Computes hash for 8 16-byte message blocks, +;; and adds new message blocks to accumulator. 
+;;
+;; It first multiplies all 8 blocks with powers of R:
+;;
+;;      a2      a1      a0
+;; x    b2      b1      b0
+;; ---------------------------------------
+;;     a2xb0   a1xb0   a0xb0
+;; +   a1xb1   a0xb1   5xa2xb1
+;; +   a0xb2   5xa2xb2 5xa1xb2
+;; ---------------------------------------
+;;        p2      p1      p0
+;;
+;; Then, it propagates the carry (higher bits after bit 43) from lower limbs into higher limbs,
+;; multiplying by 5 in case of the carry of p2.
+;;
+;A0 [in/out] ZMM register containing 1st 44-bit limb of the 8 blocks
+;A1 [in/out] ZMM register containing 2nd 44-bit limb of the 8 blocks
+;A2 [in/out] ZMM register containing 3rd 44-bit limb of the 8 blocks
+;R0 [in] ZMM register (R0) to include the 1st limb of R
+;R1 [in] ZMM register (R1) to include the 2nd limb of R
+;R2 [in] ZMM register (R2) to include the 3rd limb of R
+;R1P [in] ZMM register (R1') to include the 2nd limb of R (multiplied by 5)
+;R2P [in] ZMM register (R2') to include the 3rd limb of R (multiplied by 5)
+;P0_L [clobbered] ZMM register to contain low half of p[0] of the 8 blocks
+;P0_H [clobbered] ZMM register to contain high half of p[0] of the 8 blocks
+;P1_L [clobbered] ZMM register to contain low half of p[1] of the 8 blocks
+;P1_H [clobbered] ZMM register to contain high half of p[1] of the 8 blocks
+;P2_L [clobbered] ZMM register to contain low half of p[2] of the 8 blocks
+;P2_H [clobbered] ZMM register to contain high half of p[2] of the 8 blocks
+;ZTMP1 [clobbered] Temporary ZMM register
+*/
+#define POLY1305_MUL_REDUCE_VEC(A0, A1, A2, R0, R1, R2, R1P, R2P, P0_L, P0_H, \
+                                P1_L, P1_H, P2_L, P2_H, ZTMP1) \
+  /* ;; Reset accumulator */ \
+  vpxorq P0_L, P0_L, P0_L; \
+  vpxorq P0_H, P0_H, P0_H; \
+  vpxorq P1_L, P1_L, P1_L; \
+  vpxorq P1_H, P1_H, P1_H; \
+  vpxorq P2_L, P2_L, P2_L; \
+  vpxorq P2_H, P2_H, P2_H; \
+  \
+  /* ; Calculate products (IFMA accumulates into the just-cleared P registers) */ \
+  vpmadd52luq P0_L, A2, R1P; \
+  vpmadd52huq P0_H, A2, R1P; \
+  vpmadd52luq P1_L, A2, R2P; \
+  vpmadd52huq P1_H, A2, R2P; \
+  vpmadd52luq P2_L, A2, R0; \
+  vpmadd52huq P2_H, A2, R0; \
+  \
+  vpmadd52luq P1_L, A0, R1; \
+  vpmadd52huq P1_H, A0, R1; \
+  vpmadd52luq P2_L, A0, R2; \
+  vpmadd52huq P2_H, A0, R2; \
+  vpmadd52luq P0_L, A0, R0; \
+  vpmadd52huq P0_H, A0, R0; \
+  \
+  vpmadd52luq P0_L, A1, R2P; \
+  vpmadd52huq P0_H, A1, R2P; \
+  vpmadd52luq P1_L, A1, R0; \
+  vpmadd52huq P1_H, A1, R0; \
+  vpmadd52luq P2_L, A1, R1; \
+  vpmadd52huq P2_H, A1, R1; \
+  \
+  /* ; Carry propagation (first pass) */ \
+  vpsrlq ZTMP1, P0_L, 44; \
+  vpandq A0, P0_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+  vpsllq P0_H, P0_H, 8; \
+  vpaddq P0_H, P0_H, ZTMP1; \
+  vpaddq P1_L, P1_L, P0_H; \
+  vpandq A1, P1_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+  vpsrlq ZTMP1, P1_L, 44; \
+  vpsllq P1_H, P1_H, 8; \
+  vpaddq P1_H, P1_H, ZTMP1; \
+  vpaddq P2_L, P2_L, P1_H; \
+  vpandq A2, P2_L, [.Lmask_42 ADD_RIP]; /* ; Clear top 22 bits */ \
+  vpsrlq ZTMP1, P2_L, 42; \
+  vpsllq P2_H, P2_H, 10; \
+  vpaddq P2_H, P2_H, ZTMP1; \
+  \
+  /* ; Carry propagation (second pass) */ \
+  \
+  /* ; Multiply by 5 the highest bits (above 130 bits) */ \
+  vpaddq A0, A0, P2_H; \
+  vpsllq P2_H, P2_H, 2; \
+  vpaddq A0, A0, P2_H; \
+  vpsrlq ZTMP1, A0, 44; \
+  vpandq A0, A0, [.Lmask_44 ADD_RIP]; \
+  vpaddq A1, A1, ZTMP1;
+
+/*
+;; =============================================================================
+;; =============================================================================
+;; Computes hash for 16 16-byte message blocks,
+;; and adds new message blocks to accumulator,
+;; interleaving this computation with the loading and splatting
+;; of new data.
+;;
+;; It first multiplies all 16 blocks with powers of R (8 blocks from A0-A2
+;; and 8 blocks from B0-B2, multiplied by R0-R2)
+;;
+;;      a2      a1      a0
+;; x    b2      b1      b0
+;; ---------------------------------------
+;;     a2xb0   a1xb0   a0xb0
+;; +   a1xb1   a0xb1   5xa2xb1
+;; +   a0xb2   5xa2xb2 5xa1xb2
+;; ---------------------------------------
+;;        p2      p1      p0
+;;
+;; Then, it propagates the carry (higher bits after bit 43)
+;; from lower limbs into higher limbs,
+;; multiplying by 5 in case of the carry of p2, and adds
+;; the results to A0-A2 and B0-B2.
+;;
+;; =============================================================================
+;A0 [in/out] ZMM register containing 1st 44-bit limb of blocks 1-8
+;A1 [in/out] ZMM register containing 2nd 44-bit limb of blocks 1-8
+;A2 [in/out] ZMM register containing 3rd 44-bit limb of blocks 1-8
+;B0 [in/out] ZMM register containing 1st 44-bit limb of blocks 9-16
+;B1 [in/out] ZMM register containing 2nd 44-bit limb of blocks 9-16
+;B2 [in/out] ZMM register containing 3rd 44-bit limb of blocks 9-16
+;R0 [in] ZMM register (R0) to include the 1st limb of R
+;R1 [in] ZMM register (R1) to include the 2nd limb of R
+;R2 [in] ZMM register (R2) to include the 3rd limb of R
+;R1P [in] ZMM register (R1') to include the 2nd limb of R (multiplied by 5)
+;R2P [in] ZMM register (R2') to include the 3rd limb of R (multiplied by 5)
+;P0_L [clobbered] ZMM register to contain p[0] of the 8 blocks 1-8
+;P0_H [clobbered] ZMM register to contain p[0] of the 8 blocks 1-8
+;P1_L [clobbered] ZMM register to contain p[1] of the 8 blocks 1-8
+;P1_H [clobbered] ZMM register to contain p[1] of the 8 blocks 1-8
+;P2_L [clobbered] ZMM register to contain p[2] of the 8 blocks 1-8
+;P2_H [clobbered] ZMM register to contain p[2] of the 8 blocks 1-8
+;Q0_L [clobbered] ZMM register to contain p[0] of the 8 blocks 9-16
+;Q0_H [clobbered] ZMM register to contain p[0] of the 8 blocks 9-16
+;Q1_L [clobbered] ZMM register to contain p[1] of the 8 blocks 9-16
+;Q1_H [clobbered] ZMM register to contain p[1] of the 8 blocks 9-16
+;Q2_L [clobbered] ZMM register to contain p[2] of the 8 blocks 9-16
+;Q2_H [clobbered] ZMM register to contain p[2] of the 8 blocks 9-16
+;ZTMP1 [clobbered] Temporary ZMM register
+;ZTMP2 [clobbered] Temporary ZMM register
+;ZTMP3 [clobbered] Temporary ZMM register
+;ZTMP4 [clobbered] Temporary ZMM register
+;ZTMP5 [clobbered] Temporary ZMM register
+;ZTMP6 [clobbered] Temporary ZMM register
+;ZTMP7 [clobbered] Temporary ZMM register
+;ZTMP8 [clobbered] Temporary ZMM register
+;ZTMP9 [clobbered] Temporary ZMM register
+;MSG [in/out] Pointer to message
+;LEN [in/out] Length left of message
+*/
+#define POLY1305_MSG_MUL_REDUCE_VEC16(A0, A1, A2, B0, B1, B2, R0, R1, R2, R1P, \
+                                      R2P, P0_L, P0_H, P1_L, P1_H, P2_L, P2_H, \
+                                      Q0_L, Q0_H, Q1_L, Q1_H, Q2_L, Q2_H, \
+                                      ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, \
+                                      ZTMP6, ZTMP7, ZTMP8, ZTMP9, MSG, LEN) \
+  /* ;; Reset accumulator */ \
+  vpxorq P0_L, P0_L, P0_L; \
+  vpxorq P0_H, P0_H, P0_H; \
+  vpxorq P1_L, P1_L, P1_L; \
+  vpxorq P1_H, P1_H, P1_H; \
+  vpxorq P2_L, P2_L, P2_L; \
+  vpxorq P2_H, P2_H, P2_H; \
+  vpxorq Q0_L, Q0_L, Q0_L; \
+  vpxorq Q0_H, Q0_H, Q0_H; \
+  vpxorq Q1_L, Q1_L, Q1_L; \
+  vpxorq Q1_H, Q1_H, Q1_H; \
+  vpxorq Q2_L, Q2_L, Q2_L; \
+  vpxorq Q2_H, Q2_H, Q2_H; \
+  \
+  /* ;; This code interleaves hash computation with input loading/splatting */ \
+  \
+  /* ; Calculate products */ \
+  vpmadd52luq P0_L, A2, R1P; \
+  vpmadd52huq P0_H, A2, R1P; \
+  /* ;; input loading of new blocks */ \
+  add MSG, POLY1305_BLOCK_SIZE*16; \
+  sub LEN, POLY1305_BLOCK_SIZE*16; \
+  \
+  vpmadd52luq Q0_L, B2, R1P; \
+  vpmadd52huq Q0_H, B2, R1P; \
+  \
+  vpmadd52luq P1_L, A2, R2P; \
+  vpmadd52huq P1_H, A2, R2P; \
+  /* ; Load next block of data (128 bytes) */ \
+  vmovdqu64 ZTMP5, [MSG]; \
+  vmovdqu64 ZTMP2, [MSG + 64]; \
+  \
+  vpmadd52luq Q1_L, B2, R2P; \
+  vpmadd52huq Q1_H, B2, R2P; \
+  \
+  /* ; Interleave new blocks of data */ \
+  vpunpckhqdq ZTMP3, ZTMP5, ZTMP2; \
+  vpunpcklqdq ZTMP5, ZTMP5, ZTMP2; \
+  \
+  vpmadd52luq P0_L, A0, R0; \
+  vpmadd52huq P0_H, A0, R0; \
+  /* ; Highest 42-bit limbs of new blocks */ \
+  vpsrlq ZTMP6, ZTMP3, 24; \
+  vporq ZTMP6, ZTMP6, [.Lhigh_bit ADD_RIP]; /* ; Add 2^128 to all 8 final qwords of the message */ \
+  \
+  vpmadd52luq Q0_L, B0, R0; \
+  vpmadd52huq Q0_H, B0, R0; \
+  \
+  /* ; Middle 44-bit limbs of new blocks */ \
+  vpsrlq ZTMP2, ZTMP5, 44; \
+  vpsllq ZTMP4, ZTMP3, 20; \
+  \
+  vpmadd52luq P2_L, A2, R0; \
+  vpmadd52huq P2_H, A2, R0; \
+  vpternlogq ZTMP2, ZTMP4, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \
+  \
+  /* ; Lowest 44-bit limbs of new blocks */ \
+  vpandq ZTMP5, ZTMP5, [.Lmask_44 ADD_RIP]; \
+  \
+  vpmadd52luq Q2_L, B2, R0; \
+  vpmadd52huq Q2_H, B2, R0; \
+  \
+  /* ; Load next block of data (128 bytes) */ \
+  vmovdqu64 ZTMP8, [MSG + 64*2]; \
+  vmovdqu64 ZTMP9, [MSG + 64*3]; \
+  \
+  vpmadd52luq P1_L, A0, R1; \
+  vpmadd52huq P1_H, A0, R1; \
+  /* ; Interleave new blocks of data */ \
+  vpunpckhqdq ZTMP3, ZTMP8, ZTMP9; \
+  vpunpcklqdq ZTMP8, ZTMP8, ZTMP9; \
+  \
+  vpmadd52luq Q1_L, B0, R1; \
+  vpmadd52huq Q1_H, B0, R1; \
+  \
+  /* ; Highest 42-bit limbs of new blocks */ \
+  vpsrlq ZTMP7, ZTMP3, 24; \
+  vporq ZTMP7, ZTMP7, [.Lhigh_bit ADD_RIP]; /* ; Add 2^128 to all 8 final qwords of the message */ \
+  \
+  vpmadd52luq P0_L, A1, R2P; \
+  vpmadd52huq P0_H, A1, R2P; \
+  \
+  /* ; Middle 44-bit limbs of new blocks */ \
+  vpsrlq ZTMP9, ZTMP8, 44; \
+  vpsllq ZTMP4, ZTMP3, 20; \
+  \
+  vpmadd52luq Q0_L, B1, R2P; \
+  vpmadd52huq Q0_H, B1, R2P; \
+  \
+  vpternlogq ZTMP9, ZTMP4, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \
+  \
+  /* ; Lowest 44-bit limbs of new blocks */ \
+  vpandq ZTMP8, ZTMP8, [.Lmask_44 ADD_RIP]; \
+  \
+  vpmadd52luq P2_L, A0, R2; \
+  vpmadd52huq P2_H, A0, R2; \
+  /* ; Carry propagation (first pass) */ \
+  vpsrlq ZTMP1, P0_L, 44; \
+  vpsllq P0_H, P0_H, 8; \
+  vpmadd52luq Q2_L, B0, R2; \
+  vpmadd52huq Q2_H, B0, R2; \
+  \
+  vpsrlq ZTMP3, Q0_L, 44; \
+  vpsllq Q0_H, Q0_H, 8; \
+  \
+  vpmadd52luq P1_L, A1, R0; \
+  vpmadd52huq P1_H, A1, R0; \
+  /* ; Carry propagation (first pass) - continue */ \
+  vpandq A0, P0_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+  vpaddq P0_H, P0_H, ZTMP1; \
+  vpmadd52luq Q1_L, B1, R0; \
+  vpmadd52huq Q1_H, B1, R0; \
+  \
+  vpandq B0, Q0_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+  vpaddq Q0_H, Q0_H, ZTMP3; \
+  \
+  vpmadd52luq P2_L, A1, R1; \
+  vpmadd52huq P2_H, A1, R1; \
+  /* ; Carry propagation (first pass) - continue */ \
+  vpaddq P1_L, P1_L, P0_H; \
+  vpsllq P1_H, P1_H, 8; \
+  vpsrlq ZTMP1, P1_L, 44; \
+  vpmadd52luq Q2_L, B1, R1; \
+  vpmadd52huq Q2_H, B1, R1; \
+  \
+  vpandq A1, P1_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+  vpaddq Q1_L, Q1_L, Q0_H; \
+  vpsllq Q1_H, Q1_H, 8; \
+  vpsrlq ZTMP3, Q1_L, 44; \
+  vpandq B1, Q1_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+  \
+  vpaddq P2_L, P2_L, P1_H; /* ; P2_L += P1_H + P1_L[63:44] */ \
+  vpaddq P2_L, P2_L, ZTMP1; \
+  vpandq A2, P2_L, [.Lmask_42 ADD_RIP]; /* ; Clear top 22 bits */ \
+  vpaddq A2, A2, ZTMP6; /* ; Add highest bits from new blocks to accumulator */ \
+  vpsrlq ZTMP1, P2_L, 42; \
+  vpsllq P2_H, P2_H, 10; \
+  vpaddq P2_H, P2_H, ZTMP1; \
+  \
+  vpaddq Q2_L, Q2_L, Q1_H; /* ; Q2_L += Q1_H + Q1_L[63:44] */ \
+  vpaddq Q2_L, Q2_L, ZTMP3; \
+  vpandq B2, Q2_L, [.Lmask_42 ADD_RIP]; /* ; Clear top 22 bits */ \
+  vpaddq B2, B2, ZTMP7; /* ; Add highest bits from new blocks to accumulator */ \
+  vpsrlq ZTMP3, Q2_L, 42; \
+  vpsllq Q2_H, Q2_H, 10; \
+  vpaddq Q2_H, Q2_H, ZTMP3; \
+  \
+  /* ; Carry propagation (second pass) */ \
+  /* ; Multiply by 5 the highest bits (above 130 bits) */ \
+  vpaddq A0, A0, P2_H; \
+  vpsllq P2_H, P2_H, 2; \
+  vpaddq A0, A0, P2_H; \
+  vpaddq B0, B0, Q2_H; \
+  vpsllq Q2_H, Q2_H, 2; \
+  vpaddq B0, B0, Q2_H; \
+  \
+  vpsrlq ZTMP1, A0, 44; \
+  vpandq A0, A0, [.Lmask_44 ADD_RIP]; \
+  vpaddq A0, A0, ZTMP5; /* ; Add low 44-bit limbs from new blocks to accumulator */ \
+  vpaddq A1, A1, ZTMP2; /* ; Add middle 44-bit limbs from new blocks to accumulator */ \
+  vpaddq A1, A1, ZTMP1; \
+  vpsrlq ZTMP3, B0, 44; \
+  vpandq B0, B0, [.Lmask_44 ADD_RIP]; \
+  vpaddq B0, B0, ZTMP8; /* ; Add low 44-bit limbs from new blocks to accumulator */ \
+  vpaddq B1, B1, ZTMP9; /* ; Add middle 44-bit limbs from new blocks to accumulator */ \
+  vpaddq B1, B1, ZTMP3
+
+/*
+;; =============================================================================
+;; =============================================================================
+;; Computes hash for 16 16-byte message blocks.
+;;
+;; It first multiplies all 16 blocks with powers of R (8 blocks from A0-A2
+;; and 8 blocks from B0-B2, multiplied by R0-R2 and S0-S2)
+;;
+;;
+;;      a2      a1      a0
+;; x    b2      b1      b0
+;; ---------------------------------------
+;;     a2xb0   a1xb0   a0xb0
+;; +   a1xb1   a0xb1   5xa2xb1
+;; +   a0xb2   5xa2xb2 5xa1xb2
+;; ---------------------------------------
+;;        p2      p1      p0
+;;
+;; Then, it propagates the carry (higher bits after bit 43) from lower limbs into higher limbs,
+;; multiplying by 5 in case of the carry of p2.
+;;
+;; =============================================================================
+;A0 [in/out] ZMM register containing 1st 44-bit limb of the 8 blocks
+;A1 [in/out] ZMM register containing 2nd 44-bit limb of the 8 blocks
+;A2 [in/out] ZMM register containing 3rd 44-bit limb of the 8 blocks
+;B0 [in/out] ZMM register containing 1st 44-bit limb of the 8 blocks
+;B1 [in/out] ZMM register containing 2nd 44-bit limb of the 8 blocks
+;B2 [in/out] ZMM register containing 3rd 44-bit limb of the 8 blocks
+;R0 [in] ZMM register (R0) to include the 1st limb in IDX
+;R1 [in] ZMM register (R1) to include the 2nd limb in IDX
+;R2 [in] ZMM register (R2) to include the 3rd limb in IDX
+;R1P [in] ZMM register (R1') to include the 2nd limb (multiplied by 5) in IDX
+;R2P [in] ZMM register (R2') to include the 3rd limb (multiplied by 5) in IDX
+;S0 [in] ZMM register (S0) to include the 1st limb in IDX
+;S1 [in] ZMM register (S1) to include the 2nd limb in IDX
+;S2 [in] ZMM register (S2) to include the 3rd limb in IDX
+;S1P [in] ZMM register (S1') to include the 2nd limb (multiplied by 5) in IDX
+;S2P [in] ZMM register (S2') to include the 3rd limb (multiplied by 5) in IDX
+;P0_L [clobbered] ZMM register to contain p[0] of the 8 blocks
+;P0_H [clobbered] ZMM register to contain p[0] of the 8 blocks
+;P1_L [clobbered] ZMM register to contain p[1] of the 8 blocks
+;P1_H [clobbered] ZMM register to contain p[1] of the 8 blocks
+;P2_L [clobbered] ZMM register to contain p[2] of the 8 blocks
+;P2_H [clobbered] ZMM register to contain p[2] of the 8 blocks
+;Q0_L [clobbered] ZMM register to contain p[0] of the 8 blocks
+;Q0_H [clobbered] ZMM register to contain p[0] of the 8 blocks
+;Q1_L [clobbered] ZMM register to contain p[1] of the 8 blocks
+;Q1_H [clobbered] ZMM register to contain p[1] of the 8 blocks
+;Q2_L [clobbered] ZMM register to contain p[2] of the 8 blocks
+;Q2_H [clobbered] ZMM register to contain p[2] of the 8 blocks
+;ZTMP1 [clobbered] Temporary ZMM register
+;ZTMP2 [clobbered] Temporary ZMM register
+*/
+#define POLY1305_MUL_REDUCE_VEC16(A0, A1, A2, B0, B1, B2, R0, R1, R2, R1P, R2P,\
+                                  S0, S1, S2, S1P, S2P, P0_L, P0_H, P1_L, P1_H,\
+                                  P2_L, P2_H, Q0_L, Q0_H, Q1_L, Q1_H, Q2_L,\
+                                  Q2_H, ZTMP1, ZTMP2) \
+  /* ;; Reset accumulator */ \
+  vpxorq P0_L, P0_L, P0_L; \
+  vpxorq P0_H, P0_H, P0_H; \
+  vpxorq P1_L, P1_L, P1_L; \
+  vpxorq P1_H, P1_H, P1_H; \
+  vpxorq P2_L, P2_L, P2_L; \
+  vpxorq P2_H, P2_H, P2_H; \
+  vpxorq Q0_L, Q0_L, Q0_L; \
+  vpxorq Q0_H, Q0_H, Q0_H; \
+  vpxorq Q1_L, Q1_L, Q1_L; \
+  vpxorq Q1_H, Q1_H, Q1_H; \
+  vpxorq Q2_L, Q2_L, Q2_L; \
+  vpxorq Q2_H, Q2_H, Q2_H; \
+  \
+  /* ;; This code interleaves hash computation with input loading/splatting */ \
+  \
+  /* ; Calculate products */ \
+  vpmadd52luq P0_L, A2, R1P; \
+  vpmadd52huq P0_H, A2, R1P; \
+  \
+  vpmadd52luq Q0_L, B2, S1P; \
+  vpmadd52huq Q0_H, B2, S1P; \
+  \
+  vpmadd52luq P1_L, A2, R2P; \
+  vpmadd52huq P1_H, A2, R2P; \
+  \
+  vpmadd52luq Q1_L, B2, S2P; \
+  vpmadd52huq Q1_H, B2, S2P; \
+  \
+  vpmadd52luq P0_L, A0, R0; \
+  vpmadd52huq P0_H, A0, R0; \
+  \
+  vpmadd52luq Q0_L, B0, S0; \
+  vpmadd52huq Q0_H, B0, S0; \
+  \
+  vpmadd52luq P2_L, A2, R0; \
+  vpmadd52huq P2_H, A2, R0; \
+  vpmadd52luq Q2_L, B2, S0; \
+  vpmadd52huq Q2_H, B2, S0; \
+  \
+  vpmadd52luq P1_L, A0, R1; \
+  vpmadd52huq P1_H, A0, R1; \
+  vpmadd52luq Q1_L, B0, S1; \
+  vpmadd52huq Q1_H, B0, S1; \
+  \
+  vpmadd52luq P0_L, A1, R2P; \
+  vpmadd52huq P0_H, A1, R2P; \
+  \
+  vpmadd52luq Q0_L, B1, S2P; \
+  vpmadd52huq Q0_H, B1, S2P; \
+  \
+  vpmadd52luq P2_L, A0, R2; \
+  vpmadd52huq P2_H, A0, R2; \
+  \
+  vpmadd52luq Q2_L, B0, S2; \
+  vpmadd52huq Q2_H, B0, S2; \
+  \
+  /* ; Carry propagation (first pass) */ \
+  vpsrlq ZTMP1, P0_L, 44; \
+  vpsllq P0_H, P0_H, 8; \
+  vpsrlq ZTMP2, Q0_L, 44; \
+  vpsllq Q0_H, Q0_H, 8; \
+  \
+  vpmadd52luq P1_L, A1, R0; \
+  vpmadd52huq P1_H, A1, R0; \
+  vpmadd52luq Q1_L, B1, S0; \
+  vpmadd52huq Q1_H, B1, S0; \
+  \
+  /* ; Carry propagation (first pass) - continue */ \
+  vpandq A0, P0_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+  vpaddq P0_H, P0_H, ZTMP1; \
+  vpandq B0, Q0_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+  vpaddq Q0_H, Q0_H, ZTMP2; \
+  \
+  vpmadd52luq P2_L, A1, R1; \
+  vpmadd52huq P2_H, A1, R1; \
+  vpmadd52luq Q2_L, B1, S1; \
+  vpmadd52huq Q2_H, B1, S1; \
+  \
+  /* ; Carry propagation (first pass) - continue */ \
+  vpaddq P1_L, P1_L, P0_H; \
+  vpsllq P1_H, P1_H, 8; \
+  vpsrlq ZTMP1, P1_L, 44; \
+  vpandq A1, P1_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+  vpaddq Q1_L, Q1_L, Q0_H; \
+  vpsllq Q1_H, Q1_H, 8; \
+  vpsrlq ZTMP2, Q1_L, 44; \
+  vpandq B1, Q1_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+  \
+  vpaddq P2_L, P2_L, P1_H; /* ; P2_L += P1_H + P1_L[63:44] */ \
+  vpaddq P2_L, P2_L, ZTMP1; \
+  vpandq A2, P2_L, [.Lmask_42 ADD_RIP]; /* ; Clear top 22 bits */ \
+  vpsrlq ZTMP1, P2_L, 42; \
+  vpsllq P2_H, P2_H, 10; \
+  vpaddq P2_H, P2_H, ZTMP1; \
+  \
+  vpaddq Q2_L, Q2_L, Q1_H; /* ; Q2_L += Q1_H + Q1_L[63:44] */ \
+  vpaddq Q2_L, Q2_L, ZTMP2; \
+  vpandq B2, Q2_L, [.Lmask_42 ADD_RIP]; /* ; Clear top 22 bits */ \
+  vpsrlq ZTMP2, Q2_L, 42; \
+  vpsllq Q2_H, Q2_H, 10; \
+  vpaddq Q2_H, Q2_H, ZTMP2; \
+  \
+  /* ; Carry propagation (second pass) */ \
+  /* ; Multiply by 5 the highest bits (above 130 bits) */ \
+  vpaddq A0, A0, P2_H; \
+  vpsllq P2_H, P2_H, 2; \
+  vpaddq A0, A0, P2_H; \
+  vpaddq B0, B0, Q2_H; \
+  vpsllq Q2_H, Q2_H, 2; \
+  vpaddq B0, B0, Q2_H; \
+  \
+  vpsrlq ZTMP1, A0, 44; \
+  vpandq A0, A0, [.Lmask_44 ADD_RIP]; \
+  vpaddq A1, A1, ZTMP1; \
+  vpsrlq ZTMP2, B0, 44; \
+  vpandq B0, B0, [.Lmask_44 ADD_RIP]; \
+  vpaddq B1, B1, ZTMP2;
+
+/*
+;; =============================================================================
+;; =============================================================================
+;; Shuffle data blocks, so they match the right power of R.
+;; Powers of R are in this order: R^8 R^4 R^7 R^3 R^6 R^2 R^5 R +;; Data blocks are coming in this order: A0 A4 A1 A5 A2 A6 A3 A7 +;; Generally the computation is: A0*R^8 + A1*R^7 + A2*R^6 + A3*R^5 + +;; A4*R^4 + A5*R^3 + A6*R^2 + A7*R +;; When there are less data blocks, less powers of R are used, so data needs to +;; be shuffled. Example: if 4 blocks are left, only A0-A3 are available and only +;; R-R^4 are used (A0*R^4 + A1*R^3 + A2*R^2 + A3*R), so A0-A3 need to be shifted +;; ============================================================================= +;A_L [in/out] 0-43 bits of input data +;A_M [in/out] 44-87 bits of input data +;A_H [in/out] 88-129 bits of input data +;TMP [clobbered] Temporary GP register +;N_BLOCKS [in] Number of remaining input blocks +*/ +#define SHUFFLE_DATA_SMASK_1 0x39 +#define SHUFFLE_DATA_KMASK_1 0xffff +#define SHUFFLE_DATA_SMASK_2 0x4E +#define SHUFFLE_DATA_KMASK_2 0xffff +#define SHUFFLE_DATA_SMASK_3 0x93 +#define SHUFFLE_DATA_KMASK_3 0xffff +#define SHUFFLE_DATA_KMASK_4 0xffff +#define SHUFFLE_DATA_SMASK_5 0x39 +#define SHUFFLE_DATA_KMASK_5 0xfff0 +#define SHUFFLE_DATA_SMASK_6 0x4E +#define SHUFFLE_DATA_KMASK_6 0xff00 +#define SHUFFLE_DATA_SMASK_7 0x93 +#define SHUFFLE_DATA_KMASK_7 0xf000 + +#define SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, N_BLOCKS) \ + mov TMP, SHUFFLE_DATA_KMASK_##N_BLOCKS; \ + kmovq k1, TMP; \ + vpshufd A_L{k1}, A_L, 0x4E; \ + vpshufd A_M{k1}, A_M, 0x4E; \ + vpshufd A_H{k1}, A_H, 0x4E; \ + vshufi64x2 A_L, A_L, A_L, SHUFFLE_DATA_SMASK_##N_BLOCKS; \ + vshufi64x2 A_M, A_M, A_M, SHUFFLE_DATA_SMASK_##N_BLOCKS; \ + vshufi64x2 A_H, A_H, A_H, SHUFFLE_DATA_SMASK_##N_BLOCKS + +#define SHUFFLE_DATA_BLOCKS_1(A_L, A_M, A_H, TMP) \ + SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, 1) + +#define SHUFFLE_DATA_BLOCKS_2(A_L, A_M, A_H, TMP) \ + SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, 2) + +#define SHUFFLE_DATA_BLOCKS_3(A_L, A_M, A_H, TMP) \ + SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, 3) + +#define 
SHUFFLE_DATA_BLOCKS_4(A_L, A_M, A_H, TMP) \ + mov TMP, SHUFFLE_DATA_KMASK_4; \ + kmovq k1, TMP; \ + vpshufd A_L{k1}, A_L, 0x4E; \ + vpshufd A_M{k1}, A_M, 0x4E; \ + vpshufd A_H{k1}, A_H, 0x4E; + +#define SHUFFLE_DATA_BLOCKS_5(A_L, A_M, A_H, TMP) \ + SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, 5) + +#define SHUFFLE_DATA_BLOCKS_6(A_L, A_M, A_H, TMP) \ + SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, 6) + +#define SHUFFLE_DATA_BLOCKS_7(A_L, A_M, A_H, TMP) \ + SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, 7) + +/* +;; ============================================================================= +;; ============================================================================= +;; Computes hash for message length being multiple of block size +;; ============================================================================= +;MSG [in/out] GPR pointer to input message (updated) +;LEN [in/out] GPR in: length in bytes / out: length mod 16 +;A0 [in/out] accumulator bits 63..0 +;A1 [in/out] accumulator bits 127..64 +;A2 [in/out] accumulator bits 195..128 +;R0 [in] R constant bits 63..0 +;R1 [in] R constant bits 127..64 +;T0 [clobbered] GPR register +;T1 [clobbered] GPR register +;T2 [clobbered] GPR register +;T3 [clobbered] GPR register +;GP_RAX [clobbered] RAX register +;GP_RDX [clobbered] RDX register +*/ +#define POLY1305_BLOCKS(MSG, LEN, A0, A1, A2, R0, R1, T0, T1, T2, T3, \ + GP_RAX, GP_RDX) \ + /* ; Minimum of 256 bytes to run vectorized code */ \ + cmp LEN, POLY1305_BLOCK_SIZE*16; \ + jb .L_final_loop; \ + \ + /* ; Spread accumulator into 44-bit limbs in quadwords */ \ + mov T0, A0; \ + and T0, [.Lmask_44 ADD_RIP]; /* ;; First limb (A[43:0]) */ \ + vmovq xmm5, T0; \ + \ + mov T0, A1; \ + shrd A0, T0, 44; \ + and A0, [.Lmask_44 ADD_RIP]; /* ;; Second limb (A[77:52]) */ \ + vmovq xmm6, A0; \ + \ + shrd A1, A2, 24; \ + and A1, [.Lmask_42 ADD_RIP]; /* ;; Third limb (A[129:88]) */ \ + vmovq xmm7, A1; \ + \ + /* ; Load first block of data (128 bytes) */ \ + vmovdqu64 
zmm0, [MSG]; \ + vmovdqu64 zmm1, [MSG + 64]; \ + \ + /* ; Interleave the data to form 44-bit limbs */ \ + /* ; */ \ + /* ; zmm13 to have bits 0-43 of all 8 blocks in 8 qwords */ \ + /* ; zmm14 to have bits 87-44 of all 8 blocks in 8 qwords */ \ + /* ; zmm15 to have bits 127-88 of all 8 blocks in 8 qwords */ \ + vpunpckhqdq zmm15, zmm0, zmm1; \ + vpunpcklqdq zmm13, zmm0, zmm1; \ + \ + vpsrlq zmm14, zmm13, 44; \ + vpsllq zmm18, zmm15, 20; \ + vpternlogq zmm14, zmm18, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \ + \ + vpandq zmm13, zmm13, [.Lmask_44 ADD_RIP]; \ + vpsrlq zmm15, zmm15, 24; \ + \ + /* ; Add 2^128 to all 8 final qwords of the message */ \ + vporq zmm15, zmm15, [.Lhigh_bit ADD_RIP]; \ + \ + vpaddq zmm13, zmm13, zmm5; \ + vpaddq zmm14, zmm14, zmm6; \ + vpaddq zmm15, zmm15, zmm7; \ + \ + /* ; Load next blocks of data (128 bytes) */ \ + vmovdqu64 zmm0, [MSG + 64*2]; \ + vmovdqu64 zmm1, [MSG + 64*3]; \ + \ + /* ; Interleave the data to form 44-bit limbs */ \ + /* ; */ \ + /* ; zmm13 to have bits 0-43 of all 8 blocks in 8 qwords */ \ + /* ; zmm14 to have bits 87-44 of all 8 blocks in 8 qwords */ \ + /* ; zmm15 to have bits 127-88 of all 8 blocks in 8 qwords */ \ + vpunpckhqdq zmm18, zmm0, zmm1; \ + vpunpcklqdq zmm16, zmm0, zmm1; \ + \ + vpsrlq zmm17, zmm16, 44; \ + vpsllq zmm19, zmm18, 20; \ + vpternlogq zmm17, zmm19, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \ + \ + vpandq zmm16, zmm16, [.Lmask_44 ADD_RIP]; \ + vpsrlq zmm18, zmm18, 24; \ + \ + /* ; Add 2^128 to all 8 final qwords of the message */ \ + vporq zmm18, zmm18, [.Lhigh_bit ADD_RIP]; \ + \ + /* ; Use memory in stack to save powers of R, before loading them into ZMM registers */ \ + /* ; The first 16*8 bytes will contain the 16 bytes of the 8 powers of R */ \ + /* ; The last 64 bytes will contain the last 2 bits of powers of R, spread in 8 qwords, */ \ + /* ; to be OR'd with the highest qwords (in zmm26) */ \ + vmovq xmm3, R0; \ + vpinsrq xmm3, xmm3, R1, 1; \ + vinserti32x4 zmm1, zmm1, 
xmm3, 3; \ + \ + vpxorq zmm0, zmm0, zmm0; \ + vpxorq zmm2, zmm2, zmm2; \ + \ + /* ; Calculate R^2 */ \ + mov T0, R1; \ + shr T0, 2; \ + add T0, R1; /* ;; T0 = R1 + (R1 >> 2) */ \ + \ + mov A0, R0; \ + mov A1, R1; \ + \ + POLY1305_MUL_REDUCE(A0, A1, A2, R0, R1, T0, T1, T2, T3, GP_RAX, GP_RDX, A2_ZERO); \ + \ + vmovq xmm3, A0; \ + vpinsrq xmm3, xmm3, A1, 1; \ + vinserti32x4 zmm1, zmm1, xmm3, 2; \ + \ + vmovq xmm4, A2; \ + vinserti32x4 zmm2, zmm2, xmm4, 2; \ + \ + /* ; Calculate R^3 */ \ + POLY1305_MUL_REDUCE(A0, A1, A2, R0, R1, T0, T1, T2, T3, GP_RAX, GP_RDX, A2_NOT_ZERO); \ + \ + vmovq xmm3, A0; \ + vpinsrq xmm3, xmm3, A1, 1; \ + vinserti32x4 zmm1, zmm1, xmm3, 1; \ + \ + vmovq xmm4, A2; \ + vinserti32x4 zmm2, zmm2, xmm4, 1; \ + \ + /* ; Calculate R^4 */ \ + POLY1305_MUL_REDUCE(A0, A1, A2, R0, R1, T0, T1, T2, T3, GP_RAX, GP_RDX, A2_NOT_ZERO); \ + \ + vmovq xmm3, A0; \ + vpinsrq xmm3, xmm3, A1, 1; \ + vinserti32x4 zmm1, zmm1, xmm3, 0; \ + \ + vmovq xmm4, A2; \ + vinserti32x4 zmm2, zmm2, xmm4, 0; \ + \ + /* ; Move 2 MSbits to top 24 bits, to be OR'ed later */ \ + vpsllq zmm2, zmm2, 40; \ + \ + vpunpckhqdq zmm21, zmm1, zmm0; \ + vpunpcklqdq zmm19, zmm1, zmm0; \ + \ + vpsrlq zmm20, zmm19, 44; \ + vpsllq zmm4, zmm21, 20; \ + vpternlogq zmm20, zmm4, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \ + \ + vpandq zmm19, zmm19, [.Lmask_44 ADD_RIP]; \ + vpsrlq zmm21, zmm21, 24; \ + \ + /* ; zmm2 contains the 2 highest bits of the powers of R */ \ + vporq zmm21, zmm21, zmm2; \ + \ + /* ; Broadcast 44-bit limbs of R^4 */ \ + mov T0, A0; \ + and T0, [.Lmask_44 ADD_RIP]; /* ;; First limb (R^4[43:0]) */ \ + vpbroadcastq zmm22, T0; \ + \ + mov T0, A1; \ + shrd A0, T0, 44; \ + and A0, [.Lmask_44 ADD_RIP]; /* ;; Second limb (R^4[87:44]) */ \ + vpbroadcastq zmm23, A0; \ + \ + shrd A1, A2, 24; \ + and A1, [.Lmask_42 ADD_RIP]; /* ;; Third limb (R^4[129:88]) */ \ + vpbroadcastq zmm24, A1; \ + \ + /* ; Generate 4*5*R^4 */ \ + vpsllq zmm25, zmm23, 2; \ + vpsllq zmm26, zmm24, 2; \ + \ + /* 
; 5*R^4 */ \ + vpaddq zmm25, zmm25, zmm23; \ + vpaddq zmm26, zmm26, zmm24; \ + \ + /* ; 4*5*R^4 */ \ + vpsllq zmm25, zmm25, 2; \ + vpsllq zmm26, zmm26, 2; \ + \ + vpslldq zmm29, zmm19, 8; \ + vpslldq zmm30, zmm20, 8; \ + vpslldq zmm31, zmm21, 8; \ + \ + /* ; Calculate R^8-R^5 */ \ + POLY1305_MUL_REDUCE_VEC(zmm19, zmm20, zmm21, \ + zmm22, zmm23, zmm24, \ + zmm25, zmm26, \ + zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ + zmm11); \ + \ + /* ; Interleave powers of R: R^8 R^4 R^7 R^3 R^6 R^2 R^5 R */ \ + vporq zmm19, zmm19, zmm29; \ + vporq zmm20, zmm20, zmm30; \ + vporq zmm21, zmm21, zmm31; \ + \ + /* ; Broadcast R^8 */ \ + vpbroadcastq zmm22, xmm19; \ + vpbroadcastq zmm23, xmm20; \ + vpbroadcastq zmm24, xmm21; \ + \ + /* ; Generate 4*5*R^8 */ \ + vpsllq zmm25, zmm23, 2; \ + vpsllq zmm26, zmm24, 2; \ + \ + /* ; 5*R^8 */ \ + vpaddq zmm25, zmm25, zmm23; \ + vpaddq zmm26, zmm26, zmm24; \ + \ + /* ; 4*5*R^8 */ \ + vpsllq zmm25, zmm25, 2; \ + vpsllq zmm26, zmm26, 2; \ + \ + cmp LEN, POLY1305_BLOCK_SIZE*32; \ + jb .L_len_256_511; \ + \ + /* ; Store R^8-R for later use */ \ + vmovdqa64 [rsp + STACK_r_save], zmm19; \ + vmovdqa64 [rsp + STACK_r_save + 64], zmm20; \ + vmovdqa64 [rsp + STACK_r_save + 64*2], zmm21; \ + \ + /* ; Calculate R^16-R^9 */ \ + POLY1305_MUL_REDUCE_VEC(zmm19, zmm20, zmm21, \ + zmm22, zmm23, zmm24, \ + zmm25, zmm26, \ + zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ + zmm11); \ + \ + /* ; Store R^16-R^9 for later use */ \ + vmovdqa64 [rsp + STACK_r_save + 64*3], zmm19; \ + vmovdqa64 [rsp + STACK_r_save + 64*4], zmm20; \ + vmovdqa64 [rsp + STACK_r_save + 64*5], zmm21; \ + \ + /* ; Broadcast R^16 */ \ + vpbroadcastq zmm22, xmm19; \ + vpbroadcastq zmm23, xmm20; \ + vpbroadcastq zmm24, xmm21; \ + \ + /* ; Generate 4*5*R^16 */ \ + vpsllq zmm25, zmm23, 2; \ + vpsllq zmm26, zmm24, 2; \ + \ + /* ; 5*R^16 */ \ + vpaddq zmm25, zmm25, zmm23; \ + vpaddq zmm26, zmm26, zmm24; \ + \ + /* ; 4*5*R^16 */ \ + vpsllq zmm25, zmm25, 2; \ + vpsllq zmm26, zmm26, 2; \ + \ + mov T0, LEN; \ + and 
T0, 0xffffffffffffff00; /* ; multiple of 256 bytes */ \ + \ +.L_poly1305_blocks_loop: \ + cmp T0, POLY1305_BLOCK_SIZE*16; \ + jbe .L_poly1305_blocks_loop_end; \ + \ + /* ; zmm13-zmm18 contain the 16 blocks of message plus the previous accumulator */ \ + /* ; zmm22-24 contain the 5x44-bit limbs of the powers of R */ \ + /* ; zmm25-26 contain the 5x44-bit limbs of the powers of R' (5*4*R) */ \ + POLY1305_MSG_MUL_REDUCE_VEC16(zmm13, zmm14, zmm15, zmm16, zmm17, zmm18, \ + zmm22, zmm23, zmm24, zmm25, zmm26, \ + zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ + zmm19, zmm20, zmm21, zmm27, zmm28, zmm29, \ + zmm30, zmm31, zmm11, zmm0, zmm1, \ + zmm2, zmm3, zmm4, zmm12, MSG, T0); \ + \ + jmp .L_poly1305_blocks_loop; \ + \ +.L_poly1305_blocks_loop_end: \ + \ + /* ;; Need to multiply by r^16, r^15, r^14... r */ \ + \ + /* ; First multiply by r^16-r^9 */ \ + \ + /* ; Read R^16-R^9 */ \ + vmovdqa64 zmm19, [rsp + STACK_r_save + 64*3]; \ + vmovdqa64 zmm20, [rsp + STACK_r_save + 64*4]; \ + vmovdqa64 zmm21, [rsp + STACK_r_save + 64*5]; \ + /* ; Read R^8-R */ \ + vmovdqa64 zmm22, [rsp + STACK_r_save]; \ + vmovdqa64 zmm23, [rsp + STACK_r_save + 64]; \ + vmovdqa64 zmm24, [rsp + STACK_r_save + 64*2]; \ + \ + /* ; zmm27 to have bits 87-44 of all 9-16th powers of R' in 8 qwords */ \ + /* ; zmm28 to have bits 129-88 of all 9-16th powers of R' in 8 qwords */ \ + vpsllq zmm0, zmm20, 2; \ + vpaddq zmm27, zmm20, zmm0; /* ; R1' (R1*5) */ \ + vpsllq zmm1, zmm21, 2; \ + vpaddq zmm28, zmm21, zmm1; /* ; R2' (R2*5) */ \ + \ + /* ; 4*5*R */ \ + vpsllq zmm27, zmm27, 2; \ + vpsllq zmm28, zmm28, 2; \ + \ + /* ; Then multiply by r^8-r */ \ + \ + /* ; zmm25 to have bits 87-44 of all 1-8th powers of R' in 8 qwords */ \ + /* ; zmm26 to have bits 129-88 of all 1-8th powers of R' in 8 qwords */ \ + vpsllq zmm2, zmm23, 2; \ + vpaddq zmm25, zmm23, zmm2; /* ; R1' (R1*5) */ \ + vpsllq zmm3, zmm24, 2; \ + vpaddq zmm26, zmm24, zmm3; /* ; R2' (R2*5) */ \ + \ + /* ; 4*5*R */ \ + vpsllq zmm25, zmm25, 2; \ + vpsllq zmm26, 
zmm26, 2; \ + \ + POLY1305_MUL_REDUCE_VEC16(zmm13, zmm14, zmm15, zmm16, zmm17, zmm18, \ + zmm19, zmm20, zmm21, zmm27, zmm28, \ + zmm22, zmm23, zmm24, zmm25, zmm26, \ + zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, \ + zmm7, zmm8, zmm9, zmm10, zmm11, zmm12, zmm29); \ + \ + /* ;; Add all blocks (horizontally) */ \ + vpaddq zmm13, zmm13, zmm16; \ + vpaddq zmm14, zmm14, zmm17; \ + vpaddq zmm15, zmm15, zmm18; \ + \ + vextracti64x4 ymm0, zmm13, 1; \ + vextracti64x4 ymm1, zmm14, 1; \ + vextracti64x4 ymm2, zmm15, 1; \ + \ + vpaddq ymm13, ymm13, ymm0; \ + vpaddq ymm14, ymm14, ymm1; \ + vpaddq ymm15, ymm15, ymm2; \ + \ + vextracti32x4 xmm10, ymm13, 1; \ + vextracti32x4 xmm11, ymm14, 1; \ + vextracti32x4 xmm12, ymm15, 1; \ + \ + vpaddq xmm13, xmm13, xmm10; \ + vpaddq xmm14, xmm14, xmm11; \ + vpaddq xmm15, xmm15, xmm12; \ + \ + vpsrldq xmm10, xmm13, 8; \ + vpsrldq xmm11, xmm14, 8; \ + vpsrldq xmm12, xmm15, 8; \ + \ + /* ; Finish folding and clear second qword */ \ + mov T0, 0xfd; \ + kmovq k1, T0; \ + vpaddq xmm13{k1}{z}, xmm13, xmm10; \ + vpaddq xmm14{k1}{z}, xmm14, xmm11; \ + vpaddq xmm15{k1}{z}, xmm15, xmm12; \ + \ + add MSG, POLY1305_BLOCK_SIZE*16; \ + \ + and LEN, (POLY1305_BLOCK_SIZE*16 - 1); /* ; Get remaining lengths (LEN < 256 bytes) */ \ + \ +.L_less_than_256: \ + \ + cmp LEN, POLY1305_BLOCK_SIZE*8; \ + jb .L_less_than_128; \ + \ + /* ; Read next 128 bytes */ \ + /* ; Load first block of data (128 bytes) */ \ + vmovdqu64 zmm0, [MSG]; \ + vmovdqu64 zmm1, [MSG + 64]; \ + \ + /* ; Interleave the data to form 44-bit limbs */ \ + /* ; */ \ + /* ; zmm13 to have bits 0-43 of all 8 blocks in 8 qwords */ \ + /* ; zmm14 to have bits 87-44 of all 8 blocks in 8 qwords */ \ + /* ; zmm15 to have bits 127-88 of all 8 blocks in 8 qwords */ \ + vpunpckhqdq zmm5, zmm0, zmm1; \ + vpunpcklqdq zmm3, zmm0, zmm1; \ + \ + vpsrlq zmm4, zmm3, 44; \ + vpsllq zmm8, zmm5, 20; \ + vpternlogq zmm4, zmm8, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \ + \ + vpandq zmm3, zmm3, [.Lmask_44 ADD_RIP]; 
\ + vpsrlq zmm5, zmm5, 24; \ + \ + /* ; Add 2^128 to all 8 final qwords of the message */ \ + vporq zmm5, zmm5, [.Lhigh_bit ADD_RIP]; \ + \ + vpaddq zmm13, zmm13, zmm3; \ + vpaddq zmm14, zmm14, zmm4; \ + vpaddq zmm15, zmm15, zmm5; \ + \ + add MSG, POLY1305_BLOCK_SIZE*8; \ + sub LEN, POLY1305_BLOCK_SIZE*8; \ + \ + POLY1305_MUL_REDUCE_VEC(zmm13, zmm14, zmm15, \ + zmm22, zmm23, zmm24, \ + zmm25, zmm26, \ + zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ + zmm11); \ + \ + /* ;; Add all blocks (horizontally) */ \ + vextracti64x4 ymm0, zmm13, 1; \ + vextracti64x4 ymm1, zmm14, 1; \ + vextracti64x4 ymm2, zmm15, 1; \ + \ + vpaddq ymm13, ymm13, ymm0; \ + vpaddq ymm14, ymm14, ymm1; \ + vpaddq ymm15, ymm15, ymm2; \ + \ + vextracti32x4 xmm10, ymm13, 1; \ + vextracti32x4 xmm11, ymm14, 1; \ + vextracti32x4 xmm12, ymm15, 1; \ + \ + vpaddq xmm13, xmm13, xmm10; \ + vpaddq xmm14, xmm14, xmm11; \ + vpaddq xmm15, xmm15, xmm12; \ + \ + vpsrldq xmm10, xmm13, 8; \ + vpsrldq xmm11, xmm14, 8; \ + vpsrldq xmm12, xmm15, 8; \ + \ + /* ; Finish folding and clear second qword */ \ + mov T0, 0xfd; \ + kmovq k1, T0; \ + vpaddq xmm13{k1}{z}, xmm13, xmm10; \ + vpaddq xmm14{k1}{z}, xmm14, xmm11; \ + vpaddq xmm15{k1}{z}, xmm15, xmm12; \ + \ +.L_less_than_128: \ + cmp LEN, 32; /* ; If remaining bytes is <= 32, perform last blocks in scalar */ \ + jbe .L_simd_to_gp; \ + \ + mov T0, LEN; \ + and T0, 0x3f; \ + lea T1, [.Lbyte64_len_to_mask_table ADD_RIP]; \ + mov T1, [T1 + 8*T0]; \ + \ + /* ; Load default byte masks */ \ + mov T2, 0xffffffffffffffff; \ + xor T3, T3; \ + \ + cmp LEN, 64; \ + cmovb T2, T1; /* ; Load mask for first 64 bytes */ \ + cmovg T3, T1; /* ; Load mask for second 64 bytes */ \ + \ + kmovq k1, T2; \ + kmovq k2, T3; \ + vmovdqu8 zmm0{k1}{z}, [MSG]; \ + vmovdqu8 zmm1{k2}{z}, [MSG + 64]; \ + \ + /* ; Pad last block message, if partial */ \ + mov T0, LEN; \ + and T0, 0x70; /* ; Multiple of 16 bytes */ \ + /* ; Load last block of data (up to 112 bytes) */ \ + shr T0, 3; /* ; Get number of full 
qwords */ \ + \ + /* ; Interleave the data to form 44-bit limbs */ \ + /* ; */ \ + /* ; zmm13 to have bits 0-43 of all 8 blocks in 8 qwords */ \ + /* ; zmm14 to have bits 87-44 of all 8 blocks in 8 qwords */ \ + /* ; zmm15 to have bits 127-88 of all 8 blocks in 8 qwords */ \ + vpunpckhqdq zmm4, zmm0, zmm1; \ + vpunpcklqdq zmm2, zmm0, zmm1; \ + \ + vpsrlq zmm3, zmm2, 44; \ + vpsllq zmm28, zmm4, 20; \ + vpternlogq zmm3, zmm28, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \ + \ + vpandq zmm2, zmm2, [.Lmask_44 ADD_RIP]; \ + vpsrlq zmm4, zmm4, 24; \ + \ + lea T1, [.Lqword_high_bit_mask ADD_RIP]; \ + kmovb k1, [T1 + T0]; \ + /* ; Add 2^128 to final qwords of the message (all full blocks and partial block, */ \ + /* ; if "pad_to_16" is selected) */ \ + vporq zmm4{k1}, zmm4, [.Lhigh_bit ADD_RIP]; \ + \ + vpaddq zmm13, zmm13, zmm2; \ + vpaddq zmm14, zmm14, zmm3; \ + vpaddq zmm15, zmm15, zmm4; \ + \ + mov T0, LEN; \ + add T0, 15; \ + shr T0, 4; /* ; Get number of 16-byte blocks (including partial blocks) */ \ + xor LEN, LEN; /* ; All length will be consumed */ \ + \ + /* ; No need to shuffle data blocks (data is in the right order) */ \ + cmp T0, 8; \ + je .L_end_shuffle; \ + \ + cmp T0, 4; \ + je .L_shuffle_blocks_4; \ + jb .L_shuffle_blocks_3; \ + \ + /* ; Number of 16-byte blocks > 4 */ \ + cmp T0, 6; \ + je .L_shuffle_blocks_6; \ + ja .L_shuffle_blocks_7; \ + jmp .L_shuffle_blocks_5; \ + \ +.L_shuffle_blocks_3: \ + SHUFFLE_DATA_BLOCKS_3(zmm13, zmm14, zmm15, T1); \ + jmp .L_end_shuffle; \ +.L_shuffle_blocks_4: \ + SHUFFLE_DATA_BLOCKS_4(zmm13, zmm14, zmm15, T1); \ + jmp .L_end_shuffle; \ +.L_shuffle_blocks_5: \ + SHUFFLE_DATA_BLOCKS_5(zmm13, zmm14, zmm15, T1); \ + jmp .L_end_shuffle; \ +.L_shuffle_blocks_6: \ + SHUFFLE_DATA_BLOCKS_6(zmm13, zmm14, zmm15, T1); \ + jmp .L_end_shuffle; \ +.L_shuffle_blocks_7: \ + SHUFFLE_DATA_BLOCKS_7(zmm13, zmm14, zmm15, T1); \ + \ +.L_end_shuffle: \ + \ + /* ; zmm13-zmm15 contain the 8 blocks of message plus the previous accumulator */ 
\ + /* ; zmm22-24 contain the 3x44-bit limbs of the powers of R */ \ + /* ; zmm25-26 contain the 3x44-bit limbs of the powers of R' (5*4*R) */ \ + POLY1305_MUL_REDUCE_VEC(zmm13, zmm14, zmm15, \ + zmm22, zmm23, zmm24, \ + zmm25, zmm26, \ + zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ + zmm11); \ + \ + /* ;; Add all blocks (horizontally) */ \ + vextracti64x4 ymm0, zmm13, 1; \ + vextracti64x4 ymm1, zmm14, 1; \ + vextracti64x4 ymm2, zmm15, 1; \ + \ + vpaddq ymm13, ymm13, ymm0; \ + vpaddq ymm14, ymm14, ymm1; \ + vpaddq ymm15, ymm15, ymm2; \ + \ + vextracti32x4 xmm10, ymm13, 1; \ + vextracti32x4 xmm11, ymm14, 1; \ + vextracti32x4 xmm12, ymm15, 1; \ + \ + vpaddq xmm13, xmm13, xmm10; \ + vpaddq xmm14, xmm14, xmm11; \ + vpaddq xmm15, xmm15, xmm12; \ + \ + vpsrldq xmm10, xmm13, 8; \ + vpsrldq xmm11, xmm14, 8; \ + vpsrldq xmm12, xmm15, 8; \ + \ + vpaddq xmm13, xmm13, xmm10; \ + vpaddq xmm14, xmm14, xmm11; \ + vpaddq xmm15, xmm15, xmm12; \ + \ +.L_simd_to_gp: \ + /* ; Carry propagation */ \ + vpsrlq xmm0, xmm13, 44; \ + vpandq xmm13, xmm13, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \ + vpaddq xmm14, xmm14, xmm0; \ + vpsrlq xmm0, xmm14, 44; \ + vpandq xmm14, xmm14, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \ + vpaddq xmm15, xmm15, xmm0; \ + vpsrlq xmm0, xmm15, 42; \ + vpandq xmm15, xmm15, [.Lmask_42 ADD_RIP]; /* ; Clear top 22 bits */ \ + vpsllq xmm1, xmm0, 2; \ + vpaddq xmm0, xmm0, xmm1; \ + vpaddq xmm13, xmm13, xmm0; \ + \ + /* ; Put together A */ \ + vmovq A0, xmm13; \ + \ + vmovq T0, xmm14; \ + mov T1, T0; \ + shl T1, 44; \ + or A0, T1; \ + \ + shr T0, 20; \ + vmovq A2, xmm15; \ + mov A1, A2; \ + shl A1, 24; \ + or A1, T0; \ + shr A2, 40; \ + \ + /* ; Clear powers of R */ \ + vpxorq zmm0, zmm0, zmm0; \ + vmovdqa64 [rsp + STACK_r_save], zmm0; \ + vmovdqa64 [rsp + STACK_r_save + 64], zmm0; \ + vmovdqa64 [rsp + STACK_r_save + 64*2], zmm0; \ + vmovdqa64 [rsp + STACK_r_save + 64*3], zmm0; \ + vmovdqa64 [rsp + STACK_r_save + 64*4], zmm0; \ + vmovdqa64 [rsp + STACK_r_save + 
64*5], zmm0; \ + \ + vzeroall; \ + clear_zmm(xmm16); clear_zmm(xmm20); clear_zmm(xmm24); clear_zmm(xmm28); \ + clear_zmm(xmm17); clear_zmm(xmm21); clear_zmm(xmm25); clear_zmm(xmm29); \ + clear_zmm(xmm18); clear_zmm(xmm22); clear_zmm(xmm26); clear_zmm(xmm30); \ + clear_zmm(xmm19); clear_zmm(xmm23); clear_zmm(xmm27); clear_zmm(xmm31); \ + \ +.L_final_loop: \ + cmp LEN, POLY1305_BLOCK_SIZE; \ + jb .L_poly1305_blocks_exit; \ + \ + /* ;; A += MSG[i] */ \ + add A0, [MSG + 0]; \ + adc A1, [MSG + 8]; \ + adc A2, 1; /* ;; no padding bit */ \ + \ + mov T0, R1; \ + shr T0, 2; \ + add T0, R1; /* ;; T0 = R1 + (R1 >> 2) */ \ + \ + POLY1305_MUL_REDUCE(A0, A1, A2, R0, R1, \ + T0, T1, T2, T3, GP_RAX, GP_RDX, A2_NOT_ZERO); \ + \ + add MSG, POLY1305_BLOCK_SIZE; \ + sub LEN, POLY1305_BLOCK_SIZE; \ + \ + jmp .L_final_loop; \ + \ +.L_len_256_511: \ + \ + /* ; zmm13-zmm15 contain the 8 blocks of message plus the previous accumulator */ \ + /* ; zmm22-24 contain the 3x44-bit limbs of the powers of R */ \ + /* ; zmm25-26 contain the 3x44-bit limbs of the powers of R' (5*4*R) */ \ + POLY1305_MUL_REDUCE_VEC(zmm13, zmm14, zmm15, \ + zmm22, zmm23, zmm24, \ + zmm25, zmm26, \ + zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ + zmm11); \ + \ + /* ; Then multiply by r^8-r */ \ + \ + /* ; zmm19-zmm21 contains R^8-R, need to move it to zmm22-24, */ \ + /* ; as it might be used in other part of the code */ \ + vmovdqa64 zmm22, zmm19; \ + vmovdqa64 zmm23, zmm20; \ + vmovdqa64 zmm24, zmm21; \ + \ + /* ; zmm25 to have bits 87-44 of all 8 powers of R' in 8 qwords */ \ + /* ; zmm26 to have bits 129-88 of all 8 powers of R' in 8 qwords */ \ + vpsllq zmm0, zmm23, 2; \ + vpaddq zmm25, zmm23, zmm0; /* ; R1' (R1*5) */ \ + vpsllq zmm1, zmm24, 2; \ + vpaddq zmm26, zmm24, zmm1; /* ; R2' (R2*5) */ \ + \ + /* ; 4*5*R^8 */ \ + vpsllq zmm25, zmm25, 2; \ + vpsllq zmm26, zmm26, 2; \ + \ + vpaddq zmm13, zmm13, zmm16; \ + vpaddq zmm14, zmm14, zmm17; \ + vpaddq zmm15, zmm15, zmm18; \ + \ + /* ; zmm13-zmm15 contain the 8 blocks of 
message plus the previous accumulator */ \ + /* ; zmm22-24 contain the 3x44-bit limbs of the powers of R */ \ + /* ; zmm25-26 contain the 3x44-bit limbs of the powers of R' (5*4*R) */ \ + POLY1305_MUL_REDUCE_VEC(zmm13, zmm14, zmm15, \ + zmm22, zmm23, zmm24, \ + zmm25, zmm26, \ + zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ + zmm11); \ + \ + /* ;; Add all blocks (horizontally) */ \ + vextracti64x4 ymm0, zmm13, 1; \ + vextracti64x4 ymm1, zmm14, 1; \ + vextracti64x4 ymm2, zmm15, 1; \ + \ + vpaddq ymm13, ymm13, ymm0; \ + vpaddq ymm14, ymm14, ymm1; \ + vpaddq ymm15, ymm15, ymm2; \ + \ + vextracti32x4 xmm10, ymm13, 1; \ + vextracti32x4 xmm11, ymm14, 1; \ + vextracti32x4 xmm12, ymm15, 1; \ + \ + vpaddq xmm13, xmm13, xmm10; \ + vpaddq xmm14, xmm14, xmm11; \ + vpaddq xmm15, xmm15, xmm12; \ + \ + vpsrldq xmm10, xmm13, 8; \ + vpsrldq xmm11, xmm14, 8; \ + vpsrldq xmm12, xmm15, 8; \ + \ + /* ; Finish folding and clear second qword */ \ + mov T0, 0xfd; \ + kmovq k1, T0; \ + vpaddq xmm13{k1}{z}, xmm13, xmm10; \ + vpaddq xmm14{k1}{z}, xmm14, xmm11; \ + vpaddq xmm15{k1}{z}, xmm15, xmm12; \ + \ + add MSG, POLY1305_BLOCK_SIZE*16; \ + sub LEN, POLY1305_BLOCK_SIZE*16; \ + \ + jmp .L_less_than_256; \ +.L_poly1305_blocks_exit: \ + +/* +;; ============================================================================= +;; ============================================================================= +;; Creates stack frame and saves registers +;; ============================================================================= +*/ +#define FUNC_ENTRY() \ + mov rax, rsp; \ + CFI_DEF_CFA_REGISTER(rax); \ + sub rsp, STACK_SIZE; \ + and rsp, -64; \ + \ + mov [rsp + STACK_gpr_save + 8*0], rbx; \ + mov [rsp + STACK_gpr_save + 8*1], rbp; \ + mov [rsp + STACK_gpr_save + 8*2], r12; \ + mov [rsp + STACK_gpr_save + 8*3], r13; \ + mov [rsp + STACK_gpr_save + 8*4], r14; \ + mov [rsp + STACK_gpr_save + 8*5], r15; \ + mov [rsp + STACK_rsp_save], rax; \ + CFI_CFA_ON_STACK(STACK_rsp_save, 0) + +/* +;; 
============================================================================= +;; ============================================================================= +;; Restores registers and removes the stack frame +;; ============================================================================= +*/ +#define FUNC_EXIT() \ + mov rbx, [rsp + STACK_gpr_save + 8*0]; \ + mov rbp, [rsp + STACK_gpr_save + 8*1]; \ + mov r12, [rsp + STACK_gpr_save + 8*2]; \ + mov r13, [rsp + STACK_gpr_save + 8*3]; \ + mov r14, [rsp + STACK_gpr_save + 8*4]; \ + mov r15, [rsp + STACK_gpr_save + 8*5]; \ + mov rsp, [rsp + STACK_rsp_save]; \ + CFI_DEF_CFA_REGISTER(rsp) + +/* +;; ============================================================================= +;; ============================================================================= +;; void poly1305_aead_update_fma_avx512(const void *msg, const uint64_t msg_len, +;; void *hash, const void *key) +;; arg1 - Input message +;; arg2 - Message length +;; arg3 - Input/output hash +;; arg4 - Poly1305 key +*/ +.align 32 +.globl _gcry_poly1305_amd64_avx512_blocks +ELF(.type _gcry_poly1305_amd64_avx512_blocks,@function;) +_gcry_poly1305_amd64_avx512_blocks: + CFI_STARTPROC() + vpxord xmm16, xmm16, xmm16; + vpopcntb zmm16, zmm16; /* spec stop for old AVX512 CPUs */ + FUNC_ENTRY() + +#define _a0 gp3 +#define _a0 gp3 +#define _a1 gp4 +#define _a2 gp5 +#define _r0 gp6 +#define _r1 gp7 +#define _len arg2 +#define _arg3 arg4 /* ; use rcx, arg3 = rdx */ + + /* ;; load R */ + mov _r0, [arg4 + 0 * 8] + mov _r1, [arg4 + 1 * 8] + + /* ;; load accumulator / current hash value */ + /* ;; note: arg4 can't be used beyond this point */ + mov _arg3, arg3 /* ; note: _arg3 = arg4 (linux) */ + mov _a0, [_arg3 + 0 * 8] + mov _a1, [_arg3 + 1 * 8] + mov DWORD(_a2), [_arg3 + 2 * 8] /* ; note: _a2 = arg4 (win) */ + + POLY1305_BLOCKS(arg1, _len, _a0, _a1, _a2, _r0, _r1, + gp10, gp11, gp8, gp9, rax, rdx) + + /* ;; save accumulator back */ + mov [_arg3 + 0 * 8], _a0 + mov [_arg3 + 1 
* 8], _a1 + mov [_arg3 + 2 * 8], DWORD(_a2) + + FUNC_EXIT() + xor eax, eax + kmovw k1, eax + kmovw k2, eax + ret_spec_stop + CFI_ENDPROC() +ELF(.size _gcry_poly1305_amd64_avx512_blocks, + .-_gcry_poly1305_amd64_avx512_blocks;) + +#endif +#endif diff --git a/cipher/poly1305-internal.h b/cipher/poly1305-internal.h index 19cee5f6..9e01df46 100644 --- a/cipher/poly1305-internal.h +++ b/cipher/poly1305-internal.h @@ -1,64 +1,77 @@ /* poly1305-internal.h - Poly1305 internals * Copyright (C) 2014 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser general Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #ifndef G10_POLY1305_INTERNAL_H #define G10_POLY1305_INTERNAL_H #include #include #include #include #include "types.h" #include "g10lib.h" #include "cipher.h" #include "bufhelp.h" #define POLY1305_TAGLEN 16 #define POLY1305_KEYLEN 32 #define POLY1305_BLOCKSIZE 16 +/* POLY1305_USE_AVX512 indicates whether to compile with Intel AVX512 code. 
*/ +#undef POLY1305_USE_AVX512 +#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX512) && \ + defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ + (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) +# define POLY1305_USE_AVX512 1 +#endif + + typedef struct { u32 k[4]; u32 r[4]; u32 h[5]; } POLY1305_STATE; typedef struct poly1305_context_s { POLY1305_STATE state; byte buffer[POLY1305_BLOCKSIZE]; unsigned int leftover; +#ifdef POLY1305_USE_AVX512 + unsigned int use_avx512:1; +#endif } poly1305_context_t; gcry_err_code_t _gcry_poly1305_init (poly1305_context_t *ctx, const byte *key, size_t keylen); void _gcry_poly1305_finish (poly1305_context_t *ctx, byte mac[POLY1305_TAGLEN]); void _gcry_poly1305_update (poly1305_context_t *ctx, const byte *buf, size_t buflen); unsigned int _gcry_poly1305_update_burn (poly1305_context_t *ctx, const byte *m, size_t bytes); #endif /* G10_POLY1305_INTERNAL_H */ diff --git a/cipher/poly1305.c b/cipher/poly1305.c index e57e64f3..5482fc6a 100644 --- a/cipher/poly1305.c +++ b/cipher/poly1305.c @@ -1,763 +1,809 @@ /* poly1305.c - Poly1305 internals and generic implementation * Copyright (C) 2014,2017,2018 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser general Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
*/ #include #include #include #include #include "types.h" #include "g10lib.h" #include "cipher.h" #include "bufhelp.h" #include "poly1305-internal.h" #include "mpi-internal.h" #include "longlong.h" static const char *selftest (void); #undef HAVE_ASM_POLY1305_BLOCKS #undef USE_MPI_64BIT #undef USE_MPI_32BIT #if BYTES_PER_MPI_LIMB == 8 && defined(HAVE_TYPE_U64) # define USE_MPI_64BIT 1 #elif BYTES_PER_MPI_LIMB == 4 # define USE_MPI_32BIT 1 #else # error please implement for this limb size. #endif /* USE_S390X_ASM indicates whether to enable zSeries code. */ #undef USE_S390X_ASM #if BYTES_PER_MPI_LIMB == 8 # if defined (__s390x__) && __GNUC__ >= 4 && __ARCH__ >= 9 # if defined(HAVE_GCC_INLINE_ASM_S390X) # define USE_S390X_ASM 1 # endif /* USE_S390X_ASM */ # endif #endif +/* AMD64 Assembly implementations use SystemV ABI, ABI conversion and + * additional stack to store XMM6-XMM15 needed on Win64. */ +#undef ASM_FUNC_ABI +#undef ASM_FUNC_WRAPPER_ATTR +#if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) +# define ASM_FUNC_ABI __attribute__((sysv_abi)) +# define ASM_FUNC_WRAPPER_ATTR __attribute__((noinline)) +#else +# define ASM_FUNC_ABI +# define ASM_FUNC_WRAPPER_ATTR +#endif + + #ifdef USE_S390X_ASM #define HAVE_ASM_POLY1305_BLOCKS 1 extern unsigned int _gcry_poly1305_s390x_blocks1(void *state, const byte *buf, size_t len, byte high_pad); static unsigned int poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len, byte high_pad) { return _gcry_poly1305_s390x_blocks1(&ctx->state, buf, len, high_pad); } #endif /* USE_S390X_ASM */ +#ifdef POLY1305_USE_AVX512 + +extern unsigned int +_gcry_poly1305_amd64_avx512_blocks(const void *msg, const u64 msg_len, + void *hash, const void *key) ASM_FUNC_ABI; + +ASM_FUNC_WRAPPER_ATTR static unsigned int +poly1305_amd64_avx512_blocks(poly1305_context_t *ctx, const byte *buf, + size_t len) +{ + POLY1305_STATE *st = &ctx->state; + return _gcry_poly1305_amd64_avx512_blocks(buf, len, st->h, st->r); +} + +#endif /* 
POLY1305_USE_AVX512 */ + + static void poly1305_init (poly1305_context_t *ctx, const byte key[POLY1305_KEYLEN]) { POLY1305_STATE *st = &ctx->state; +#ifdef POLY1305_USE_AVX512 + ctx->use_avx512 = (_gcry_get_hw_features () & HWF_INTEL_AVX512) != 0; +#endif + ctx->leftover = 0; st->h[0] = 0; st->h[1] = 0; st->h[2] = 0; st->h[3] = 0; st->h[4] = 0; st->r[0] = buf_get_le32(key + 0) & 0x0fffffff; st->r[1] = buf_get_le32(key + 4) & 0x0ffffffc; st->r[2] = buf_get_le32(key + 8) & 0x0ffffffc; st->r[3] = buf_get_le32(key + 12) & 0x0ffffffc; st->k[0] = buf_get_le32(key + 16); st->k[1] = buf_get_le32(key + 20); st->k[2] = buf_get_le32(key + 24); st->k[3] = buf_get_le32(key + 28); } #ifdef USE_MPI_64BIT #if defined (__aarch64__) && defined(HAVE_CPU_ARCH_ARM) && __GNUC__ >= 4 /* A += B (armv8/aarch64) */ #define ADD_1305_64(A2, A1, A0, B2, B1, B0) \ __asm__ ("adds %0, %3, %0\n" \ "adcs %1, %4, %1\n" \ "adc %2, %5, %2\n" \ : "+r" (A0), "+r" (A1), "+r" (A2) \ : "r" (B0), "r" (B1), "r" (B2) \ : "cc" ) #endif /* __aarch64__ */ #if defined (__x86_64__) && defined(HAVE_CPU_ARCH_X86) && __GNUC__ >= 4 /* A += B (x86-64) */ #define ADD_1305_64(A2, A1, A0, B2, B1, B0) \ __asm__ ("addq %3, %0\n" \ "adcq %4, %1\n" \ "adcq %5, %2\n" \ : "+r" (A0), "+r" (A1), "+r" (A2) \ : "g" (B0), "g" (B1), "g" (B2) \ : "cc" ) #endif /* __x86_64__ */ #if defined (__powerpc__) && defined(HAVE_CPU_ARCH_PPC) && __GNUC__ >= 4 /* A += B (ppc64) */ #define ADD_1305_64(A2, A1, A0, B2, B1, B0) \ __asm__ ("addc %0, %3, %0\n" \ "adde %1, %4, %1\n" \ "adde %2, %5, %2\n" \ : "+r" (A0), "+r" (A1), "+r" (A2) \ : "r" (B0), "r" (B1), "r" (B2) \ : "cc" ) #endif /* __powerpc__ */ #ifndef ADD_1305_64 /* A += B (generic, mpi) */ # define ADD_1305_64(A2, A1, A0, B2, B1, B0) do { \ u64 carry; \ add_ssaaaa(carry, A0, 0, A0, 0, B0); \ add_ssaaaa(A2, A1, A2, A1, B2, B1); \ add_ssaaaa(A2, A1, A2, A1, 0, carry); \ } while (0) #endif /* H = H * R mod 2¹³⁰-5 */ #define MUL_MOD_1305_64(H2, H1, H0, R1, R0, R1_MULT5) do { \ u64 x0_lo, 
x0_hi, x1_lo, x1_hi; \ u64 t0_lo, t0_hi, t1_lo, t1_hi; \ \ /* x = a * r (partial mod 2^130-5) */ \ umul_ppmm(x0_hi, x0_lo, H0, R0); /* h0 * r0 */ \ umul_ppmm(x1_hi, x1_lo, H0, R1); /* h0 * r1 */ \ \ umul_ppmm(t0_hi, t0_lo, H1, R1_MULT5); /* h1 * r1 mod 2^130-5 */ \ add_ssaaaa(x0_hi, x0_lo, x0_hi, x0_lo, t0_hi, t0_lo); \ umul_ppmm(t1_hi, t1_lo, H1, R0); /* h1 * r0 */ \ add_ssaaaa(x1_hi, x1_lo, x1_hi, x1_lo, t1_hi, t1_lo); \ \ t1_lo = H2 * R1_MULT5; /* h2 * r1 mod 2^130-5 */ \ t1_hi = H2 * R0; /* h2 * r0 */ \ add_ssaaaa(H0, H1, x1_hi, x1_lo, t1_hi, t1_lo); \ \ /* carry propagation */ \ H2 = H0 & 3; \ H0 = (H0 >> 2) * 5; /* msb mod 2^130-5 */ \ ADD_1305_64(H2, H1, H0, (u64)0, x0_hi, x0_lo); \ } while (0) #ifndef HAVE_ASM_POLY1305_BLOCKS static unsigned int -poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len, - byte high_pad) +poly1305_blocks_generic (poly1305_context_t *ctx, const byte *buf, size_t len, + byte high_pad) { POLY1305_STATE *st = &ctx->state; u64 r0, r1, r1_mult5; u64 h0, h1, h2; u64 m0, m1, m2; m2 = high_pad; h0 = st->h[0] + ((u64)st->h[1] << 32); h1 = st->h[2] + ((u64)st->h[3] << 32); h2 = st->h[4]; r0 = st->r[0] + ((u64)st->r[1] << 32); r1 = st->r[2] + ((u64)st->r[3] << 32); r1_mult5 = (r1 >> 2) + r1; m0 = buf_get_le64(buf + 0); m1 = buf_get_le64(buf + 8); buf += POLY1305_BLOCKSIZE; len -= POLY1305_BLOCKSIZE; while (len >= POLY1305_BLOCKSIZE) { /* a = h + m */ ADD_1305_64(h2, h1, h0, m2, m1, m0); m0 = buf_get_le64(buf + 0); m1 = buf_get_le64(buf + 8); /* h = a * r (partial mod 2^130-5) */ MUL_MOD_1305_64(h2, h1, h0, r1, r0, r1_mult5); buf += POLY1305_BLOCKSIZE; len -= POLY1305_BLOCKSIZE; } /* a = h + m */ ADD_1305_64(h2, h1, h0, m2, m1, m0); /* h = a * r (partial mod 2^130-5) */ MUL_MOD_1305_64(h2, h1, h0, r1, r0, r1_mult5); st->h[0] = h0; st->h[1] = h0 >> 32; st->h[2] = h1; st->h[3] = h1 >> 32; st->h[4] = h2; return 6 * sizeof (void *) + 18 * sizeof (u64); } +static unsigned int +poly1305_blocks (poly1305_context_t *ctx, const byte 
*buf, size_t len, + byte high_pad) +{ +#ifdef POLY1305_USE_AVX512 + if ((high_pad & ctx->use_avx512) != 0) + return poly1305_amd64_avx512_blocks(ctx, buf, len); +#endif + + return poly1305_blocks_generic(ctx, buf, len, high_pad); +} + #endif /* !HAVE_ASM_POLY1305_BLOCKS */ static unsigned int poly1305_final (poly1305_context_t *ctx, byte mac[POLY1305_TAGLEN]) { POLY1305_STATE *st = &ctx->state; unsigned int burn = 0; u64 u, carry; u64 k0, k1; u64 h0, h1; u64 h2; /* process the remaining block */ if (ctx->leftover) { ctx->buffer[ctx->leftover++] = 1; if (ctx->leftover < POLY1305_BLOCKSIZE) { memset (&ctx->buffer[ctx->leftover], 0, POLY1305_BLOCKSIZE - ctx->leftover); ctx->leftover = POLY1305_BLOCKSIZE; } burn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 0); } h0 = st->h[0] + ((u64)st->h[1] << 32); h1 = st->h[2] + ((u64)st->h[3] << 32); h2 = st->h[4]; k0 = st->k[0] + ((u64)st->k[1] << 32); k1 = st->k[2] + ((u64)st->k[3] << 32); /* check if h is more than 2^130-5, by adding 5. */ add_ssaaaa(carry, u, 0, h0, 0, 5); add_ssaaaa(carry, u, 0, carry, 0, h1); u = (carry + h2) >> 2; /* u == 0 or 1 */ /* minus 2^130-5 ... 
(+5) */ u = (-u) & 5; add_ssaaaa(h1, h0, h1, h0, 0, u); /* add high part of key + h */ add_ssaaaa(h1, h0, h1, h0, k1, k0); buf_put_le64(mac + 0, h0); buf_put_le64(mac + 8, h1); /* burn_stack */ return 4 * sizeof (void *) + 7 * sizeof (u64) + burn; } #endif /* USE_MPI_64BIT */ #ifdef USE_MPI_32BIT #ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS /* HI:LO += A * B (arm) */ #define UMUL_ADD_32(HI, LO, A, B) \ __asm__ ("umlal %1, %0, %4, %5" \ : "=r" (HI), "=r" (LO) \ : "0" (HI), "1" (LO), "r" (A), "r" (B) ) /* A += B (arm) */ #ifdef __GCC_ASM_FLAG_OUTPUTS__ # define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) do { \ u32 __carry; \ __asm__ ("adds %0, %0, %5\n" \ "adcs %1, %1, %6\n" \ "adcs %2, %2, %7\n" \ "adcs %3, %3, %8\n" \ : "+r" (A0), "+r" (A1), "+r" (A2), "+r" (A3), \ "=@cccs" (__carry) \ : "r" (B0), "r" (B1), "r" (B2), "r" (B3) \ : ); \ (A4) += (B4) + __carry; \ } while (0) #else # define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) do { \ u32 __carry = (B0); \ __asm__ ("adds %0, %0, %2\n" \ "adcs %1, %1, %3\n" \ "rrx %2, %2\n" /* carry to 31th bit */ \ : "+r" (A0), "+r" (A1), "+r" (__carry) \ : "r" (B1), "r" (0) \ : "cc" ); \ __asm__ ("lsls %0, %0, #1\n" /* carry from 31th bit */ \ "adcs %1, %1, %4\n" \ "adcs %2, %2, %5\n" \ "adc %3, %3, %6\n" \ : "+r" (__carry), "+r" (A2), "+r" (A3), "+r" (A4) \ : "r" (B2), "r" (B3), "r" (B4) \ : "cc" ); \ } while (0) #endif #endif /* HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS */ #if defined (__i386__) && defined(HAVE_CPU_ARCH_X86) && __GNUC__ >= 5 /* Note: ADD_1305_32 below does not compile on GCC-4.7 */ /* A += B (i386) */ #define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) \ __asm__ ("addl %5, %0\n" \ "adcl %6, %1\n" \ "adcl %7, %2\n" \ "adcl %8, %3\n" \ "adcl %9, %4\n" \ : "+r" (A0), "+r" (A1), "+r" (A2), "+r" (A3), "+r" (A4) \ : "g" (B0), "g" (B1), "g" (B2), "g" (B3), "g" (B4) \ : "cc" ) #endif /* __i386__ */ #ifndef UMUL_ADD_32 /* HI:LO += A * B (generic, mpi) */ # define UMUL_ADD_32(HI, LO, A, B) do { \ u32 
t_lo, t_hi; \ umul_ppmm(t_hi, t_lo, A, B); \ add_ssaaaa(HI, LO, HI, LO, t_hi, t_lo); \ } while (0) #endif #ifndef ADD_1305_32 /* A += B (generic, mpi) */ # define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) do { \ u32 carry0, carry1, carry2; \ add_ssaaaa(carry0, A0, 0, A0, 0, B0); \ add_ssaaaa(carry1, A1, 0, A1, 0, B1); \ add_ssaaaa(carry1, A1, carry1, A1, 0, carry0); \ add_ssaaaa(carry2, A2, 0, A2, 0, B2); \ add_ssaaaa(carry2, A2, carry2, A2, 0, carry1); \ add_ssaaaa(A4, A3, A4, A3, B4, B3); \ add_ssaaaa(A4, A3, A4, A3, 0, carry2); \ } while (0) #endif /* H = H * R mod 2¹³⁰-5 */ #define MUL_MOD_1305_32(H4, H3, H2, H1, H0, R3, R2, R1, R0, \ R3_MULT5, R2_MULT5, R1_MULT5) do { \ u32 x0_lo, x0_hi, x1_lo, x1_hi, x2_lo, x2_hi, x3_lo, x3_hi; \ u32 t0_lo, t0_hi; \ \ /* x = a * r (partial mod 2^130-5) */ \ umul_ppmm(x0_hi, x0_lo, H0, R0); /* h0 * r0 */ \ umul_ppmm(x1_hi, x1_lo, H0, R1); /* h0 * r1 */ \ umul_ppmm(x2_hi, x2_lo, H0, R2); /* h0 * r2 */ \ umul_ppmm(x3_hi, x3_lo, H0, R3); /* h0 * r3 */ \ \ UMUL_ADD_32(x0_hi, x0_lo, H1, R3_MULT5); /* h1 * r3 mod 2^130-5 */ \ UMUL_ADD_32(x1_hi, x1_lo, H1, R0); /* h1 * r0 */ \ UMUL_ADD_32(x2_hi, x2_lo, H1, R1); /* h1 * r1 */ \ UMUL_ADD_32(x3_hi, x3_lo, H1, R2); /* h1 * r2 */ \ \ UMUL_ADD_32(x0_hi, x0_lo, H2, R2_MULT5); /* h2 * r2 mod 2^130-5 */ \ UMUL_ADD_32(x1_hi, x1_lo, H2, R3_MULT5); /* h2 * r3 mod 2^130-5 */ \ UMUL_ADD_32(x2_hi, x2_lo, H2, R0); /* h2 * r0 */ \ UMUL_ADD_32(x3_hi, x3_lo, H2, R1); /* h2 * r1 */ \ \ UMUL_ADD_32(x0_hi, x0_lo, H3, R1_MULT5); /* h3 * r1 mod 2^130-5 */ \ H1 = x0_hi; \ UMUL_ADD_32(x1_hi, x1_lo, H3, R2_MULT5); /* h3 * r2 mod 2^130-5 */ \ UMUL_ADD_32(x2_hi, x2_lo, H3, R3_MULT5); /* h3 * r3 mod 2^130-5 */ \ UMUL_ADD_32(x3_hi, x3_lo, H3, R0); /* h3 * r0 */ \ \ t0_lo = H4 * R1_MULT5; /* h4 * r1 mod 2^130-5 */ \ t0_hi = H4 * R2_MULT5; /* h4 * r2 mod 2^130-5 */ \ add_ssaaaa(H2, x1_lo, x1_hi, x1_lo, 0, t0_lo); \ add_ssaaaa(H3, x2_lo, x2_hi, x2_lo, 0, t0_hi); \ t0_lo = H4 * R3_MULT5; /* h4 * r3 mod 
2^130-5 */ \ t0_hi = H4 * R0; /* h4 * r0 */ \ add_ssaaaa(H4, x3_lo, x3_hi, x3_lo, t0_hi, t0_lo); \ \ /* carry propagation */ \ H0 = (H4 >> 2) * 5; /* msb mod 2^130-5 */ \ H4 = H4 & 3; \ ADD_1305_32(H4, H3, H2, H1, H0, 0, x3_lo, x2_lo, x1_lo, x0_lo); \ } while (0) #ifndef HAVE_ASM_POLY1305_BLOCKS static unsigned int poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len, byte high_pad) { POLY1305_STATE *st = &ctx->state; u32 r1_mult5, r2_mult5, r3_mult5; u32 h0, h1, h2, h3, h4; u32 m0, m1, m2, m3, m4; m4 = high_pad; h0 = st->h[0]; h1 = st->h[1]; h2 = st->h[2]; h3 = st->h[3]; h4 = st->h[4]; r1_mult5 = (st->r[1] >> 2) + st->r[1]; r2_mult5 = (st->r[2] >> 2) + st->r[2]; r3_mult5 = (st->r[3] >> 2) + st->r[3]; while (len >= POLY1305_BLOCKSIZE) { m0 = buf_get_le32(buf + 0); m1 = buf_get_le32(buf + 4); m2 = buf_get_le32(buf + 8); m3 = buf_get_le32(buf + 12); /* a = h + m */ ADD_1305_32(h4, h3, h2, h1, h0, m4, m3, m2, m1, m0); /* h = a * r (partial mod 2^130-5) */ MUL_MOD_1305_32(h4, h3, h2, h1, h0, st->r[3], st->r[2], st->r[1], st->r[0], r3_mult5, r2_mult5, r1_mult5); buf += POLY1305_BLOCKSIZE; len -= POLY1305_BLOCKSIZE; } st->h[0] = h0; st->h[1] = h1; st->h[2] = h2; st->h[3] = h3; st->h[4] = h4; return 6 * sizeof (void *) + 28 * sizeof (u32); } #endif /* !HAVE_ASM_POLY1305_BLOCKS */ static unsigned int poly1305_final (poly1305_context_t *ctx, byte mac[POLY1305_TAGLEN]) { POLY1305_STATE *st = &ctx->state; unsigned int burn = 0; u32 carry, tmp0, tmp1, tmp2, u; u32 h4, h3, h2, h1, h0; /* process the remaining block */ if (ctx->leftover) { ctx->buffer[ctx->leftover++] = 1; if (ctx->leftover < POLY1305_BLOCKSIZE) { memset (&ctx->buffer[ctx->leftover], 0, POLY1305_BLOCKSIZE - ctx->leftover); ctx->leftover = POLY1305_BLOCKSIZE; } burn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 0); } h0 = st->h[0]; h1 = st->h[1]; h2 = st->h[2]; h3 = st->h[3]; h4 = st->h[4]; /* check if h is more than 2^130-5, by adding 5. 
*/ add_ssaaaa(carry, tmp0, 0, h0, 0, 5); add_ssaaaa(carry, tmp0, 0, carry, 0, h1); add_ssaaaa(carry, tmp0, 0, carry, 0, h2); add_ssaaaa(carry, tmp0, 0, carry, 0, h3); u = (carry + h4) >> 2; /* u == 0 or 1 */ /* minus 2^130-5 ... (+5) */ u = (-u) & 5; add_ssaaaa(carry, h0, 0, h0, 0, u); add_ssaaaa(carry, h1, 0, h1, 0, carry); add_ssaaaa(carry, h2, 0, h2, 0, carry); add_ssaaaa(carry, h3, 0, h3, 0, carry); /* add high part of key + h */ add_ssaaaa(tmp0, h0, 0, h0, 0, st->k[0]); add_ssaaaa(tmp1, h1, 0, h1, 0, st->k[1]); add_ssaaaa(tmp1, h1, tmp1, h1, 0, tmp0); add_ssaaaa(tmp2, h2, 0, h2, 0, st->k[2]); add_ssaaaa(tmp2, h2, tmp2, h2, 0, tmp1); add_ssaaaa(carry, h3, 0, h3, 0, st->k[3]); h3 += tmp2; buf_put_le32(mac + 0, h0); buf_put_le32(mac + 4, h1); buf_put_le32(mac + 8, h2); buf_put_le32(mac + 12, h3); /* burn_stack */ return 4 * sizeof (void *) + 10 * sizeof (u32) + burn; } #endif /* USE_MPI_32BIT */ unsigned int _gcry_poly1305_update_burn (poly1305_context_t *ctx, const byte *m, size_t bytes) { unsigned int burn = 0; /* handle leftover */ if (ctx->leftover) { size_t want = (POLY1305_BLOCKSIZE - ctx->leftover); if (want > bytes) want = bytes; buf_cpy (ctx->buffer + ctx->leftover, m, want); bytes -= want; m += want; ctx->leftover += want; if (ctx->leftover < POLY1305_BLOCKSIZE) return 0; burn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 1); ctx->leftover = 0; } /* process full blocks */ if (bytes >= POLY1305_BLOCKSIZE) { size_t nblks = bytes / POLY1305_BLOCKSIZE; burn = poly1305_blocks (ctx, m, nblks * POLY1305_BLOCKSIZE, 1); m += nblks * POLY1305_BLOCKSIZE; bytes -= nblks * POLY1305_BLOCKSIZE; } /* store leftover */ if (bytes) { buf_cpy (ctx->buffer + ctx->leftover, m, bytes); ctx->leftover += bytes; } return burn; } void _gcry_poly1305_update (poly1305_context_t *ctx, const byte *m, size_t bytes) { unsigned int burn; burn = _gcry_poly1305_update_burn (ctx, m, bytes); if (burn) _gcry_burn_stack (burn); } void _gcry_poly1305_finish (poly1305_context_t *ctx, 
byte mac[POLY1305_TAGLEN]) { unsigned int burn; burn = poly1305_final (ctx, mac); _gcry_burn_stack (burn); } gcry_err_code_t _gcry_poly1305_init (poly1305_context_t * ctx, const byte * key, size_t keylen) { static int initialized; static const char *selftest_failed; if (!initialized) { initialized = 1; selftest_failed = selftest (); if (selftest_failed) log_error ("Poly1305 selftest failed (%s)\n", selftest_failed); } if (keylen != POLY1305_KEYLEN) return GPG_ERR_INV_KEYLEN; if (selftest_failed) return GPG_ERR_SELFTEST_FAILED; poly1305_init (ctx, key); return 0; } static void poly1305_auth (byte mac[POLY1305_TAGLEN], const byte * m, size_t bytes, const byte * key) { poly1305_context_t ctx; memset (&ctx, 0, sizeof (ctx)); _gcry_poly1305_init (&ctx, key, POLY1305_KEYLEN); _gcry_poly1305_update (&ctx, m, bytes); _gcry_poly1305_finish (&ctx, mac); wipememory (&ctx, sizeof (ctx)); } static const char * selftest (void) { /* example from nacl */ static const byte nacl_key[POLY1305_KEYLEN] = { 0xee, 0xa6, 0xa7, 0x25, 0x1c, 0x1e, 0x72, 0x91, 0x6d, 0x11, 0xc2, 0xcb, 0x21, 0x4d, 0x3c, 0x25, 0x25, 0x39, 0x12, 0x1d, 0x8e, 0x23, 0x4e, 0x65, 0x2d, 0x65, 0x1f, 0xa4, 0xc8, 0xcf, 0xf8, 0x80, }; static const byte nacl_msg[131] = { 0x8e, 0x99, 0x3b, 0x9f, 0x48, 0x68, 0x12, 0x73, 0xc2, 0x96, 0x50, 0xba, 0x32, 0xfc, 0x76, 0xce, 0x48, 0x33, 0x2e, 0xa7, 0x16, 0x4d, 0x96, 0xa4, 0x47, 0x6f, 0xb8, 0xc5, 0x31, 0xa1, 0x18, 0x6a, 0xc0, 0xdf, 0xc1, 0x7c, 0x98, 0xdc, 0xe8, 0x7b, 0x4d, 0xa7, 0xf0, 0x11, 0xec, 0x48, 0xc9, 0x72, 0x71, 0xd2, 0xc2, 0x0f, 0x9b, 0x92, 0x8f, 0xe2, 0x27, 0x0d, 0x6f, 0xb8, 0x63, 0xd5, 0x17, 0x38, 0xb4, 0x8e, 0xee, 0xe3, 0x14, 0xa7, 0xcc, 0x8a, 0xb9, 0x32, 0x16, 0x45, 0x48, 0xe5, 0x26, 0xae, 0x90, 0x22, 0x43, 0x68, 0x51, 0x7a, 0xcf, 0xea, 0xbd, 0x6b, 0xb3, 0x73, 0x2b, 0xc0, 0xe9, 0xda, 0x99, 0x83, 0x2b, 0x61, 0xca, 0x01, 0xb6, 0xde, 0x56, 0x24, 0x4a, 0x9e, 0x88, 0xd5, 0xf9, 0xb3, 0x79, 0x73, 0xf6, 0x22, 0xa4, 0x3d, 0x14, 0xa6, 0x59, 0x9b, 0x1f, 0x65, 0x4c, 0xb4, 0x5a, 0x74, 
0xe3, 0x55, 0xa5 }; static const byte nacl_mac[16] = { 0xf3, 0xff, 0xc7, 0x70, 0x3f, 0x94, 0x00, 0xe5, 0x2a, 0x7d, 0xfb, 0x4b, 0x3d, 0x33, 0x05, 0xd9 }; /* generates a final value of (2^130 - 2) == 3 */ static const byte wrap_key[POLY1305_KEYLEN] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; static const byte wrap_msg[16] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; static const byte wrap_mac[16] = { 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; /* mac of the macs of messages of length 0 to 256, where the key and messages * have all their values set to the length */ static const byte total_key[POLY1305_KEYLEN] = { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; static const byte total_mac[16] = { 0x64, 0xaf, 0xe2, 0xe8, 0xd6, 0xad, 0x7b, 0xbd, 0xd2, 0x87, 0xf9, 0x7c, 0x44, 0x62, 0x3d, 0x39 }; poly1305_context_t ctx; poly1305_context_t total_ctx; byte all_key[POLY1305_KEYLEN]; byte all_msg[256]; byte mac[16]; size_t i, j; memset (&ctx, 0, sizeof (ctx)); memset (&total_ctx, 0, sizeof (total_ctx)); memset (mac, 0, sizeof (mac)); poly1305_auth (mac, nacl_msg, sizeof (nacl_msg), nacl_key); if (memcmp (nacl_mac, mac, sizeof (nacl_mac)) != 0) return "Poly1305 test 1 failed."; /* SSE2/AVX have a 32 byte block size, but also support 64 byte blocks, so * make sure everything still works varying between them */ memset (mac, 0, sizeof (mac)); _gcry_poly1305_init (&ctx, nacl_key, POLY1305_KEYLEN); _gcry_poly1305_update (&ctx, nacl_msg + 0, 32); _gcry_poly1305_update (&ctx, nacl_msg + 32, 64); _gcry_poly1305_update (&ctx, nacl_msg + 96, 16); _gcry_poly1305_update (&ctx, nacl_msg + 112, 8); 
_gcry_poly1305_update (&ctx, nacl_msg + 120, 4); _gcry_poly1305_update (&ctx, nacl_msg + 124, 2); _gcry_poly1305_update (&ctx, nacl_msg + 126, 1); _gcry_poly1305_update (&ctx, nacl_msg + 127, 1); _gcry_poly1305_update (&ctx, nacl_msg + 128, 1); _gcry_poly1305_update (&ctx, nacl_msg + 129, 1); _gcry_poly1305_update (&ctx, nacl_msg + 130, 1); _gcry_poly1305_finish (&ctx, mac); if (memcmp (nacl_mac, mac, sizeof (nacl_mac)) != 0) return "Poly1305 test 2 failed."; memset (mac, 0, sizeof (mac)); poly1305_auth (mac, wrap_msg, sizeof (wrap_msg), wrap_key); if (memcmp (wrap_mac, mac, sizeof (nacl_mac)) != 0) return "Poly1305 test 3 failed."; _gcry_poly1305_init (&total_ctx, total_key, POLY1305_KEYLEN); for (i = 0; i < 256; i++) { /* set key and message to 'i,i,i..' */ for (j = 0; j < sizeof (all_key); j++) all_key[j] = i; for (j = 0; j < i; j++) all_msg[j] = i; poly1305_auth (mac, all_msg, i, all_key); _gcry_poly1305_update (&total_ctx, mac, 16); } _gcry_poly1305_finish (&total_ctx, mac); if (memcmp (total_mac, mac, sizeof (total_mac)) != 0) return "Poly1305 test 4 failed."; return NULL; } diff --git a/configure.ac b/configure.ac index fc49bb86..eb149a51 100644 --- a/configure.ac +++ b/configure.ac @@ -1,3348 +1,3351 @@ # Configure.ac script for Libgcrypt # Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2006, # 2007, 2008, 2009, 2011 Free Software Foundation, Inc. # Copyright (C) 2012-2021 g10 Code GmbH # # This file is part of Libgcrypt. # # Libgcrypt is free software; you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation; either version 2.1 of # the License, or (at your option) any later version. # # Libgcrypt is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. 
# # You should have received a copy of the GNU Lesser General Public # License along with this program; if not, see . # (Process this file with autoconf to produce a configure script.) AC_REVISION($Revision$) AC_PREREQ([2.69]) min_automake_version="1.14" # To build a release you need to create a tag with the version number # (git tag -s libgcrypt-n.m.k) and run "./autogen.sh --force". Please # bump the version number immediately after the release and do another # commit and push so that the git magic is able to work. See below # for the LT versions. m4_define([mym4_package],[libgcrypt]) m4_define([mym4_major], [1]) m4_define([mym4_minor], [11]) m4_define([mym4_micro], [0]) # Below is m4 magic to extract and compute the git revision number, # the decimalized short revision number, a beta version string and a # flag indicating a development version (mym4_isbeta). Note that the # m4 processing is done by autoconf and not during the configure run. m4_define([mym4_verslist], m4_split(m4_esyscmd([./autogen.sh --find-version] \ mym4_package mym4_major mym4_minor mym4_micro),[:])) m4_define([mym4_isbeta], m4_argn(2, mym4_verslist)) m4_define([mym4_version], m4_argn(4, mym4_verslist)) m4_define([mym4_revision], m4_argn(7, mym4_verslist)) m4_define([mym4_revision_dec], m4_argn(8, mym4_verslist)) m4_esyscmd([echo ]mym4_version[>VERSION]) AC_INIT([mym4_package],[mym4_version],[https://bugs.gnupg.org]) # LT Version numbers, remember to change them just *before* a release. 
# NOTE NOTE - Already updated for a 1.11 series - NOTE NOTE # (Code changed: REVISION++) # (Interfaces added/removed/changed: CURRENT++, REVISION=0) # (Interfaces added: AGE++) # (Interfaces removed: AGE=0) # # (Interfaces removed: CURRENT++, AGE=0, REVISION=0) # (Interfaces added: CURRENT++, AGE++, REVISION=0) # (No interfaces changed: REVISION++) LIBGCRYPT_LT_CURRENT=25 LIBGCRYPT_LT_AGE=5 LIBGCRYPT_LT_REVISION=0 ################################################ AC_SUBST(LIBGCRYPT_LT_CURRENT) AC_SUBST(LIBGCRYPT_LT_AGE) AC_SUBST(LIBGCRYPT_LT_REVISION) # If the API is changed in an incompatible way: increment the next counter. # # 1.6: ABI and API change but the change is to most users irrelevant # and thus the API version number has not been incremented. LIBGCRYPT_CONFIG_API_VERSION=1 # If you change the required gpg-error version, please remove # unnecessary error code defines in src/gcrypt-int.h. NEED_GPG_ERROR_VERSION=1.27 AC_CONFIG_AUX_DIR([build-aux]) AC_CONFIG_SRCDIR([src/libgcrypt.vers]) AM_INIT_AUTOMAKE([serial-tests dist-bzip2]) AC_CONFIG_HEADERS([config.h]) AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_LIBOBJ_DIR([compat]) AC_CANONICAL_HOST AM_MAINTAINER_MODE AM_SILENT_RULES AC_ARG_VAR(SYSROOT,[locate config scripts also below that directory]) AH_TOP([ #ifndef _GCRYPT_CONFIG_H_INCLUDED #define _GCRYPT_CONFIG_H_INCLUDED /* Enable gpg-error's strerror macro for W32CE. */ #define GPG_ERR_ENABLE_ERRNO_MACROS 1 ]) AH_BOTTOM([ #define _GCRYPT_IN_LIBGCRYPT 1 /* Add .note.gnu.property section for Intel CET in assembler sources when CET is enabled. */ #if defined(__ASSEMBLER__) && defined(__CET__) # include #endif /* If the configure check for endianness has been disabled, get it from OS macros. This is intended for making fat binary builds on OS X. 
*/ #ifdef DISABLED_ENDIAN_CHECK # if defined(__BIG_ENDIAN__) # define WORDS_BIGENDIAN 1 # elif defined(__LITTLE_ENDIAN__) # undef WORDS_BIGENDIAN # else # error "No endianness found" # endif #endif /*DISABLED_ENDIAN_CHECK*/ /* We basically use the original Camellia source. Make sure the symbols properly prefixed. */ #define CAMELLIA_EXT_SYM_PREFIX _gcry_ #endif /*_GCRYPT_CONFIG_H_INCLUDED*/ ]) AH_VERBATIM([_REENTRANT], [/* To allow the use of Libgcrypt in multithreaded programs we have to use special features from the library. */ #ifndef _REENTRANT # define _REENTRANT 1 #endif ]) ###################### ## Basic checks. ### (we need some results later on (e.g. $GCC) ###################### AC_PROG_MAKE_SET missing_dir=`cd $ac_aux_dir && pwd` AM_MISSING_PROG(ACLOCAL, aclocal, $missing_dir) AM_MISSING_PROG(AUTOCONF, autoconf, $missing_dir) AM_MISSING_PROG(AUTOMAKE, automake, $missing_dir) AM_MISSING_PROG(AUTOHEADER, autoheader, $missing_dir) # AM_MISSING_PROG(MAKEINFO, makeinfo, $missing_dir) AC_PROG_CC AC_PROG_CPP AM_PROG_CC_C_O AM_PROG_AS AC_SEARCH_LIBS([strerror],[cposix]) AC_PROG_INSTALL AC_PROG_AWK AC_USE_SYSTEM_EXTENSIONS # Taken from mpfr-4.0.1, then modified for LDADD_FOR_TESTS_KLUDGE dnl Under Linux, make sure that the old dtags are used if LD_LIBRARY_PATH dnl is defined. The issue is that with the new dtags, LD_LIBRARY_PATH has dnl the precedence over the run path, so that if a compatible MPFR library dnl is installed in some directory from $LD_LIBRARY_PATH, then the tested dnl MPFR library will be this library instead of the MPFR library from the dnl build tree. Other OS with the same issue might be added later. dnl dnl References: dnl https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=859732 dnl http://lists.gnu.org/archive/html/libtool/2017-05/msg00000.html dnl dnl We need to check whether --disable-new-dtags is supported as alternate dnl linkers may be used (e.g., with tcc: CC=tcc LD=tcc). 
dnl case $host in *-*-linux*) if test -n "$LD_LIBRARY_PATH"; then saved_LDFLAGS="$LDFLAGS" LDADD_FOR_TESTS_KLUDGE="-Wl,--disable-new-dtags" LDFLAGS="$LDFLAGS $LDADD_FOR_TESTS_KLUDGE" AC_MSG_CHECKING(whether --disable-new-dtags is supported by the linker) AC_LINK_IFELSE([AC_LANG_SOURCE([[ int main (void) { return 0; } ]])], [AC_MSG_RESULT(yes (use it since LD_LIBRARY_PATH is set))], [AC_MSG_RESULT(no) LDADD_FOR_TESTS_KLUDGE="" ]) LDFLAGS="$saved_LDFLAGS" fi ;; esac AC_SUBST([LDADD_FOR_TESTS_KLUDGE]) VERSION_NUMBER=m4_esyscmd(printf "0x%02x%02x%02x" mym4_major \ mym4_minor mym4_micro) AC_SUBST(VERSION_NUMBER) # We need to compile and run a program on the build machine. AX_CC_FOR_BUILD LT_PREREQ([2.2.6]) LT_INIT([win32-dll disable-static]) LT_LANG([Windows Resource]) ########################## ## General definitions. ## ########################## # Used by libgcrypt-config LIBGCRYPT_CONFIG_LIBS="-lgcrypt" LIBGCRYPT_CONFIG_CFLAGS="" LIBGCRYPT_CONFIG_HOST="$host" # Definitions for symmetric ciphers. available_ciphers="arcfour blowfish cast5 des aes twofish serpent rfc2268 seed" available_ciphers="$available_ciphers camellia idea salsa20 gost28147 chacha20" available_ciphers="$available_ciphers sm4" enabled_ciphers="" # Definitions for public-key ciphers. available_pubkey_ciphers="dsa elgamal rsa ecc" enabled_pubkey_ciphers="" # Definitions for message digests. available_digests="crc gostr3411-94 md2 md4 md5 rmd160 sha1 sha256 sha512" available_digests="$available_digests sha3 tiger whirlpool stribog blake2" available_digests="$available_digests sm3" enabled_digests="" # Definitions for kdfs (optional ones) available_kdfs="s2k pkdf2 scrypt" enabled_kdfs="" # Definitions for random modules. available_random_modules="getentropy linux egd unix" auto_random_modules="$available_random_modules" # Supported thread backends. LIBGCRYPT_THREAD_MODULES="" # Other definitions. have_w32_system=no have_w32ce_system=no have_pthread=no # Setup some stuff depending on host. 
case "${host}" in *-*-mingw32*) ac_cv_have_dev_random=no have_w32_system=yes case "${host}" in *-mingw32ce*) have_w32ce_system=yes available_random_modules="w32ce" ;; *) available_random_modules="w32" ;; esac AC_DEFINE(USE_ONLY_8DOT3,1, [set this to limit filenames to the 8.3 format]) AC_DEFINE(HAVE_DRIVE_LETTERS,1, [defined if we must run on a stupid file system]) AC_DEFINE(HAVE_DOSISH_SYSTEM,1, [defined if we run on some of the PCDOS like systems (DOS, Windoze. OS/2) with special properties like no file modes]) ;; i?86-emx-os2 | i?86-*-os2*emx) # OS/2 with the EMX environment ac_cv_have_dev_random=no AC_DEFINE(HAVE_DRIVE_LETTERS) AC_DEFINE(HAVE_DOSISH_SYSTEM) ;; i?86-*-msdosdjgpp*) # DOS with the DJGPP environment ac_cv_have_dev_random=no AC_DEFINE(HAVE_DRIVE_LETTERS) AC_DEFINE(HAVE_DOSISH_SYSTEM) ;; *-*-hpux*) if test -z "$GCC" ; then CFLAGS="$CFLAGS -Ae -D_HPUX_SOURCE" fi ;; *-dec-osf4*) if test -z "$GCC" ; then # Suppress all warnings # to get rid of the unsigned/signed char mismatch warnings. CFLAGS="$CFLAGS -w" fi ;; m68k-atari-mint) ;; *-apple-darwin*) AC_DEFINE(_DARWIN_C_SOURCE, 1, Expose all libc features (__DARWIN_C_FULL).) AC_DEFINE(USE_POSIX_SPAWN_FOR_TESTS, 1, [defined if we use posix_spawn in test program]) AC_CHECK_HEADERS(spawn.h) ;; *) ;; esac if test "$have_w32_system" = yes; then AC_DEFINE(HAVE_W32_SYSTEM,1, [Defined if we run on a W32 API based system]) if test "$have_w32ce_system" = yes; then AC_DEFINE(HAVE_W32CE_SYSTEM,1,[Defined if we run on WindowsCE]) fi fi AM_CONDITIONAL(HAVE_W32_SYSTEM, test "$have_w32_system" = yes) AM_CONDITIONAL(HAVE_W32CE_SYSTEM, test "$have_w32ce_system" = yes) # A printable OS Name is sometimes useful. 
case "${host}" in *-*-mingw32ce*) PRINTABLE_OS_NAME="W32CE" ;; *-*-mingw32*) PRINTABLE_OS_NAME="W32" ;; i?86-emx-os2 | i?86-*-os2*emx ) PRINTABLE_OS_NAME="OS/2" ;; i?86-*-msdosdjgpp*) PRINTABLE_OS_NAME="MSDOS/DJGPP" ;; *-linux*) PRINTABLE_OS_NAME="GNU/Linux" ;; *) PRINTABLE_OS_NAME=`uname -s || echo "Unknown"` ;; esac NAME_OF_DEV_RANDOM="/dev/random" NAME_OF_DEV_URANDOM="/dev/urandom" AC_ARG_ENABLE(endian-check, AS_HELP_STRING([--disable-endian-check], [disable the endian check and trust the OS provided macros]), endiancheck=$enableval,endiancheck=yes) if test x"$endiancheck" = xyes ; then AC_C_BIGENDIAN else AC_DEFINE(DISABLED_ENDIAN_CHECK,1,[configure did not test for endianness]) fi AC_CHECK_SIZEOF(unsigned short, 2) AC_CHECK_SIZEOF(unsigned int, 4) AC_CHECK_SIZEOF(unsigned long, 4) AC_CHECK_SIZEOF(unsigned long long, 0) AC_CHECK_SIZEOF(void *, 0) AC_TYPE_UINTPTR_T if test "$ac_cv_sizeof_unsigned_short" = "0" \ || test "$ac_cv_sizeof_unsigned_int" = "0" \ || test "$ac_cv_sizeof_unsigned_long" = "0"; then AC_MSG_WARN([Hmmm, something is wrong with the sizes - using defaults]); fi # Ensure that we have UINT64_C before we bother to check for uint64_t AC_CACHE_CHECK([for UINT64_C],[gnupg_cv_uint64_c_works], AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include ]], [[uint64_t foo=UINT64_C(42);]])], gnupg_cv_uint64_c_works=yes,gnupg_cv_uint64_c_works=no)) if test "$gnupg_cv_uint64_c_works" = "yes" ; then AC_CHECK_SIZEOF(uint64_t) fi # Do we have any 64-bit data types? if test "$ac_cv_sizeof_unsigned_int" != "8" \ && test "$ac_cv_sizeof_unsigned_long" != "8" \ && test "$ac_cv_sizeof_unsigned_long_long" != "8" \ && test "$ac_cv_sizeof_uint64_t" != "8"; then AC_MSG_ERROR([[ *** *** No 64-bit integer type available. *** It is not possible to build Libgcrypt on this platform. ***]]) fi # If not specified otherwise, all available algorithms will be # included. 
default_ciphers="$available_ciphers" default_pubkey_ciphers="$available_pubkey_ciphers" default_digests="$available_digests" default_kdfs="$available_kdfs" # Blacklist MD2 by default default_digests=`echo $default_digests | sed -e 's/md2//g'` # Substitutions to set generated files in a Emacs buffer to read-only. AC_SUBST(emacs_local_vars_begin, ['Local Variables:']) AC_SUBST(emacs_local_vars_read_only, ['buffer-read-only: t']) AC_SUBST(emacs_local_vars_end, ['End:']) ############################ ## Command line switches. ## ############################ # Implementation of the --enable-ciphers switch. AC_ARG_ENABLE(ciphers, AS_HELP_STRING([--enable-ciphers=ciphers], [select the symmetric ciphers to include]), [enabled_ciphers=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`], [enabled_ciphers=""]) if test "x$enabled_ciphers" = "x" \ -o "$enabled_ciphers" = "yes" \ -o "$enabled_ciphers" = "no"; then enabled_ciphers=$default_ciphers fi AC_MSG_CHECKING([which symmetric ciphers to include]) for cipher in $enabled_ciphers; do LIST_MEMBER($cipher, $available_ciphers) if test "$found" = "0"; then AC_MSG_ERROR([unsupported cipher "$cipher" specified]) fi done AC_MSG_RESULT([$enabled_ciphers]) # Implementation of the --enable-pubkey-ciphers switch. AC_ARG_ENABLE(pubkey-ciphers, AS_HELP_STRING([--enable-pubkey-ciphers=ciphers], [select the public-key ciphers to include]), [enabled_pubkey_ciphers=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`], [enabled_pubkey_ciphers=""]) if test "x$enabled_pubkey_ciphers" = "x" \ -o "$enabled_pubkey_ciphers" = "yes" \ -o "$enabled_pubkey_ciphers" = "no"; then enabled_pubkey_ciphers=$default_pubkey_ciphers fi AC_MSG_CHECKING([which public-key ciphers to include]) for cipher in $enabled_pubkey_ciphers; do LIST_MEMBER($cipher, $available_pubkey_ciphers) if test "$found" = "0"; then AC_MSG_ERROR([unsupported public-key cipher specified]) fi done AC_MSG_RESULT([$enabled_pubkey_ciphers]) # Implementation of the --enable-digests switch. 
AC_ARG_ENABLE(digests, AS_HELP_STRING([--enable-digests=digests], [select the message digests to include]), [enabled_digests=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`], [enabled_digests=""]) if test "x$enabled_digests" = "x" \ -o "$enabled_digests" = "yes" \ -o "$enabled_digests" = "no"; then enabled_digests=$default_digests fi AC_MSG_CHECKING([which message digests to include]) for digest in $enabled_digests; do LIST_MEMBER($digest, $available_digests) if test "$found" = "0"; then AC_MSG_ERROR([unsupported message digest specified]) fi done AC_MSG_RESULT([$enabled_digests]) # Implementation of the --enable-kdfs switch. AC_ARG_ENABLE(kdfs, AS_HELP_STRING([--enable-kdfs=kdfs], [select the KDFs to include]), [enabled_kdfs=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`], [enabled_kdfs=""]) if test "x$enabled_kdfs" = "x" \ -o "$enabled_kdfs" = "yes" \ -o "$enabled_kdfs" = "no"; then enabled_kdfs=$default_kdfs fi AC_MSG_CHECKING([which key derivation functions to include]) for kdf in $enabled_kdfs; do LIST_MEMBER($kdf, $available_kdfs) if test "$found" = "0"; then AC_MSG_ERROR([unsupported key derivation function specified]) fi done AC_MSG_RESULT([$enabled_kdfs]) # Implementation of the --enable-random switch. AC_ARG_ENABLE(random, AS_HELP_STRING([--enable-random=name], [select which random number generator to use]), [random=`echo $enableval | tr '[A-Z]' '[a-z]'`], []) if test "x$random" = "x" -o "$random" = "yes" -o "$random" = "no"; then random=default fi AC_MSG_CHECKING([which random module to use]) if test "$random" != "default" -a "$random" != "auto"; then LIST_MEMBER($random, $available_random_modules) if test "$found" = "0"; then AC_MSG_ERROR([unsupported random module specified]) fi fi AC_MSG_RESULT($random) # Implementation of the --disable-dev-random switch. 
AC_MSG_CHECKING([whether use of /dev/random is requested]) AC_ARG_ENABLE(dev-random, [ --disable-dev-random disable the use of dev random], try_dev_random=$enableval, try_dev_random=yes) AC_MSG_RESULT($try_dev_random) # Implementation of the --with-egd-socket switch. AC_ARG_WITH(egd-socket, [ --with-egd-socket=NAME Use NAME for the EGD socket)], egd_socket_name="$withval", egd_socket_name="" ) AC_DEFINE_UNQUOTED(EGD_SOCKET_NAME, "$egd_socket_name", [Define if you don't want the default EGD socket name. For details see cipher/rndegd.c]) # Implementation of --disable-asm. AC_MSG_CHECKING([whether MPI and cipher assembler modules are requested]) AC_ARG_ENABLE([asm], AS_HELP_STRING([--disable-asm], [Disable MPI and cipher assembler modules]), [try_asm_modules=$enableval], [try_asm_modules=yes]) AC_MSG_RESULT($try_asm_modules) if test "$try_asm_modules" != yes ; then AC_DEFINE(ASM_DISABLED,1,[Defined if --disable-asm was used to configure]) fi # Implementation of the --enable-large-data-tests switch. AC_MSG_CHECKING([whether to run large data tests]) AC_ARG_ENABLE(large-data-tests, AS_HELP_STRING([--enable-large-data-tests], [Enable the real long running large data tests]), large_data_tests=$enableval,large_data_tests=no) AC_MSG_RESULT($large_data_tests) AC_SUBST(RUN_LARGE_DATA_TESTS, $large_data_tests) # Implementation of --enable-force-soft-hwfeatures AC_MSG_CHECKING([whether 'soft' HW feature bits are forced on]) AC_ARG_ENABLE([force-soft-hwfeatures], AS_HELP_STRING([--enable-force-soft-hwfeatures], [Enable forcing 'soft' HW feature bits on]), [force_soft_hwfeatures=$enableval], [force_soft_hwfeatures=no]) AC_MSG_RESULT($force_soft_hwfeatures) # Implementation of the --with-capabilities switch. 
# Check whether we want to use Linux capabilities AC_MSG_CHECKING([whether use of capabilities is requested]) AC_ARG_WITH(capabilities, AS_HELP_STRING([--with-capabilities], [Use linux capabilities [default=no]]), [use_capabilities="$withval"],[use_capabilities=no]) AC_MSG_RESULT($use_capabilities) # Implementation of the --enable-hmac-binary-check. AC_MSG_CHECKING([whether a HMAC binary check is requested]) AC_ARG_ENABLE(hmac-binary-check, AS_HELP_STRING([--enable-hmac-binary-check], [Enable library integrity check]), [use_hmac_binary_check="$enableval"], [use_hmac_binary_check=no]) AC_MSG_RESULT($use_hmac_binary_check) if test "$use_hmac_binary_check" = no ; then DEF_HMAC_BINARY_CHECK='' else AC_DEFINE(ENABLE_HMAC_BINARY_CHECK,1, [Define to support an HMAC based integrity check]) AC_CHECK_TOOL(OBJCOPY, [objcopy]) AC_CHECK_TOOL(READELF, [readelf]) if test "$use_hmac_binary_check" != yes ; then DEF_HMAC_BINARY_CHECK=-DKEY_FOR_BINARY_CHECK="'\"$use_hmac_binary_check\"'" fi fi AM_CONDITIONAL(USE_HMAC_BINARY_CHECK, test "x$use_hmac_binary_check" != xno) AC_SUBST(DEF_HMAC_BINARY_CHECK) # Implementation of the --with-fips-module-version. AC_ARG_WITH(fips-module-version, AS_HELP_STRING([--with-fips-module-version=VERSION], [Specify the FIPS module version for the build]), fips_module_version="$withval", fips_module_version="" ) AC_DEFINE_UNQUOTED(FIPS_MODULE_VERSION, "$fips_module_version", [Define FIPS module version for certification]) # Implementation of the --disable-jent-support switch. AC_MSG_CHECKING([whether jitter entropy support is requested]) AC_ARG_ENABLE(jent-support, AS_HELP_STRING([--disable-jent-support], [Disable support for the Jitter entropy collector]), jentsupport=$enableval,jentsupport=yes) AC_MSG_RESULT($jentsupport) # Implementation of the --disable-padlock-support switch. 
AC_MSG_CHECKING([whether padlock support is requested]) AC_ARG_ENABLE(padlock-support, AS_HELP_STRING([--disable-padlock-support], [Disable support for the PadLock Engine of VIA processors]), padlocksupport=$enableval,padlocksupport=yes) AC_MSG_RESULT($padlocksupport) # Implementation of the --disable-aesni-support switch. AC_MSG_CHECKING([whether AESNI support is requested]) AC_ARG_ENABLE(aesni-support, AS_HELP_STRING([--disable-aesni-support], [Disable support for the Intel AES-NI instructions]), aesnisupport=$enableval,aesnisupport=yes) AC_MSG_RESULT($aesnisupport) # Implementation of the --disable-shaext-support switch. AC_MSG_CHECKING([whether SHAEXT support is requested]) AC_ARG_ENABLE(shaext-support, AS_HELP_STRING([--disable-shaext-support], [Disable support for the Intel SHAEXT instructions]), shaextsupport=$enableval,shaextsupport=yes) AC_MSG_RESULT($shaextsupport) # Implementation of the --disable-pclmul-support switch. AC_MSG_CHECKING([whether PCLMUL support is requested]) AC_ARG_ENABLE(pclmul-support, AS_HELP_STRING([--disable-pclmul-support], [Disable support for the Intel PCLMUL instructions]), pclmulsupport=$enableval,pclmulsupport=yes) AC_MSG_RESULT($pclmulsupport) # Implementation of the --disable-sse41-support switch. AC_MSG_CHECKING([whether SSE4.1 support is requested]) AC_ARG_ENABLE(sse41-support, AS_HELP_STRING([--disable-sse41-support], [Disable support for the Intel SSE4.1 instructions]), sse41support=$enableval,sse41support=yes) AC_MSG_RESULT($sse41support) # Implementation of the --disable-drng-support switch. AC_MSG_CHECKING([whether DRNG support is requested]) AC_ARG_ENABLE(drng-support, AS_HELP_STRING([--disable-drng-support], [Disable support for the Intel DRNG (RDRAND instruction)]), drngsupport=$enableval,drngsupport=yes) AC_MSG_RESULT($drngsupport) # Implementation of the --disable-avx-support switch. 
AC_MSG_CHECKING([whether AVX support is requested]) AC_ARG_ENABLE(avx-support, AS_HELP_STRING([--disable-avx-support], [Disable support for the Intel AVX instructions]), avxsupport=$enableval,avxsupport=yes) AC_MSG_RESULT($avxsupport) # Implementation of the --disable-avx2-support switch. AC_MSG_CHECKING([whether AVX2 support is requested]) AC_ARG_ENABLE(avx2-support, AS_HELP_STRING([--disable-avx2-support], [Disable support for the Intel AVX2 instructions]), avx2support=$enableval,avx2support=yes) AC_MSG_RESULT($avx2support) # Implementation of the --disable-avx512-support switch. AC_MSG_CHECKING([whether AVX512 support is requested]) AC_ARG_ENABLE(avx512-support, AS_HELP_STRING([--disable-avx512-support], [Disable support for the Intel AVX512 instructions]), avx512support=$enableval,avx512support=yes) AC_MSG_RESULT($avx512support) # Implementation of the --disable-neon-support switch. AC_MSG_CHECKING([whether NEON support is requested]) AC_ARG_ENABLE(neon-support, AS_HELP_STRING([--disable-neon-support], [Disable support for the ARM NEON instructions]), neonsupport=$enableval,neonsupport=yes) AC_MSG_RESULT($neonsupport) # Implementation of the --disable-arm-crypto-support switch. AC_MSG_CHECKING([whether ARMv8 Crypto Extension support is requested]) AC_ARG_ENABLE(arm-crypto-support, AS_HELP_STRING([--disable-arm-crypto-support], [Disable support for the ARMv8 Crypto Extension instructions]), armcryptosupport=$enableval,armcryptosupport=yes) AC_MSG_RESULT($armcryptosupport) # Implementation of the --disable-ppc-crypto-support switch. AC_MSG_CHECKING([whether PPC crypto support is requested]) AC_ARG_ENABLE(ppc-crypto-support, AS_HELP_STRING([--disable-ppc-crypto-support], [Disable support for the PPC crypto instructions introduced in POWER 8 (PowerISA 2.07)]), ppccryptosupport=$enableval,ppccryptosupport=yes) AC_MSG_RESULT($ppccryptosupport) # Implementation of the --disable-O-flag-munging switch. 
AC_MSG_CHECKING([whether a -O flag munging is requested]) AC_ARG_ENABLE([O-flag-munging], AS_HELP_STRING([--disable-O-flag-munging], [Disable modification of the cc -O flag]), [enable_o_flag_munging=$enableval], [enable_o_flag_munging=yes]) AC_MSG_RESULT($enable_o_flag_munging) AM_CONDITIONAL(ENABLE_O_FLAG_MUNGING, test "$enable_o_flag_munging" = "yes") # Implementation of the --disable-instrumentation-munging switch. AC_MSG_CHECKING([whether a instrumentation (-fprofile, -fsanitize) munging is requested]) AC_ARG_ENABLE([instrumentation-munging], AS_HELP_STRING([--disable-instrumentation-munging], [Disable modification of the cc instrumentation options]), [enable_instrumentation_munging=$enableval], [enable_instrumentation_munging=yes]) AC_MSG_RESULT($enable_instrumentation_munging) AM_CONDITIONAL(ENABLE_INSTRUMENTATION_MUNGING, test "$enable_instrumentation_munging" = "yes") # Implementation of the --disable-amd64-as-feature-detection switch. AC_MSG_CHECKING([whether to enable AMD64 as(1) feature detection]) AC_ARG_ENABLE(amd64-as-feature-detection, AS_HELP_STRING([--disable-amd64-as-feature-detection], [Disable the auto-detection of AMD64 as(1) features]), amd64_as_feature_detection=$enableval, amd64_as_feature_detection=yes) AC_MSG_RESULT($amd64_as_feature_detection) AC_DEFINE_UNQUOTED(PRINTABLE_OS_NAME, "$PRINTABLE_OS_NAME", [A human readable text with the name of the OS]) # For some systems we know that we have ld_version scripts. # Use it then as default. have_ld_version_script=no case "${host}" in *-*-linux*) have_ld_version_script=yes ;; *-*-gnu*) have_ld_version_script=yes ;; esac AC_ARG_ENABLE([ld-version-script], AS_HELP_STRING([--enable-ld-version-script], [enable/disable use of linker version script. 
(default is system dependent)]), [have_ld_version_script=$enableval], [ : ] ) AM_CONDITIONAL(HAVE_LD_VERSION_SCRIPT, test "$have_ld_version_script" = "yes") AC_DEFINE_UNQUOTED(NAME_OF_DEV_RANDOM, "$NAME_OF_DEV_RANDOM", [defined to the name of the strong random device]) AC_DEFINE_UNQUOTED(NAME_OF_DEV_URANDOM, "$NAME_OF_DEV_URANDOM", [defined to the name of the weaker random device]) ############################### #### Checks for libraries. #### ############################### # # gpg-error is required. # AM_PATH_GPG_ERROR("$NEED_GPG_ERROR_VERSION") if test "x$GPG_ERROR_LIBS" = "x"; then AC_MSG_ERROR([libgpg-error is needed. See ftp://ftp.gnupg.org/gcrypt/libgpg-error/ .]) fi AC_DEFINE(GPG_ERR_SOURCE_DEFAULT, GPG_ERR_SOURCE_GCRYPT, [The default error source for libgcrypt.]) AM_CONDITIONAL(USE_GPGRT_CONFIG, [test -n "$GPGRT_CONFIG" \ -a "$ac_cv_path_GPG_ERROR_CONFIG" = no]) # # Check whether pthreads is available # if test "$have_w32_system" != yes; then AC_CHECK_LIB(pthread,pthread_create,have_pthread=yes) if test "$have_pthread" = yes; then AC_DEFINE(HAVE_PTHREAD, 1 ,[Define if we have pthread.]) fi fi # Solaris needs -lsocket and -lnsl. Unisys system includes # gethostbyname in libsocket but needs libnsl for socket. AC_SEARCH_LIBS(setsockopt, [socket], , [AC_SEARCH_LIBS(setsockopt, [socket], , , [-lnsl])]) AC_SEARCH_LIBS(setsockopt, [nsl]) ################################## #### Checks for header files. #### ################################## AC_CHECK_HEADERS(unistd.h sys/auxv.h sys/random.h) ########################################## #### Checks for typedefs, structures, #### #### and compiler characteristics. #### ########################################## AC_C_CONST AC_C_INLINE AC_TYPE_SIZE_T AC_TYPE_PID_T AC_CHECK_TYPES([byte, ushort, u16, u32, u64]) gl_TYPE_SOCKLEN_T # # Check for __builtin_bswap32 intrinsic. 
# AC_CACHE_CHECK(for __builtin_bswap32, [gcry_cv_have_builtin_bswap32], [gcry_cv_have_builtin_bswap32=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [int x = 0; int y = __builtin_bswap32(x); return y;])], [gcry_cv_have_builtin_bswap32=yes])]) if test "$gcry_cv_have_builtin_bswap32" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_BSWAP32,1, [Defined if compiler has '__builtin_bswap32' intrinsic]) fi # # Check for __builtin_bswap64 intrinsic. # AC_CACHE_CHECK(for __builtin_bswap64, [gcry_cv_have_builtin_bswap64], [gcry_cv_have_builtin_bswap64=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [long long x = 0; long long y = __builtin_bswap64(x); return y;])], [gcry_cv_have_builtin_bswap64=yes])]) if test "$gcry_cv_have_builtin_bswap64" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_BSWAP64,1, [Defined if compiler has '__builtin_bswap64' intrinsic]) fi # # Check for __builtin_ctz intrinsic. # AC_CACHE_CHECK(for __builtin_ctz, [gcry_cv_have_builtin_ctz], [gcry_cv_have_builtin_ctz=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [unsigned int x = 0; int y = __builtin_ctz(x); return y;])], [gcry_cv_have_builtin_ctz=yes])]) if test "$gcry_cv_have_builtin_ctz" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_CTZ, 1, [Defined if compiler has '__builtin_ctz' intrinsic]) fi # # Check for __builtin_ctzl intrinsic. # AC_CACHE_CHECK(for __builtin_ctzl, [gcry_cv_have_builtin_ctzl], [gcry_cv_have_builtin_ctzl=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [unsigned long x = 0; long y = __builtin_ctzl(x); return y;])], [gcry_cv_have_builtin_ctzl=yes])]) if test "$gcry_cv_have_builtin_ctzl" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_CTZL, 1, [Defined if compiler has '__builtin_ctzl' intrinsic]) fi # # Check for __builtin_clz intrinsic. 
# AC_CACHE_CHECK(for __builtin_clz, [gcry_cv_have_builtin_clz], [gcry_cv_have_builtin_clz=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [unsigned int x = 0; int y = __builtin_clz(x); return y;])], [gcry_cv_have_builtin_clz=yes])]) if test "$gcry_cv_have_builtin_clz" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_CLZ, 1, [Defined if compiler has '__builtin_clz' intrinsic]) fi # # Check for __builtin_clzl intrinsic. # AC_CACHE_CHECK(for __builtin_clzl, [gcry_cv_have_builtin_clzl], [gcry_cv_have_builtin_clzl=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [unsigned long x = 0; long y = __builtin_clzl(x); return y;])], [gcry_cv_have_builtin_clzl=yes])]) if test "$gcry_cv_have_builtin_clzl" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_CLZL, 1, [Defined if compiler has '__builtin_clzl' intrinsic]) fi # # Check for __sync_synchronize intrinsic. # AC_CACHE_CHECK(for __sync_synchronize, [gcry_cv_have_sync_synchronize], [gcry_cv_have_sync_synchronize=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [__sync_synchronize(); return 0;])], [gcry_cv_have_sync_synchronize=yes])]) if test "$gcry_cv_have_sync_synchronize" = "yes" ; then AC_DEFINE(HAVE_SYNC_SYNCHRONIZE, 1, [Defined if compiler has '__sync_synchronize' intrinsic]) fi # # Check for VLA support (variable length arrays). # AC_CACHE_CHECK(whether the variable length arrays are supported, [gcry_cv_have_vla], [gcry_cv_have_vla=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void f1(char *, int); char foo(int i) { char b[(i < 0 ? 0 : i) + 1]; f1(b, sizeof b); return b[0];}]])], [gcry_cv_have_vla=yes])]) if test "$gcry_cv_have_vla" = "yes" ; then AC_DEFINE(HAVE_VLA,1, [Defined if variable length arrays are supported]) fi # # Check for ELF visibility support. 
# AC_CACHE_CHECK(whether the visibility attribute is supported, gcry_cv_visibility_attribute, [gcry_cv_visibility_attribute=no AC_LANG_CONFTEST([AC_LANG_SOURCE( [[int foo __attribute__ ((visibility ("hidden"))) = 1; int bar __attribute__ ((visibility ("protected"))) = 1; ]])]) if ${CC-cc} -Werror -S conftest.c -o conftest.s \ 1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD ; then if grep '\.hidden.*foo' conftest.s >/dev/null 2>&1 ; then if grep '\.protected.*bar' conftest.s >/dev/null 2>&1; then gcry_cv_visibility_attribute=yes fi fi fi ]) if test "$gcry_cv_visibility_attribute" = "yes"; then AC_CACHE_CHECK(for broken visibility attribute, gcry_cv_broken_visibility_attribute, [gcry_cv_broken_visibility_attribute=yes AC_LANG_CONFTEST([AC_LANG_SOURCE( [[int foo (int x); int bar (int x) __asm__ ("foo") __attribute__ ((visibility ("hidden"))); int bar (int x) { return x; } ]])]) if ${CC-cc} -Werror -S conftest.c -o conftest.s \ 1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD ; then if grep '\.hidden@<:@ _@:>@foo' conftest.s >/dev/null 2>&1; then gcry_cv_broken_visibility_attribute=no fi fi ]) fi if test "$gcry_cv_visibility_attribute" = "yes"; then AC_CACHE_CHECK(for broken alias attribute, gcry_cv_broken_alias_attribute, [gcry_cv_broken_alias_attribute=yes AC_LANG_CONFTEST([AC_LANG_SOURCE( [[extern int foo (int x) __asm ("xyzzy"); int bar (int x) { return x; } extern __typeof (bar) foo __attribute ((weak, alias ("bar"))); extern int dfoo; extern __typeof (dfoo) dfoo __asm ("abccb"); int dfoo = 1; ]])]) if ${CC-cc} -Werror -S conftest.c -o conftest.s \ 1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD ; then if grep 'xyzzy' conftest.s >/dev/null 2>&1 && \ grep 'abccb' conftest.s >/dev/null 2>&1; then gcry_cv_broken_alias_attribute=no fi fi ]) fi if test "$gcry_cv_visibility_attribute" = "yes"; then AC_CACHE_CHECK(if gcc supports -fvisibility=hidden, gcry_cv_gcc_has_f_visibility, [gcry_cv_gcc_has_f_visibility=no _gcc_cflags_save=$CFLAGS CFLAGS="-fvisibility=hidden" 
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])], gcry_cv_gcc_has_f_visibility=yes) CFLAGS=$_gcc_cflags_save; ]) fi if test "$gcry_cv_visibility_attribute" = "yes" \ && test "$gcry_cv_broken_visibility_attribute" != "yes" \ && test "$gcry_cv_broken_alias_attribute" != "yes" \ && test "$gcry_cv_gcc_has_f_visibility" = "yes" then AC_DEFINE(GCRY_USE_VISIBILITY, 1, [Define to use the GNU C visibility attribute.]) CFLAGS="$CFLAGS -fvisibility=hidden" fi # Following attribute tests depend on warnings to cause compile to fail, # so set -Werror temporarily. _gcc_cflags_save=$CFLAGS CFLAGS="$CFLAGS -Werror" # # Check whether the compiler supports the GCC style aligned attribute # AC_CACHE_CHECK([whether the GCC style aligned attribute is supported], [gcry_cv_gcc_attribute_aligned], [gcry_cv_gcc_attribute_aligned=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[struct { int a; } foo __attribute__ ((aligned (16)));]])], [gcry_cv_gcc_attribute_aligned=yes])]) if test "$gcry_cv_gcc_attribute_aligned" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_ALIGNED,1, [Defined if a GCC style "__attribute__ ((aligned (n))" is supported]) fi # # Check whether the compiler supports the GCC style packed attribute # AC_CACHE_CHECK([whether the GCC style packed attribute is supported], [gcry_cv_gcc_attribute_packed], [gcry_cv_gcc_attribute_packed=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[struct foolong_s { long b; } __attribute__ ((packed)); struct foo_s { char a; struct foolong_s b; } __attribute__ ((packed)); enum bar { FOO = 1 / (sizeof(struct foo_s) == (sizeof(char) + sizeof(long))), };]])], [gcry_cv_gcc_attribute_packed=yes])]) if test "$gcry_cv_gcc_attribute_packed" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_PACKED,1, [Defined if a GCC style "__attribute__ ((packed))" is supported]) fi # # Check whether the compiler supports the GCC style may_alias attribute # AC_CACHE_CHECK([whether the GCC style may_alias attribute is supported], [gcry_cv_gcc_attribute_may_alias], [gcry_cv_gcc_attribute_may_alias=no 
AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[typedef struct foo_s { int a; } __attribute__ ((may_alias)) foo_t;]])], [gcry_cv_gcc_attribute_may_alias=yes])]) if test "$gcry_cv_gcc_attribute_may_alias" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_MAY_ALIAS,1, [Defined if a GCC style "__attribute__ ((may_alias))" is supported]) fi # Restore flags. CFLAGS=$_gcc_cflags_save; # # Check whether the compiler supports 'asm' or '__asm__' keyword for # assembler blocks. # AC_CACHE_CHECK([whether 'asm' assembler keyword is supported], [gcry_cv_have_asm], [gcry_cv_have_asm=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { asm("":::"memory"); }]])], [gcry_cv_have_asm=yes])]) AC_CACHE_CHECK([whether '__asm__' assembler keyword is supported], [gcry_cv_have___asm__], [gcry_cv_have___asm__=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { __asm__("":::"memory"); }]])], [gcry_cv_have___asm__=yes])]) if test "$gcry_cv_have_asm" = "no" ; then if test "$gcry_cv_have___asm__" = "yes" ; then AC_DEFINE(asm,__asm__, [Define to supported assembler block keyword, if plain 'asm' was not supported]) fi fi # # Check whether the compiler supports inline assembly memory barrier. 
# if test "$gcry_cv_have_asm" = "no" ; then if test "$gcry_cv_have___asm__" = "yes" ; then AC_CACHE_CHECK([whether inline assembly memory barrier is supported], [gcry_cv_have_asm_volatile_memory], [gcry_cv_have_asm_volatile_memory=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(int x) { __asm__ volatile("":::"memory"); __asm__ volatile("":"+r"(x)::"memory"); }]])], [gcry_cv_have_asm_volatile_memory=yes])]) fi else AC_CACHE_CHECK([whether inline assembly memory barrier is supported], [gcry_cv_have_asm_volatile_memory], [gcry_cv_have_asm_volatile_memory=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(int x) { asm volatile("":::"memory"); asm volatile("":"+r"(x)::"memory"); }]])], [gcry_cv_have_asm_volatile_memory=yes])]) fi if test "$gcry_cv_have_asm_volatile_memory" = "yes" ; then AC_DEFINE(HAVE_GCC_ASM_VOLATILE_MEMORY,1, [Define if inline asm memory barrier is supported]) fi # # Check whether GCC assembler supports features needed for our ARM # implementations. This needs to be done before setting up the # assembler stuff. # AC_CACHE_CHECK([whether GCC assembler is compatible for ARM assembly implementations], [gcry_cv_gcc_arm_platform_as_ok], [if test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_arm_platform_as_ok="n/a" else gcry_cv_gcc_arm_platform_as_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( /* Test if assembler supports UAL syntax. */ ".syntax unified\n\t" ".arm\n\t" /* our assembly code is in ARM mode */ ".text\n\t" /* Following causes error if assembler ignored '.syntax unified'. */ "asmfunc:\n\t" "add %r0, %r0, %r4, ror #12;\n\t" /* Test if '.type' and '.size' are supported. 
*/ ".size asmfunc,.-asmfunc;\n\t" ".type asmfunc,%function;\n\t" );]], [ asmfunc(); ] )], [gcry_cv_gcc_arm_platform_as_ok=yes]) fi]) if test "$gcry_cv_gcc_arm_platform_as_ok" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS,1, [Defined if underlying assembler is compatible with ARM assembly implementations]) fi # # Check whether GCC assembler supports features needed for our ARMv8/Aarch64 # implementations. This needs to be done before setting up the # assembler stuff. # AC_CACHE_CHECK([whether GCC assembler is compatible for ARMv8/Aarch64 assembly implementations], [gcry_cv_gcc_aarch64_platform_as_ok], [if test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_aarch64_platform_as_ok="n/a" else gcry_cv_gcc_aarch64_platform_as_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".text\n\t" "asmfunc:\n\t" "eor x0, x0, x30, ror #12;\n\t" "add x0, x0, x30, asr #12;\n\t" "eor v0.16b, v0.16b, v31.16b;\n\t" );]], [ asmfunc(); ] )], [gcry_cv_gcc_aarch64_platform_as_ok=yes]) fi]) if test "$gcry_cv_gcc_aarch64_platform_as_ok" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS,1, [Defined if underlying assembler is compatible with ARMv8/Aarch64 assembly implementations]) fi # # Check whether GCC assembler supports for CFI directives. 
# AC_CACHE_CHECK([whether GCC assembler supports for CFI directives], [gcry_cv_gcc_asm_cfi_directives], [gcry_cv_gcc_asm_cfi_directives=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".text\n\t" "ac_test:\n\t" ".cfi_startproc\n\t" ".cfi_remember_state\n\t" ".cfi_adjust_cfa_offset 8\n\t" ".cfi_rel_offset 0, 8\n\t" ".cfi_def_cfa_register 1\n\t" ".cfi_register 2, 3\n\t" ".cfi_restore 2\n\t" ".cfi_escape 0x0f, 0x02, 0x11, 0x00\n\t" ".cfi_restore_state\n\t" ".long 0\n\t" ".cfi_endproc\n\t" );]])], [gcry_cv_gcc_asm_cfi_directives=yes])]) if test "$gcry_cv_gcc_asm_cfi_directives" = "yes" ; then AC_DEFINE(HAVE_GCC_ASM_CFI_DIRECTIVES,1, [Defined if underlying assembler supports for CFI directives]) fi # # Check whether GCC assembler supports for ELF directives. # AC_CACHE_CHECK([whether GCC assembler supports for ELF directives], [gcry_cv_gcc_asm_elf_directives], [gcry_cv_gcc_asm_elf_directives=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( /* Test if ELF directives '.type' and '.size' are supported. */ ".text\n\t" "asmfunc:\n\t" ".size asmfunc,.-asmfunc;\n\t" ".type asmfunc,STT_FUNC;\n\t" );]])], [gcry_cv_gcc_asm_elf_directives=yes])]) if test "$gcry_cv_gcc_asm_elf_directives" = "yes" ; then AC_DEFINE(HAVE_GCC_ASM_ELF_DIRECTIVES,1, [Defined if underlying assembler supports for ELF directives]) fi # # Check whether underscores in symbols are required. This needs to be # done before setting up the assembler stuff. # GNUPG_SYS_SYMBOL_UNDERSCORE() ################################# #### #### #### Setup assembler stuff. #### #### Define mpi_cpu_arch. #### #### #### ################################# AC_ARG_ENABLE(mpi-path, AS_HELP_STRING([--enable-mpi-path=EXTRA_PATH], [prepend EXTRA_PATH to list of CPU specific optimizations]), mpi_extra_path="$enableval",mpi_extra_path="") AC_MSG_CHECKING(architecture and mpi assembler functions) if test -f $srcdir/mpi/config.links ; then . 
$srcdir/mpi/config.links AC_CONFIG_LINKS("$mpi_ln_list") ac_cv_mpi_sflags="$mpi_sflags" AC_MSG_RESULT($mpi_cpu_arch) else AC_MSG_RESULT(failed) AC_MSG_ERROR([mpi/config.links missing!]) fi MPI_SFLAGS="$ac_cv_mpi_sflags" AC_SUBST(MPI_SFLAGS) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_ADD1, test "$mpi_mod_asm_mpih_add1" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_SUB1, test "$mpi_mod_asm_mpih_sub1" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_MUL1, test "$mpi_mod_asm_mpih_mul1" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_MUL2, test "$mpi_mod_asm_mpih_mul2" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_MUL3, test "$mpi_mod_asm_mpih_mul3" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_LSHIFT, test "$mpi_mod_asm_mpih_lshift" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_RSHIFT, test "$mpi_mod_asm_mpih_rshift" = yes) AM_CONDITIONAL(MPI_MOD_ASM_UDIV, test "$mpi_mod_asm_udiv" = yes) AM_CONDITIONAL(MPI_MOD_ASM_UDIV_QRNND, test "$mpi_mod_asm_udiv_qrnnd" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_ADD1, test "$mpi_mod_c_mpih_add1" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_SUB1, test "$mpi_mod_c_mpih_sub1" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_MUL1, test "$mpi_mod_c_mpih_mul1" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_MUL2, test "$mpi_mod_c_mpih_mul2" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_MUL3, test "$mpi_mod_c_mpih_mul3" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_LSHIFT, test "$mpi_mod_c_mpih_lshift" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_RSHIFT, test "$mpi_mod_c_mpih_rshift" = yes) AM_CONDITIONAL(MPI_MOD_C_UDIV, test "$mpi_mod_c_udiv" = yes) AM_CONDITIONAL(MPI_MOD_C_UDIV_QRNND, test "$mpi_mod_c_udiv_qrnnd" = yes) # Reset non applicable feature flags. 
if test "$mpi_cpu_arch" != "x86" ; then
   # x86-only hardware features can never be used on other architectures;
   # mark them "n/a" so later reporting/summary code skips them.
   aesnisupport="n/a"
   shaextsupport="n/a"
   pclmulsupport="n/a"
   sse41support="n/a"
   avxsupport="n/a"
   avx2support="n/a"
   avx512support="n/a"
   padlocksupport="n/a"
   drngsupport="n/a"
fi
if test "$mpi_cpu_arch" != "arm" ; then
   if test "$mpi_cpu_arch" != "aarch64" ; then
     # NEON and the ARM crypto extensions apply to both 32-bit "arm" and
     # "aarch64", hence the nested test.
     neonsupport="n/a"
     armcryptosupport="n/a"
   fi
fi
if test "$mpi_cpu_arch" != "ppc"; then
   ppccryptosupport="n/a"
fi

#############################################
####                                     ####
#### Platform specific compiler checks.  ####
####                                     ####
#############################################

# Following tests depend on warnings to cause compile to fail, so set -Werror
# temporarily.
_gcc_cflags_save=$CFLAGS
CFLAGS="$CFLAGS -Werror"

#
# Check whether compiler supports 'ms_abi' function attribute.
#
# Compile-only probe: the attribute either parses cleanly or (with -Werror
# above) fails the compile.
AC_CACHE_CHECK([whether compiler supports 'ms_abi' function attribute],
  [gcry_cv_gcc_attribute_ms_abi],
  [gcry_cv_gcc_attribute_ms_abi=no
   AC_COMPILE_IFELSE([AC_LANG_SOURCE(
     [[int __attribute__ ((ms_abi)) proto(int);]])],
     [gcry_cv_gcc_attribute_ms_abi=yes])])
if test "$gcry_cv_gcc_attribute_ms_abi" = "yes" ; then
   AC_DEFINE(HAVE_GCC_ATTRIBUTE_MS_ABI,1,
     [Defined if compiler supports "__attribute__ ((ms_abi))" function attribute])
fi

#
# Check whether compiler supports 'sysv_abi' function attribute.
#
AC_CACHE_CHECK([whether compiler supports 'sysv_abi' function attribute],
  [gcry_cv_gcc_attribute_sysv_abi],
  [gcry_cv_gcc_attribute_sysv_abi=no
   AC_COMPILE_IFELSE([AC_LANG_SOURCE(
     [[int __attribute__ ((sysv_abi)) proto(int);]])],
     [gcry_cv_gcc_attribute_sysv_abi=yes])])
if test "$gcry_cv_gcc_attribute_sysv_abi" = "yes" ; then
   AC_DEFINE(HAVE_GCC_ATTRIBUTE_SYSV_ABI,1,
     [Defined if compiler supports "__attribute__ ((sysv_abi))" function attribute])
fi

#
# Check whether default calling convention is 'ms_abi'.
# if test "$gcry_cv_gcc_attribute_ms_abi" = "yes" ; then AC_CACHE_CHECK([whether default calling convention is 'ms_abi'], [gcry_cv_gcc_default_abi_is_ms_abi], [gcry_cv_gcc_default_abi_is_ms_abi=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void *test(void) { void *(*def_func)(void) = test; void *__attribute__((ms_abi))(*msabi_func)(void); /* warning on SysV abi targets, passes on Windows based targets */ msabi_func = def_func; return msabi_func; }]])], [gcry_cv_gcc_default_abi_is_ms_abi=yes])]) if test "$gcry_cv_gcc_default_abi_is_ms_abi" = "yes" ; then AC_DEFINE(HAVE_GCC_DEFAULT_ABI_IS_MS_ABI,1, [Defined if default calling convention is 'ms_abi']) fi fi # # Check whether default calling convention is 'sysv_abi'. # if test "$gcry_cv_gcc_attribute_sysv_abi" = "yes" ; then AC_CACHE_CHECK([whether default calling convention is 'sysv_abi'], [gcry_cv_gcc_default_abi_is_sysv_abi], [gcry_cv_gcc_default_abi_is_sysv_abi=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void *test(void) { void *(*def_func)(void) = test; void *__attribute__((sysv_abi))(*sysvabi_func)(void); /* warning on MS ABI targets, passes on SysV ABI targets */ sysvabi_func = def_func; return sysvabi_func; }]])], [gcry_cv_gcc_default_abi_is_sysv_abi=yes])]) if test "$gcry_cv_gcc_default_abi_is_sysv_abi" = "yes" ; then AC_DEFINE(HAVE_GCC_DEFAULT_ABI_IS_SYSV_ABI,1, [Defined if default calling convention is 'sysv_abi']) fi fi # Restore flags. CFLAGS=$_gcc_cflags_save; # # Check whether GCC inline assembler supports SSSE3 instructions # This is required for the AES-NI instructions. 
# AC_CACHE_CHECK([whether GCC inline assembler supports SSSE3 instructions], [gcry_cv_gcc_inline_asm_ssse3], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_ssse3="n/a" else gcry_cv_gcc_inline_asm_ssse3=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[static unsigned char be_mask[16] __attribute__ ((aligned (16))) = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; void a(void) { __asm__("pshufb %[mask], %%xmm2\n\t"::[mask]"m"(*be_mask):); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_ssse3=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_ssse3" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_SSSE3,1, [Defined if inline assembler supports SSSE3 instructions]) fi # # Check whether GCC inline assembler supports PCLMUL instructions. # AC_CACHE_CHECK([whether GCC inline assembler supports PCLMUL instructions], [gcry_cv_gcc_inline_asm_pclmul], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_pclmul="n/a" else gcry_cv_gcc_inline_asm_pclmul=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { __asm__("pclmulqdq \$0, %%xmm1, %%xmm3\n\t":::"cc"); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_pclmul=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_pclmul" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_PCLMUL,1, [Defined if inline assembler supports PCLMUL instructions]) fi # # Check whether GCC inline assembler supports SHA Extensions instructions. 
# AC_CACHE_CHECK([whether GCC inline assembler supports SHA Extensions instructions], [gcry_cv_gcc_inline_asm_shaext], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_shaext="n/a" else gcry_cv_gcc_inline_asm_shaext=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { __asm__("sha1rnds4 \$0, %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha1nexte %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha1msg1 %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha1msg2 %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha256rnds2 %%xmm0, %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha256msg1 %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha256msg2 %%xmm1, %%xmm3\n\t":::"cc"); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_shaext=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_shaext" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_SHAEXT,1, [Defined if inline assembler supports SHA Extensions instructions]) fi # # Check whether GCC inline assembler supports SSE4.1 instructions. # AC_CACHE_CHECK([whether GCC inline assembler supports SSE4.1 instructions], [gcry_cv_gcc_inline_asm_sse41], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_sse41="n/a" else gcry_cv_gcc_inline_asm_sse41=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { int i; __asm__("pextrd \$2, %%xmm0, %[out]\n\t" : [out] "=m" (i)); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_sse41=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_sse41" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_SSE41,1, [Defined if inline assembler supports SSE4.1 instructions]) fi # # Check whether GCC inline assembler supports AVX instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AVX instructions], [gcry_cv_gcc_inline_asm_avx], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_avx="n/a" else gcry_cv_gcc_inline_asm_avx=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { __asm__("xgetbv; vaesdeclast (%[mem]),%%xmm0,%%xmm7\n\t"::[mem]"r"(0):); 
}]], [ a(); ] )], [gcry_cv_gcc_inline_asm_avx=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_avx" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AVX,1, [Defined if inline assembler supports AVX instructions]) fi # # Check whether GCC inline assembler supports AVX2 instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AVX2 instructions], [gcry_cv_gcc_inline_asm_avx2], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_avx2="n/a" else gcry_cv_gcc_inline_asm_avx2=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { __asm__("xgetbv; vpbroadcastb %%xmm7,%%ymm1\n\t":::"cc"); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_avx2=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_avx2" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AVX2,1, [Defined if inline assembler supports AVX2 instructions]) fi # # Check whether GCC inline assembler supports AVX512 instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AVX512 instructions], [gcry_cv_gcc_inline_asm_avx512], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_avx512="n/a" else gcry_cv_gcc_inline_asm_avx512=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { __asm__("xgetbv; vpopcntq %%zmm7, %%zmm1%{%%k1%}%{z%};\n\t":::"cc"); __asm__("vpexpandb %%zmm3, %%zmm1;\n\t":::"cc"); __asm__("vpxorq %%xmm7, %%xmm7, %%xmm7;\n\t":::"cc"); __asm__("vpxorq %%ymm7, %%ymm7, %%ymm7;\n\t":::"cc"); __asm__("vpxorq (%%eax)%{1to8%}, %%zmm7, %%zmm7;\n\t":::"cc"); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_avx512=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_avx512" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AVX512,1, [Defined if inline assembler supports AVX512 instructions]) fi # # Check whether GCC inline assembler supports VAES and VPCLMUL instructions # AC_CACHE_CHECK([whether GCC inline assembler supports VAES and VPCLMUL instructions], [gcry_cv_gcc_inline_asm_vaes_vpclmul], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" 
!= "yes" ; then gcry_cv_gcc_inline_asm_vaes_vpclmul="n/a" else gcry_cv_gcc_inline_asm_vaes_vpclmul=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { __asm__("vaesenclast %%ymm7,%%ymm7,%%ymm1\n\t":::"cc");/*256-bit*/ __asm__("vaesenclast %%zmm7,%%zmm7,%%zmm1\n\t":::"cc");/*512-bit*/ __asm__("vpclmulqdq \$0,%%ymm7,%%ymm7,%%ymm1\n\t":::"cc");/*256-bit*/ __asm__("vpclmulqdq \$0,%%zmm7,%%zmm7,%%zmm1\n\t":::"cc");/*512-bit*/ }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_vaes_vpclmul=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_vaes_vpclmul" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_VAES_VPCLMUL,1, [Defined if inline assembler supports VAES and VPCLMUL instructions]) fi # # Check whether GCC inline assembler supports BMI2 instructions # AC_CACHE_CHECK([whether GCC inline assembler supports BMI2 instructions], [gcry_cv_gcc_inline_asm_bmi2], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_bmi2="n/a" else gcry_cv_gcc_inline_asm_bmi2=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[unsigned int a(unsigned int x, unsigned int y) { unsigned int tmp1, tmp2; asm ("rorxl %2, %1, %0" : "=r" (tmp1) : "rm0" (x), "J" (32 - ((23) & 31))); asm ("andnl %2, %1, %0" : "=r" (tmp2) : "r0" (x), "rm" (y)); return tmp1 + tmp2; }]], [ a(1, 2); ] )], [gcry_cv_gcc_inline_asm_bmi2=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_bmi2" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_BMI2,1, [Defined if inline assembler supports BMI2 instructions]) fi # # Check whether GCC assembler needs "-Wa,--divide" to correctly handle # constant division # if test $amd64_as_feature_detection = yes; then AC_CACHE_CHECK([whether GCC assembler handles division correctly], [gcry_cv_gcc_as_const_division_ok], [gcry_cv_gcc_as_const_division_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__(".text\n\tfn:\n\t xorl \$(123456789/12345678), %ebp;\n\t");]], [fn();])], [gcry_cv_gcc_as_const_division_ok=yes])]) if test "$gcry_cv_gcc_as_const_division_ok" = "no" ; then # # Add '-Wa,--divide' to 
CPPFLAGS and try check again. # _gcc_cppflags_save="$CPPFLAGS" CPPFLAGS="$CPPFLAGS -Wa,--divide" AC_CACHE_CHECK([whether GCC assembler handles division correctly with "-Wa,--divide"], [gcry_cv_gcc_as_const_division_with_wadivide_ok], [gcry_cv_gcc_as_const_division_with_wadivide_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__(".text\n\tfn:\n\t xorl \$(123456789/12345678), %ebp;\n\t");]], [fn();])], [gcry_cv_gcc_as_const_division_with_wadivide_ok=yes])]) if test "$gcry_cv_gcc_as_const_division_with_wadivide_ok" = "no" ; then # '-Wa,--divide' did not work, restore old flags. CPPFLAGS="$_gcc_cppflags_save" fi fi fi # # Check whether GCC assembler supports features needed for our amd64 # implementations # if test $amd64_as_feature_detection = yes; then AC_CACHE_CHECK([whether GCC assembler is compatible for amd64 assembly implementations], [gcry_cv_gcc_amd64_platform_as_ok], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_amd64_platform_as_ok="n/a" else gcry_cv_gcc_amd64_platform_as_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( /* Test if '.type' and '.size' are supported. */ /* These work only on ELF targets. */ ".text\n\t" "asmfunc:\n\t" ".size asmfunc,.-asmfunc;\n\t" ".type asmfunc,@function;\n\t" /* Test if assembler allows use of '/' for constant division * (Solaris/x86 issue). If previous constant division check * and "-Wa,--divide" workaround failed, this causes assembly * to be disable on this machine. 
*/ "xorl \$(123456789/12345678), %ebp;\n\t" );]], [ asmfunc(); ])], [gcry_cv_gcc_amd64_platform_as_ok=yes]) fi]) if test "$gcry_cv_gcc_amd64_platform_as_ok" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS,1, [Defined if underlying assembler is compatible with amd64 assembly implementations]) fi if test "$gcry_cv_gcc_amd64_platform_as_ok" = "no" && test "$gcry_cv_gcc_attribute_sysv_abi" = "yes" && test "$gcry_cv_gcc_default_abi_is_ms_abi" = "yes"; then AC_CACHE_CHECK([whether GCC assembler is compatible for WIN64 assembly implementations], [gcry_cv_gcc_win64_platform_as_ok], [gcry_cv_gcc_win64_platform_as_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".text\n\t" ".globl asmfunc\n\t" "asmfunc:\n\t" "xorq \$(1234), %rbp;\n\t" );]], [ asmfunc(); ])], [gcry_cv_gcc_win64_platform_as_ok=yes])]) if test "$gcry_cv_gcc_win64_platform_as_ok" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS,1, [Defined if underlying assembler is compatible with WIN64 assembly implementations]) fi fi fi # # Check whether GCC assembler supports features needed for assembly # implementations that use Intel syntax # AC_CACHE_CHECK([whether GCC assembler is compatible for Intel syntax assembly implementations], [gcry_cv_gcc_platform_as_ok_for_intel_syntax], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_platform_as_ok_for_intel_syntax="n/a" else gcry_cv_gcc_platform_as_ok_for_intel_syntax=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".intel_syntax noprefix\n\t" ".text\n\t" "actest:\n\t" "pxor xmm1, xmm7;\n\t" "vperm2i128 ymm2, ymm3, ymm0, 1;\n\t" "add eax, ebp;\n\t" "rorx eax, ebp, 1;\n\t" "sub eax, [esp + 4];\n\t" "add dword ptr [esp + eax], 0b10101;\n\t" ".att_syntax prefix\n\t" );]], [ actest(); ])], [gcry_cv_gcc_platform_as_ok_for_intel_syntax=yes]) fi]) if test "$gcry_cv_gcc_platform_as_ok_for_intel_syntax" = "yes" ; then AC_DEFINE(HAVE_INTEL_SYNTAX_PLATFORM_AS,1, [Defined if underlying assembler is compatible with 
Intel syntax assembly implementations]) fi # # Check whether compiler is configured for ARMv6 or newer architecture # AC_CACHE_CHECK([whether compiler is configured for ARMv6 or newer architecture], [gcry_cv_cc_arm_arch_is_v6], [if test "$mpi_cpu_arch" != "arm" || test "$try_asm_modules" != "yes" ; then gcry_cv_cc_arm_arch_is_v6="n/a" else gcry_cv_cc_arm_arch_is_v6=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[ #if defined(__arm__) && \ ((defined(__ARM_ARCH) && __ARM_ARCH >= 6) \ || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \ || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) \ || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \ || defined(__ARM_ARCH_7EM__)) /* empty */ #else /* fail compile if not ARMv6. */ not_armv6 not_armv6 = (not_armv6)not_armv6; #endif ]])], [gcry_cv_cc_arm_arch_is_v6=yes]) fi]) if test "$gcry_cv_cc_arm_arch_is_v6" = "yes" ; then AC_DEFINE(HAVE_ARM_ARCH_V6,1, [Defined if ARM architecture is v6 or newer]) fi # # Check whether GCC inline assembler supports NEON instructions # AC_CACHE_CHECK([whether GCC inline assembler supports NEON instructions], [gcry_cv_gcc_inline_asm_neon], [if test "$mpi_cpu_arch" != "arm" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_neon="n/a" else gcry_cv_gcc_inline_asm_neon=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".syntax unified\n\t" ".arm\n\t" ".fpu neon\n\t" ".text\n\t" "testfn:\n\t" "vld1.64 {%q0-%q1}, [%r0]!;\n\t" "vrev64.8 %q0, %q3;\n\t" "vadd.u64 %q0, %q1;\n\t" "vadd.s64 %d3, %d2, %d3;\n\t" ); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_neon=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_neon" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_NEON,1, [Defined if inline assembler supports NEON instructions]) fi # # Check whether GCC inline assembler supports AArch32 Crypto Extension instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AArch32 
Crypto Extension instructions], [gcry_cv_gcc_inline_asm_aarch32_crypto], [if test "$mpi_cpu_arch" != "arm" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_aarch32_crypto="n/a" else gcry_cv_gcc_inline_asm_aarch32_crypto=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".syntax unified\n\t" ".arch armv8-a\n\t" ".arm\n\t" ".fpu crypto-neon-fp-armv8\n\t" ".text\n\t" "testfn:\n\t" "sha1h.32 q0, q0;\n\t" "sha1c.32 q0, q0, q0;\n\t" "sha1p.32 q0, q0, q0;\n\t" "sha1su0.32 q0, q0, q0;\n\t" "sha1su1.32 q0, q0;\n\t" "sha256h.32 q0, q0, q0;\n\t" "sha256h2.32 q0, q0, q0;\n\t" "sha1p.32 q0, q0, q0;\n\t" "sha256su0.32 q0, q0;\n\t" "sha256su1.32 q0, q0, q15;\n\t" "aese.8 q0, q0;\n\t" "aesd.8 q0, q0;\n\t" "aesmc.8 q0, q0;\n\t" "aesimc.8 q0, q0;\n\t" "vmull.p64 q0, d0, d0;\n\t" ); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_aarch32_crypto=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_aarch32_crypto" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO,1, [Defined if inline assembler supports AArch32 Crypto Extension instructions]) fi # # Check whether GCC inline assembler supports AArch64 NEON instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 NEON instructions], [gcry_cv_gcc_inline_asm_aarch64_neon], [if test "$mpi_cpu_arch" != "aarch64" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_aarch64_neon="n/a" else gcry_cv_gcc_inline_asm_aarch64_neon=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".cpu generic+simd\n\t" ".text\n\t" "testfn:\n\t" "mov w0, \#42;\n\t" "dup v0.8b, w0;\n\t" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b},[x0],\#32;\n\t" ); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_aarch64_neon=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_aarch64_neon" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH64_NEON,1, [Defined if inline assembler supports AArch64 NEON instructions]) fi # # Check whether GCC inline assembler supports AArch64 Crypto Extension instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 
Crypto Extension instructions], [gcry_cv_gcc_inline_asm_aarch64_crypto], [if test "$mpi_cpu_arch" != "aarch64" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_aarch64_crypto="n/a" else gcry_cv_gcc_inline_asm_aarch64_crypto=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".cpu generic+simd+crypto\n\t" ".text\n\t" "testfn:\n\t" "mov w0, \#42;\n\t" "dup v0.8b, w0;\n\t" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b},[x0],\#32;\n\t" "sha1h s0, s0;\n\t" "sha1c q0, s0, v0.4s;\n\t" "sha1p q0, s0, v0.4s;\n\t" "sha1su0 v0.4s, v0.4s, v0.4s;\n\t" "sha1su1 v0.4s, v0.4s;\n\t" "sha256h q0, q0, v0.4s;\n\t" "sha256h2 q0, q0, v0.4s;\n\t" "sha1p q0, s0, v0.4s;\n\t" "sha256su0 v0.4s, v0.4s;\n\t" "sha256su1 v0.4s, v0.4s, v31.4s;\n\t" "aese v0.16b, v0.16b;\n\t" "aesd v0.16b, v0.16b;\n\t" "aesmc v0.16b, v0.16b;\n\t" "aesimc v0.16b, v0.16b;\n\t" "pmull v0.1q, v0.1d, v31.1d;\n\t" "pmull2 v0.1q, v0.2d, v31.2d;\n\t" ); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_aarch64_crypto=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_aarch64_crypto" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO,1, [Defined if inline assembler supports AArch64 Crypto Extension instructions]) fi # # Check whether PowerPC AltiVec/VSX intrinsics # AC_CACHE_CHECK([whether compiler supports PowerPC AltiVec/VSX/crypto intrinsics], [gcry_cv_cc_ppc_altivec], [if test "$mpi_cpu_arch" != "ppc" || test "$try_asm_modules" != "yes" ; then gcry_cv_cc_ppc_altivec="n/a" else gcry_cv_cc_ppc_altivec=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[#include typedef vector unsigned char block; typedef vector unsigned int vecu32; static inline __attribute__((always_inline)) vecu32 vec_sld_u32(vecu32 a, vecu32 b, unsigned int idx) { return vec_sld (a, b, (4 * idx) & 15); } block fn(block in) { block t = vec_perm (in, in, vec_vsx_ld (0, (unsigned char*)0)); vecu32 y = vec_vsx_ld (0, (unsigned int*)0); y = vec_sld_u32 (y, y, 3); return vec_cipher_be (t, in) ^ (block)y; } ]])], [gcry_cv_cc_ppc_altivec=yes]) fi]) if test 
"$gcry_cv_cc_ppc_altivec" = "yes" ; then
  AC_DEFINE(HAVE_COMPATIBLE_CC_PPC_ALTIVEC,1,
            [Defined if underlying compiler supports PowerPC AltiVec/VSX/crypto intrinsics])
fi

# Retry the intrinsics probe with the PPC ISA flags forced on, in case the
# default target flags were the only reason the first probe failed.
_gcc_cflags_save=$CFLAGS
CFLAGS="$CFLAGS -O2 -maltivec -mvsx -mcrypto"

# Portability fix: use '=' for string comparison.  '==' is a bashism that
# makes the generated configure script fail under strict POSIX shells
# (e.g. dash as /bin/sh).
if test "$gcry_cv_cc_ppc_altivec" = "no" &&
   test "$mpi_cpu_arch" = "ppc" &&
   test "$try_asm_modules" = "yes" ; then
  AC_CACHE_CHECK([whether compiler supports PowerPC AltiVec/VSX/crypto intrinsics with extra GCC flags],
    [gcry_cv_cc_ppc_altivec_cflags],
    [gcry_cv_cc_ppc_altivec_cflags=no
     AC_COMPILE_IFELSE([AC_LANG_SOURCE(
       [[#include <altivec.h>
         typedef vector unsigned char block;
         typedef vector unsigned int vecu32;
         static inline __attribute__((always_inline)) vecu32
         vec_sld_u32(vecu32 a, vecu32 b, unsigned int idx)
         {
           return vec_sld (a, b, (4 * idx) & 15);
         }
         block fn(block in)
         {
           block t = vec_perm (in, in, vec_vsx_ld (0, (unsigned char*)0));
           vecu32 y = vec_vsx_ld (0, (unsigned int*)0);
           y = vec_sld_u32 (y, y, 3);
           return vec_cipher_be (t, in) ^ (block)y;
         }]])],
       [gcry_cv_cc_ppc_altivec_cflags=yes])])
  if test "$gcry_cv_cc_ppc_altivec_cflags" = "yes" ; then
    AC_DEFINE(HAVE_COMPATIBLE_CC_PPC_ALTIVEC,1,
              [Defined if underlying compiler supports PowerPC AltiVec/VSX/crypto intrinsics])
    AC_DEFINE(HAVE_COMPATIBLE_CC_PPC_ALTIVEC_WITH_CFLAGS,1,
              [Defined if underlying compiler supports PowerPC AltiVec/VSX/crypto intrinsics with extra GCC flags])
  fi
fi

# Note: gcry_cv_cc_ppc_altivec_cflags may be unset here (non-ppc, or the
# first probe already succeeded); 'test ... = "yes"' is then simply false.
AM_CONDITIONAL(ENABLE_PPC_VCRYPTO_EXTRA_CFLAGS,
               test "$gcry_cv_cc_ppc_altivec_cflags" = "yes")

# Restore flags.
CFLAGS=$_gcc_cflags_save;

#
# Check whether GCC inline assembler supports PowerPC AltiVec/VSX/crypto instructions
#
AC_CACHE_CHECK([whether GCC inline assembler supports PowerPC AltiVec/VSX/crypto instructions],
       [gcry_cv_gcc_inline_asm_ppc_altivec],
       [if test "$mpi_cpu_arch" != "ppc" ||
           test "$try_asm_modules" != "yes" ; then
          gcry_cv_gcc_inline_asm_ppc_altivec="n/a"
        else
          gcry_cv_gcc_inline_asm_ppc_altivec=no
          AC_LINK_IFELSE([AC_LANG_PROGRAM(
          [[__asm__(".globl testfn;\n"
                    ".text\n\t"
                    "testfn:\n"
                    "stvx %v31,%r12,%r0;\n"
                    "lvx %v20,%r12,%r0;\n"
                    "vcipher %v0, %v1, %v22;\n"
                    "lxvw4x %vs32, %r0, %r1;\n"
                    "vadduwm %v0, %v1, %v22;\n"
                    "vshasigmaw %v0, %v1, 0, 15;\n"
                    "vshasigmad %v0, %v1, 0, 15;\n"
                    "vpmsumd %v11, %v11, %v11;\n"
                  );
            ]], [ testfn(); ] )],
          [gcry_cv_gcc_inline_asm_ppc_altivec=yes])
        fi])
if test "$gcry_cv_gcc_inline_asm_ppc_altivec" = "yes" ; then
   AC_DEFINE(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC,1,
     [Defined if inline assembler supports PowerPC AltiVec/VSX/crypto instructions])
fi

#
# Check whether GCC inline assembler supports PowerISA 3.00 instructions
#
AC_CACHE_CHECK([whether GCC inline assembler supports PowerISA 3.00 instructions],
       [gcry_cv_gcc_inline_asm_ppc_arch_3_00],
       [if test "$mpi_cpu_arch" != "ppc" ||
           test "$try_asm_modules" != "yes" ; then
          gcry_cv_gcc_inline_asm_ppc_arch_3_00="n/a"
        else
          gcry_cv_gcc_inline_asm_ppc_arch_3_00=no
          AC_LINK_IFELSE([AC_LANG_PROGRAM(
          [[__asm__(".text\n\t"
                    ".globl testfn;\n"
                    "testfn:\n"
                    "stxvb16x %r1,%v12,%v30;\n"
                  );
            ]], [ testfn(); ])],
          [gcry_cv_gcc_inline_asm_ppc_arch_3_00=yes])
        fi])
if test "$gcry_cv_gcc_inline_asm_ppc_arch_3_00" = "yes" ; then
   AC_DEFINE(HAVE_GCC_INLINE_ASM_PPC_ARCH_3_00,1,
     [Defined if inline assembler supports PowerISA 3.00 instructions])
fi

#
# Check whether GCC inline assembler supports zSeries instructions
#
AC_CACHE_CHECK([whether GCC inline assembler supports zSeries instructions],
      [gcry_cv_gcc_inline_asm_s390x],
      [if test "$mpi_cpu_arch" != "s390x" ||
          test "$try_asm_modules" != "yes" ; then
         gcry_cv_gcc_inline_asm_s390x="n/a"
       else
         gcry_cv_gcc_inline_asm_s390x=no
         AC_LINK_IFELSE([AC_LANG_PROGRAM(
         [[typedef unsigned int u128_t __attribute__ ((mode (TI)));
           unsigned int testfunc(unsigned int x, void *y, unsigned int z)
           {
             unsigned long fac[8];
             register unsigned long reg0 asm("0") = 0;
             register unsigned long reg1 asm("1") = x;
             u128_t r1 = ((u128_t)(unsigned long)y << 64) | (unsigned long)z;
             u128_t r2 = 0;
             u128_t r3 = 0;
             asm volatile (".insn rre,0xb92e << 16, %[r1], %[r2]\n\t"
                           : [r1] "+a" (r1), [r2] "+a" (r2)
                           : "r" (reg0), "r" (reg1)
                           : "cc", "memory");
             asm volatile (".insn rrf,0xb929 << 16, %[r1], %[r2], %[r3], 0\n\t"
                           : [r1] "+a" (r1), [r2] "+a" (r2), [r3] "+a" (r3)
                           : "r" (reg0), "r" (reg1)
                           : "cc", "memory");
             reg0 = 8 - 1;
             asm ("stfle %1\n\t"
                  : "+d" (reg0), "=Q" (fac[0])
                  :
                  : "cc", "memory");
             asm volatile ("mvc 0(16, %0), 0(%1)\n\t"
                           :
                           : "a" (y), "a" (fac)
                           : "memory");
             asm volatile ("xc 0(16, %0), 0(%0)\n\t"
                           :
                           : "a" (fac)
                           : "memory");
             asm volatile ("risbgn %%r11, %%r11, 0, 129, 0\n\t"
                           :
                           :
                           : "memory", "r11");
             asm volatile ("algrk %%r14, %%r14, %%r14\n\t"
                           :
                           :
                           : "memory", "r14");
             return (unsigned int)r1 ^ reg0;
           }
           ]] , [ testfunc(0, 0, 0); ])],
         [gcry_cv_gcc_inline_asm_s390x=yes])
       fi])
if test "$gcry_cv_gcc_inline_asm_s390x" = "yes" ; then
   AC_DEFINE(HAVE_GCC_INLINE_ASM_S390X,1,
     [Defined if inline assembler supports zSeries instructions])
fi

#
# Check whether GCC inline assembler supports zSeries vector instructions
#
# Note: only attempted when the plain zSeries check above succeeded.
AC_CACHE_CHECK([whether GCC inline assembler supports zSeries vector instructions],
      [gcry_cv_gcc_inline_asm_s390x_vx],
      [if test "$mpi_cpu_arch" != "s390x" ||
          test "$try_asm_modules" != "yes" ; then
         gcry_cv_gcc_inline_asm_s390x_vx="n/a"
       else
         gcry_cv_gcc_inline_asm_s390x_vx=no
         if test "$gcry_cv_gcc_inline_asm_s390x" = "yes" ; then
           AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[void testfunc(void)
             {
               asm volatile (".machine \"z13+vx\"\n\t"
                             "vx %%v0, %%v1, %%v31\n\t"
                             "verllf %%v11, %%v11, (16)(0)\n\t"
                             :
                             :
                             : "memory");
             }
             ]], [ testfunc(); ])],
           [gcry_cv_gcc_inline_asm_s390x_vx=yes])
         fi
       fi])
if test "$gcry_cv_gcc_inline_asm_s390x_vx" = "yes" ; then
   AC_DEFINE(HAVE_GCC_INLINE_ASM_S390X_VX,1,
     [Defined if inline assembler supports zSeries vector instructions])
fi

#######################################
#### Checks for library functions. ####
#######################################
AC_FUNC_VPRINTF
# We have replacements for these in src/missing-string.c
AC_CHECK_FUNCS(stpcpy strcasecmp)
# We have replacements for these in src/g10lib.h
AC_CHECK_FUNCS(strtoul memmove stricmp atexit raise)
# Other checks
AC_CHECK_FUNCS(strerror rand mmap getpagesize sysconf waitpid wait4)
AC_CHECK_FUNCS(gettimeofday getrusage gethrtime clock_gettime syslog)
AC_CHECK_FUNCS(syscall fcntl ftruncate flockfile getauxval elf_aux_info)
AC_CHECK_FUNCS(explicit_bzero explicit_memset getentropy)

GNUPG_CHECK_MLOCK

#
# Replacement functions.
#
AC_REPLACE_FUNCS([getpid clock])


#
# Check whether it is necessary to link against libdl.
# (Only relevant when the HMAC binary integrity check is enabled.)
#
DL_LIBS=""
if test "$use_hmac_binary_check" != no ; then
  _gcry_save_libs="$LIBS"
  LIBS=""
  AC_SEARCH_LIBS(dlopen, c dl,,,)
  DL_LIBS=$LIBS
  LIBS="$_gcry_save_libs"
fi
AC_SUBST(DL_LIBS)


#
# Check whether we can use Linux capabilities as requested.
#
if test "$use_capabilities" = "yes" ; then
use_capabilities=no
AC_CHECK_HEADERS(sys/capability.h)
if test "$ac_cv_header_sys_capability_h" = "yes" ; then
  AC_CHECK_LIB(cap, cap_init, ac_need_libcap=1)
  if test "$ac_cv_lib_cap_cap_init" = "yes"; then
     AC_DEFINE(USE_CAPABILITIES,1,
               [define if capabilities should be used])
     LIBS="$LIBS -lcap"
     use_capabilities=yes
  fi
fi
if test "$use_capabilities" = "no" ; then
    AC_MSG_WARN([[
***
*** The use of capabilities on this system is not possible.
*** You need a recent Linux kernel and some patches:
***   fcaps-2.2.9-990610.patch      (kernel patch for 2.2.9)
***   fcap-module-990613.tar.gz     (kernel module)
***   libcap-1.92.tar.gz            (user mode library and utilities)
*** And you have to configure the kernel with CONFIG_VFS_CAP_PLUGIN
*** set (filesystems menu). Be warned: This code is *really* ALPHA.
***]])
fi
fi

# Check whether a random device is available.
if test "$try_dev_random" = yes ; then
    AC_CACHE_CHECK(for random device, ac_cv_have_dev_random,
    [if test -r "$NAME_OF_DEV_RANDOM" && test -r "$NAME_OF_DEV_URANDOM" ; then
      ac_cv_have_dev_random=yes; else ac_cv_have_dev_random=no; fi])
    if test "$ac_cv_have_dev_random" = yes; then
        AC_DEFINE(HAVE_DEV_RANDOM,1,
                 [defined if the system supports a random device] )
    fi
else
    AC_MSG_CHECKING(for random device)
    ac_cv_have_dev_random=no
    AC_MSG_RESULT(has been disabled)
fi

# Figure out the random modules for this configuration.
# Preference order: getentropy() if available, then a readable random
# device, then a platform-specific or runtime-selected fallback.
if test "$random" = "default"; then

    # Select default value.
    if test "$ac_cv_func_getentropy" = yes; then
        random_modules="getentropy"
    elif test "$ac_cv_have_dev_random" = yes; then
        # Try Linuxish random device.
        random_modules="linux"
    else
        case "${host}" in
        *-*-mingw32ce*)
          # WindowsCE random device.
          random_modules="w32ce"
          ;;
        *-*-mingw32*|*-*-cygwin*)
          # Windows random device.
          random_modules="w32"
          ;;
        *)
          # Build everything, allow to select at runtime.
          random_modules="$auto_random_modules"
          ;;
        esac
    fi
else
    if test "$random" = "auto"; then
        # Build everything, allow to select at runtime.
        random_modules="$auto_random_modules"
    else
        random_modules="$random"
    fi
fi


#
# Other defines
#
# NOTE(review): mym4_isgit is presumably an m4 macro expanded at autogen
# time to a yes/no literal (it is not a shell variable) -- confirm in the
# part of the file that defines it.
if test mym4_isgit = "yes"; then
    AC_DEFINE(IS_DEVELOPMENT_VERSION,1,
              [Defined if this is not a regular release])
fi


AM_CONDITIONAL(CROSS_COMPILING, test x$cross_compiling = xyes)


# This is handy for debugging so the compiler doesn't rearrange
# things and eliminate variables.
AC_ARG_ENABLE(optimization, AS_HELP_STRING([--disable-optimization], [disable compiler optimization]), [if test $enableval = no ; then CFLAGS=`echo $CFLAGS | sed 's/-O[[0-9]]//'` fi]) AC_MSG_NOTICE([checking for cc features]) # CFLAGS mangling when using gcc. if test "$GCC" = yes; then AC_MSG_CHECKING([if gcc supports -fno-delete-null-pointer-checks]) _gcc_cflags_save=$CFLAGS CFLAGS="-fno-delete-null-pointer-checks" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])],_gcc_wopt=yes,_gcc_wopt=no) AC_MSG_RESULT($_gcc_wopt) CFLAGS=$_gcc_cflags_save; if test x"$_gcc_wopt" = xyes ; then CFLAGS="$CFLAGS -fno-delete-null-pointer-checks" fi CFLAGS="$CFLAGS -Wall" if test "$USE_MAINTAINER_MODE" = "yes"; then CFLAGS="$CFLAGS -Wcast-align -Wshadow -Wstrict-prototypes" CFLAGS="$CFLAGS -Wformat -Wno-format-y2k -Wformat-security" # If -Wno-missing-field-initializers is supported we can enable a # a bunch of really useful warnings. AC_MSG_CHECKING([if gcc supports -Wno-missing-field-initializers]) _gcc_cflags_save=$CFLAGS CFLAGS="-Wno-missing-field-initializers" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])],_gcc_wopt=yes,_gcc_wopt=no) AC_MSG_RESULT($_gcc_wopt) CFLAGS=$_gcc_cflags_save; if test x"$_gcc_wopt" = xyes ; then CFLAGS="$CFLAGS -W -Wextra -Wbad-function-cast" CFLAGS="$CFLAGS -Wwrite-strings" CFLAGS="$CFLAGS -Wdeclaration-after-statement" CFLAGS="$CFLAGS -Wno-missing-field-initializers" CFLAGS="$CFLAGS -Wno-sign-compare" fi AC_MSG_CHECKING([if gcc supports -Wpointer-arith]) _gcc_cflags_save=$CFLAGS CFLAGS="-Wpointer-arith" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])],_gcc_wopt=yes,_gcc_wopt=no) AC_MSG_RESULT($_gcc_wopt) CFLAGS=$_gcc_cflags_save; if test x"$_gcc_wopt" = xyes ; then CFLAGS="$CFLAGS -Wpointer-arith" fi fi fi # Check whether as(1) supports a noeexecstack feature. This test # includes an override option. 
CL_AS_NOEXECSTACK


AC_SUBST(LIBGCRYPT_CONFIG_API_VERSION)
AC_SUBST(LIBGCRYPT_CONFIG_LIBS)
AC_SUBST(LIBGCRYPT_CONFIG_CFLAGS)
AC_SUBST(LIBGCRYPT_CONFIG_HOST)
AC_SUBST(LIBGCRYPT_THREAD_MODULES)

AC_CONFIG_COMMANDS([gcrypt-conf],[[
chmod +x src/libgcrypt-config
]],[[
prefix=$prefix
exec_prefix=$exec_prefix
libdir=$libdir
datadir=$datadir
DATADIRNAME=$DATADIRNAME
]])

#####################
#### Conclusion. ####
#####################

# Check that requested feature can actually be used and define
# ENABLE_foo_SUPPORT macros.
if test x"$aesnisupport" = xyes ; then
  if test "$gcry_cv_gcc_inline_asm_ssse3" != "yes" ; then
    aesnisupport="no (unsupported by compiler)"
  fi
fi
if test x"$shaextsupport" = xyes ; then
  if test "$gcry_cv_gcc_inline_asm_shaext" != "yes" ; then
    shaextsupport="no (unsupported by compiler)"
  fi
fi
if test x"$pclmulsupport" = xyes ; then
  if test "$gcry_cv_gcc_inline_asm_pclmul" != "yes" ; then
    pclmulsupport="no (unsupported by compiler)"
  fi
fi
if test x"$sse41support" = xyes ; then
  if test "$gcry_cv_gcc_inline_asm_sse41" != "yes" ; then
    sse41support="no (unsupported by compiler)"
  fi
fi
if test x"$avxsupport" = xyes ; then
  if test "$gcry_cv_gcc_inline_asm_avx" != "yes" ; then
    avxsupport="no (unsupported by compiler)"
  fi
fi
if test x"$avx2support" = xyes ; then
  if test "$gcry_cv_gcc_inline_asm_avx2" != "yes" ; then
    avx2support="no (unsupported by compiler)"
  fi
fi
if test x"$avx512support" = xyes ; then
  if test "$gcry_cv_gcc_inline_asm_avx512" != "yes" ; then
    avx512support="no (unsupported by compiler)"
  fi
fi
# NEON is usable if either the AArch32 or the AArch64 NEON asm check passed.
if test x"$neonsupport" = xyes ; then
  if test "$gcry_cv_gcc_inline_asm_neon" != "yes" ; then
    if test "$gcry_cv_gcc_inline_asm_aarch64_neon" != "yes" ; then
      neonsupport="no (unsupported by compiler)"
    fi
  fi
fi
# BUG FIX: this block used to clobber 'neonsupport' instead of
# 'armcryptosupport', so a compiler without ARMv8 crypto asm support
# wrongly disabled NEON and left ARM crypto enabled.
if test x"$armcryptosupport" = xyes ; then
  if test "$gcry_cv_gcc_inline_asm_aarch32_crypto" != "yes" ; then
    if test "$gcry_cv_gcc_inline_asm_aarch64_crypto" != "yes" ; then
      armcryptosupport="no (unsupported by compiler)"
    fi
  fi
fi

if test x"$aesnisupport" = xyes ; then
  AC_DEFINE(ENABLE_AESNI_SUPPORT, 1,
            [Enable support for Intel AES-NI instructions.])
fi
if test x"$shaextsupport" = xyes ; then
  AC_DEFINE(ENABLE_SHAEXT_SUPPORT, 1,
            [Enable support for Intel SHAEXT instructions.])
fi
if test x"$pclmulsupport" = xyes ; then
  AC_DEFINE(ENABLE_PCLMUL_SUPPORT, 1,
            [Enable support for Intel PCLMUL instructions.])
fi
if test x"$sse41support" = xyes ; then
  AC_DEFINE(ENABLE_SSE41_SUPPORT, 1,
            [Enable support for Intel SSE4.1 instructions.])
fi
if test x"$avxsupport" = xyes ; then
  AC_DEFINE(ENABLE_AVX_SUPPORT,1,
            [Enable support for Intel AVX instructions.])
fi
if test x"$avx2support" = xyes ; then
  AC_DEFINE(ENABLE_AVX2_SUPPORT,1,
            [Enable support for Intel AVX2 instructions.])
fi
if test x"$avx512support" = xyes ; then
  AC_DEFINE(ENABLE_AVX512_SUPPORT,1,
            [Enable support for Intel AVX512 instructions.])
fi
if test x"$neonsupport" = xyes ; then
  AC_DEFINE(ENABLE_NEON_SUPPORT,1,
            [Enable support for ARM NEON instructions.])
fi
if test x"$armcryptosupport" = xyes ; then
  AC_DEFINE(ENABLE_ARM_CRYPTO_SUPPORT,1,
            [Enable support for ARMv8 Crypto Extension instructions.])
fi
if test x"$ppccryptosupport" = xyes ; then
  AC_DEFINE(ENABLE_PPC_CRYPTO_SUPPORT,1,
            [Enable support for POWER 8 (PowerISA 2.07) crypto extension.])
fi
if test x"$jentsupport" = xyes ; then
  AC_DEFINE(ENABLE_JENT_SUPPORT, 1,
            [Enable support for the jitter entropy collector.])
fi
if test x"$padlocksupport" = xyes ; then
  AC_DEFINE(ENABLE_PADLOCK_SUPPORT, 1,
            [Enable support for the PadLock engine.])
fi
if test x"$drngsupport" = xyes ; then
  AC_DEFINE(ENABLE_DRNG_SUPPORT, 1,
            [Enable support for Intel DRNG (RDRAND instruction).])
fi
if test x"$force_soft_hwfeatures" = xyes ; then
  AC_DEFINE(ENABLE_FORCE_SOFT_HWFEATURES, 1,
            [Enable forcing 'soft' HW feature bits on (for testing).])
fi

# Define conditional sources and config.h symbols depending on the
# selected ciphers, pubkey-ciphers, digests, kdfs, and random modules.
LIST_MEMBER(arcfour, $enabled_ciphers)
if test "$found" = "1"; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS arcfour.lo"
   AC_DEFINE(USE_ARCFOUR, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS arcfour-amd64.lo"
      ;;
   esac
fi

LIST_MEMBER(blowfish, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS blowfish.lo"
   AC_DEFINE(USE_BLOWFISH, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS blowfish-amd64.lo"
      ;;
      arm*-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS blowfish-arm.lo"
      ;;
   esac
fi

LIST_MEMBER(cast5, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS cast5.lo"
   AC_DEFINE(USE_CAST5, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS cast5-amd64.lo"
      ;;
      arm*-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS cast5-arm.lo"
      ;;
   esac
fi

LIST_MEMBER(des, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS des.lo"
   AC_DEFINE(USE_DES, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS des-amd64.lo"
      ;;
   esac
fi

LIST_MEMBER(aes, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael.lo"
   AC_DEFINE(USE_AES, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-amd64.lo"

         # Build with the SSSE3 implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-ssse3-amd64.lo"
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-ssse3-amd64-asm.lo"

         # Build with the VAES/AVX2 implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-vaes.lo"
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-vaes-avx2-amd64.lo"
      ;;
      arm*-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-arm.lo"

         # Build with the ARMv8/AArch32 CE implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-armv8-ce.lo"
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-armv8-aarch32-ce.lo"
      ;;
      aarch64-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-aarch64.lo"

         # Build with the ARMv8/AArch64 CE implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-armv8-ce.lo"
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-armv8-aarch64-ce.lo"
      ;;
      powerpc64le-*-*)
         # Build with the crypto extension implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-ppc.lo"
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-ppc9le.lo"
         if test "$gcry_cv_gcc_inline_asm_ppc_altivec" = "yes" &&
            test "$gcry_cv_gcc_inline_asm_ppc_arch_3_00" = "yes" ; then
            # Build with AES-GCM bulk implementation for P10
            GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-gcm-p10le.lo"
            GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-p10le.lo"
         fi
      ;;
      powerpc64-*-*)
         # Big-Endian.
         # Build with the crypto extension implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-ppc.lo"
      ;;
      powerpc-*-*)
         # Big-Endian.
         # Build with the crypto extension implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-ppc.lo"
      ;;
      s390x-*-*)
         # Big-Endian.
         # Build with the crypto extension implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-s390x.lo"
      ;;
   esac

   case "$mpi_cpu_arch" in
     x86)
         # Build with the AES-NI implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-aesni.lo"

         # Build with the Padlock implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-padlock.lo"
      ;;
   esac
fi

LIST_MEMBER(twofish, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish.lo"
   AC_DEFINE(USE_TWOFISH, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS twofish-amd64.lo"

         if test x"$avx2support" = xyes ; then
            # Build with the AVX2 implementation
            GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS twofish-avx2-amd64.lo"
         fi
      ;;
      arm*-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS twofish-arm.lo"
      ;;
      aarch64-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS twofish-aarch64.lo"
      ;;
   esac
fi

LIST_MEMBER(serpent, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS serpent.lo"
   AC_DEFINE(USE_SERPENT, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the SSE2 implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS serpent-sse2-amd64.lo"
      ;;
   esac

   if test x"$avx2support" = xyes ; then
      # Build with the AVX2 implementation
      GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS serpent-avx2-amd64.lo"
   fi

   if test x"$neonsupport" = xyes ; then
      # Build with the NEON implementation
      GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS serpent-armv7-neon.lo"
   fi
fi

LIST_MEMBER(rfc2268, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS rfc2268.lo"
   AC_DEFINE(USE_RFC2268, 1, [Defined if this module should be included])
fi

LIST_MEMBER(seed, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS seed.lo"
   AC_DEFINE(USE_SEED, 1, [Defined if this module should be included])
fi

LIST_MEMBER(camellia, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia.lo camellia-glue.lo"
   AC_DEFINE(USE_CAMELLIA, 1, [Defined if this module should be included])

   case "${host}" in
      arm*-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-arm.lo"
      ;;
      aarch64-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-aarch64.lo"
      ;;
   esac

   if test x"$avxsupport" = xyes ; then
      if test x"$aesnisupport" = xyes ; then
        # Build with the AES-NI/AVX implementation
        GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-aesni-avx-amd64.lo"
      fi
   fi

   if test x"$avx2support" = xyes ; then
      if test x"$aesnisupport" = xyes ; then
        # Build with the AES-NI/AVX2 implementation
        GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-aesni-avx2-amd64.lo"

        # Build with the VAES/AVX2 implementation
        GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-vaes-avx2-amd64.lo"
      fi
   fi
fi

LIST_MEMBER(idea, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS idea.lo"
   AC_DEFINE(USE_IDEA, 1, [Defined if this module should be included])
fi

LIST_MEMBER(salsa20, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS salsa20.lo"
   AC_DEFINE(USE_SALSA20, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS salsa20-amd64.lo"
      ;;
   esac

   if test x"$neonsupport" = xyes ; then
     # Build with the NEON implementation
     GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS salsa20-armv7-neon.lo"
   fi
fi

LIST_MEMBER(gost28147, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS gost28147.lo"
   AC_DEFINE(USE_GOST28147, 1, [Defined if this module should be included])
fi

LIST_MEMBER(chacha20, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20.lo"
   AC_DEFINE(USE_CHACHA20, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-amd64-ssse3.lo"
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-amd64-avx2.lo"
      ;;
      aarch64-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-aarch64.lo"
      ;;
      powerpc64le-*-*)
         # Build with the ppc8 vector implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-ppc.lo"
      ;;
      powerpc64-*-*)
         # Build with the ppc8 vector implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-ppc.lo"
      ;;
      powerpc-*-*)
         # Build with the ppc8 vector implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-ppc.lo"
      ;;
      s390x-*-*)
         # Build with the s390x/zSeries vector implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-s390x.lo"
      ;;
   esac

   if test x"$neonsupport" = xyes ; then
     # Build with the NEON implementation
     GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-armv7-neon.lo"
   fi
fi

LIST_MEMBER(sm4, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS sm4.lo"
   AC_DEFINE(USE_SM4, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-aesni-avx-amd64.lo"
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-aesni-avx2-amd64.lo"
      ;;
      aarch64-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-aarch64.lo"
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-armv8-aarch64-ce.lo"
      ;;
   esac
fi

LIST_MEMBER(dsa, $enabled_pubkey_ciphers)
if test "$found" = "1" ; then
   GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS dsa.lo"
   AC_DEFINE(USE_DSA, 1, [Defined if this module should be included])
fi

LIST_MEMBER(rsa, $enabled_pubkey_ciphers)
if test "$found" = "1" ; then
   GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS rsa.lo"
   AC_DEFINE(USE_RSA, 1, [Defined if this module should be included])
fi

LIST_MEMBER(elgamal, $enabled_pubkey_ciphers)
if test "$found" = "1" ; then
   GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS elgamal.lo"
   AC_DEFINE(USE_ELGAMAL, 1, [Defined if this module should be included])
fi

LIST_MEMBER(ecc, $enabled_pubkey_ciphers)
if test "$found" = "1" ; then
   GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS \
                          ecc.lo ecc-curves.lo ecc-misc.lo \
                          ecc-ecdh.lo ecc-ecdsa.lo ecc-eddsa.lo ecc-gost.lo \
                          ecc-sm2.lo"
   AC_DEFINE(USE_ECC, 1, [Defined if this module should be included])
fi

LIST_MEMBER(crc, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc.lo"
   AC_DEFINE(USE_CRC, 1, [Defined if this module should be included])

   case "${host}" in
      i?86-*-* | x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS crc-intel-pclmul.lo"
      ;;
      aarch64-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS crc-armv8-ce.lo"
         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS crc-armv8-aarch64-ce.lo"
      ;;
      powerpc64le-*-*)
         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS crc-ppc.lo"
      ;;
      powerpc64-*-*)
         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS crc-ppc.lo"
      ;;
      powerpc-*-*)
         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS crc-ppc.lo"
      ;;
   esac
fi

LIST_MEMBER(gostr3411-94, $enabled_digests)
if test "$found" = "1" ; then
   # GOST R 34.11-94 internally uses GOST 28147-89
   LIST_MEMBER(gost28147, $enabled_ciphers)
   if test "$found" = "1" ; then
      GCRYPT_DIGESTS="$GCRYPT_DIGESTS gostr3411-94.lo"
      AC_DEFINE(USE_GOST_R_3411_94, 1, [Defined if this module should be included])
   fi
fi

LIST_MEMBER(stribog, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS stribog.lo"
   AC_DEFINE(USE_GOST_R_3411_12, 1, [Defined if this module should be included])
fi

LIST_MEMBER(md2, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS md2.lo"
   AC_DEFINE(USE_MD2, 1, [Defined if this module should be included])
fi

LIST_MEMBER(md4, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS md4.lo"
   AC_DEFINE(USE_MD4, 1, [Defined if this module should be included])
fi

LIST_MEMBER(md5, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS md5.lo"
   AC_DEFINE(USE_MD5, 1, [Defined if this module should be included])
fi

LIST_MEMBER(rmd160, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS rmd160.lo"
   AC_DEFINE(USE_RMD160, 1, [Defined if this module should be included])
fi

LIST_MEMBER(sha256, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256.lo"
   AC_DEFINE(USE_SHA256, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-ssse3-amd64.lo"
         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-avx-amd64.lo"
         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-avx2-bmi2-amd64.lo"
      ;;
      arm*-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-armv8-aarch32-ce.lo"
      ;;
      aarch64-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-armv8-aarch64-ce.lo"
      ;;
      powerpc64le-*-*)
         # Build with the crypto extension implementation
         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-ppc.lo"
      ;;
      powerpc64-*-*)
         # Big-Endian.
         # Build with the crypto extension implementation
         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-ppc.lo"
      ;;
      powerpc-*-*)
         # Big-Endian.
         # Build with the crypto extension implementation
         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-ppc.lo"
      ;;
   esac

   case "$mpi_cpu_arch" in
     x86)
       # Build with the SHAEXT implementation
       GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-intel-shaext.lo"
     ;;
   esac
fi

LIST_MEMBER(sha512, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512.lo"
   AC_DEFINE(USE_SHA512, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-ssse3-amd64.lo"
         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-avx-amd64.lo"
         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-avx2-bmi2-amd64.lo"
         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-avx512-amd64.lo"
      ;;
      i?86-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-ssse3-i386.lo"
      ;;
      arm*-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-arm.lo"
      ;;
      powerpc64le-*-*)
         # Build with the crypto extension implementation
         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-ppc.lo"
      ;;
      powerpc64-*-*)
         # Big-Endian.
         # Build with the crypto extension implementation
         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-ppc.lo"
      ;;
      powerpc-*-*)
         # Big-Endian.
         # Build with the crypto extension implementation
         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-ppc.lo"
      ;;
   esac

   if test x"$neonsupport" = xyes ; then
     # Build with the NEON implementation
     GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-armv7-neon.lo"
   fi
fi

LIST_MEMBER(sha3, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS keccak.lo"
   AC_DEFINE(USE_SHA3, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         :
      ;;
   esac

   if test x"$neonsupport" = xyes ; then
     # Build with the NEON implementation
     GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS keccak-armv7-neon.lo"
   fi
fi

LIST_MEMBER(tiger, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS tiger.lo"
   AC_DEFINE(USE_TIGER, 1, [Defined if this module should be included])
fi

LIST_MEMBER(whirlpool, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS whirlpool.lo"
   AC_DEFINE(USE_WHIRLPOOL, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS whirlpool-sse2-amd64.lo"
      ;;
   esac
fi

LIST_MEMBER(blake2, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS blake2.lo"
   AC_DEFINE(USE_BLAKE2, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS blake2b-amd64-avx2.lo"
         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS blake2s-amd64-avx.lo"
      ;;
   esac
fi

LIST_MEMBER(sm3, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS sm3.lo"
   AC_DEFINE(USE_SM3, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sm3-avx-bmi2-amd64.lo"
      ;;
      aarch64-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sm3-aarch64.lo"
         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sm3-armv8-aarch64-ce.lo"
      ;;
   esac
fi

# SHA-1 needs to be included always for example because it is used by
# random-csprng.c.
GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1.lo"
AC_DEFINE(USE_SHA1, 1, [Defined if this module should be included])

case "${host}" in
  x86_64-*-*)
    # Build with the assembly implementation
    GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-ssse3-amd64.lo"
    GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-avx-amd64.lo"
    GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-avx-bmi2-amd64.lo"
    GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-avx2-bmi2-amd64.lo"
  ;;
  arm*-*-*)
    # Build with the assembly implementation
    GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-armv7-neon.lo"
    GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-armv8-aarch32-ce.lo"
  ;;
  aarch64-*-*)
    # Build with the assembly implementation
    GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-armv8-aarch64-ce.lo"
  ;;
esac

case "$mpi_cpu_arch" in
  x86)
    # Build with the SHAEXT implementation
    GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-intel-shaext.lo"
  ;;
esac

# Arch specific GCM implementations
case "${host}" in
  i?86-*-* | x86_64-*-*)
    GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS cipher-gcm-intel-pclmul.lo"
  ;;
  arm*-*-*)
    GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS cipher-gcm-armv7-neon.lo"
    GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS cipher-gcm-armv8-aarch32-ce.lo"
  ;;
  aarch64-*-*)
    GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS cipher-gcm-armv8-aarch64-ce.lo"
  ;;
  powerpc64le-*-* | powerpc64-*-* | powerpc-*-*)
    GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS cipher-gcm-ppc.lo"
  ;;
esac

# Arch specific MAC implementations
# (Fixed: stray '+' diff markers left in the x86_64 branch have been
# removed; they would have been syntax errors in the configure script.)
case "${host}" in
  s390x-*-*)
    GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS poly1305-s390x.lo"
  ;;
  x86_64-*-*)
    GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS poly1305-amd64-avx512.lo"
  ;;
esac

LIST_MEMBER(scrypt, $enabled_kdfs)
if test "$found" = "1" ; then
   GCRYPT_KDFS="$GCRYPT_KDFS scrypt.lo"
   AC_DEFINE(USE_SCRYPT, 1, [Defined if this module should be included])
fi

LIST_MEMBER(getentropy, $random_modules)
if test "$found" = "1" ; then
   GCRYPT_RANDOM="$GCRYPT_RANDOM rndgetentropy.lo"
   AC_DEFINE(USE_RNDGETENTROPY, 1, [Defined if the getentropy RNG should be used.])
fi

LIST_MEMBER(linux, $random_modules)
if test "$found" = "1" ; then
   GCRYPT_RANDOM="$GCRYPT_RANDOM rndoldlinux.lo"
   AC_DEFINE(USE_RNDOLDLINUX, 1, [Defined if the /dev/random RNG should be used.])
fi

LIST_MEMBER(unix, $random_modules)
if test "$found" = "1" ; then
   GCRYPT_RANDOM="$GCRYPT_RANDOM rndunix.lo"
   AC_DEFINE(USE_RNDUNIX, 1, [Defined if the default Unix RNG should be used.])
fi

LIST_MEMBER(egd, $random_modules)
if test "$found" = "1" ; then
   GCRYPT_RANDOM="$GCRYPT_RANDOM rndegd.lo"
   AC_DEFINE(USE_RNDEGD, 1, [Defined if the EGD based RNG should be used.])
fi

LIST_MEMBER(w32, $random_modules)
if test "$found" = "1" ; then
   GCRYPT_RANDOM="$GCRYPT_RANDOM rndw32.lo"
   AC_DEFINE(USE_RNDW32, 1,
             [Defined if the Windows specific RNG should be used.])
fi

LIST_MEMBER(w32ce, $random_modules)
if test "$found" = "1" ; then
   GCRYPT_RANDOM="$GCRYPT_RANDOM rndw32ce.lo"
   AC_DEFINE(USE_RNDW32CE, 1,
             [Defined if the WindowsCE specific RNG should be used.])
fi

if test "$try_asm_modules" = yes ; then
   # Build with assembly implementations
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS $GCRYPT_ASM_CIPHERS"
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS $GCRYPT_ASM_DIGESTS"
fi

AC_SUBST([GCRYPT_CIPHERS])
AC_SUBST([GCRYPT_PUBKEY_CIPHERS])
AC_SUBST([GCRYPT_DIGESTS])
AC_SUBST([GCRYPT_KDFS])
AC_SUBST([GCRYPT_RANDOM])

AC_SUBST(LIBGCRYPT_CIPHERS, $enabled_ciphers)
AC_SUBST(LIBGCRYPT_PUBKEY_CIPHERS, $enabled_pubkey_ciphers)
AC_SUBST(LIBGCRYPT_DIGESTS, $enabled_digests)

# For printing the configuration we need a colon separated list of
# algorithm names.
tmp=`echo "$enabled_ciphers" | tr ' ' : ` AC_DEFINE_UNQUOTED(LIBGCRYPT_CIPHERS, "$tmp", [List of available cipher algorithms]) tmp=`echo "$enabled_pubkey_ciphers" | tr ' ' : ` AC_DEFINE_UNQUOTED(LIBGCRYPT_PUBKEY_CIPHERS, "$tmp", [List of available public key cipher algorithms]) tmp=`echo "$enabled_digests" | tr ' ' : ` AC_DEFINE_UNQUOTED(LIBGCRYPT_DIGESTS, "$tmp", [List of available digest algorithms]) tmp=`echo "$enabled_kdfs" | tr ' ' : ` AC_DEFINE_UNQUOTED(LIBGCRYPT_KDFS, "$tmp", [List of available KDF algorithms]) # # Define conditional sources depending on the used hardware platform. # Note that all possible modules must also be listed in # src/Makefile.am (EXTRA_libgcrypt_la_SOURCES). # GCRYPT_HWF_MODULES= case "$mpi_cpu_arch" in x86) AC_DEFINE(HAVE_CPU_ARCH_X86, 1, [Defined for the x86 platforms]) GCRYPT_HWF_MODULES="libgcrypt_la-hwf-x86.lo" ;; alpha) AC_DEFINE(HAVE_CPU_ARCH_ALPHA, 1, [Defined for Alpha platforms]) ;; sparc) AC_DEFINE(HAVE_CPU_ARCH_SPARC, 1, [Defined for SPARC platforms]) ;; mips) AC_DEFINE(HAVE_CPU_ARCH_MIPS, 1, [Defined for MIPS platforms]) ;; m68k) AC_DEFINE(HAVE_CPU_ARCH_M68K, 1, [Defined for M68k platforms]) ;; ppc) AC_DEFINE(HAVE_CPU_ARCH_PPC, 1, [Defined for PPC platforms]) GCRYPT_HWF_MODULES="libgcrypt_la-hwf-ppc.lo" ;; arm) AC_DEFINE(HAVE_CPU_ARCH_ARM, 1, [Defined for ARM platforms]) GCRYPT_HWF_MODULES="libgcrypt_la-hwf-arm.lo" ;; aarch64) AC_DEFINE(HAVE_CPU_ARCH_ARM, 1, [Defined for ARM AArch64 platforms]) GCRYPT_HWF_MODULES="libgcrypt_la-hwf-arm.lo" ;; s390x) AC_DEFINE(HAVE_CPU_ARCH_S390X, 1, [Defined for s390x/zSeries platforms]) GCRYPT_HWF_MODULES="libgcrypt_la-hwf-s390x.lo" ;; esac AC_SUBST([GCRYPT_HWF_MODULES]) # # Option to disable building of doc file # build_doc=yes AC_ARG_ENABLE([doc], AS_HELP_STRING([--disable-doc], [do not build the documentation]), build_doc=$enableval, build_doc=yes) AM_CONDITIONAL([BUILD_DOC], [test "x$build_doc" != xno]) # # Provide information about the build. 
# BUILD_REVISION="mym4_revision" AC_SUBST(BUILD_REVISION) AC_DEFINE_UNQUOTED(BUILD_REVISION, "$BUILD_REVISION", [GIT commit id revision used to build this package]) changequote(,)dnl BUILD_VERSION=`echo "$PACKAGE_VERSION" | sed 's/\([0-9.]*\).*/\1./'` changequote([,])dnl BUILD_VERSION="${BUILD_VERSION}mym4_revision_dec" BUILD_FILEVERSION=`echo "${BUILD_VERSION}" | tr . ,` AC_SUBST(BUILD_VERSION) AC_SUBST(BUILD_FILEVERSION) AC_ARG_ENABLE([build-timestamp], AS_HELP_STRING([--enable-build-timestamp], [set an explicit build timestamp for reproducibility. (default is the current time in ISO-8601 format)]), [if test "$enableval" = "yes"; then BUILD_TIMESTAMP=`date -u +%Y-%m-%dT%H:%M+0000 2>/dev/null || date` else BUILD_TIMESTAMP="$enableval" fi], [BUILD_TIMESTAMP=""]) AC_SUBST(BUILD_TIMESTAMP) AC_DEFINE_UNQUOTED(BUILD_TIMESTAMP, "$BUILD_TIMESTAMP", [The time this package was configured for a build]) # And create the files. AC_CONFIG_FILES([ Makefile m4/Makefile compat/Makefile mpi/Makefile cipher/Makefile random/Makefile doc/Makefile src/Makefile src/gcrypt.h src/libgcrypt-config src/libgcrypt.pc src/versioninfo.rc tests/Makefile ]) AC_CONFIG_FILES([tests/hashtest-256g], [chmod +x tests/hashtest-256g]) AC_CONFIG_FILES([tests/basic-disable-all-hwf], [chmod +x tests/basic-disable-all-hwf]) AC_OUTPUT detection_module="${GCRYPT_HWF_MODULES%.lo}" test -n "$detection_module" || detection_module="none" # Give some feedback GCRY_MSG_SHOW([],[]) GCRY_MSG_SHOW([Libgcrypt],[v${VERSION} has been configured as follows:]) GCRY_MSG_SHOW([],[]) GCRY_MSG_SHOW([Platform: ],[$PRINTABLE_OS_NAME ($host)]) GCRY_MSG_SHOW([Hardware detection module:],[$detection_module]) GCRY_MSG_WRAP([Enabled cipher algorithms:],[$enabled_ciphers]) GCRY_MSG_WRAP([Enabled digest algorithms:],[$enabled_digests]) GCRY_MSG_WRAP([Enabled kdf algorithms: ],[$enabled_kdfs]) GCRY_MSG_WRAP([Enabled pubkey algorithms:],[$enabled_pubkey_ciphers]) GCRY_MSG_SHOW([Random number generator: ],[$random]) GCRY_MSG_SHOW([Try 
using jitter entropy: ],[$jentsupport]) GCRY_MSG_SHOW([Using linux capabilities: ],[$use_capabilities]) GCRY_MSG_SHOW([FIPS module version: ],[$fips_module_version]) GCRY_MSG_SHOW([Try using Padlock crypto: ],[$padlocksupport]) GCRY_MSG_SHOW([Try using AES-NI crypto: ],[$aesnisupport]) GCRY_MSG_SHOW([Try using Intel SHAEXT: ],[$shaextsupport]) GCRY_MSG_SHOW([Try using Intel PCLMUL: ],[$pclmulsupport]) GCRY_MSG_SHOW([Try using Intel SSE4.1: ],[$sse41support]) GCRY_MSG_SHOW([Try using DRNG (RDRAND): ],[$drngsupport]) GCRY_MSG_SHOW([Try using Intel AVX: ],[$avxsupport]) GCRY_MSG_SHOW([Try using Intel AVX2: ],[$avx2support]) GCRY_MSG_SHOW([Try using Intel AVX512: ],[$avx512support]) GCRY_MSG_SHOW([Try using ARM NEON: ],[$neonsupport]) GCRY_MSG_SHOW([Try using ARMv8 crypto: ],[$armcryptosupport]) GCRY_MSG_SHOW([Try using PPC crypto: ],[$ppccryptosupport]) GCRY_MSG_SHOW([],[]) if test "x${gpg_config_script_warn}" != x; then cat <