diff --git a/cipher/sha256-avx-amd64.S b/cipher/sha256-avx-amd64.S
index 77143ff0..ec945f84 100644
--- a/cipher/sha256-avx-amd64.S
+++ b/cipher/sha256-avx-amd64.S
@@ -1,532 +1,506 @@
 /*
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; Copyright (c) 2012, Intel Corporation
 ;
 ; All rights reserved.
 ;
 ; Redistribution and use in source and binary forms, with or without
 ; modification, are permitted provided that the following conditions are
 ; met:
 ;
 ; * Redistributions of source code must retain the above copyright
 ;   notice, this list of conditions and the following disclaimer.
 ;
 ; * Redistributions in binary form must reproduce the above copyright
 ;   notice, this list of conditions and the following disclaimer in the
 ;   documentation and/or other materials provided with the
 ;   distribution.
 ;
 ; * Neither the name of the Intel Corporation nor the names of its
 ;   contributors may be used to endorse or promote products derived from
 ;   this software without specific prior written permission.
 ;
 ;
 ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
 ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 ; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
 ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;
 ; This code is described in an Intel White-Paper:
 ; "Fast SHA-256 Implementations on Intel Architecture Processors"
 ;
 ; To find it, surf to http://www.intel.com/p/en_US/embedded
 ; and search for that title.
 ; The paper is expected to be released roughly at the end of April, 2012
 ;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; This code schedules 1 blocks at a time, with 4 lanes per block
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 */
 /*
  * Conversion to GAS assembly and integration to libgcrypt
  *  by Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * Note: Based on the SSSE3 implementation.
  */
 
 #ifdef __x86_64
 #include <config.h>
 #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
      defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
     defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
     defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA256)
 
 #include "asm-common-amd64.h"
 
 .intel_syntax noprefix
 
 #define	VMOVDQ vmovdqu /* assume buffers not aligned */
 
-.macro ROR p1 p2
-	/* shld is faster than ror on Intel Sandybridge */
-	shld	\p1, \p1, (32 - \p2)
-.endm
+#define ROR(p1, p2) \
+	/* shld is faster than ror on Intel Sandybridge */ \
+	shld	p1, p1, (32 - p2);
 
 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros*/
 
 /* addm [mem], reg
  * Add reg to mem using reg-mem add and store */
-.macro addm p1 p2
-	add	\p2, \p1
-	mov	\p1, \p2
-.endm
+#define addm(p1, p2) \
+	add	p2, p1; \
+	mov	p1, p2;
 
 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
 
 /* COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
  * Load xmm with mem and byte swap each dword */
-.macro COPY_XMM_AND_BSWAP p1 p2 p3
-	VMOVDQ \p1, \p2
-	vpshufb \p1, \p1, \p3
-.endm
+#define COPY_XMM_AND_BSWAP(p1, p2, p3) \
+	VMOVDQ p1, p2; \
+	vpshufb p1, p1, p3;
 
 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
 
-X0 = xmm4
-X1 = xmm5
-X2 = xmm6
-X3 = xmm7
+#define X0 xmm4
+#define X1 xmm5
+#define X2 xmm6
+#define X3 xmm7
 
-XTMP0 = xmm0
-XTMP1 = xmm1
-XTMP2 = xmm2
-XTMP3 = xmm3
-XTMP4 = xmm8
-XFER  = xmm9
+#define XTMP0 xmm0
+#define XTMP1 xmm1
+#define XTMP2 xmm2
+#define XTMP3 xmm3
+#define XTMP4 xmm8
+#define XFER xmm9
 
-SHUF_00BA = xmm10 /* shuffle xBxA -> 00BA */
-SHUF_DC00 = xmm11 /* shuffle xDxC -> DC00 */
-BYTE_FLIP_MASK = xmm12
+#define SHUF_00BA xmm10 /* shuffle xBxA -> 00BA */
+#define SHUF_DC00 xmm11 /* shuffle xDxC -> DC00 */
+#define BYTE_FLIP_MASK xmm12
 
-NUM_BLKS = rdx	/* 3rd arg */
-CTX = rsi	/* 2nd arg */
-INP = rdi	/* 1st arg */
+#define NUM_BLKS rdx	/* 3rd arg */
+#define CTX rsi	/* 2nd arg */
+#define INP rdi	/* 1st arg */
 
-SRND = rdi	/* clobbers INP */
-c = ecx
-d = r8d
-e = edx
+#define SRND rdi	/* clobbers INP */
+#define c ecx
+#define d r8d
+#define e edx
 
-TBL = rbp
-a = eax
-b = ebx
+#define TBL rbp
+#define a eax
+#define b ebx
 
-f = r9d
-g = r10d
-h = r11d
+#define f r9d
+#define g r10d
+#define h r11d
 
-y0 = r13d
-y1 = r14d
-y2 = r15d
+#define y0 r13d
+#define y1 r14d
+#define y2 r15d
 
 
 
 #define _INP_END_SIZE	8
 #define _INP_SIZE	8
 #define _XFER_SIZE	8
 #define _XMM_SAVE_SIZE	0
 /* STACK_SIZE plus pushes must be an odd multiple of 8 */
 #define _ALIGN_SIZE	8
 
 #define _INP_END	0
 #define _INP		(_INP_END  + _INP_END_SIZE)
 #define _XFER		(_INP      + _INP_SIZE)
 #define _XMM_SAVE	(_XFER     + _XFER_SIZE + _ALIGN_SIZE)
 #define STACK_SIZE	(_XMM_SAVE + _XMM_SAVE_SIZE)
 
-/* rotate_Xs
- * Rotate values of symbols X0...X3 */
-.macro rotate_Xs
-X_ = X0
-X0 = X1
-X1 = X2
-X2 = X3
-X3 = X_
-.endm
-
-/* ROTATE_ARGS
- * Rotate values of symbols a...h */
-.macro ROTATE_ARGS
-TMP_ = h
-h = g
-g = f
-f = e
-e = d
-d = c
-c = b
-b = a
-a = TMP_
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED
-		/* compute s0 four at a time and s1 two at a time
-		 * compute W[-16] + W[-7] 4 at a time */
-	mov	y0, e		/* y0 = e */
-	ROR	y0, (25-11)	/* y0 = e >> (25-11) */
-	mov	y1, a		/* y1 = a */
-		vpalignr	XTMP0, X3, X2, 4	/* XTMP0 = W[-7] */
-	ROR	y1, (22-13)	/* y1 = a >> (22-13) */
-	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */
-	mov	y2, f		/* y2 = f */
-	ROR	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
-	xor	y1, a		/* y1 = a ^ (a >> (22-13) */
-	xor	y2, g		/* y2 = f^g */
-		vpaddd	XTMP0, XTMP0, X0	/* XTMP0 = W[-7] + W[-16] */
-	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
-	and	y2, e		/* y2 = (f^g)&e */
-	ROR	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
-		/* compute s0 */
-		vpalignr	XTMP1, X1, X0, 4	/* XTMP1 = W[-15] */
-	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
-	ROR	y0, 6		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
-	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */
-	ROR	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
-	add	y2, y0		/* y2 = S1 + CH */
-	add	y2, [rsp + _XFER + 0*4]	/* y2 = k + w + S1 + CH */
-	mov	y0, a		/* y0 = a */
-	add	h, y2		/* h = h + S1 + CH + k + w */
-	mov	y2, a		/* y2 = a */
-		vpslld	XTMP2, XTMP1, (32-7)
-	or	y0, c		/* y0 = a|c */
-	add	d, h		/* d = d + h + S1 + CH + k + w */
-	and	y2, c		/* y2 = a&c */
-		vpsrld	XTMP3, XTMP1, 7
-	and	y0, b		/* y0 = (a|c)&b */
-	add	h, y1		/* h = h + S1 + CH + k + w + S0 */
-		vpor	XTMP3, XTMP3, XTMP2	/* XTMP1 = W[-15] ror 7 */
-	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */
+
+#define FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+		/* compute s0 four at a time and s1 two at a time */; \
+		/* compute W[-16] + W[-7] 4 at a time */; \
+	mov	y0, e		/* y0 = e */; \
+	ROR(	y0, (25-11))	/* y0 = e >> (25-11) */; \
+	mov	y1, a		/* y1 = a */; \
+		vpalignr	XTMP0, X3, X2, 4	/* XTMP0 = W[-7] */; \
+	ROR(	y1, (22-13))	/* y1 = a >> (22-13) */; \
+	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */; \
+	mov	y2, f		/* y2 = f */; \
+	ROR(	y0, (11-6))	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+	xor	y1, a		/* y1 = a ^ (a >> (22-13) */; \
+	xor	y2, g		/* y2 = f^g */; \
+		vpaddd	XTMP0, XTMP0, X0	/* XTMP0 = W[-7] + W[-16] */; \
+	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+	and	y2, e		/* y2 = (f^g)&e */; \
+	ROR(	y1, (13-2))	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+		/* compute s0 */; \
+		vpalignr	XTMP1, X1, X0, 4	/* XTMP1 = W[-15] */; \
+	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+	ROR(	y0, 6)		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */; \
+	ROR(	y1, 2)		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+	add	y2, y0		/* y2 = S1 + CH */; \
+	add	y2, [rsp + _XFER + 0*4]	/* y2 = k + w + S1 + CH */; \
+	mov	y0, a		/* y0 = a */; \
+	add	h, y2		/* h = h + S1 + CH + k + w */; \
+	mov	y2, a		/* y2 = a */; \
+		vpslld	XTMP2, XTMP1, (32-7); \
+	or	y0, c		/* y0 = a|c */; \
+	add	d, h		/* d = d + h + S1 + CH + k + w */; \
+	and	y2, c		/* y2 = a&c */; \
+		vpsrld	XTMP3, XTMP1, 7; \
+	and	y0, b		/* y0 = (a|c)&b */; \
+	add	h, y1		/* h = h + S1 + CH + k + w + S0 */; \
+		vpor	XTMP3, XTMP3, XTMP2	/* XTMP1 = W[-15] ror 7 */; \
+	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */; \
 	lea	h, [h + y0]	/* h = h + S1 + CH + k + w + S0 + MAJ */
 
-ROTATE_ARGS
-	mov	y0, e		/* y0 = e */
-	mov	y1, a		/* y1 = a */
-	ROR	y0, (25-11)	/* y0 = e >> (25-11) */
-	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */
-	mov	y2, f		/* y2 = f */
-	ROR	y1, (22-13)	/* y1 = a >> (22-13) */
-		vpslld	XTMP2, XTMP1, (32-18)
-	xor	y1, a		/* y1 = a ^ (a >> (22-13) */
-	ROR	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
-	xor	y2, g		/* y2 = f^g */
-		vpsrld	XTMP4, XTMP1, 18
-	ROR	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
-	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
-	and	y2, e		/* y2 = (f^g)&e */
-	ROR	y0, 6		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
-		vpxor	XTMP4, XTMP4, XTMP3
-	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
-	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */
-		vpsrld	XTMP1, XTMP1, 3	/* XTMP4 = W[-15] >> 3 */
-	add	y2, y0		/* y2 = S1 + CH */
-	add	y2, [rsp + _XFER + 1*4]	/* y2 = k + w + S1 + CH */
-	ROR	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
-		vpxor	XTMP1, XTMP1, XTMP2	/* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */
-	mov	y0, a		/* y0 = a */
-	add	h, y2		/* h = h + S1 + CH + k + w */
-	mov	y2, a		/* y2 = a */
-		vpxor	XTMP1, XTMP1, XTMP4	/* XTMP1 = s0 */
-	or	y0, c		/* y0 = a|c */
-	add	d, h		/* d = d + h + S1 + CH + k + w */
-	and	y2, c		/* y2 = a&c */
-		/* compute low s1 */
-		vpshufd	XTMP2, X3, 0b11111010	/* XTMP2 = W[-2] {BBAA} */
-	and	y0, b		/* y0 = (a|c)&b */
-	add	h, y1		/* h = h + S1 + CH + k + w + S0 */
-		vpaddd	XTMP0, XTMP0, XTMP1	/* XTMP0 = W[-16] + W[-7] + s0 */
-	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */
+#define FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+	mov	y0, e		/* y0 = e */; \
+	mov	y1, a		/* y1 = a */; \
+	ROR(	y0, (25-11))	/* y0 = e >> (25-11) */; \
+	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */; \
+	mov	y2, f		/* y2 = f */; \
+	ROR(	y1, (22-13))	/* y1 = a >> (22-13) */; \
+		vpslld	XTMP2, XTMP1, (32-18); \
+	xor	y1, a		/* y1 = a ^ (a >> (22-13) */; \
+	ROR(	y0, (11-6))	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+	xor	y2, g		/* y2 = f^g */; \
+		vpsrld	XTMP4, XTMP1, 18; \
+	ROR(	y1, (13-2))	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+	and	y2, e		/* y2 = (f^g)&e */; \
+	ROR(	y0, 6)		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+		vpxor	XTMP4, XTMP4, XTMP3; \
+	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */; \
+		vpsrld	XTMP1, XTMP1, 3	/* XTMP4 = W[-15] >> 3 */; \
+	add	y2, y0		/* y2 = S1 + CH */; \
+	add	y2, [rsp + _XFER + 1*4]	/* y2 = k + w + S1 + CH */; \
+	ROR(	y1, 2)		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+		vpxor	XTMP1, XTMP1, XTMP2	/* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */; \
+	mov	y0, a		/* y0 = a */; \
+	add	h, y2		/* h = h + S1 + CH + k + w */; \
+	mov	y2, a		/* y2 = a */; \
+		vpxor	XTMP1, XTMP1, XTMP4	/* XTMP1 = s0 */; \
+	or	y0, c		/* y0 = a|c */; \
+	add	d, h		/* d = d + h + S1 + CH + k + w */; \
+	and	y2, c		/* y2 = a&c */; \
+		/* compute low s1 */; \
+		vpshufd	XTMP2, X3, 0b11111010	/* XTMP2 = W[-2] {BBAA} */; \
+	and	y0, b		/* y0 = (a|c)&b */; \
+	add	h, y1		/* h = h + S1 + CH + k + w + S0 */; \
+		vpaddd	XTMP0, XTMP0, XTMP1	/* XTMP0 = W[-16] + W[-7] + s0 */; \
+	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */; \
 	lea	h, [h + y0]	/* h = h + S1 + CH + k + w + S0 + MAJ */
 
-ROTATE_ARGS
-	mov	y0, e		/* y0 = e */
-	mov	y1, a		/* y1 = a */
-	ROR	y0, (25-11)	/* y0 = e >> (25-11) */
-	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */
-	ROR	y1, (22-13)	/* y1 = a >> (22-13) */
-	mov	y2, f		/* y2 = f */
-	xor	y1, a		/* y1 = a ^ (a >> (22-13) */
-	ROR	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
-		vpsrlq	XTMP3, XTMP2, 17	/* XTMP2 = W[-2] ror 17 {xBxA} */
-	xor	y2, g		/* y2 = f^g */
-		vpsrlq	XTMP4, XTMP2, 19	/* XTMP3 = W[-2] ror 19 {xBxA} */
-	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
-	and	y2, e		/* y2 = (f^g)&e */
-		vpsrld	XTMP2, XTMP2, 10	/* XTMP4 = W[-2] >> 10 {BBAA} */
-	ROR	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
-	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
-	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */
-	ROR	y0, 6		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
-		vpxor	XTMP2, XTMP2, XTMP3
-	add	y2, y0		/* y2 = S1 + CH */
-	ROR	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
-	add	y2, [rsp + _XFER + 2*4]	/* y2 = k + w + S1 + CH */
-		vpxor	XTMP4, XTMP4, XTMP2	/* XTMP4 = s1 {xBxA} */
-	mov	y0, a		/* y0 = a */
-	add	h, y2		/* h = h + S1 + CH + k + w */
-	mov	y2, a		/* y2 = a */
-		vpshufb	XTMP4, XTMP4, SHUF_00BA	/* XTMP4 = s1 {00BA} */
-	or	y0, c		/* y0 = a|c */
-	add	d, h		/* d = d + h + S1 + CH + k + w */
-	and	y2, c		/* y2 = a&c */
-		vpaddd	XTMP0, XTMP0, XTMP4	/* XTMP0 = {..., ..., W[1], W[0]} */
-	and	y0, b		/* y0 = (a|c)&b */
-	add	h, y1		/* h = h + S1 + CH + k + w + S0 */
-		/* compute high s1 */
-		vpshufd	XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */
-	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */
+#define FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+	mov	y0, e		/* y0 = e */; \
+	mov	y1, a		/* y1 = a */; \
+	ROR(	y0, (25-11))	/* y0 = e >> (25-11) */; \
+	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */; \
+	ROR(	y1, (22-13))	/* y1 = a >> (22-13) */; \
+	mov	y2, f		/* y2 = f */; \
+	xor	y1, a		/* y1 = a ^ (a >> (22-13) */; \
+	ROR(	y0, (11-6))	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+		vpsrlq	XTMP3, XTMP2, 17	/* XTMP2 = W[-2] ror 17 {xBxA} */; \
+	xor	y2, g		/* y2 = f^g */; \
+		vpsrlq	XTMP4, XTMP2, 19	/* XTMP3 = W[-2] ror 19 {xBxA} */; \
+	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+	and	y2, e		/* y2 = (f^g)&e */; \
+		vpsrld	XTMP2, XTMP2, 10	/* XTMP4 = W[-2] >> 10 {BBAA} */; \
+	ROR(	y1, (13-2))	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */; \
+	ROR(	y0, 6)		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+		vpxor	XTMP2, XTMP2, XTMP3; \
+	add	y2, y0		/* y2 = S1 + CH */; \
+	ROR(	y1, 2)		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+	add	y2, [rsp + _XFER + 2*4]	/* y2 = k + w + S1 + CH */; \
+		vpxor	XTMP4, XTMP4, XTMP2	/* XTMP4 = s1 {xBxA} */; \
+	mov	y0, a		/* y0 = a */; \
+	add	h, y2		/* h = h + S1 + CH + k + w */; \
+	mov	y2, a		/* y2 = a */; \
+		vpshufb	XTMP4, XTMP4, SHUF_00BA	/* XTMP4 = s1 {00BA} */; \
+	or	y0, c		/* y0 = a|c */; \
+	add	d, h		/* d = d + h + S1 + CH + k + w */; \
+	and	y2, c		/* y2 = a&c */; \
+		vpaddd	XTMP0, XTMP0, XTMP4	/* XTMP0 = {..., ..., W[1], W[0]} */; \
+	and	y0, b		/* y0 = (a|c)&b */; \
+	add	h, y1		/* h = h + S1 + CH + k + w + S0 */; \
+		/* compute high s1 */; \
+		vpshufd	XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */; \
+	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */; \
 	lea	h, [h + y0]	/* h = h + S1 + CH + k + w + S0 + MAJ */
 
-ROTATE_ARGS
-	mov	y0, e		/* y0 = e */
-	ROR	y0, (25-11)	/* y0 = e >> (25-11) */
-	mov	y1, a		/* y1 = a */
-	ROR	y1, (22-13)	/* y1 = a >> (22-13) */
-	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */
-	mov	y2, f		/* y2 = f */
-	ROR	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
-		vpsrlq	XTMP3, XTMP2, 17	/* XTMP2 = W[-2] ror 17 {xDxC} */
-	xor	y1, a		/* y1 = a ^ (a >> (22-13) */
-	xor	y2, g		/* y2 = f^g */
-		vpsrlq	X0, XTMP2, 19	/* XTMP3 = W[-2] ror 19 {xDxC} */
-	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
-	and	y2, e		/* y2 = (f^g)&e */
-	ROR	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
-		vpsrld	XTMP2, XTMP2,    10	/* X0 = W[-2] >> 10 {DDCC} */
-	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
-	ROR	y0, 6		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
-	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */
-		vpxor	XTMP2, XTMP2, XTMP3
-	ROR	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
-	add	y2, y0		/* y2 = S1 + CH */
-	add	y2, [rsp + _XFER + 3*4]	/* y2 = k + w + S1 + CH */
-		vpxor	X0, X0, XTMP2	/* X0 = s1 {xDxC} */
-	mov	y0, a		/* y0 = a */
-	add	h, y2		/* h = h + S1 + CH + k + w */
-	mov	y2, a		/* y2 = a */
-		vpshufb	X0, X0, SHUF_DC00	/* X0 = s1 {DC00} */
-	or	y0, c		/* y0 = a|c */
-	add	d, h		/* d = d + h + S1 + CH + k + w */
-	and	y2, c		/* y2 = a&c */
-		vpaddd	X0, X0, XTMP0	/* X0 = {W[3], W[2], W[1], W[0]} */
-	and	y0, b		/* y0 = (a|c)&b */
-	add	h, y1		/* h = h + S1 + CH + k + w + S0 */
-	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */
+#define FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+	mov	y0, e		/* y0 = e */; \
+	ROR(	y0, (25-11))	/* y0 = e >> (25-11) */; \
+	mov	y1, a		/* y1 = a */; \
+	ROR(	y1, (22-13))	/* y1 = a >> (22-13) */; \
+	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */; \
+	mov	y2, f		/* y2 = f */; \
+	ROR(	y0, (11-6))	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+		vpsrlq	XTMP3, XTMP2, 17	/* XTMP2 = W[-2] ror 17 {xDxC} */; \
+	xor	y1, a		/* y1 = a ^ (a >> (22-13) */; \
+	xor	y2, g		/* y2 = f^g */; \
+		vpsrlq	X0, XTMP2, 19	/* XTMP3 = W[-2] ror 19 {xDxC} */; \
+	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+	and	y2, e		/* y2 = (f^g)&e */; \
+	ROR(	y1, (13-2))	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+		vpsrld	XTMP2, XTMP2,    10	/* X0 = W[-2] >> 10 {DDCC} */; \
+	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+	ROR(	y0, 6)		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */; \
+		vpxor	XTMP2, XTMP2, XTMP3; \
+	ROR(	y1, 2)		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+	add	y2, y0		/* y2 = S1 + CH */; \
+	add	y2, [rsp + _XFER + 3*4]	/* y2 = k + w + S1 + CH */; \
+		vpxor	X0, X0, XTMP2	/* X0 = s1 {xDxC} */; \
+	mov	y0, a		/* y0 = a */; \
+	add	h, y2		/* h = h + S1 + CH + k + w */; \
+	mov	y2, a		/* y2 = a */; \
+		vpshufb	X0, X0, SHUF_DC00	/* X0 = s1 {DC00} */; \
+	or	y0, c		/* y0 = a|c */; \
+	add	d, h		/* d = d + h + S1 + CH + k + w */; \
+	and	y2, c		/* y2 = a&c */; \
+		vpaddd	X0, X0, XTMP0	/* X0 = {W[3], W[2], W[1], W[0]} */; \
+	and	y0, b		/* y0 = (a|c)&b */; \
+	add	h, y1		/* h = h + S1 + CH + k + w + S0 */; \
+	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */; \
 	lea	h, [h + y0]	/* h = h + S1 + CH + k + w + S0 + MAJ */
 
-ROTATE_ARGS
-rotate_Xs
-.endm
+#define FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+	FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h); \
+	FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, h, a, b, c, d, e, f, g); \
+	FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, g, h, a, b, c, d, e, f); \
+	FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, f, g, h, a, b, c, d, e);
 
 /* input is [rsp + _XFER + %1 * 4] */
-.macro DO_ROUND i1
-	mov	y0, e		/* y0 = e */
-	ROR	y0, (25-11)	/* y0 = e >> (25-11) */
-	mov	y1, a		/* y1 = a */
-	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */
-	ROR	y1, (22-13)	/* y1 = a >> (22-13) */
-	mov	y2, f		/* y2 = f */
-	xor	y1, a		/* y1 = a ^ (a >> (22-13) */
-	ROR	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
-	xor	y2, g		/* y2 = f^g */
-	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
-	ROR	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
-	and	y2, e		/* y2 = (f^g)&e */
-	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
-	ROR	y0, 6		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
-	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */
-	add	y2, y0		/* y2 = S1 + CH */
-	ROR	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
-	add	y2, [rsp + _XFER + \i1 * 4]	/* y2 = k + w + S1 + CH */
-	mov	y0, a		/* y0 = a */
-	add	h, y2		/* h = h + S1 + CH + k + w */
-	mov	y2, a		/* y2 = a */
-	or	y0, c		/* y0 = a|c */
-	add	d, h		/* d = d + h + S1 + CH + k + w */
-	and	y2, c		/* y2 = a&c */
-	and	y0, b		/* y0 = (a|c)&b */
-	add	h, y1		/* h = h + S1 + CH + k + w + S0 */
-	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */
+#define DO_ROUND(i1, a, b, c, d, e, f, g, h) \
+	mov	y0, e		/* y0 = e */; \
+	ROR(	y0, (25-11))	/* y0 = e >> (25-11) */; \
+	mov	y1, a		/* y1 = a */; \
+	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */; \
+	ROR(	y1, (22-13))	/* y1 = a >> (22-13) */; \
+	mov	y2, f		/* y2 = f */; \
+	xor	y1, a		/* y1 = a ^ (a >> (22-13) */; \
+	ROR(	y0, (11-6))	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+	xor	y2, g		/* y2 = f^g */; \
+	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+	ROR(	y1, (13-2))	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+	and	y2, e		/* y2 = (f^g)&e */; \
+	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+	ROR(	y0, 6)		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */; \
+	add	y2, y0		/* y2 = S1 + CH */; \
+	ROR(	y1, 2)		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+	add	y2, [rsp + _XFER + i1 * 4]	/* y2 = k + w + S1 + CH */; \
+	mov	y0, a		/* y0 = a */; \
+	add	h, y2		/* h = h + S1 + CH + k + w */; \
+	mov	y2, a		/* y2 = a */; \
+	or	y0, c		/* y0 = a|c */; \
+	add	d, h		/* d = d + h + S1 + CH + k + w */; \
+	and	y2, c		/* y2 = a&c */; \
+	and	y0, b		/* y0 = (a|c)&b */; \
+	add	h, y1		/* h = h + S1 + CH + k + w + S0 */; \
+	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */; \
 	lea	h, [h + y0]	/* h = h + S1 + CH + k + w + S0 + MAJ */
-	ROTATE_ARGS
-.endm
 
 /*
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; void sha256_avx(void *input_data, UINT32 digest[8], UINT64 num_blks)
 ;; arg 1 : pointer to input data
 ;; arg 2 : pointer to digest
 ;; arg 3 : Num blocks
 */
 .text
 .globl _gcry_sha256_transform_amd64_avx
 ELF(.type  _gcry_sha256_transform_amd64_avx,@function;)
 .align 16
 _gcry_sha256_transform_amd64_avx:
 	CFI_STARTPROC()
 	vzeroupper
 
 	push	rbx
 	CFI_PUSH(rbx)
 	push	rbp
 	CFI_PUSH(rbp)
 	push	r13
 	CFI_PUSH(r13)
 	push	r14
 	CFI_PUSH(r14)
 	push	r15
 	CFI_PUSH(r15)
 
 	sub	rsp, STACK_SIZE
 	CFI_ADJUST_CFA_OFFSET(STACK_SIZE);
 
 	shl	NUM_BLKS, 6	/* convert to bytes */
 	jz	.Ldone_hash
 	add	NUM_BLKS, INP	/* pointer to end of data */
 	mov	[rsp + _INP_END], NUM_BLKS
 
 	/* load initial digest */
 	mov	a,[4*0 + CTX]
 	mov	b,[4*1 + CTX]
 	mov	c,[4*2 + CTX]
 	mov	d,[4*3 + CTX]
 	mov	e,[4*4 + CTX]
 	mov	f,[4*5 + CTX]
 	mov	g,[4*6 + CTX]
 	mov	h,[4*7 + CTX]
 
 	vmovdqa	BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP]
 	vmovdqa	SHUF_00BA, [.L_SHUF_00BA ADD_RIP]
 	vmovdqa	SHUF_DC00, [.L_SHUF_DC00 ADD_RIP]
 
 .Loop0:
 	lea	TBL, [.LK256 ADD_RIP]
 
 	/* byte swap first 16 dwords */
-	COPY_XMM_AND_BSWAP	X0, [INP + 0*16], BYTE_FLIP_MASK
-	COPY_XMM_AND_BSWAP	X1, [INP + 1*16], BYTE_FLIP_MASK
-	COPY_XMM_AND_BSWAP	X2, [INP + 2*16], BYTE_FLIP_MASK
-	COPY_XMM_AND_BSWAP	X3, [INP + 3*16], BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP(X0, [INP + 0*16], BYTE_FLIP_MASK)
+	COPY_XMM_AND_BSWAP(X1, [INP + 1*16], BYTE_FLIP_MASK)
+	COPY_XMM_AND_BSWAP(X2, [INP + 2*16], BYTE_FLIP_MASK)
+	COPY_XMM_AND_BSWAP(X3, [INP + 3*16], BYTE_FLIP_MASK)
 
 	mov	[rsp + _INP], INP
 
 	/* schedule 48 input dwords, by doing 3 rounds of 16 each */
 	mov	SRND, 3
 .align 16
 .Loop1:
 	vpaddd	XFER, X0, [TBL + 0*16]
 	vmovdqa	[rsp + _XFER], XFER
-	FOUR_ROUNDS_AND_SCHED
+	FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h)
 
-	vpaddd	XFER, X0, [TBL + 1*16]
+	vpaddd	XFER, X1, [TBL + 1*16]
 	vmovdqa	[rsp + _XFER], XFER
-	FOUR_ROUNDS_AND_SCHED
+	FOUR_ROUNDS_AND_SCHED(X1, X2, X3, X0, e, f, g, h, a, b, c, d)
 
-	vpaddd	XFER, X0, [TBL + 2*16]
+	vpaddd	XFER, X2, [TBL + 2*16]
 	vmovdqa	[rsp + _XFER], XFER
-	FOUR_ROUNDS_AND_SCHED
+	FOUR_ROUNDS_AND_SCHED(X2, X3, X0, X1, a, b, c, d, e, f, g, h)
 
-	vpaddd	XFER, X0, [TBL + 3*16]
+	vpaddd	XFER, X3, [TBL + 3*16]
 	vmovdqa	[rsp + _XFER], XFER
 	add	TBL, 4*16
-	FOUR_ROUNDS_AND_SCHED
+	FOUR_ROUNDS_AND_SCHED(X3, X0, X1, X2, e, f, g, h, a, b, c, d)
 
 	sub	SRND, 1
 	jne	.Loop1
 
 	mov	SRND, 2
 .Loop2:
 	vpaddd	X0, X0, [TBL + 0*16]
 	vmovdqa	[rsp + _XFER], X0
-	DO_ROUND	0
-	DO_ROUND	1
-	DO_ROUND	2
-	DO_ROUND	3
+	DO_ROUND(0, a, b, c, d, e, f, g, h)
+	DO_ROUND(1, h, a, b, c, d, e, f, g)
+	DO_ROUND(2, g, h, a, b, c, d, e, f)
+	DO_ROUND(3, f, g, h, a, b, c, d, e)
 	vpaddd	X1, X1, [TBL + 1*16]
 	vmovdqa	[rsp + _XFER], X1
 	add	TBL, 2*16
-	DO_ROUND	0
-	DO_ROUND	1
-	DO_ROUND	2
-	DO_ROUND	3
+	DO_ROUND(0, e, f, g, h, a, b, c, d)
+	DO_ROUND(1, d, e, f, g, h, a, b, c)
+	DO_ROUND(2, c, d, e, f, g, h, a, b)
+	DO_ROUND(3, b, c, d, e, f, g, h, a)
 
 	vmovdqa	X0, X2
 	vmovdqa	X1, X3
 
 	sub	SRND, 1
 	jne	.Loop2
 
-	addm	[4*0 + CTX],a
-	addm	[4*1 + CTX],b
-	addm	[4*2 + CTX],c
-	addm	[4*3 + CTX],d
-	addm	[4*4 + CTX],e
-	addm	[4*5 + CTX],f
-	addm	[4*6 + CTX],g
-	addm	[4*7 + CTX],h
+	addm([4*0 + CTX],a)
+	addm([4*1 + CTX],b)
+	addm([4*2 + CTX],c)
+	addm([4*3 + CTX],d)
+	addm([4*4 + CTX],e)
+	addm([4*5 + CTX],f)
+	addm([4*6 + CTX],g)
+	addm([4*7 + CTX],h)
 
 	mov	INP, [rsp + _INP]
 	add	INP, 64
 	cmp	INP, [rsp + _INP_END]
 	jne	.Loop0
 
 .Ldone_hash:
 	vzeroall
 
 	vmovdqa	[rsp + _XFER], XFER
 	xor     eax, eax
 
 	add	rsp, STACK_SIZE
 	CFI_ADJUST_CFA_OFFSET(-STACK_SIZE);
 
 	pop	r15
 	CFI_POP(r15)
 	pop	r14
 	CFI_POP(r14)
 	pop	r13
 	CFI_POP(r13)
 	pop	rbp
 	CFI_POP(rbp)
 	pop	rbx
 	CFI_POP(rbx)
 
 	ret
 	CFI_ENDPROC()
 
 
 .align 16
 .LK256:
 	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
 	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
 	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
 	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
 	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
 	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
 	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
 	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
 	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
 	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
 	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
 	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
 	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
 	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
 	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
 	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 
 .LPSHUFFLE_BYTE_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203
 
 /* shuffle xBxA -> 00BA */
 .L_SHUF_00BA:              .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
 
 /* shuffle xDxC -> DC00 */
 .L_SHUF_DC00:              .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
 
 #endif
 #endif
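The conversion pattern above repeats for every macro in this patch: a GAS .macro/.endm body that relied on reassigning the symbols a..h (ROTATE_ARGS) and X0..X3 (rotate_Xs) becomes a function-like preprocessor macro, and the rotation is made explicit by permuting the arguments at each call site. A minimal C sketch of that idea, with the real Sum1/Ch/Sum0/Maj arithmetic left out (ROUND and FOUR_ROUNDS here are illustrative names only, not part of the patch):

	/* One heavily simplified "round": only the h += k+w, d += h data flow
	 * is kept so that the argument rotation stays visible. */
	#include <stdint.h>

	#define ROUND(kw, a, b, c, d, e, f, g, h) do { \
		uint32_t t = (h) + (kw);  /* h += k[t] + w[t] (Sum1/Ch omitted) */ \
		(d) += t;                 /* d += h */ \
		(h) = t;                  /* (Sum0/Maj omitted) */ \
	    } while (0)

	/* Four consecutive rounds: instead of rotating the symbols a..h after
	 * each round (old ROTATE_ARGS), the next invocation receives the
	 * variables rotated by one position. */
	#define FOUR_ROUNDS(k0, k1, k2, k3, a, b, c, d, e, f, g, h) do { \
		ROUND(k0, a, b, c, d, e, f, g, h); \
		ROUND(k1, h, a, b, c, d, e, f, g); \
		ROUND(k2, g, h, a, b, c, d, e, f); \
		ROUND(k3, f, g, h, a, b, c, d, e); \
	    } while (0)
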
diff --git a/cipher/sha256-avx2-bmi2-amd64.S b/cipher/sha256-avx2-bmi2-amd64.S
index 52be1a07..faefba17 100644
--- a/cipher/sha256-avx2-bmi2-amd64.S
+++ b/cipher/sha256-avx2-bmi2-amd64.S
@@ -1,575 +1,520 @@
 /*
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; Copyright (c) 2012, Intel Corporation
 ;
 ; All rights reserved.
 ;
 ; Redistribution and use in source and binary forms, with or without
 ; modification, are permitted provided that the following conditions are
 ; met:
 ;
 ; * Redistributions of source code must retain the above copyright
 ;   notice, this list of conditions and the following disclaimer.
 ;
 ; * Redistributions in binary form must reproduce the above copyright
 ;   notice, this list of conditions and the following disclaimer in the
 ;   documentation and/or other materials provided with the
 ;   distribution.
 ;
 ; * Neither the name of the Intel Corporation nor the names of its
 ;   contributors may be used to endorse or promote products derived from
 ;   this software without specific prior written permission.
 ;
 ;
 ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
 ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 ; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
 ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;
 ; This code is described in an Intel White-Paper:
 ; "Fast SHA-256 Implementations on Intel Architecture Processors"
 ;
 ; To find it, surf to http://www.intel.com/p/en_US/embedded
 ; and search for that title.
 ; The paper is expected to be released roughly at the end of April, 2012
 ;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; This code schedules 2 blocks at a time, with 4 lanes per block
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 */
 /*
  * Conversion to GAS assembly and integration to libgcrypt
  *  by Jussi Kivilinna <jussi.kivilinna@iki.fi>
  */
 
 #ifdef __x86_64
 #include <config.h>
 #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
      defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
     defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
     defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \
     defined(USE_SHA256)
 
 #include "asm-common-amd64.h"
 
 .intel_syntax noprefix
 
 #define	VMOVDQ vmovdqu /* ; assume buffers not aligned  */
 
 /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros */
 
 /*  addm [mem], reg */
 /*  Add reg to mem using reg-mem add and store */
-.macro addm p1 p2
-	add	\p2, \p1
-	mov	\p1, \p2
-.endm
+#define addm(p1, p2) \
+	add	p2, p1; \
+	mov	p1, p2;
 
 /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
 
-X0 = ymm4
-X1 = ymm5
-X2 = ymm6
-X3 = ymm7
+#define X0 ymm4
+#define X1 ymm5
+#define X2 ymm6
+#define X3 ymm7
 
 /*  XMM versions of above */
-XWORD0 = xmm4
-XWORD1 = xmm5
-XWORD2 = xmm6
-XWORD3 = xmm7
-
-XTMP0 = ymm0
-XTMP1 = ymm1
-XTMP2 = ymm2
-XTMP3 = ymm3
-XTMP4 = ymm8
-XFER =  ymm9
-XTMP5 = ymm11
-
-SHUF_00BA = ymm10 /*  shuffle xBxA -> 00BA */
-SHUF_DC00 = ymm12 /*  shuffle xDxC -> DC00 */
-BYTE_FLIP_MASK = ymm13
-
-X_BYTE_FLIP_MASK = xmm13 /*  XMM version of BYTE_FLIP_MASK */
-
-NUM_BLKS = rdx	/*  3rd arg */
-CTX =	rsi   	/*  2nd arg */
-INP =	rdi	/*  1st arg */
-c =	ecx
-d =	r8d
-e =	edx	/*  clobbers NUM_BLKS */
-y3 =	edi	/*  clobbers INP */
-
-TBL =	rbp
-SRND =	CTX	/*  SRND is same register as CTX */
-
-a =	eax
-b =	ebx
-f =	r9d
-g =	r10d
-h =	r11d
-old_h =	r11d
-
-T1 = r12d
-y0 = r13d
-y1 = r14d
-y2 = r15d
-
-
-_XFER_SIZE	= 2*64*4	/*  2 blocks, 64 rounds, 4 bytes/round */
-_XMM_SAVE_SIZE  = 0
-_INP_END_SIZE	= 8
-_INP_SIZE	= 8
-_CTX_SIZE	= 8
-_RSP_SIZE	= 8
-
-_XFER		= 0
-_XMM_SAVE	= _XFER     + _XFER_SIZE
-_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
-_INP 		= _INP_END  + _INP_END_SIZE
-_CTX		= _INP      + _INP_SIZE
-_RSP		= _CTX      + _CTX_SIZE
-STACK_SIZE	= _RSP      + _RSP_SIZE
-
-/*  rotate_Xs */
-/*  Rotate values of symbols X0...X3 */
-.macro rotate_Xs
-X_ = X0
-X0 = X1
-X1 = X2
-X2 = X3
-X3 = X_
-.endm
-
-/*  ROTATE_ARGS */
-/*  Rotate values of symbols a...h */
-.macro ROTATE_ARGS
-old_h = h
-TMP_ = h
-h = g
-g = f
-f = e
-e = d
-d = c
-c = b
-b = a
-a = TMP_
-.endm
-
-.macro ONE_ROUND_PART1 XFER
-	/* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]);
-	 * d += h;
-	 * h += Sum0 (a) + Maj (a, b, c);
-	 *
-	 * Ch(x, y, z) => ((x & y) + (~x & z))
-	 * Maj(x, y, z) => ((x & y) + (z & (x ^ y)))
-	 */
-
-	mov y3, e
-	add h, [\XFER]
-	and y3, f
-	rorx y0, e, 25
-	rorx y1, e, 11
+#define XWORD0 xmm4
+#define XWORD1 xmm5
+#define XWORD2 xmm6
+#define XWORD3 xmm7
+
+#define XTMP0 ymm0
+#define XTMP1 ymm1
+#define XTMP2 ymm2
+#define XTMP3 ymm3
+#define XTMP4 ymm8
+#define XFER ymm9
+#define XTMP5 ymm11
+
+#define SHUF_00BA ymm10 /*  shuffle xBxA -> 00BA */
+#define SHUF_DC00 ymm12 /*  shuffle xDxC -> DC00 */
+#define BYTE_FLIP_MASK ymm13
+
+#define X_BYTE_FLIP_MASK xmm13 /*  XMM version of BYTE_FLIP_MASK */
+
+#define NUM_BLKS rdx /*  3rd arg */
+#define CTX rsi      /*  2nd arg */
+#define INP rdi      /*  1st arg */
+#define c ecx
+#define d r8d
+#define e edx        /*  clobbers NUM_BLKS */
+#define y3 edi       /*  clobbers INP */
+
+#define TBL rbp
+#define SRND CTX     /*  SRND is same register as CTX */
+
+#define a eax
+#define b ebx
+#define f r9d
+#define g r10d
+#define h r11d
+#define old_h r11d
+
+#define T1 r12d
+#define y0 r13d
+#define y1 r14d
+#define y2 r15d
+
+
+#define _XFER_SIZE 2*64*4	/*  2 blocks, 64 rounds, 4 bytes/round */
+#define _XMM_SAVE_SIZE 0
+#define _INP_END_SIZE 8
+#define _INP_SIZE 8
+#define _CTX_SIZE 8
+#define _RSP_SIZE 8
+
+#define _XFER 0
+#define _XMM_SAVE  _XFER     + _XFER_SIZE
+#define _INP_END   _XMM_SAVE + _XMM_SAVE_SIZE
+#define _INP       _INP_END  + _INP_END_SIZE
+#define _CTX       _INP      + _INP_SIZE
+#define _RSP       _CTX      + _CTX_SIZE
+#define STACK_SIZE _RSP      + _RSP_SIZE
+
+#define ONE_ROUND_PART1(XFERIN, a, b, c, d, e, f, g, h) \
+	/* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]); */ \
+	/* d += h; */ \
+	/* h += Sum0 (a) + Maj (a, b, c); */ \
+	\
+	/* Ch(x, y, z) => ((x & y) + (~x & z)) */ \
+	/* Maj(x, y, z) => ((x & y) + (z & (x ^ y))) */ \
+	\
+	mov y3, e; \
+	add h, [XFERIN]; \
+	and y3, f; \
+	rorx y0, e, 25; \
+	rorx y1, e, 11; \
+	lea h, [h + y3]; \
+	andn y3, e, g; \
+	rorx T1, a, 13; \
+	xor y0, y1; \
 	lea h, [h + y3]
-	andn y3, e, g
-	rorx T1, a, 13
-	xor y0, y1
-	lea h, [h + y3]
-.endm
-.macro ONE_ROUND_PART2
-	rorx y2, a, 22
-	rorx y1, e, 6
-	mov y3, a
-	xor T1, y2
-	xor y0, y1
-	xor y3, b
-	lea h, [h + y0]
-	mov y0, a
-	rorx y2, a, 2
-	add d, h
-	and y3, c
-	xor T1, y2
-	lea h, [h + y3]
-	lea h, [h + T1]
-	and y0, b
-	lea h, [h + y0]
-.endm
-
-.macro ONE_ROUND XFER
-	ONE_ROUND_PART1 \XFER
-	ONE_ROUND_PART2
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED XFER, XFEROUT
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-		vpalignr	XTMP0, X3, X2, 4	/*  XTMP0 = W[-7] */
-		vpaddd	XTMP0, XTMP0, X0	/*  XTMP0 = W[-7] + W[-16]; y1 = (e >> 6); S1 */
-		vpalignr	XTMP1, X1, X0, 4	/*  XTMP1 = W[-15] */
-		vpsrld	XTMP2, XTMP1, 7
-		vpslld	XTMP3, XTMP1, (32-7)
-		vpor	XTMP3, XTMP3, XTMP2	/*  XTMP3 = W[-15] ror 7 */
-		vpsrld	XTMP2, XTMP1,18
-
-	ONE_ROUND 0*4+\XFER
-	ROTATE_ARGS
-
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-		vpsrld	XTMP4, XTMP1, 3	/*  XTMP4 = W[-15] >> 3 */
-		vpslld	XTMP1, XTMP1, (32-18)
-		vpxor	XTMP3, XTMP3, XTMP1
-		vpxor	XTMP3, XTMP3, XTMP2	/*  XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 */
-		vpxor	XTMP1, XTMP3, XTMP4	/*  XTMP1 = s0 */
-		vpshufd	XTMP2, X3, 0b11111010	/*  XTMP2 = W[-2] {BBAA} */
-		vpaddd	XTMP0, XTMP0, XTMP1	/*  XTMP0 = W[-16] + W[-7] + s0 */
-		vpsrld	XTMP4, XTMP2, 10	/*  XTMP4 = W[-2] >> 10 {BBAA} */
-
-	ONE_ROUND 1*4+\XFER
-	ROTATE_ARGS
-
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
 
-		vpsrlq	XTMP3, XTMP2, 19	/*  XTMP3 = W[-2] ror 19 {xBxA} */
-		vpsrlq	XTMP2, XTMP2, 17	/*  XTMP2 = W[-2] ror 17 {xBxA} */
-		vpxor	XTMP2, XTMP2, XTMP3
-		vpxor	XTMP4, XTMP4, XTMP2	/*  XTMP4 = s1 {xBxA} */
-		vpshufb	XTMP4, XTMP4, SHUF_00BA	/*  XTMP4 = s1 {00BA} */
-		vpaddd	XTMP0, XTMP0, XTMP4	/*  XTMP0 = {..., ..., W[1], W[0]} */
-		vpshufd	XTMP2, XTMP0, 0b1010000	/*  XTMP2 = W[-2] {DDCC} */
-
-	ONE_ROUND 2*4+\XFER
-	ROTATE_ARGS
-
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-		vpsrld	XTMP5, XTMP2,   10	/*  XTMP5 = W[-2] >> 10 {DDCC} */
-		vpsrlq	XTMP3, XTMP2, 19	/*  XTMP3 = W[-2] ror 19 {xDxC} */
-		vpsrlq	XTMP2, XTMP2, 17	/*  XTMP2 = W[-2] ror 17 {xDxC} */
-		vpxor	XTMP2, XTMP2, XTMP3
-		vpxor	XTMP5, XTMP5, XTMP2	/*  XTMP5 = s1 {xDxC} */
-		vpshufb	XTMP5, XTMP5, SHUF_DC00	/*  XTMP5 = s1 {DC00} */
-		vpaddd	X0, XTMP5, XTMP0	/*  X0 = {W[3], W[2], W[1], W[0]} */
-		vpaddd	XFER, X0, [TBL + \XFEROUT]
-
-	ONE_ROUND_PART1 3*4+\XFER
-		vmovdqa [rsp + _XFER + \XFEROUT], XFER
-	ONE_ROUND_PART2
-	ROTATE_ARGS
-	rotate_Xs
-.endm
-
-.macro DO_4ROUNDS XFER
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-	ONE_ROUND 0*4+\XFER
-	ROTATE_ARGS
-
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-	ONE_ROUND 1*4+\XFER
-	ROTATE_ARGS
-
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-	ONE_ROUND 2*4+\XFER
-	ROTATE_ARGS
-
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;; */
+#define ONE_ROUND_PART2(a, b, c, d, e, f, g, h) \
+	rorx y2, a, 22; \
+	rorx y1, e, 6; \
+	mov y3, a; \
+	xor T1, y2; \
+	xor y0, y1; \
+	xor y3, b; \
+	lea h, [h + y0]; \
+	mov y0, a; \
+	rorx y2, a, 2; \
+	add d, h; \
+	and y3, c; \
+	xor T1, y2; \
+	lea h, [h + y3]; \
+	lea h, [h + T1]; \
+	and y0, b; \
+	lea h, [h + y0]
 
-	ONE_ROUND 3*4+\XFER
-	ROTATE_ARGS
-.endm
+#define ONE_ROUND(XFER, a, b, c, d, e, f, g, h) \
+	ONE_ROUND_PART1(XFER, a, b, c, d, e, f, g, h); \
+	ONE_ROUND_PART2(a, b, c, d, e, f, g, h)
+
+#define FOUR_ROUNDS_AND_SCHED(XFERIN, XFEROUT, X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+	/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+		vpalignr	XTMP0, X3, X2, 4	/*  XTMP0 = W[-7] */; \
+		vpaddd	XTMP0, XTMP0, X0	/*  XTMP0 = W[-7] + W[-16]; y1 = (e >> 6); S1 */; \
+		vpalignr	XTMP1, X1, X0, 4	/*  XTMP1 = W[-15] */; \
+		vpsrld	XTMP2, XTMP1, 7; \
+		vpslld	XTMP3, XTMP1, (32-7); \
+		vpor	XTMP3, XTMP3, XTMP2	/*  XTMP3 = W[-15] ror 7 */; \
+		vpsrld	XTMP2, XTMP1,18; \
+	\
+	ONE_ROUND(0*4+XFERIN, a, b, c, d, e, f, g, h); \
+	\
+	/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+		vpsrld	XTMP4, XTMP1, 3	/*  XTMP4 = W[-15] >> 3 */; \
+		vpslld	XTMP1, XTMP1, (32-18); \
+		vpxor	XTMP3, XTMP3, XTMP1; \
+		vpxor	XTMP3, XTMP3, XTMP2	/*  XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 */; \
+		vpxor	XTMP1, XTMP3, XTMP4	/*  XTMP1 = s0 */; \
+		vpshufd	XTMP2, X3, 0b11111010	/*  XTMP2 = W[-2] {BBAA} */; \
+		vpaddd	XTMP0, XTMP0, XTMP1	/*  XTMP0 = W[-16] + W[-7] + s0 */; \
+		vpsrld	XTMP4, XTMP2, 10	/*  XTMP4 = W[-2] >> 10 {BBAA} */; \
+	\
+	ONE_ROUND(1*4+XFERIN, h, a, b, c, d, e, f, g); \
+	\
+	/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+		vpsrlq	XTMP3, XTMP2, 19	/*  XTMP3 = W[-2] ror 19 {xBxA} */; \
+		vpsrlq	XTMP2, XTMP2, 17	/*  XTMP2 = W[-2] ror 17 {xBxA} */; \
+		vpxor	XTMP2, XTMP2, XTMP3; \
+		vpxor	XTMP4, XTMP4, XTMP2	/*  XTMP4 = s1 {xBxA} */; \
+		vpshufb	XTMP4, XTMP4, SHUF_00BA	/*  XTMP4 = s1 {00BA} */; \
+		vpaddd	XTMP0, XTMP0, XTMP4	/*  XTMP0 = {..., ..., W[1], W[0]} */; \
+		vpshufd	XTMP2, XTMP0, 0b1010000	/*  XTMP2 = W[-2] {DDCC} */; \
+	\
+	ONE_ROUND(2*4+XFERIN, g, h, a, b, c, d, e, f); \
+	\
+	/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+		vpsrld	XTMP5, XTMP2,   10	/*  XTMP5 = W[-2] >> 10 {DDCC} */; \
+		vpsrlq	XTMP3, XTMP2, 19	/*  XTMP3 = W[-2] ror 19 {xDxC} */; \
+		vpsrlq	XTMP2, XTMP2, 17	/*  XTMP2 = W[-2] ror 17 {xDxC} */; \
+		vpxor	XTMP2, XTMP2, XTMP3; \
+		vpxor	XTMP5, XTMP5, XTMP2	/*  XTMP5 = s1 {xDxC} */; \
+		vpshufb	XTMP5, XTMP5, SHUF_DC00	/*  XTMP5 = s1 {DC00} */; \
+		vpaddd	X0, XTMP5, XTMP0	/*  X0 = {W[3], W[2], W[1], W[0]} */; \
+		vpaddd	XFER, X0, [TBL + XFEROUT]; \
+	\
+	ONE_ROUND_PART1(3*4+XFERIN, f, g, h, a, b, c, d, e); \
+		vmovdqa [rsp + _XFER + XFEROUT], XFER; \
+	ONE_ROUND_PART2(f, g, h, a, b, c, d, e);
+
+#define DO_4ROUNDS(XFERIN, a, b, c, d, e, f, g, h) \
+	ONE_ROUND(0*4+XFERIN, a, b, c, d, e, f, g, h); \
+	ONE_ROUND(1*4+XFERIN, h, a, b, c, d, e, f, g); \
+	ONE_ROUND(2*4+XFERIN, g, h, a, b, c, d, e, f); \
+	ONE_ROUND(3*4+XFERIN, f, g, h, a, b, c, d, e)
 
 /*
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; void sha256_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks)
 ;; arg 1 : pointer to input data
 ;; arg 2 : pointer to digest
 ;; arg 3 : Num blocks
 */
 .text
 .globl _gcry_sha256_transform_amd64_avx2
 ELF(.type _gcry_sha256_transform_amd64_avx2,@function)
 .align 32
 _gcry_sha256_transform_amd64_avx2:
 	CFI_STARTPROC()
 	xor eax, eax
 
 	cmp rdx, 0
 	je .Lnowork
 
 	push	rbx
 	CFI_PUSH(rbx)
 	push	rbp
 	CFI_PUSH(rbp)
 	push	r12
 	CFI_PUSH(r12)
 	push	r13
 	CFI_PUSH(r13)
 	push	r14
 	CFI_PUSH(r14)
 	push	r15
 	CFI_PUSH(r15)
 
 	vzeroupper
 
 	vmovdqa	BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP]
 	vmovdqa	SHUF_00BA, [.L_SHUF_00BA ADD_RIP]
 	vmovdqa	SHUF_DC00, [.L_SHUF_DC00 ADD_RIP]
 
 	mov	rax, rsp
 	CFI_DEF_CFA_REGISTER(rax);
 	sub	rsp, STACK_SIZE
 	and	rsp, ~63
 	mov	[rsp + _RSP], rax
 	CFI_CFA_ON_STACK(_RSP, 6 * 8)
 
 	shl	NUM_BLKS, 6	/*  convert to bytes */
 	lea	NUM_BLKS, [NUM_BLKS + INP - 64] /*  pointer to last block */
 	mov	[rsp + _INP_END], NUM_BLKS
 
 	/* ; load initial digest */
 	mov	a,[4*0 + CTX]
 	mov	b,[4*1 + CTX]
 	mov	c,[4*2 + CTX]
 	mov	d,[4*3 + CTX]
 	mov	e,[4*4 + CTX]
 	mov	f,[4*5 + CTX]
 	mov	g,[4*6 + CTX]
 	mov	h,[4*7 + CTX]
 
 	mov	[rsp + _CTX], CTX
 
 .Loop0:
 	lea	TBL, [.LK256 ADD_RIP]
 
 	/* ; Load first 16 dwords from two blocks */
 	VMOVDQ	XTMP0, [INP + 0*32]
 	VMOVDQ	XTMP1, [INP + 1*32]
 	VMOVDQ	XTMP2, [INP + 2*32]
 	VMOVDQ	XTMP3, [INP + 3*32]
 
 	/* ; byte swap data */
 	vpshufb	XTMP0, XTMP0, BYTE_FLIP_MASK
 	vpshufb	XTMP1, XTMP1, BYTE_FLIP_MASK
 	vpshufb	XTMP2, XTMP2, BYTE_FLIP_MASK
 	vpshufb	XTMP3, XTMP3, BYTE_FLIP_MASK
 
 	/* ; transpose data into high/low halves */
 	vperm2i128	X0, XTMP0, XTMP2, 0x20
 	vperm2i128	X1, XTMP0, XTMP2, 0x31
 	vperm2i128	X2, XTMP1, XTMP3, 0x20
 	vperm2i128	X3, XTMP1, XTMP3, 0x31
 
 .Last_block_enter:
 	add	INP, 64
 	mov	[rsp + _INP], INP
 
 	/* ; schedule 48 input dwords, by doing 3 rounds of 12 each */
 	xor	SRND, SRND
 
 	vpaddd	XFER, X0, [TBL + 0*32]
 	vmovdqa [rsp + _XFER + 0*32], XFER
 	vpaddd	XFER, X1, [TBL + 1*32]
 	vmovdqa [rsp + _XFER + 1*32], XFER
 	vpaddd	XFER, X2, [TBL + 2*32]
 	vmovdqa [rsp + _XFER + 2*32], XFER
 	vpaddd	XFER, X3, [TBL + 3*32]
 	vmovdqa [rsp + _XFER + 3*32], XFER
 
 .align 16
 .Loop1:
-	FOUR_ROUNDS_AND_SCHED	rsp + _XFER + SRND + 0*32, SRND + 4*32
-	FOUR_ROUNDS_AND_SCHED	rsp + _XFER + SRND + 1*32, SRND + 5*32
-	FOUR_ROUNDS_AND_SCHED	rsp + _XFER + SRND + 2*32, SRND + 6*32
-	FOUR_ROUNDS_AND_SCHED	rsp + _XFER + SRND + 3*32, SRND + 7*32
+	FOUR_ROUNDS_AND_SCHED(rsp + _XFER + SRND + 0*32, SRND + 4*32, X0, X1, X2, X3, a, b, c, d, e, f, g, h)
+	FOUR_ROUNDS_AND_SCHED(rsp + _XFER + SRND + 1*32, SRND + 5*32, X1, X2, X3, X0, e, f, g, h, a, b, c, d)
+	FOUR_ROUNDS_AND_SCHED(rsp + _XFER + SRND + 2*32, SRND + 6*32, X2, X3, X0, X1, a, b, c, d, e, f, g, h)
+	FOUR_ROUNDS_AND_SCHED(rsp + _XFER + SRND + 3*32, SRND + 7*32, X3, X0, X1, X2, e, f, g, h, a, b, c, d)
 
 	add	SRND, 4*32
 	cmp	SRND, 3 * 4*32
 	jb	.Loop1
 
 	/* ; Do last 16 rounds with no scheduling */
-	DO_4ROUNDS	rsp + _XFER + (3*4*32 + 0*32)
-	DO_4ROUNDS	rsp + _XFER + (3*4*32 + 1*32)
-	DO_4ROUNDS	rsp + _XFER + (3*4*32 + 2*32)
-	DO_4ROUNDS	rsp + _XFER + (3*4*32 + 3*32)
+	DO_4ROUNDS(rsp + _XFER + (3*4*32 + 0*32), a, b, c, d, e, f, g, h)
+	DO_4ROUNDS(rsp + _XFER + (3*4*32 + 1*32), e, f, g, h, a, b, c, d)
+	DO_4ROUNDS(rsp + _XFER + (3*4*32 + 2*32), a, b, c, d, e, f, g, h)
+	DO_4ROUNDS(rsp + _XFER + (3*4*32 + 3*32), e, f, g, h, a, b, c, d)
 
 	mov	CTX, [rsp + _CTX]
 	mov	INP, [rsp + _INP]
 
-	addm	[4*0 + CTX],a
-	addm	[4*1 + CTX],b
-	addm	[4*2 + CTX],c
-	addm	[4*3 + CTX],d
-	addm	[4*4 + CTX],e
-	addm	[4*5 + CTX],f
-	addm	[4*6 + CTX],g
-	addm	[4*7 + CTX],h
+	addm([4*0 + CTX],a)
+	addm([4*1 + CTX],b)
+	addm([4*2 + CTX],c)
+	addm([4*3 + CTX],d)
+	addm([4*4 + CTX],e)
+	addm([4*5 + CTX],f)
+	addm([4*6 + CTX],g)
+	addm([4*7 + CTX],h)
 
 	cmp	INP, [rsp + _INP_END]
 	ja	.Ldone_hash
 
 	/* ;;; Do second block using previously scheduled results */
 	xor	SRND, SRND
 .align 16
 .Loop3:
-	DO_4ROUNDS	rsp + _XFER + SRND + 0*32 + 16
-	DO_4ROUNDS	rsp + _XFER + SRND + 1*32 + 16
+	DO_4ROUNDS(rsp + _XFER + SRND + 0*32 + 16, a, b, c, d, e, f, g, h)
+	DO_4ROUNDS(rsp + _XFER + SRND + 1*32 + 16, e, f, g, h, a, b, c, d)
 	add	SRND, 2*32
 	cmp	SRND, 4 * 4*32
 	jb .Loop3
 
 	mov	CTX, [rsp + _CTX]
 	mov	INP, [rsp + _INP]
 	add	INP, 64
 
-	addm	[4*0 + CTX],a
-	addm	[4*1 + CTX],b
-	addm	[4*2 + CTX],c
-	addm	[4*3 + CTX],d
-	addm	[4*4 + CTX],e
-	addm	[4*5 + CTX],f
-	addm	[4*6 + CTX],g
-	addm	[4*7 + CTX],h
+	addm([4*0 + CTX],a)
+	addm([4*1 + CTX],b)
+	addm([4*2 + CTX],c)
+	addm([4*3 + CTX],d)
+	addm([4*4 + CTX],e)
+	addm([4*5 + CTX],f)
+	addm([4*6 + CTX],g)
+	addm([4*7 + CTX],h)
 
 	cmp	INP, [rsp + _INP_END]
 	jb	.Loop0
 	ja	.Ldone_hash
 
 .Ldo_last_block:
 	/* ;;; do last block */
 	lea	TBL, [.LK256 ADD_RIP]
 
 	VMOVDQ	XWORD0, [INP + 0*16]
 	VMOVDQ	XWORD1, [INP + 1*16]
 	VMOVDQ	XWORD2, [INP + 2*16]
 	VMOVDQ	XWORD3, [INP + 3*16]
 
 	vpshufb	XWORD0, XWORD0, X_BYTE_FLIP_MASK
 	vpshufb	XWORD1, XWORD1, X_BYTE_FLIP_MASK
 	vpshufb	XWORD2, XWORD2, X_BYTE_FLIP_MASK
 	vpshufb	XWORD3, XWORD3, X_BYTE_FLIP_MASK
 
 	jmp	.Last_block_enter
 
 .Lonly_one_block:
 
 	/* ; load initial digest */
 	mov	a,[4*0 + CTX]
 	mov	b,[4*1 + CTX]
 	mov	c,[4*2 + CTX]
 	mov	d,[4*3 + CTX]
 	mov	e,[4*4 + CTX]
 	mov	f,[4*5 + CTX]
 	mov	g,[4*6 + CTX]
 	mov	h,[4*7 + CTX]
 
 	vmovdqa	BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP]
 	vmovdqa	SHUF_00BA, [.L_SHUF_00BA ADD_RIP]
 	vmovdqa	SHUF_DC00, [.L_SHUF_DC00 ADD_RIP]
 
 	mov	[rsp + _CTX], CTX
 	jmp	.Ldo_last_block
 
 .Ldone_hash:
 	vzeroall
 
 	/* burn stack */
 	vmovdqa [rsp + _XFER + 0 * 32], ymm0
 	vmovdqa [rsp + _XFER + 1 * 32], ymm0
 	vmovdqa [rsp + _XFER + 2 * 32], ymm0
 	vmovdqa [rsp + _XFER + 3 * 32], ymm0
 	vmovdqa [rsp + _XFER + 4 * 32], ymm0
 	vmovdqa [rsp + _XFER + 5 * 32], ymm0
 	vmovdqa [rsp + _XFER + 6 * 32], ymm0
 	vmovdqa [rsp + _XFER + 7 * 32], ymm0
 	vmovdqa [rsp + _XFER + 8 * 32], ymm0
 	vmovdqa [rsp + _XFER + 9 * 32], ymm0
 	vmovdqa [rsp + _XFER + 10 * 32], ymm0
 	vmovdqa [rsp + _XFER + 11 * 32], ymm0
 	vmovdqa [rsp + _XFER + 12 * 32], ymm0
 	vmovdqa [rsp + _XFER + 13 * 32], ymm0
 	vmovdqa [rsp + _XFER + 14 * 32], ymm0
 	vmovdqa [rsp + _XFER + 15 * 32], ymm0
 	xor     eax, eax
 
 	mov	rsp, [rsp + _RSP]
 	CFI_DEF_CFA_REGISTER(rsp)
 
 	pop	r15
 	CFI_POP(r15)
 	pop	r14
 	CFI_POP(r14)
 	pop	r13
 	CFI_POP(r13)
 	pop	r12
 	CFI_POP(r12)
 	pop	rbp
 	CFI_POP(rbp)
 	pop	rbx
 	CFI_POP(rbx)
 
 .Lnowork:
 	ret
 	CFI_ENDPROC()
 
 .align 64
 .LK256:
 	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
 	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
 	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
 	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
 	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
 	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
 	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
 	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
 	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
 	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
 	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
 	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
 	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
 	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
 	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
 	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
 	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
 	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
 	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
 	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
 	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
 	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
 	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
 	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
 	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
 	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
 	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
 	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
 	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
 	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
 	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 
 .LPSHUFFLE_BYTE_FLIP_MASK:
 	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
 
 /*  shuffle xBxA -> 00BA */
 .L_SHUF_00BA:
 	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
 
 /*  shuffle xDxC -> DC00 */
 .L_SHUF_DC00:
 	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
 
 #endif
 #endif
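The call sites in both files alternate between the (a, b, c, d, e, f, g, h) and (e, f, g, h, a, b, c, d) argument orders because each FOUR_ROUNDS_AND_SCHED / DO_4ROUNDS block advances the rotate-by-one-per-round assignment by four positions, and after eight rounds the working variables are back in their original registers. A small stand-alone C check of that bookkeeping (a hypothetical helper, not taken from the patch):

	/* Prints which physical register plays the role of a..h after 0, 4 and
	 * 8 rounds of rotate-by-one.  After 8 rounds the mapping is the
	 * identity again, which is why the unrolled 8-round groups in
	 * .Loop1/.Loop2/.Loop3 only ever need these two argument orders. */
	#include <stdio.h>

	int main(void)
	{
		const char *regs[8] = { "a", "b", "c", "d", "e", "f", "g", "h" };

		for (int round = 0; round <= 8; round += 4) {
			printf("after %d rounds, roles a..h are held by:", round);
			for (int i = 0; i < 8; i++)
				printf(" %s", regs[(i - round % 8 + 8) % 8]);
			printf("\n");
		}
		return 0;
	}
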
diff --git a/cipher/sha256-ssse3-amd64.S b/cipher/sha256-ssse3-amd64.S
index 0fb94c1b..098b0eb6 100644
--- a/cipher/sha256-ssse3-amd64.S
+++ b/cipher/sha256-ssse3-amd64.S
@@ -1,553 +1,528 @@
 /*
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; Copyright (c) 2012, Intel Corporation
 ;
 ; All rights reserved.
 ;
 ; Redistribution and use in source and binary forms, with or without
 ; modification, are permitted provided that the following conditions are
 ; met:
 ;
 ; * Redistributions of source code must retain the above copyright
 ;   notice, this list of conditions and the following disclaimer.
 ;
 ; * Redistributions in binary form must reproduce the above copyright
 ;   notice, this list of conditions and the following disclaimer in the
 ;   documentation and/or other materials provided with the
 ;   distribution.
 ;
 ; * Neither the name of the Intel Corporation nor the names of its
 ;   contributors may be used to endorse or promote products derived from
 ;   this software without specific prior written permission.
 ;
 ;
 ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
 ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 ; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
 ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;
 ; This code is described in an Intel White-Paper:
 ; "Fast SHA-256 Implementations on Intel Architecture Processors"
 ;
 ; To find it, surf to http://www.intel.com/p/en_US/embedded
 ; and search for that title.
 ; The paper is expected to be released roughly at the end of April, 2012
 ;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; This code schedules 1 blocks at a time, with 4 lanes per block
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 */
 /*
  * Conversion to GAS assembly and integration to libgcrypt
  *  by Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * Note: original implementation was named as SHA256-SSE4. However, only SSSE3
  *       is required.
  */
 
 #ifdef __x86_64
 #include <config.h>
 #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
      defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
     defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
     defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA256)
 
 #include "asm-common-amd64.h"
 
 .intel_syntax noprefix
 
 #define	MOVDQ movdqu /* assume buffers not aligned */
 
 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros*/
 
 /* addm [mem], reg
  * Add reg to mem using reg-mem add and store */
-.macro addm p1 p2
-	add	\p2, \p1
-	mov	\p1, \p2
-.endm
+#define addm(p1, p2) \
+	add	p2, p1; \
+	mov	p1, p2;
 
 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
 
 /* COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
  * Load xmm with mem and byte swap each dword */
-.macro COPY_XMM_AND_BSWAP p1 p2 p3
-	MOVDQ \p1, \p2
-	pshufb \p1, \p3
-.endm
+#define COPY_XMM_AND_BSWAP(p1, p2, p3) \
+	MOVDQ p1, p2; \
+	pshufb p1, p3;
 
 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
 
-X0 = xmm4
-X1 = xmm5
-X2 = xmm6
-X3 = xmm7
+#define X0 xmm4
+#define X1 xmm5
+#define X2 xmm6
+#define X3 xmm7
 
-XTMP0 = xmm0
-XTMP1 = xmm1
-XTMP2 = xmm2
-XTMP3 = xmm3
-XTMP4 = xmm8
-XFER  = xmm9
+#define XTMP0 xmm0
+#define XTMP1 xmm1
+#define XTMP2 xmm2
+#define XTMP3 xmm3
+#define XTMP4 xmm8
+#define XFER xmm9
 
-SHUF_00BA = xmm10 /* shuffle xBxA -> 00BA */
-SHUF_DC00 = xmm11 /* shuffle xDxC -> DC00 */
-BYTE_FLIP_MASK = xmm12
+#define SHUF_00BA xmm10 /* shuffle xBxA -> 00BA */
+#define SHUF_DC00 xmm11 /* shuffle xDxC -> DC00 */
+#define BYTE_FLIP_MASK xmm12
 
-NUM_BLKS = rdx	/* 3rd arg */
-CTX = rsi	/* 2nd arg */
-INP = rdi	/* 1st arg */
+#define NUM_BLKS rdx	/* 3rd arg */
+#define CTX rsi	/* 2nd arg */
+#define INP rdi	/* 1st arg */
 
-SRND = rdi	/* clobbers INP */
-c = ecx
-d = r8d
-e = edx
+#define SRND rdi	/* clobbers INP */
+#define c ecx
+#define d r8d
+#define e edx
 
-TBL = rbp
-a = eax
-b = ebx
+#define TBL rbp
+#define a eax
+#define b ebx
 
-f = r9d
-g = r10d
-h = r11d
+#define f r9d
+#define g r10d
+#define h r11d
 
-y0 = r13d
-y1 = r14d
-y2 = r15d
+#define y0 r13d
+#define y1 r14d
+#define y2 r15d
 
 
 
 #define _INP_END_SIZE	8
 #define _INP_SIZE	8
 #define _XFER_SIZE	8
 #define _XMM_SAVE_SIZE	0
 /* STACK_SIZE plus pushes must be an odd multiple of 8 */
 #define _ALIGN_SIZE	8
 
 #define _INP_END	0
 #define _INP		(_INP_END  + _INP_END_SIZE)
 #define _XFER		(_INP      + _INP_SIZE)
 #define _XMM_SAVE	(_XFER     + _XFER_SIZE + _ALIGN_SIZE)
 #define STACK_SIZE	(_XMM_SAVE + _XMM_SAVE_SIZE)
 
-/* rotate_Xs
- * Rotate values of symbols X0...X3 */
-.macro rotate_Xs
-X_ = X0
-X0 = X1
-X1 = X2
-X2 = X3
-X3 = X_
-.endm
-
-/* ROTATE_ARGS
- * Rotate values of symbols a...h */
-.macro ROTATE_ARGS
-TMP_ = h
-h = g
-g = f
-f = e
-e = d
-d = c
-c = b
-b = a
-a = TMP_
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED
-		/* compute s0 four at a time and s1 two at a time
-		 * compute W[-16] + W[-7] 4 at a time */
-		movdqa	XTMP0, X3
-	mov	y0, e		/* y0 = e */
-	ror	y0, (25-11)	/* y0 = e >> (25-11) */
-	mov	y1, a		/* y1 = a */
-		palignr	XTMP0, X2, 4	/* XTMP0 = W[-7] */
-	ror	y1, (22-13)	/* y1 = a >> (22-13) */
-	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */
-	mov	y2, f		/* y2 = f */
-	ror	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
-		movdqa	XTMP1, X1
-	xor	y1, a		/* y1 = a ^ (a >> (22-13) */
-	xor	y2, g		/* y2 = f^g */
-		paddd	XTMP0, X0	/* XTMP0 = W[-7] + W[-16] */
-	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
-	and	y2, e		/* y2 = (f^g)&e */
-	ror	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
-		/* compute s0 */
-		palignr	XTMP1, X0, 4	/* XTMP1 = W[-15] */
-	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
-	ror	y0, 6		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
-	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */
-		movdqa	XTMP2, XTMP1	/* XTMP2 = W[-15] */
-	ror	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
-	add	y2, y0		/* y2 = S1 + CH */
-	add	y2, [rsp + _XFER + 0*4]	/* y2 = k + w + S1 + CH */
-		movdqa	XTMP3, XTMP1	/* XTMP3 = W[-15] */
-	mov	y0, a		/* y0 = a */
-	add	h, y2		/* h = h + S1 + CH + k + w */
-	mov	y2, a		/* y2 = a */
-		pslld	XTMP1, (32-7)
-	or	y0, c		/* y0 = a|c */
-	add	d, h		/* d = d + h + S1 + CH + k + w */
-	and	y2, c		/* y2 = a&c */
-		psrld	XTMP2, 7
-	and	y0, b		/* y0 = (a|c)&b */
-	add	h, y1		/* h = h + S1 + CH + k + w + S0 */
-		por	XTMP1, XTMP2	/* XTMP1 = W[-15] ror 7 */
-	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */
+
+#define FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+		/* compute s0 four at a time and s1 two at a time */; \
+		/* compute W[-16] + W[-7] 4 at a time */; \
+		movdqa	XTMP0, X3; \
+	mov	y0, e		/* y0 = e */; \
+	ror	y0, (25-11)	/* y0 = e >> (25-11) */; \
+	mov	y1, a		/* y1 = a */; \
+		palignr	XTMP0, X2, 4	/* XTMP0 = W[-7] */; \
+	ror	y1, (22-13)	/* y1 = a >> (22-13) */; \
+	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */; \
+	mov	y2, f		/* y2 = f */; \
+	ror	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+		movdqa	XTMP1, X1; \
+	xor	y1, a		/* y1 = a ^ (a >> (22-13) */; \
+	xor	y2, g		/* y2 = f^g */; \
+		paddd	XTMP0, X0	/* XTMP0 = W[-7] + W[-16] */; \
+	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+	and	y2, e		/* y2 = (f^g)&e */; \
+	ror	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+		/* compute s0 */; \
+		palignr	XTMP1, X0, 4	/* XTMP1 = W[-15] */; \
+	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+	ror	y0, 6		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */; \
+		movdqa	XTMP2, XTMP1	/* XTMP2 = W[-15] */; \
+	ror	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+	add	y2, y0		/* y2 = S1 + CH */; \
+	add	y2, [rsp + _XFER + 0*4]	/* y2 = k + w + S1 + CH */; \
+		movdqa	XTMP3, XTMP1	/* XTMP3 = W[-15] */; \
+	mov	y0, a		/* y0 = a */; \
+	add	h, y2		/* h = h + S1 + CH + k + w */; \
+	mov	y2, a		/* y2 = a */; \
+		pslld	XTMP1, (32-7); \
+	or	y0, c		/* y0 = a|c */; \
+	add	d, h		/* d = d + h + S1 + CH + k + w */; \
+	and	y2, c		/* y2 = a&c */; \
+		psrld	XTMP2, 7; \
+	and	y0, b		/* y0 = (a|c)&b */; \
+	add	h, y1		/* h = h + S1 + CH + k + w + S0 */; \
+		por	XTMP1, XTMP2	/* XTMP1 = W[-15] ror 7 */; \
+	or	y0, y2		/* y0 = MAJ = ((a|c)&b)|(a&c) */; \
 	lea	h, [h + y0]	/* h = h + S1 + CH + k + w + S0 + MAJ */
 
-ROTATE_ARGS
-		movdqa	XTMP2, XTMP3	/* XTMP2 = W[-15] */
-	mov	y0, e		/* y0 = e */
-	mov	y1, a		/* y1 = a */
-		movdqa	XTMP4, XTMP3	/* XTMP4 = W[-15] */
-	ror	y0, (25-11)	/* y0 = e >> (25-11) */
-	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */
-	mov	y2, f		/* y2 = f */
-	ror	y1, (22-13)	/* y1 = a >> (22-13) */
-		pslld	XTMP3, (32-18)
-	xor	y1, a		/* y1 = a ^ (a >> (22-13) */
-	ror	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
-	xor	y2, g		/* y2 = f^g */
-		psrld	XTMP2, 18
-	ror	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
-	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
-	and	y2, e		/* y2 = (f^g)&e */
-	ror	y0, 6		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
-		pxor	XTMP1, XTMP3
-	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
-	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */
-		psrld	XTMP4, 3	/* XTMP4 = W[-15] >> 3 */
-	add	y2, y0		/* y2 = S1 + CH */
-	add	y2, [rsp + _XFER + 1*4]	/* y2 = k + w + S1 + CH */
-	ror	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
-		pxor	XTMP1, XTMP2	/* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */
-	mov	y0, a		/* y0 = a */
-	add	h, y2		/* h = h + S1 + CH + k + w */
-	mov	y2, a		/* y2 = a */
-		pxor	XTMP1, XTMP4	/* XTMP1 = s0 */
-	or	y0, c		/* y0 = a|c */
-	add	d, h		/* d = d + h + S1 + CH + k + w */
-	and	y2, c		/* y2 = a&c */
-		/* compute low s1 */
-		pshufd	XTMP2, X3, 0b11111010	/* XTMP2 = W[-2] {BBAA} */
-	and	y0, b		/* y0 = (a|c)&b */
-	add	h, y1		/* h = h + S1 + CH + k + w + S0 */
-		paddd	XTMP0, XTMP1	/* XTMP0 = W[-16] + W[-7] + s0 */
-	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */
+#define FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+		movdqa	XTMP2, XTMP3	/* XTMP2 = W[-15] */; \
+	mov	y0, e		/* y0 = e */; \
+	mov	y1, a		/* y1 = a */; \
+		movdqa	XTMP4, XTMP3	/* XTMP4 = W[-15] */; \
+	ror	y0, (25-11)	/* y0 = e >> (25-11) */; \
+	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */; \
+	mov	y2, f		/* y2 = f */; \
+	ror	y1, (22-13)	/* y1 = a >> (22-13) */; \
+		pslld	XTMP3, (32-18); \
+	xor	y1, a		/* y1 = a ^ (a >> (22-13)) */; \
+	ror	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+	xor	y2, g		/* y2 = f^g */; \
+		psrld	XTMP2, 18; \
+	ror	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+	and	y2, e		/* y2 = (f^g)&e */; \
+	ror	y0, 6		/* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) */; \
+		pxor	XTMP1, XTMP3; \
+	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */; \
+		psrld	XTMP4, 3	/* XTMP4 = W[-15] >> 3 */; \
+	add	y2, y0		/* y2 = S1 + CH */; \
+	add	y2, [rsp + _XFER + 1*4]	/* y2 = k + w + S1 + CH */; \
+	ror	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+		pxor	XTMP1, XTMP2	/* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */; \
+	mov	y0, a		/* y0 = a */; \
+	add	h, y2		/* h = h + S1 + CH + k + w */; \
+	mov	y2, a		/* y2 = a */; \
+		pxor	XTMP1, XTMP4	/* XTMP1 = s0 */; \
+	or	y0, c		/* y0 = a|c */; \
+	add	d, h		/* d = d + h + S1 + CH + k + w */; \
+	and	y2, c		/* y2 = a&c */; \
+		/* compute low s1 */; \
+		pshufd	XTMP2, X3, 0b11111010	/* XTMP2 = W[-2] {BBAA} */; \
+	and	y0, b		/* y0 = (a|c)&b */; \
+	add	h, y1		/* h = h + S1 + CH + k + w + S0 */; \
+		paddd	XTMP0, XTMP1	/* XTMP0 = W[-16] + W[-7] + s0 */; \
+	or	y0, y2		/* y0 = MAJ = ((a|c)&b)|(a&c) */; \
 	lea	h, [h + y0]	/* h = h + S1 + CH + k + w + S0 + MAJ */
 
-ROTATE_ARGS
-		movdqa	XTMP3, XTMP2	/* XTMP3 = W[-2] {BBAA} */
-	mov	y0, e		/* y0 = e */
-	mov	y1, a		/* y1 = a */
-	ror	y0, (25-11)	/* y0 = e >> (25-11) */
-		movdqa	XTMP4, XTMP2	/* XTMP4 = W[-2] {BBAA} */
-	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */
-	ror	y1, (22-13)	/* y1 = a >> (22-13) */
-	mov	y2, f		/* y2 = f */
-	xor	y1, a		/* y1 = a ^ (a >> (22-13) */
-	ror	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
-		psrlq	XTMP2, 17	/* XTMP2 = W[-2] ror 17 {xBxA} */
-	xor	y2, g		/* y2 = f^g */
-		psrlq	XTMP3, 19	/* XTMP3 = W[-2] ror 19 {xBxA} */
-	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
-	and	y2, e		/* y2 = (f^g)&e */
-		psrld	XTMP4, 10	/* XTMP4 = W[-2] >> 10 {BBAA} */
-	ror	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
-	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
-	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */
-	ror	y0, 6		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
-		pxor	XTMP2, XTMP3
-	add	y2, y0		/* y2 = S1 + CH */
-	ror	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
-	add	y2, [rsp + _XFER + 2*4]	/* y2 = k + w + S1 + CH */
-		pxor	XTMP4, XTMP2	/* XTMP4 = s1 {xBxA} */
-	mov	y0, a		/* y0 = a */
-	add	h, y2		/* h = h + S1 + CH + k + w */
-	mov	y2, a		/* y2 = a */
-		pshufb	XTMP4, SHUF_00BA	/* XTMP4 = s1 {00BA} */
-	or	y0, c		/* y0 = a|c */
-	add	d, h		/* d = d + h + S1 + CH + k + w */
-	and	y2, c		/* y2 = a&c */
-		paddd	XTMP0, XTMP4	/* XTMP0 = {..., ..., W[1], W[0]} */
-	and	y0, b		/* y0 = (a|c)&b */
-	add	h, y1		/* h = h + S1 + CH + k + w + S0 */
-		/* compute high s1 */
-		pshufd	XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */
-	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */
+#define FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+		movdqa	XTMP3, XTMP2	/* XTMP3 = W[-2] {BBAA} */; \
+	mov	y0, e		/* y0 = e */; \
+	mov	y1, a		/* y1 = a */; \
+	ror	y0, (25-11)	/* y0 = e >> (25-11) */; \
+		movdqa	XTMP4, XTMP2	/* XTMP4 = W[-2] {BBAA} */; \
+	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */; \
+	ror	y1, (22-13)	/* y1 = a >> (22-13) */; \
+	mov	y2, f		/* y2 = f */; \
+	xor	y1, a		/* y1 = a ^ (a >> (22-13)) */; \
+	ror	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+		psrlq	XTMP2, 17	/* XTMP2 = W[-2] ror 17 {xBxA} */; \
+	xor	y2, g		/* y2 = f^g */; \
+		psrlq	XTMP3, 19	/* XTMP3 = W[-2] ror 19 {xBxA} */; \
+	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+	and	y2, e		/* y2 = (f^g)&e */; \
+		psrld	XTMP4, 10	/* XTMP4 = W[-2] >> 10 {BBAA} */; \
+	ror	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */; \
+	ror	y0, 6		/* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) */; \
+		pxor	XTMP2, XTMP3; \
+	add	y2, y0		/* y2 = S1 + CH */; \
+	ror	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+	add	y2, [rsp + _XFER + 2*4]	/* y2 = k + w + S1 + CH */; \
+		pxor	XTMP4, XTMP2	/* XTMP4 = s1 {xBxA} */; \
+	mov	y0, a		/* y0 = a */; \
+	add	h, y2		/* h = h + S1 + CH + k + w */; \
+	mov	y2, a		/* y2 = a */; \
+		pshufb	XTMP4, SHUF_00BA	/* XTMP4 = s1 {00BA} */; \
+	or	y0, c		/* y0 = a|c */; \
+	add	d, h		/* d = d + h + S1 + CH + k + w */; \
+	and	y2, c		/* y2 = a&c */; \
+		paddd	XTMP0, XTMP4	/* XTMP0 = {..., ..., W[1], W[0]} */; \
+	and	y0, b		/* y0 = (a|c)&b */; \
+	add	h, y1		/* h = h + S1 + CH + k + w + S0 */; \
+		/* compute high s1 */; \
+		pshufd	XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */; \
+	or	y0, y2		/* y0 = MAJ = ((a|c)&b)|(a&c) */; \
 	lea	h, [h + y0]	/* h = h + S1 + CH + k + w + S0 + MAJ */
 
-ROTATE_ARGS
-		movdqa	XTMP3, XTMP2	/* XTMP3 = W[-2] {DDCC} */
-	mov	y0, e		/* y0 = e */
-	ror	y0, (25-11)	/* y0 = e >> (25-11) */
-	mov	y1, a		/* y1 = a */
-		movdqa	X0,    XTMP2	/* X0    = W[-2] {DDCC} */
-	ror	y1, (22-13)	/* y1 = a >> (22-13) */
-	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */
-	mov	y2, f		/* y2 = f */
-	ror	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
-		psrlq	XTMP2, 17	/* XTMP2 = W[-2] ror 17 {xDxC} */
-	xor	y1, a		/* y1 = a ^ (a >> (22-13) */
-	xor	y2, g		/* y2 = f^g */
-		psrlq	XTMP3, 19	/* XTMP3 = W[-2] ror 19 {xDxC} */
-	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
-	and	y2, e		/* y2 = (f^g)&e */
-	ror	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
-		psrld	X0,    10	/* X0 = W[-2] >> 10 {DDCC} */
-	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
-	ror	y0, 6		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
-	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */
-		pxor	XTMP2, XTMP3
-	ror	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
-	add	y2, y0		/* y2 = S1 + CH */
-	add	y2, [rsp + _XFER + 3*4]	/* y2 = k + w + S1 + CH */
-		pxor	X0, XTMP2	/* X0 = s1 {xDxC} */
-	mov	y0, a		/* y0 = a */
-	add	h, y2		/* h = h + S1 + CH + k + w */
-	mov	y2, a		/* y2 = a */
-		pshufb	X0, SHUF_DC00	/* X0 = s1 {DC00} */
-	or	y0, c		/* y0 = a|c */
-	add	d, h		/* d = d + h + S1 + CH + k + w */
-	and	y2, c		/* y2 = a&c */
-		paddd	X0, XTMP0	/* X0 = {W[3], W[2], W[1], W[0]} */
-	and	y0, b		/* y0 = (a|c)&b */
-	add	h, y1		/* h = h + S1 + CH + k + w + S0 */
-	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */
+#define FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+		movdqa	XTMP3, XTMP2	/* XTMP3 = W[-2] {DDCC} */; \
+	mov	y0, e		/* y0 = e */; \
+	ror	y0, (25-11)	/* y0 = e >> (25-11) */; \
+	mov	y1, a		/* y1 = a */; \
+		movdqa	X0,    XTMP2	/* X0    = W[-2] {DDCC} */; \
+	ror	y1, (22-13)	/* y1 = a >> (22-13) */; \
+	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */; \
+	mov	y2, f		/* y2 = f */; \
+	ror	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+		psrlq	XTMP2, 17	/* XTMP2 = W[-2] ror 17 {xDxC} */; \
+	xor	y1, a		/* y1 = a ^ (a >> (22-13)) */; \
+	xor	y2, g		/* y2 = f^g */; \
+		psrlq	XTMP3, 19	/* XTMP3 = W[-2] ror 19 {xDxC} */; \
+	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+	and	y2, e		/* y2 = (f^g)&e */; \
+	ror	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+		psrld	X0,    10	/* X0 = W[-2] >> 10 {DDCC} */; \
+	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+	ror	y0, 6		/* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) */; \
+	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */; \
+		pxor	XTMP2, XTMP3; \
+	ror	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+	add	y2, y0		/* y2 = S1 + CH */; \
+	add	y2, [rsp + _XFER + 3*4]	/* y2 = k + w + S1 + CH */; \
+		pxor	X0, XTMP2	/* X0 = s1 {xDxC} */; \
+	mov	y0, a		/* y0 = a */; \
+	add	h, y2		/* h = h + S1 + CH + k + w */; \
+	mov	y2, a		/* y2 = a */; \
+		pshufb	X0, SHUF_DC00	/* X0 = s1 {DC00} */; \
+	or	y0, c		/* y0 = a|c */; \
+	add	d, h		/* d = d + h + S1 + CH + k + w */; \
+	and	y2, c		/* y2 = a&c */; \
+		paddd	X0, XTMP0	/* X0 = {W[3], W[2], W[1], W[0]} */; \
+	and	y0, b		/* y0 = (a|c)&b */; \
+	add	h, y1		/* h = h + S1 + CH + k + w + S0 */; \
+	or	y0, y2		/* y0 = MAJ = ((a|c)&b)|(a&c) */; \
 	lea	h, [h + y0]	/* h = h + S1 + CH + k + w + S0 + MAJ */
 
-ROTATE_ARGS
-rotate_Xs
-.endm
+#define FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+	FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h); \
+	FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, h, a, b, c, d, e, f, g); \
+	FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, g, h, a, b, c, d, e, f); \
+	FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, f, g, h, a, b, c, d, e);
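+
+/* Each sub-macro is handed the working variables rotated right by one,
+ * which replaces the old ROTATE_ARGS symbol rotation; after these four
+ * rounds the caller continues with a..h rotated by four places. */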
 
 /* input is [rsp + _XFER + i1 * 4] */
-.macro DO_ROUND i1
-	mov	y0, e		/* y0 = e */
-	ror	y0, (25-11)	/* y0 = e >> (25-11) */
-	mov	y1, a		/* y1 = a */
-	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */
-	ror	y1, (22-13)	/* y1 = a >> (22-13) */
-	mov	y2, f		/* y2 = f */
-	xor	y1, a		/* y1 = a ^ (a >> (22-13) */
-	ror	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
-	xor	y2, g		/* y2 = f^g */
-	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
-	ror	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
-	and	y2, e		/* y2 = (f^g)&e */
-	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
-	ror	y0, 6		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
-	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */
-	add	y2, y0		/* y2 = S1 + CH */
-	ror	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
-	add	y2, [rsp + _XFER + \i1 * 4]	/* y2 = k + w + S1 + CH */
-	mov	y0, a		/* y0 = a */
-	add	h, y2		/* h = h + S1 + CH + k + w */
-	mov	y2, a		/* y2 = a */
-	or	y0, c		/* y0 = a|c */
-	add	d, h		/* d = d + h + S1 + CH + k + w */
-	and	y2, c		/* y2 = a&c */
-	and	y0, b		/* y0 = (a|c)&b */
-	add	h, y1		/* h = h + S1 + CH + k + w + S0 */
-	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */
+#define DO_ROUND(i1, a, b, c, d, e, f, g, h) \
+	mov	y0, e		/* y0 = e */; \
+	ror	y0, (25-11)	/* y0 = e >> (25-11) */; \
+	mov	y1, a		/* y1 = a */; \
+	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */; \
+	ror	y1, (22-13)	/* y1 = a >> (22-13) */; \
+	mov	y2, f		/* y2 = f */; \
+	xor	y1, a		/* y1 = a ^ (a >> (22-13)) */; \
+	ror	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+	xor	y2, g		/* y2 = f^g */; \
+	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+	ror	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+	and	y2, e		/* y2 = (f^g)&e */; \
+	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+	ror	y0, 6		/* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) */; \
+	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */; \
+	add	y2, y0		/* y2 = S1 + CH */; \
+	ror	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+	add	y2, [rsp + _XFER + i1 * 4]	/* y2 = k + w + S1 + CH */; \
+	mov	y0, a		/* y0 = a */; \
+	add	h, y2		/* h = h + S1 + CH + k + w */; \
+	mov	y2, a		/* y2 = a */; \
+	or	y0, c		/* y0 = a|c */; \
+	add	d, h		/* d = d + h + S1 + CH + k + w */; \
+	and	y2, c		/* y2 = a&c */; \
+	and	y0, b		/* y0 = (a|c)&b */; \
+	add	h, y1		/* h = h + S1 + CH + k + w + S0 */; \
+	or	y0, y2		/* y0 = MAJ = ((a|c)&b)|(a&c) */; \
 	lea	h, [h + y0]	/* h = h + S1 + CH + k + w + S0 + MAJ */
-	ROTATE_ARGS
-.endm
 
 /*
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks)
 ;; arg 1 : pointer to input data
 ;; arg 2 : pointer to digest
 ;; arg 3 : Num blocks
 */
 .text
 .globl _gcry_sha256_transform_amd64_ssse3
 ELF(.type  _gcry_sha256_transform_amd64_ssse3,@function;)
 .align 16
 _gcry_sha256_transform_amd64_ssse3:
 	CFI_STARTPROC()
 	push	rbx
 	CFI_PUSH(rbx)
 	push	rbp
 	CFI_PUSH(rbp)
 	push	r13
 	CFI_PUSH(r13)
 	push	r14
 	CFI_PUSH(r14)
 	push	r15
 	CFI_PUSH(r15)
 
 	sub	rsp, STACK_SIZE
 	CFI_ADJUST_CFA_OFFSET(STACK_SIZE);
 
 	shl	NUM_BLKS, 6	/* convert to bytes */
 	jz	.Ldone_hash
 	add	NUM_BLKS, INP	/* pointer to end of data */
 	mov	[rsp + _INP_END], NUM_BLKS
 
 	/* load initial digest */
 	mov	a,[4*0 + CTX]
 	mov	b,[4*1 + CTX]
 	mov	c,[4*2 + CTX]
 	mov	d,[4*3 + CTX]
 	mov	e,[4*4 + CTX]
 	mov	f,[4*5 + CTX]
 	mov	g,[4*6 + CTX]
 	mov	h,[4*7 + CTX]
 
 	movdqa	BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP]
 	movdqa	SHUF_00BA, [.L_SHUF_00BA ADD_RIP]
 	movdqa	SHUF_DC00, [.L_SHUF_DC00 ADD_RIP]
 
 .Loop0:
 	lea	TBL, [.LK256 ADD_RIP]
 
 	/* byte swap first 16 dwords */
-	COPY_XMM_AND_BSWAP	X0, [INP + 0*16], BYTE_FLIP_MASK
-	COPY_XMM_AND_BSWAP	X1, [INP + 1*16], BYTE_FLIP_MASK
-	COPY_XMM_AND_BSWAP	X2, [INP + 2*16], BYTE_FLIP_MASK
-	COPY_XMM_AND_BSWAP	X3, [INP + 3*16], BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP(X0, [INP + 0*16], BYTE_FLIP_MASK)
+	COPY_XMM_AND_BSWAP(X1, [INP + 1*16], BYTE_FLIP_MASK)
+	COPY_XMM_AND_BSWAP(X2, [INP + 2*16], BYTE_FLIP_MASK)
+	COPY_XMM_AND_BSWAP(X3, [INP + 3*16], BYTE_FLIP_MASK)
 
 	mov	[rsp + _INP], INP
 
 	/* schedule 48 input dwords, by doing 3 rounds of 16 each */
 	mov	SRND, 3
 .align 16
 .Loop1:
 	movdqa	XFER, [TBL + 0*16]
 	paddd	XFER, X0
 	movdqa	[rsp + _XFER], XFER
-	FOUR_ROUNDS_AND_SCHED
+	FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h)
 
 	movdqa	XFER, [TBL + 1*16]
-	paddd	XFER, X0
+	paddd	XFER, X1
 	movdqa	[rsp + _XFER], XFER
-	FOUR_ROUNDS_AND_SCHED
+	FOUR_ROUNDS_AND_SCHED(X1, X2, X3, X0, e, f, g, h, a, b, c, d)
 
 	movdqa	XFER, [TBL + 2*16]
-	paddd	XFER, X0
+	paddd	XFER, X2
 	movdqa	[rsp + _XFER], XFER
-	FOUR_ROUNDS_AND_SCHED
+	FOUR_ROUNDS_AND_SCHED(X2, X3, X0, X1, a, b, c, d, e, f, g, h)
 
 	movdqa	XFER, [TBL + 3*16]
-	paddd	XFER, X0
+	paddd	XFER, X3
 	movdqa	[rsp + _XFER], XFER
 	add	TBL, 4*16
-	FOUR_ROUNDS_AND_SCHED
+	FOUR_ROUNDS_AND_SCHED(X3, X0, X1, X2, e, f, g, h, a, b, c, d)
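+	/* Four calls advance a..h by 16 rounds (back to the original order,
+	 * since 16 mod 8 == 0) and cycle X0..X3 back to their starting roles,
+	 * so every .Loop1 iteration begins with the same register assignment. */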
 
 	sub	SRND, 1
 	jne	.Loop1
 
 	mov	SRND, 2
 .Loop2:
 	paddd	X0, [TBL + 0*16]
 	movdqa	[rsp + _XFER], X0
-	DO_ROUND	0
-	DO_ROUND	1
-	DO_ROUND	2
-	DO_ROUND	3
+	DO_ROUND(0, a, b, c, d, e, f, g, h)
+	DO_ROUND(1, h, a, b, c, d, e, f, g)
+	DO_ROUND(2, g, h, a, b, c, d, e, f)
+	DO_ROUND(3, f, g, h, a, b, c, d, e)
 	paddd	X1, [TBL + 1*16]
 	movdqa	[rsp + _XFER], X1
 	add	TBL, 2*16
-	DO_ROUND	0
-	DO_ROUND	1
-	DO_ROUND	2
-	DO_ROUND	3
+	DO_ROUND(0, e, f, g, h, a, b, c, d)
+	DO_ROUND(1, d, e, f, g, h, a, b, c)
+	DO_ROUND(2, c, d, e, f, g, h, a, b)
+	DO_ROUND(3, b, c, d, e, f, g, h, a)
 
 	movdqa	X0, X2
 	movdqa	X1, X3
 
 	sub	SRND, 1
 	jne	.Loop2
 
-	addm	[4*0 + CTX],a
-	addm	[4*1 + CTX],b
-	addm	[4*2 + CTX],c
-	addm	[4*3 + CTX],d
-	addm	[4*4 + CTX],e
-	addm	[4*5 + CTX],f
-	addm	[4*6 + CTX],g
-	addm	[4*7 + CTX],h
+	addm([4*0 + CTX],a)
+	addm([4*1 + CTX],b)
+	addm([4*2 + CTX],c)
+	addm([4*3 + CTX],d)
+	addm([4*4 + CTX],e)
+	addm([4*5 + CTX],f)
+	addm([4*6 + CTX],g)
+	addm([4*7 + CTX],h)
 
 	mov	INP, [rsp + _INP]
 	add	INP, 64
 	cmp	INP, [rsp + _INP_END]
 	jne	.Loop0
 
 	pxor	xmm0, xmm0
 	pxor	xmm1, xmm1
 	pxor	xmm2, xmm2
 	pxor	xmm3, xmm3
 	pxor	xmm4, xmm4
 	pxor	xmm5, xmm5
 	pxor	xmm6, xmm6
 	pxor	xmm7, xmm7
 	pxor	xmm8, xmm8
 	pxor	xmm9, xmm9
 	pxor	xmm10, xmm10
 	pxor	xmm11, xmm11
 	pxor	xmm12, xmm12
 
 .Ldone_hash:
 	pxor	XFER, XFER
 	movdqa	[rsp + _XFER], XFER
 	xor     eax, eax
 
 	add	rsp, STACK_SIZE
 	CFI_ADJUST_CFA_OFFSET(-STACK_SIZE);
 
 	pop	r15
 	CFI_POP(r15)
 	pop	r14
 	CFI_POP(r14)
 	pop	r13
 	CFI_POP(r13)
 	pop	rbp
 	CFI_POP(rbp)
 	pop	rbx
 	CFI_POP(rbx)
 
 	ret
 	CFI_ENDPROC()
 
 
 .align 16
 .LK256:
 	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
 	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
 	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
 	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
 	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
 	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
 	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
 	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
 	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
 	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
 	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
 	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
 	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
 	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
 	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
 	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 
 .LPSHUFFLE_BYTE_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203
 
 /* shuffle xBxA -> 00BA */
 .L_SHUF_00BA:              .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
 
 /* shuffle xDxC -> DC00 */
 .L_SHUF_DC00:              .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
 
 #endif
 #endif
diff --git a/cipher/sha512-avx-amd64.S b/cipher/sha512-avx-amd64.S
index 991fd639..75f7b070 100644
--- a/cipher/sha512-avx-amd64.S
+++ b/cipher/sha512-avx-amd64.S
@@ -1,431 +1,461 @@
 /*
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; Copyright (c) 2012, Intel Corporation
 ;
 ; All rights reserved.
 ;
 ; Redistribution and use in source and binary forms, with or without
 ; modification, are permitted provided that the following conditions are
 ; met:
 ;
 ; * Redistributions of source code must retain the above copyright
 ;   notice, this list of conditions and the following disclaimer.
 ;
 ; * Redistributions in binary form must reproduce the above copyright
 ;   notice, this list of conditions and the following disclaimer in the
 ;   documentation and/or other materials provided with the
 ;   distribution.
 ;
 ; * Neither the name of the Intel Corporation nor the names of its
 ;   contributors may be used to endorse or promote products derived from
 ;   this software without specific prior written permission.
 ;
 ;
 ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
 ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 ; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
 ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 */
 /*
  * Conversion to GAS assembly and integration to libgcrypt
  *  by Jussi Kivilinna <jussi.kivilinna@iki.fi>
  */
 
 #ifdef __x86_64
 #include <config.h>
 #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
      defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
     defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
     defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA512)
 
 #include "asm-common-amd64.h"
 
 .intel_syntax noprefix
 
 .text
 
 /* Virtual Registers */
-msg = rdi /* ARG1 */
-digest = rsi /* ARG2 */
-msglen = rdx /* ARG3 */
-T1 = rcx
-T2 = r8
-a_64 = r9
-b_64 = r10
-c_64 = r11
-d_64 = r12
-e_64 = r13
-f_64 = r14
-g_64 = r15
-h_64 = rbx
-tmp0 = rax
+#define msg rdi /* ARG1 */
+#define digest rsi /* ARG2 */
+#define msglen rdx /* ARG3 */
+#define T1 rcx
+#define T2 r8
+#define a_64 r9
+#define b_64 r10
+#define c_64 r11
+#define d_64 r12
+#define e_64 r13
+#define f_64 r14
+#define g_64 r15
+#define h_64 rbx
+#define tmp0 rax
 
 /*
 ; Local variables (stack frame)
 ; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP
 */
-frame_W      = 0 /* Message Schedule */
-frame_W_size = (80 * 8)
-frame_WK      = ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */
-frame_WK_size = (2 * 8)
-frame_GPRSAVE      = ((frame_WK) + (frame_WK_size))
-frame_GPRSAVE_size = (5 * 8)
-frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size))
+#define frame_W 0 /* Message Schedule */
+#define frame_W_size (80 * 8)
+#define frame_WK ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */
+#define frame_WK_size (2 * 8)
+#define frame_GPRSAVE ((frame_WK) + (frame_WK_size))
+#define frame_GPRSAVE_size (5 * 8)
+#define frame_size ((frame_GPRSAVE) + (frame_GPRSAVE_size))
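+/* frame_size = 80*8 + 2*8 + 5*8 = 696, an odd multiple of 8; together with
+ * the 8-byte return address this keeps rsp 16-byte aligned for the vmovdqa
+ * accesses to the message schedule below. */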
 
 
 /* Useful QWORD "arrays" for simpler memory references */
 #define MSG(i)    msg    + 8*(i)               /* Input message (arg1) */
 #define DIGEST(i) digest + 8*(i)               /* Output Digest (arg2) */
 #define K_t(i)    .LK512   + 8*(i) ADD_RIP     /* SHA Constants (static mem) */
 #define W_t(i)    rsp + frame_W  + 8*(i)       /* Message Schedule (stack frame) */
 #define WK_2(i)   rsp + frame_WK + 8*((i) % 2) /* W[t]+K[t] (stack frame) */
 /* MSG, DIGEST, K_t, W_t are arrays */
 /* WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even */
 
-.macro RotateState
-	/* Rotate symbles a..h right */
-	__TMP = h_64
-	h_64 =  g_64
-	g_64 =  f_64
-	f_64 =  e_64
-	e_64 =  d_64
-	d_64 =  c_64
-	c_64 =  b_64
-	b_64 =  a_64
-	a_64 =  __TMP
-.endm
-
-.macro RORQ p1 p2
-	/* shld is faster than ror on Intel Sandybridge */
-	shld	\p1, \p1, (64 - \p2)
-.endm
-
-.macro SHA512_Round t
-	/* Compute Round %%t */
-	mov	T1,   f_64        /* T1 = f */
-	mov	tmp0, e_64        /* tmp = e */
-	xor	T1,   g_64        /* T1 = f ^ g */
-	RORQ	tmp0, 23 /* 41     ; tmp = e ror 23 */
-	and	T1,   e_64        /* T1 = (f ^ g) & e */
-	xor	tmp0, e_64        /* tmp = (e ror 23) ^ e */
-	xor	T1,   g_64        /* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */
-	add	T1,   [WK_2(\t)] /* W[t] + K[t] from message scheduler */
-	RORQ	tmp0, 4 /* 18      ; tmp = ((e ror 23) ^ e) ror 4 */
-	xor	tmp0, e_64        /* tmp = (((e ror 23) ^ e) ror 4) ^ e */
-	mov	T2,   a_64        /* T2 = a */
-	add	T1,   h_64        /* T1 = CH(e,f,g) + W[t] + K[t] + h */
-	RORQ	tmp0, 14 /* 14     ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */
-	add	T1,   tmp0        /* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */
-	mov	tmp0, a_64        /* tmp = a */
-	xor	T2,   c_64        /* T2 = a ^ c */
-	and	tmp0, c_64        /* tmp = a & c */
-	and	T2,   b_64        /* T2 = (a ^ c) & b */
-	xor	T2,   tmp0        /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */
-	mov	tmp0, a_64        /* tmp = a */
-	RORQ	tmp0, 5 /* 39      ; tmp = a ror 5 */
-	xor	tmp0, a_64        /* tmp = (a ror 5) ^ a */
-	add	d_64, T1          /* e(next_state) = d + T1  */
-	RORQ	tmp0, 6 /* 34      ; tmp = ((a ror 5) ^ a) ror 6 */
-	xor	tmp0, a_64        /* tmp = (((a ror 5) ^ a) ror 6) ^ a */
-	lea	h_64, [T1 + T2]   /* a(next_state) = T1 + Maj(a,b,c) */
-	RORQ	tmp0, 28 /* 28     ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */
-	add	h_64, tmp0        /* a(next_state) = T1 + Maj(a,b,c) S0(a) */
-	RotateState
-.endm
-
-.macro SHA512_2Sched_2Round_avx t
-/*	; Compute rounds %%t-2 and %%t-1
-	; Compute message schedule QWORDS %%t and %%t+1
-
-	;   Two rounds are computed based on the values for K[t-2]+W[t-2] and
-	; K[t-1]+W[t-1] which were previously stored at WK_2 by the message
-	; scheduler.
-	;   The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
-	; They are then added to their respective SHA512 constants at
-	; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
-	;   For brievity, the comments following vectored instructions only refer to
-	; the first of a pair of QWORDS.
-	; Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]}
-	;   The computation of the message schedule and the rounds are tightly
-	; stitched to take advantage of instruction-level parallelism.
-	; For clarity, integer instructions (for the rounds calculation) are indented
-	; by one tab. Vectored instructions (for the message scheduler) are indented
-	; by two tabs. */
-
-		vmovdqa	xmm4, [W_t(\t-2)]   /* XMM4 = W[t-2] */
-		vmovdqu	xmm5, [W_t(\t-15)]  /* XMM5 = W[t-15] */
-	mov	T1,   f_64
-		vpsrlq	xmm0, xmm4, 61       /* XMM0 = W[t-2]>>61 */
-	mov	tmp0, e_64
-		vpsrlq	xmm6, xmm5, 1        /* XMM6 = W[t-15]>>1 */
-	xor	T1,   g_64
-	RORQ	tmp0, 23 /* 41 */
-		vpsrlq	xmm1, xmm4, 19       /* XMM1 = W[t-2]>>19 */
-	and	T1,   e_64
-	xor	tmp0, e_64
-		vpxor	xmm0, xmm0, xmm1           /* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 */
-	xor	T1,   g_64
-	add	T1,   [WK_2(\t)];
-		vpsrlq	xmm7, xmm5, 8        /* XMM7 = W[t-15]>>8 */
-	RORQ	tmp0, 4 /* 18 */
-		vpsrlq	xmm2, xmm4, 6        /* XMM2 = W[t-2]>>6 */
-	xor	tmp0, e_64
-	mov	T2,   a_64
-	add	T1,   h_64
-		vpxor	xmm6, xmm6, xmm7           /* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 */
-	RORQ	tmp0, 14 /* 14 */
-	add	T1,   tmp0
-		vpsrlq	xmm8, xmm5, 7        /* XMM8 = W[t-15]>>7 */
-	mov 	tmp0, a_64
-	xor	T2,   c_64
-		vpsllq	xmm3, xmm4, (64-61)  /* XMM3 = W[t-2]<<3 */
-	and	tmp0, c_64
-	and	T2,   b_64
-		vpxor	xmm2, xmm2, xmm3           /* XMM2 = W[t-2]>>6 ^ W[t-2]<<3 */
-	xor	T2,   tmp0
-	mov	tmp0, a_64
-		vpsllq	xmm9, xmm5, (64-1)   /* XMM9 = W[t-15]<<63 */
-	RORQ	tmp0, 5 /* 39 */
-		vpxor	xmm8, xmm8, xmm9           /* XMM8 = W[t-15]>>7 ^ W[t-15]<<63 */
-	xor	tmp0, a_64
-	add	d_64, T1
-	RORQ	tmp0, 6 /* 34 */
-	xor	tmp0, a_64
-		vpxor	xmm6, xmm6, xmm8           /* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ W[t-15]>>7 ^ W[t-15]<<63 */
-	lea	h_64, [T1 + T2]
-	RORQ 	tmp0, 28 /* 28 */
-		vpsllq	xmm4, xmm4, (64-19)        /* XMM4 = W[t-2]<<25 */
-	add	h_64, tmp0
-	RotateState
-		vpxor	xmm0, xmm0, xmm4           /* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ W[t-2]<<25 */
-	mov	T1, f_64
-		vpxor	xmm0, xmm0, xmm2           /* XMM0 = s1(W[t-2]) */
-	mov	tmp0, e_64
-	xor	T1,   g_64
-		vpaddq	xmm0, xmm0, [W_t(\t-16)]  /* XMM0 = s1(W[t-2]) + W[t-16] */
-		vmovdqu	xmm1, [W_t(\t- 7)]  /* XMM1 = W[t-7] */
-	RORQ	tmp0, 23 /* 41 */
-	and	T1,   e_64
-	xor	tmp0, e_64
-	xor	T1,   g_64
-		vpsllq	xmm5, xmm5, (64-8)         /* XMM5 = W[t-15]<<56 */
-	add	T1,   [WK_2(\t+1)]
-		vpxor	xmm6, xmm6, xmm5           /* XMM6 = s0(W[t-15]) */
-	RORQ	tmp0, 4 /* 18 */
-		vpaddq	xmm0, xmm0, xmm6           /* XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15]) */
-	xor	tmp0, e_64
-		vpaddq	xmm0, xmm0, xmm1           /* XMM0 = W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */
-	mov	T2,   a_64
-	add	T1,   h_64
-	RORQ	tmp0, 14 /* 14 */
-	add	T1,   tmp0
-		vmovdqa	[W_t(\t)], xmm0      /* Store W[t] */
-		vpaddq	xmm0, xmm0, [K_t(t)]        /* Compute W[t]+K[t] */
-		vmovdqa	[WK_2(t)], xmm0       /* Store W[t]+K[t] for next rounds */
-	mov	tmp0, a_64
-	xor	T2,   c_64
-	and	tmp0, c_64
-	and	T2,   b_64
-	xor	T2,   tmp0
-	mov	tmp0, a_64
-	RORQ	tmp0, 5 /* 39 */
-	xor	tmp0, a_64
-	add	d_64, T1
-	RORQ	tmp0, 6 /* 34 */
-	xor	tmp0, a_64
-	lea	h_64, [T1 + T2]
-	RORQ	tmp0, 28 /* 28 */
-	add	h_64, tmp0
-	RotateState
-.endm
+#define RORQ(p1, p2) \
+	/* shld is faster than ror on Intel Sandybridge */ \
+	shld	p1, p1, (64 - p2)
+
+#define SHA512_Round(t, a, b, c, d, e, f, g, h) \
+	/* Compute Round t */; \
+	mov	T1,   f        /* T1 = f */; \
+	mov	tmp0, e        /* tmp = e */; \
+	xor	T1,   g        /* T1 = f ^ g */; \
+	RORQ(	tmp0, 23) /* 41     ; tmp = e ror 23 */; \
+	and	T1,   e        /* T1 = (f ^ g) & e */; \
+	xor	tmp0, e        /* tmp = (e ror 23) ^ e */; \
+	xor	T1,   g        /* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */; \
+	add	T1,   [WK_2(t)] /* W[t] + K[t] from message scheduler */; \
+	RORQ(	tmp0, 4) /* 18      ; tmp = ((e ror 23) ^ e) ror 4 */; \
+	xor	tmp0, e        /* tmp = (((e ror 23) ^ e) ror 4) ^ e */; \
+	mov	T2,   a        /* T2 = a */; \
+	add	T1,   h        /* T1 = CH(e,f,g) + W[t] + K[t] + h */; \
+	RORQ(	tmp0, 14) /* 14     ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */; \
+	add	T1,   tmp0        /* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */; \
+	mov	tmp0, a        /* tmp = a */; \
+	xor	T2,   c        /* T2 = a ^ c */; \
+	and	tmp0, c        /* tmp = a & c */; \
+	and	T2,   b        /* T2 = (a ^ c) & b */; \
+	xor	T2,   tmp0        /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */; \
+	mov	tmp0, a        /* tmp = a */; \
+	RORQ(	tmp0, 5) /* 39      ; tmp = a ror 5 */; \
+	xor	tmp0, a        /* tmp = (a ror 5) ^ a */; \
+	add	d, T1          /* e(next_state) = d + T1  */; \
+	RORQ(	tmp0, 6) /* 34      ; tmp = ((a ror 5) ^ a) ror 6 */; \
+	xor	tmp0, a        /* tmp = (((a ror 5) ^ a) ror 6) ^ a */; \
+	lea	h, [T1 + T2]   /* a(next_state) = T1 + Maj(a,b,c) */; \
+	RORQ(	tmp0, 28) /* 28     ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */; \
+	add	h, tmp0        /* a(next_state) = T1 + Maj(a,b,c) + S0(a) */
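+
+/* The bracketed numbers beside each RORQ (41, 18, 14 and 39, 34, 28) are the
+ * cumulative rotate distances: the staggered rotate/xor chain evaluates
+ * S1(e) = (e ror 14) ^ (e ror 18) ^ (e ror 41) and
+ * S0(a) = (a ror 28) ^ (a ror 34) ^ (a ror 39) using only three rotates each. */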
+
+#define SHA512_2Sched_2Round_avx_PART1(t, a, b, c, d, e, f, g, h) \
+	/* \
+	; Compute rounds %%t-2 and %%t-1 \
+	; Compute message schedule QWORDS %%t and %%t+1 \
+	; \
+	;   Two rounds are computed based on the values for K[t-2]+W[t-2] and \
+	; K[t-1]+W[t-1] which were previously stored at WK_2 by the message \
+	; scheduler. \
+	;   The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. \
+	; They are then added to their respective SHA512 constants at \
+	; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] \
+	;   For brevity, the comments following vectored instructions only refer to \
+	; the first of a pair of QWORDS. \
+	; Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]} \
+	;   The computation of the message schedule and the rounds are tightly \
+	; stitched to take advantage of instruction-level parallelism. \
+	; For clarity, integer instructions (for the rounds calculation) are indented \
+	; by one tab. Vectored instructions (for the message scheduler) are indented \
+	; by two tabs. \
+	*/ \
+	\
+		vmovdqa	xmm4, [W_t(t-2)]   /* XMM4 = W[t-2] */; \
+		vmovdqu	xmm5, [W_t(t-15)]  /* XMM5 = W[t-15] */; \
+	mov	T1,   f; \
+		vpsrlq	xmm0, xmm4, 61       /* XMM0 = W[t-2]>>61 */; \
+	mov	tmp0, e; \
+		vpsrlq	xmm6, xmm5, 1        /* XMM6 = W[t-15]>>1 */; \
+	xor	T1,   g; \
+	RORQ(	tmp0, 23) /* 41 */; \
+		vpsrlq	xmm1, xmm4, 19       /* XMM1 = W[t-2]>>19 */; \
+	and	T1,   e; \
+	xor	tmp0, e; \
+		vpxor	xmm0, xmm0, xmm1           /* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 */; \
+	xor	T1,   g; \
+	add	T1,   [WK_2(t)]; \
+		vpsrlq	xmm7, xmm5, 8        /* XMM7 = W[t-15]>>8 */; \
+	RORQ(	tmp0, 4) /* 18 */; \
+		vpsrlq	xmm2, xmm4, 6        /* XMM2 = W[t-2]>>6 */; \
+	xor	tmp0, e; \
+	mov	T2,   a; \
+	add	T1,   h; \
+		vpxor	xmm6, xmm6, xmm7           /* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 */; \
+	RORQ(	tmp0, 14) /* 14 */; \
+	add	T1,   tmp0; \
+		vpsrlq	xmm8, xmm5, 7        /* XMM8 = W[t-15]>>7 */; \
+	mov 	tmp0, a; \
+	xor	T2,   c; \
+		vpsllq	xmm3, xmm4, (64-61)  /* XMM3 = W[t-2]<<3 */; \
+	and	tmp0, c; \
+	and	T2,   b; \
+		vpxor	xmm2, xmm2, xmm3           /* XMM2 = W[t-2]>>6 ^ W[t-2]<<3 */; \
+	xor	T2,   tmp0; \
+	mov	tmp0, a; \
+		vpsllq	xmm9, xmm5, (64-1)   /* XMM9 = W[t-15]<<63 */; \
+	RORQ(	tmp0, 5) /* 39 */; \
+		vpxor	xmm8, xmm8, xmm9           /* XMM8 = W[t-15]>>7 ^ W[t-15]<<63 */; \
+	xor	tmp0, a; \
+	add	d, T1; \
+	RORQ(	tmp0, 6) /* 34 */; \
+	xor	tmp0, a; \
+		vpxor	xmm6, xmm6, xmm8           /* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ W[t-15]>>7 ^ W[t-15]<<63 */; \
+	lea	h, [T1 + T2]; \
+	RORQ(	tmp0, 28) /* 28 */; \
+		vpsllq	xmm4, xmm4, (64-19)        /* XMM4 = W[t-2]<<45 */; \
+	add	h, tmp0
+
+#define SHA512_2Sched_2Round_avx_PART2(t, a, b, c, d, e, f, g, h) \
+		vpxor	xmm0, xmm0, xmm4           /* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ W[t-2]<<45 */; \
+	mov	T1, f; \
+		vpxor	xmm0, xmm0, xmm2           /* XMM0 = s1(W[t-2]) */; \
+	mov	tmp0, e; \
+	xor	T1,   g; \
+		vpaddq	xmm0, xmm0, [W_t(t-16)]  /* XMM0 = s1(W[t-2]) + W[t-16] */; \
+		vmovdqu	xmm1, [W_t(t- 7)]  /* XMM1 = W[t-7] */; \
+	RORQ(	tmp0, 23) /* 41 */; \
+	and	T1,   e; \
+	xor	tmp0, e; \
+	xor	T1,   g; \
+		vpsllq	xmm5, xmm5, (64-8)         /* XMM5 = W[t-15]<<56 */; \
+	add	T1,   [WK_2(t+1)]; \
+		vpxor	xmm6, xmm6, xmm5           /* XMM6 = s0(W[t-15]) */; \
+	RORQ(	tmp0, 4) /* 18 */; \
+		vpaddq	xmm0, xmm0, xmm6           /* XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15]) */; \
+	xor	tmp0, e; \
+		vpaddq	xmm0, xmm0, xmm1           /* XMM0 = W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */; \
+	mov	T2,   a; \
+	add	T1,   h; \
+	RORQ(	tmp0, 14) /* 14 */; \
+	add	T1,   tmp0; \
+		vmovdqa	[W_t(t)], xmm0      /* Store W[t] */; \
+		vpaddq	xmm0, xmm0, [K_t(t)]        /* Compute W[t]+K[t] */; \
+		vmovdqa	[WK_2(t)], xmm0       /* Store W[t]+K[t] for next rounds */; \
+	mov	tmp0, a; \
+	xor	T2,   c; \
+	and	tmp0, c; \
+	and	T2,   b; \
+	xor	T2,   tmp0; \
+	mov	tmp0, a; \
+	RORQ(	tmp0, 5) /* 39 */; \
+	xor	tmp0, a; \
+	add	d, T1; \
+	RORQ(	tmp0, 6) /* 34 */; \
+	xor	tmp0, a; \
+	lea	h, [T1 + T2]; \
+	RORQ(	tmp0, 28) /* 28 */; \
+	add	h, tmp0
+
+#define SHA512_2Sched_2Round_avx(t, a, b, c, d, e, f, g, h) \
+	SHA512_2Sched_2Round_avx_PART1(t, a, b, c, d, e, f, g, h); \
+	SHA512_2Sched_2Round_avx_PART2(t, h, a, b, c, d, e, f, g)
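+
+/* The vector half of the macro above computes two schedule words at a time:
+ *   W[t]  = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
+ *   s0(x) = (x ror 1)  ^ (x ror 8)  ^ (x >> 7)
+ *   s1(x) = (x ror 19) ^ (x ror 61) ^ (x >> 6)
+ * with each rotate assembled from a shift-right/shift-left pair. */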
 
 /*
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; void sha512_avx(const void* M, void* D, uint64_t L);
 ; Purpose: Updates the SHA512 digest stored at D with the message stored in M.
 ; The size of the message pointed to by M must be an integer multiple of SHA512
 ;   message blocks.
 ; L is the message length in SHA512 blocks
 */
 .globl _gcry_sha512_transform_amd64_avx
 ELF(.type _gcry_sha512_transform_amd64_avx,@function;)
 .align 16
 _gcry_sha512_transform_amd64_avx:
 	CFI_STARTPROC()
 	xor eax, eax
 
 	cmp	msglen, 0
 	je	.Lnowork
 
 	vzeroupper
 
 	/* Allocate Stack Space */
 	sub	rsp, frame_size
 	CFI_ADJUST_CFA_OFFSET(frame_size);
 
 	/* Save GPRs */
 	mov	[rsp + frame_GPRSAVE + 8 * 0], rbx
 	mov	[rsp + frame_GPRSAVE + 8 * 1], r12
 	mov	[rsp + frame_GPRSAVE + 8 * 2], r13
 	mov	[rsp + frame_GPRSAVE + 8 * 3], r14
 	mov	[rsp + frame_GPRSAVE + 8 * 4], r15
 	CFI_REL_OFFSET(rbx, frame_GPRSAVE + 8 * 0);
 	CFI_REL_OFFSET(r12, frame_GPRSAVE + 8 * 1);
 	CFI_REL_OFFSET(r13, frame_GPRSAVE + 8 * 2);
 	CFI_REL_OFFSET(r14, frame_GPRSAVE + 8 * 3);
 	CFI_REL_OFFSET(r15, frame_GPRSAVE + 8 * 4);
 
 .Lupdateblock:
 
 	/* Load state variables */
 	mov	a_64, [DIGEST(0)]
 	mov	b_64, [DIGEST(1)]
 	mov	c_64, [DIGEST(2)]
 	mov	d_64, [DIGEST(3)]
 	mov	e_64, [DIGEST(4)]
 	mov	f_64, [DIGEST(5)]
 	mov	g_64, [DIGEST(6)]
 	mov	h_64, [DIGEST(7)]
 
-	t = 0
-	.rept 80/2 + 1
-	/* (80 rounds) / (2 rounds/iteration) + (1 iteration) */
-	/* +1 iteration because the scheduler leads hashing by 1 iteration */
-		.if t < 2
-			/* BSWAP 2 QWORDS */
-			vmovdqa	xmm1, [.LXMM_QWORD_BSWAP ADD_RIP]
-			vmovdqu	xmm0, [MSG(t)]
-			vpshufb	xmm0, xmm0, xmm1     /* BSWAP */
-			vmovdqa	[W_t(t)], xmm0       /* Store Scheduled Pair */
-			vpaddq	xmm0, xmm0, [K_t(t)] /* Compute W[t]+K[t] */
-			vmovdqa	[WK_2(t)], xmm0      /* Store into WK for rounds */
-		.elseif t < 16
-			/* BSWAP 2 QWORDS, Compute 2 Rounds */
-			vmovdqu	xmm0, [MSG(t)]
-			vpshufb	xmm0, xmm0, xmm1     /* BSWAP */
-			SHA512_Round (t - 2)         /* Round t-2 */
-			vmovdqa	[W_t(t)], xmm0       /* Store Scheduled Pair */
-			vpaddq	xmm0, xmm0, [K_t(t)] /* Compute W[t]+K[t] */
-			SHA512_Round (t - 1)         /* Round t-1 */
-			vmovdqa	[WK_2(t)], xmm0      /* W[t]+K[t] into WK */
-		.elseif t < 79
-			/* Schedule 2 QWORDS; Compute 2 Rounds */
-			SHA512_2Sched_2Round_avx t
-		.else
-			/* Compute 2 Rounds */
-			SHA512_Round (t - 2)
-			SHA512_Round (t - 1)
-		.endif
-		t = ((t)+2)
-	.endr
+	/* BSWAP 2 QWORDS */
+	vmovdqa	xmm1, [.LXMM_QWORD_BSWAP ADD_RIP]
+	vmovdqu	xmm0, [MSG(0)]
+	vpshufb	xmm0, xmm0, xmm1     /* BSWAP */
+	vmovdqa	[W_t(0)], xmm0       /* Store Scheduled Pair */
+	vpaddq	xmm0, xmm0, [K_t(0)] /* Compute W[t]+K[t] */
+	vmovdqa	[WK_2(0)], xmm0      /* Store into WK for rounds */
+
+	#define T_2_14(t, a, b, c, d, e, f, g, h) \
+		/* BSWAP 2 QWORDS, Compute 2 Rounds */; \
+		vmovdqu	xmm0, [MSG(t)]; \
+		vpshufb	xmm0, xmm0, xmm1     /* BSWAP */; \
+		SHA512_Round(((t) - 2), a##_64, b##_64, c##_64, d##_64, \
+				        e##_64, f##_64, g##_64, h##_64); \
+		vmovdqa	[W_t(t)], xmm0       /* Store Scheduled Pair */; \
+		vpaddq	xmm0, xmm0, [K_t(t)] /* Compute W[t]+K[t] */; \
+		SHA512_Round(((t) - 1), h##_64, a##_64, b##_64, c##_64, \
+				        d##_64, e##_64, f##_64, g##_64); \
+		vmovdqa	[WK_2(t)], xmm0      /* W[t]+K[t] into WK */
+
+	#define T_16_78(t, a, b, c, d, e, f, g, h) \
+		SHA512_2Sched_2Round_avx((t), a##_64, b##_64, c##_64, d##_64, \
+					      e##_64, f##_64, g##_64, h##_64)
+
+	#define T_80(t, a, b, c, d, e, f, g, h) \
+		/* Compute 2 Rounds */; \
+		SHA512_Round((t - 2), a##_64, b##_64, c##_64, d##_64, \
+				      e##_64, f##_64, g##_64, h##_64); \
+		SHA512_Round((t - 1), h##_64, a##_64, b##_64, c##_64, \
+				      d##_64, e##_64, f##_64, g##_64)
+
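+	/* Each T_* step below consumes two rounds, so the state arguments are
+	 * rotated right by two from one call to the next and return to the
+	 * original order every four calls (eight rounds). */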
+	T_2_14(2, a, b, c, d, e, f, g, h)
+	T_2_14(4, g, h, a, b, c, d, e, f)
+	T_2_14(6, e, f, g, h, a, b, c, d)
+	T_2_14(8, c, d, e, f, g, h, a, b)
+	T_2_14(10, a, b, c, d, e, f, g, h)
+	T_2_14(12, g, h, a, b, c, d, e, f)
+	T_2_14(14, e, f, g, h, a, b, c, d)
+	T_16_78(16, c, d, e, f, g, h, a, b)
+	T_16_78(18, a, b, c, d, e, f, g, h)
+	T_16_78(20, g, h, a, b, c, d, e, f)
+	T_16_78(22, e, f, g, h, a, b, c, d)
+	T_16_78(24, c, d, e, f, g, h, a, b)
+	T_16_78(26, a, b, c, d, e, f, g, h)
+	T_16_78(28, g, h, a, b, c, d, e, f)
+	T_16_78(30, e, f, g, h, a, b, c, d)
+	T_16_78(32, c, d, e, f, g, h, a, b)
+	T_16_78(34, a, b, c, d, e, f, g, h)
+	T_16_78(36, g, h, a, b, c, d, e, f)
+	T_16_78(38, e, f, g, h, a, b, c, d)
+	T_16_78(40, c, d, e, f, g, h, a, b)
+	T_16_78(42, a, b, c, d, e, f, g, h)
+	T_16_78(44, g, h, a, b, c, d, e, f)
+	T_16_78(46, e, f, g, h, a, b, c, d)
+	T_16_78(48, c, d, e, f, g, h, a, b)
+	T_16_78(50, a, b, c, d, e, f, g, h)
+	T_16_78(52, g, h, a, b, c, d, e, f)
+	T_16_78(54, e, f, g, h, a, b, c, d)
+	T_16_78(56, c, d, e, f, g, h, a, b)
+	T_16_78(58, a, b, c, d, e, f, g, h)
+	T_16_78(60, g, h, a, b, c, d, e, f)
+	T_16_78(62, e, f, g, h, a, b, c, d)
+	T_16_78(64, c, d, e, f, g, h, a, b)
+	T_16_78(66, a, b, c, d, e, f, g, h)
+	T_16_78(68, g, h, a, b, c, d, e, f)
+	T_16_78(70, e, f, g, h, a, b, c, d)
+	T_16_78(72, c, d, e, f, g, h, a, b)
+	T_16_78(74, a, b, c, d, e, f, g, h)
+	T_16_78(76, g, h, a, b, c, d, e, f)
+	T_16_78(78, e, f, g, h, a, b, c, d)
+	T_80(80, c, d, e, f, g, h, a, b)
 
 	/* Update digest */
 	add	[DIGEST(0)], a_64
 	add	[DIGEST(1)], b_64
 	add	[DIGEST(2)], c_64
 	add	[DIGEST(3)], d_64
 	add	[DIGEST(4)], e_64
 	add	[DIGEST(5)], f_64
 	add	[DIGEST(6)], g_64
 	add	[DIGEST(7)], h_64
 
 	/* Advance to next message block */
 	add	msg, 16*8
 	dec	msglen
 	jnz	.Lupdateblock
 
 	/* Restore GPRs */
 	mov	rbx, [rsp + frame_GPRSAVE + 8 * 0]
 	mov	r12, [rsp + frame_GPRSAVE + 8 * 1]
 	mov	r13, [rsp + frame_GPRSAVE + 8 * 2]
 	mov	r14, [rsp + frame_GPRSAVE + 8 * 3]
 	mov	r15, [rsp + frame_GPRSAVE + 8 * 4]
 	CFI_RESTORE(rbx)
 	CFI_RESTORE(r12)
 	CFI_RESTORE(r13)
 	CFI_RESTORE(r14)
 	CFI_RESTORE(r15)
 
 	vzeroall
 
 	/* Burn stack */
-	t = 0
-	.rept frame_W_size / 32
-		vmovups [rsp + frame_W + (t) * 32], ymm0
-		t = ((t)+1)
-	.endr
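+	/* ymm0 is all-zero after the vzeroall above, so these stores wipe the
+	 * 80-entry message schedule 32 bytes at a time. */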
+	mov eax, 0
+.Lerase_stack:
+	vmovdqu [rsp + rax], ymm0
+	add eax, 32
+	cmp eax, frame_W_size
+	jne .Lerase_stack
 	vmovdqu [rsp + frame_WK], xmm0
 	xor     eax, eax
 
 	/* Restore Stack Pointer */
 	add	rsp, frame_size
 	CFI_ADJUST_CFA_OFFSET(-frame_size);
 
 .Lnowork:
 	ret
 	CFI_ENDPROC()
 
 /*
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;; Binary Data
 */
 
 .align 16
 
 /* Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. */
 .LXMM_QWORD_BSWAP:
 	.octa 0x08090a0b0c0d0e0f0001020304050607
 
 /* K[t] used in SHA512 hashing */
 .LK512:
 	.quad 0x428a2f98d728ae22,0x7137449123ef65cd
 	.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
 	.quad 0x3956c25bf348b538,0x59f111f1b605d019
 	.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
 	.quad 0xd807aa98a3030242,0x12835b0145706fbe
 	.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
 	.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
 	.quad 0x9bdc06a725c71235,0xc19bf174cf692694
 	.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
 	.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
 	.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
 	.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
 	.quad 0x983e5152ee66dfab,0xa831c66d2db43210
 	.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
 	.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
 	.quad 0x06ca6351e003826f,0x142929670a0e6e70
 	.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
 	.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
 	.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
 	.quad 0x81c2c92e47edaee6,0x92722c851482353b
 	.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
 	.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
 	.quad 0xd192e819d6ef5218,0xd69906245565a910
 	.quad 0xf40e35855771202a,0x106aa07032bbd1b8
 	.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
 	.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
 	.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
 	.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
 	.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
 	.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
 	.quad 0x90befffa23631e28,0xa4506cebde82bde9
 	.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
 	.quad 0xca273eceea26619c,0xd186b8c721c0c207
 	.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
 	.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
 	.quad 0x113f9804bef90dae,0x1b710b35131c471b
 	.quad 0x28db77f523047d84,0x32caab7b40c72493
 	.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
 	.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
 	.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
 
 #endif
 #endif
diff --git a/cipher/sha512-avx2-bmi2-amd64.S b/cipher/sha512-avx2-bmi2-amd64.S
index 3b28ab6c..7f119e6c 100644
--- a/cipher/sha512-avx2-bmi2-amd64.S
+++ b/cipher/sha512-avx2-bmi2-amd64.S
@@ -1,568 +1,502 @@
 /*
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; Copyright (c) 2012, Intel Corporation
 ;
 ; All rights reserved.
 ;
 ; Redistribution and use in source and binary forms, with or without
 ; modification, are permitted provided that the following conditions are
 ; met:
 ;
 ; * Redistributions of source code must retain the above copyright
 ;   notice, this list of conditions and the following disclaimer.
 ;
 ; * Redistributions in binary form must reproduce the above copyright
 ;   notice, this list of conditions and the following disclaimer in the
 ;   documentation and/or other materials provided with the
 ;   distribution.
 ;
 ; * Neither the name of the Intel Corporation nor the names of its
 ;   contributors may be used to endorse or promote products derived from
 ;   this software without specific prior written permission.
 ;
 ;
 ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
 ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 ; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
 ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; This code schedules 1 blocks at a time, with 4 lanes per block
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 */
 /*
  * Conversion to GAS assembly and integration to libgcrypt
  *  by Jussi Kivilinna <jussi.kivilinna@iki.fi>
  */
 
 #ifdef __x86_64
 #include <config.h>
 #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
      defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
     defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
     defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \
     defined(USE_SHA512)
 
 #include "asm-common-amd64.h"
 
 .intel_syntax noprefix
 
 .text
 
 /* Virtual Registers */
-Y_0 = ymm4
-Y_1 = ymm5
-Y_2 = ymm6
-Y_3 = ymm7
-
-YTMP0 = ymm0
-YTMP1 = ymm1
-YTMP2 = ymm2
-YTMP3 = ymm3
-YTMP4 = ymm8
-XFER =  YTMP0
-
-BYTE_FLIP_MASK =  ymm9
-MASK_YMM_LO    =  ymm10
-MASK_YMM_LOx   =  xmm10
-
-INP =         rdi /* 1st arg */
-CTX =         rsi /* 2nd arg */
-NUM_BLKS =    rdx /* 3rd arg */
-c =           rcx
-d =           r8
-e =           rdx
-y3 =          rdi
-
-TBL =   rbp
-
-a =     rax
-b =     rbx
-
-f =     r9
-g =     r10
-h =     r11
-old_h = rax
-
-T1 =    r12
-y0 =    r13
-y1 =    r14
-y2 =    r15
-
-y4 =    r12
+#define Y_0 ymm4
+#define Y_1 ymm5
+#define Y_2 ymm6
+#define Y_3 ymm7
+
+#define YTMP0 ymm0
+#define YTMP1 ymm1
+#define YTMP2 ymm2
+#define YTMP3 ymm3
+#define YTMP4 ymm8
+#define XFER YTMP0
+
+#define BYTE_FLIP_MASK ymm9
+#define MASK_YMM_LO ymm10
+#define MASK_YMM_LOx xmm10
+
+#define INP rdi /* 1st arg */
+#define CTX rsi /* 2nd arg */
+#define NUM_BLKS rdx /* 3rd arg */
+#define c rcx
+#define d r8
+#define e rdx
+#define y3 rdi
+
+#define TBL rbp
+
+#define a rax
+#define b rbx
+
+#define f r9
+#define g r10
+#define h r11
+
+#define T1 r12
+#define y0 r13
+#define y1 r14
+#define y2 r15
+
+#define y4 r12
 
 /* Local variables (stack frame) */
 #define frame_XFER      0
 #define frame_XFER_size (4*4*8)
 #define frame_SRND      (frame_XFER + frame_XFER_size)
 #define frame_SRND_size (1*8)
 #define frame_INP      (frame_SRND + frame_SRND_size)
 #define frame_INP_size (1*8)
 #define frame_NBLKS      (frame_INP + frame_INP_size)
 #define frame_NBLKS_size (1*8)
 #define frame_RSPSAVE      (frame_NBLKS + frame_NBLKS_size)
 #define frame_RSPSAVE_size (1*8)
 #define frame_GPRSAVE      (frame_RSPSAVE + frame_RSPSAVE_size)
 #define frame_GPRSAVE_size (6*8)
 #define frame_size (frame_GPRSAVE + frame_GPRSAVE_size)
 
 #define	VMOVDQ vmovdqu /*; assume buffers not aligned  */
 
 /* addm [mem], reg */
 /* Add reg to mem using reg-mem add and store */
-.macro addm p1 p2
-	add	\p2, \p1
-	mov	\p1, \p2
-.endm
+#define addm(p1, p2) \
+	add	p2, p1; \
+	mov	p1, p2;
 
 
 /* COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask */
 /* Load ymm with mem and byte swap each dword */
-.macro COPY_YMM_AND_BSWAP p1 p2 p3
-	VMOVDQ \p1, \p2
-	vpshufb \p1, \p1, \p3
-.endm
-/* rotate_Ys */
-/* Rotate values of symbols Y0...Y3 */
-.macro rotate_Ys
-	__Y_ = Y_0
-	Y_0 = Y_1
-	Y_1 = Y_2
-	Y_2 = Y_3
-	Y_3 = __Y_
-.endm
-
-/* RotateState */
-.macro RotateState
-	/* Rotate symbles a..h right */
-	old_h =  h
-	__TMP_ = h
-	h =      g
-	g =      f
-	f =      e
-	e =      d
-	d =      c
-	c =      b
-	b =      a
-	a =      __TMP_
-.endm
+#define COPY_YMM_AND_BSWAP(p1, p2, p3) \
+	VMOVDQ p1, p2; \
+	vpshufb p1, p1, p3
 
 /* %macro MY_VPALIGNR	YDST, YSRC1, YSRC2, RVAL */
 /* YDST = {YSRC1, YSRC2} >> RVAL*8 */
-.macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL
-	vperm2f128 	\YDST, \YSRC1, \YSRC2, 0x3	/* YDST = {YS1_LO, YS2_HI} */
-	vpalignr 	\YDST, \YDST, \YSRC2, \RVAL	/* YDST = {YDS1, YS2} >> RVAL*8 */
-.endm
-
-.macro ONE_ROUND_PART1 XFER
-	/* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]);
-	 * d += h;
-	 * h += Sum0 (a) + Maj (a, b, c);
-	 *
-	 * Ch(x, y, z) => ((x & y) + (~x & z))
-	 * Maj(x, y, z) => ((x & y) + (z & (x ^ y)))
-	 */
-
-	mov y3, e
-	add h, [\XFER]
-	and y3, f
-	rorx y0, e, 41
-	rorx y1, e, 18
+#define MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL) \
+	vperm2i128 YDST, YSRC1, YSRC2, 0x3 /* YDST = {YS1_LO, YS2_HI} */; \
+	vpalignr   YDST, YDST, YSRC2, RVAL /* YDST = {YDS1, YS2} >> RVAL*8 */
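+
+/* vperm2i128 is the AVX2 integer form of the vperm2f128 used previously;
+ * it may avoid crossing between the floating-point and integer execution
+ * domains on some microarchitectures. */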
+
+#define ONE_ROUND_PART1(XFERIN, a, b, c, d, e, f, g, h) \
+	/* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]); \
+	 * d += h; \
+	 * h += Sum0 (a) + Maj (a, b, c); \
+	 * \
+	 * Ch(x, y, z) => ((x & y) + (~x & z)) \
+	 * Maj(x, y, z) => ((x & y) + (z & (x ^ y))) \
+	 */ \
+	\
+	mov y3, e; \
+	add h, [XFERIN]; \
+	and y3, f; \
+	rorx y0, e, 41; \
+	rorx y1, e, 18; \
+	lea h, [h + y3]; \
+	andn y3, e, g; \
+	rorx T1, a, 34; \
+	xor y0, y1; \
 	lea h, [h + y3]
-	andn y3, e, g
-	rorx T1, a, 34
-	xor y0, y1
-	lea h, [h + y3]
-.endm
-.macro ONE_ROUND_PART2
-	rorx y2, a, 39
-	rorx y1, e, 14
-	mov y3, a
-	xor T1, y2
-	xor y0, y1
-	xor y3, b
-	lea h, [h + y0]
-	mov y0, a
-	rorx y2, a, 28
-	add d, h
-	and y3, c
-	xor T1, y2
-	lea h, [h + y3]
-	lea h, [h + T1]
-	and y0, b
-	lea h, [h + y0]
-.endm
-
-.macro ONE_ROUND XFER
-	ONE_ROUND_PART1 \XFER
-	ONE_ROUND_PART2
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED X
-/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-		/* Extract w[t-7] */
-		MY_VPALIGNR	YTMP0, Y_3, Y_2, 8		/* YTMP0 = W[-7] */
-		/* Calculate w[t-16] + w[t-7] */
-		vpaddq		YTMP0, YTMP0, Y_0		/* YTMP0 = W[-7] + W[-16] */
-		/* Extract w[t-15] */
-		MY_VPALIGNR	YTMP1, Y_1, Y_0, 8		/* YTMP1 = W[-15] */
-
-		/* Calculate sigma0 */
-
-		/* Calculate w[t-15] ror 1 */
-		vpsrlq		YTMP2, YTMP1, 1
-		vpsllq		YTMP3, YTMP1, (64-1)
-		vpor		YTMP3, YTMP3, YTMP2		/* YTMP3 = W[-15] ror 1 */
-		/* Calculate w[t-15] shr 7 */
-		vpsrlq		YTMP4, YTMP1, 7			/* YTMP4 = W[-15] >> 7 */
-
-	ONE_ROUND rsp+frame_XFER+0*8+\X*32
-	RotateState
-
-/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-/*;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-		/* Calculate w[t-15] ror 8 */
-		vpsrlq		YTMP2, YTMP1, 8
-		vpsllq		YTMP1, YTMP1, (64-8)
-		vpor		YTMP1, YTMP1, YTMP2		/* YTMP1 = W[-15] ror 8 */
-		/* XOR the three components */
-		vpxor		YTMP3, YTMP3, YTMP4		/* YTMP3 = W[-15] ror 1 ^ W[-15] >> 7 */
-		vpxor		YTMP1, YTMP3, YTMP1		/* YTMP1 = s0 */
-
-
-		/* Add three components, w[t-16], w[t-7] and sigma0 */
-		vpaddq		YTMP0, YTMP0, YTMP1		/* YTMP0 = W[-16] + W[-7] + s0 */
-		/* Move to appropriate lanes for calculating w[16] and w[17] */
-		vperm2f128	Y_0, YTMP0, YTMP0, 0x0		/* Y_0 = W[-16] + W[-7] + s0 {BABA} */
-		/* Move to appropriate lanes for calculating w[18] and w[19] */
-		vpand		YTMP0, YTMP0, MASK_YMM_LO	/* YTMP0 = W[-16] + W[-7] + s0 {DC00} */
-
-		/* Calculate w[16] and w[17] in both 128 bit lanes */
-
-		/* Calculate sigma1 for w[16] and w[17] on both 128 bit lanes */
-		vperm2f128	YTMP2, Y_3, Y_3, 0x11		/* YTMP2 = W[-2] {BABA} */
-		vpsrlq		YTMP4, YTMP2, 6			/* YTMP4 = W[-2] >> 6 {BABA} */
-
-	ONE_ROUND rsp+frame_XFER+1*8+\X*32
-	RotateState
-
-/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-/*;;;;;;;;;;;;;;;;;;;;;;;;; */
 
+#define ONE_ROUND_PART2(a, b, c, d, e, f, g, h) \
+	rorx y2, a, 39; \
+	rorx y1, e, 14; \
+	mov y3, a; \
+	xor T1, y2; \
+	xor y0, y1; \
+	xor y3, b; \
+	lea h, [h + y0]; \
+	mov y0, a; \
+	rorx y2, a, 28; \
+	add d, h; \
+	and y3, c; \
+	xor T1, y2; \
+	lea h, [h + y3]; \
+	lea h, [h + T1]; \
+	and y0, b; \
+	lea h, [h + y0]
 
-		vpsrlq		YTMP3, YTMP2, 19		/* YTMP3 = W[-2] >> 19 {BABA} */
-		vpsllq		YTMP1, YTMP2, (64-19)		/* YTMP1 = W[-2] << 19 {BABA} */
-		vpor		YTMP3, YTMP3, YTMP1		/* YTMP3 = W[-2] ror 19 {BABA} */
-		vpxor		YTMP4, YTMP4, YTMP3		/* YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} */
-		vpsrlq		YTMP3, YTMP2, 61		/* YTMP3 = W[-2] >> 61 {BABA} */
-		vpsllq		YTMP1, YTMP2, (64-61)		/* YTMP1 = W[-2] << 61 {BABA} */
-		vpor		YTMP3, YTMP3, YTMP1		/* YTMP3 = W[-2] ror 61 {BABA} */
-		vpxor		YTMP4, YTMP4, YTMP3		/* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} */
-
-		/* Add sigma1 to the other compunents to get w[16] and w[17] */
-		vpaddq		Y_0, Y_0, YTMP4			/* Y_0 = {W[1], W[0], W[1], W[0]} */
-
-		/* Calculate sigma1 for w[18] and w[19] for upper 128 bit lane */
-		vpsrlq		YTMP4, Y_0, 6			/* YTMP4 = W[-2] >> 6 {DC--} */
-
-	ONE_ROUND rsp+frame_XFER+2*8+\X*32
-	RotateState
-
-/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-/*;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-		vpsrlq		YTMP3, Y_0, 19			/* YTMP3 = W[-2] >> 19 {DC--} */
-		vpsllq		YTMP1, Y_0, (64-19)		/* YTMP1 = W[-2] << 19 {DC--} */
-		vpor		YTMP3, YTMP3, YTMP1		/* YTMP3 = W[-2] ror 19 {DC--} */
-		vpxor		YTMP4, YTMP4, YTMP3		/* YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} */
-		vpsrlq		YTMP3, Y_0, 61			/* YTMP3 = W[-2] >> 61 {DC--} */
-		vpsllq		YTMP1, Y_0, (64-61)		/* YTMP1 = W[-2] << 61 {DC--} */
-		vpor		YTMP3, YTMP3, YTMP1		/* YTMP3 = W[-2] ror 61 {DC--} */
-		vpxor		YTMP4, YTMP4, YTMP3		/* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {DC--} */
-
-		/* Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] to newly calculated sigma1 to get w[18] and w[19] */
-		vpaddq		YTMP2, YTMP0, YTMP4		/* YTMP2 = {W[3], W[2], --, --} */
-
-		/* Form w[19, w[18], w17], w[16] */
-		vpblendd		Y_0, Y_0, YTMP2, 0xF0		/* Y_0 = {W[3], W[2], W[1], W[0]} */
-
-	ONE_ROUND_PART1 rsp+frame_XFER+3*8+\X*32
-		vpaddq		XFER, Y_0, [TBL + (4+\X)*32]
-		vmovdqa		[rsp + frame_XFER + \X*32], XFER
-	ONE_ROUND_PART2
-	RotateState
-	rotate_Ys
-.endm
-
-.macro DO_4ROUNDS X
-
-/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-	ONE_ROUND rsp+frame_XFER+0*8+\X*32
-	RotateState
-
-/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-	ONE_ROUND rsp+frame_XFER+1*8+\X*32
-	RotateState
-
-/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-	ONE_ROUND rsp+frame_XFER+2*8+\X*32
-	RotateState
-
-/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-	ONE_ROUND rsp+frame_XFER+3*8+\X*32
-	RotateState
-
-.endm
+#define ONE_ROUND(XFERIN, a, b, c, d, e, f, g, h) \
+	ONE_ROUND_PART1(XFERIN, a, b, c, d, e, f, g, h); \
+	ONE_ROUND_PART2(a, b, c, d, e, f, g, h)
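+
+/* The additive Ch/Maj forms above are bit-for-bit equivalent to the usual
+ * xor-based definitions, because the two terms of each sum never set the
+ * same bit; writing them as sums lets the partial results be accumulated
+ * with lea/add alongside the rorx-based Sum0/Sum1 terms. */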
+
+#define FOUR_ROUNDS_AND_SCHED(X, Y_0, Y_1, Y_2, Y_3, a, b, c, d, e, f, g, h) \
+	/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+		/* Extract w[t-7] */; \
+		MY_VPALIGNR(	YTMP0, Y_3, Y_2, 8)		/* YTMP0 = W[-7] */; \
+		/* Calculate w[t-16] + w[t-7] */; \
+		vpaddq		YTMP0, YTMP0, Y_0		/* YTMP0 = W[-7] + W[-16] */; \
+		/* Extract w[t-15] */; \
+		MY_VPALIGNR(	YTMP1, Y_1, Y_0, 8)		/* YTMP1 = W[-15] */; \
+		\
+		/* Calculate sigma0 */; \
+		\
+		/* Calculate w[t-15] ror 1 */; \
+		vpsrlq		YTMP2, YTMP1, 1; \
+		vpsllq		YTMP3, YTMP1, (64-1); \
+		vpor		YTMP3, YTMP3, YTMP2		/* YTMP3 = W[-15] ror 1 */; \
+		/* Calculate w[t-15] shr 7 */; \
+		vpsrlq		YTMP4, YTMP1, 7			/* YTMP4 = W[-15] >> 7 */; \
+	\
+	ONE_ROUND(rsp+frame_XFER+0*8+X*32, a, b, c, d, e, f, g, h); \
+	\
+	/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+		/* Calculate w[t-15] ror 8 */; \
+		vpsrlq		YTMP2, YTMP1, 8; \
+		vpsllq		YTMP1, YTMP1, (64-8); \
+		vpor		YTMP1, YTMP1, YTMP2		/* YTMP1 = W[-15] ror 8 */; \
+		/* XOR the three components */; \
+		vpxor		YTMP3, YTMP3, YTMP4		/* YTMP3 = W[-15] ror 1 ^ W[-15] >> 7 */; \
+		vpxor		YTMP1, YTMP3, YTMP1		/* YTMP1 = s0 */; \
+		\
+		/* Add three components, w[t-16], w[t-7] and sigma0 */; \
+		vpaddq		YTMP0, YTMP0, YTMP1		/* YTMP0 = W[-16] + W[-7] + s0 */; \
+		/* Move to appropriate lanes for calculating w[16] and w[17] */; \
+		vperm2i128	Y_0, YTMP0, YTMP0, 0x0		/* Y_0 = W[-16] + W[-7] + s0 {BABA} */; \
+		/* Move to appropriate lanes for calculating w[18] and w[19] */; \
+		vpand		YTMP0, YTMP0, MASK_YMM_LO	/* YTMP0 = W[-16] + W[-7] + s0 {DC00} */; \
+		\
+		/* Calculate w[16] and w[17] in both 128 bit lanes */; \
+		\
+		/* Calculate sigma1 for w[16] and w[17] on both 128 bit lanes */; \
+		vperm2i128	YTMP2, Y_3, Y_3, 0x11		/* YTMP2 = W[-2] {BABA} */; \
+		vpsrlq		YTMP4, YTMP2, 6			/* YTMP4 = W[-2] >> 6 {BABA} */; \
+	\
+	ONE_ROUND(rsp+frame_XFER+1*8+X*32, h, a, b, c, d, e, f, g); \
+	\
+	/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+		vpsrlq		YTMP3, YTMP2, 19		/* YTMP3 = W[-2] >> 19 {BABA} */; \
+		vpsllq		YTMP1, YTMP2, (64-19)		/* YTMP1 = W[-2] << 19 {BABA} */; \
+		vpor		YTMP3, YTMP3, YTMP1		/* YTMP3 = W[-2] ror 19 {BABA} */; \
+		vpxor		YTMP4, YTMP4, YTMP3		/* YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} */; \
+		vpsrlq		YTMP3, YTMP2, 61		/* YTMP3 = W[-2] >> 61 {BABA} */; \
+		vpsllq		YTMP1, YTMP2, (64-61)		/* YTMP1 = W[-2] << 61 {BABA} */; \
+		vpor		YTMP3, YTMP3, YTMP1		/* YTMP3 = W[-2] ror 61 {BABA} */; \
+		vpxor		YTMP4, YTMP4, YTMP3		/* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} */; \
+		\
+		/* Add sigma1 to the other components to get w[16] and w[17] */; \
+		vpaddq		Y_0, Y_0, YTMP4			/* Y_0 = {W[1], W[0], W[1], W[0]} */; \
+		\
+		/* Calculate sigma1 for w[18] and w[19] for upper 128 bit lane */; \
+		vpsrlq		YTMP4, Y_0, 6			/* YTMP4 = W[-2] >> 6 {DC--} */; \
+	\
+	ONE_ROUND(rsp+frame_XFER+2*8+X*32, g, h, a, b, c, d, e, f); \
+	\
+	/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+		vpsrlq		YTMP3, Y_0, 19			/* YTMP3 = W[-2] >> 19 {DC--} */; \
+		vpsllq		YTMP1, Y_0, (64-19)		/* YTMP1 = W[-2] << 19 {DC--} */; \
+		vpor		YTMP3, YTMP3, YTMP1		/* YTMP3 = W[-2] ror 19 {DC--} */; \
+		vpxor		YTMP4, YTMP4, YTMP3		/* YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} */; \
+		vpsrlq		YTMP3, Y_0, 61			/* YTMP3 = W[-2] >> 61 {DC--} */; \
+		vpsllq		YTMP1, Y_0, (64-61)		/* YTMP1 = W[-2] << 61 {DC--} */; \
+		vpor		YTMP3, YTMP3, YTMP1		/* YTMP3 = W[-2] ror 61 {DC--} */; \
+		vpxor		YTMP4, YTMP4, YTMP3		/* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {DC--} */; \
+		\
+		/* Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] to newly calculated sigma1 to get w[18] and w[19] */; \
+		vpaddq		YTMP2, YTMP0, YTMP4		/* YTMP2 = {W[3], W[2], --, --} */; \
+		\
+		/* Form w[19], w[18], w[17], w[16] */; \
+		vpblendd	Y_0, Y_0, YTMP2, 0xF0		/* Y_0 = {W[3], W[2], W[1], W[0]} */; \
+	\
+	ONE_ROUND_PART1(rsp+frame_XFER+3*8+X*32, f, g, h, a, b, c, d, e); \
+		vpaddq		XFER, Y_0, [TBL + (4+X)*32]; \
+		vmovdqa		[rsp + frame_XFER + X*32], XFER; \
+	ONE_ROUND_PART2(f, g, h, a, b, c, d, e)
+
+#define DO_4ROUNDS(X, a, b, c, d, e, f, g, h) \
+	ONE_ROUND(rsp+frame_XFER+0*8+X*32, a, b, c, d, e, f, g, h); \
+	ONE_ROUND(rsp+frame_XFER+1*8+X*32, h, a, b, c, d, e, f, g); \
+	ONE_ROUND(rsp+frame_XFER+2*8+X*32, g, h, a, b, c, d, e, f); \
+	ONE_ROUND(rsp+frame_XFER+3*8+X*32, f, g, h, a, b, c, d, e)
 
 /*
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; void sha512_rorx(const void* M, void* D, uint64_t L);
 ; Purpose: Updates the SHA512 digest stored at D with the message stored in M.
 ; The size of the message pointed to by M must be an integer multiple of SHA512
 ;   message blocks.
 ; L is the message length in SHA512 blocks
 */
 .globl _gcry_sha512_transform_amd64_avx2
 ELF(.type _gcry_sha512_transform_amd64_avx2,@function;)
 .align 16
 _gcry_sha512_transform_amd64_avx2:
 	CFI_STARTPROC()
 	xor eax, eax
 
 	cmp rdx, 0
 	je .Lnowork
 
 	vzeroupper
 
 	/* Allocate Stack Space */
 	mov	rax, rsp
 	CFI_DEF_CFA_REGISTER(rax);
 	sub	rsp, frame_size
 	and	rsp, ~(0x40 - 1)
 	mov	[rsp + frame_RSPSAVE], rax
 	CFI_CFA_ON_STACK(frame_RSPSAVE, 0)
 
 	/* Save GPRs */
 	mov	[rsp + frame_GPRSAVE + 8 * 0], rbp
 	mov	[rsp + frame_GPRSAVE + 8 * 1], rbx
 	mov	[rsp + frame_GPRSAVE + 8 * 2], r12
 	mov	[rsp + frame_GPRSAVE + 8 * 3], r13
 	mov	[rsp + frame_GPRSAVE + 8 * 4], r14
 	mov	[rsp + frame_GPRSAVE + 8 * 5], r15
 	CFI_REG_ON_STACK(rbp, frame_GPRSAVE + 8 * 0)
 	CFI_REG_ON_STACK(rbx, frame_GPRSAVE + 8 * 1)
 	CFI_REG_ON_STACK(r12, frame_GPRSAVE + 8 * 2)
 	CFI_REG_ON_STACK(r13, frame_GPRSAVE + 8 * 3)
 	CFI_REG_ON_STACK(r14, frame_GPRSAVE + 8 * 4)
 	CFI_REG_ON_STACK(r15, frame_GPRSAVE + 8 * 5)
 
 	mov	[rsp + frame_NBLKS], NUM_BLKS
 
 	/*; load initial digest */
 	mov	a,[8*0 + CTX]
 	mov	b,[8*1 + CTX]
 	mov	c,[8*2 + CTX]
 	mov	d,[8*3 + CTX]
 	mov	e,[8*4 + CTX]
 	mov	f,[8*5 + CTX]
 	mov	g,[8*6 + CTX]
 	mov	h,[8*7 + CTX]
 
 	vmovdqa	BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP]
 	vmovdqa	MASK_YMM_LO, [.LMASK_YMM_LO ADD_RIP]
 
 	lea	TBL,[.LK512 ADD_RIP]
 
 	/*; byte swap first 16 dwords */
-	COPY_YMM_AND_BSWAP	Y_0, [INP + 0*32], BYTE_FLIP_MASK
-	COPY_YMM_AND_BSWAP	Y_1, [INP + 1*32], BYTE_FLIP_MASK
-	COPY_YMM_AND_BSWAP	Y_2, [INP + 2*32], BYTE_FLIP_MASK
-	COPY_YMM_AND_BSWAP	Y_3, [INP + 3*32], BYTE_FLIP_MASK
+	COPY_YMM_AND_BSWAP(Y_0, [INP + 0*32], BYTE_FLIP_MASK)
+	COPY_YMM_AND_BSWAP(Y_1, [INP + 1*32], BYTE_FLIP_MASK)
+	COPY_YMM_AND_BSWAP(Y_2, [INP + 2*32], BYTE_FLIP_MASK)
+	COPY_YMM_AND_BSWAP(Y_3, [INP + 3*32], BYTE_FLIP_MASK)
 
 	add	INP, 128
 	mov	[rsp + frame_INP], INP
 
 	vpaddq	XFER, Y_0, [TBL + 0*32]
 	vmovdqa [rsp + frame_XFER + 0*32], XFER
 	vpaddq	XFER, Y_1, [TBL + 1*32]
 	vmovdqa [rsp + frame_XFER + 1*32], XFER
 	vpaddq	XFER, Y_2, [TBL + 2*32]
 	vmovdqa [rsp + frame_XFER + 2*32], XFER
 	vpaddq	XFER, Y_3, [TBL + 3*32]
 	vmovdqa [rsp + frame_XFER + 3*32], XFER
 
 	/*; schedule 64 input dwords, by doing 12 rounds of 4 each */
-	movq	[rsp + frame_SRND],4
+	mov	qword ptr [rsp + frame_SRND], 4
 
 .align 16
 .Loop0:
-	FOUR_ROUNDS_AND_SCHED 0
-	FOUR_ROUNDS_AND_SCHED 1
-	FOUR_ROUNDS_AND_SCHED 2
-	FOUR_ROUNDS_AND_SCHED 3
+	FOUR_ROUNDS_AND_SCHED(0, Y_0, Y_1, Y_2, Y_3, a, b, c, d, e, f, g, h)
+	FOUR_ROUNDS_AND_SCHED(1, Y_1, Y_2, Y_3, Y_0, e, f, g, h, a, b, c, d)
+	FOUR_ROUNDS_AND_SCHED(2, Y_2, Y_3, Y_0, Y_1, a, b, c, d, e, f, g, h)
+	FOUR_ROUNDS_AND_SCHED(3, Y_3, Y_0, Y_1, Y_2, e, f, g, h, a, b, c, d)
 	add	TBL, 4*32
 
-	subq	[rsp + frame_SRND], 1
+	sub	qword ptr [rsp + frame_SRND], 1
 	jne	.Loop0
 
-	subq	[rsp + frame_NBLKS], 1
+	sub	qword ptr [rsp + frame_NBLKS], 1
 	je	.Ldone_hash
 
 	mov	INP, [rsp + frame_INP]
 
 	lea	TBL,[.LK512 ADD_RIP]
 
 	/* load next block and byte swap */
-	COPY_YMM_AND_BSWAP	Y_0, [INP + 0*32], BYTE_FLIP_MASK
-	COPY_YMM_AND_BSWAP	Y_1, [INP + 1*32], BYTE_FLIP_MASK
-	COPY_YMM_AND_BSWAP	Y_2, [INP + 2*32], BYTE_FLIP_MASK
-	COPY_YMM_AND_BSWAP	Y_3, [INP + 3*32], BYTE_FLIP_MASK
+	COPY_YMM_AND_BSWAP(Y_0, [INP + 0*32], BYTE_FLIP_MASK)
+	COPY_YMM_AND_BSWAP(Y_1, [INP + 1*32], BYTE_FLIP_MASK)
+	COPY_YMM_AND_BSWAP(Y_2, [INP + 2*32], BYTE_FLIP_MASK)
+	COPY_YMM_AND_BSWAP(Y_3, [INP + 3*32], BYTE_FLIP_MASK)
 
 	add	INP, 128
 	mov	[rsp + frame_INP], INP
 
-	DO_4ROUNDS 0
+	DO_4ROUNDS(0, a, b, c, d, e, f, g, h)
 	vpaddq	XFER, Y_0, [TBL + 0*32]
 	vmovdqa [rsp + frame_XFER + 0*32], XFER
-	DO_4ROUNDS 1
+	DO_4ROUNDS(1, e, f, g, h, a, b, c, d)
 	vpaddq	XFER, Y_1, [TBL + 1*32]
 	vmovdqa [rsp + frame_XFER + 1*32], XFER
-	DO_4ROUNDS 2
+	DO_4ROUNDS(2, a, b, c, d, e, f, g, h)
 	vpaddq	XFER, Y_2, [TBL + 2*32]
 	vmovdqa [rsp + frame_XFER + 2*32], XFER
-	DO_4ROUNDS 3
+	DO_4ROUNDS(3, e, f, g, h, a, b, c, d)
 	vpaddq	XFER, Y_3, [TBL + 3*32]
 	vmovdqa [rsp + frame_XFER + 3*32], XFER
 
-	addm	[8*0 + CTX],a
-	addm	[8*1 + CTX],b
-	addm	[8*2 + CTX],c
-	addm	[8*3 + CTX],d
-	addm	[8*4 + CTX],e
-	addm	[8*5 + CTX],f
-	addm	[8*6 + CTX],g
-	addm	[8*7 + CTX],h
+	addm([8*0 + CTX],a)
+	addm([8*1 + CTX],b)
+	addm([8*2 + CTX],c)
+	addm([8*3 + CTX],d)
+	addm([8*4 + CTX],e)
+	addm([8*5 + CTX],f)
+	addm([8*6 + CTX],g)
+	addm([8*7 + CTX],h)
 
 	/*; schedule 64 input dwords, by doing 12 rounds of 4 each */
-	movq	[rsp + frame_SRND],4
+	mov	qword ptr [rsp + frame_SRND], 4
 
 	jmp	.Loop0
 
 .Ldone_hash:
 	vzeroall
 
-	DO_4ROUNDS 0
+	DO_4ROUNDS(0, a, b, c, d, e, f, g, h)
 	vmovdqa	[rsp + frame_XFER + 0*32], ymm0 /* burn stack */
-	DO_4ROUNDS 1
+	DO_4ROUNDS(1, e, f, g, h, a, b, c, d)
 	vmovdqa	[rsp + frame_XFER + 1*32], ymm0 /* burn stack */
-	DO_4ROUNDS 2
+	DO_4ROUNDS(2, a, b, c, d, e, f, g, h)
 	vmovdqa	[rsp + frame_XFER + 2*32], ymm0 /* burn stack */
-	DO_4ROUNDS 3
+	DO_4ROUNDS(3, e, f, g, h, a, b, c, d)
 	vmovdqa	[rsp + frame_XFER + 3*32], ymm0 /* burn stack */
 
-	addm	[8*0 + CTX],a
+	addm([8*0 + CTX],a)
 	xor     eax, eax /* burn stack */
-	addm	[8*1 + CTX],b
-	addm	[8*2 + CTX],c
-	addm	[8*3 + CTX],d
-	addm	[8*4 + CTX],e
-	addm	[8*5 + CTX],f
-	addm	[8*6 + CTX],g
-	addm	[8*7 + CTX],h
+	addm([8*1 + CTX],b)
+	addm([8*2 + CTX],c)
+	addm([8*3 + CTX],d)
+	addm([8*4 + CTX],e)
+	addm([8*5 + CTX],f)
+	addm([8*6 + CTX],g)
+	addm([8*7 + CTX],h)
 
 	/* Restore GPRs */
 	mov	rbp, [rsp + frame_GPRSAVE + 8 * 0]
 	mov	rbx, [rsp + frame_GPRSAVE + 8 * 1]
 	mov	r12, [rsp + frame_GPRSAVE + 8 * 2]
 	mov	r13, [rsp + frame_GPRSAVE + 8 * 3]
 	mov	r14, [rsp + frame_GPRSAVE + 8 * 4]
 	mov	r15, [rsp + frame_GPRSAVE + 8 * 5]
 	CFI_RESTORE(rbp)
 	CFI_RESTORE(rbx)
 	CFI_RESTORE(r12)
 	CFI_RESTORE(r13)
 	CFI_RESTORE(r14)
 	CFI_RESTORE(r15)
 
 	/* Restore Stack Pointer */
 	mov	rsp, [rsp + frame_RSPSAVE]
 	CFI_DEF_CFA_REGISTER(rsp)
 
 .Lnowork:
 	ret
 	CFI_ENDPROC()
 
 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
 /*;; Binary Data */
 
 .align 64
 /* K[t] used in SHA512 hashing */
 .LK512:
 	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
 	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
 	.quad	0x3956c25bf348b538,0x59f111f1b605d019
 	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
 	.quad	0xd807aa98a3030242,0x12835b0145706fbe
 	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
 	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
 	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
 	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
 	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
 	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
 	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
 	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
 	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
 	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
 	.quad	0x06ca6351e003826f,0x142929670a0e6e70
 	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
 	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
 	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
 	.quad	0x81c2c92e47edaee6,0x92722c851482353b
 	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
 	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
 	.quad	0xd192e819d6ef5218,0xd69906245565a910
 	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
 	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
 	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
 	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
 	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
 	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
 	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
 	.quad	0x90befffa23631e28,0xa4506cebde82bde9
 	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
 	.quad	0xca273eceea26619c,0xd186b8c721c0c207
 	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
 	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
 	.quad	0x113f9804bef90dae,0x1b710b35131c471b
 	.quad	0x28db77f523047d84,0x32caab7b40c72493
 	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
 	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
 	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
 
 .align 32
 
 /* Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. */
 .LPSHUFFLE_BYTE_FLIP_MASK: .octa 0x08090a0b0c0d0e0f0001020304050607
 			   .octa 0x18191a1b1c1d1e1f1011121314151617
 
 .LMASK_YMM_LO:		   .octa 0x00000000000000000000000000000000
 			   .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
 
 #endif
 #endif
diff --git a/cipher/sha512-ssse3-amd64.S b/cipher/sha512-ssse3-amd64.S
index 39bfe362..6a1328a6 100644
--- a/cipher/sha512-ssse3-amd64.S
+++ b/cipher/sha512-ssse3-amd64.S
@@ -1,436 +1,467 @@
 /*
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; Copyright (c) 2012, Intel Corporation
 ;
 ; All rights reserved.
 ;
 ; Redistribution and use in source and binary forms, with or without
 ; modification, are permitted provided that the following conditions are
 ; met:
 ;
 ; * Redistributions of source code must retain the above copyright
 ;   notice, this list of conditions and the following disclaimer.
 ;
 ; * Redistributions in binary form must reproduce the above copyright
 ;   notice, this list of conditions and the following disclaimer in the
 ;   documentation and/or other materials provided with the
 ;   distribution.
 ;
 ; * Neither the name of the Intel Corporation nor the names of its
 ;   contributors may be used to endorse or promote products derived from
 ;   this software without specific prior written permission.
 ;
 ;
 ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
 ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 ; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
 ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 */
 /*
  * Conversion to GAS assembly and integration to libgcrypt
  *  by Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * Note: original implementation was named as SHA512-SSE4. However, only SSSE3
  *       is required.
  */
 
 #ifdef __x86_64
 #include <config.h>
 #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
      defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
     defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
     defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA512)
 
 #include "asm-common-amd64.h"
 
 .intel_syntax noprefix
 
 .text
 
 /* Virtual Registers */
-msg = rdi /* ARG1 */
-digest = rsi /* ARG2 */
-msglen = rdx /* ARG3 */
-T1 = rcx
-T2 = r8
-a_64 = r9
-b_64 = r10
-c_64 = r11
-d_64 = r12
-e_64 = r13
-f_64 = r14
-g_64 = r15
-h_64 = rbx
-tmp0 = rax
+#define msg rdi /* ARG1 */
+#define digest rsi /* ARG2 */
+#define msglen rdx /* ARG3 */
+#define T1 rcx
+#define T2 r8
+#define a_64 r9
+#define b_64 r10
+#define c_64 r11
+#define d_64 r12
+#define e_64 r13
+#define f_64 r14
+#define g_64 r15
+#define h_64 rbx
+#define tmp0 rax
 
 /*
 ; Local variables (stack frame)
 ; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP
 */
-frame_W      = 0 /* Message Schedule */
-frame_W_size = (80 * 8)
-frame_WK      = ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */
-frame_WK_size = (2 * 8)
-frame_GPRSAVE      = ((frame_WK) + (frame_WK_size))
-frame_GPRSAVE_size = (5 * 8)
-frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size))
+#define frame_W 0 /* Message Schedule */
+#define frame_W_size (80 * 8)
+#define frame_WK ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */
+#define frame_WK_size (2 * 8)
+#define frame_GPRSAVE ((frame_WK) + (frame_WK_size))
+#define frame_GPRSAVE_size (5 * 8)
+#define frame_size ((frame_GPRSAVE) + (frame_GPRSAVE_size))
 
 
 /* Useful QWORD "arrays" for simpler memory references */
 #define MSG(i)    msg    + 8*(i)               /* Input message (arg1) */
 #define DIGEST(i) digest + 8*(i)               /* Output Digest (arg2) */
 #define K_t(i)    .LK512   + 8*(i) ADD_RIP     /* SHA Constants (static mem) */
 #define W_t(i)    rsp + frame_W  + 8*(i)       /* Message Schedule (stack frame) */
 #define WK_2(i)   rsp + frame_WK + 8*((i) % 2) /* W[t]+K[t] (stack frame) */
 /* MSG, DIGEST, K_t, W_t are arrays */
 /* WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even */
 
-.macro RotateState
-	/* Rotate symbles a..h right */
-	__TMP = h_64
-	h_64 =  g_64
-	g_64 =  f_64
-	f_64 =  e_64
-	e_64 =  d_64
-	d_64 =  c_64
-	c_64 =  b_64
-	b_64 =  a_64
-	a_64 =  __TMP
-.endm
-
-.macro SHA512_Round t
-	/* Compute Round %%t */
-	mov	T1,   f_64        /* T1 = f */
-	mov	tmp0, e_64        /* tmp = e */
-	xor	T1,   g_64        /* T1 = f ^ g */
-	ror	tmp0, 23 /* 41     ; tmp = e ror 23 */
-	and	T1,   e_64        /* T1 = (f ^ g) & e */
-	xor	tmp0, e_64        /* tmp = (e ror 23) ^ e */
-	xor	T1,   g_64        /* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */
-	add	T1,   [WK_2(\t)] /* W[t] + K[t] from message scheduler */
-	ror	tmp0, 4 /* 18      ; tmp = ((e ror 23) ^ e) ror 4 */
-	xor	tmp0, e_64        /* tmp = (((e ror 23) ^ e) ror 4) ^ e */
-	mov	T2,   a_64        /* T2 = a */
-	add	T1,   h_64        /* T1 = CH(e,f,g) + W[t] + K[t] + h */
-	ror	tmp0, 14 /* 14     ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */
-	add	T1,   tmp0        /* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */
-	mov	tmp0, a_64        /* tmp = a */
-	xor	T2,   c_64        /* T2 = a ^ c */
-	and	tmp0, c_64        /* tmp = a & c */
-	and	T2,   b_64        /* T2 = (a ^ c) & b */
-	xor	T2,   tmp0        /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */
-	mov	tmp0, a_64        /* tmp = a */
-	ror	tmp0, 5 /* 39      ; tmp = a ror 5 */
-	xor	tmp0, a_64        /* tmp = (a ror 5) ^ a */
-	add	d_64, T1          /* e(next_state) = d + T1  */
-	ror	tmp0, 6 /* 34      ; tmp = ((a ror 5) ^ a) ror 6 */
-	xor	tmp0, a_64        /* tmp = (((a ror 5) ^ a) ror 6) ^ a */
-	lea	h_64, [T1 + T2]   /* a(next_state) = T1 + Maj(a,b,c) */
-	ror	tmp0, 28 /* 28     ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */
-	add	h_64, tmp0        /* a(next_state) = T1 + Maj(a,b,c) S0(a) */
-	RotateState
-.endm
-
-.macro SHA512_2Sched_2Round_sse t
-/*	; Compute rounds %%t-2 and %%t-1
-	; Compute message schedule QWORDS %%t and %%t+1
-
-	;   Two rounds are computed based on the values for K[t-2]+W[t-2] and
-	; K[t-1]+W[t-1] which were previously stored at WK_2 by the message
-	; scheduler.
-	;   The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
-	; They are then added to their respective SHA512 constants at
-	; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
-	;   For brievity, the comments following vectored instructions only refer to
-	; the first of a pair of QWORDS.
-	; Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]}
-	;   The computation of the message schedule and the rounds are tightly
-	; stitched to take advantage of instruction-level parallelism.
-	; For clarity, integer instructions (for the rounds calculation) are indented
-	; by one tab. Vectored instructions (for the message scheduler) are indented
-	; by two tabs. */
-
-	mov	T1, f_64
-		movdqa	xmm2, [W_t(\t-2)]  /* XMM2 = W[t-2] */
-	xor	T1,   g_64
-	and	T1,   e_64
-		movdqa	xmm0, xmm2          /* XMM0 = W[t-2] */
-	xor	T1,   g_64
-	add	T1,   [WK_2(\t)]
-		movdqu	xmm5, [W_t(\t-15)] /* XMM5 = W[t-15] */
-	mov	tmp0, e_64
-	ror	tmp0, 23 /* 41 */
-		movdqa	xmm3, xmm5          /* XMM3 = W[t-15] */
-	xor	tmp0, e_64
-	ror	tmp0, 4 /* 18 */
-		psrlq	xmm0, 61 - 19       /* XMM0 = W[t-2] >> 42 */
-	xor	tmp0, e_64
-	ror	tmp0, 14 /* 14 */
-		psrlq	xmm3, (8 - 7)       /* XMM3 = W[t-15] >> 1 */
-	add	T1,   tmp0
-	add	T1,   h_64
-		pxor	xmm0, xmm2          /* XMM0 = (W[t-2] >> 42) ^ W[t-2] */
-	mov	T2,   a_64
-	xor	T2,   c_64
-		pxor	xmm3, xmm5          /* XMM3 = (W[t-15] >> 1) ^ W[t-15] */
-	and	T2,   b_64
-	mov	tmp0, a_64
-		psrlq	xmm0, 19 - 6        /* XMM0 = ((W[t-2]>>42)^W[t-2])>>13 */
-	and	tmp0, c_64
-	xor	T2,   tmp0
-		psrlq	xmm3, (7 - 1)       /* XMM3 = ((W[t-15]>>1)^W[t-15])>>6 */
-	mov	tmp0, a_64
-	ror	tmp0, 5 /* 39 */
-		pxor	xmm0, xmm2          /* XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2] */
-	xor	tmp0, a_64
-	ror	tmp0, 6 /* 34 */
-		pxor	xmm3, xmm5          /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15] */
-	xor	tmp0, a_64
-	ror	tmp0, 28 /* 28 */
-		psrlq	xmm0, 6             /* XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6 */
-	add	T2,   tmp0
-	add	d_64, T1
-		psrlq	xmm3, 1             /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1 */
-	lea	h_64, [T1 + T2]
-	RotateState
-		movdqa	xmm1, xmm2          /* XMM1 = W[t-2] */
-	mov	T1, f_64
-	xor	T1,   g_64
-		movdqa	xmm4, xmm5          /* XMM4 = W[t-15] */
-	and	T1,   e_64
-	xor	T1,   g_64
-		psllq	xmm1, (64 - 19) - (64 - 61) /* XMM1 = W[t-2] << 42 */
-	add	T1,   [WK_2(\t+1)]
-	mov	tmp0, e_64
-		psllq	xmm4, (64 - 1) - (64 - 8) /* XMM4 = W[t-15] << 7 */
-	ror	tmp0, 23 /* 41 */
-	xor	tmp0, e_64
-		pxor	xmm1, xmm2          /* XMM1 = (W[t-2] << 42)^W[t-2] */
-	ror	tmp0, 4 /* 18 */
-	xor	tmp0, e_64
-		pxor	xmm4, xmm5          /* XMM4 = (W[t-15]<<7)^W[t-15] */
-	ror	tmp0, 14 /* 14 */
-	add	T1,   tmp0
-		psllq	xmm1, (64 - 61)     /* XMM1 = ((W[t-2] << 42)^W[t-2])<<3 */
-	add	T1,   h_64
-	mov	T2,   a_64
-		psllq	xmm4, (64 - 8)      /* XMM4 = ((W[t-15]<<7)^W[t-15])<<56 */
-	xor	T2,   c_64
-	and	T2,   b_64
-		pxor	xmm0, xmm1          /* XMM0 = s1(W[t-2]) */
-	mov	tmp0, a_64
-	and	tmp0, c_64
-		movdqu	xmm1, [W_t(\t- 7)] /* XMM1 = W[t-7] */
-	xor	T2,   tmp0
-		pxor	xmm3, xmm4          /* XMM3 = s0(W[t-15]) */
-	mov	tmp0, a_64
-		paddq	xmm0, xmm3          /* XMM0 = s1(W[t-2]) + s0(W[t-15]) */
-	ror	tmp0, 5 /* 39 */
-		paddq	xmm0, [W_t(\t-16)] /* XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16] */
-	xor	tmp0, a_64
-		paddq	xmm0, xmm1          /* XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */
-	ror	tmp0, 6 /* 34 */
-		movdqa	[W_t(\t)], xmm0     /* Store scheduled qwords */
-	xor	tmp0, a_64
-		paddq	xmm0, [K_t(t)]      /* Compute W[t]+K[t] */
-	ror	tmp0, 28 /* 28 */
-		movdqa	[WK_2(t)], xmm0     /* Store W[t]+K[t] for next rounds */
-	add	T2,   tmp0
-	add	d_64, T1
-	lea	h_64, [T1 + T2]
-	RotateState
-.endm
+#define SHA512_Round(t, a, b, c, d, e, f, g, h) \
+	/* Compute Round %%t */; \
+	mov	T1,   f        /* T1 = f */; \
+	mov	tmp0, e        /* tmp = e */; \
+	xor	T1,   g        /* T1 = f ^ g */; \
+	ror	tmp0, 23 /* 41     ; tmp = e ror 23 */; \
+	and	T1,   e        /* T1 = (f ^ g) & e */; \
+	xor	tmp0, e        /* tmp = (e ror 23) ^ e */; \
+	xor	T1,   g        /* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */; \
+	add	T1,   [WK_2(t)] /* W[t] + K[t] from message scheduler */; \
+	ror	tmp0, 4 /* 18      ; tmp = ((e ror 23) ^ e) ror 4 */; \
+	xor	tmp0, e        /* tmp = (((e ror 23) ^ e) ror 4) ^ e */; \
+	mov	T2,   a        /* T2 = a */; \
+	add	T1,   h        /* T1 = CH(e,f,g) + W[t] + K[t] + h */; \
+	ror	tmp0, 14 /* 14     ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */; \
+	add	T1,   tmp0     /* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */; \
+	mov	tmp0, a        /* tmp = a */; \
+	xor	T2,   c        /* T2 = a ^ c */; \
+	and	tmp0, c        /* tmp = a & c */; \
+	and	T2,   b        /* T2 = (a ^ c) & b */; \
+	xor	T2,   tmp0     /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */; \
+	mov	tmp0, a        /* tmp = a */; \
+	ror	tmp0, 5 /* 39      ; tmp = a ror 5 */; \
+	xor	tmp0, a        /* tmp = (a ror 5) ^ a */; \
+	add	d, T1          /* e(next_state) = d + T1  */; \
+	ror	tmp0, 6 /* 34      ; tmp = ((a ror 5) ^ a) ror 6 */; \
+	xor	tmp0, a        /* tmp = (((a ror 5) ^ a) ror 6) ^ a */; \
+	lea	h, [T1 + T2]   /* a(next_state) = T1 + Maj(a,b,c) */; \
+	ror	tmp0, 28 /* 28     ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */; \
+	add	h, tmp0        /* a(next_state) = T1 + Maj(a,b,c) S0(a) */
+
+#define SHA512_2Sched_2Round_sse_PART1(t, a, b, c, d, e, f, g, h) \
+	/* \
+	; Compute rounds %%t-2 and %%t-1 \
+	; Compute message schedule QWORDS %%t and %%t+1 \
+	; \
+	;   Two rounds are computed based on the values for K[t-2]+W[t-2] and \
+	; K[t-1]+W[t-1] which were previously stored at WK_2 by the message \
+	; scheduler. \
+	;   The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. \
+	; They are then added to their respective SHA512 constants at \
+	; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] \
+	;   For brevity, the comments following vectored instructions only refer to \
+	; the first of a pair of QWORDS. \
+	; Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]} \
+	;   The computation of the message schedule and the rounds are tightly \
+	; stitched to take advantage of instruction-level parallelism. \
+	; For clarity, integer instructions (for the rounds calculation) are indented \
+	; by one tab. Vectored instructions (for the message scheduler) are indented \
+	; by two tabs. \
+	*/ \
+	\
+	mov	T1, f; \
+		movdqa	xmm2, [W_t(t-2)]  /* XMM2 = W[t-2] */; \
+	xor	T1,   g; \
+	and	T1,   e; \
+		movdqa	xmm0, xmm2          /* XMM0 = W[t-2] */; \
+	xor	T1,   g; \
+	add	T1,   [WK_2(t)]; \
+		movdqu	xmm5, [W_t(t-15)] /* XMM5 = W[t-15] */; \
+	mov	tmp0, e; \
+	ror	tmp0, 23 /* 41 */; \
+		movdqa	xmm3, xmm5          /* XMM3 = W[t-15] */; \
+	xor	tmp0, e; \
+	ror	tmp0, 4 /* 18 */; \
+		psrlq	xmm0, 61 - 19       /* XMM0 = W[t-2] >> 42 */; \
+	xor	tmp0, e; \
+	ror	tmp0, 14 /* 14 */; \
+		psrlq	xmm3, (8 - 7)       /* XMM3 = W[t-15] >> 1 */; \
+	add	T1,   tmp0; \
+	add	T1,   h; \
+		pxor	xmm0, xmm2          /* XMM0 = (W[t-2] >> 42) ^ W[t-2] */; \
+	mov	T2,   a; \
+	xor	T2,   c; \
+		pxor	xmm3, xmm5          /* XMM3 = (W[t-15] >> 1) ^ W[t-15] */; \
+	and	T2,   b; \
+	mov	tmp0, a; \
+		psrlq	xmm0, 19 - 6        /* XMM0 = ((W[t-2]>>42)^W[t-2])>>13 */; \
+	and	tmp0, c; \
+	xor	T2,   tmp0; \
+		psrlq	xmm3, (7 - 1)       /* XMM3 = ((W[t-15]>>1)^W[t-15])>>6 */; \
+	mov	tmp0, a; \
+	ror	tmp0, 5 /* 39 */; \
+		pxor	xmm0, xmm2          /* XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2] */; \
+	xor	tmp0, a; \
+	ror	tmp0, 6 /* 34 */; \
+		pxor	xmm3, xmm5          /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15] */; \
+	xor	tmp0, a; \
+	ror	tmp0, 28 /* 28 */; \
+		psrlq	xmm0, 6             /* XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6 */; \
+	add	T2,   tmp0; \
+	add	d, T1; \
+		psrlq	xmm3, 1             /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1 */; \
+	lea	h, [T1 + T2]
+
+#define SHA512_2Sched_2Round_sse_PART2(t, a, b, c, d, e, f, g, h) \
+		movdqa	xmm1, xmm2          /* XMM1 = W[t-2] */; \
+	mov	T1,   f; \
+	xor	T1,   g; \
+		movdqa	xmm4, xmm5          /* XMM4 = W[t-15] */; \
+	and	T1,   e; \
+	xor	T1,   g; \
+		psllq	xmm1, (64 - 19) - (64 - 61) /* XMM1 = W[t-2] << 42 */; \
+	add	T1,   [WK_2(t+1)]; \
+	mov	tmp0, e; \
+		psllq	xmm4, (64 - 1) - (64 - 8) /* XMM4 = W[t-15] << 7 */; \
+	ror	tmp0, 23 /* 41 */; \
+	xor	tmp0, e; \
+		pxor	xmm1, xmm2          /* XMM1 = (W[t-2] << 42)^W[t-2] */; \
+	ror	tmp0, 4 /* 18 */; \
+	xor	tmp0, e; \
+		pxor	xmm4, xmm5          /* XMM4 = (W[t-15]<<7)^W[t-15] */; \
+	ror	tmp0, 14 /* 14 */; \
+	add	T1,   tmp0; \
+		psllq	xmm1, (64 - 61)     /* XMM1 = ((W[t-2] << 42)^W[t-2])<<3 */; \
+	add	T1,   h; \
+	mov	T2,   a; \
+		psllq	xmm4, (64 - 8)      /* XMM4 = ((W[t-15]<<7)^W[t-15])<<56 */; \
+	xor	T2,   c; \
+	and	T2,   b; \
+		pxor	xmm0, xmm1          /* XMM0 = s1(W[t-2]) */; \
+	mov	tmp0, a; \
+	and	tmp0, c; \
+		movdqu	xmm1, [W_t(t- 7)] /* XMM1 = W[t-7] */; \
+	xor	T2,   tmp0; \
+		pxor	xmm3, xmm4          /* XMM3 = s0(W[t-15]) */; \
+	mov	tmp0, a; \
+		paddq	xmm0, xmm3          /* XMM0 = s1(W[t-2]) + s0(W[t-15]) */; \
+	ror	tmp0, 5 /* 39 */; \
+		paddq	xmm0, [W_t(t-16)] /* XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16] */; \
+	xor	tmp0, a; \
+		paddq	xmm0, xmm1          /* XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */; \
+	ror	tmp0, 6 /* 34 */; \
+		movdqa	[W_t(t)], xmm0     /* Store scheduled qwords */; \
+	xor	tmp0, a; \
+		paddq	xmm0, [K_t(t)]      /* Compute W[t]+K[t] */; \
+	ror	tmp0, 28 /* 28 */; \
+		movdqa	[WK_2(t)], xmm0     /* Store W[t]+K[t] for next rounds */; \
+	add	T2,   tmp0; \
+	add	d, T1; \
+	lea	h, [T1 + T2]
+
+#define SHA512_2Sched_2Round_sse(t, a, b, c, d, e, f, g, h) \
+	SHA512_2Sched_2Round_sse_PART1(t, a, b, c, d, e, f, g, h); \
+	SHA512_2Sched_2Round_sse_PART2(t, h, a, b, c, d, e, f, g)
 
 /*
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; void sha512_sse4(const void* M, void* D, uint64_t L);
 ; Purpose: Updates the SHA512 digest stored at D with the message stored in M.
 ; The size of the message pointed to by M must be an integer multiple of SHA512
 ;   message blocks.
 ; L is the message length in SHA512 blocks.
 */
 .globl _gcry_sha512_transform_amd64_ssse3
 ELF(.type _gcry_sha512_transform_amd64_ssse3,@function;)
 .align 16
 _gcry_sha512_transform_amd64_ssse3:
 	CFI_STARTPROC()
 	xor eax, eax
 
 	cmp msglen, 0
 	je .Lnowork
 
 	/* Allocate Stack Space */
 	sub	rsp, frame_size
 	CFI_ADJUST_CFA_OFFSET(frame_size);
 
 	/* Save GPRs */
 	mov	[rsp + frame_GPRSAVE + 8 * 0], rbx
 	mov	[rsp + frame_GPRSAVE + 8 * 1], r12
 	mov	[rsp + frame_GPRSAVE + 8 * 2], r13
 	mov	[rsp + frame_GPRSAVE + 8 * 3], r14
 	mov	[rsp + frame_GPRSAVE + 8 * 4], r15
 	CFI_REL_OFFSET(rbx, frame_GPRSAVE + 8 * 0);
 	CFI_REL_OFFSET(r12, frame_GPRSAVE + 8 * 1);
 	CFI_REL_OFFSET(r13, frame_GPRSAVE + 8 * 2);
 	CFI_REL_OFFSET(r14, frame_GPRSAVE + 8 * 3);
 	CFI_REL_OFFSET(r15, frame_GPRSAVE + 8 * 4);
 
 .Lupdateblock:
 
 	/* Load state variables */
 	mov	a_64, [DIGEST(0)]
 	mov	b_64, [DIGEST(1)]
 	mov	c_64, [DIGEST(2)]
 	mov	d_64, [DIGEST(3)]
 	mov	e_64, [DIGEST(4)]
 	mov	f_64, [DIGEST(5)]
 	mov	g_64, [DIGEST(6)]
 	mov	h_64, [DIGEST(7)]
 
-	t = 0
-	.rept 80/2 + 1
-	/* (80 rounds) / (2 rounds/iteration) + (1 iteration) */
-	/* +1 iteration because the scheduler leads hashing by 1 iteration */
-		.if t < 2
-			/* BSWAP 2 QWORDS */
-			movdqa	xmm1, [.LXMM_QWORD_BSWAP ADD_RIP]
-			movdqu	xmm0, [MSG(t)]
-			pshufb	xmm0, xmm1      /* BSWAP */
-			movdqa	[W_t(t)], xmm0  /* Store Scheduled Pair */
-			paddq	xmm0, [K_t(t)]  /* Compute W[t]+K[t] */
-			movdqa	[WK_2(t)], xmm0 /* Store into WK for rounds */
-		.elseif t < 16
-			/* BSWAP 2 QWORDS; Compute 2 Rounds */
-			movdqu	xmm0, [MSG(t)]
-			pshufb	xmm0, xmm1      /* BSWAP */
-			SHA512_Round (t - 2)    /* Round t-2 */
-			movdqa	[W_t(t)], xmm0  /* Store Scheduled Pair */
-			paddq	xmm0, [K_t(t)]  /* Compute W[t]+K[t] */
-			SHA512_Round (t - 1)    /* Round t-1 */
-			movdqa	[WK_2(t)], xmm0 /* Store W[t]+K[t] into WK */
-		.elseif t < 79
-			/* Schedule 2 QWORDS; Compute 2 Rounds */
-			SHA512_2Sched_2Round_sse t
-		.else
-			/* Compute 2 Rounds */
-			SHA512_Round (t - 2)
-			SHA512_Round (t - 1)
-		.endif
-		t = (t)+2
-	.endr
+	/* BSWAP 2 QWORDS */
+	movdqa	xmm1, [.LXMM_QWORD_BSWAP ADD_RIP]
+	movdqu	xmm0, [MSG(0)]
+	pshufb	xmm0, xmm1      /* BSWAP */
+	movdqa	[W_t(0)], xmm0  /* Store Scheduled Pair */
+	paddq	xmm0, [K_t(0)]  /* Compute W[t]+K[t] */
+	movdqa	[WK_2(0)], xmm0 /* Store into WK for rounds */
+
+	#define T_2_14(t, a, b, c, d, e, f, g, h) \
+		/* BSWAP 2 QWORDS; Compute 2 Rounds */; \
+		movdqu	xmm0, [MSG(t)]; \
+		pshufb	xmm0, xmm1      /* BSWAP */; \
+		SHA512_Round(((t) - 2), a##_64, b##_64, c##_64, d##_64, \
+				        e##_64, f##_64, g##_64, h##_64); \
+		movdqa	[W_t(t)], xmm0  /* Store Scheduled Pair */; \
+		paddq	xmm0, [K_t(t)]  /* Compute W[t]+K[t] */; \
+		SHA512_Round(((t) - 1), h##_64, a##_64, b##_64, c##_64, \
+				        d##_64, e##_64, f##_64, g##_64); \
+		movdqa	[WK_2(t)], xmm0 /* Store W[t]+K[t] into WK */
+
+	#define T_16_78(t, a, b, c, d, e, f, g, h) \
+		SHA512_2Sched_2Round_sse((t), a##_64, b##_64, c##_64, d##_64, \
+					      e##_64, f##_64, g##_64, h##_64)
+
+	#define T_80(t, a, b, c, d, e, f, g, h) \
+		/* Compute 2 Rounds */; \
+		SHA512_Round((t - 2), a##_64, b##_64, c##_64, d##_64, \
+				      e##_64, f##_64, g##_64, h##_64); \
+		SHA512_Round((t - 1), h##_64, a##_64, b##_64, c##_64, \
+				      d##_64, e##_64, f##_64, g##_64)
+
+	T_2_14(2, a, b, c, d, e, f, g, h)
+	T_2_14(4, g, h, a, b, c, d, e, f)
+	T_2_14(6, e, f, g, h, a, b, c, d)
+	T_2_14(8, c, d, e, f, g, h, a, b)
+	T_2_14(10, a, b, c, d, e, f, g, h)
+	T_2_14(12, g, h, a, b, c, d, e, f)
+	T_2_14(14, e, f, g, h, a, b, c, d)
+	T_16_78(16, c, d, e, f, g, h, a, b)
+	T_16_78(18, a, b, c, d, e, f, g, h)
+	T_16_78(20, g, h, a, b, c, d, e, f)
+	T_16_78(22, e, f, g, h, a, b, c, d)
+	T_16_78(24, c, d, e, f, g, h, a, b)
+	T_16_78(26, a, b, c, d, e, f, g, h)
+	T_16_78(28, g, h, a, b, c, d, e, f)
+	T_16_78(30, e, f, g, h, a, b, c, d)
+	T_16_78(32, c, d, e, f, g, h, a, b)
+	T_16_78(34, a, b, c, d, e, f, g, h)
+	T_16_78(36, g, h, a, b, c, d, e, f)
+	T_16_78(38, e, f, g, h, a, b, c, d)
+	T_16_78(40, c, d, e, f, g, h, a, b)
+	T_16_78(42, a, b, c, d, e, f, g, h)
+	T_16_78(44, g, h, a, b, c, d, e, f)
+	T_16_78(46, e, f, g, h, a, b, c, d)
+	T_16_78(48, c, d, e, f, g, h, a, b)
+	T_16_78(50, a, b, c, d, e, f, g, h)
+	T_16_78(52, g, h, a, b, c, d, e, f)
+	T_16_78(54, e, f, g, h, a, b, c, d)
+	T_16_78(56, c, d, e, f, g, h, a, b)
+	T_16_78(58, a, b, c, d, e, f, g, h)
+	T_16_78(60, g, h, a, b, c, d, e, f)
+	T_16_78(62, e, f, g, h, a, b, c, d)
+	T_16_78(64, c, d, e, f, g, h, a, b)
+	T_16_78(66, a, b, c, d, e, f, g, h)
+	T_16_78(68, g, h, a, b, c, d, e, f)
+	T_16_78(70, e, f, g, h, a, b, c, d)
+	T_16_78(72, c, d, e, f, g, h, a, b)
+	T_16_78(74, a, b, c, d, e, f, g, h)
+	T_16_78(76, g, h, a, b, c, d, e, f)
+	T_16_78(78, e, f, g, h, a, b, c, d)
+	T_80(80, c, d, e, f, g, h, a, b)
 
 	/* Update digest */
 	add	[DIGEST(0)], a_64
 	add	[DIGEST(1)], b_64
 	add	[DIGEST(2)], c_64
 	add	[DIGEST(3)], d_64
 	add	[DIGEST(4)], e_64
 	add	[DIGEST(5)], f_64
 	add	[DIGEST(6)], g_64
 	add	[DIGEST(7)], h_64
 
 	/* Advance to next message block */
 	add	msg, 16*8
 	dec	msglen
 	jnz	.Lupdateblock
 
 	/* Restore GPRs */
 	mov	rbx, [rsp + frame_GPRSAVE + 8 * 0]
 	mov	r12, [rsp + frame_GPRSAVE + 8 * 1]
 	mov	r13, [rsp + frame_GPRSAVE + 8 * 2]
 	mov	r14, [rsp + frame_GPRSAVE + 8 * 3]
 	mov	r15, [rsp + frame_GPRSAVE + 8 * 4]
 	CFI_RESTORE(rbx)
 	CFI_RESTORE(r12)
 	CFI_RESTORE(r13)
 	CFI_RESTORE(r14)
 	CFI_RESTORE(r15)
 
 	pxor	xmm0, xmm0
 	pxor	xmm1, xmm1
 	pxor	xmm2, xmm2
 	pxor	xmm3, xmm3
 	pxor	xmm4, xmm4
 	pxor	xmm5, xmm5
 
 	/* Burn stack */
-	t = 0
-	.rept frame_W_size / 16
-		movdqu [rsp + frame_W + (t) * 16], xmm0
-		t = ((t)+1)
-	.endr
+	mov eax, 0
+.Lerase_stack:
+	movdqu [rsp + rax], xmm0
+	add eax, 16
+	cmp eax, frame_W_size
+	jne .Lerase_stack
 	movdqu [rsp + frame_WK], xmm0
 	xor     eax, eax
 
 	/* Restore Stack Pointer */
 	add	rsp, frame_size
 	CFI_ADJUST_CFA_OFFSET(-frame_size);
 
 .Lnowork:
 	ret
 	CFI_ENDPROC()
 
 /*
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;; Binary Data
 */
 
 .align 16
 
 /* Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. */
 .LXMM_QWORD_BSWAP:
 	.octa 0x08090a0b0c0d0e0f0001020304050607
 
 /* K[t] used in SHA512 hashing */
 .LK512:
 	.quad 0x428a2f98d728ae22,0x7137449123ef65cd
 	.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
 	.quad 0x3956c25bf348b538,0x59f111f1b605d019
 	.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
 	.quad 0xd807aa98a3030242,0x12835b0145706fbe
 	.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
 	.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
 	.quad 0x9bdc06a725c71235,0xc19bf174cf692694
 	.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
 	.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
 	.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
 	.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
 	.quad 0x983e5152ee66dfab,0xa831c66d2db43210
 	.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
 	.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
 	.quad 0x06ca6351e003826f,0x142929670a0e6e70
 	.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
 	.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
 	.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
 	.quad 0x81c2c92e47edaee6,0x92722c851482353b
 	.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
 	.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
 	.quad 0xd192e819d6ef5218,0xd69906245565a910
 	.quad 0xf40e35855771202a,0x106aa07032bbd1b8
 	.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
 	.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
 	.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
 	.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
 	.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
 	.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
 	.quad 0x90befffa23631e28,0xa4506cebde82bde9
 	.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
 	.quad 0xca273eceea26619c,0xd186b8c721c0c207
 	.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
 	.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
 	.quad 0x113f9804bef90dae,0x1b710b35131c471b
 	.quad 0x28db77f523047d84,0x32caab7b40c72493
 	.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
 	.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
 	.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
 
 #endif
 #endif
diff --git a/configure.ac b/configure.ac
index f7339a3e..e4a10b78 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,3266 +1,3256 @@
 # Configure.ac script for Libgcrypt
 # Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2006,
 #               2007, 2008, 2009, 2011 Free Software Foundation, Inc.
 # Copyright (C) 2012-2021  g10 Code GmbH
 #
 # This file is part of Libgcrypt.
 #
 # Libgcrypt is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as
 # published by the Free Software Foundation; either version 2.1 of
 # the License, or (at your option) any later version.
 #
 # Libgcrypt is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU Lesser General Public License for more details.
 #
 # You should have received a copy of the GNU Lesser General Public
 # License along with this program; if not, see <http://www.gnu.org/licenses/>.
 
 # (Process this file with autoconf to produce a configure script.)
 AC_REVISION($Revision$)
 AC_PREREQ([2.60])
 min_automake_version="1.14"
 
 # To build a release you need to create a tag with the version number
 # (git tag -s libgcrypt-n.m.k) and run "./autogen.sh --force".  Please
 # bump the version number immediately after the release and do another
 # commit and push so that the git magic is able to work.  See below
 # for the LT versions.
 m4_define([mym4_package],[libgcrypt])
 m4_define([mym4_major], [1])
 m4_define([mym4_minor], [9])
 m4_define([mym4_micro], [1])
 
 # Below is m4 magic to extract and compute the git revision number,
 # the decimalized short revision number, a beta version string and a
 # flag indicating a development version (mym4_isbeta).  Note that the
 # m4 processing is done by autoconf and not during the configure run.
 m4_define([mym4_verslist], m4_split(m4_esyscmd([./autogen.sh --find-version] \
                            mym4_package mym4_major mym4_minor mym4_micro),[:]))
 m4_define([mym4_isbeta],       m4_argn(2, mym4_verslist))
 m4_define([mym4_version],      m4_argn(4, mym4_verslist))
 m4_define([mym4_revision],     m4_argn(7, mym4_verslist))
 m4_define([mym4_revision_dec], m4_argn(8, mym4_verslist))
 m4_esyscmd([echo ]mym4_version[>VERSION])
 AC_INIT([mym4_package],[mym4_version],[https://bugs.gnupg.org])
 
 # LT Version numbers, remember to change them just *before* a release.
 #   (Code changed:			REVISION++)
 #   (Interfaces added/removed/changed:	CURRENT++, REVISION=0)
 #   (Interfaces added:			AGE++)
 #   (Interfaces removed:		AGE=0)
 #
 #   (Interfaces removed:    CURRENT++, AGE=0, REVISION=0)
 #   (Interfaces added:      CURRENT++, AGE++, REVISION=0)
 #   (No interfaces changed:                   REVISION++)
 LIBGCRYPT_LT_CURRENT=23
 LIBGCRYPT_LT_AGE=3
 LIBGCRYPT_LT_REVISION=0
 ################################################
 
 AC_SUBST(LIBGCRYPT_LT_CURRENT)
 AC_SUBST(LIBGCRYPT_LT_AGE)
 AC_SUBST(LIBGCRYPT_LT_REVISION)
 
 # If the API is changed in an incompatible way: increment the next counter.
 #
 # 1.6: ABI and API change but the change is to most users irrelevant
 #      and thus the API version number has not been incremented.
 LIBGCRYPT_CONFIG_API_VERSION=1
 
 # If you change the required gpg-error version, please remove
 # unnecessary error code defines in src/gcrypt-int.h.
 NEED_GPG_ERROR_VERSION=1.27
 
 AC_CONFIG_AUX_DIR([build-aux])
 AC_CONFIG_SRCDIR([src/libgcrypt.vers])
 AM_INIT_AUTOMAKE([serial-tests dist-bzip2])
 AC_CONFIG_HEADER(config.h)
 AC_CONFIG_MACRO_DIR([m4])
 AC_CONFIG_LIBOBJ_DIR([compat])
 AC_CANONICAL_HOST
 AM_MAINTAINER_MODE
 AM_SILENT_RULES
 
 AC_ARG_VAR(SYSROOT,[locate config scripts also below that directory])
 
 AH_TOP([
 #ifndef _GCRYPT_CONFIG_H_INCLUDED
 #define _GCRYPT_CONFIG_H_INCLUDED
 
 /* Enable gpg-error's strerror macro for W32CE.  */
 #define GPG_ERR_ENABLE_ERRNO_MACROS 1
 ])
 
 AH_BOTTOM([
 #define _GCRYPT_IN_LIBGCRYPT 1
 
 /* Add .note.gnu.property section for Intel CET in assembler sources
    when CET is enabled.  */
 #if defined(__ASSEMBLER__) && defined(__CET__)
 # include <cet.h>
 #endif
 
 /* If the configure check for endianness has been disabled, get it from
    OS macros.  This is intended for making fat binary builds on OS X.  */
 #ifdef DISABLED_ENDIAN_CHECK
 # if defined(__BIG_ENDIAN__)
 #  define WORDS_BIGENDIAN 1
 # elif defined(__LITTLE_ENDIAN__)
 #  undef WORDS_BIGENDIAN
 # else
 #  error "No endianness found"
 # endif
 #endif /*DISABLED_ENDIAN_CHECK*/
 
 /* We basically use the original Camellia source.  Make sure the symbols
    properly prefixed.  */
 #define CAMELLIA_EXT_SYM_PREFIX _gcry_
 
 #endif /*_GCRYPT_CONFIG_H_INCLUDED*/
 ])
 
 AH_VERBATIM([_REENTRANT],
 [/* To allow the use of Libgcrypt in multithreaded programs we have to use
     special features from the library. */
 #ifndef _REENTRANT
 # define _REENTRANT 1
 #endif
 ])
 
 
 ######################
 ##  Basic checks.  ### (we need some results later on (e.g. $GCC)
 ######################
 
 AC_PROG_MAKE_SET
 missing_dir=`cd $ac_aux_dir && pwd`
 AM_MISSING_PROG(ACLOCAL, aclocal, $missing_dir)
 AM_MISSING_PROG(AUTOCONF, autoconf, $missing_dir)
 AM_MISSING_PROG(AUTOMAKE, automake, $missing_dir)
 AM_MISSING_PROG(AUTOHEADER, autoheader, $missing_dir)
 # AM_MISSING_PROG(MAKEINFO, makeinfo, $missing_dir)
 AC_PROG_CC
 AC_PROG_CPP
 AM_PROG_CC_C_O
 AM_PROG_AS
 AC_SEARCH_LIBS([strerror],[cposix])
 AC_PROG_INSTALL
 AC_PROG_AWK
 
 AC_USE_SYSTEM_EXTENSIONS
 
 # Taken from mpfr-4.0.1, then modified for LDADD_FOR_TESTS_KLUDGE
 dnl Under Linux, make sure that the old dtags are used if LD_LIBRARY_PATH
 dnl is defined. The issue is that with the new dtags, LD_LIBRARY_PATH has
 dnl the precedence over the run path, so that if a compatible MPFR library
 dnl is installed in some directory from $LD_LIBRARY_PATH, then the tested
 dnl MPFR library will be this library instead of the MPFR library from the
 dnl build tree. Other OS with the same issue might be added later.
 dnl
 dnl References:
 dnl   https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=859732
 dnl   http://lists.gnu.org/archive/html/libtool/2017-05/msg00000.html
 dnl
 dnl We need to check whether --disable-new-dtags is supported as alternate
 dnl linkers may be used (e.g., with tcc: CC=tcc LD=tcc).
 dnl
 case $host in
   *-*-linux*)
     if test -n "$LD_LIBRARY_PATH"; then
       saved_LDFLAGS="$LDFLAGS"
       LDADD_FOR_TESTS_KLUDGE="-Wl,--disable-new-dtags"
       LDFLAGS="$LDFLAGS $LDADD_FOR_TESTS_KLUDGE"
       AC_MSG_CHECKING(whether --disable-new-dtags is supported by the linker)
       AC_LINK_IFELSE([AC_LANG_SOURCE([[
 int main (void) { return 0; }
       ]])],
       [AC_MSG_RESULT(yes (use it since LD_LIBRARY_PATH is set))],
       [AC_MSG_RESULT(no)
        LDADD_FOR_TESTS_KLUDGE=""
       ])
       LDFLAGS="$saved_LDFLAGS"
     fi
     ;;
 esac
 AC_SUBST([LDADD_FOR_TESTS_KLUDGE])
 
 VERSION_NUMBER=m4_esyscmd(printf "0x%02x%02x%02x" mym4_major \
                           mym4_minor mym4_micro)
 AC_SUBST(VERSION_NUMBER)
 
 # We need to compile and run a program on the build machine.
 AX_CC_FOR_BUILD
 
 
 LT_PREREQ([2.2.6])
 LT_INIT([win32-dll disable-static])
 LT_LANG([Windows Resource])
 
 
 ##########################
 ## General definitions. ##
 ##########################
 
 # Used by libgcrypt-config
 LIBGCRYPT_CONFIG_LIBS="-lgcrypt"
 LIBGCRYPT_CONFIG_CFLAGS=""
 LIBGCRYPT_CONFIG_HOST="$host"
 
 # Definitions for symmetric ciphers.
 available_ciphers="arcfour blowfish cast5 des aes twofish serpent rfc2268 seed"
 available_ciphers="$available_ciphers camellia idea salsa20 gost28147 chacha20"
 available_ciphers="$available_ciphers sm4"
 enabled_ciphers=""
 
 # Definitions for public-key ciphers.
 available_pubkey_ciphers="dsa elgamal rsa ecc"
 enabled_pubkey_ciphers=""
 
 # Definitions for message digests.
 available_digests="crc gostr3411-94 md2 md4 md5 rmd160 sha1 sha256 sha512"
 available_digests="$available_digests sha3 tiger whirlpool stribog blake2"
 available_digests="$available_digests sm3"
 enabled_digests=""
 
 # Definitions for kdfs (optional ones)
 available_kdfs="s2k pkdf2 scrypt"
 enabled_kdfs=""
 
 # Definitions for random modules.
 available_random_modules="linux egd unix"
 auto_random_modules="$available_random_modules"
 
 # Supported thread backends.
 LIBGCRYPT_THREAD_MODULES=""
 
 # Other definitions.
 have_w32_system=no
 have_w32ce_system=no
 have_pthread=no
 
 
 # Setup some stuff depending on host.
 case "${host}" in
     *-*-mingw32*)
       ac_cv_have_dev_random=no
       have_w32_system=yes
       case "${host}" in
         *-mingw32ce*)
             have_w32ce_system=yes
             available_random_modules="w32ce"
             ;;
         *)
             available_random_modules="w32"
             ;;
       esac
       AC_DEFINE(USE_ONLY_8DOT3,1,
                 [set this to limit filenames to the 8.3 format])
       AC_DEFINE(HAVE_DRIVE_LETTERS,1,
                 [defined if we must run on a stupid file system])
       AC_DEFINE(HAVE_DOSISH_SYSTEM,1,
                 [defined if we run on some of the PCDOS like systems
                  (DOS, Windoze. OS/2) with special properties like
                   no file modes])
       ;;
 
     i?86-emx-os2 | i?86-*-os2*emx)
         # OS/2 with the EMX environment
         ac_cv_have_dev_random=no
         AC_DEFINE(HAVE_DRIVE_LETTERS)
         AC_DEFINE(HAVE_DOSISH_SYSTEM)
         ;;
 
     i?86-*-msdosdjgpp*)
         # DOS with the DJGPP environment
         ac_cv_have_dev_random=no
         AC_DEFINE(HAVE_DRIVE_LETTERS)
         AC_DEFINE(HAVE_DOSISH_SYSTEM)
         ;;
 
     *-*-hpux*)
         if test -z "$GCC" ; then
             CFLAGS="$CFLAGS -Ae -D_HPUX_SOURCE"
         fi
         ;;
     *-dec-osf4*)
         if test -z "$GCC" ; then
             # Suppress all warnings
             # to get rid of the unsigned/signed char mismatch warnings.
             CFLAGS="$CFLAGS -w"
         fi
         ;;
     m68k-atari-mint)
         ;;
     *-apple-darwin*)
         AC_DEFINE(_DARWIN_C_SOURCE, 900000L,
                   Expose all libc features (__DARWIN_C_FULL).)
         AC_DEFINE(USE_POSIX_SPAWN_FOR_TESTS, 1,
                   [defined if we use posix_spawn in test program])
         ;;
     *)
       ;;
 esac
 
 if test "$have_w32_system" = yes; then
    AC_DEFINE(HAVE_W32_SYSTEM,1, [Defined if we run on a W32 API based system])
    if test "$have_w32ce_system" = yes; then
      AC_DEFINE(HAVE_W32CE_SYSTEM,1,[Defined if we run on WindowsCE])
    fi
 fi
 AM_CONDITIONAL(HAVE_W32_SYSTEM, test "$have_w32_system" = yes)
 AM_CONDITIONAL(HAVE_W32CE_SYSTEM, test "$have_w32ce_system" = yes)
 
 
 
 # A printable OS Name is sometimes useful.
 case "${host}" in
     *-*-mingw32ce*)
         PRINTABLE_OS_NAME="W32CE"
         ;;
 
     *-*-mingw32*)
         PRINTABLE_OS_NAME="W32"
         ;;
 
     i?86-emx-os2 | i?86-*-os2*emx )
         PRINTABLE_OS_NAME="OS/2"
         ;;
 
     i?86-*-msdosdjgpp*)
         PRINTABLE_OS_NAME="MSDOS/DJGPP"
         ;;
 
     *-linux*)
         PRINTABLE_OS_NAME="GNU/Linux"
         ;;
 
     *)
         PRINTABLE_OS_NAME=`uname -s || echo "Unknown"`
         ;;
 esac
 
 NAME_OF_DEV_RANDOM="/dev/random"
 NAME_OF_DEV_URANDOM="/dev/urandom"
 
 AC_ARG_ENABLE(endian-check,
               AS_HELP_STRING([--disable-endian-check],
               [disable the endian check and trust the OS provided macros]),
 	      endiancheck=$enableval,endiancheck=yes)
 if test x"$endiancheck" = xyes ; then
   AC_C_BIGENDIAN
 else
   AC_DEFINE(DISABLED_ENDIAN_CHECK,1,[configure did not test for endianness])
 fi
 
 AC_CHECK_SIZEOF(unsigned short, 2)
 AC_CHECK_SIZEOF(unsigned int, 4)
 AC_CHECK_SIZEOF(unsigned long, 4)
 AC_CHECK_SIZEOF(unsigned long long, 0)
 AC_CHECK_SIZEOF(void *, 0)
 
 AC_TYPE_UINTPTR_T
 
 if test "$ac_cv_sizeof_unsigned_short" = "0" \
    || test "$ac_cv_sizeof_unsigned_int" = "0" \
    || test "$ac_cv_sizeof_unsigned_long" = "0"; then
     AC_MSG_WARN([Hmmm, something is wrong with the sizes - using defaults]);
 fi
 
 # Ensure that we have UINT64_C before we bother to check for uint64_t
 AC_CACHE_CHECK([for UINT64_C],[gnupg_cv_uint64_c_works],
    AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include <inttypes.h>]],
        [[uint64_t foo=UINT64_C(42);]])],
      gnupg_cv_uint64_c_works=yes,gnupg_cv_uint64_c_works=no))
 if test "$gnupg_cv_uint64_c_works" = "yes" ; then
    AC_CHECK_SIZEOF(uint64_t)
 fi
 
 # Do we have any 64-bit data types?
 if test "$ac_cv_sizeof_unsigned_int" != "8" \
    && test "$ac_cv_sizeof_unsigned_long" != "8" \
    && test "$ac_cv_sizeof_unsigned_long_long" != "8" \
    && test "$ac_cv_sizeof_uint64_t" != "8"; then
     AC_MSG_ERROR([[
 ***
 *** No 64-bit integer type available.
 *** It is not possible to build Libgcrypt on this platform.
 ***]])
 fi
 
 
 # If not specified otherwise, all available algorithms will be
 # included.
 default_ciphers="$available_ciphers"
 default_pubkey_ciphers="$available_pubkey_ciphers"
 default_digests="$available_digests"
 default_kdfs="$available_kdfs"
 # Blacklist MD2 by default
 default_digests=`echo $default_digests | sed -e 's/md2//g'`
 
 # Substitutions to set generated files in a Emacs buffer to read-only.
 AC_SUBST(emacs_local_vars_begin, ['Local Variables:'])
 AC_SUBST(emacs_local_vars_read_only, ['buffer-read-only: t'])
 AC_SUBST(emacs_local_vars_end, ['End:'])
 
 ############################
 ## Command line switches. ##
 ############################
 
 # Implementation of the --enable-ciphers switch.
 AC_ARG_ENABLE(ciphers,
 	      AS_HELP_STRING([--enable-ciphers=ciphers],
                              [select the symmetric ciphers to include]),
 	      [enabled_ciphers=`echo $enableval | tr ',:' '  ' | tr '[A-Z]' '[a-z]'`],
 	      [enabled_ciphers=""])
 if test "x$enabled_ciphers" = "x" \
    -o "$enabled_ciphers" = "yes"  \
    -o "$enabled_ciphers" = "no"; then
    enabled_ciphers=$default_ciphers
 fi
 AC_MSG_CHECKING([which symmetric ciphers to include])
 for cipher in $enabled_ciphers; do
     LIST_MEMBER($cipher, $available_ciphers)
     if test "$found" = "0"; then
        AC_MSG_ERROR([unsupported cipher "$cipher" specified])
     fi
 done
 AC_MSG_RESULT([$enabled_ciphers])
 
 # Implementation of the --enable-pubkey-ciphers switch.
 AC_ARG_ENABLE(pubkey-ciphers,
 	      AS_HELP_STRING([--enable-pubkey-ciphers=ciphers],
                              [select the public-key ciphers to include]),
 	      [enabled_pubkey_ciphers=`echo $enableval | tr ',:' '  ' | tr '[A-Z]' '[a-z]'`],
 	      [enabled_pubkey_ciphers=""])
 if test "x$enabled_pubkey_ciphers" = "x" \
    -o "$enabled_pubkey_ciphers" = "yes"  \
    -o "$enabled_pubkey_ciphers" = "no"; then
    enabled_pubkey_ciphers=$default_pubkey_ciphers
 fi
 AC_MSG_CHECKING([which public-key ciphers to include])
 for cipher in $enabled_pubkey_ciphers; do
     LIST_MEMBER($cipher, $available_pubkey_ciphers)
     if test "$found" = "0"; then
        AC_MSG_ERROR([unsupported public-key cipher specified])
     fi
 done
 AC_MSG_RESULT([$enabled_pubkey_ciphers])
 
 # Implementation of the --enable-digests switch.
 AC_ARG_ENABLE(digests,
 	      AS_HELP_STRING([--enable-digests=digests],
                              [select the message digests to include]),
 	      [enabled_digests=`echo $enableval | tr ',:' '  ' | tr '[A-Z]' '[a-z]'`],
 	      [enabled_digests=""])
 if test "x$enabled_digests" = "x" \
    -o "$enabled_digests" = "yes"  \
    -o "$enabled_digests" = "no"; then
    enabled_digests=$default_digests
 fi
 AC_MSG_CHECKING([which message digests to include])
 for digest in $enabled_digests; do
     LIST_MEMBER($digest, $available_digests)
     if test "$found" = "0"; then
        AC_MSG_ERROR([unsupported message digest specified])
     fi
 done
 AC_MSG_RESULT([$enabled_digests])
 
 # Implementation of the --enable-kdfs switch.
 AC_ARG_ENABLE(kdfs,
       AS_HELP_STRING([--enable-kfds=kdfs],
                      [select the KDFs to include]),
       [enabled_kdfs=`echo $enableval | tr ',:' '  ' | tr '[A-Z]' '[a-z]'`],
       [enabled_kdfs=""])
 if test "x$enabled_kdfs" = "x" \
    -o "$enabled_kdfs" = "yes"  \
    -o "$enabled_kdfs" = "no"; then
    enabled_kdfs=$default_kdfs
 fi
 AC_MSG_CHECKING([which key derivation functions to include])
 for kdf in $enabled_kdfs; do
     LIST_MEMBER($kdf, $available_kdfs)
     if test "$found" = "0"; then
        AC_MSG_ERROR([unsupported key derivation function specified])
     fi
 done
 AC_MSG_RESULT([$enabled_kdfs])
 
 # Implementation of the --enable-random switch.
 AC_ARG_ENABLE(random,
 	      AS_HELP_STRING([--enable-random=name],
                              [select which random number generator to use]),
 	      [random=`echo $enableval | tr '[A-Z]' '[a-z]'`],
 	      [])
 if test "x$random" = "x" -o "$random" = "yes" -o "$random" = "no"; then
     random=default
 fi
 AC_MSG_CHECKING([which random module to use])
 if test "$random" != "default" -a "$random" != "auto"; then
     LIST_MEMBER($random, $available_random_modules)
     if test "$found" = "0"; then
        AC_MSG_ERROR([unsupported random module specified])
     fi
 fi
 AC_MSG_RESULT($random)
 
 # Implementation of the --disable-dev-random switch.
 AC_MSG_CHECKING([whether use of /dev/random is requested])
 AC_ARG_ENABLE(dev-random,
 [  --disable-dev-random    disable the use of dev random],
     try_dev_random=$enableval, try_dev_random=yes)
 AC_MSG_RESULT($try_dev_random)
 
 # Implementation of the --with-egd-socket switch.
 AC_ARG_WITH(egd-socket,
     [  --with-egd-socket=NAME  Use NAME for the EGD socket)],
             egd_socket_name="$withval", egd_socket_name="" )
 AC_DEFINE_UNQUOTED(EGD_SOCKET_NAME, "$egd_socket_name",
                    [Define if you don't want the default EGD socket name.
                     For details see cipher/rndegd.c])
 
 # Implementation of the --enable-random-daemon
 AC_MSG_CHECKING([whether the experimental random daemon is requested])
 AC_ARG_ENABLE([random-daemon],
               AS_HELP_STRING([--enable-random-daemon],
                              [Build and support the experimental gcryptrnd]),
               [use_random_daemon=$enableval],
               [use_random_daemon=no])
 AC_MSG_RESULT($use_random_daemon)
 if test x$use_random_daemon = xyes ; then
     AC_DEFINE(USE_RANDOM_DAEMON,1,
               [Define to support the experimental random daemon])
 fi
 AM_CONDITIONAL(USE_RANDOM_DAEMON, test x$use_random_daemon = xyes)
 
 
 # Implementation of --disable-asm.
 AC_MSG_CHECKING([whether MPI and cipher assembler modules are requested])
 AC_ARG_ENABLE([asm],
               AS_HELP_STRING([--disable-asm],
                              [Disable MPI and cipher assembler modules]),
               [try_asm_modules=$enableval],
               [try_asm_modules=yes])
 AC_MSG_RESULT($try_asm_modules)
 
 # Implementation of the --enable-m-guard switch.
 AC_MSG_CHECKING([whether memory guard is requested])
 AC_ARG_ENABLE(m-guard,
               AS_HELP_STRING([--enable-m-guard],
                              [Enable memory guard facility]),
               [use_m_guard=$enableval], [use_m_guard=no])
 AC_MSG_RESULT($use_m_guard)
 if test "$use_m_guard" = yes ; then
     AC_DEFINE(M_GUARD,1,[Define to use the (obsolete) malloc guarding feature])
 fi
 
 # Implementation of the --enable-large-data-tests switch.
 AC_MSG_CHECKING([whether to run large data tests])
 AC_ARG_ENABLE(large-data-tests,
               AS_HELP_STRING([--enable-large-data-tests],
                  [Enable the real long running large data tests]),
 	      large_data_tests=$enableval,large_data_tests=no)
 AC_MSG_RESULT($large_data_tests)
 AC_SUBST(RUN_LARGE_DATA_TESTS, $large_data_tests)
 
 # Implementation of --enable-force-soft-hwfeatures
 AC_MSG_CHECKING([whether 'soft' HW feature bits are forced on])
 AC_ARG_ENABLE([force-soft-hwfeatures],
               AS_HELP_STRING([--enable-force-soft-hwfeatures],
                              [Enable forcing 'soft' HW feature bits on]),
               [force_soft_hwfeatures=$enableval],
               [force_soft_hwfeatures=no])
 AC_MSG_RESULT($force_soft_hwfeatures)
 
 
 # Implementation of the --with-capabilities switch.
 # Check whether we want to use Linux capabilities
 AC_MSG_CHECKING([whether use of capabilities is requested])
 AC_ARG_WITH(capabilities,
             AS_HELP_STRING([--with-capabilities],
                            [Use linux capabilities [default=no]]),
             [use_capabilities="$withval"],[use_capabilities=no])
 AC_MSG_RESULT($use_capabilities)
 
 # Implementation of the --enable-hmac-binary-check.
 AC_MSG_CHECKING([whether an HMAC binary check is requested])
 AC_ARG_ENABLE(hmac-binary-check,
               AS_HELP_STRING([--enable-hmac-binary-check],
                              [Enable library integrity check]),
               [use_hmac_binary_check=$enableval],
               [use_hmac_binary_check=no])
 AC_MSG_RESULT($use_hmac_binary_check)
 if test "$use_hmac_binary_check" = yes ; then
     AC_DEFINE(ENABLE_HMAC_BINARY_CHECK,1,
               [Define to support an HMAC based integrity check])
 fi
 
 
 # Implementation of the --disable-jent-support switch.
 AC_MSG_CHECKING([whether jitter entropy support is requested])
 AC_ARG_ENABLE(jent-support,
               AS_HELP_STRING([--disable-jent-support],
                         [Disable support for the Jitter entropy collector]),
 	      jentsupport=$enableval,jentsupport=yes)
 AC_MSG_RESULT($jentsupport)
 
 # Implementation of the --disable-padlock-support switch.
 AC_MSG_CHECKING([whether padlock support is requested])
 AC_ARG_ENABLE(padlock-support,
               AS_HELP_STRING([--disable-padlock-support],
                         [Disable support for the PadLock Engine of VIA processors]),
 	      padlocksupport=$enableval,padlocksupport=yes)
 AC_MSG_RESULT($padlocksupport)
 
 # Implementation of the --disable-aesni-support switch.
 AC_MSG_CHECKING([whether AESNI support is requested])
 AC_ARG_ENABLE(aesni-support,
               AS_HELP_STRING([--disable-aesni-support],
                  [Disable support for the Intel AES-NI instructions]),
 	      aesnisupport=$enableval,aesnisupport=yes)
 AC_MSG_RESULT($aesnisupport)
 
 # Implementation of the --disable-shaext-support switch.
 AC_MSG_CHECKING([whether SHAEXT support is requested])
 AC_ARG_ENABLE(shaext-support,
               AS_HELP_STRING([--disable-shaext-support],
                  [Disable support for the Intel SHAEXT instructions]),
               shaextsupport=$enableval,shaextsupport=yes)
 AC_MSG_RESULT($shaextsupport)
 
 # Implementation of the --disable-pclmul-support switch.
 AC_MSG_CHECKING([whether PCLMUL support is requested])
 AC_ARG_ENABLE(pclmul-support,
               AS_HELP_STRING([--disable-pclmul-support],
                  [Disable support for the Intel PCLMUL instructions]),
 	      pclmulsupport=$enableval,pclmulsupport=yes)
 AC_MSG_RESULT($pclmulsupport)
 
 # Implementation of the --disable-sse41-support switch.
 AC_MSG_CHECKING([whether SSE4.1 support is requested])
 AC_ARG_ENABLE(sse41-support,
               AS_HELP_STRING([--disable-sse41-support],
                  [Disable support for the Intel SSE4.1 instructions]),
 	      sse41support=$enableval,sse41support=yes)
 AC_MSG_RESULT($sse41support)
 
 # Implementation of the --disable-drng-support switch.
 AC_MSG_CHECKING([whether DRNG support is requested])
 AC_ARG_ENABLE(drng-support,
               AS_HELP_STRING([--disable-drng-support],
                  [Disable support for the Intel DRNG (RDRAND instruction)]),
 	      drngsupport=$enableval,drngsupport=yes)
 AC_MSG_RESULT($drngsupport)
 
 # Implementation of the --disable-avx-support switch.
 AC_MSG_CHECKING([whether AVX support is requested])
 AC_ARG_ENABLE(avx-support,
               AS_HELP_STRING([--disable-avx-support],
                  [Disable support for the Intel AVX instructions]),
 	      avxsupport=$enableval,avxsupport=yes)
 AC_MSG_RESULT($avxsupport)
 
 # Implementation of the --disable-avx2-support switch.
 AC_MSG_CHECKING([whether AVX2 support is requested])
 AC_ARG_ENABLE(avx2-support,
               AS_HELP_STRING([--disable-avx2-support],
                  [Disable support for the Intel AVX2 instructions]),
 	      avx2support=$enableval,avx2support=yes)
 AC_MSG_RESULT($avx2support)
 
 # Implementation of the --disable-neon-support switch.
 AC_MSG_CHECKING([whether NEON support is requested])
 AC_ARG_ENABLE(neon-support,
               AS_HELP_STRING([--disable-neon-support],
                  [Disable support for the ARM NEON instructions]),
 	      neonsupport=$enableval,neonsupport=yes)
 AC_MSG_RESULT($neonsupport)
 
 # Implementation of the --disable-arm-crypto-support switch.
 AC_MSG_CHECKING([whether ARMv8 Crypto Extension support is requested])
 AC_ARG_ENABLE(arm-crypto-support,
               AS_HELP_STRING([--disable-arm-crypto-support],
                  [Disable support for the ARMv8 Crypto Extension instructions]),
 	      armcryptosupport=$enableval,armcryptosupport=yes)
 AC_MSG_RESULT($armcryptosupport)
 
 # Implementation of the --disable-ppc-crypto-support switch.
 AC_MSG_CHECKING([whether PPC crypto support is requested])
 AC_ARG_ENABLE(ppc-crypto-support,
               AS_HELP_STRING([--disable-ppc-crypto-support],
                  [Disable support for the PPC crypto instructions introduced in POWER 8 (PowerISA 2.07)]),
               ppccryptosupport=$enableval,ppccryptosupport=yes)
 AC_MSG_RESULT($ppccryptosupport)
 
 # Implementation of the --disable-O-flag-munging switch.
 AC_MSG_CHECKING([whether -O flag munging is requested])
 AC_ARG_ENABLE([O-flag-munging],
               AS_HELP_STRING([--disable-O-flag-munging],
                  [Disable modification of the cc -O flag]),
               [enable_o_flag_munging=$enableval],
               [enable_o_flag_munging=yes])
 AC_MSG_RESULT($enable_o_flag_munging)
 AM_CONDITIONAL(ENABLE_O_FLAG_MUNGING, test "$enable_o_flag_munging" = "yes")
 
 # Implementation of the --disable-instrumentation-munging switch.
 AC_MSG_CHECKING([whether instrumentation (-fprofile, -fsanitize) munging is requested])
 AC_ARG_ENABLE([instrumentation-munging],
               AS_HELP_STRING([--disable-instrumentation-munging],
                  [Disable modification of the cc instrumentation options]),
               [enable_instrumentation_munging=$enableval],
               [enable_instrumentation_munging=yes])
 AC_MSG_RESULT($enable_instrumentation_munging)
 AM_CONDITIONAL(ENABLE_INSTRUMENTATION_MUNGING,
 	       test "$enable_instrumentation_munging" = "yes")
 
 # Implementation of the --disable-amd64-as-feature-detection switch.
 AC_MSG_CHECKING([whether to enable AMD64 as(1) feature detection])
 AC_ARG_ENABLE(amd64-as-feature-detection,
               AS_HELP_STRING([--disable-amd64-as-feature-detection],
                  [Disable the auto-detection of AMD64 as(1) features]),
 	      amd64_as_feature_detection=$enableval,
               amd64_as_feature_detection=yes)
 AC_MSG_RESULT($amd64_as_feature_detection)
 
 
 AC_DEFINE_UNQUOTED(PRINTABLE_OS_NAME, "$PRINTABLE_OS_NAME",
                    [A human readable text with the name of the OS])
 
 # For some systems we know that we have ld_version scripts.
 # Use it then as default.
 have_ld_version_script=no
 case "${host}" in
     *-*-linux*)
 	have_ld_version_script=yes
         ;;
     *-*-gnu*)
 	have_ld_version_script=yes
         ;;
 esac
 AC_ARG_ENABLE([ld-version-script],
               AS_HELP_STRING([--enable-ld-version-script],
                              [enable/disable use of linker version script.
                               (default is system dependent)]),
               [have_ld_version_script=$enableval],
               [ : ] )
 AM_CONDITIONAL(HAVE_LD_VERSION_SCRIPT, test "$have_ld_version_script" = "yes")
 
 AC_DEFINE_UNQUOTED(NAME_OF_DEV_RANDOM, "$NAME_OF_DEV_RANDOM",
                    [defined to the name of the strong random device])
 AC_DEFINE_UNQUOTED(NAME_OF_DEV_URANDOM, "$NAME_OF_DEV_URANDOM",
                    [defined to the name of the weaker random device])
 
 
 ###############################
 #### Checks for libraries. ####
 ###############################
 
 #
 # gpg-error is required.
 #
 AM_PATH_GPG_ERROR("$NEED_GPG_ERROR_VERSION")
 if test "x$GPG_ERROR_LIBS" = "x"; then
   AC_MSG_ERROR([libgpg-error is needed.
                 See ftp://ftp.gnupg.org/gcrypt/libgpg-error/ .])
 fi
 
 AC_DEFINE(GPG_ERR_SOURCE_DEFAULT, GPG_ERR_SOURCE_GCRYPT,
           [The default error source for libgcrypt.])
 
 #
 # Check whether the GNU Pth library is available.  We require this
 # to build the optional gcryptrnd program.
 #
 AC_ARG_WITH(pth-prefix,
             AS_HELP_STRING([--with-pth-prefix=PFX],
                            [prefix where GNU Pth is installed (optional)]),
      pth_config_prefix="$withval", pth_config_prefix="")
 if test x$pth_config_prefix != x ; then
    PTH_CONFIG="$pth_config_prefix/bin/pth-config"
 fi
 if test "$use_random_daemon" = "yes"; then
   AC_PATH_PROG(PTH_CONFIG, pth-config, no)
   if test "$PTH_CONFIG" = "no"; then
     AC_MSG_WARN([[
 ***
 *** To build Libgcrypt's random number daemon
 *** we need the support of the GNU Portable Threads Library.
 *** Download it from ftp://ftp.gnu.org/gnu/pth/
 *** On a Debian GNU/Linux system you might want to try
 ***   apt-get install libpth-dev
 ***]])
   else
     GNUPG_PTH_VERSION_CHECK([1.3.7])
     if test $have_pth = yes; then
        PTH_CFLAGS=`$PTH_CONFIG --cflags`
        PTH_LIBS=`$PTH_CONFIG --ldflags`
        PTH_LIBS="$PTH_LIBS `$PTH_CONFIG --libs --all`"
        AC_DEFINE(USE_GNU_PTH, 1,
                 [Defined if the GNU Portable Thread Library should be used])
        AC_DEFINE(HAVE_PTH, 1,
                 [Defined if the GNU Pth is available])
     fi
   fi
 fi
 AC_SUBST(PTH_CFLAGS)
 AC_SUBST(PTH_LIBS)
 
 #
 # Check whether pthreads is available
 #
 if test "$have_w32_system" != yes; then
   AC_CHECK_LIB(pthread,pthread_create,have_pthread=yes)
   if test "$have_pthread" = yes; then
     AC_DEFINE(HAVE_PTHREAD, 1 ,[Define if we have pthread.])
   fi
 fi
 
 
 # Solaris needs -lsocket and -lnsl.  Unisys systems include
 # gethostbyname in libsocket but need libnsl for socket.
 AC_SEARCH_LIBS(setsockopt, [socket], ,
 	[AC_SEARCH_LIBS(setsockopt, [socket], , , [-lnsl])])
 AC_SEARCH_LIBS(setsockopt, [nsl])
 
 ##################################
 #### Checks for header files. ####
 ##################################
 
 AC_HEADER_STDC
 AC_CHECK_HEADERS(unistd.h sys/select.h sys/msg.h sys/auxv.h)
 INSERT_SYS_SELECT_H=
 if test x"$ac_cv_header_sys_select_h" = xyes; then
   INSERT_SYS_SELECT_H=" include <sys/select.h>"
 fi
 AC_SUBST(INSERT_SYS_SELECT_H)
 
 
 ##########################################
 #### Checks for typedefs, structures, ####
 ####  and compiler characteristics.   ####
 ##########################################
 
 AC_C_CONST
 AC_C_INLINE
 AC_TYPE_SIZE_T
 AC_TYPE_PID_T
 
 AC_CHECK_TYPES([byte, ushort, u16, u32, u64])
 
 gl_TYPE_SOCKLEN_T
 case "${host}" in
   *-*-mingw32*)
     # socklen_t may or may not be defined depending on what headers
     # are included.  To be safe we use int as this is the actual type.
     FALLBACK_SOCKLEN_T="typedef int gcry_socklen_t;"
     ;;
   *)
     if test ".$gl_cv_socklen_t_equiv" = "."; then
       FALLBACK_SOCKLEN_T="typedef socklen_t gcry_socklen_t;"
     else
       FALLBACK_SOCKLEN_T="typedef ${gl_cv_socklen_t_equiv} gcry_socklen_t;"
     fi
 esac
 AC_SUBST(FALLBACK_SOCKLEN_T)
 
 
 #
 # Check for __builtin_bswap32 intrinsic.
 #
 AC_CACHE_CHECK(for __builtin_bswap32,
        [gcry_cv_have_builtin_bswap32],
        [gcry_cv_have_builtin_bswap32=no
         AC_LINK_IFELSE([AC_LANG_PROGRAM([],
           [int x = 0; int y = __builtin_bswap32(x); return y;])],
           [gcry_cv_have_builtin_bswap32=yes])])
 if test "$gcry_cv_have_builtin_bswap32" = "yes" ; then
    AC_DEFINE(HAVE_BUILTIN_BSWAP32,1,
              [Defined if compiler has '__builtin_bswap32' intrinsic])
 fi
 
 
 #
 # Check for __builtin_bswap64 intrinsic.
 #
 AC_CACHE_CHECK(for __builtin_bswap64,
        [gcry_cv_have_builtin_bswap64],
        [gcry_cv_have_builtin_bswap64=no
         AC_LINK_IFELSE([AC_LANG_PROGRAM([],
           [long long x = 0; long long y = __builtin_bswap64(x); return y;])],
           [gcry_cv_have_builtin_bswap64=yes])])
 if test "$gcry_cv_have_builtin_bswap64" = "yes" ; then
    AC_DEFINE(HAVE_BUILTIN_BSWAP64,1,
              [Defined if compiler has '__builtin_bswap64' intrinsic])
 fi
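 
 # (Sketch, for illustration only: code may guard its use of these builtins as
 #    #ifdef HAVE_BUILTIN_BSWAP32
 #      y = __builtin_bswap32 (x);   /* byte-swap via the compiler intrinsic */
 #    #endif
 #  falling back to an equivalent shift/mask sequence otherwise.)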
 
 
 #
 # Check for __builtin_ctz intrinsic.
 #
 AC_CACHE_CHECK(for __builtin_ctz,
        [gcry_cv_have_builtin_ctz],
        [gcry_cv_have_builtin_ctz=no
         AC_LINK_IFELSE([AC_LANG_PROGRAM([],
           [unsigned int x = 0; int y = __builtin_ctz(x); return y;])],
           [gcry_cv_have_builtin_ctz=yes])])
 if test "$gcry_cv_have_builtin_ctz" = "yes" ; then
    AC_DEFINE(HAVE_BUILTIN_CTZ, 1,
              [Defined if compiler has '__builtin_ctz' intrinsic])
 fi
 
 
 #
 # Check for __builtin_ctzl intrinsic.
 #
 AC_CACHE_CHECK(for __builtin_ctzl,
        [gcry_cv_have_builtin_ctzl],
        [gcry_cv_have_builtin_ctzl=no
         AC_LINK_IFELSE([AC_LANG_PROGRAM([],
           [unsigned long x = 0; long y = __builtin_ctzl(x); return y;])],
           [gcry_cv_have_builtin_ctzl=yes])])
 if test "$gcry_cv_have_builtin_ctzl" = "yes" ; then
    AC_DEFINE(HAVE_BUILTIN_CTZL, 1,
              [Defined if compiler has '__builtin_ctzl' intrinsic])
 fi
 
 
 #
 # Check for __builtin_clz intrinsic.
 #
 AC_CACHE_CHECK(for __builtin_clz,
        [gcry_cv_have_builtin_clz],
        [gcry_cv_have_builtin_clz=no
         AC_LINK_IFELSE([AC_LANG_PROGRAM([],
           [unsigned int x = 0; int y = __builtin_clz(x); return y;])],
           [gcry_cv_have_builtin_clz=yes])])
 if test "$gcry_cv_have_builtin_clz" = "yes" ; then
    AC_DEFINE(HAVE_BUILTIN_CLZ, 1,
              [Defined if compiler has '__builtin_clz' intrinsic])
 fi
 
 
 #
 # Check for __builtin_clzl intrinsic.
 #
 AC_CACHE_CHECK(for __builtin_clzl,
        [gcry_cv_have_builtin_clzl],
        [gcry_cv_have_builtin_clzl=no
         AC_LINK_IFELSE([AC_LANG_PROGRAM([],
           [unsigned long x = 0; long y = __builtin_clzl(x); return y;])],
           [gcry_cv_have_builtin_clzl=yes])])
 if test "$gcry_cv_have_builtin_clzl" = "yes" ; then
    AC_DEFINE(HAVE_BUILTIN_CLZL, 1,
              [Defined if compiler has '__builtin_clzl' intrinsic])
 fi
 
 
 #
 # Check for __sync_synchronize intrinsic.
 #
 AC_CACHE_CHECK(for __sync_synchronize,
        [gcry_cv_have_sync_synchronize],
        [gcry_cv_have_sync_synchronize=no
         AC_LINK_IFELSE([AC_LANG_PROGRAM([],
           [__sync_synchronize(); return 0;])],
           [gcry_cv_have_sync_synchronize=yes])])
 if test "$gcry_cv_have_sync_synchronize" = "yes" ; then
    AC_DEFINE(HAVE_SYNC_SYNCHRONIZE, 1,
              [Defined if compiler has '__sync_synchronize' intrinsic])
 fi
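 
 # (Note, for illustration: __sync_synchronize() issues a full memory barrier,
 #  so code guarded by HAVE_SYNC_SYNCHRONIZE can use e.g.
 #    if (fence_needed) __sync_synchronize ();
 #  where 'fence_needed' is only a placeholder condition.)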
 
 
 #
 # Check for VLA support (variable length arrays).
 #
 AC_CACHE_CHECK(whether variable length arrays are supported,
        [gcry_cv_have_vla],
        [gcry_cv_have_vla=no
         AC_COMPILE_IFELSE([AC_LANG_SOURCE(
           [[void f1(char *, int);
             char foo(int i) {
               char b[(i < 0 ? 0 : i) + 1];
               f1(b, sizeof b); return b[0];}]])],
           [gcry_cv_have_vla=yes])])
 if test "$gcry_cv_have_vla" = "yes" ; then
    AC_DEFINE(HAVE_VLA,1, [Defined if variable length arrays are supported])
 fi
 
 
 #
 # Check for ELF visibility support.
 #
 AC_CACHE_CHECK(whether the visibility attribute is supported,
        gcry_cv_visibility_attribute,
        [gcry_cv_visibility_attribute=no
         AC_LANG_CONFTEST([AC_LANG_SOURCE(
           [[int foo __attribute__ ((visibility ("hidden"))) = 1;
             int bar __attribute__ ((visibility ("protected"))) = 1;
           ]])])
 
         if ${CC-cc} -Werror -S conftest.c -o conftest.s \
                   1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD ; then
             if grep '\.hidden.*foo' conftest.s >/dev/null 2>&1 ; then
                 if grep '\.protected.*bar' conftest.s >/dev/null 2>&1; then
                     gcry_cv_visibility_attribute=yes
                 fi
             fi
         fi
        ])
 if test "$gcry_cv_visibility_attribute" = "yes"; then
     AC_CACHE_CHECK(for broken visibility attribute,
        gcry_cv_broken_visibility_attribute,
        [gcry_cv_broken_visibility_attribute=yes
         AC_LANG_CONFTEST([AC_LANG_SOURCE(
           [[int foo (int x);
             int bar (int x) __asm__ ("foo")
                             __attribute__ ((visibility ("hidden")));
             int bar (int x) { return x; }
           ]])])
 
         if ${CC-cc} -Werror -S conftest.c -o conftest.s \
                   1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD ; then
            if grep '\.hidden@<:@ 	_@:>@foo' conftest.s >/dev/null 2>&1;
             then
                gcry_cv_broken_visibility_attribute=no
            fi
         fi
        ])
 fi
 if test "$gcry_cv_visibility_attribute" = "yes"; then
     AC_CACHE_CHECK(for broken alias attribute,
        gcry_cv_broken_alias_attribute,
        [gcry_cv_broken_alias_attribute=yes
         AC_LANG_CONFTEST([AC_LANG_SOURCE(
           [[extern int foo (int x) __asm ("xyzzy");
             int bar (int x) { return x; }
             extern __typeof (bar) foo __attribute ((weak, alias ("bar")));
             extern int dfoo;
             extern __typeof (dfoo) dfoo __asm ("abccb");
             int dfoo = 1;
           ]])])
 
         if ${CC-cc} -Werror -S conftest.c -o conftest.s \
                   1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD ; then
            if grep 'xyzzy' conftest.s >/dev/null 2>&1 && \
               grep 'abccb' conftest.s >/dev/null 2>&1; then
               gcry_cv_broken_alias_attribute=no
            fi
         fi
         ])
 fi
 if test "$gcry_cv_visibility_attribute" = "yes"; then
     AC_CACHE_CHECK(if gcc supports -fvisibility=hidden,
        gcry_cv_gcc_has_f_visibility,
        [gcry_cv_gcc_has_f_visibility=no
         _gcc_cflags_save=$CFLAGS
         CFLAGS="-fvisibility=hidden"
         AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])],
                           gcry_cv_gcc_has_f_visibility=yes)
         CFLAGS=$_gcc_cflags_save;
        ])
 fi
 if test "$gcry_cv_visibility_attribute" = "yes" \
    && test "$gcry_cv_broken_visibility_attribute" != "yes" \
    && test "$gcry_cv_broken_alias_attribute" != "yes" \
    && test "$gcry_cv_gcc_has_f_visibility" = "yes"
  then
    AC_DEFINE(GCRY_USE_VISIBILITY, 1,
                [Define to use the GNU C visibility attribute.])
    CFLAGS="$CFLAGS -fvisibility=hidden"
 fi
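 
 # (Illustration only, assuming GCRY_USE_VISIBILITY ends up defined: internal
 #  symbols can then be hidden with the attribute exercised above, e.g.
 #    int _gcry_helper (void) __attribute__ ((visibility ("hidden")));
 #  where '_gcry_helper' is a hypothetical name, while exported entry points
 #  keep default visibility.)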
 
 
 # Following attribute tests depend on warnings to cause compile to fail,
 # so set -Werror temporarily.
 _gcc_cflags_save=$CFLAGS
 CFLAGS="$CFLAGS -Werror"
 
 
 #
 # Check whether the compiler supports the GCC style aligned attribute
 #
 AC_CACHE_CHECK([whether the GCC style aligned attribute is supported],
        [gcry_cv_gcc_attribute_aligned],
        [gcry_cv_gcc_attribute_aligned=no
         AC_COMPILE_IFELSE([AC_LANG_SOURCE(
           [[struct { int a; } foo __attribute__ ((aligned (16)));]])],
           [gcry_cv_gcc_attribute_aligned=yes])])
 if test "$gcry_cv_gcc_attribute_aligned" = "yes" ; then
    AC_DEFINE(HAVE_GCC_ATTRIBUTE_ALIGNED,1,
      [Defined if a GCC style "__attribute__ ((aligned (n))" is supported])
 fi
 
 
 #
 # Check whether the compiler supports the GCC style packed attribute
 #
 AC_CACHE_CHECK([whether the GCC style packed attribute is supported],
        [gcry_cv_gcc_attribute_packed],
        [gcry_cv_gcc_attribute_packed=no
         AC_COMPILE_IFELSE([AC_LANG_SOURCE(
           [[struct foolong_s { long b; } __attribute__ ((packed));
             struct foo_s { char a; struct foolong_s b; }
               __attribute__ ((packed));
             enum bar {
               FOO = 1 / (sizeof(struct foo_s) == (sizeof(char) + sizeof(long))),
             };]])],
           [gcry_cv_gcc_attribute_packed=yes])])
 if test "$gcry_cv_gcc_attribute_packed" = "yes" ; then
    AC_DEFINE(HAVE_GCC_ATTRIBUTE_PACKED,1,
      [Defined if a GCC style "__attribute__ ((packed))" is supported])
 fi
 
 
 #
 # Check whether the compiler supports the GCC style may_alias attribute
 #
 AC_CACHE_CHECK([whether the GCC style may_alias attribute is supported],
        [gcry_cv_gcc_attribute_may_alias],
        [gcry_cv_gcc_attribute_may_alias=no
         AC_COMPILE_IFELSE([AC_LANG_SOURCE(
           [[typedef struct foo_s { int a; }
             __attribute__ ((may_alias)) foo_t;]])],
           [gcry_cv_gcc_attribute_may_alias=yes])])
 if test "$gcry_cv_gcc_attribute_may_alias" = "yes" ; then
    AC_DEFINE(HAVE_GCC_ATTRIBUTE_MAY_ALIAS,1,
      [Defined if a GCC style "__attribute__ ((may_alias))" is supported])
 fi
 
 
 # Restore flags.
 CFLAGS=$_gcc_cflags_save;
 
 
 #
 # Check whether the compiler supports 'asm' or '__asm__' keyword for
 # assembler blocks.
 #
 AC_CACHE_CHECK([whether 'asm' assembler keyword is supported],
        [gcry_cv_have_asm],
        [gcry_cv_have_asm=no
         AC_COMPILE_IFELSE([AC_LANG_SOURCE(
           [[void a(void) { asm("":::"memory"); }]])],
           [gcry_cv_have_asm=yes])])
 AC_CACHE_CHECK([whether '__asm__' assembler keyword is supported],
        [gcry_cv_have___asm__],
        [gcry_cv_have___asm__=no
         AC_COMPILE_IFELSE([AC_LANG_SOURCE(
           [[void a(void) { __asm__("":::"memory"); }]])],
           [gcry_cv_have___asm__=yes])])
 if test "$gcry_cv_have_asm" = "no" ; then
    if test "$gcry_cv_have___asm__" = "yes" ; then
       AC_DEFINE(asm,__asm__,
         [Define to supported assembler block keyword, if plain 'asm' was not
          supported])
    fi
 fi
 
 
 #
 # Check whether the compiler supports inline assembly memory barrier.
 #
 if test "$gcry_cv_have_asm" = "no" ; then
    if test "$gcry_cv_have___asm__" = "yes" ; then
       AC_CACHE_CHECK([whether inline assembly memory barrier is supported],
           [gcry_cv_have_asm_volatile_memory],
           [gcry_cv_have_asm_volatile_memory=no
            AC_COMPILE_IFELSE([AC_LANG_SOURCE(
              [[void a(int x)
                {
                  __asm__ volatile("":::"memory");
                  __asm__ volatile("":"+r"(x)::"memory");
                }]])],
              [gcry_cv_have_asm_volatile_memory=yes])])
    fi
 else
    AC_CACHE_CHECK([whether inline assembly memory barrier is supported],
        [gcry_cv_have_asm_volatile_memory],
        [gcry_cv_have_asm_volatile_memory=no
         AC_COMPILE_IFELSE([AC_LANG_SOURCE(
           [[void a(int x)
             {
               asm volatile("":::"memory");
               asm volatile("":"+r"(x)::"memory"); }]])],
           [gcry_cv_have_asm_volatile_memory=yes])])
 fi
 if test "$gcry_cv_have_asm_volatile_memory" = "yes" ; then
    AC_DEFINE(HAVE_GCC_ASM_VOLATILE_MEMORY,1,
      [Define if inline asm memory barrier is supported])
 fi
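 
 # (Note, for illustration: the barrier idiom probed above,
 #    asm volatile ("" ::: "memory");
 #  only constrains the compiler's ordering of memory accesses; it emits no
 #  CPU instruction by itself.)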
 
 
 #
 # Check whether GCC assembler supports features needed for our ARM
 # implementations.  This needs to be done before setting up the
 # assembler stuff.
 #
 AC_CACHE_CHECK([whether GCC assembler is compatible with ARM assembly implementations],
        [gcry_cv_gcc_arm_platform_as_ok],
        [if test "$try_asm_modules" != "yes" ; then
           gcry_cv_gcc_arm_platform_as_ok="n/a"
         else
           gcry_cv_gcc_arm_platform_as_ok=no
           AC_LINK_IFELSE([AC_LANG_PROGRAM(
             [[__asm__(
                 /* Test if assembler supports UAL syntax.  */
                 ".syntax unified\n\t"
                 ".arm\n\t" /* our assembly code is in ARM mode  */
                 ".text\n\t"
                 /* Following causes error if assembler ignored '.syntax unified'.  */
                 "asmfunc:\n\t"
                 "add %r0, %r0, %r4, ror #12;\n\t"
 
                 /* Test if '.type' and '.size' are supported.  */
                 ".size asmfunc,.-asmfunc;\n\t"
                 ".type asmfunc,%function;\n\t"
               );]], [ asmfunc(); ] )],
             [gcry_cv_gcc_arm_platform_as_ok=yes])
         fi])
 if test "$gcry_cv_gcc_arm_platform_as_ok" = "yes" ; then
    AC_DEFINE(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS,1,
      [Defined if underlying assembler is compatible with ARM assembly implementations])
 fi
 
 
 #
 # Check whether GCC assembler supports features needed for our ARMv8/Aarch64
 # implementations.  This needs to be done before setting up the
 # assembler stuff.
 #
 AC_CACHE_CHECK([whether GCC assembler is compatible with ARMv8/Aarch64 assembly implementations],
        [gcry_cv_gcc_aarch64_platform_as_ok],
        [if test "$try_asm_modules" != "yes" ; then
           gcry_cv_gcc_aarch64_platform_as_ok="n/a"
         else
           gcry_cv_gcc_aarch64_platform_as_ok=no
           AC_LINK_IFELSE([AC_LANG_PROGRAM(
             [[__asm__(
                 ".text\n\t"
                 "asmfunc:\n\t"
                 "eor x0, x0, x30, ror #12;\n\t"
                 "add x0, x0, x30, asr #12;\n\t"
                 "eor v0.16b, v0.16b, v31.16b;\n\t"
               );]], [ asmfunc(); ] )],
             [gcry_cv_gcc_aarch64_platform_as_ok=yes])
         fi])
 if test "$gcry_cv_gcc_aarch64_platform_as_ok" = "yes" ; then
    AC_DEFINE(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS,1,
      [Defined if underlying assembler is compatible with ARMv8/Aarch64 assembly implementations])
 fi
 
 #
 # Check whether GCC assembler supports CFI directives.
 #
 AC_CACHE_CHECK([whether GCC assembler supports CFI directives],
        [gcry_cv_gcc_asm_cfi_directives],
        [gcry_cv_gcc_asm_cfi_directives=no
         AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[__asm__(
                 ".text\n\t"
                 "ac_test:\n\t"
                 ".cfi_startproc\n\t"
                 ".cfi_remember_state\n\t"
                 ".cfi_adjust_cfa_offset 8\n\t"
                 ".cfi_rel_offset 0, 8\n\t"
                 ".cfi_def_cfa_register 1\n\t"
                 ".cfi_register 2, 3\n\t"
                 ".cfi_restore 2\n\t"
                 ".cfi_escape 0x0f, 0x02, 0x11, 0x00\n\t"
                 ".cfi_restore_state\n\t"
                 ".long 0\n\t"
                 ".cfi_endproc\n\t"
             );]])],
           [gcry_cv_gcc_asm_cfi_directives=yes])])
 if test "$gcry_cv_gcc_asm_cfi_directives" = "yes" ; then
    AC_DEFINE(HAVE_GCC_ASM_CFI_DIRECTIVES,1,
             [Defined if underlying assembler supports CFI directives])
 fi
 
 
 #
 # Check whether GCC assembler supports ELF directives.
 #
 AC_CACHE_CHECK([whether GCC assembler supports ELF directives],
        [gcry_cv_gcc_asm_elf_directives],
        [gcry_cv_gcc_asm_elf_directives=no
         AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[__asm__(
                 /* Test if ELF directives '.type' and '.size' are supported. */
                 ".text\n\t"
                 "asmfunc:\n\t"
                 ".size asmfunc,.-asmfunc;\n\t"
                 ".type asmfunc,STT_FUNC;\n\t"
             );]])],
           [gcry_cv_gcc_asm_elf_directives=yes])])
 if test "$gcry_cv_gcc_asm_elf_directives" = "yes" ; then
    AC_DEFINE(HAVE_GCC_ASM_ELF_DIRECTIVES,1,
             [Defined if underlying assembler supports ELF directives])
 fi
 
 
 #
 # Check whether underscores in symbols are required.  This needs to be
 # done before setting up the assembler stuff.
 #
 GNUPG_SYS_SYMBOL_UNDERSCORE()
 
 
 #################################
 ####                         ####
 #### Setup assembler stuff.  ####
 #### Define mpi_cpu_arch.    ####
 ####                         ####
 #################################
 AC_ARG_ENABLE(mpi-path,
               AS_HELP_STRING([--enable-mpi-path=EXTRA_PATH],
               [prepend EXTRA_PATH to list of CPU specific optimizations]),
 	      mpi_extra_path="$enableval",mpi_extra_path="")
 AC_MSG_CHECKING(architecture and mpi assembler functions)
 if test -f $srcdir/mpi/config.links ; then
     . $srcdir/mpi/config.links
     AC_CONFIG_LINKS("$mpi_ln_list")
     ac_cv_mpi_sflags="$mpi_sflags"
     AC_MSG_RESULT($mpi_cpu_arch)
 else
     AC_MSG_RESULT(failed)
     AC_MSG_ERROR([mpi/config.links missing!])
 fi
 MPI_SFLAGS="$ac_cv_mpi_sflags"
 AC_SUBST(MPI_SFLAGS)
 
 AM_CONDITIONAL(MPI_MOD_ASM_MPIH_ADD1, test "$mpi_mod_asm_mpih_add1" = yes)
 AM_CONDITIONAL(MPI_MOD_ASM_MPIH_SUB1, test "$mpi_mod_asm_mpih_sub1" = yes)
 AM_CONDITIONAL(MPI_MOD_ASM_MPIH_MUL1, test "$mpi_mod_asm_mpih_mul1" = yes)
 AM_CONDITIONAL(MPI_MOD_ASM_MPIH_MUL2, test "$mpi_mod_asm_mpih_mul2" = yes)
 AM_CONDITIONAL(MPI_MOD_ASM_MPIH_MUL3, test "$mpi_mod_asm_mpih_mul3" = yes)
 AM_CONDITIONAL(MPI_MOD_ASM_MPIH_LSHIFT, test "$mpi_mod_asm_mpih_lshift" = yes)
 AM_CONDITIONAL(MPI_MOD_ASM_MPIH_RSHIFT, test "$mpi_mod_asm_mpih_rshift" = yes)
 AM_CONDITIONAL(MPI_MOD_ASM_UDIV, test "$mpi_mod_asm_udiv" = yes)
 AM_CONDITIONAL(MPI_MOD_ASM_UDIV_QRNND, test "$mpi_mod_asm_udiv_qrnnd" = yes)
 AM_CONDITIONAL(MPI_MOD_C_MPIH_ADD1, test "$mpi_mod_c_mpih_add1" = yes)
 AM_CONDITIONAL(MPI_MOD_C_MPIH_SUB1, test "$mpi_mod_c_mpih_sub1" = yes)
 AM_CONDITIONAL(MPI_MOD_C_MPIH_MUL1, test "$mpi_mod_c_mpih_mul1" = yes)
 AM_CONDITIONAL(MPI_MOD_C_MPIH_MUL2, test "$mpi_mod_c_mpih_mul2" = yes)
 AM_CONDITIONAL(MPI_MOD_C_MPIH_MUL3, test "$mpi_mod_c_mpih_mul3" = yes)
 AM_CONDITIONAL(MPI_MOD_C_MPIH_LSHIFT, test "$mpi_mod_c_mpih_lshift" = yes)
 AM_CONDITIONAL(MPI_MOD_C_MPIH_RSHIFT, test "$mpi_mod_c_mpih_rshift" = yes)
 AM_CONDITIONAL(MPI_MOD_C_UDIV, test "$mpi_mod_c_udiv" = yes)
 AM_CONDITIONAL(MPI_MOD_C_UDIV_QRNND, test "$mpi_mod_c_udiv_qrnnd" = yes)
 
 # Reset non-applicable feature flags.
 if test "$mpi_cpu_arch" != "x86" ; then
    aesnisupport="n/a"
    shaextsupport="n/a"
    pclmulsupport="n/a"
    sse41support="n/a"
    avxsupport="n/a"
    avx2support="n/a"
    padlocksupport="n/a"
    drngsupport="n/a"
 fi
 
 if test "$mpi_cpu_arch" != "arm" ; then
    if test "$mpi_cpu_arch" != "aarch64" ; then
      neonsupport="n/a"
      armcryptosupport="n/a"
    fi
 fi
 
 if test "$mpi_cpu_arch" != "ppc"; then
    ppccryptosupport="n/a"
 fi
 
 #############################################
 ####                                     ####
 #### Platform specific compiler checks.  ####
 ####                                     ####
 #############################################
 
 
 # Following tests depend on warnings to cause compile to fail, so set -Werror
 # temporarily.
 _gcc_cflags_save=$CFLAGS
 CFLAGS="$CFLAGS -Werror"
 
 
 #
 # Check whether compiler supports 'ms_abi' function attribute.
 #
 AC_CACHE_CHECK([whether compiler supports 'ms_abi' function attribute],
        [gcry_cv_gcc_attribute_ms_abi],
        [gcry_cv_gcc_attribute_ms_abi=no
         AC_COMPILE_IFELSE([AC_LANG_SOURCE(
           [[int __attribute__ ((ms_abi)) proto(int);]])],
           [gcry_cv_gcc_attribute_ms_abi=yes])])
 if test "$gcry_cv_gcc_attribute_ms_abi" = "yes" ; then
    AC_DEFINE(HAVE_GCC_ATTRIBUTE_MS_ABI,1,
      [Defined if compiler supports "__attribute__ ((ms_abi))" function attribute])
 fi
 
 
 #
 # Check whether compiler supports 'sysv_abi' function attribute.
 #
 AC_CACHE_CHECK([whether compiler supports 'sysv_abi' function attribute],
        [gcry_cv_gcc_attribute_sysv_abi],
        [gcry_cv_gcc_attribute_sysv_abi=no
         AC_COMPILE_IFELSE([AC_LANG_SOURCE(
           [[int __attribute__ ((sysv_abi)) proto(int);]])],
           [gcry_cv_gcc_attribute_sysv_abi=yes])])
 if test "$gcry_cv_gcc_attribute_sysv_abi" = "yes" ; then
    AC_DEFINE(HAVE_GCC_ATTRIBUTE_SYSV_ABI,1,
      [Defined if compiler supports "__attribute__ ((sysv_abi))" function attribute])
 fi
 
 
 #
 # Check whether default calling convention is 'ms_abi'.
 #
 if test "$gcry_cv_gcc_attribute_ms_abi" = "yes" ; then
    AC_CACHE_CHECK([whether default calling convention is 'ms_abi'],
           [gcry_cv_gcc_default_abi_is_ms_abi],
           [gcry_cv_gcc_default_abi_is_ms_abi=no
            AC_COMPILE_IFELSE([AC_LANG_SOURCE(
              [[void *test(void) {
                  void *(*def_func)(void) = test;
                  void *__attribute__((ms_abi))(*msabi_func)(void);
                  /* warning on SysV abi targets, passes on Windows based targets */
                  msabi_func = def_func;
                  return msabi_func;
              }]])],
              [gcry_cv_gcc_default_abi_is_ms_abi=yes])])
    if test "$gcry_cv_gcc_default_abi_is_ms_abi" = "yes" ; then
       AC_DEFINE(HAVE_GCC_DEFAULT_ABI_IS_MS_ABI,1,
         [Defined if default calling convention is 'ms_abi'])
    fi
 fi
 
 
 #
 # Check whether default calling convention is 'sysv_abi'.
 #
 if test "$gcry_cv_gcc_attribute_sysv_abi" = "yes" ; then
    AC_CACHE_CHECK([whether default calling convention is 'sysv_abi'],
           [gcry_cv_gcc_default_abi_is_sysv_abi],
           [gcry_cv_gcc_default_abi_is_sysv_abi=no
            AC_COMPILE_IFELSE([AC_LANG_SOURCE(
              [[void *test(void) {
                  void *(*def_func)(void) = test;
                  void *__attribute__((sysv_abi))(*sysvabi_func)(void);
                  /* warning on MS ABI targets, passes on SysV ABI targets */
                  sysvabi_func = def_func;
                  return sysvabi_func;
              }]])],
              [gcry_cv_gcc_default_abi_is_sysv_abi=yes])])
    if test "$gcry_cv_gcc_default_abi_is_sysv_abi" = "yes" ; then
       AC_DEFINE(HAVE_GCC_DEFAULT_ABI_IS_SYSV_ABI,1,
         [Defined if default calling convention is 'sysv_abi'])
    fi
 fi
 
 
 # Restore flags.
 CFLAGS=$_gcc_cflags_save;
 
 
 #
 # Check whether GCC inline assembler supports SSSE3 instructions
 # This is required for the AES-NI instructions.
 #
 AC_CACHE_CHECK([whether GCC inline assembler supports SSSE3 instructions],
        [gcry_cv_gcc_inline_asm_ssse3],
        [if test "$mpi_cpu_arch" != "x86" ||
            test "$try_asm_modules" != "yes" ; then
           gcry_cv_gcc_inline_asm_ssse3="n/a"
         else
           gcry_cv_gcc_inline_asm_ssse3=no
           AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[static unsigned char be_mask[16] __attribute__ ((aligned (16))) =
               { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
             void a(void) {
               __asm__("pshufb %[mask], %%xmm2\n\t"::[mask]"m"(*be_mask):);
             }]], [ a(); ] )],
           [gcry_cv_gcc_inline_asm_ssse3=yes])
         fi])
 if test "$gcry_cv_gcc_inline_asm_ssse3" = "yes" ; then
    AC_DEFINE(HAVE_GCC_INLINE_ASM_SSSE3,1,
      [Defined if inline assembler supports SSSE3 instructions])
 fi
 
 
 #
 # Check whether GCC inline assembler supports PCLMUL instructions.
 #
 AC_CACHE_CHECK([whether GCC inline assembler supports PCLMUL instructions],
        [gcry_cv_gcc_inline_asm_pclmul],
        [if test "$mpi_cpu_arch" != "x86" ||
            test "$try_asm_modules" != "yes" ; then
           gcry_cv_gcc_inline_asm_pclmul="n/a"
         else
           gcry_cv_gcc_inline_asm_pclmul=no
           AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[void a(void) {
               __asm__("pclmulqdq \$0, %%xmm1, %%xmm3\n\t":::"cc");
             }]], [ a(); ] )],
           [gcry_cv_gcc_inline_asm_pclmul=yes])
         fi])
 if test "$gcry_cv_gcc_inline_asm_pclmul" = "yes" ; then
    AC_DEFINE(HAVE_GCC_INLINE_ASM_PCLMUL,1,
      [Defined if inline assembler supports PCLMUL instructions])
 fi
 
 
 #
 # Check whether GCC inline assembler supports SHA Extensions instructions.
 #
 AC_CACHE_CHECK([whether GCC inline assembler supports SHA Extensions instructions],
        [gcry_cv_gcc_inline_asm_shaext],
        [if test "$mpi_cpu_arch" != "x86" ||
            test "$try_asm_modules" != "yes" ; then
           gcry_cv_gcc_inline_asm_shaext="n/a"
         else
           gcry_cv_gcc_inline_asm_shaext=no
           AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[void a(void) {
               __asm__("sha1rnds4 \$0, %%xmm1, %%xmm3\n\t":::"cc");
               __asm__("sha1nexte %%xmm1, %%xmm3\n\t":::"cc");
               __asm__("sha1msg1 %%xmm1, %%xmm3\n\t":::"cc");
               __asm__("sha1msg2 %%xmm1, %%xmm3\n\t":::"cc");
               __asm__("sha256rnds2 %%xmm0, %%xmm1, %%xmm3\n\t":::"cc");
               __asm__("sha256msg1 %%xmm1, %%xmm3\n\t":::"cc");
               __asm__("sha256msg2 %%xmm1, %%xmm3\n\t":::"cc");
             }]], [ a(); ] )],
           [gcry_cv_gcc_inline_asm_shaext=yes])
         fi])
 if test "$gcry_cv_gcc_inline_asm_shaext" = "yes" ; then
    AC_DEFINE(HAVE_GCC_INLINE_ASM_SHAEXT,1,
      [Defined if inline assembler supports SHA Extensions instructions])
 fi
 
 
 #
 # Check whether GCC inline assembler supports SSE4.1 instructions.
 #
 AC_CACHE_CHECK([whether GCC inline assembler supports SSE4.1 instructions],
        [gcry_cv_gcc_inline_asm_sse41],
        [if test "$mpi_cpu_arch" != "x86" ||
            test "$try_asm_modules" != "yes" ; then
           gcry_cv_gcc_inline_asm_sse41="n/a"
         else
           gcry_cv_gcc_inline_asm_sse41=no
           AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[void a(void) {
               int i;
               __asm__("pextrd \$2, %%xmm0, %[out]\n\t" : [out] "=m" (i));
             }]], [ a(); ] )],
           [gcry_cv_gcc_inline_asm_sse41=yes])
         fi])
 if test "$gcry_cv_gcc_inline_asm_sse41" = "yes" ; then
    AC_DEFINE(HAVE_GCC_INLINE_ASM_SSE41,1,
      [Defined if inline assembler supports SSE4.1 instructions])
 fi
 
 
 #
 # Check whether GCC inline assembler supports AVX instructions
 #
 AC_CACHE_CHECK([whether GCC inline assembler supports AVX instructions],
        [gcry_cv_gcc_inline_asm_avx],
        [if test "$mpi_cpu_arch" != "x86" ||
            test "$try_asm_modules" != "yes" ; then
           gcry_cv_gcc_inline_asm_avx="n/a"
         else
           gcry_cv_gcc_inline_asm_avx=no
           AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[void a(void) {
               __asm__("xgetbv; vaesdeclast (%[mem]),%%xmm0,%%xmm7\n\t"::[mem]"r"(0):);
             }]], [ a(); ] )],
           [gcry_cv_gcc_inline_asm_avx=yes])
         fi])
 if test "$gcry_cv_gcc_inline_asm_avx" = "yes" ; then
    AC_DEFINE(HAVE_GCC_INLINE_ASM_AVX,1,
      [Defined if inline assembler supports AVX instructions])
 fi
 
 
 #
 # Check whether GCC inline assembler supports AVX2 instructions
 #
 AC_CACHE_CHECK([whether GCC inline assembler supports AVX2 instructions],
        [gcry_cv_gcc_inline_asm_avx2],
        [if test "$mpi_cpu_arch" != "x86" ||
            test "$try_asm_modules" != "yes" ; then
           gcry_cv_gcc_inline_asm_avx2="n/a"
         else
           gcry_cv_gcc_inline_asm_avx2=no
           AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[void a(void) {
               __asm__("xgetbv; vpbroadcastb %%xmm7,%%ymm1\n\t":::"cc");
             }]], [ a(); ] )],
           [gcry_cv_gcc_inline_asm_avx2=yes])
         fi])
 if test "$gcry_cv_gcc_inline_asm_avx2" = "yes" ; then
    AC_DEFINE(HAVE_GCC_INLINE_ASM_AVX2,1,
      [Defined if inline assembler supports AVX2 instructions])
 fi
 
 
 #
 # Check whether GCC inline assembler supports BMI2 instructions
 #
 AC_CACHE_CHECK([whether GCC inline assembler supports BMI2 instructions],
        [gcry_cv_gcc_inline_asm_bmi2],
        [if test "$mpi_cpu_arch" != "x86" ||
            test "$try_asm_modules" != "yes" ; then
           gcry_cv_gcc_inline_asm_bmi2="n/a"
         else
           gcry_cv_gcc_inline_asm_bmi2=no
           AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[unsigned int a(unsigned int x, unsigned int y) {
               unsigned int tmp1, tmp2;
               asm ("rorxl %2, %1, %0"
                    : "=r" (tmp1)
                    : "rm0" (x), "J" (32 - ((23) & 31)));
               asm ("andnl %2, %1, %0"
                    : "=r" (tmp2)
                    : "r0" (x), "rm" (y));
               return tmp1 + tmp2;
             }]], [ a(1, 2); ] )],
           [gcry_cv_gcc_inline_asm_bmi2=yes])
         fi])
 if test "$gcry_cv_gcc_inline_asm_bmi2" = "yes" ; then
    AC_DEFINE(HAVE_GCC_INLINE_ASM_BMI2,1,
      [Defined if inline assembler supports BMI2 instructions])
 fi
 
 
 #
 # Check whether GCC assembler needs "-Wa,--divide" to correctly handle
 # constant division.  (On some targets, e.g. Solaris/x86, as(1) treats '/'
 # as a comment character unless "--divide" is given.)
 #
 if test $amd64_as_feature_detection = yes; then
   AC_CACHE_CHECK([whether GCC assembler handles division correctly],
        [gcry_cv_gcc_as_const_division_ok],
        [gcry_cv_gcc_as_const_division_ok=no
         AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[__asm__(".text\n\tfn:\n\t xorl \$(123456789/12345678), %ebp;\n\t");]],
             [fn();])],
           [gcry_cv_gcc_as_const_division_ok=yes])])
   if test "$gcry_cv_gcc_as_const_division_ok" = "no" ; then
     #
     # Add '-Wa,--divide' to CPPFLAGS and try check again.
     #
     _gcc_cppflags_save="$CPPFLAGS"
     CPPFLAGS="$CPPFLAGS -Wa,--divide"
     AC_CACHE_CHECK([whether GCC assembler handles division correctly with "-Wa,--divide"],
          [gcry_cv_gcc_as_const_division_with_wadivide_ok],
          [gcry_cv_gcc_as_const_division_with_wadivide_ok=no
           AC_LINK_IFELSE([AC_LANG_PROGRAM(
             [[__asm__(".text\n\tfn:\n\t xorl \$(123456789/12345678), %ebp;\n\t");]],
               [fn();])],
             [gcry_cv_gcc_as_const_division_with_wadivide_ok=yes])])
     if test "$gcry_cv_gcc_as_const_division_with_wadivide_ok" = "no" ; then
       # '-Wa,--divide' did not work, restore old flags.
       CPPFLAGS="$_gcc_cppflags_save"
     fi
   fi
 fi
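 
 # (Worked example, for illustration: once constant division is handled,
 #  "\$(123456789/12345678)" in the tests above simply assembles as the
 #  immediate constant 10.)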
 
 
 #
 # Check whether GCC assembler supports features needed for our amd64
 # implementations
 #
 if test $amd64_as_feature_detection = yes; then
   AC_CACHE_CHECK([whether GCC assembler is compatible with amd64 assembly implementations],
        [gcry_cv_gcc_amd64_platform_as_ok],
        [if test "$mpi_cpu_arch" != "x86" ||
            test "$try_asm_modules" != "yes" ; then
           gcry_cv_gcc_amd64_platform_as_ok="n/a"
         else
           gcry_cv_gcc_amd64_platform_as_ok=no
           AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[__asm__(
                 /* Test if '.type' and '.size' are supported.  */
                 /* These work only on ELF targets. */
                 ".text\n\t"
 		"asmfunc:\n\t"
                 ".size asmfunc,.-asmfunc;\n\t"
                 ".type asmfunc,@function;\n\t"
 		/* Test if assembler allows use of '/' for constant division
 		 * (Solaris/x86 issue). If previous constant division check
 		 * and "-Wa,--divide" workaround failed, this causes assembly
 		 * to be disabled on this machine. */
 		"xorl \$(123456789/12345678), %ebp;\n\t"
             );]], [ asmfunc(); ])],
           [gcry_cv_gcc_amd64_platform_as_ok=yes])
         fi])
   if test "$gcry_cv_gcc_amd64_platform_as_ok" = "yes" ; then
      AC_DEFINE(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS,1,
               [Defined if underlying assembler is compatible with amd64 assembly implementations])
   fi
   if test "$gcry_cv_gcc_amd64_platform_as_ok" = "no" &&
      test "$gcry_cv_gcc_attribute_sysv_abi" = "yes" &&
      test "$gcry_cv_gcc_default_abi_is_ms_abi" = "yes"; then
     AC_CACHE_CHECK([whether GCC assembler is compatible with WIN64 assembly implementations],
       [gcry_cv_gcc_win64_platform_as_ok],
       [gcry_cv_gcc_win64_platform_as_ok=no
       AC_LINK_IFELSE([AC_LANG_PROGRAM(
         [[__asm__(
               ".text\n\t"
               ".globl asmfunc\n\t"
               "asmfunc:\n\t"
               "xorq \$(1234), %rbp;\n\t"
           );]], [ asmfunc(); ])],
         [gcry_cv_gcc_win64_platform_as_ok=yes])])
     if test "$gcry_cv_gcc_win64_platform_as_ok" = "yes" ; then
       AC_DEFINE(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS,1,
                 [Defined if underlying assembler is compatible with WIN64 assembly implementations])
     fi
   fi
 fi
 
 
 #
 # Check whether GCC assembler supports features needed for assembly
 # implementations that use Intel syntax
 #
 AC_CACHE_CHECK([whether GCC assembler is compatible with Intel syntax assembly implementations],
        [gcry_cv_gcc_platform_as_ok_for_intel_syntax],
        [if test "$mpi_cpu_arch" != "x86" ||
            test "$try_asm_modules" != "yes" ; then
           gcry_cv_gcc_platform_as_ok_for_intel_syntax="n/a"
         else
           gcry_cv_gcc_platform_as_ok_for_intel_syntax=no
           AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[__asm__(
                 ".intel_syntax noprefix\n\t"
                 ".text\n\t"
                 "actest:\n\t"
                 "pxor xmm1, xmm7;\n\t"
-                /* Intel syntax implementation also use GAS macros, so check
-                 * for them here. */
-                "VAL_A = xmm4\n\t"
-                "VAL_B = xmm2\n\t"
-                ".macro SET_VAL_A p1\n\t"
-                "  VAL_A = \\\\p1 \n\t"
-                ".endm\n\t"
-                ".macro SET_VAL_B p1\n\t"
-                "  VAL_B = \\\\p1 \n\t"
-                ".endm\n\t"
-                "vmovdqa VAL_A, VAL_B;\n\t"
-                "SET_VAL_A eax\n\t"
-                "SET_VAL_B ebp\n\t"
-                "add VAL_A, VAL_B;\n\t"
-                "add VAL_B, 0b10101;\n\t"
+                "vperm2i128 ymm2, ymm3, ymm0, 1;\n\t"
+                "add eax, ebp;\n\t"
+                "rorx eax, ebp, 1;\n\t"
+                "sub eax, [esp + 4];\n\t"
+                "add dword ptr [esp + eax], 0b10101;\n\t"
                 ".att_syntax prefix\n\t"
             );]], [ actest(); ])],
           [gcry_cv_gcc_platform_as_ok_for_intel_syntax=yes])
         fi])
 if test "$gcry_cv_gcc_platform_as_ok_for_intel_syntax" = "yes" ; then
   AC_DEFINE(HAVE_INTEL_SYNTAX_PLATFORM_AS,1,
             [Defined if underlying assembler is compatible with Intel syntax assembly implementations])
 fi
 
 
 #
 # Check whether compiler is configured for ARMv6 or newer architecture
 #
 AC_CACHE_CHECK([whether compiler is configured for ARMv6 or newer architecture],
        [gcry_cv_cc_arm_arch_is_v6],
        [if test "$mpi_cpu_arch" != "arm" ||
            test "$try_asm_modules" != "yes" ; then
           gcry_cv_cc_arm_arch_is_v6="n/a"
         else
           gcry_cv_cc_arm_arch_is_v6=no
           AC_COMPILE_IFELSE([AC_LANG_SOURCE(
           [[
            #if defined(__arm__) && \
              ((defined(__ARM_ARCH) && __ARM_ARCH >= 6) \
              || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \
              || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \
              || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) \
              || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \
              || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \
              || defined(__ARM_ARCH_7EM__))
              /* empty */
            #else
              /* fail compile if not ARMv6. */
              not_armv6 not_armv6 = (not_armv6)not_armv6;
            #endif
           ]])],
           [gcry_cv_cc_arm_arch_is_v6=yes])
         fi])
 if test "$gcry_cv_cc_arm_arch_is_v6" = "yes" ; then
    AC_DEFINE(HAVE_ARM_ARCH_V6,1,
      [Defined if ARM architecture is v6 or newer])
 fi
 
 
 #
 # Check whether GCC inline assembler supports NEON instructions
 #
 AC_CACHE_CHECK([whether GCC inline assembler supports NEON instructions],
        [gcry_cv_gcc_inline_asm_neon],
        [if test "$mpi_cpu_arch" != "arm" ||
            test "$try_asm_modules" != "yes" ; then
           gcry_cv_gcc_inline_asm_neon="n/a"
         else
           gcry_cv_gcc_inline_asm_neon=no
           AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[__asm__(
                 ".syntax unified\n\t"
                 ".arm\n\t"
                 ".fpu neon\n\t"
                 ".text\n\t"
                 "testfn:\n\t"
                 "vld1.64 {%q0-%q1}, [%r0]!;\n\t"
                 "vrev64.8 %q0, %q3;\n\t"
                 "vadd.u64 %q0, %q1;\n\t"
                 "vadd.s64 %d3, %d2, %d3;\n\t"
                 );
             ]], [ testfn(); ])],
           [gcry_cv_gcc_inline_asm_neon=yes])
         fi])
 if test "$gcry_cv_gcc_inline_asm_neon" = "yes" ; then
    AC_DEFINE(HAVE_GCC_INLINE_ASM_NEON,1,
      [Defined if inline assembler supports NEON instructions])
 fi
 
 
 #
 # Check whether GCC inline assembler supports AArch32 Crypto Extension instructions
 #
 AC_CACHE_CHECK([whether GCC inline assembler supports AArch32 Crypto Extension instructions],
        [gcry_cv_gcc_inline_asm_aarch32_crypto],
        [if test "$mpi_cpu_arch" != "arm" ||
            test "$try_asm_modules" != "yes" ; then
           gcry_cv_gcc_inline_asm_aarch32_crypto="n/a"
         else
           gcry_cv_gcc_inline_asm_aarch32_crypto=no
           AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[__asm__(
                 ".syntax unified\n\t"
                 ".arch armv8-a\n\t"
                 ".arm\n\t"
                 ".fpu crypto-neon-fp-armv8\n\t"
                 ".text\n\t"
 
                 "testfn:\n\t"
                 "sha1h.32 q0, q0;\n\t"
                 "sha1c.32 q0, q0, q0;\n\t"
                 "sha1p.32 q0, q0, q0;\n\t"
                 "sha1su0.32 q0, q0, q0;\n\t"
                 "sha1su1.32 q0, q0;\n\t"
 
                 "sha256h.32 q0, q0, q0;\n\t"
                 "sha256h2.32 q0, q0, q0;\n\t"
                 "sha1p.32 q0, q0, q0;\n\t"
                 "sha256su0.32 q0, q0;\n\t"
                 "sha256su1.32 q0, q0, q15;\n\t"
 
                 "aese.8 q0, q0;\n\t"
                 "aesd.8 q0, q0;\n\t"
                 "aesmc.8 q0, q0;\n\t"
                 "aesimc.8 q0, q0;\n\t"
 
                 "vmull.p64 q0, d0, d0;\n\t"
                 );
             ]], [ testfn(); ])],
           [gcry_cv_gcc_inline_asm_aarch32_crypto=yes])
         fi])
 if test "$gcry_cv_gcc_inline_asm_aarch32_crypto" = "yes" ; then
    AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO,1,
      [Defined if inline assembler supports AArch32 Crypto Extension instructions])
 fi
 
 
 #
 # Check whether GCC inline assembler supports AArch64 NEON instructions
 #
 AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 NEON instructions],
        [gcry_cv_gcc_inline_asm_aarch64_neon],
        [if test "$mpi_cpu_arch" != "aarch64" ||
            test "$try_asm_modules" != "yes" ; then
           gcry_cv_gcc_inline_asm_aarch64_neon="n/a"
         else
           gcry_cv_gcc_inline_asm_aarch64_neon=no
           AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[__asm__(
                 ".cpu generic+simd\n\t"
                 ".text\n\t"
                 "testfn:\n\t"
                 "mov w0, \#42;\n\t"
                 "dup v0.8b, w0;\n\t"
                 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b},[x0],\#32;\n\t"
                 );
             ]], [ testfn(); ])],
           [gcry_cv_gcc_inline_asm_aarch64_neon=yes])
         fi])
 if test "$gcry_cv_gcc_inline_asm_aarch64_neon" = "yes" ; then
    AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH64_NEON,1,
      [Defined if inline assembler supports AArch64 NEON instructions])
 fi
 
 
 #
 # Check whether GCC inline assembler supports AArch64 Crypto Extension instructions
 #
 AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 Crypto Extension instructions],
        [gcry_cv_gcc_inline_asm_aarch64_crypto],
        [if test "$mpi_cpu_arch" != "aarch64" ||
            test "$try_asm_modules" != "yes" ; then
           gcry_cv_gcc_inline_asm_aarch64_crypto="n/a"
         else
           gcry_cv_gcc_inline_asm_aarch64_crypto=no
           AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[__asm__(
                 ".cpu generic+simd+crypto\n\t"
                 ".text\n\t"
                 "testfn:\n\t"
                 "mov w0, \#42;\n\t"
                 "dup v0.8b, w0;\n\t"
                 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b},[x0],\#32;\n\t"
 
                 "sha1h s0, s0;\n\t"
                 "sha1c q0, s0, v0.4s;\n\t"
                 "sha1p q0, s0, v0.4s;\n\t"
                 "sha1su0 v0.4s, v0.4s, v0.4s;\n\t"
                 "sha1su1 v0.4s, v0.4s;\n\t"
 
                 "sha256h q0, q0, v0.4s;\n\t"
                 "sha256h2 q0, q0, v0.4s;\n\t"
                 "sha1p q0, s0, v0.4s;\n\t"
                 "sha256su0 v0.4s, v0.4s;\n\t"
                 "sha256su1 v0.4s, v0.4s, v31.4s;\n\t"
 
                 "aese v0.16b, v0.16b;\n\t"
                 "aesd v0.16b, v0.16b;\n\t"
                 "aesmc v0.16b, v0.16b;\n\t"
                 "aesimc v0.16b, v0.16b;\n\t"
 
                 "pmull v0.1q, v0.1d, v31.1d;\n\t"
                 "pmull2 v0.1q, v0.2d, v31.2d;\n\t"
                 );
             ]], [ testfn(); ])],
           [gcry_cv_gcc_inline_asm_aarch64_crypto=yes])
         fi])
 if test "$gcry_cv_gcc_inline_asm_aarch64_crypto" = "yes" ; then
    AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO,1,
      [Defined if inline assembler supports AArch64 Crypto Extension instructions])
 fi
 
 
 #
 # Check whether compiler supports PowerPC AltiVec/VSX intrinsics
 #
 AC_CACHE_CHECK([whether compiler supports PowerPC AltiVec/VSX intrinsics],
       [gcry_cv_cc_ppc_altivec],
       [if test "$mpi_cpu_arch" != "ppc" ||
 	  test "$try_asm_modules" != "yes" ; then
 	gcry_cv_cc_ppc_altivec="n/a"
       else
 	gcry_cv_cc_ppc_altivec=no
 	AC_COMPILE_IFELSE([AC_LANG_SOURCE(
 	[[#include <altivec.h>
 	  typedef vector unsigned char block;
 	  typedef vector unsigned int vecu32;
 	  block fn(block in)
 	  {
 	    block t = vec_perm (in, in, vec_vsx_ld (0, (unsigned char*)0));
 	    vecu32 y = vec_vsx_ld (0, (unsigned int*)0);
 	    return vec_cipher_be (t, in) ^ (block)y;
 	  }
 	  ]])],
 	[gcry_cv_cc_ppc_altivec=yes])
       fi])
 if test "$gcry_cv_cc_ppc_altivec" = "yes" ; then
     AC_DEFINE(HAVE_COMPATIBLE_CC_PPC_ALTIVEC,1,
 	    [Defined if underlying compiler supports PowerPC AltiVec/VSX/crypto intrinsics])
 fi
 
 _gcc_cflags_save=$CFLAGS
 CFLAGS="$CFLAGS -maltivec -mvsx -mcrypto"
 
 if test "$gcry_cv_cc_ppc_altivec" = "no" &&
     test "$mpi_cpu_arch" = "ppc" &&
     test "$try_asm_modules" == "yes" ; then
   AC_CACHE_CHECK([whether compiler supports PowerPC AltiVec/VSX/crypto intrinsics with extra GCC flags],
     [gcry_cv_cc_ppc_altivec_cflags],
     [gcry_cv_cc_ppc_altivec_cflags=no
     AC_COMPILE_IFELSE([AC_LANG_SOURCE(
       [[#include <altivec.h>
 	typedef vector unsigned char block;
 	typedef vector unsigned int vecu32;
 	block fn(block in)
 	{
 	  block t = vec_perm (in, in, vec_vsx_ld (0, (unsigned char*)0));
 	  vecu32 y = vec_vsx_ld (0, (unsigned int*)0);
 	  return vec_cipher_be (t, in) ^ (block)y;
 	}]])],
       [gcry_cv_cc_ppc_altivec_cflags=yes])])
   if test "$gcry_cv_cc_ppc_altivec_cflags" = "yes" ; then
     AC_DEFINE(HAVE_COMPATIBLE_CC_PPC_ALTIVEC,1,
 	      [Defined if underlying compiler supports PowerPC AltiVec/VSX/crypto intrinsics])
     AC_DEFINE(HAVE_COMPATIBLE_CC_PPC_ALTIVEC_WITH_CFLAGS,1,
 	      [Defined if underlying compiler supports PowerPC AltiVec/VSX/crypto intrinsics with extra GCC flags])
   fi
 fi
 
 AM_CONDITIONAL(ENABLE_PPC_VCRYPTO_EXTRA_CFLAGS,
 	       test "$gcry_cv_cc_ppc_altivec_cflags" = "yes")
 
 # Restore flags.
 CFLAGS=$_gcc_cflags_save;
 
 
 #
 # Check whether GCC inline assembler supports PowerPC AltiVec/VSX/crypto instructions
 #
 AC_CACHE_CHECK([whether GCC inline assembler supports PowerPC AltiVec/VSX/crypto instructions],
        [gcry_cv_gcc_inline_asm_ppc_altivec],
        [if test "$mpi_cpu_arch" != "ppc" ||
            test "$try_asm_modules" != "yes" ; then
           gcry_cv_gcc_inline_asm_ppc_altivec="n/a"
         else
           gcry_cv_gcc_inline_asm_ppc_altivec=no
           AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[__asm__(".globl testfn;\n"
                     ".text\n\t"
 		    "testfn:\n"
 		    "stvx %v31,%r12,%r0;\n"
 		    "lvx  %v20,%r12,%r0;\n"
 		    "vcipher %v0, %v1, %v22;\n"
 		    "lxvw4x %vs32, %r0, %r1;\n"
 		    "vadduwm %v0, %v1, %v22;\n"
 		    "vshasigmaw %v0, %v1, 0, 15;\n"
 		    "vshasigmad %v0, %v1, 0, 15;\n"
 		    "vpmsumd %v11, %v11, %v11;\n"
 		  );
             ]], [ testfn(); ] )],
           [gcry_cv_gcc_inline_asm_ppc_altivec=yes])
         fi])
 if test "$gcry_cv_gcc_inline_asm_ppc_altivec" = "yes" ; then
    AC_DEFINE(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC,1,
      [Defined if inline assembler supports PowerPC AltiVec/VSX/crypto instructions])
 fi
 
 
 #
 # Check whether GCC inline assembler supports PowerISA 3.00 instructions
 #
 AC_CACHE_CHECK([whether GCC inline assembler supports PowerISA 3.00 instructions],
        [gcry_cv_gcc_inline_asm_ppc_arch_3_00],
        [if test "$mpi_cpu_arch" != "ppc" ||
            test "$try_asm_modules" != "yes" ; then
           gcry_cv_gcc_inline_asm_ppc_arch_3_00="n/a"
         else
           gcry_cv_gcc_inline_asm_ppc_arch_3_00=no
           AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[__asm__(".text\n\t"
 		    ".globl testfn;\n"
 		    "testfn:\n"
 		    "stxvb16x %r1,%v12,%v30;\n"
 		  );
             ]], [ testfn(); ])],
           [gcry_cv_gcc_inline_asm_ppc_arch_3_00=yes])
         fi])
 if test "$gcry_cv_gcc_inline_asm_ppc_arch_3_00" = "yes" ; then
    AC_DEFINE(HAVE_GCC_INLINE_ASM_PPC_ARCH_3_00,1,
      [Defined if inline assembler supports PowerISA 3.00 instructions])
 fi
 
 
 #
 # Check whether GCC inline assembler supports zSeries instructions
 #
 AC_CACHE_CHECK([whether GCC inline assembler supports zSeries instructions],
       [gcry_cv_gcc_inline_asm_s390x],
       [if test "$mpi_cpu_arch" != "s390x" ||
 	  test "$try_asm_modules" != "yes" ; then
 	  gcry_cv_gcc_inline_asm_s390x="n/a"
 	else
 	  gcry_cv_gcc_inline_asm_s390x=no
 	  AC_LINK_IFELSE([AC_LANG_PROGRAM(
 	  [[typedef unsigned int u128_t __attribute__ ((mode (TI)));
 	    unsigned int testfunc(unsigned int x, void *y, unsigned int z)
 	    {
 	      unsigned long fac[8];
 	      register unsigned long reg0 asm("0") = 0;
 	      register unsigned long reg1 asm("1") = x;
 	      u128_t r1 = ((u128_t)(unsigned long)y << 64) | (unsigned long)z;
 	      u128_t r2 = 0;
 	      u128_t r3 = 0;
 	      asm volatile (".insn rre,0xb92e << 16, %[r1], %[r2]\n\t"
 			    : [r1] "+a" (r1), [r2] "+a" (r2)
 			    : "r" (reg0), "r" (reg1)
 			    : "cc", "memory");
 	      asm volatile (".insn rrf,0xb929 << 16, %[r1], %[r2], %[r3], 0\n\t"
 			    : [r1] "+a" (r1), [r2] "+a" (r2), [r3] "+a" (r3)
 			    : "r" (reg0), "r" (reg1)
 			    : "cc", "memory");
 	      reg0 = 8 - 1;
 	      asm ("stfle %1\n\t"
 	           : "+d" (reg0), "=Q" (fac[0])
 	           :
 	           : "cc", "memory");
 	      asm volatile ("mvc 0(16, %0), 0(%1)\n\t"
 			    :
 			    : "a" (y), "a" (fac)
 			    : "memory");
 	      asm volatile ("xc 0(16, %0), 0(%0)\n\t"
 			    :
 			    : "a" (fac)
 			    : "memory");
 	      asm volatile ("risbgn %%r11, %%r11, 0, 129, 0\n\t"
 			    :
 			    :
 			    : "memory", "r11");
 	      asm volatile ("algrk %%r14, %%r14, %%r14\n\t"
 			    :
 			    :
 			    : "memory", "r14");
 	      return (unsigned int)r1 ^ reg0;
 	    }
 	    ]] , [ testfunc(0, 0, 0); ])],
 	  [gcry_cv_gcc_inline_asm_s390x=yes])
 	fi])
 if test "$gcry_cv_gcc_inline_asm_s390x" = "yes" ; then
    AC_DEFINE(HAVE_GCC_INLINE_ASM_S390X,1,
      [Defined if inline assembler supports zSeries instructions])
 fi
 
 
 #
 # Check whether GCC inline assembler supports zSeries vector instructions
 #
 AC_CACHE_CHECK([whether GCC inline assembler supports zSeries vector instructions],
       [gcry_cv_gcc_inline_asm_s390x_vx],
       [if test "$mpi_cpu_arch" != "s390x" ||
 	  test "$try_asm_modules" != "yes" ; then
 	  gcry_cv_gcc_inline_asm_s390x_vx="n/a"
 	else
 	  gcry_cv_gcc_inline_asm_s390x_vx=no
 	  if test "$gcry_cv_gcc_inline_asm_s390x" = "yes" ; then
 	    AC_LINK_IFELSE([AC_LANG_PROGRAM(
 	    [[void testfunc(void)
 	      {
 		asm volatile (".machine \"z13+vx\"\n\t"
 			      "vx %%v0, %%v1, %%v31\n\t"
 			      "verllf %%v11, %%v11, (16)(0)\n\t"
 			      :
 			      :
 			      : "memory");
 	      }
 	      ]], [ testfunc(); ])],
 	    [gcry_cv_gcc_inline_asm_s390x_vx=yes])
 	  fi
 	fi])
 if test "$gcry_cv_gcc_inline_asm_s390x_vx" = "yes" ; then
    AC_DEFINE(HAVE_GCC_INLINE_ASM_S390X_VX,1,
      [Defined if inline assembler supports zSeries vector instructions])
 fi
 
 
 #######################################
 #### Checks for library functions. ####
 #######################################
 
 AC_FUNC_VPRINTF
 # We have replacements for these in src/missing-string.c
 AC_CHECK_FUNCS(stpcpy strcasecmp)
 # We have replacements for these in src/g10lib.h
 AC_CHECK_FUNCS(strtoul memmove stricmp atexit raise)
 # Other checks
 AC_CHECK_FUNCS(strerror rand mmap getpagesize sysconf waitpid wait4)
 AC_CHECK_FUNCS(gettimeofday getrusage gethrtime clock_gettime syslog)
 AC_CHECK_FUNCS(syscall fcntl ftruncate flockfile getauxval elf_aux_info)
 AC_CHECK_FUNCS(explicit_bzero explicit_memset getentropy)
 
 GNUPG_CHECK_MLOCK
 
 #
 # Replacement functions.
 #
 AC_REPLACE_FUNCS([getpid clock])
 
 
 #
 # Check whether it is necessary to link against libdl.
 #
 DL_LIBS=""
 if test "$use_hmac_binary_check" = yes ; then
   _gcry_save_libs="$LIBS"
   LIBS=""
   AC_SEARCH_LIBS(dlopen, c dl,,,)
   DL_LIBS=$LIBS
   LIBS="$_gcry_save_libs"
 fi
 AC_SUBST(DL_LIBS)
 
 
 #
 # Check whether we can use Linux capabilities as requested.
 #
 if test "$use_capabilities" = "yes" ; then
 use_capabilities=no
 AC_CHECK_HEADERS(sys/capability.h)
 if test "$ac_cv_header_sys_capability_h" = "yes" ; then
   AC_CHECK_LIB(cap, cap_init, ac_need_libcap=1)
   if test "$ac_cv_lib_cap_cap_init" = "yes"; then
      AC_DEFINE(USE_CAPABILITIES,1,
                [define if capabilities should be used])
      LIBS="$LIBS -lcap"
      use_capabilities=yes
   fi
 fi
 if test "$use_capabilities" = "no" ; then
     AC_MSG_WARN([[
 ***
 *** The use of capabilities on this system is not possible.
 *** You need a recent Linux kernel and some patches:
 ***   fcaps-2.2.9-990610.patch      (kernel patch for 2.2.9)
 ***   fcap-module-990613.tar.gz     (kernel module)
 ***   libcap-1.92.tar.gz            (user mode library and utilities)
 *** And you have to configure the kernel with CONFIG_VFS_CAP_PLUGIN
 *** set (filesystems menu). Be warned: This code is *really* ALPHA.
 ***]])
 fi
 fi
 
 # Check whether a random device is available.
 if test "$try_dev_random" = yes ; then
     AC_CACHE_CHECK(for random device, ac_cv_have_dev_random,
     [if test -r "$NAME_OF_DEV_RANDOM" && test -r "$NAME_OF_DEV_URANDOM" ; then
       ac_cv_have_dev_random=yes; else ac_cv_have_dev_random=no; fi])
     if test "$ac_cv_have_dev_random" = yes; then
         AC_DEFINE(HAVE_DEV_RANDOM,1,
                  [defined if the system supports a random device] )
     fi
 else
     AC_MSG_CHECKING(for random device)
     ac_cv_have_dev_random=no
     AC_MSG_RESULT(has been disabled)
 fi
 
 # Figure out the random modules for this configuration.
 if test "$random" = "default"; then
 
     # Select default value.
     if test "$ac_cv_have_dev_random" = yes; then
         # Try Linuxish random device.
         random_modules="linux"
     else
         case "${host}" in
         *-*-mingw32ce*)
           # WindowsCE random device.
           random_modules="w32ce"
           ;;
         *-*-mingw32*|*-*-cygwin*)
           # Windows random device.
           random_modules="w32"
           ;;
         *)
           # Build everything, allow to select at runtime.
           random_modules="$auto_random_modules"
           ;;
         esac
     fi
 else
     if test "$random" = "auto"; then
         # Build everything, allow to select at runtime.
         random_modules="$auto_random_modules"
     else
         random_modules="$random"
     fi
 fi
 
 
 #
 # Other defines
 #
 if test mym4_isgit = "yes"; then
     AC_DEFINE(IS_DEVELOPMENT_VERSION,1,
               [Defined if this is not a regular release])
 fi
 
 
 AM_CONDITIONAL(CROSS_COMPILING, test x$cross_compiling = xyes)
 
 
 # This is handy for debugging so the compiler doesn't rearrange
 # things and eliminate variables.
 AC_ARG_ENABLE(optimization,
        AS_HELP_STRING([--disable-optimization],
                       [disable compiler optimization]),
                       [if test $enableval = no ; then
                          CFLAGS=`echo $CFLAGS | sed 's/-O[[0-9]]//'`
                        fi])
 
 AC_MSG_NOTICE([checking for cc features])
 # CFLAGS mangling when using gcc.
 if test "$GCC" = yes; then
     AC_MSG_CHECKING([if gcc supports -fno-delete-null-pointer-checks])
     _gcc_cflags_save=$CFLAGS
     CFLAGS="-fno-delete-null-pointer-checks"
     AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])],_gcc_wopt=yes,_gcc_wopt=no)
     AC_MSG_RESULT($_gcc_wopt)
     CFLAGS=$_gcc_cflags_save;
     if test x"$_gcc_wopt" = xyes ; then
        CFLAGS="$CFLAGS -fno-delete-null-pointer-checks"
     fi
 
     CFLAGS="$CFLAGS -Wall"
     if test "$USE_MAINTAINER_MODE" = "yes"; then
         CFLAGS="$CFLAGS -Wcast-align -Wshadow -Wstrict-prototypes"
         CFLAGS="$CFLAGS -Wformat -Wno-format-y2k -Wformat-security"
 
        # If -Wno-missing-field-initializers is supported we can enable a
        # bunch of really useful warnings.
         AC_MSG_CHECKING([if gcc supports -Wno-missing-field-initializers])
         _gcc_cflags_save=$CFLAGS
         CFLAGS="-Wno-missing-field-initializers"
         AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])],_gcc_wopt=yes,_gcc_wopt=no)
         AC_MSG_RESULT($_gcc_wopt)
         CFLAGS=$_gcc_cflags_save;
         if test x"$_gcc_wopt" = xyes ; then
           CFLAGS="$CFLAGS -W -Wextra -Wbad-function-cast"
           CFLAGS="$CFLAGS -Wwrite-strings"
           CFLAGS="$CFLAGS -Wdeclaration-after-statement"
           CFLAGS="$CFLAGS -Wno-missing-field-initializers"
           CFLAGS="$CFLAGS -Wno-sign-compare"
         fi
 
         AC_MSG_CHECKING([if gcc supports -Wpointer-arith])
         _gcc_cflags_save=$CFLAGS
         CFLAGS="-Wpointer-arith"
         AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])],_gcc_wopt=yes,_gcc_wopt=no)
         AC_MSG_RESULT($_gcc_wopt)
         CFLAGS=$_gcc_cflags_save;
         if test x"$_gcc_wopt" = xyes ; then
           CFLAGS="$CFLAGS -Wpointer-arith"
         fi
     fi
 fi
 
# Check whether as(1) supports a noexecstack feature.  This test
 # includes an override option.
 CL_AS_NOEXECSTACK
 
 
 AC_SUBST(LIBGCRYPT_CONFIG_API_VERSION)
 AC_SUBST(LIBGCRYPT_CONFIG_LIBS)
 AC_SUBST(LIBGCRYPT_CONFIG_CFLAGS)
 AC_SUBST(LIBGCRYPT_CONFIG_HOST)
 AC_SUBST(LIBGCRYPT_THREAD_MODULES)
 
 AC_CONFIG_COMMANDS([gcrypt-conf],[[
 chmod +x src/libgcrypt-config
 ]],[[
 prefix=$prefix
 exec_prefix=$exec_prefix
 libdir=$libdir
 datadir=$datadir
 DATADIRNAME=$DATADIRNAME
 ]])
 
 #####################
 #### Conclusion. ####
 #####################
 
 # Check that requested feature can actually be used and define
 # ENABLE_foo_SUPPORT macros.
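# Each <feature>support variable starts out from the corresponding
# --enable/--disable option; it is downgraded here to
# "no (unsupported by compiler)" when the matching compiler/assembler
# check above did not succeed.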
 
 if test x"$aesnisupport" = xyes ; then
   if test "$gcry_cv_gcc_inline_asm_ssse3" != "yes" ; then
     aesnisupport="no (unsupported by compiler)"
   fi
 fi
 if test x"$shaextsupport" = xyes ; then
   if test "$gcry_cv_gcc_inline_asm_shaext" != "yes" ; then
     shaextsupport="no (unsupported by compiler)"
   fi
 fi
 if test x"$pclmulsupport" = xyes ; then
   if test "$gcry_cv_gcc_inline_asm_pclmul" != "yes" ; then
     pclmulsupport="no (unsupported by compiler)"
   fi
 fi
 if test x"$sse41support" = xyes ; then
   if test "$gcry_cv_gcc_inline_asm_sse41" != "yes" ; then
     sse41support="no (unsupported by compiler)"
   fi
 fi
 if test x"$avxsupport" = xyes ; then
   if test "$gcry_cv_gcc_inline_asm_avx" != "yes" ; then
     avxsupport="no (unsupported by compiler)"
   fi
 fi
 if test x"$avx2support" = xyes ; then
   if test "$gcry_cv_gcc_inline_asm_avx2" != "yes" ; then
     avx2support="no (unsupported by compiler)"
   fi
 fi
 if test x"$neonsupport" = xyes ; then
   if test "$gcry_cv_gcc_inline_asm_neon" != "yes" ; then
     if test "$gcry_cv_gcc_inline_asm_aarch64_neon" != "yes" ; then
       neonsupport="no (unsupported by compiler)"
     fi
   fi
 fi
 if test x"$armcryptosupport" = xyes ; then
   if test "$gcry_cv_gcc_inline_asm_aarch32_crypto" != "yes" ; then
     if test "$gcry_cv_gcc_inline_asm_aarch64_crypto" != "yes" ; then
      armcryptosupport="no (unsupported by compiler)"
     fi
   fi
 fi
 
 if test x"$aesnisupport" = xyes ; then
   AC_DEFINE(ENABLE_AESNI_SUPPORT, 1,
             [Enable support for Intel AES-NI instructions.])
 fi
 if test x"$shaextsupport" = xyes ; then
   AC_DEFINE(ENABLE_SHAEXT_SUPPORT, 1,
             [Enable support for Intel SHAEXT instructions.])
 fi
 if test x"$pclmulsupport" = xyes ; then
   AC_DEFINE(ENABLE_PCLMUL_SUPPORT, 1,
             [Enable support for Intel PCLMUL instructions.])
 fi
 if test x"$sse41support" = xyes ; then
   AC_DEFINE(ENABLE_SSE41_SUPPORT, 1,
             [Enable support for Intel SSE4.1 instructions.])
 fi
 if test x"$avxsupport" = xyes ; then
   AC_DEFINE(ENABLE_AVX_SUPPORT,1,
             [Enable support for Intel AVX instructions.])
 fi
 if test x"$avx2support" = xyes ; then
   AC_DEFINE(ENABLE_AVX2_SUPPORT,1,
             [Enable support for Intel AVX2 instructions.])
 fi
 if test x"$neonsupport" = xyes ; then
   AC_DEFINE(ENABLE_NEON_SUPPORT,1,
             [Enable support for ARM NEON instructions.])
 fi
 if test x"$armcryptosupport" = xyes ; then
   AC_DEFINE(ENABLE_ARM_CRYPTO_SUPPORT,1,
             [Enable support for ARMv8 Crypto Extension instructions.])
 fi
 if test x"$ppccryptosupport" = xyes ; then
   AC_DEFINE(ENABLE_PPC_CRYPTO_SUPPORT,1,
             [Enable support for POWER 8 (PowerISA 2.07) crypto extension.])
 fi
 if test x"$jentsupport" = xyes ; then
   AC_DEFINE(ENABLE_JENT_SUPPORT, 1,
             [Enable support for the jitter entropy collector.])
 fi
 if test x"$padlocksupport" = xyes ; then
   AC_DEFINE(ENABLE_PADLOCK_SUPPORT, 1,
             [Enable support for the PadLock engine.])
 fi
 if test x"$drngsupport" = xyes ; then
   AC_DEFINE(ENABLE_DRNG_SUPPORT, 1,
             [Enable support for Intel DRNG (RDRAND instruction).])
 fi
 
 
 if test x"$force_soft_hwfeatures" = xyes ; then
   AC_DEFINE(ENABLE_FORCE_SOFT_HWFEATURES, 1,
             [Enable forcing 'soft' HW feature bits on (for testing).])
 fi
 
 # Define conditional sources and config.h symbols depending on the
 # selected ciphers, pubkey-ciphers, digests, kdfs, and random modules.
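# LIST_MEMBER(name, list) sets $found to 1 when "name" occurs in the
# space-separated "list"; that is what the checks below test for.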
 
 LIST_MEMBER(arcfour, $enabled_ciphers)
 if test "$found" = "1"; then
    GCRYPT_CIPHERS="$GCRYPT_CIPHERS arcfour.lo"
    AC_DEFINE(USE_ARCFOUR, 1, [Defined if this module should be included])
 
    case "${host}" in
       x86_64-*-*)
          # Build with the assembly implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS arcfour-amd64.lo"
       ;;
    esac
 fi
 
 LIST_MEMBER(blowfish, $enabled_ciphers)
 if test "$found" = "1" ; then
    GCRYPT_CIPHERS="$GCRYPT_CIPHERS blowfish.lo"
    AC_DEFINE(USE_BLOWFISH, 1, [Defined if this module should be included])
 
    case "${host}" in
       x86_64-*-*)
          # Build with the assembly implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS blowfish-amd64.lo"
       ;;
       arm*-*-*)
          # Build with the assembly implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS blowfish-arm.lo"
       ;;
    esac
 fi
 
 LIST_MEMBER(cast5, $enabled_ciphers)
 if test "$found" = "1" ; then
    GCRYPT_CIPHERS="$GCRYPT_CIPHERS cast5.lo"
    AC_DEFINE(USE_CAST5, 1, [Defined if this module should be included])
 
    case "${host}" in
       x86_64-*-*)
          # Build with the assembly implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS cast5-amd64.lo"
       ;;
       arm*-*-*)
          # Build with the assembly implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS cast5-arm.lo"
       ;;
    esac
 fi
 
 LIST_MEMBER(des, $enabled_ciphers)
 if test "$found" = "1" ; then
    GCRYPT_CIPHERS="$GCRYPT_CIPHERS des.lo"
    AC_DEFINE(USE_DES, 1, [Defined if this module should be included])
 
    case "${host}" in
       x86_64-*-*)
          # Build with the assembly implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS des-amd64.lo"
       ;;
    esac
 fi
 
 LIST_MEMBER(aes, $enabled_ciphers)
 if test "$found" = "1" ; then
    GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael.lo"
    AC_DEFINE(USE_AES, 1, [Defined if this module should be included])
 
    case "${host}" in
       x86_64-*-*)
          # Build with the assembly implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-amd64.lo"
 
          # Build with the SSSE3 implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ssse3-amd64.lo"
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ssse3-amd64-asm.lo"
       ;;
       arm*-*-*)
          # Build with the assembly implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-arm.lo"
 
          # Build with the ARMv8/AArch32 CE implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-armv8-ce.lo"
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-armv8-aarch32-ce.lo"
       ;;
       aarch64-*-*)
          # Build with the assembly implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-aarch64.lo"
 
          # Build with the ARMv8/AArch64 CE implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-armv8-ce.lo"
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-armv8-aarch64-ce.lo"
       ;;
       powerpc64le-*-*)
          # Build with the crypto extension implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ppc.lo"
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ppc9le.lo"
       ;;
       powerpc64-*-*)
          # Big-Endian.
          # Build with the crypto extension implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ppc.lo"
       ;;
       powerpc-*-*)
          # Big-Endian.
          # Build with the crypto extension implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ppc.lo"
       ;;
       s390x-*-*)
          # Big-Endian.
          # Build with the crypto extension implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-s390x.lo"
       ;;
    esac
 
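   # $mpi_cpu_arch is "x86" for both i?86 and x86_64 hosts, so the AES-NI
   # and Padlock implementations are enabled for 32-bit and 64-bit x86 alike.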
    case "$mpi_cpu_arch" in
      x86)
          # Build with the AES-NI implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-aesni.lo"
 
          # Build with the Padlock implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-padlock.lo"
       ;;
    esac
 fi
 
 LIST_MEMBER(twofish, $enabled_ciphers)
 if test "$found" = "1" ; then
    GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish.lo"
    AC_DEFINE(USE_TWOFISH, 1, [Defined if this module should be included])
 
    case "${host}" in
       x86_64-*-*)
          # Build with the assembly implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-amd64.lo"
 
          if test x"$avx2support" = xyes ; then
             # Build with the AVX2 implementation
             GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-avx2-amd64.lo"
          fi
       ;;
       arm*-*-*)
          # Build with the assembly implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-arm.lo"
       ;;
       aarch64-*-*)
          # Build with the assembly implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-aarch64.lo"
       ;;
    esac
 fi
 
 LIST_MEMBER(serpent, $enabled_ciphers)
 if test "$found" = "1" ; then
    GCRYPT_CIPHERS="$GCRYPT_CIPHERS serpent.lo"
    AC_DEFINE(USE_SERPENT, 1, [Defined if this module should be included])
 
    case "${host}" in
       x86_64-*-*)
          # Build with the SSE2 implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS serpent-sse2-amd64.lo"
       ;;
    esac
 
    if test x"$avx2support" = xyes ; then
       # Build with the AVX2 implementation
       GCRYPT_CIPHERS="$GCRYPT_CIPHERS serpent-avx2-amd64.lo"
    fi
 
    if test x"$neonsupport" = xyes ; then
       # Build with the NEON implementation
       GCRYPT_CIPHERS="$GCRYPT_CIPHERS serpent-armv7-neon.lo"
    fi
 fi
 
 LIST_MEMBER(rfc2268, $enabled_ciphers)
 if test "$found" = "1" ; then
    GCRYPT_CIPHERS="$GCRYPT_CIPHERS rfc2268.lo"
    AC_DEFINE(USE_RFC2268, 1, [Defined if this module should be included])
 fi
 
 LIST_MEMBER(seed, $enabled_ciphers)
 if test "$found" = "1" ; then
    GCRYPT_CIPHERS="$GCRYPT_CIPHERS seed.lo"
    AC_DEFINE(USE_SEED, 1, [Defined if this module should be included])
 fi
 
 LIST_MEMBER(camellia, $enabled_ciphers)
 if test "$found" = "1" ; then
    GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia.lo camellia-glue.lo"
    AC_DEFINE(USE_CAMELLIA, 1, [Defined if this module should be included])
 
    case "${host}" in
       arm*-*-*)
          # Build with the assembly implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia-arm.lo"
       ;;
       aarch64-*-*)
          # Build with the assembly implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia-aarch64.lo"
       ;;
    esac
 
    if test x"$avxsupport" = xyes ; then
       if test x"$aesnisupport" = xyes ; then
         # Build with the AES-NI/AVX implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia-aesni-avx-amd64.lo"
       fi
    fi
 
    if test x"$avx2support" = xyes ; then
       if test x"$aesnisupport" = xyes ; then
         # Build with the AES-NI/AVX2 implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia-aesni-avx2-amd64.lo"
       fi
    fi
 fi
 
 LIST_MEMBER(idea, $enabled_ciphers)
 if test "$found" = "1" ; then
    GCRYPT_CIPHERS="$GCRYPT_CIPHERS idea.lo"
    AC_DEFINE(USE_IDEA, 1, [Defined if this module should be included])
 fi
 
 LIST_MEMBER(salsa20, $enabled_ciphers)
 if test "$found" = "1" ; then
    GCRYPT_CIPHERS="$GCRYPT_CIPHERS salsa20.lo"
    AC_DEFINE(USE_SALSA20, 1, [Defined if this module should be included])
 
    case "${host}" in
       x86_64-*-*)
          # Build with the assembly implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS salsa20-amd64.lo"
       ;;
    esac
 
    if test x"$neonsupport" = xyes ; then
      # Build with the NEON implementation
      GCRYPT_CIPHERS="$GCRYPT_CIPHERS salsa20-armv7-neon.lo"
    fi
 fi
 
 LIST_MEMBER(gost28147, $enabled_ciphers)
 if test "$found" = "1" ; then
    GCRYPT_CIPHERS="$GCRYPT_CIPHERS gost28147.lo"
    AC_DEFINE(USE_GOST28147, 1, [Defined if this module should be included])
 fi
 
 LIST_MEMBER(chacha20, $enabled_ciphers)
 if test "$found" = "1" ; then
    GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20.lo"
    AC_DEFINE(USE_CHACHA20, 1, [Defined if this module should be included])
 
    case "${host}" in
       x86_64-*-*)
          # Build with the assembly implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-amd64-ssse3.lo"
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-amd64-avx2.lo"
       ;;
       aarch64-*-*)
          # Build with the assembly implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-aarch64.lo"
       ;;
       powerpc64le-*-*)
          # Build with the ppc8 vector implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-ppc.lo"
       ;;
       powerpc64-*-*)
          # Build with the ppc8 vector implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-ppc.lo"
       ;;
       powerpc-*-*)
          # Build with the ppc8 vector implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-ppc.lo"
       ;;
       s390x-*-*)
          # Build with the s390x/zSeries vector implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-s390x.lo"
       ;;
    esac
 
    if test x"$neonsupport" = xyes ; then
      # Build with the NEON implementation
      GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-armv7-neon.lo"
    fi
 fi
 
 LIST_MEMBER(sm4, $enabled_ciphers)
 if test "$found" = "1" ; then
    GCRYPT_CIPHERS="$GCRYPT_CIPHERS sm4.lo"
    AC_DEFINE(USE_SM4, 1, [Defined if this module should be included])
 
    case "${host}" in
       x86_64-*-*)
          # Build with the assembly implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS sm4-aesni-avx-amd64.lo"
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS sm4-aesni-avx2-amd64.lo"
       ;;
    esac
 fi
 
 LIST_MEMBER(dsa, $enabled_pubkey_ciphers)
 if test "$found" = "1" ; then
    GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS dsa.lo"
    AC_DEFINE(USE_DSA, 1, [Defined if this module should be included])
 fi
 
 LIST_MEMBER(rsa, $enabled_pubkey_ciphers)
 if test "$found" = "1" ; then
    GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS rsa.lo"
    AC_DEFINE(USE_RSA, 1, [Defined if this module should be included])
 fi
 
 LIST_MEMBER(elgamal, $enabled_pubkey_ciphers)
 if test "$found" = "1" ; then
    GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS elgamal.lo"
    AC_DEFINE(USE_ELGAMAL, 1, [Defined if this module should be included])
 fi
 
 LIST_MEMBER(ecc, $enabled_pubkey_ciphers)
 if test "$found" = "1" ; then
    GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS \
                           ecc.lo ecc-curves.lo ecc-misc.lo \
                           ecc-ecdh.lo ecc-ecdsa.lo ecc-eddsa.lo ecc-gost.lo \
                           ecc-sm2.lo"
    AC_DEFINE(USE_ECC, 1, [Defined if this module should be included])
 fi
 
 LIST_MEMBER(crc, $enabled_digests)
 if test "$found" = "1" ; then
    GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc.lo"
    AC_DEFINE(USE_CRC, 1, [Defined if this module should be included])
 
    case "${host}" in
       i?86-*-* | x86_64-*-*)
          # Build with the assembly implementation
          GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc-intel-pclmul.lo"
       ;;
       aarch64-*-*)
          # Build with the assembly implementation
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc-armv8-ce.lo"
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc-armv8-aarch64-ce.lo"
      ;;
      powerpc64le-*-*)
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc-ppc.lo"
      ;;
      powerpc64-*-*)
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc-ppc.lo"
      ;;
      powerpc-*-*)
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc-ppc.lo"
       ;;
    esac
 fi
 
 LIST_MEMBER(gostr3411-94, $enabled_digests)
 if test "$found" = "1" ; then
    # GOST R 34.11-94 internally uses GOST 28147-89
    LIST_MEMBER(gost28147, $enabled_ciphers)
    if test "$found" = "1" ; then
       GCRYPT_DIGESTS="$GCRYPT_DIGESTS gostr3411-94.lo"
       AC_DEFINE(USE_GOST_R_3411_94, 1, [Defined if this module should be included])
    fi
 fi
 
 LIST_MEMBER(stribog, $enabled_digests)
 if test "$found" = "1" ; then
    GCRYPT_DIGESTS="$GCRYPT_DIGESTS stribog.lo"
    AC_DEFINE(USE_GOST_R_3411_12, 1, [Defined if this module should be included])
 fi
 
 LIST_MEMBER(md2, $enabled_digests)
 if test "$found" = "1" ; then
    GCRYPT_DIGESTS="$GCRYPT_DIGESTS md2.lo"
    AC_DEFINE(USE_MD2, 1, [Defined if this module should be included])
 fi
 
 LIST_MEMBER(md4, $enabled_digests)
 if test "$found" = "1" ; then
    GCRYPT_DIGESTS="$GCRYPT_DIGESTS md4.lo"
    AC_DEFINE(USE_MD4, 1, [Defined if this module should be included])
 fi
 
 LIST_MEMBER(md5, $enabled_digests)
 if test "$found" = "1" ; then
    GCRYPT_DIGESTS="$GCRYPT_DIGESTS md5.lo"
    AC_DEFINE(USE_MD5, 1, [Defined if this module should be included])
 fi
 
 LIST_MEMBER(rmd160, $enabled_digests)
 if test "$found" = "1" ; then
    GCRYPT_DIGESTS="$GCRYPT_DIGESTS rmd160.lo"
    AC_DEFINE(USE_RMD160, 1, [Defined if this module should be included])
 fi
 
 LIST_MEMBER(sha256, $enabled_digests)
 if test "$found" = "1" ; then
    GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256.lo"
    AC_DEFINE(USE_SHA256, 1, [Defined if this module should be included])
 
    case "${host}" in
       x86_64-*-*)
          # Build with the assembly implementation
          GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-ssse3-amd64.lo"
          GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-avx-amd64.lo"
          GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-avx2-bmi2-amd64.lo"
       ;;
       arm*-*-*)
          # Build with the assembly implementation
          GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-armv8-aarch32-ce.lo"
       ;;
       aarch64-*-*)
          # Build with the assembly implementation
          GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-armv8-aarch64-ce.lo"
       ;;
      powerpc64le-*-*)
         # Build with the crypto extension implementation
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-ppc.lo"
      ;;
      powerpc64-*-*)
         # Big-Endian.
         # Build with the crypto extension implementation
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-ppc.lo"
      ;;
      powerpc-*-*)
         # Big-Endian.
         # Build with the crypto extension implementation
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-ppc.lo"
      ;;
   esac
 
    case "$mpi_cpu_arch" in
      x86)
        # Build with the SHAEXT implementation
        GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-intel-shaext.lo"
      ;;
    esac
 fi
 
 LIST_MEMBER(sha512, $enabled_digests)
 if test "$found" = "1" ; then
    GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512.lo"
    AC_DEFINE(USE_SHA512, 1, [Defined if this module should be included])
 
    case "${host}" in
       x86_64-*-*)
          # Build with the assembly implementation
          GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-ssse3-amd64.lo"
          GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-avx-amd64.lo"
          GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-avx2-bmi2-amd64.lo"
       ;;
       i?86-*-*)
          # Build with the assembly implementation
          GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-ssse3-i386.lo"
       ;;
       arm*-*-*)
          # Build with the assembly implementation
          GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-arm.lo"
       ;;
      powerpc64le-*-*)
         # Build with the crypto extension implementation
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-ppc.lo"
      ;;
      powerpc64-*-*)
         # Big-Endian.
         # Build with the crypto extension implementation
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-ppc.lo"
      ;;
      powerpc-*-*)
         # Big-Endian.
         # Build with the crypto extension implementation
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-ppc.lo"
      ;;
   esac
 
    if test x"$neonsupport" = xyes ; then
      # Build with the NEON implementation
      GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-armv7-neon.lo"
    fi
 fi
 
 LIST_MEMBER(sha3, $enabled_digests)
 if test "$found" = "1" ; then
    GCRYPT_DIGESTS="$GCRYPT_DIGESTS keccak.lo"
    AC_DEFINE(USE_SHA3, 1, [Defined if this module should be included])
 
    case "${host}" in
       x86_64-*-*)
          # Build with the assembly implementation
          :
       ;;
    esac
 
    if test x"$neonsupport" = xyes ; then
      # Build with the NEON implementation
      GCRYPT_DIGESTS="$GCRYPT_DIGESTS keccak-armv7-neon.lo"
    fi
 fi
 
 LIST_MEMBER(tiger, $enabled_digests)
 if test "$found" = "1" ; then
    GCRYPT_DIGESTS="$GCRYPT_DIGESTS tiger.lo"
    AC_DEFINE(USE_TIGER, 1, [Defined if this module should be included])
 fi
 
 LIST_MEMBER(whirlpool, $enabled_digests)
 if test "$found" = "1" ; then
    GCRYPT_DIGESTS="$GCRYPT_DIGESTS whirlpool.lo"
    AC_DEFINE(USE_WHIRLPOOL, 1, [Defined if this module should be included])
 
    case "${host}" in
       x86_64-*-*)
          # Build with the assembly implementation
          GCRYPT_DIGESTS="$GCRYPT_DIGESTS whirlpool-sse2-amd64.lo"
       ;;
    esac
 fi
 
 LIST_MEMBER(blake2, $enabled_digests)
 if test "$found" = "1" ; then
    GCRYPT_DIGESTS="$GCRYPT_DIGESTS blake2.lo"
    AC_DEFINE(USE_BLAKE2, 1, [Defined if this module should be included])
 
    case "${host}" in
       x86_64-*-*)
          # Build with the assembly implementation
          GCRYPT_DIGESTS="$GCRYPT_DIGESTS blake2b-amd64-avx2.lo"
          GCRYPT_DIGESTS="$GCRYPT_DIGESTS blake2s-amd64-avx.lo"
       ;;
    esac
 fi
 
# SHA-1 always needs to be included, for example because it is used by
# random-csprng.c.
 GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1.lo"
 AC_DEFINE(USE_SHA1, 1,   [Defined if this module should be included])
 
 case "${host}" in
   x86_64-*-*)
     # Build with the assembly implementation
     GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-ssse3-amd64.lo"
     GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-avx-amd64.lo"
     GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-avx-bmi2-amd64.lo"
     GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-avx2-bmi2-amd64.lo"
   ;;
   arm*-*-*)
     # Build with the assembly implementation
     GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-armv7-neon.lo"
     GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-armv8-aarch32-ce.lo"
   ;;
   aarch64-*-*)
     # Build with the assembly implementation
     GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-armv8-aarch64-ce.lo"
   ;;
 esac
 
 case "$mpi_cpu_arch" in
   x86)
     # Build with the SHAEXT implementation
     GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-intel-shaext.lo"
   ;;
 esac
 
 LIST_MEMBER(sm3, $enabled_digests)
 if test "$found" = "1" ; then
    GCRYPT_DIGESTS="$GCRYPT_DIGESTS sm3.lo"
    AC_DEFINE(USE_SM3, 1, [Defined if this module should be included])
 fi
 
 LIST_MEMBER(scrypt, $enabled_kdfs)
 if test "$found" = "1" ; then
    GCRYPT_KDFS="$GCRYPT_KDFS scrypt.lo"
    AC_DEFINE(USE_SCRYPT, 1, [Defined if this module should be included])
 fi
 
 LIST_MEMBER(linux, $random_modules)
 if test "$found" = "1" ; then
    GCRYPT_RANDOM="$GCRYPT_RANDOM rndlinux.lo"
    AC_DEFINE(USE_RNDLINUX, 1, [Defined if the /dev/random RNG should be used.])
 fi
 
 LIST_MEMBER(unix, $random_modules)
 if test "$found" = "1" ; then
    GCRYPT_RANDOM="$GCRYPT_RANDOM rndunix.lo"
    AC_DEFINE(USE_RNDUNIX, 1, [Defined if the default Unix RNG should be used.])
 fi
 
 LIST_MEMBER(egd, $random_modules)
 if test "$found" = "1" ; then
    GCRYPT_RANDOM="$GCRYPT_RANDOM rndegd.lo"
    AC_DEFINE(USE_RNDEGD, 1, [Defined if the EGD based RNG should be used.])
 fi
 
 LIST_MEMBER(w32, $random_modules)
 if test "$found" = "1" ; then
    GCRYPT_RANDOM="$GCRYPT_RANDOM rndw32.lo"
    AC_DEFINE(USE_RNDW32, 1,
              [Defined if the Windows specific RNG should be used.])
 fi
 
 LIST_MEMBER(w32ce, $random_modules)
 if test "$found" = "1" ; then
    GCRYPT_RANDOM="$GCRYPT_RANDOM rndw32ce.lo"
    AC_DEFINE(USE_RNDW32CE, 1,
              [Defined if the WindowsCE specific RNG should be used.])
 fi
 
 AC_SUBST([GCRYPT_CIPHERS])
 AC_SUBST([GCRYPT_PUBKEY_CIPHERS])
 AC_SUBST([GCRYPT_DIGESTS])
 AC_SUBST([GCRYPT_KDFS])
 AC_SUBST([GCRYPT_RANDOM])
 
 AC_SUBST(LIBGCRYPT_CIPHERS, $enabled_ciphers)
 AC_SUBST(LIBGCRYPT_PUBKEY_CIPHERS, $enabled_pubkey_ciphers)
 AC_SUBST(LIBGCRYPT_DIGESTS, $enabled_digests)
 
# For printing the configuration we need a colon-separated list of
 # algorithm names.
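# (e.g. LIBGCRYPT_CIPHERS then reads something like "arcfour:blowfish:cast5:...".)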
 tmp=`echo "$enabled_ciphers" | tr ' ' : `
 AC_DEFINE_UNQUOTED(LIBGCRYPT_CIPHERS, "$tmp",
                    [List of available cipher algorithms])
 tmp=`echo "$enabled_pubkey_ciphers" | tr ' ' : `
 AC_DEFINE_UNQUOTED(LIBGCRYPT_PUBKEY_CIPHERS, "$tmp",
                    [List of available public key cipher algorithms])
 tmp=`echo "$enabled_digests" | tr ' ' : `
 AC_DEFINE_UNQUOTED(LIBGCRYPT_DIGESTS, "$tmp",
                    [List of available digest algorithms])
 tmp=`echo "$enabled_kdfs" | tr ' ' : `
 AC_DEFINE_UNQUOTED(LIBGCRYPT_KDFS, "$tmp",
                    [List of available KDF algorithms])
 
 
 #
 # Define conditional sources depending on the used hardware platform.
 # Note that all possible modules must also be listed in
 # src/Makefile.am (EXTRA_libgcrypt_la_SOURCES).
 #
 GCRYPT_HWF_MODULES=
 case "$mpi_cpu_arch" in
      x86)
         AC_DEFINE(HAVE_CPU_ARCH_X86, 1,   [Defined for the x86 platforms])
         GCRYPT_HWF_MODULES="libgcrypt_la-hwf-x86.lo"
         ;;
      alpha)
         AC_DEFINE(HAVE_CPU_ARCH_ALPHA, 1, [Defined for Alpha platforms])
         ;;
      sparc)
         AC_DEFINE(HAVE_CPU_ARCH_SPARC, 1, [Defined for SPARC platforms])
         ;;
      mips)
         AC_DEFINE(HAVE_CPU_ARCH_MIPS, 1,  [Defined for MIPS platforms])
         ;;
      m68k)
         AC_DEFINE(HAVE_CPU_ARCH_M68K, 1,  [Defined for M68k platforms])
         ;;
      ppc)
         AC_DEFINE(HAVE_CPU_ARCH_PPC, 1,   [Defined for PPC platforms])
         GCRYPT_HWF_MODULES="libgcrypt_la-hwf-ppc.lo"
         ;;
      arm)
         AC_DEFINE(HAVE_CPU_ARCH_ARM, 1,   [Defined for ARM platforms])
         GCRYPT_HWF_MODULES="libgcrypt_la-hwf-arm.lo"
         ;;
      aarch64)
         AC_DEFINE(HAVE_CPU_ARCH_ARM, 1,   [Defined for ARM AArch64 platforms])
         GCRYPT_HWF_MODULES="libgcrypt_la-hwf-arm.lo"
         ;;
      s390x)
         AC_DEFINE(HAVE_CPU_ARCH_S390X, 1, [Defined for s390x/zSeries platforms])
         GCRYPT_HWF_MODULES="libgcrypt_la-hwf-s390x.lo"
         ;;
 esac
 AC_SUBST([GCRYPT_HWF_MODULES])
 
 
 #
 # Option to disable building of doc file
 #
 build_doc=yes
 AC_ARG_ENABLE([doc], AS_HELP_STRING([--disable-doc],
                                     [do not build the documentation]),
                      build_doc=$enableval, build_doc=yes)
 AM_CONDITIONAL([BUILD_DOC], [test "x$build_doc" != xno])
 
 
 #
 # Provide information about the build.
 #
 BUILD_REVISION="mym4_revision"
 AC_SUBST(BUILD_REVISION)
 AC_DEFINE_UNQUOTED(BUILD_REVISION, "$BUILD_REVISION",
                    [GIT commit id revision used to build this package])
 
 changequote(,)dnl
 BUILD_VERSION=`echo "$PACKAGE_VERSION" | sed 's/\([0-9.]*\).*/\1./'`
 changequote([,])dnl
 BUILD_VERSION="${BUILD_VERSION}mym4_revision_dec"
 BUILD_FILEVERSION=`echo "${BUILD_VERSION}" | tr . ,`
 AC_SUBST(BUILD_VERSION)
 AC_SUBST(BUILD_FILEVERSION)
 
 AC_ARG_ENABLE([build-timestamp],
   AS_HELP_STRING([--enable-build-timestamp],
                  [set an explicit build timestamp for reproducibility.
                   (default is the current time in ISO-8601 format)]),
      [if test "$enableval" = "yes"; then
         BUILD_TIMESTAMP=`date -u +%Y-%m-%dT%H:%M+0000 2>/dev/null || date`
       else
         BUILD_TIMESTAMP="$enableval"
       fi],
      [BUILD_TIMESTAMP="<none>"])
 AC_SUBST(BUILD_TIMESTAMP)
 AC_DEFINE_UNQUOTED(BUILD_TIMESTAMP, "$BUILD_TIMESTAMP",
                    [The time this package was configured for a build])
 
 
 # And create the files.
 AC_CONFIG_FILES([
 Makefile
 m4/Makefile
 compat/Makefile
 mpi/Makefile
 cipher/Makefile
 random/Makefile
 doc/Makefile
 src/Makefile
 src/gcrypt.h
 src/libgcrypt-config
 src/libgcrypt.pc
 src/versioninfo.rc
 tests/Makefile
 ])
 AC_CONFIG_FILES([tests/hashtest-256g], [chmod +x tests/hashtest-256g])
 AC_CONFIG_FILES([tests/basic-disable-all-hwf], [chmod +x tests/basic-disable-all-hwf])
 AC_OUTPUT
 
 
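# Strip the libtool object suffix from the HWF module name so the summary
# below can print a plain module name (or "none").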
 detection_module="${GCRYPT_HWF_MODULES%.lo}"
 test -n "$detection_module" || detection_module="none"
 
 # Give some feedback
 GCRY_MSG_SHOW([],[])
 GCRY_MSG_SHOW([Libgcrypt],[v${VERSION} has been configured as follows:])
 GCRY_MSG_SHOW([],[])
 GCRY_MSG_SHOW([Platform:                 ],[$PRINTABLE_OS_NAME ($host)])
 GCRY_MSG_SHOW([Hardware detection module:],[$detection_module])
 GCRY_MSG_WRAP([Enabled cipher algorithms:],[$enabled_ciphers])
 GCRY_MSG_WRAP([Enabled digest algorithms:],[$enabled_digests])
 GCRY_MSG_WRAP([Enabled kdf algorithms:   ],[$enabled_kdfs])
 GCRY_MSG_WRAP([Enabled pubkey algorithms:],[$enabled_pubkey_ciphers])
 GCRY_MSG_SHOW([Random number generator:  ],[$random])
 GCRY_MSG_SHOW([Try using jitter entropy: ],[$jentsupport])
 GCRY_MSG_SHOW([Using linux capabilities: ],[$use_capabilities])
 GCRY_MSG_SHOW([Try using Padlock crypto: ],[$padlocksupport])
 GCRY_MSG_SHOW([Try using AES-NI crypto:  ],[$aesnisupport])
 GCRY_MSG_SHOW([Try using Intel SHAEXT:   ],[$shaextsupport])
 GCRY_MSG_SHOW([Try using Intel PCLMUL:   ],[$pclmulsupport])
 GCRY_MSG_SHOW([Try using Intel SSE4.1:   ],[$sse41support])
 GCRY_MSG_SHOW([Try using DRNG (RDRAND):  ],[$drngsupport])
 GCRY_MSG_SHOW([Try using Intel AVX:      ],[$avxsupport])
 GCRY_MSG_SHOW([Try using Intel AVX2:     ],[$avx2support])
 GCRY_MSG_SHOW([Try using ARM NEON:       ],[$neonsupport])
 GCRY_MSG_SHOW([Try using ARMv8 crypto:   ],[$armcryptosupport])
 GCRY_MSG_SHOW([Try using PPC crypto:     ],[$ppccryptosupport])
 GCRY_MSG_SHOW([],[])
 
 if test "x${gpg_config_script_warn}" != x; then
 cat <<G10EOF
        Mismatches between the target platform and the
        libraries to be used have been detected for:
          ${gpg_config_script_warn}
         Please check above for warning messages.
 
 G10EOF
 fi
 
 if test "$gcry_cv_gcc_attribute_aligned" != "yes" ; then
 cat <<G10EOF
   Please note that your compiler does not support the GCC style
   aligned attribute.  Using this software may result in bus errors.
 
 G10EOF
 fi
 
 if test -n "$gpl"; then
   echo "Please note that you are building a version of Libgcrypt with"
   echo "  $gpl"
   echo "included.  These parts are licensed under the GPL and thus the"
   echo "use of this library has to comply with the conditions of the GPL."
   echo ""
 fi