diff --git a/cipher/blowfish-arm.S b/cipher/blowfish-arm.S index b30aa31f..a5101b5c 100644 --- a/cipher/blowfish-arm.S +++ b/cipher/blowfish-arm.S @@ -1,743 +1,743 @@ /* blowfish-arm.S - ARM assembly implementation of Blowfish cipher * * Copyright (C) 2013 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #include #if defined(__ARMEL__) #ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS .text .syntax unified .arm /* structure of crypto context */ #define s0 0 #define s1 (s0 + (1 * 256) * 4) #define s2 (s0 + (2 * 256) * 4) #define s3 (s0 + (3 * 256) * 4) #define p (s3 + (1 * 256) * 4) /* register macros */ -#define CTXs0 %r0 -#define CTXs1 %r9 -#define CTXs2 %r8 -#define CTXs3 %r10 -#define RMASK %lr -#define RKEYL %r2 -#define RKEYR %ip +#define CTXs0 r0 +#define CTXs1 r9 +#define CTXs2 r8 +#define CTXs3 r10 +#define RMASK lr +#define RKEYL r2 +#define RKEYR ip -#define RL0 %r3 -#define RR0 %r4 +#define RL0 r3 +#define RR0 r4 -#define RL1 %r9 -#define RR1 %r10 +#define RL1 r9 +#define RR1 r10 -#define RT0 %r11 -#define RT1 %r7 -#define RT2 %r5 -#define RT3 %r6 +#define RT0 r11 +#define RT1 r7 +#define RT2 r5 +#define RT3 r6 /* helper macros */ #define ldr_unaligned_le(rout, rsrc, offs, rtmp) \ ldrb rout, [rsrc, #((offs) + 0)]; \ ldrb rtmp, [rsrc, #((offs) + 1)]; \ orr rout, rout, rtmp, lsl #8; \ ldrb rtmp, [rsrc, #((offs) + 2)]; \ orr rout, rout, rtmp, lsl #16; \ ldrb rtmp, [rsrc, #((offs) + 3)]; \ orr rout, rout, rtmp, lsl #24; #define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \ mov rtmp0, rin, lsr #8; \ strb rin, [rdst, #((offs) + 0)]; \ mov rtmp1, rin, lsr #16; \ strb rtmp0, [rdst, #((offs) + 1)]; \ mov rtmp0, rin, lsr #24; \ strb rtmp1, [rdst, #((offs) + 2)]; \ strb rtmp0, [rdst, #((offs) + 3)]; #define ldr_unaligned_be(rout, rsrc, offs, rtmp) \ ldrb rout, [rsrc, #((offs) + 3)]; \ ldrb rtmp, [rsrc, #((offs) + 2)]; \ orr rout, rout, rtmp, lsl #8; \ ldrb rtmp, [rsrc, #((offs) + 1)]; \ orr rout, rout, rtmp, lsl #16; \ ldrb rtmp, [rsrc, #((offs) + 0)]; \ orr rout, rout, rtmp, lsl #24; #define str_unaligned_be(rin, rdst, offs, rtmp0, rtmp1) \ mov rtmp0, rin, lsr #8; \ strb rin, [rdst, #((offs) + 3)]; \ mov rtmp1, rin, lsr #16; \ strb rtmp0, [rdst, #((offs) + 2)]; \ mov rtmp0, rin, lsr #24; \ strb rtmp1, [rdst, #((offs) + 1)]; \ strb rtmp0, [rdst, #((offs) + 0)]; #ifdef __ARMEL__ #define ldr_unaligned_host ldr_unaligned_le #define str_unaligned_host str_unaligned_le /* bswap on little-endian */ #ifdef HAVE_ARM_ARCH_V6 #define host_to_be(reg, rtmp) \ rev reg, reg; #define be_to_host(reg, rtmp) \ rev reg, reg; #else #define host_to_be(reg, rtmp) \ eor rtmp, reg, reg, ror #16; \ mov rtmp, rtmp, lsr #8; \ bic rtmp, rtmp, #65280; \ eor reg, rtmp, reg, ror #8; #define be_to_host(reg, rtmp) \ eor rtmp, reg, reg, ror #16; \ mov rtmp, rtmp, lsr #8; \ bic rtmp, rtmp, #65280; \ eor reg, rtmp, reg, ror #8; #endif #else #define ldr_unaligned_host ldr_unaligned_be #define str_unaligned_host 
str_unaligned_be /* nop on big-endian */ #define host_to_be(reg, rtmp) /*_*/ #define be_to_host(reg, rtmp) /*_*/ #endif #define host_to_host(x, y) /*_*/ /*********************************************************************** * 1-way blowfish ***********************************************************************/ #define F(l, r) \ and RT0, RMASK, l, lsr#(24 - 2); \ and RT1, RMASK, l, lsr#(16 - 2); \ ldr RT0, [CTXs0, RT0]; \ and RT2, RMASK, l, lsr#(8 - 2); \ ldr RT1, [CTXs1, RT1]; \ and RT3, RMASK, l, lsl#2; \ ldr RT2, [CTXs2, RT2]; \ add RT0, RT1; \ ldr RT3, [CTXs3, RT3]; \ eor RT0, RT2; \ add RT0, RT3; \ eor r, RT0; #define load_roundkey_enc(n) \ ldr RKEYL, [CTXs2, #((p - s2) + (4 * (n) + 0))]; \ ldr RKEYR, [CTXs2, #((p - s2) + (4 * (n) + 4))]; #define add_roundkey_enc() \ eor RL0, RKEYL; \ eor RR0, RKEYR; #define round_enc(n) \ add_roundkey_enc(); \ load_roundkey_enc(n); \ \ F(RL0, RR0); \ F(RR0, RL0); #define load_roundkey_dec(n) \ ldr RKEYL, [CTXs2, #((p - s2) + (4 * ((n) - 1) + 4))]; \ ldr RKEYR, [CTXs2, #((p - s2) + (4 * ((n) - 1) + 0))]; #define add_roundkey_dec() \ eor RL0, RKEYL; \ eor RR0, RKEYR; #define round_dec(n) \ add_roundkey_dec(); \ load_roundkey_dec(n); \ \ F(RL0, RR0); \ F(RR0, RL0); #define read_block_aligned(rin, offs, l0, r0, convert, rtmp) \ ldr l0, [rin, #((offs) + 0)]; \ ldr r0, [rin, #((offs) + 4)]; \ convert(l0, rtmp); \ convert(r0, rtmp); #define write_block_aligned(rout, offs, l0, r0, convert, rtmp) \ convert(l0, rtmp); \ convert(r0, rtmp); \ str l0, [rout, #((offs) + 0)]; \ str r0, [rout, #((offs) + 4)]; #ifdef __ARM_FEATURE_UNALIGNED /* unaligned word reads allowed */ #define read_block(rin, offs, l0, r0, rtmp0) \ read_block_aligned(rin, offs, l0, r0, host_to_be, rtmp0) #define write_block(rout, offs, r0, l0, rtmp0, rtmp1) \ write_block_aligned(rout, offs, r0, l0, be_to_host, rtmp0) #define read_block_host(rin, offs, l0, r0, rtmp0) \ read_block_aligned(rin, offs, l0, r0, host_to_host, rtmp0) #define write_block_host(rout, offs, r0, l0, rtmp0, rtmp1) \ write_block_aligned(rout, offs, r0, l0, host_to_host, rtmp0) #else /* need to handle unaligned reads by byte reads */ #define read_block(rin, offs, l0, r0, rtmp0) \ tst rin, #3; \ beq 1f; \ ldr_unaligned_be(l0, rin, (offs) + 0, rtmp0); \ ldr_unaligned_be(r0, rin, (offs) + 4, rtmp0); \ b 2f; \ 1:;\ read_block_aligned(rin, offs, l0, r0, host_to_be, rtmp0); \ 2:; #define write_block(rout, offs, l0, r0, rtmp0, rtmp1) \ tst rout, #3; \ beq 1f; \ str_unaligned_be(l0, rout, (offs) + 0, rtmp0, rtmp1); \ str_unaligned_be(r0, rout, (offs) + 4, rtmp0, rtmp1); \ b 2f; \ 1:;\ write_block_aligned(rout, offs, l0, r0, be_to_host, rtmp0); \ 2:; #define read_block_host(rin, offs, l0, r0, rtmp0) \ tst rin, #3; \ beq 1f; \ ldr_unaligned_host(l0, rin, (offs) + 0, rtmp0); \ ldr_unaligned_host(r0, rin, (offs) + 4, rtmp0); \ b 2f; \ 1:;\ read_block_aligned(rin, offs, l0, r0, host_to_host, rtmp0); \ 2:; #define write_block_host(rout, offs, l0, r0, rtmp0, rtmp1) \ tst rout, #3; \ beq 1f; \ str_unaligned_host(l0, rout, (offs) + 0, rtmp0, rtmp1); \ str_unaligned_host(r0, rout, (offs) + 4, rtmp0, rtmp1); \ b 2f; \ 1:;\ write_block_aligned(rout, offs, l0, r0, host_to_host); \ 2:; #endif .align 3 .type __blowfish_enc_blk1,%function; __blowfish_enc_blk1: /* input: * preloaded: CTX * [RL0, RR0]: src * output: * [RR0, RL0]: dst */ - push {%lr}; + push {lr}; add CTXs1, CTXs0, #(s1 - s0); add CTXs2, CTXs0, #(s2 - s0); mov RMASK, #(0xff << 2); /* byte mask */ add CTXs3, CTXs1, #(s3 - s1); load_roundkey_enc(0); round_enc(2); round_enc(4); 
round_enc(6); round_enc(8); round_enc(10); round_enc(12); round_enc(14); round_enc(16); add_roundkey_enc(); - pop {%pc}; + pop {pc}; .size __blowfish_enc_blk1,.-__blowfish_enc_blk1; .align 8 .globl _gcry_blowfish_arm_do_encrypt .type _gcry_blowfish_arm_do_encrypt,%function; _gcry_blowfish_arm_do_encrypt: /* input: - * %r0: ctx, CTX - * %r1: u32 *ret_xl - * %r2: u32 *ret_xr + * r0: ctx, CTX + * r1: u32 *ret_xl + * r2: u32 *ret_xr */ - push {%r2, %r4-%r11, %ip, %lr}; + push {r2, r4-r11, ip, lr}; - ldr RL0, [%r1]; - ldr RR0, [%r2]; + ldr RL0, [r1]; + ldr RR0, [r2]; bl __blowfish_enc_blk1; - pop {%r2}; - str RR0, [%r1]; - str RL0, [%r2]; + pop {r2}; + str RR0, [r1]; + str RL0, [r2]; - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .size _gcry_blowfish_arm_do_encrypt,.-_gcry_blowfish_arm_do_encrypt; .align 3 .globl _gcry_blowfish_arm_encrypt_block .type _gcry_blowfish_arm_encrypt_block,%function; _gcry_blowfish_arm_encrypt_block: /* input: - * %r0: ctx, CTX - * %r1: dst - * %r2: src + * r0: ctx, CTX + * r1: dst + * r2: src */ - push {%r4-%r11, %ip, %lr}; + push {r4-r11, ip, lr}; - read_block(%r2, 0, RL0, RR0, RT0); + read_block(r2, 0, RL0, RR0, RT0); bl __blowfish_enc_blk1; - write_block(%r1, 0, RR0, RL0, RT0, RT1); + write_block(r1, 0, RR0, RL0, RT0, RT1); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .size _gcry_blowfish_arm_encrypt_block,.-_gcry_blowfish_arm_encrypt_block; .align 3 .globl _gcry_blowfish_arm_decrypt_block .type _gcry_blowfish_arm_decrypt_block,%function; _gcry_blowfish_arm_decrypt_block: /* input: - * %r0: ctx, CTX - * %r1: dst - * %r2: src + * r0: ctx, CTX + * r1: dst + * r2: src */ - push {%r4-%r11, %ip, %lr}; + push {r4-r11, ip, lr}; add CTXs1, CTXs0, #(s1 - s0); add CTXs2, CTXs0, #(s2 - s0); mov RMASK, #(0xff << 2); /* byte mask */ add CTXs3, CTXs1, #(s3 - s1); - read_block(%r2, 0, RL0, RR0, RT0); + read_block(r2, 0, RL0, RR0, RT0); load_roundkey_dec(17); round_dec(15); round_dec(13); round_dec(11); round_dec(9); round_dec(7); round_dec(5); round_dec(3); round_dec(1); add_roundkey_dec(); - write_block(%r1, 0, RR0, RL0, RT0, RT1); + write_block(r1, 0, RR0, RL0, RT0, RT1); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .size _gcry_blowfish_arm_decrypt_block,.-_gcry_blowfish_arm_decrypt_block; /*********************************************************************** * 2-way blowfish ***********************************************************************/ #define F2(n, l0, r0, l1, r1, set_nextk, dec) \ \ and RT0, RMASK, l0, lsr#(24 - 2); \ and RT1, RMASK, l0, lsr#(16 - 2); \ and RT2, RMASK, l0, lsr#(8 - 2); \ add RT1, #(s1 - s0); \ \ ldr RT0, [CTXs0, RT0]; \ and RT3, RMASK, l0, lsl#2; \ ldr RT1, [CTXs0, RT1]; \ add RT3, #(s3 - s2); \ ldr RT2, [CTXs2, RT2]; \ add RT0, RT1; \ ldr RT3, [CTXs2, RT3]; \ \ and RT1, RMASK, l1, lsr#(24 - 2); \ eor RT0, RT2; \ and RT2, RMASK, l1, lsr#(16 - 2); \ add RT0, RT3; \ add RT2, #(s1 - s0); \ and RT3, RMASK, l1, lsr#(8 - 2); \ eor r0, RT0; \ \ ldr RT1, [CTXs0, RT1]; \ and RT0, RMASK, l1, lsl#2; \ ldr RT2, [CTXs0, RT2]; \ add RT0, #(s3 - s2); \ ldr RT3, [CTXs2, RT3]; \ add RT1, RT2; \ ldr RT0, [CTXs2, RT0]; \ \ and RT2, RMASK, r0, lsr#(24 - 2); \ eor RT1, RT3; \ and RT3, RMASK, r0, lsr#(16 - 2); \ add RT1, RT0; \ add RT3, #(s1 - s0); \ and RT0, RMASK, r0, lsr#(8 - 2); \ eor r1, RT1; \ \ ldr RT2, [CTXs0, RT2]; \ and RT1, RMASK, r0, lsl#2; \ ldr RT3, [CTXs0, RT3]; \ add RT1, #(s3 - s2); \ ldr RT0, [CTXs2, RT0]; \ add RT2, RT3; \ ldr RT1, [CTXs2, RT1]; \ \ and RT3, RMASK, r1, lsr#(24 - 2); \ eor RT2, RT0; \ and RT0, RMASK, r1, lsr#(16 
- 2); \ add RT2, RT1; \ add RT0, #(s1 - s0); \ and RT1, RMASK, r1, lsr#(8 - 2); \ eor l0, RT2; \ \ ldr RT3, [CTXs0, RT3]; \ and RT2, RMASK, r1, lsl#2; \ ldr RT0, [CTXs0, RT0]; \ add RT2, #(s3 - s2); \ ldr RT1, [CTXs2, RT1]; \ eor l1, RKEYL; \ ldr RT2, [CTXs2, RT2]; \ \ eor r0, RKEYR; \ add RT3, RT0; \ eor r1, RKEYR; \ eor RT3, RT1; \ eor l0, RKEYL; \ add RT3, RT2; \ set_nextk(RKEYL, (p - s2) + (4 * (n) + ((dec) * 4))); \ eor l1, RT3; \ set_nextk(RKEYR, (p - s2) + (4 * (n) + (!(dec) * 4))); #define load_n_add_roundkey_enc2(n) \ load_roundkey_enc(n); \ eor RL0, RKEYL; \ eor RR0, RKEYR; \ eor RL1, RKEYL; \ eor RR1, RKEYR; \ load_roundkey_enc((n) + 2); #define next_key(reg, offs) \ ldr reg, [CTXs2, #(offs)]; #define dummy(x, y) /* do nothing */ #define round_enc2(n, load_next_key) \ F2((n) + 2, RL0, RR0, RL1, RR1, load_next_key, 0); #define load_n_add_roundkey_dec2(n) \ load_roundkey_dec(n); \ eor RL0, RKEYL; \ eor RR0, RKEYR; \ eor RL1, RKEYL; \ eor RR1, RKEYR; \ load_roundkey_dec((n) - 2); #define round_dec2(n, load_next_key) \ F2((n) - 3, RL0, RR0, RL1, RR1, load_next_key, 1); #define read_block2_aligned(rin, l0, r0, l1, r1, convert, rtmp) \ ldr l0, [rin, #(0)]; \ ldr r0, [rin, #(4)]; \ convert(l0, rtmp); \ ldr l1, [rin, #(8)]; \ convert(r0, rtmp); \ ldr r1, [rin, #(12)]; \ convert(l1, rtmp); \ convert(r1, rtmp); #define write_block2_aligned(rout, l0, r0, l1, r1, convert, rtmp) \ convert(l0, rtmp); \ convert(r0, rtmp); \ convert(l1, rtmp); \ str l0, [rout, #(0)]; \ convert(r1, rtmp); \ str r0, [rout, #(4)]; \ str l1, [rout, #(8)]; \ str r1, [rout, #(12)]; #ifdef __ARM_FEATURE_UNALIGNED /* unaligned word reads allowed */ #define read_block2(rin, l0, r0, l1, r1, rtmp0) \ read_block2_aligned(rin, l0, r0, l1, r1, host_to_be, rtmp0) #define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \ write_block2_aligned(rout, l0, r0, l1, r1, be_to_host, rtmp0) #define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \ read_block2_aligned(rin, l0, r0, l1, r1, host_to_host, rtmp0) #define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \ write_block2_aligned(rout, l0, r0, l1, r1, host_to_host, rtmp0) #else /* need to handle unaligned reads by byte reads */ #define read_block2(rin, l0, r0, l1, r1, rtmp0) \ tst rin, #3; \ beq 1f; \ ldr_unaligned_be(l0, rin, 0, rtmp0); \ ldr_unaligned_be(r0, rin, 4, rtmp0); \ ldr_unaligned_be(l1, rin, 8, rtmp0); \ ldr_unaligned_be(r1, rin, 12, rtmp0); \ b 2f; \ 1:;\ read_block2_aligned(rin, l0, r0, l1, r1, host_to_be, rtmp0); \ 2:; #define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \ tst rout, #3; \ beq 1f; \ str_unaligned_be(l0, rout, 0, rtmp0, rtmp1); \ str_unaligned_be(r0, rout, 4, rtmp0, rtmp1); \ str_unaligned_be(l1, rout, 8, rtmp0, rtmp1); \ str_unaligned_be(r1, rout, 12, rtmp0, rtmp1); \ b 2f; \ 1:;\ write_block2_aligned(rout, l0, r0, l1, r1, be_to_host, rtmp0); \ 2:; #define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \ tst rin, #3; \ beq 1f; \ ldr_unaligned_host(l0, rin, 0, rtmp0); \ ldr_unaligned_host(r0, rin, 4, rtmp0); \ ldr_unaligned_host(l1, rin, 8, rtmp0); \ ldr_unaligned_host(r1, rin, 12, rtmp0); \ b 2f; \ 1:;\ read_block2_aligned(rin, l0, r0, l1, r1, host_to_host, rtmp0); \ 2:; #define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \ tst rout, #3; \ beq 1f; \ str_unaligned_host(l0, rout, 0, rtmp0, rtmp1); \ str_unaligned_host(r0, rout, 4, rtmp0, rtmp1); \ str_unaligned_host(l1, rout, 8, rtmp0, rtmp1); \ str_unaligned_host(r1, rout, 12, rtmp0, rtmp1); \ b 2f; \ 1:;\ write_block2_aligned(rout, l0, r0, l1, r1, host_to_host, rtmp0); \ 2:; #endif 
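The read_block2()/write_block2() macros just closed above move two 64-bit blocks as four big-endian 32-bit words: when __ARM_FEATURE_UNALIGNED is unavailable they test the low two address bits and fall back to byte-wise accesses, otherwise they use plain word loads/stores followed by host_to_be (a single rev on ARMv6, or the eor/bic/eor sequence on older cores). A minimal C sketch of the read path, assuming a little-endian host and the GCC/clang __builtin_bswap32 builtin; the sketch is illustrative and not taken from the patch:

#include <stdint.h>
#include <string.h>

/* Assemble one big-endian 32-bit word from four bytes (ldr_unaligned_be). */
static uint32_t load_be32(const uint8_t *p)
{
  return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16)
         | ((uint32_t)p[2] << 8) | (uint32_t)p[3];
}

/* Read two 8-byte blocks as four big-endian words, like read_block2(). */
static void read_block2_sketch(const void *src, uint32_t w[4])
{
  const uint8_t *in = src;
  int i;

  if ((uintptr_t)in & 3)
    {
      /* Unaligned source: byte loads, as in the ldr_unaligned_be path. */
      for (i = 0; i < 4; i++)
        w[i] = load_be32(in + 4 * i);
    }
  else
    {
      /* Aligned source: word loads plus host_to_be (a 'rev' on ARMv6). */
      for (i = 0; i < 4; i++)
        {
          uint32_t t;
          memcpy(&t, in + 4 * i, 4);
          w[i] = __builtin_bswap32(t); /* little-endian host assumed */
        }
    }
}

The write path mirrors this, choosing between str_unaligned_be byte stores and word stores preceded by be_to_host.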
.align 3 .type _gcry_blowfish_arm_enc_blk2,%function; _gcry_blowfish_arm_enc_blk2: /* input: * preloaded: CTX * [RL0, RR0], [RL1, RR1]: src * output: * [RR0, RL0], [RR1, RL1]: dst */ - push {RT0,%lr}; + push {RT0,lr}; add CTXs2, CTXs0, #(s2 - s0); mov RMASK, #(0xff << 2); /* byte mask */ load_n_add_roundkey_enc2(0); round_enc2(2, next_key); round_enc2(4, next_key); round_enc2(6, next_key); round_enc2(8, next_key); round_enc2(10, next_key); round_enc2(12, next_key); round_enc2(14, next_key); round_enc2(16, dummy); host_to_be(RR0, RT0); host_to_be(RL0, RT0); host_to_be(RR1, RT0); host_to_be(RL1, RT0); - pop {RT0,%pc}; + pop {RT0,pc}; .size _gcry_blowfish_arm_enc_blk2,.-_gcry_blowfish_arm_enc_blk2; .align 3 .globl _gcry_blowfish_arm_cfb_dec; .type _gcry_blowfish_arm_cfb_dec,%function; _gcry_blowfish_arm_cfb_dec: /* input: - * %r0: CTX - * %r1: dst (2 blocks) - * %r2: src (2 blocks) - * %r3: iv (64bit) + * r0: CTX + * r1: dst (2 blocks) + * r2: src (2 blocks) + * r3: iv (64bit) */ - push {%r2, %r4-%r11, %ip, %lr}; + push {r2, r4-r11, ip, lr}; - mov %lr, %r3; + mov lr, r3; - /* Load input (iv/%r3 is aligned, src/%r2 might not be) */ - ldm %r3, {RL0, RR0}; + /* Load input (iv/r3 is aligned, src/r2 might not be) */ + ldm r3, {RL0, RR0}; host_to_be(RL0, RT0); host_to_be(RR0, RT0); - read_block(%r2, 0, RL1, RR1, RT0); + read_block(r2, 0, RL1, RR1, RT0); /* Update IV, load src[1] and save to iv[0] */ - read_block_host(%r2, 8, %r5, %r6, RT0); - stm %lr, {%r5, %r6}; + read_block_host(r2, 8, r5, r6, RT0); + stm lr, {r5, r6}; bl _gcry_blowfish_arm_enc_blk2; - /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */ + /* result in RR0:RL0, RR1:RL1 = r4:r3, r10:r9 */ - /* %r1: dst, %r0: %src */ - pop {%r0}; + /* r1: dst, r0: src */ + pop {r0}; /* dst = src ^ result */ - read_block2_host(%r0, %r5, %r6, %r7, %r8, %lr); - eor %r5, %r4; - eor %r6, %r3; - eor %r7, %r10; - eor %r8, %r9; - write_block2_host(%r1, %r5, %r6, %r7, %r8, %r9, %r10); - - pop {%r4-%r11, %ip, %pc}; + read_block2_host(r0, r5, r6, r7, r8, lr); + eor r5, r4; + eor r6, r3; + eor r7, r10; + eor r8, r9; + write_block2_host(r1, r5, r6, r7, r8, r9, r10); + + pop {r4-r11, ip, pc}; .ltorg .size _gcry_blowfish_arm_cfb_dec,.-_gcry_blowfish_arm_cfb_dec; .align 3 .globl _gcry_blowfish_arm_ctr_enc; .type _gcry_blowfish_arm_ctr_enc,%function; _gcry_blowfish_arm_ctr_enc: /* input: - * %r0: CTX - * %r1: dst (2 blocks) - * %r2: src (2 blocks) - * %r3: iv (64bit, big-endian) + * r0: CTX + * r1: dst (2 blocks) + * r2: src (2 blocks) + * r3: iv (64bit, big-endian) */ - push {%r2, %r4-%r11, %ip, %lr}; + push {r2, r4-r11, ip, lr}; - mov %lr, %r3; + mov lr, r3; /* Load IV (big => host endian) */ - read_block_aligned(%lr, 0, RL0, RR0, be_to_host, RT0); + read_block_aligned(lr, 0, RL0, RR0, be_to_host, RT0); /* Construct IVs */ adds RR1, RR0, #1; /* +1 */ adc RL1, RL0, #0; - adds %r6, RR1, #1; /* +2 */ - adc %r5, RL1, #0; + adds r6, RR1, #1; /* +2 */ + adc r5, RL1, #0; /* Store new IV (host => big-endian) */ - write_block_aligned(%lr, 0, %r5, %r6, host_to_be, RT0); + write_block_aligned(lr, 0, r5, r6, host_to_be, RT0); bl _gcry_blowfish_arm_enc_blk2; - /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */ + /* result in RR0:RL0, RR1:RL1 = r4:r3, r10:r9 */ - /* %r1: dst, %r0: %src */ - pop {%r0}; + /* r1: dst, r0: src */ + pop {r0}; /* XOR key-stream with plaintext */ - read_block2_host(%r0, %r5, %r6, %r7, %r8, %lr); - eor %r5, %r4; - eor %r6, %r3; - eor %r7, %r10; - eor %r8, %r9; - write_block2_host(%r1, %r5, %r6, %r7, %r8, %r9, %r10); - - pop {%r4-%r11, %ip, %pc}; + 
read_block2_host(r0, r5, r6, r7, r8, lr); + eor r5, r4; + eor r6, r3; + eor r7, r10; + eor r8, r9; + write_block2_host(r1, r5, r6, r7, r8, r9, r10); + + pop {r4-r11, ip, pc}; .ltorg .size _gcry_blowfish_arm_ctr_enc,.-_gcry_blowfish_arm_ctr_enc; .align 3 .type _gcry_blowfish_arm_dec_blk2,%function; _gcry_blowfish_arm_dec_blk2: /* input: * preloaded: CTX * [RL0, RR0], [RL1, RR1]: src * output: * [RR0, RL0], [RR1, RL1]: dst */ add CTXs2, CTXs0, #(s2 - s0); mov RMASK, #(0xff << 2); /* byte mask */ load_n_add_roundkey_dec2(17); round_dec2(15, next_key); round_dec2(13, next_key); round_dec2(11, next_key); round_dec2(9, next_key); round_dec2(7, next_key); round_dec2(5, next_key); round_dec2(3, next_key); round_dec2(1, dummy); host_to_be(RR0, RT0); host_to_be(RL0, RT0); host_to_be(RR1, RT0); host_to_be(RL1, RT0); b .Ldec_cbc_tail; .ltorg .size _gcry_blowfish_arm_dec_blk2,.-_gcry_blowfish_arm_dec_blk2; .align 3 .globl _gcry_blowfish_arm_cbc_dec; .type _gcry_blowfish_arm_cbc_dec,%function; _gcry_blowfish_arm_cbc_dec: /* input: - * %r0: CTX - * %r1: dst (2 blocks) - * %r2: src (2 blocks) - * %r3: iv (64bit) + * r0: CTX + * r1: dst (2 blocks) + * r2: src (2 blocks) + * r3: iv (64bit) */ - push {%r2-%r11, %ip, %lr}; + push {r2-r11, ip, lr}; - read_block2(%r2, RL0, RR0, RL1, RR1, RT0); + read_block2(r2, RL0, RR0, RL1, RR1, RT0); /* dec_blk2 is only used by cbc_dec, jump directly in/out instead * of function call. */ b _gcry_blowfish_arm_dec_blk2; .Ldec_cbc_tail: - /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */ + /* result in RR0:RL0, RR1:RL1 = r4:r3, r10:r9 */ - /* %r0: %src, %r1: dst, %r2: iv */ - pop {%r0, %r2}; + /* r0: src, r1: dst, r2: iv */ + pop {r0, r2}; - /* load IV+1 (src[0]) to %r7:%r8. Might be unaligned. */ - read_block_host(%r0, 0, %r7, %r8, %r5); - /* load IV (iv[0]) to %r5:%r6. 'iv' is aligned. */ - ldm %r2, {%r5, %r6}; + /* load IV+1 (src[0]) to r7:r8. Might be unaligned. */ + read_block_host(r0, 0, r7, r8, r5); + /* load IV (iv[0]) to r5:r6. 'iv' is aligned. */ + ldm r2, {r5, r6}; /* out[1] ^= IV+1 */ - eor %r10, %r7; - eor %r9, %r8; + eor r10, r7; + eor r9, r8; /* out[0] ^= IV */ - eor %r4, %r5; - eor %r3, %r6; + eor r4, r5; + eor r3, r6; - /* load IV+2 (src[1]) to %r7:%r8. Might be unaligned. */ - read_block_host(%r0, 8, %r7, %r8, %r5); + /* load IV+2 (src[1]) to r7:r8. Might be unaligned. */ + read_block_host(r0, 8, r7, r8, r5); /* store IV+2 to iv[0] (aligned). */ - stm %r2, {%r7, %r8}; + stm r2, {r7, r8}; /* store result to dst[0-3]. Might be unaligned. */ - write_block2_host(%r1, %r4, %r3, %r10, %r9, %r5, %r6); + write_block2_host(r1, r4, r3, r10, r9, r5, r6); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .ltorg .size _gcry_blowfish_arm_cbc_dec,.-_gcry_blowfish_arm_cbc_dec; #endif /*HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS*/ #endif /*__ARM_ARCH >= 6*/ diff --git a/cipher/camellia-arm.S b/cipher/camellia-arm.S index a3d87d11..decd40c2 100644 --- a/cipher/camellia-arm.S +++ b/cipher/camellia-arm.S @@ -1,626 +1,626 @@ /* camellia-arm.S - ARM assembly implementation of Camellia cipher * * Copyright (C) 2013 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. 
* * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #include #if defined(__ARMEL__) #ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS .text .syntax unified .arm #ifdef __PIC__ # define GET_DATA_POINTER(reg, name, rtmp) \ ldr reg, 1f; \ ldr rtmp, 2f; \ b 3f; \ 1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \ 2: .word name(GOT); \ 3: add reg, pc, reg; \ ldr reg, [reg, rtmp]; #else # define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name #endif /* struct camellia_ctx: */ #define key_table 0 /* register macros */ -#define CTX %r0 -#define RTAB1 %ip -#define RTAB3 %r1 -#define RMASK %lr +#define CTX r0 +#define RTAB1 ip +#define RTAB3 r1 +#define RMASK lr -#define IL %r2 -#define IR %r3 +#define IL r2 +#define IR r3 -#define XL %r4 -#define XR %r5 -#define YL %r6 -#define YR %r7 +#define XL r4 +#define XR r5 +#define YL r6 +#define YR r7 -#define RT0 %r8 -#define RT1 %r9 -#define RT2 %r10 -#define RT3 %r11 +#define RT0 r8 +#define RT1 r9 +#define RT2 r10 +#define RT3 r11 /* helper macros */ #define ldr_unaligned_be(rout, rsrc, offs, rtmp) \ ldrb rout, [rsrc, #((offs) + 3)]; \ ldrb rtmp, [rsrc, #((offs) + 2)]; \ orr rout, rout, rtmp, lsl #8; \ ldrb rtmp, [rsrc, #((offs) + 1)]; \ orr rout, rout, rtmp, lsl #16; \ ldrb rtmp, [rsrc, #((offs) + 0)]; \ orr rout, rout, rtmp, lsl #24; #define str_unaligned_be(rin, rdst, offs, rtmp0, rtmp1) \ mov rtmp0, rin, lsr #8; \ strb rin, [rdst, #((offs) + 3)]; \ mov rtmp1, rin, lsr #16; \ strb rtmp0, [rdst, #((offs) + 2)]; \ mov rtmp0, rin, lsr #24; \ strb rtmp1, [rdst, #((offs) + 1)]; \ strb rtmp0, [rdst, #((offs) + 0)]; #ifdef __ARMEL__ #ifdef HAVE_ARM_ARCH_V6 #define host_to_be(reg, rtmp) \ rev reg, reg; #define be_to_host(reg, rtmp) \ rev reg, reg; #else #define host_to_be(reg, rtmp) \ eor rtmp, reg, reg, ror #16; \ mov rtmp, rtmp, lsr #8; \ bic rtmp, rtmp, #65280; \ eor reg, rtmp, reg, ror #8; #define be_to_host(reg, rtmp) \ eor rtmp, reg, reg, ror #16; \ mov rtmp, rtmp, lsr #8; \ bic rtmp, rtmp, #65280; \ eor reg, rtmp, reg, ror #8; #endif #else /* nop on big-endian */ #define host_to_be(reg, rtmp) /*_*/ #define be_to_host(reg, rtmp) /*_*/ #endif #define ldr_input_aligned_be(rin, a, b, c, d, rtmp) \ ldr a, [rin, #0]; \ ldr b, [rin, #4]; \ be_to_host(a, rtmp); \ ldr c, [rin, #8]; \ be_to_host(b, rtmp); \ ldr d, [rin, #12]; \ be_to_host(c, rtmp); \ be_to_host(d, rtmp); #define str_output_aligned_be(rout, a, b, c, d, rtmp) \ be_to_host(a, rtmp); \ be_to_host(b, rtmp); \ str a, [rout, #0]; \ be_to_host(c, rtmp); \ str b, [rout, #4]; \ be_to_host(d, rtmp); \ str c, [rout, #8]; \ str d, [rout, #12]; #ifdef __ARM_FEATURE_UNALIGNED /* unaligned word reads/writes allowed */ #define ldr_input_be(rin, ra, rb, rc, rd, rtmp) \ ldr_input_aligned_be(rin, ra, rb, rc, rd, rtmp) #define str_output_be(rout, ra, rb, rc, rd, rtmp0, rtmp1) \ str_output_aligned_be(rout, ra, rb, rc, rd, rtmp0) #else /* need to handle unaligned reads/writes by byte reads */ #define ldr_input_be(rin, ra, rb, rc, rd, rtmp0) \ tst rin, #3; \ beq 1f; \ ldr_unaligned_be(ra, rin, 0, rtmp0); \ ldr_unaligned_be(rb, rin, 4, rtmp0); \ ldr_unaligned_be(rc, rin, 8, rtmp0); \ ldr_unaligned_be(rd, rin, 12, rtmp0); \ b 2f; \ 1:;\ ldr_input_aligned_be(rin, ra, rb, rc, rd, rtmp0); \ 2:; #define str_output_be(rout, ra, 
rb, rc, rd, rtmp0, rtmp1) \ tst rout, #3; \ beq 1f; \ str_unaligned_be(ra, rout, 0, rtmp0, rtmp1); \ str_unaligned_be(rb, rout, 4, rtmp0, rtmp1); \ str_unaligned_be(rc, rout, 8, rtmp0, rtmp1); \ str_unaligned_be(rd, rout, 12, rtmp0, rtmp1); \ b 2f; \ 1:;\ str_output_aligned_be(rout, ra, rb, rc, rd, rtmp0); \ 2:; #endif /********************************************************************** 1-way camellia **********************************************************************/ #define roundsm(xl, xr, kl, kr, yl, yr) \ ldr RT2, [CTX, #(key_table + ((kl) * 4))]; \ and IR, RMASK, xr, lsl#(4); /*sp1110*/ \ ldr RT3, [CTX, #(key_table + ((kr) * 4))]; \ and IL, RMASK, xl, lsr#(24 - 4); /*sp1110*/ \ and RT0, RMASK, xr, lsr#(16 - 4); /*sp3033*/ \ ldr IR, [RTAB1, IR]; \ and RT1, RMASK, xl, lsr#(8 - 4); /*sp3033*/ \ eor yl, RT2; \ ldr IL, [RTAB1, IL]; \ eor yr, RT3; \ \ ldr RT0, [RTAB3, RT0]; \ add RTAB1, #4; \ ldr RT1, [RTAB3, RT1]; \ add RTAB3, #4; \ \ and RT2, RMASK, xr, lsr#(24 - 4); /*sp0222*/ \ and RT3, RMASK, xl, lsr#(16 - 4); /*sp0222*/ \ \ eor IR, RT0; \ eor IL, RT1; \ \ ldr RT2, [RTAB1, RT2]; \ and RT0, RMASK, xr, lsr#(8 - 4); /*sp4404*/ \ ldr RT3, [RTAB1, RT3]; \ and RT1, RMASK, xl, lsl#(4); /*sp4404*/ \ \ ldr RT0, [RTAB3, RT0]; \ sub RTAB1, #4; \ ldr RT1, [RTAB3, RT1]; \ sub RTAB3, #4; \ \ eor IR, RT2; \ eor IL, RT3; \ eor IR, RT0; \ eor IL, RT1; \ \ eor IR, IL; \ eor yr, yr, IL, ror#8; \ eor yl, IR; \ eor yr, IR; #define enc_rounds(n) \ roundsm(XL, XR, ((n) + 2) * 2 + 0, ((n) + 2) * 2 + 1, YL, YR); \ roundsm(YL, YR, ((n) + 3) * 2 + 0, ((n) + 3) * 2 + 1, XL, XR); \ roundsm(XL, XR, ((n) + 4) * 2 + 0, ((n) + 4) * 2 + 1, YL, YR); \ roundsm(YL, YR, ((n) + 5) * 2 + 0, ((n) + 5) * 2 + 1, XL, XR); \ roundsm(XL, XR, ((n) + 6) * 2 + 0, ((n) + 6) * 2 + 1, YL, YR); \ roundsm(YL, YR, ((n) + 7) * 2 + 0, ((n) + 7) * 2 + 1, XL, XR); #define dec_rounds(n) \ roundsm(XL, XR, ((n) + 7) * 2 + 0, ((n) + 7) * 2 + 1, YL, YR); \ roundsm(YL, YR, ((n) + 6) * 2 + 0, ((n) + 6) * 2 + 1, XL, XR); \ roundsm(XL, XR, ((n) + 5) * 2 + 0, ((n) + 5) * 2 + 1, YL, YR); \ roundsm(YL, YR, ((n) + 4) * 2 + 0, ((n) + 4) * 2 + 1, XL, XR); \ roundsm(XL, XR, ((n) + 3) * 2 + 0, ((n) + 3) * 2 + 1, YL, YR); \ roundsm(YL, YR, ((n) + 2) * 2 + 0, ((n) + 2) * 2 + 1, XL, XR); /* perform FL and FL⁻¹ */ #define fls(ll, lr, rl, rr, kll, klr, krl, krr) \ ldr RT0, [CTX, #(key_table + ((kll) * 4))]; \ ldr RT2, [CTX, #(key_table + ((krr) * 4))]; \ and RT0, ll; \ ldr RT3, [CTX, #(key_table + ((krl) * 4))]; \ orr RT2, rr; \ ldr RT1, [CTX, #(key_table + ((klr) * 4))]; \ eor rl, RT2; \ eor lr, lr, RT0, ror#31; \ and RT3, rl; \ orr RT1, lr; \ eor ll, RT1; \ eor rr, rr, RT3, ror#31; #define enc_fls(n) \ fls(XL, XR, YL, YR, \ (n) * 2 + 0, (n) * 2 + 1, \ (n) * 2 + 2, (n) * 2 + 3); #define dec_fls(n) \ fls(XL, XR, YL, YR, \ (n) * 2 + 2, (n) * 2 + 3, \ (n) * 2 + 0, (n) * 2 + 1); #define inpack(n) \ - ldr_input_be(%r2, XL, XR, YL, YR, RT0); \ + ldr_input_be(r2, XL, XR, YL, YR, RT0); \ ldr RT0, [CTX, #(key_table + ((n) * 8) + 0)]; \ ldr RT1, [CTX, #(key_table + ((n) * 8) + 4)]; \ eor XL, RT0; \ eor XR, RT1; #define outunpack(n) \ ldr RT0, [CTX, #(key_table + ((n) * 8) + 0)]; \ ldr RT1, [CTX, #(key_table + ((n) * 8) + 4)]; \ eor YL, RT0; \ eor YR, RT1; \ - str_output_be(%r1, YL, YR, XL, XR, RT0, RT1); + str_output_be(r1, YL, YR, XL, XR, RT0, RT1); .align 3 .globl _gcry_camellia_arm_encrypt_block .type _gcry_camellia_arm_encrypt_block,%function; _gcry_camellia_arm_encrypt_block: /* input: - * %r0: keytable - * %r1: dst - * %r2: src - * %r3: keybitlen + * r0: 
keytable + * r1: dst + * r2: src + * r3: keybitlen */ - push {%r1, %r4-%r11, %ip, %lr}; + push {r1, r4-r11, ip, lr}; GET_DATA_POINTER(RTAB1, .Lcamellia_sp1110, RTAB3); mov RMASK, #0xff; add RTAB3, RTAB1, #(2 * 4); - push {%r3}; + push {r3}; mov RMASK, RMASK, lsl#4 /* byte mask */ inpack(0); enc_rounds(0); enc_fls(8); enc_rounds(8); enc_fls(16); enc_rounds(16); pop {RT0}; cmp RT0, #(16 * 8); bne .Lenc_256; - pop {%r1}; + pop {r1}; outunpack(24); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .ltorg .Lenc_256: enc_fls(24); enc_rounds(24); - pop {%r1}; + pop {r1}; outunpack(32); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .ltorg .size _gcry_camellia_arm_encrypt_block,.-_gcry_camellia_arm_encrypt_block; .align 3 .globl _gcry_camellia_arm_decrypt_block .type _gcry_camellia_arm_decrypt_block,%function; _gcry_camellia_arm_decrypt_block: /* input: - * %r0: keytable - * %r1: dst - * %r2: src - * %r3: keybitlen + * r0: keytable + * r1: dst + * r2: src + * r3: keybitlen */ - push {%r1, %r4-%r11, %ip, %lr}; + push {r1, r4-r11, ip, lr}; GET_DATA_POINTER(RTAB1, .Lcamellia_sp1110, RTAB3); mov RMASK, #0xff; add RTAB3, RTAB1, #(2 * 4); mov RMASK, RMASK, lsl#4 /* byte mask */ - cmp %r3, #(16 * 8); + cmp r3, #(16 * 8); bne .Ldec_256; inpack(24); .Ldec_128: dec_rounds(16); dec_fls(16); dec_rounds(8); dec_fls(8); dec_rounds(0); - pop {%r1}; + pop {r1}; outunpack(0); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .ltorg .Ldec_256: inpack(32); dec_rounds(24); dec_fls(24); b .Ldec_128; .ltorg .size _gcry_camellia_arm_decrypt_block,.-_gcry_camellia_arm_decrypt_block; .data /* Encryption/Decryption tables */ .align 5 .Lcamellia_sp1110: .long 0x70707000 .Lcamellia_sp0222: .long 0x00e0e0e0 .Lcamellia_sp3033: .long 0x38003838 .Lcamellia_sp4404: .long 0x70700070 .long 0x82828200, 0x00050505, 0x41004141, 0x2c2c002c .long 0x2c2c2c00, 0x00585858, 0x16001616, 0xb3b300b3 .long 0xececec00, 0x00d9d9d9, 0x76007676, 0xc0c000c0 .long 0xb3b3b300, 0x00676767, 0xd900d9d9, 0xe4e400e4 .long 0x27272700, 0x004e4e4e, 0x93009393, 0x57570057 .long 0xc0c0c000, 0x00818181, 0x60006060, 0xeaea00ea .long 0xe5e5e500, 0x00cbcbcb, 0xf200f2f2, 0xaeae00ae .long 0xe4e4e400, 0x00c9c9c9, 0x72007272, 0x23230023 .long 0x85858500, 0x000b0b0b, 0xc200c2c2, 0x6b6b006b .long 0x57575700, 0x00aeaeae, 0xab00abab, 0x45450045 .long 0x35353500, 0x006a6a6a, 0x9a009a9a, 0xa5a500a5 .long 0xeaeaea00, 0x00d5d5d5, 0x75007575, 0xeded00ed .long 0x0c0c0c00, 0x00181818, 0x06000606, 0x4f4f004f .long 0xaeaeae00, 0x005d5d5d, 0x57005757, 0x1d1d001d .long 0x41414100, 0x00828282, 0xa000a0a0, 0x92920092 .long 0x23232300, 0x00464646, 0x91009191, 0x86860086 .long 0xefefef00, 0x00dfdfdf, 0xf700f7f7, 0xafaf00af .long 0x6b6b6b00, 0x00d6d6d6, 0xb500b5b5, 0x7c7c007c .long 0x93939300, 0x00272727, 0xc900c9c9, 0x1f1f001f .long 0x45454500, 0x008a8a8a, 0xa200a2a2, 0x3e3e003e .long 0x19191900, 0x00323232, 0x8c008c8c, 0xdcdc00dc .long 0xa5a5a500, 0x004b4b4b, 0xd200d2d2, 0x5e5e005e .long 0x21212100, 0x00424242, 0x90009090, 0x0b0b000b .long 0xededed00, 0x00dbdbdb, 0xf600f6f6, 0xa6a600a6 .long 0x0e0e0e00, 0x001c1c1c, 0x07000707, 0x39390039 .long 0x4f4f4f00, 0x009e9e9e, 0xa700a7a7, 0xd5d500d5 .long 0x4e4e4e00, 0x009c9c9c, 0x27002727, 0x5d5d005d .long 0x1d1d1d00, 0x003a3a3a, 0x8e008e8e, 0xd9d900d9 .long 0x65656500, 0x00cacaca, 0xb200b2b2, 0x5a5a005a .long 0x92929200, 0x00252525, 0x49004949, 0x51510051 .long 0xbdbdbd00, 0x007b7b7b, 0xde00dede, 0x6c6c006c .long 0x86868600, 0x000d0d0d, 0x43004343, 0x8b8b008b .long 0xb8b8b800, 0x00717171, 0x5c005c5c, 0x9a9a009a .long 0xafafaf00, 
0x005f5f5f, 0xd700d7d7, 0xfbfb00fb .long 0x8f8f8f00, 0x001f1f1f, 0xc700c7c7, 0xb0b000b0 .long 0x7c7c7c00, 0x00f8f8f8, 0x3e003e3e, 0x74740074 .long 0xebebeb00, 0x00d7d7d7, 0xf500f5f5, 0x2b2b002b .long 0x1f1f1f00, 0x003e3e3e, 0x8f008f8f, 0xf0f000f0 .long 0xcecece00, 0x009d9d9d, 0x67006767, 0x84840084 .long 0x3e3e3e00, 0x007c7c7c, 0x1f001f1f, 0xdfdf00df .long 0x30303000, 0x00606060, 0x18001818, 0xcbcb00cb .long 0xdcdcdc00, 0x00b9b9b9, 0x6e006e6e, 0x34340034 .long 0x5f5f5f00, 0x00bebebe, 0xaf00afaf, 0x76760076 .long 0x5e5e5e00, 0x00bcbcbc, 0x2f002f2f, 0x6d6d006d .long 0xc5c5c500, 0x008b8b8b, 0xe200e2e2, 0xa9a900a9 .long 0x0b0b0b00, 0x00161616, 0x85008585, 0xd1d100d1 .long 0x1a1a1a00, 0x00343434, 0x0d000d0d, 0x04040004 .long 0xa6a6a600, 0x004d4d4d, 0x53005353, 0x14140014 .long 0xe1e1e100, 0x00c3c3c3, 0xf000f0f0, 0x3a3a003a .long 0x39393900, 0x00727272, 0x9c009c9c, 0xdede00de .long 0xcacaca00, 0x00959595, 0x65006565, 0x11110011 .long 0xd5d5d500, 0x00ababab, 0xea00eaea, 0x32320032 .long 0x47474700, 0x008e8e8e, 0xa300a3a3, 0x9c9c009c .long 0x5d5d5d00, 0x00bababa, 0xae00aeae, 0x53530053 .long 0x3d3d3d00, 0x007a7a7a, 0x9e009e9e, 0xf2f200f2 .long 0xd9d9d900, 0x00b3b3b3, 0xec00ecec, 0xfefe00fe .long 0x01010100, 0x00020202, 0x80008080, 0xcfcf00cf .long 0x5a5a5a00, 0x00b4b4b4, 0x2d002d2d, 0xc3c300c3 .long 0xd6d6d600, 0x00adadad, 0x6b006b6b, 0x7a7a007a .long 0x51515100, 0x00a2a2a2, 0xa800a8a8, 0x24240024 .long 0x56565600, 0x00acacac, 0x2b002b2b, 0xe8e800e8 .long 0x6c6c6c00, 0x00d8d8d8, 0x36003636, 0x60600060 .long 0x4d4d4d00, 0x009a9a9a, 0xa600a6a6, 0x69690069 .long 0x8b8b8b00, 0x00171717, 0xc500c5c5, 0xaaaa00aa .long 0x0d0d0d00, 0x001a1a1a, 0x86008686, 0xa0a000a0 .long 0x9a9a9a00, 0x00353535, 0x4d004d4d, 0xa1a100a1 .long 0x66666600, 0x00cccccc, 0x33003333, 0x62620062 .long 0xfbfbfb00, 0x00f7f7f7, 0xfd00fdfd, 0x54540054 .long 0xcccccc00, 0x00999999, 0x66006666, 0x1e1e001e .long 0xb0b0b000, 0x00616161, 0x58005858, 0xe0e000e0 .long 0x2d2d2d00, 0x005a5a5a, 0x96009696, 0x64640064 .long 0x74747400, 0x00e8e8e8, 0x3a003a3a, 0x10100010 .long 0x12121200, 0x00242424, 0x09000909, 0x00000000 .long 0x2b2b2b00, 0x00565656, 0x95009595, 0xa3a300a3 .long 0x20202000, 0x00404040, 0x10001010, 0x75750075 .long 0xf0f0f000, 0x00e1e1e1, 0x78007878, 0x8a8a008a .long 0xb1b1b100, 0x00636363, 0xd800d8d8, 0xe6e600e6 .long 0x84848400, 0x00090909, 0x42004242, 0x09090009 .long 0x99999900, 0x00333333, 0xcc00cccc, 0xdddd00dd .long 0xdfdfdf00, 0x00bfbfbf, 0xef00efef, 0x87870087 .long 0x4c4c4c00, 0x00989898, 0x26002626, 0x83830083 .long 0xcbcbcb00, 0x00979797, 0xe500e5e5, 0xcdcd00cd .long 0xc2c2c200, 0x00858585, 0x61006161, 0x90900090 .long 0x34343400, 0x00686868, 0x1a001a1a, 0x73730073 .long 0x7e7e7e00, 0x00fcfcfc, 0x3f003f3f, 0xf6f600f6 .long 0x76767600, 0x00ececec, 0x3b003b3b, 0x9d9d009d .long 0x05050500, 0x000a0a0a, 0x82008282, 0xbfbf00bf .long 0x6d6d6d00, 0x00dadada, 0xb600b6b6, 0x52520052 .long 0xb7b7b700, 0x006f6f6f, 0xdb00dbdb, 0xd8d800d8 .long 0xa9a9a900, 0x00535353, 0xd400d4d4, 0xc8c800c8 .long 0x31313100, 0x00626262, 0x98009898, 0xc6c600c6 .long 0xd1d1d100, 0x00a3a3a3, 0xe800e8e8, 0x81810081 .long 0x17171700, 0x002e2e2e, 0x8b008b8b, 0x6f6f006f .long 0x04040400, 0x00080808, 0x02000202, 0x13130013 .long 0xd7d7d700, 0x00afafaf, 0xeb00ebeb, 0x63630063 .long 0x14141400, 0x00282828, 0x0a000a0a, 0xe9e900e9 .long 0x58585800, 0x00b0b0b0, 0x2c002c2c, 0xa7a700a7 .long 0x3a3a3a00, 0x00747474, 0x1d001d1d, 0x9f9f009f .long 0x61616100, 0x00c2c2c2, 0xb000b0b0, 0xbcbc00bc .long 0xdedede00, 0x00bdbdbd, 0x6f006f6f, 0x29290029 .long 0x1b1b1b00, 
0x00363636, 0x8d008d8d, 0xf9f900f9 .long 0x11111100, 0x00222222, 0x88008888, 0x2f2f002f .long 0x1c1c1c00, 0x00383838, 0x0e000e0e, 0xb4b400b4 .long 0x32323200, 0x00646464, 0x19001919, 0x78780078 .long 0x0f0f0f00, 0x001e1e1e, 0x87008787, 0x06060006 .long 0x9c9c9c00, 0x00393939, 0x4e004e4e, 0xe7e700e7 .long 0x16161600, 0x002c2c2c, 0x0b000b0b, 0x71710071 .long 0x53535300, 0x00a6a6a6, 0xa900a9a9, 0xd4d400d4 .long 0x18181800, 0x00303030, 0x0c000c0c, 0xabab00ab .long 0xf2f2f200, 0x00e5e5e5, 0x79007979, 0x88880088 .long 0x22222200, 0x00444444, 0x11001111, 0x8d8d008d .long 0xfefefe00, 0x00fdfdfd, 0x7f007f7f, 0x72720072 .long 0x44444400, 0x00888888, 0x22002222, 0xb9b900b9 .long 0xcfcfcf00, 0x009f9f9f, 0xe700e7e7, 0xf8f800f8 .long 0xb2b2b200, 0x00656565, 0x59005959, 0xacac00ac .long 0xc3c3c300, 0x00878787, 0xe100e1e1, 0x36360036 .long 0xb5b5b500, 0x006b6b6b, 0xda00dada, 0x2a2a002a .long 0x7a7a7a00, 0x00f4f4f4, 0x3d003d3d, 0x3c3c003c .long 0x91919100, 0x00232323, 0xc800c8c8, 0xf1f100f1 .long 0x24242400, 0x00484848, 0x12001212, 0x40400040 .long 0x08080800, 0x00101010, 0x04000404, 0xd3d300d3 .long 0xe8e8e800, 0x00d1d1d1, 0x74007474, 0xbbbb00bb .long 0xa8a8a800, 0x00515151, 0x54005454, 0x43430043 .long 0x60606000, 0x00c0c0c0, 0x30003030, 0x15150015 .long 0xfcfcfc00, 0x00f9f9f9, 0x7e007e7e, 0xadad00ad .long 0x69696900, 0x00d2d2d2, 0xb400b4b4, 0x77770077 .long 0x50505000, 0x00a0a0a0, 0x28002828, 0x80800080 .long 0xaaaaaa00, 0x00555555, 0x55005555, 0x82820082 .long 0xd0d0d000, 0x00a1a1a1, 0x68006868, 0xecec00ec .long 0xa0a0a000, 0x00414141, 0x50005050, 0x27270027 .long 0x7d7d7d00, 0x00fafafa, 0xbe00bebe, 0xe5e500e5 .long 0xa1a1a100, 0x00434343, 0xd000d0d0, 0x85850085 .long 0x89898900, 0x00131313, 0xc400c4c4, 0x35350035 .long 0x62626200, 0x00c4c4c4, 0x31003131, 0x0c0c000c .long 0x97979700, 0x002f2f2f, 0xcb00cbcb, 0x41410041 .long 0x54545400, 0x00a8a8a8, 0x2a002a2a, 0xefef00ef .long 0x5b5b5b00, 0x00b6b6b6, 0xad00adad, 0x93930093 .long 0x1e1e1e00, 0x003c3c3c, 0x0f000f0f, 0x19190019 .long 0x95959500, 0x002b2b2b, 0xca00caca, 0x21210021 .long 0xe0e0e000, 0x00c1c1c1, 0x70007070, 0x0e0e000e .long 0xffffff00, 0x00ffffff, 0xff00ffff, 0x4e4e004e .long 0x64646400, 0x00c8c8c8, 0x32003232, 0x65650065 .long 0xd2d2d200, 0x00a5a5a5, 0x69006969, 0xbdbd00bd .long 0x10101000, 0x00202020, 0x08000808, 0xb8b800b8 .long 0xc4c4c400, 0x00898989, 0x62006262, 0x8f8f008f .long 0x00000000, 0x00000000, 0x00000000, 0xebeb00eb .long 0x48484800, 0x00909090, 0x24002424, 0xcece00ce .long 0xa3a3a300, 0x00474747, 0xd100d1d1, 0x30300030 .long 0xf7f7f700, 0x00efefef, 0xfb00fbfb, 0x5f5f005f .long 0x75757500, 0x00eaeaea, 0xba00baba, 0xc5c500c5 .long 0xdbdbdb00, 0x00b7b7b7, 0xed00eded, 0x1a1a001a .long 0x8a8a8a00, 0x00151515, 0x45004545, 0xe1e100e1 .long 0x03030300, 0x00060606, 0x81008181, 0xcaca00ca .long 0xe6e6e600, 0x00cdcdcd, 0x73007373, 0x47470047 .long 0xdadada00, 0x00b5b5b5, 0x6d006d6d, 0x3d3d003d .long 0x09090900, 0x00121212, 0x84008484, 0x01010001 .long 0x3f3f3f00, 0x007e7e7e, 0x9f009f9f, 0xd6d600d6 .long 0xdddddd00, 0x00bbbbbb, 0xee00eeee, 0x56560056 .long 0x94949400, 0x00292929, 0x4a004a4a, 0x4d4d004d .long 0x87878700, 0x000f0f0f, 0xc300c3c3, 0x0d0d000d .long 0x5c5c5c00, 0x00b8b8b8, 0x2e002e2e, 0x66660066 .long 0x83838300, 0x00070707, 0xc100c1c1, 0xcccc00cc .long 0x02020200, 0x00040404, 0x01000101, 0x2d2d002d .long 0xcdcdcd00, 0x009b9b9b, 0xe600e6e6, 0x12120012 .long 0x4a4a4a00, 0x00949494, 0x25002525, 0x20200020 .long 0x90909000, 0x00212121, 0x48004848, 0xb1b100b1 .long 0x33333300, 0x00666666, 0x99009999, 0x99990099 .long 0x73737300, 
0x00e6e6e6, 0xb900b9b9, 0x4c4c004c .long 0x67676700, 0x00cecece, 0xb300b3b3, 0xc2c200c2 .long 0xf6f6f600, 0x00ededed, 0x7b007b7b, 0x7e7e007e .long 0xf3f3f300, 0x00e7e7e7, 0xf900f9f9, 0x05050005 .long 0x9d9d9d00, 0x003b3b3b, 0xce00cece, 0xb7b700b7 .long 0x7f7f7f00, 0x00fefefe, 0xbf00bfbf, 0x31310031 .long 0xbfbfbf00, 0x007f7f7f, 0xdf00dfdf, 0x17170017 .long 0xe2e2e200, 0x00c5c5c5, 0x71007171, 0xd7d700d7 .long 0x52525200, 0x00a4a4a4, 0x29002929, 0x58580058 .long 0x9b9b9b00, 0x00373737, 0xcd00cdcd, 0x61610061 .long 0xd8d8d800, 0x00b1b1b1, 0x6c006c6c, 0x1b1b001b .long 0x26262600, 0x004c4c4c, 0x13001313, 0x1c1c001c .long 0xc8c8c800, 0x00919191, 0x64006464, 0x0f0f000f .long 0x37373700, 0x006e6e6e, 0x9b009b9b, 0x16160016 .long 0xc6c6c600, 0x008d8d8d, 0x63006363, 0x18180018 .long 0x3b3b3b00, 0x00767676, 0x9d009d9d, 0x22220022 .long 0x81818100, 0x00030303, 0xc000c0c0, 0x44440044 .long 0x96969600, 0x002d2d2d, 0x4b004b4b, 0xb2b200b2 .long 0x6f6f6f00, 0x00dedede, 0xb700b7b7, 0xb5b500b5 .long 0x4b4b4b00, 0x00969696, 0xa500a5a5, 0x91910091 .long 0x13131300, 0x00262626, 0x89008989, 0x08080008 .long 0xbebebe00, 0x007d7d7d, 0x5f005f5f, 0xa8a800a8 .long 0x63636300, 0x00c6c6c6, 0xb100b1b1, 0xfcfc00fc .long 0x2e2e2e00, 0x005c5c5c, 0x17001717, 0x50500050 .long 0xe9e9e900, 0x00d3d3d3, 0xf400f4f4, 0xd0d000d0 .long 0x79797900, 0x00f2f2f2, 0xbc00bcbc, 0x7d7d007d .long 0xa7a7a700, 0x004f4f4f, 0xd300d3d3, 0x89890089 .long 0x8c8c8c00, 0x00191919, 0x46004646, 0x97970097 .long 0x9f9f9f00, 0x003f3f3f, 0xcf00cfcf, 0x5b5b005b .long 0x6e6e6e00, 0x00dcdcdc, 0x37003737, 0x95950095 .long 0xbcbcbc00, 0x00797979, 0x5e005e5e, 0xffff00ff .long 0x8e8e8e00, 0x001d1d1d, 0x47004747, 0xd2d200d2 .long 0x29292900, 0x00525252, 0x94009494, 0xc4c400c4 .long 0xf5f5f500, 0x00ebebeb, 0xfa00fafa, 0x48480048 .long 0xf9f9f900, 0x00f3f3f3, 0xfc00fcfc, 0xf7f700f7 .long 0xb6b6b600, 0x006d6d6d, 0x5b005b5b, 0xdbdb00db .long 0x2f2f2f00, 0x005e5e5e, 0x97009797, 0x03030003 .long 0xfdfdfd00, 0x00fbfbfb, 0xfe00fefe, 0xdada00da .long 0xb4b4b400, 0x00696969, 0x5a005a5a, 0x3f3f003f .long 0x59595900, 0x00b2b2b2, 0xac00acac, 0x94940094 .long 0x78787800, 0x00f0f0f0, 0x3c003c3c, 0x5c5c005c .long 0x98989800, 0x00313131, 0x4c004c4c, 0x02020002 .long 0x06060600, 0x000c0c0c, 0x03000303, 0x4a4a004a .long 0x6a6a6a00, 0x00d4d4d4, 0x35003535, 0x33330033 .long 0xe7e7e700, 0x00cfcfcf, 0xf300f3f3, 0x67670067 .long 0x46464600, 0x008c8c8c, 0x23002323, 0xf3f300f3 .long 0x71717100, 0x00e2e2e2, 0xb800b8b8, 0x7f7f007f .long 0xbababa00, 0x00757575, 0x5d005d5d, 0xe2e200e2 .long 0xd4d4d400, 0x00a9a9a9, 0x6a006a6a, 0x9b9b009b .long 0x25252500, 0x004a4a4a, 0x92009292, 0x26260026 .long 0xababab00, 0x00575757, 0xd500d5d5, 0x37370037 .long 0x42424200, 0x00848484, 0x21002121, 0x3b3b003b .long 0x88888800, 0x00111111, 0x44004444, 0x96960096 .long 0xa2a2a200, 0x00454545, 0x51005151, 0x4b4b004b .long 0x8d8d8d00, 0x001b1b1b, 0xc600c6c6, 0xbebe00be .long 0xfafafa00, 0x00f5f5f5, 0x7d007d7d, 0x2e2e002e .long 0x72727200, 0x00e4e4e4, 0x39003939, 0x79790079 .long 0x07070700, 0x000e0e0e, 0x83008383, 0x8c8c008c .long 0xb9b9b900, 0x00737373, 0xdc00dcdc, 0x6e6e006e .long 0x55555500, 0x00aaaaaa, 0xaa00aaaa, 0x8e8e008e .long 0xf8f8f800, 0x00f1f1f1, 0x7c007c7c, 0xf5f500f5 .long 0xeeeeee00, 0x00dddddd, 0x77007777, 0xb6b600b6 .long 0xacacac00, 0x00595959, 0x56005656, 0xfdfd00fd .long 0x0a0a0a00, 0x00141414, 0x05000505, 0x59590059 .long 0x36363600, 0x006c6c6c, 0x1b001b1b, 0x98980098 .long 0x49494900, 0x00929292, 0xa400a4a4, 0x6a6a006a .long 0x2a2a2a00, 0x00545454, 0x15001515, 0x46460046 .long 0x68686800, 
0x00d0d0d0, 0x34003434, 0xbaba00ba .long 0x3c3c3c00, 0x00787878, 0x1e001e1e, 0x25250025 .long 0x38383800, 0x00707070, 0x1c001c1c, 0x42420042 .long 0xf1f1f100, 0x00e3e3e3, 0xf800f8f8, 0xa2a200a2 .long 0xa4a4a400, 0x00494949, 0x52005252, 0xfafa00fa .long 0x40404000, 0x00808080, 0x20002020, 0x07070007 .long 0x28282800, 0x00505050, 0x14001414, 0x55550055 .long 0xd3d3d300, 0x00a7a7a7, 0xe900e9e9, 0xeeee00ee .long 0x7b7b7b00, 0x00f6f6f6, 0xbd00bdbd, 0x0a0a000a .long 0xbbbbbb00, 0x00777777, 0xdd00dddd, 0x49490049 .long 0xc9c9c900, 0x00939393, 0xe400e4e4, 0x68680068 .long 0x43434300, 0x00868686, 0xa100a1a1, 0x38380038 .long 0xc1c1c100, 0x00838383, 0xe000e0e0, 0xa4a400a4 .long 0x15151500, 0x002a2a2a, 0x8a008a8a, 0x28280028 .long 0xe3e3e300, 0x00c7c7c7, 0xf100f1f1, 0x7b7b007b .long 0xadadad00, 0x005b5b5b, 0xd600d6d6, 0xc9c900c9 .long 0xf4f4f400, 0x00e9e9e9, 0x7a007a7a, 0xc1c100c1 .long 0x77777700, 0x00eeeeee, 0xbb00bbbb, 0xe3e300e3 .long 0xc7c7c700, 0x008f8f8f, 0xe300e3e3, 0xf4f400f4 .long 0x80808000, 0x00010101, 0x40004040, 0xc7c700c7 .long 0x9e9e9e00, 0x003d3d3d, 0x4f004f4f, 0x9e9e009e #endif /*HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS*/ #endif /*__ARM_ARCH >= 6*/ diff --git a/cipher/cast5-arm.S b/cipher/cast5-arm.S index 76ddd2e3..ae53e6b4 100644 --- a/cipher/cast5-arm.S +++ b/cipher/cast5-arm.S @@ -1,728 +1,728 @@ /* cast5-arm.S - ARM assembly implementation of CAST5 cipher * * Copyright (C) 2013 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
*/ #include #if defined(__ARMEL__) #ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS .text .syntax unified .arm .extern _gcry_cast5_s1to4; #ifdef __PIC__ # define GET_DATA_POINTER(reg, name, rtmp) \ ldr reg, 1f; \ ldr rtmp, 2f; \ b 3f; \ 1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \ 2: .word name(GOT); \ 3: add reg, pc, reg; \ ldr reg, [reg, rtmp]; #else # define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name #endif /* structure of crypto context */ #define Km 0 #define Kr (Km + (16 * 4)) #define Kr_arm_enc (Kr + (16)) #define Kr_arm_dec (Kr_arm_enc + (16)) /* register macros */ -#define CTX %r0 -#define Rs1 %r7 -#define Rs2 %r8 -#define Rs3 %r9 -#define Rs4 %r10 -#define RMASK %r11 -#define RKM %r1 -#define RKR %r2 - -#define RL0 %r3 -#define RR0 %r4 - -#define RL1 %r9 -#define RR1 %r10 - -#define RT0 %lr -#define RT1 %ip -#define RT2 %r5 -#define RT3 %r6 +#define CTX r0 +#define Rs1 r7 +#define Rs2 r8 +#define Rs3 r9 +#define Rs4 r10 +#define RMASK r11 +#define RKM r1 +#define RKR r2 + +#define RL0 r3 +#define RR0 r4 + +#define RL1 r9 +#define RR1 r10 + +#define RT0 lr +#define RT1 ip +#define RT2 r5 +#define RT3 r6 /* helper macros */ #define ldr_unaligned_le(rout, rsrc, offs, rtmp) \ ldrb rout, [rsrc, #((offs) + 0)]; \ ldrb rtmp, [rsrc, #((offs) + 1)]; \ orr rout, rout, rtmp, lsl #8; \ ldrb rtmp, [rsrc, #((offs) + 2)]; \ orr rout, rout, rtmp, lsl #16; \ ldrb rtmp, [rsrc, #((offs) + 3)]; \ orr rout, rout, rtmp, lsl #24; #define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \ mov rtmp0, rin, lsr #8; \ strb rin, [rdst, #((offs) + 0)]; \ mov rtmp1, rin, lsr #16; \ strb rtmp0, [rdst, #((offs) + 1)]; \ mov rtmp0, rin, lsr #24; \ strb rtmp1, [rdst, #((offs) + 2)]; \ strb rtmp0, [rdst, #((offs) + 3)]; #define ldr_unaligned_be(rout, rsrc, offs, rtmp) \ ldrb rout, [rsrc, #((offs) + 3)]; \ ldrb rtmp, [rsrc, #((offs) + 2)]; \ orr rout, rout, rtmp, lsl #8; \ ldrb rtmp, [rsrc, #((offs) + 1)]; \ orr rout, rout, rtmp, lsl #16; \ ldrb rtmp, [rsrc, #((offs) + 0)]; \ orr rout, rout, rtmp, lsl #24; #define str_unaligned_be(rin, rdst, offs, rtmp0, rtmp1) \ mov rtmp0, rin, lsr #8; \ strb rin, [rdst, #((offs) + 3)]; \ mov rtmp1, rin, lsr #16; \ strb rtmp0, [rdst, #((offs) + 2)]; \ mov rtmp0, rin, lsr #24; \ strb rtmp1, [rdst, #((offs) + 1)]; \ strb rtmp0, [rdst, #((offs) + 0)]; #ifdef __ARMEL__ #define ldr_unaligned_host ldr_unaligned_le #define str_unaligned_host str_unaligned_le /* bswap on little-endian */ #ifdef HAVE_ARM_ARCH_V6 #define host_to_be(reg, rtmp) \ rev reg, reg; #define be_to_host(reg, rtmp) \ rev reg, reg; #else #define host_to_be(reg, rtmp) \ eor rtmp, reg, reg, ror #16; \ mov rtmp, rtmp, lsr #8; \ bic rtmp, rtmp, #65280; \ eor reg, rtmp, reg, ror #8; #define be_to_host(reg, rtmp) \ eor rtmp, reg, reg, ror #16; \ mov rtmp, rtmp, lsr #8; \ bic rtmp, rtmp, #65280; \ eor reg, rtmp, reg, ror #8; #endif #else #define ldr_unaligned_host ldr_unaligned_be #define str_unaligned_host str_unaligned_be /* nop on big-endian */ #define host_to_be(reg, rtmp) /*_*/ #define be_to_host(reg, rtmp) /*_*/ #endif #define host_to_host(x, y) /*_*/ /********************************************************************** 1-way cast5 **********************************************************************/ #define dummy(n) /*_*/ #define load_kr(n) \ ldr RKR, [CTX, #(Kr_arm_enc + (n))]; /* Kr[n] */ #define load_dec_kr(n) \ ldr RKR, [CTX, #(Kr_arm_dec + (n) - 3)]; /* Kr[n] */ #define load_km(n) \ ldr RKM, [CTX, #(Km + (n) * 4)]; /* Km[n] */ #define shift_kr(dummy) \ mov RKR, RKR, lsr #8; #define F(n, rl, rr, op1, op2, op3, op4, 
dec, loadkm, shiftkr, loadkr) \ op1 RKM, rr; \ mov RKM, RKM, ror RKR; \ \ and RT0, RMASK, RKM, ror #(24); \ and RT1, RMASK, RKM, lsr #(16); \ and RT2, RMASK, RKM, lsr #(8); \ ldr RT0, [Rs1, RT0]; \ and RT3, RMASK, RKM; \ ldr RT1, [Rs2, RT1]; \ shiftkr(RKR); \ \ ldr RT2, [Rs3, RT2]; \ \ op2 RT0, RT1; \ ldr RT3, [Rs4, RT3]; \ op3 RT0, RT2; \ loadkm((n) + (1 - ((dec) * 2))); \ op4 RT0, RT3; \ loadkr((n) + (1 - ((dec) * 2))); \ eor rl, RT0; #define F1(n, rl, rr, dec, loadkm, shiftkr, loadkr) \ F(n, rl, rr, add, eor, sub, add, dec, loadkm, shiftkr, loadkr) #define F2(n, rl, rr, dec, loadkm, shiftkr, loadkr) \ F(n, rl, rr, eor, sub, add, eor, dec, loadkm, shiftkr, loadkr) #define F3(n, rl, rr, dec, loadkm, shiftkr, loadkr) \ F(n, rl, rr, sub, add, eor, sub, dec, loadkm, shiftkr, loadkr) #define enc_round(n, Fx, rl, rr, loadkm, shiftkr, loadkr) \ Fx(n, rl, rr, 0, loadkm, shiftkr, loadkr) #define dec_round(n, Fx, rl, rr, loadkm, shiftkr, loadkr) \ Fx(n, rl, rr, 1, loadkm, shiftkr, loadkr) #define read_block_aligned(rin, offs, l0, r0, convert, rtmp) \ ldr l0, [rin, #((offs) + 0)]; \ ldr r0, [rin, #((offs) + 4)]; \ convert(l0, rtmp); \ convert(r0, rtmp); #define write_block_aligned(rout, offs, l0, r0, convert, rtmp) \ convert(l0, rtmp); \ convert(r0, rtmp); \ str l0, [rout, #((offs) + 0)]; \ str r0, [rout, #((offs) + 4)]; #ifdef __ARM_FEATURE_UNALIGNED /* unaligned word reads allowed */ #define read_block(rin, offs, l0, r0, rtmp0) \ read_block_aligned(rin, offs, l0, r0, host_to_be, rtmp0) #define write_block(rout, offs, r0, l0, rtmp0, rtmp1) \ write_block_aligned(rout, offs, r0, l0, be_to_host, rtmp0) #define read_block_host(rin, offs, l0, r0, rtmp0) \ read_block_aligned(rin, offs, l0, r0, host_to_host, rtmp0) #define write_block_host(rout, offs, r0, l0, rtmp0, rtmp1) \ write_block_aligned(rout, offs, r0, l0, host_to_host, rtmp0) #else /* need to handle unaligned reads by byte reads */ #define read_block(rin, offs, l0, r0, rtmp0) \ tst rin, #3; \ beq 1f; \ ldr_unaligned_be(l0, rin, (offs) + 0, rtmp0); \ ldr_unaligned_be(r0, rin, (offs) + 4, rtmp0); \ b 2f; \ 1:;\ read_block_aligned(rin, offs, l0, r0, host_to_be, rtmp0); \ 2:; #define write_block(rout, offs, l0, r0, rtmp0, rtmp1) \ tst rout, #3; \ beq 1f; \ str_unaligned_be(l0, rout, (offs) + 0, rtmp0, rtmp1); \ str_unaligned_be(r0, rout, (offs) + 4, rtmp0, rtmp1); \ b 2f; \ 1:;\ write_block_aligned(rout, offs, l0, r0, be_to_host, rtmp0); \ 2:; #define read_block_host(rin, offs, l0, r0, rtmp0) \ tst rin, #3; \ beq 1f; \ ldr_unaligned_host(l0, rin, (offs) + 0, rtmp0); \ ldr_unaligned_host(r0, rin, (offs) + 4, rtmp0); \ b 2f; \ 1:;\ read_block_aligned(rin, offs, l0, r0, host_to_host, rtmp0); \ 2:; #define write_block_host(rout, offs, l0, r0, rtmp0, rtmp1) \ tst rout, #3; \ beq 1f; \ str_unaligned_host(l0, rout, (offs) + 0, rtmp0, rtmp1); \ str_unaligned_host(r0, rout, (offs) + 4, rtmp0, rtmp1); \ b 2f; \ 1:;\ write_block_aligned(rout, offs, l0, r0, host_to_host, rtmp0); \ 2:; #endif .align 3 .globl _gcry_cast5_arm_encrypt_block .type _gcry_cast5_arm_encrypt_block,%function; _gcry_cast5_arm_encrypt_block: /* input: - * %r0: CTX - * %r1: dst - * %r2: src + * r0: CTX + * r1: dst + * r2: src */ - push {%r1, %r4-%r11, %ip, %lr}; + push {r1, r4-r11, ip, lr}; GET_DATA_POINTER(Rs1, _gcry_cast5_s1to4, Rs2); mov RMASK, #(0xff << 2); add Rs2, Rs1, #(0x100*4); add Rs3, Rs1, #(0x100*4*2); add Rs4, Rs1, #(0x100*4*3); - read_block(%r2, 0, RL0, RR0, RT0); + read_block(r2, 0, RL0, RR0, RT0); load_km(0); load_kr(0); enc_round(0, F1, RL0, RR0, load_km, shift_kr, dummy); 
enc_round(1, F2, RR0, RL0, load_km, shift_kr, dummy); enc_round(2, F3, RL0, RR0, load_km, shift_kr, dummy); enc_round(3, F1, RR0, RL0, load_km, dummy, load_kr); enc_round(4, F2, RL0, RR0, load_km, shift_kr, dummy); enc_round(5, F3, RR0, RL0, load_km, shift_kr, dummy); enc_round(6, F1, RL0, RR0, load_km, shift_kr, dummy); enc_round(7, F2, RR0, RL0, load_km, dummy, load_kr); enc_round(8, F3, RL0, RR0, load_km, shift_kr, dummy); enc_round(9, F1, RR0, RL0, load_km, shift_kr, dummy); enc_round(10, F2, RL0, RR0, load_km, shift_kr, dummy); enc_round(11, F3, RR0, RL0, load_km, dummy, load_kr); enc_round(12, F1, RL0, RR0, load_km, shift_kr, dummy); enc_round(13, F2, RR0, RL0, load_km, shift_kr, dummy); enc_round(14, F3, RL0, RR0, load_km, shift_kr, dummy); enc_round(15, F1, RR0, RL0, dummy, dummy, dummy); - ldr %r1, [%sp], #4; - write_block(%r1, 0, RR0, RL0, RT0, RT1); + ldr r1, [sp], #4; + write_block(r1, 0, RR0, RL0, RT0, RT1); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .ltorg .size _gcry_cast5_arm_encrypt_block,.-_gcry_cast5_arm_encrypt_block; .align 3 .globl _gcry_cast5_arm_decrypt_block .type _gcry_cast5_arm_decrypt_block,%function; _gcry_cast5_arm_decrypt_block: /* input: - * %r0: CTX - * %r1: dst - * %r2: src + * r0: CTX + * r1: dst + * r2: src */ - push {%r1, %r4-%r11, %ip, %lr}; + push {r1, r4-r11, ip, lr}; GET_DATA_POINTER(Rs1, _gcry_cast5_s1to4, Rs2); mov RMASK, #(0xff << 2); add Rs2, Rs1, #(0x100 * 4); add Rs3, Rs1, #(0x100 * 4 * 2); add Rs4, Rs1, #(0x100 * 4 * 3); - read_block(%r2, 0, RL0, RR0, RT0); + read_block(r2, 0, RL0, RR0, RT0); load_km(15); load_dec_kr(15); dec_round(15, F1, RL0, RR0, load_km, shift_kr, dummy); dec_round(14, F3, RR0, RL0, load_km, shift_kr, dummy); dec_round(13, F2, RL0, RR0, load_km, shift_kr, dummy); dec_round(12, F1, RR0, RL0, load_km, dummy, load_dec_kr); dec_round(11, F3, RL0, RR0, load_km, shift_kr, dummy); dec_round(10, F2, RR0, RL0, load_km, shift_kr, dummy); dec_round(9, F1, RL0, RR0, load_km, shift_kr, dummy); dec_round(8, F3, RR0, RL0, load_km, dummy, load_dec_kr); dec_round(7, F2, RL0, RR0, load_km, shift_kr, dummy); dec_round(6, F1, RR0, RL0, load_km, shift_kr, dummy); dec_round(5, F3, RL0, RR0, load_km, shift_kr, dummy); dec_round(4, F2, RR0, RL0, load_km, dummy, load_dec_kr); dec_round(3, F1, RL0, RR0, load_km, shift_kr, dummy); dec_round(2, F3, RR0, RL0, load_km, shift_kr, dummy); dec_round(1, F2, RL0, RR0, load_km, shift_kr, dummy); dec_round(0, F1, RR0, RL0, dummy, dummy, dummy); - ldr %r1, [%sp], #4; - write_block(%r1, 0, RR0, RL0, RT0, RT1); + ldr r1, [sp], #4; + write_block(r1, 0, RR0, RL0, RT0, RT1); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .ltorg .size _gcry_cast5_arm_decrypt_block,.-_gcry_cast5_arm_decrypt_block; /********************************************************************** 2-way cast5 **********************************************************************/ #define F_2w(n, rl0, rr0, rl1, rr1, op1, op2, op3, op4, dec, loadkm, shiftkr, \ loadkr) \ op1 RT3, RKM, rr0; \ op1 RKM, RKM, rr1; \ mov RT3, RT3, ror RKR; \ mov RKM, RKM, ror RKR; \ \ and RT0, RMASK, RT3, ror #(24); \ and RT1, RMASK, RT3, lsr #(16); \ and RT2, RMASK, RT3, lsr #(8); \ and RT3, RMASK, RT3; \ \ ldr RT0, [Rs1, RT0]; \ add RT2, #(0x100 * 4); \ ldr RT1, [Rs2, RT1]; \ add RT3, #(0x100 * 4 * 2); \ \ ldr RT2, [Rs2, RT2]; \ \ op2 RT0, RT1; \ ldr RT3, [Rs2, RT3]; \ and RT1, RMASK, RKM, ror #(24); \ op3 RT0, RT2; \ and RT2, RMASK, RKM, lsr #(16); \ op4 RT0, RT3; \ and RT3, RMASK, RKM, lsr #(8); \ eor rl0, RT0; \ add RT3, #(0x100 * 4); \ ldr RT1, [Rs1, 
RT1]; \ and RT0, RMASK, RKM; \ ldr RT2, [Rs2, RT2]; \ add RT0, #(0x100 * 4 * 2); \ \ ldr RT3, [Rs2, RT3]; \ \ op2 RT1, RT2; \ ldr RT0, [Rs2, RT0]; \ op3 RT1, RT3; \ loadkm((n) + (1 - ((dec) * 2))); \ op4 RT1, RT0; \ loadkr((n) + (1 - ((dec) * 2))); \ shiftkr(RKR); \ eor rl1, RT1; #define F1_2w(n, rl0, rr0, rl1, rr1, dec, loadkm, shiftkr, loadkr) \ F_2w(n, rl0, rr0, rl1, rr1, add, eor, sub, add, dec, \ loadkm, shiftkr, loadkr) #define F2_2w(n, rl0, rr0, rl1, rr1, dec, loadkm, shiftkr, loadkr) \ F_2w(n, rl0, rr0, rl1, rr1, eor, sub, add, eor, dec, \ loadkm, shiftkr, loadkr) #define F3_2w(n, rl0, rr0, rl1, rr1, dec, loadkm, shiftkr, loadkr) \ F_2w(n, rl0, rr0, rl1, rr1, sub, add, eor, sub, dec, \ loadkm, shiftkr, loadkr) #define enc_round2(n, Fx, rl, rr, loadkm, shiftkr, loadkr) \ Fx##_2w(n, rl##0, rr##0, rl##1, rr##1, 0, loadkm, shiftkr, loadkr) #define dec_round2(n, Fx, rl, rr, loadkm, shiftkr, loadkr) \ Fx##_2w(n, rl##0, rr##0, rl##1, rr##1, 1, loadkm, shiftkr, loadkr) #define read_block2_aligned(rin, l0, r0, l1, r1, convert, rtmp) \ ldr l0, [rin, #(0)]; \ ldr r0, [rin, #(4)]; \ convert(l0, rtmp); \ ldr l1, [rin, #(8)]; \ convert(r0, rtmp); \ ldr r1, [rin, #(12)]; \ convert(l1, rtmp); \ convert(r1, rtmp); #define write_block2_aligned(rout, l0, r0, l1, r1, convert, rtmp) \ convert(l0, rtmp); \ convert(r0, rtmp); \ convert(l1, rtmp); \ str l0, [rout, #(0)]; \ convert(r1, rtmp); \ str r0, [rout, #(4)]; \ str l1, [rout, #(8)]; \ str r1, [rout, #(12)]; #ifdef __ARM_FEATURE_UNALIGNED /* unaligned word reads allowed */ #define read_block2(rin, l0, r0, l1, r1, rtmp0) \ read_block2_aligned(rin, l0, r0, l1, r1, host_to_be, rtmp0) #define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \ write_block2_aligned(rout, l0, r0, l1, r1, be_to_host, rtmp0) #define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \ read_block2_aligned(rin, l0, r0, l1, r1, host_to_host, rtmp0) #define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \ write_block2_aligned(rout, l0, r0, l1, r1, host_to_host, rtmp0) #else /* need to handle unaligned reads by byte reads */ #define read_block2(rin, l0, r0, l1, r1, rtmp0) \ tst rin, #3; \ beq 1f; \ ldr_unaligned_be(l0, rin, 0, rtmp0); \ ldr_unaligned_be(r0, rin, 4, rtmp0); \ ldr_unaligned_be(l1, rin, 8, rtmp0); \ ldr_unaligned_be(r1, rin, 12, rtmp0); \ b 2f; \ 1:;\ read_block2_aligned(rin, l0, r0, l1, r1, host_to_be, rtmp0); \ 2:; #define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \ tst rout, #3; \ beq 1f; \ str_unaligned_be(l0, rout, 0, rtmp0, rtmp1); \ str_unaligned_be(r0, rout, 4, rtmp0, rtmp1); \ str_unaligned_be(l1, rout, 8, rtmp0, rtmp1); \ str_unaligned_be(r1, rout, 12, rtmp0, rtmp1); \ b 2f; \ 1:;\ write_block2_aligned(rout, l0, r0, l1, r1, be_to_host, rtmp0); \ 2:; #define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \ tst rin, #3; \ beq 1f; \ ldr_unaligned_host(l0, rin, 0, rtmp0); \ ldr_unaligned_host(r0, rin, 4, rtmp0); \ ldr_unaligned_host(l1, rin, 8, rtmp0); \ ldr_unaligned_host(r1, rin, 12, rtmp0); \ b 2f; \ 1:;\ read_block2_aligned(rin, l0, r0, l1, r1, host_to_host, rtmp0); \ 2:; #define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \ tst rout, #3; \ beq 1f; \ str_unaligned_host(l0, rout, 0, rtmp0, rtmp1); \ str_unaligned_host(r0, rout, 4, rtmp0, rtmp1); \ str_unaligned_host(l1, rout, 8, rtmp0, rtmp1); \ str_unaligned_host(r1, rout, 12, rtmp0, rtmp1); \ b 2f; \ 1:;\ write_block2_aligned(rout, l0, r0, l1, r1, host_to_host, rtmp0); \ 2:; #endif .align 3 .type _gcry_cast5_arm_enc_blk2,%function; _gcry_cast5_arm_enc_blk2: /* input: * preloaded: CTX * 
[RL0, RR0], [RL1, RR1]: src * output: * [RR0, RL0], [RR1, RL1]: dst */ - push {%lr}; + push {lr}; GET_DATA_POINTER(Rs1, _gcry_cast5_s1to4, Rs2); mov RMASK, #(0xff << 2); add Rs2, Rs1, #(0x100 * 4); load_km(0); load_kr(0); enc_round2(0, F1, RL, RR, load_km, shift_kr, dummy); enc_round2(1, F2, RR, RL, load_km, shift_kr, dummy); enc_round2(2, F3, RL, RR, load_km, shift_kr, dummy); enc_round2(3, F1, RR, RL, load_km, dummy, load_kr); enc_round2(4, F2, RL, RR, load_km, shift_kr, dummy); enc_round2(5, F3, RR, RL, load_km, shift_kr, dummy); enc_round2(6, F1, RL, RR, load_km, shift_kr, dummy); enc_round2(7, F2, RR, RL, load_km, dummy, load_kr); enc_round2(8, F3, RL, RR, load_km, shift_kr, dummy); enc_round2(9, F1, RR, RL, load_km, shift_kr, dummy); enc_round2(10, F2, RL, RR, load_km, shift_kr, dummy); enc_round2(11, F3, RR, RL, load_km, dummy, load_kr); enc_round2(12, F1, RL, RR, load_km, shift_kr, dummy); enc_round2(13, F2, RR, RL, load_km, shift_kr, dummy); enc_round2(14, F3, RL, RR, load_km, shift_kr, dummy); enc_round2(15, F1, RR, RL, dummy, dummy, dummy); host_to_be(RR0, RT0); host_to_be(RL0, RT0); host_to_be(RR1, RT0); host_to_be(RL1, RT0); - pop {%pc}; + pop {pc}; .ltorg .size _gcry_cast5_arm_enc_blk2,.-_gcry_cast5_arm_enc_blk2; .align 3 .globl _gcry_cast5_arm_cfb_dec; .type _gcry_cast5_arm_cfb_dec,%function; _gcry_cast5_arm_cfb_dec: /* input: - * %r0: CTX - * %r1: dst (2 blocks) - * %r2: src (2 blocks) - * %r3: iv (64bit) + * r0: CTX + * r1: dst (2 blocks) + * r2: src (2 blocks) + * r3: iv (64bit) */ - push {%r1, %r2, %r4-%r11, %ip, %lr}; + push {r1, r2, r4-r11, ip, lr}; - mov %lr, %r3; + mov lr, r3; - /* Load input (iv/%r3 is aligned, src/%r2 might not be) */ - ldm %r3, {RL0, RR0}; + /* Load input (iv/r3 is aligned, src/r2 might not be) */ + ldm r3, {RL0, RR0}; host_to_be(RL0, RT1); host_to_be(RR0, RT1); - read_block(%r2, 0, RL1, RR1, %ip); + read_block(r2, 0, RL1, RR1, ip); /* Update IV, load src[1] and save to iv[0] */ - read_block_host(%r2, 8, %r5, %r6, %r7); - stm %lr, {%r5, %r6}; + read_block_host(r2, 8, r5, r6, r7); + stm lr, {r5, r6}; bl _gcry_cast5_arm_enc_blk2; - /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */ + /* result in RR0:RL0, RR1:RL1 = r4:r3, r10:r9 */ - /* %r0: dst, %r1: %src */ - pop {%r0, %r1}; + /* r0: dst, r1: src */ + pop {r0, r1}; /* dst = src ^ result */ - read_block2_host(%r1, %r5, %r6, %r7, %r8, %lr); - eor %r5, %r4; - eor %r6, %r3; - eor %r7, %r10; - eor %r8, %r9; - write_block2_host(%r0, %r5, %r6, %r7, %r8, %r1, %r2); - - pop {%r4-%r11, %ip, %pc}; + read_block2_host(r1, r5, r6, r7, r8, lr); + eor r5, r4; + eor r6, r3; + eor r7, r10; + eor r8, r9; + write_block2_host(r0, r5, r6, r7, r8, r1, r2); + + pop {r4-r11, ip, pc}; .ltorg .size _gcry_cast5_arm_cfb_dec,.-_gcry_cast5_arm_cfb_dec; .align 3 .globl _gcry_cast5_arm_ctr_enc; .type _gcry_cast5_arm_ctr_enc,%function; _gcry_cast5_arm_ctr_enc: /* input: - * %r0: CTX - * %r1: dst (2 blocks) - * %r2: src (2 blocks) - * %r3: iv (64bit, big-endian) + * r0: CTX + * r1: dst (2 blocks) + * r2: src (2 blocks) + * r3: iv (64bit, big-endian) */ - push {%r1, %r2, %r4-%r11, %ip, %lr}; + push {r1, r2, r4-r11, ip, lr}; - mov %lr, %r3; + mov lr, r3; /* Load IV (big => host endian) */ - read_block_aligned(%lr, 0, RL0, RR0, be_to_host, RT1); + read_block_aligned(lr, 0, RL0, RR0, be_to_host, RT1); /* Construct IVs */ adds RR1, RR0, #1; /* +1 */ adc RL1, RL0, #0; - adds %r6, RR1, #1; /* +2 */ - adc %r5, RL1, #0; + adds r6, RR1, #1; /* +2 */ + adc r5, RL1, #0; /* Store new IV (host => big-endian) */ - write_block_aligned(%lr, 0, 
%r5, %r6, host_to_be, RT1); + write_block_aligned(lr, 0, r5, r6, host_to_be, RT1); bl _gcry_cast5_arm_enc_blk2; - /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */ + /* result in RR0:RL0, RR1:RL1 = r4:r3, r10:r9 */ - /* %r0: dst, %r1: %src */ - pop {%r0, %r1}; + /* r0: dst, r1: src */ + pop {r0, r1}; /* XOR key-stream with plaintext */ - read_block2_host(%r1, %r5, %r6, %r7, %r8, %lr); - eor %r5, %r4; - eor %r6, %r3; - eor %r7, %r10; - eor %r8, %r9; - write_block2_host(%r0, %r5, %r6, %r7, %r8, %r1, %r2); - - pop {%r4-%r11, %ip, %pc}; + read_block2_host(r1, r5, r6, r7, r8, lr); + eor r5, r4; + eor r6, r3; + eor r7, r10; + eor r8, r9; + write_block2_host(r0, r5, r6, r7, r8, r1, r2); + + pop {r4-r11, ip, pc}; .ltorg .size _gcry_cast5_arm_ctr_enc,.-_gcry_cast5_arm_ctr_enc; .align 3 .type _gcry_cast5_arm_dec_blk2,%function; _gcry_cast5_arm_dec_blk2: /* input: * preloaded: CTX * [RL0, RR0], [RL1, RR1]: src * output: * [RR0, RL0], [RR1, RL1]: dst */ GET_DATA_POINTER(Rs1, _gcry_cast5_s1to4, Rs2); mov RMASK, #(0xff << 2); add Rs2, Rs1, #(0x100 * 4); load_km(15); load_dec_kr(15); dec_round2(15, F1, RL, RR, load_km, shift_kr, dummy); dec_round2(14, F3, RR, RL, load_km, shift_kr, dummy); dec_round2(13, F2, RL, RR, load_km, shift_kr, dummy); dec_round2(12, F1, RR, RL, load_km, dummy, load_dec_kr); dec_round2(11, F3, RL, RR, load_km, shift_kr, dummy); dec_round2(10, F2, RR, RL, load_km, shift_kr, dummy); dec_round2(9, F1, RL, RR, load_km, shift_kr, dummy); dec_round2(8, F3, RR, RL, load_km, dummy, load_dec_kr); dec_round2(7, F2, RL, RR, load_km, shift_kr, dummy); dec_round2(6, F1, RR, RL, load_km, shift_kr, dummy); dec_round2(5, F3, RL, RR, load_km, shift_kr, dummy); dec_round2(4, F2, RR, RL, load_km, dummy, load_dec_kr); dec_round2(3, F1, RL, RR, load_km, shift_kr, dummy); dec_round2(2, F3, RR, RL, load_km, shift_kr, dummy); dec_round2(1, F2, RL, RR, load_km, shift_kr, dummy); dec_round2(0, F1, RR, RL, dummy, dummy, dummy); host_to_be(RR0, RT0); host_to_be(RL0, RT0); host_to_be(RR1, RT0); host_to_be(RL1, RT0); b .Ldec_cbc_tail; .ltorg .size _gcry_cast5_arm_dec_blk2,.-_gcry_cast5_arm_dec_blk2; .align 3 .globl _gcry_cast5_arm_cbc_dec; .type _gcry_cast5_arm_cbc_dec,%function; _gcry_cast5_arm_cbc_dec: /* input: - * %r0: CTX - * %r1: dst (2 blocks) - * %r2: src (2 blocks) - * %r3: iv (64bit) + * r0: CTX + * r1: dst (2 blocks) + * r2: src (2 blocks) + * r3: iv (64bit) */ - push {%r1-%r11, %ip, %lr}; + push {r1-r11, ip, lr}; - read_block2(%r2, RL0, RR0, RL1, RR1, RT0); + read_block2(r2, RL0, RR0, RL1, RR1, RT0); /* dec_blk2 is only used by cbc_dec, jump directly in/out instead * of function call. */ b _gcry_cast5_arm_dec_blk2; .Ldec_cbc_tail: - /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */ + /* result in RR0:RL0, RR1:RL1 = r4:r3, r10:r9 */ - /* %r0: dst, %r1: %src, %r2: iv */ - pop {%r0-%r2}; + /* r0: dst, r1: src, r2: iv */ + pop {r0-r2}; - /* load IV+1 (src[0]) to %r7:%r8. Might be unaligned. */ - read_block_host(%r1, 0, %r7, %r8, %r5); - /* load IV (iv[0]) to %r5:%r6. 'iv' is aligned. */ - ldm %r2, {%r5, %r6}; + /* load IV+1 (src[0]) to r7:r8. Might be unaligned. */ + read_block_host(r1, 0, r7, r8, r5); + /* load IV (iv[0]) to r5:r6. 'iv' is aligned. */ + ldm r2, {r5, r6}; /* out[1] ^= IV+1 */ - eor %r10, %r7; - eor %r9, %r8; + eor r10, r7; + eor r9, r8; /* out[0] ^= IV */ - eor %r4, %r5; - eor %r3, %r6; + eor r4, r5; + eor r3, r6; - /* load IV+2 (src[1]) to %r7:%r8. Might be unaligned. */ - read_block_host(%r1, 8, %r7, %r8, %r5); + /* load IV+2 (src[1]) to r7:r8. Might be unaligned. 
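+	 * (CBC chaining in this two-block tail: out[0] = D(c[0]) ^ IV and
+	 * out[1] = D(c[1]) ^ c[0], where c[0] is the "IV+1" word pair loaded
+	 * above and c[1] is the "IV+2" pair loaded below; c[1] then becomes
+	 * the IV for the next call.)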
*/ + read_block_host(r1, 8, r7, r8, r5); /* store IV+2 to iv[0] (aligned). */ - stm %r2, {%r7, %r8}; + stm r2, {r7, r8}; /* store result to dst[0-3]. Might be unaligned. */ - write_block2_host(%r0, %r4, %r3, %r10, %r9, %r5, %r6); + write_block2_host(r0, r4, r3, r10, r9, r5, r6); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .ltorg .size _gcry_cast5_arm_cbc_dec,.-_gcry_cast5_arm_cbc_dec; #endif /*HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS*/ #endif /*__ARM_ARCH >= 6*/ diff --git a/cipher/cipher-gcm-armv7-neon.S b/cipher/cipher-gcm-armv7-neon.S index 16502b4a..c7027af3 100644 --- a/cipher/cipher-gcm-armv7-neon.S +++ b/cipher/cipher-gcm-armv7-neon.S @@ -1,341 +1,341 @@ /* cipher-gcm-armv7-neon.S - ARM/NEON accelerated GHASH * Copyright (C) 2019 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #include #if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_NEON) .syntax unified .fpu neon .arm .text #ifdef __PIC__ # define GET_DATA_POINTER(reg, name, rtmp) \ ldr reg, 1f; \ ldr rtmp, 2f; \ b 3f; \ 1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \ 2: .word name(GOT); \ 3: add reg, pc, reg; \ ldr reg, [reg, rtmp]; #else # define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name #endif /* Constants */ .align 4 gcry_gcm_reduction_constant: .Lrconst64: .quad 0xc200000000000000 /* Register macros */ #define rhash q0 #define rhash_l d0 #define rhash_h d1 #define rh1 q1 #define rh1_l d2 #define rh1_h d3 #define rbuf q2 #define rbuf_l d4 #define rbuf_h d5 #define rbuf1 q3 #define rbuf1_l d6 #define rbuf1_h d7 #define t0q q4 #define t0l d8 #define t0h d9 #define t1q q5 #define t1l d10 #define t1h d11 #define t2q q6 #define t2l d12 #define t2h d13 #define t3q q7 #define t3l d14 #define t3h d15 /* q8 */ #define k16 d16 #define k32 d17 /* q9 */ #define k48 d18 #define k0 q10 #define rr0 q11 #define rr0_l d22 #define rr0_h d23 #define rr1 q12 #define rr1_l d24 #define rr1_h d25 #define rt0 q13 #define rt0_l d26 #define rt0_h d27 #define rt1 q14 #define rt1_l d28 #define rt1_h d29 #define rrconst q15 #define rrconst_l d30 #define rrconst_h d31 /* Macro for 64x64=>128 carry-less multiplication using vmull.p8 instruction. * * From "Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R. Fast Software * Polynomial Multiplication on ARM Processors using the NEON Engine. The * Second International Workshop on Modern Cryptography and Security * Engineering — MoCrySEn, 2013". 
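 *
 * With a(x) = sum_{i=0..7} a_i x^(8i) and b(x) = sum_{j=0..7} b_j x^(8j)
 * over GF(2), the product is a(x)*b(x) = sum_k (xor_{i+j=k} a_i*b_j) x^(8k).
 * vmull.p8 computes eight such 8x8->16-bit carry-less byte products at
 * once, so the macro below multiplies one operand against byte-rotated
 * copies of the other (vext.8), masks off the bytes that wrapped around
 * (k16/k32/k48) and shifts each partial product into place with vext.8
 * before XOR-accumulating everything into the 128-bit result.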
*/ #define vmull_p64(rq, rl, rh, ad, bd) \ - vext.8 t0l, ad, ad, $1; \ + vext.8 t0l, ad, ad, #1; \ vmull.p8 t0q, t0l, bd; \ - vext.8 rl, bd, bd, $1; \ + vext.8 rl, bd, bd, #1; \ vmull.p8 rq, ad, rl; \ - vext.8 t1l, ad, ad, $2; \ + vext.8 t1l, ad, ad, #2; \ vmull.p8 t1q, t1l, bd; \ - vext.8 t3l, bd, bd, $2; \ + vext.8 t3l, bd, bd, #2; \ vmull.p8 t3q, ad, t3l; \ - vext.8 t2l, ad, ad, $3; \ + vext.8 t2l, ad, ad, #3; \ vmull.p8 t2q, t2l, bd; \ veor t0q, t0q, rq; \ - vext.8 rl, bd, bd, $3; \ + vext.8 rl, bd, bd, #3; \ vmull.p8 rq, ad, rl; \ veor t1q, t1q, t3q; \ - vext.8 t3l, bd, bd, $4; \ + vext.8 t3l, bd, bd, #4; \ vmull.p8 t3q, ad, t3l; \ veor t0l, t0l, t0h; \ vand t0h, t0h, k48; \ veor t1l, t1l, t1h; \ vand t1h, t1h, k32; \ veor t2q, t2q, rq; \ veor t0l, t0l, t0h; \ veor t1l, t1l, t1h; \ veor t2l, t2l, t2h; \ vand t2h, t2h, k16; \ veor t3l, t3l, t3h; \ - vmov.i64 t3h, $0; \ - vext.8 t0q, t0q, t0q, $15; \ + vmov.i64 t3h, #0; \ + vext.8 t0q, t0q, t0q, #15; \ veor t2l, t2l, t2h; \ - vext.8 t1q, t1q, t1q, $14; \ + vext.8 t1q, t1q, t1q, #14; \ vmull.p8 rq, ad, bd; \ - vext.8 t2q, t2q, t2q, $13; \ - vext.8 t3q, t3q, t3q, $12; \ + vext.8 t2q, t2q, t2q, #13; \ + vext.8 t3q, t3q, t3q, #12; \ veor t0q, t0q, t1q; \ veor t2q, t2q, t3q; \ veor rq, rq, t0q; \ veor rq, rq, t2q; /* GHASH macros. * * See "Gouvêa, C. P. L. & López, J. Implementing GCM on ARMv8. Topics in * Cryptology — CT-RSA 2015" for details. */ /* Input: 'a' and 'b', Output: 'r0:r1' (low 128-bits in r0, high in r1) * Note: 'r1' may be 'a' or 'b', 'r0' must not be either 'a' or 'b'. */ #define PMUL_128x128(r0, r1, a, b, t1, t2, interleave_op) \ veor t1##_h, b##_l, b##_h; \ veor t1##_l, a##_l, a##_h; \ vmull_p64( r0, r0##_l, r0##_h, a##_l, b##_l ); \ vmull_p64( r1, r1##_l, r1##_h, a##_h, b##_h ); \ vmull_p64( t2, t2##_h, t2##_l, t1##_h, t1##_l ); \ interleave_op; \ veor t2, r0; \ veor t2, r1; \ veor r0##_h, t2##_l; \ veor r1##_l, t2##_h; /* Reduction using Xor and Shift. * Input: 'r0:r1', Output: 'a' * * See "Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication * Instruction and its Usage for Computing the GCM Mode" for details. */ #define REDUCTION(a, r0, r1, t, interleave_op) \ vshl.u32 t0q, r0, #31; \ vshl.u32 t1q, r0, #30; \ vshl.u32 t2q, r0, #25; \ veor t0q, t0q, t1q; \ veor t0q, t0q, t2q; \ vext.8 t, t0q, k0, #4; \ vext.8 t0q, k0, t0q, #(16-12); \ veor r0, r0, t0q; \ interleave_op; \ vshr.u32 t0q, r0, #1; \ vshr.u32 t1q, r0, #2; \ vshr.u32 t2q, r0, #7; \ veor t0q, t0q, t1q; \ veor t0q, t0q, t2q; \ veor t0q, t0q, t; \ veor r0, r0, t0q; \ veor a, r0, r1; #define _(...) __VA_ARGS__ #define __ _() /* Other functional macros */ #define CLEAR_REG(reg) vmov.i8 reg, #0; /* * unsigned int _gcry_ghash_armv7_neon (void *gcm_key, byte *result, * const byte *buf, size_t nblocks); */ .align 3 .globl _gcry_ghash_armv7_neon .type _gcry_ghash_armv7_neon,%function; _gcry_ghash_armv7_neon: /* input: * r0: gcm_key * r1: result/hash * r2: buf * r3: nblocks */ push {r4-r6, lr} cmp r3, #0 beq .Ldo_nothing vpush {q4-q7} vld1.64 {rhash}, [r1] vld1.64 {rh1}, [r0] vrev64.8 rhash, rhash /* byte-swap */ vmov.i64 k0, #0x0 vmov.i64 k16, #0xffff vmov.i64 k32, #0xffffffff vmov.i64 k48, #0xffffffffffff vext.8 rhash, rhash, rhash, #8 /* Handle remaining blocks. */ vld1.64 {rbuf}, [r2]! subs r3, r3, #1 vrev64.8 rbuf, rbuf /* byte-swap */ vext.8 rbuf, rbuf, rbuf, #8 veor rhash, rhash, rbuf beq .Lend .Loop: vld1.64 {rbuf}, [r2]! 
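	/* Each loop iteration folds one 16-byte block into the hash state:
	 *   X <- (X xor C_i) * H  in GF(2^128).
	 * PMUL_128x128/REDUCTION below consume the current state (the latest
	 * block is already xored in), while the block loaded just above is
	 * byte-swapped and rotated through their interleave_op hooks; it is
	 * xored into the state at the bottom of the loop. */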
PMUL_128x128(rr0, rr1, rhash, rh1, rt0, rt1, _(vrev64.8 rbuf, rbuf)) REDUCTION(rhash, rr0, rr1, rt0, _(vext.8 rbuf, rbuf, rbuf, #8)) subs r3, r3, #1 veor rhash, rhash, rbuf bne .Loop .Lend: PMUL_128x128(rr0, rr1, rhash, rh1, rt0, rt1, _(CLEAR_REG(rbuf))) REDUCTION(rhash, rr0, rr1, rt0, _(CLEAR_REG(rh1))) .Ldone: CLEAR_REG(rr1) vrev64.8 rhash, rhash /* byte-swap */ CLEAR_REG(rt0) CLEAR_REG(rr0) vext.8 rhash, rhash, rhash, #8 CLEAR_REG(rt1) CLEAR_REG(t0q) CLEAR_REG(t1q) CLEAR_REG(t2q) CLEAR_REG(t3q) vst1.64 {rhash}, [r1] CLEAR_REG(rhash) vpop {q4-q7} .Ldo_nothing: mov r0, #0 pop {r4-r6, pc} .size _gcry_ghash_armv7_neon,.-_gcry_ghash_armv7_neon; /* * void _gcry_ghash_armv7_neon (void *gcm_key); */ .align 3 .globl _gcry_ghash_setup_armv7_neon .type _gcry_ghash_setup_armv7_neon,%function; _gcry_ghash_setup_armv7_neon: /* input: * r0: gcm_key */ vpush {q4-q7} GET_DATA_POINTER(r2, .Lrconst64, r3) vld1.64 {rrconst_h}, [r2] #define GCM_LSH_1(r_out, ia, ib, const_d, oa, ob, ma) \ /* H <<< 1 */ \ vshr.s64 ma, ib, #63; \ vshr.u64 oa, ib, #63; \ vshr.u64 ob, ia, #63; \ vand ma, const_d; \ vshl.u64 ib, ib, #1; \ vshl.u64 ia, ia, #1; \ vorr ob, ib; \ vorr oa, ia; \ veor ob, ma; \ vst1.64 {oa, ob}, [r_out] vld1.64 {rhash}, [r0] vrev64.8 rhash, rhash /* byte-swap */ vext.8 rhash, rhash, rhash, #8 vmov rbuf1, rhash GCM_LSH_1(r0, rhash_l, rhash_h, rrconst_h, rh1_l, rh1_h, rt1_l) /* H<<<1 */ CLEAR_REG(rh1) CLEAR_REG(rhash) CLEAR_REG(rbuf1) CLEAR_REG(rrconst) vpop {q4-q7} bx lr .size _gcry_ghash_setup_armv7_neon,.-_gcry_ghash_setup_armv7_neon; #endif diff --git a/cipher/rijndael-arm.S b/cipher/rijndael-arm.S index e680c817..632daac2 100644 --- a/cipher/rijndael-arm.S +++ b/cipher/rijndael-arm.S @@ -1,581 +1,581 @@ /* rijndael-arm.S - ARM assembly implementation of AES cipher * * Copyright (C) 2013 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
*/ #include #if defined(__ARMEL__) #ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS .text .syntax unified .arm /* register macros */ -#define CTX %r0 -#define RTAB %lr -#define RMASK %ip +#define CTX r0 +#define RTAB lr +#define RMASK ip -#define RA %r4 -#define RB %r5 -#define RC %r6 -#define RD %r7 +#define RA r4 +#define RB r5 +#define RC r6 +#define RD r7 -#define RNA %r8 -#define RNB %r9 -#define RNC %r10 -#define RND %r11 +#define RNA r8 +#define RNB r9 +#define RNC r10 +#define RND r11 -#define RT0 %r1 -#define RT1 %r2 -#define RT2 %r3 +#define RT0 r1 +#define RT1 r2 +#define RT2 r3 /* helper macros */ #define ldr_unaligned_le(rout, rsrc, offs, rtmp) \ ldrb rout, [rsrc, #((offs) + 0)]; \ ldrb rtmp, [rsrc, #((offs) + 1)]; \ orr rout, rout, rtmp, lsl #8; \ ldrb rtmp, [rsrc, #((offs) + 2)]; \ orr rout, rout, rtmp, lsl #16; \ ldrb rtmp, [rsrc, #((offs) + 3)]; \ orr rout, rout, rtmp, lsl #24; #define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \ mov rtmp0, rin, lsr #8; \ strb rin, [rdst, #((offs) + 0)]; \ mov rtmp1, rin, lsr #16; \ strb rtmp0, [rdst, #((offs) + 1)]; \ mov rtmp0, rin, lsr #24; \ strb rtmp1, [rdst, #((offs) + 2)]; \ strb rtmp0, [rdst, #((offs) + 3)]; /*********************************************************************** * ARM assembly implementation of the AES cipher ***********************************************************************/ #define preload_first_key(round, ra) \ ldr ra, [CTX, #(((round) * 16) + 0 * 4)]; #define dummy(round, ra) /* nothing */ #define addroundkey(ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \ ldm CTX, {rna, rnb, rnc, rnd}; \ eor ra, rna; \ eor rb, rnb; \ eor rc, rnc; \ preload_key(1, rna); \ eor rd, rnd; #define do_encround(next_r, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \ ldr rnb, [CTX, #(((next_r) * 16) + 1 * 4)]; \ \ and RT0, RMASK, ra, lsl#2; \ ldr rnc, [CTX, #(((next_r) * 16) + 2 * 4)]; \ and RT1, RMASK, ra, lsr#(8 - 2); \ ldr rnd, [CTX, #(((next_r) * 16) + 3 * 4)]; \ and RT2, RMASK, ra, lsr#(16 - 2); \ ldr RT0, [RTAB, RT0]; \ and ra, RMASK, ra, lsr#(24 - 2); \ \ ldr RT1, [RTAB, RT1]; \ eor rna, rna, RT0; \ ldr RT2, [RTAB, RT2]; \ and RT0, RMASK, rd, lsl#2; \ ldr ra, [RTAB, ra]; \ \ eor rnd, rnd, RT1, ror #24; \ and RT1, RMASK, rd, lsr#(8 - 2); \ eor rnc, rnc, RT2, ror #16; \ and RT2, RMASK, rd, lsr#(16 - 2); \ eor rnb, rnb, ra, ror #8; \ ldr RT0, [RTAB, RT0]; \ and rd, RMASK, rd, lsr#(24 - 2); \ \ ldr RT1, [RTAB, RT1]; \ eor rnd, rnd, RT0; \ ldr RT2, [RTAB, RT2]; \ and RT0, RMASK, rc, lsl#2; \ ldr rd, [RTAB, rd]; \ \ eor rnc, rnc, RT1, ror #24; \ and RT1, RMASK, rc, lsr#(8 - 2); \ eor rnb, rnb, RT2, ror #16; \ and RT2, RMASK, rc, lsr#(16 - 2); \ eor rna, rna, rd, ror #8; \ ldr RT0, [RTAB, RT0]; \ and rc, RMASK, rc, lsr#(24 - 2); \ \ ldr RT1, [RTAB, RT1]; \ eor rnc, rnc, RT0; \ ldr RT2, [RTAB, RT2]; \ and RT0, RMASK, rb, lsl#2; \ ldr rc, [RTAB, rc]; \ \ eor rnb, rnb, RT1, ror #24; \ and RT1, RMASK, rb, lsr#(8 - 2); \ eor rna, rna, RT2, ror #16; \ and RT2, RMASK, rb, lsr#(16 - 2); \ eor rnd, rnd, rc, ror #8; \ ldr RT0, [RTAB, RT0]; \ and rb, RMASK, rb, lsr#(24 - 2); \ \ ldr RT1, [RTAB, RT1]; \ eor rnb, rnb, RT0; \ ldr RT2, [RTAB, RT2]; \ eor rna, rna, RT1, ror #24; \ ldr rb, [RTAB, rb]; \ \ eor rnd, rnd, RT2, ror #16; \ preload_key((next_r) + 1, ra); \ eor rnc, rnc, rb, ror #8; #define do_lastencround(ra, rb, rc, rd, rna, rnb, rnc, rnd) \ and RT0, RMASK, ra, lsl#2; \ and RT1, RMASK, ra, lsr#(8 - 2); \ and RT2, RMASK, ra, lsr#(16 - 2); \ ldrb rna, [RTAB, RT0]; \ and ra, RMASK, ra, lsr#(24 - 2); \ ldrb rnd, [RTAB, RT1]; \ and 
RT0, RMASK, rd, lsl#2; \ ldrb rnc, [RTAB, RT2]; \ mov rnd, rnd, ror #24; \ ldrb rnb, [RTAB, ra]; \ and RT1, RMASK, rd, lsr#(8 - 2); \ mov rnc, rnc, ror #16; \ and RT2, RMASK, rd, lsr#(16 - 2); \ mov rnb, rnb, ror #8; \ ldrb RT0, [RTAB, RT0]; \ and rd, RMASK, rd, lsr#(24 - 2); \ ldrb RT1, [RTAB, RT1]; \ \ orr rnd, rnd, RT0; \ ldrb RT2, [RTAB, RT2]; \ and RT0, RMASK, rc, lsl#2; \ ldrb rd, [RTAB, rd]; \ orr rnc, rnc, RT1, ror #24; \ and RT1, RMASK, rc, lsr#(8 - 2); \ orr rnb, rnb, RT2, ror #16; \ and RT2, RMASK, rc, lsr#(16 - 2); \ orr rna, rna, rd, ror #8; \ ldrb RT0, [RTAB, RT0]; \ and rc, RMASK, rc, lsr#(24 - 2); \ ldrb RT1, [RTAB, RT1]; \ \ orr rnc, rnc, RT0; \ ldrb RT2, [RTAB, RT2]; \ and RT0, RMASK, rb, lsl#2; \ ldrb rc, [RTAB, rc]; \ orr rnb, rnb, RT1, ror #24; \ and RT1, RMASK, rb, lsr#(8 - 2); \ orr rna, rna, RT2, ror #16; \ ldrb RT0, [RTAB, RT0]; \ and RT2, RMASK, rb, lsr#(16 - 2); \ ldrb RT1, [RTAB, RT1]; \ orr rnd, rnd, rc, ror #8; \ ldrb RT2, [RTAB, RT2]; \ and rb, RMASK, rb, lsr#(24 - 2); \ ldrb rb, [RTAB, rb]; \ \ orr rnb, rnb, RT0; \ orr rna, rna, RT1, ror #24; \ orr rnd, rnd, RT2, ror #16; \ orr rnc, rnc, rb, ror #8; #define firstencround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \ addroundkey(ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_first_key); \ do_encround((round) + 1, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_first_key); #define encround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \ do_encround((round) + 1, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key); #define lastencround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \ add CTX, #(((round) + 1) * 16); \ add RTAB, #1; \ do_lastencround(ra, rb, rc, rd, rna, rnb, rnc, rnd); \ addroundkey(rna, rnb, rnc, rnd, ra, rb, rc, rd, dummy); .align 3 .globl _gcry_aes_arm_encrypt_block .type _gcry_aes_arm_encrypt_block,%function; _gcry_aes_arm_encrypt_block: /* input: - * %r0: keysched, CTX - * %r1: dst - * %r2: src - * %r3: number of rounds.. 10, 12 or 14 - * %st+0: encryption table + * r0: keysched, CTX + * r1: dst + * r2: src + * r3: number of rounds.. 
10, 12 or 14 + * st+0: encryption table */ - push {%r4-%r11, %ip, %lr}; + push {r4-r11, ip, lr}; /* read input block */ /* test if src is unaligned */ - tst %r2, #3; + tst r2, #3; beq 1f; /* unaligned load */ - ldr_unaligned_le(RA, %r2, 0, RNA); - ldr_unaligned_le(RB, %r2, 4, RNB); - ldr_unaligned_le(RC, %r2, 8, RNA); - ldr_unaligned_le(RD, %r2, 12, RNB); + ldr_unaligned_le(RA, r2, 0, RNA); + ldr_unaligned_le(RB, r2, 4, RNB); + ldr_unaligned_le(RC, r2, 8, RNA); + ldr_unaligned_le(RD, r2, 12, RNB); b 2f; .ltorg 1: /* aligned load */ - ldm %r2, {RA, RB, RC, RD}; + ldm r2, {RA, RB, RC, RD}; #ifndef __ARMEL__ rev RA, RA; rev RB, RB; rev RC, RC; rev RD, RD; #endif 2: - ldr RTAB, [%sp, #40]; - sub %sp, #16; + ldr RTAB, [sp, #40]; + sub sp, #16; - str %r1, [%sp, #4]; /* dst */ + str r1, [sp, #4]; /* dst */ mov RMASK, #0xff; - str %r3, [%sp, #8]; /* nrounds */ + str r3, [sp, #8]; /* nrounds */ mov RMASK, RMASK, lsl#2; /* byte mask */ firstencround(0, RA, RB, RC, RD, RNA, RNB, RNC, RND); encround(1, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key); encround(2, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key); encround(3, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key); encround(4, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key); encround(5, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key); encround(6, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key); encround(7, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key); - ldr RT0, [%sp, #8]; /* nrounds */ + ldr RT0, [sp, #8]; /* nrounds */ cmp RT0, #12; bge .Lenc_not_128; encround(8, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy); lastencround(9, RNA, RNB, RNC, RND, RA, RB, RC, RD); .Lenc_done: - ldr RT0, [%sp, #4]; /* dst */ - add %sp, #16; + ldr RT0, [sp, #4]; /* dst */ + add sp, #16; /* store output block */ /* test if dst is unaligned */ tst RT0, #3; beq 1f; /* unaligned store */ str_unaligned_le(RA, RT0, 0, RNA, RNB); str_unaligned_le(RB, RT0, 4, RNA, RNB); str_unaligned_le(RC, RT0, 8, RNA, RNB); str_unaligned_le(RD, RT0, 12, RNA, RNB); b 2f; .ltorg 1: /* aligned store */ #ifndef __ARMEL__ rev RA, RA; rev RB, RB; rev RC, RC; rev RD, RD; #endif /* write output block */ stm RT0, {RA, RB, RC, RD}; 2: mov r0, #(10 * 4); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .ltorg .Lenc_not_128: beq .Lenc_192 encround(8, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key); encround(9, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key); encround(10, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key); encround(11, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key); encround(12, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy); lastencround(13, RNA, RNB, RNC, RND, RA, RB, RC, RD); b .Lenc_done; .ltorg .Lenc_192: encround(8, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key); encround(9, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key); encround(10, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy); lastencround(11, RNA, RNB, RNC, RND, RA, RB, RC, RD); b .Lenc_done; .size _gcry_aes_arm_encrypt_block,.-_gcry_aes_arm_encrypt_block; #define addroundkey_dec(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \ ldr rna, [CTX, #(((round) * 16) + 0 * 4)]; \ ldr rnb, [CTX, #(((round) * 16) + 1 * 4)]; \ eor ra, rna; \ ldr rnc, [CTX, #(((round) * 16) + 2 * 4)]; \ eor rb, rnb; \ ldr rnd, [CTX, #(((round) * 16) + 3 * 4)]; \ eor rc, rnc; \ preload_first_key((round) - 1, rna); \ eor rd, rnd; #define do_decround(next_r, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \ ldr rnb, [CTX, #(((next_r) * 16) + 1 * 4)]; \ \ and RT0, RMASK, 
ra, lsl#2; \ ldr rnc, [CTX, #(((next_r) * 16) + 2 * 4)]; \ and RT1, RMASK, ra, lsr#(8 - 2); \ ldr rnd, [CTX, #(((next_r) * 16) + 3 * 4)]; \ and RT2, RMASK, ra, lsr#(16 - 2); \ ldr RT0, [RTAB, RT0]; \ and ra, RMASK, ra, lsr#(24 - 2); \ \ ldr RT1, [RTAB, RT1]; \ eor rna, rna, RT0; \ ldr RT2, [RTAB, RT2]; \ and RT0, RMASK, rb, lsl#2; \ ldr ra, [RTAB, ra]; \ \ eor rnb, rnb, RT1, ror #24; \ and RT1, RMASK, rb, lsr#(8 - 2); \ eor rnc, rnc, RT2, ror #16; \ and RT2, RMASK, rb, lsr#(16 - 2); \ eor rnd, rnd, ra, ror #8; \ ldr RT0, [RTAB, RT0]; \ and rb, RMASK, rb, lsr#(24 - 2); \ \ ldr RT1, [RTAB, RT1]; \ eor rnb, rnb, RT0; \ ldr RT2, [RTAB, RT2]; \ and RT0, RMASK, rc, lsl#2; \ ldr rb, [RTAB, rb]; \ \ eor rnc, rnc, RT1, ror #24; \ and RT1, RMASK, rc, lsr#(8 - 2); \ eor rnd, rnd, RT2, ror #16; \ and RT2, RMASK, rc, lsr#(16 - 2); \ eor rna, rna, rb, ror #8; \ ldr RT0, [RTAB, RT0]; \ and rc, RMASK, rc, lsr#(24 - 2); \ \ ldr RT1, [RTAB, RT1]; \ eor rnc, rnc, RT0; \ ldr RT2, [RTAB, RT2]; \ and RT0, RMASK, rd, lsl#2; \ ldr rc, [RTAB, rc]; \ \ eor rnd, rnd, RT1, ror #24; \ and RT1, RMASK, rd, lsr#(8 - 2); \ eor rna, rna, RT2, ror #16; \ and RT2, RMASK, rd, lsr#(16 - 2); \ eor rnb, rnb, rc, ror #8; \ ldr RT0, [RTAB, RT0]; \ and rd, RMASK, rd, lsr#(24 - 2); \ \ ldr RT1, [RTAB, RT1]; \ eor rnd, rnd, RT0; \ ldr RT2, [RTAB, RT2]; \ eor rna, rna, RT1, ror #24; \ ldr rd, [RTAB, rd]; \ \ eor rnb, rnb, RT2, ror #16; \ preload_key((next_r) - 1, ra); \ eor rnc, rnc, rd, ror #8; #define do_lastdecround(ra, rb, rc, rd, rna, rnb, rnc, rnd) \ and RT0, RMASK, ra; \ and RT1, RMASK, ra, lsr#8; \ and RT2, RMASK, ra, lsr#16; \ ldrb rna, [RTAB, RT0]; \ mov ra, ra, lsr#24; \ ldrb rnb, [RTAB, RT1]; \ and RT0, RMASK, rb; \ ldrb rnc, [RTAB, RT2]; \ mov rnb, rnb, ror #24; \ ldrb rnd, [RTAB, ra]; \ and RT1, RMASK, rb, lsr#8; \ mov rnc, rnc, ror #16; \ and RT2, RMASK, rb, lsr#16; \ mov rnd, rnd, ror #8; \ ldrb RT0, [RTAB, RT0]; \ mov rb, rb, lsr#24; \ ldrb RT1, [RTAB, RT1]; \ \ orr rnb, rnb, RT0; \ ldrb RT2, [RTAB, RT2]; \ and RT0, RMASK, rc; \ ldrb rb, [RTAB, rb]; \ orr rnc, rnc, RT1, ror #24; \ and RT1, RMASK, rc, lsr#8; \ orr rnd, rnd, RT2, ror #16; \ and RT2, RMASK, rc, lsr#16; \ orr rna, rna, rb, ror #8; \ ldrb RT0, [RTAB, RT0]; \ mov rc, rc, lsr#24; \ ldrb RT1, [RTAB, RT1]; \ \ orr rnc, rnc, RT0; \ ldrb RT2, [RTAB, RT2]; \ and RT0, RMASK, rd; \ ldrb rc, [RTAB, rc]; \ orr rnd, rnd, RT1, ror #24; \ and RT1, RMASK, rd, lsr#8; \ orr rna, rna, RT2, ror #16; \ ldrb RT0, [RTAB, RT0]; \ and RT2, RMASK, rd, lsr#16; \ ldrb RT1, [RTAB, RT1]; \ orr rnb, rnb, rc, ror #8; \ ldrb RT2, [RTAB, RT2]; \ mov rd, rd, lsr#24; \ ldrb rd, [RTAB, rd]; \ \ orr rnd, rnd, RT0; \ orr rna, rna, RT1, ror #24; \ orr rnb, rnb, RT2, ror #16; \ orr rnc, rnc, rd, ror #8; #define firstdecround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \ addroundkey_dec(((round) + 1), ra, rb, rc, rd, rna, rnb, rnc, rnd); \ do_decround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_first_key); #define decround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \ do_decround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key); #define set_last_round_rmask(_, __) \ mov RMASK, #0xff; #define lastdecround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \ add RTAB, #(4 * 256); \ do_lastdecround(ra, rb, rc, rd, rna, rnb, rnc, rnd); \ addroundkey(rna, rnb, rnc, rnd, ra, rb, rc, rd, dummy); .align 3 .globl _gcry_aes_arm_decrypt_block .type _gcry_aes_arm_decrypt_block,%function; _gcry_aes_arm_decrypt_block: /* input: - * %r0: keysched, CTX - * %r1: dst - * %r2: src - * %r3: number 
of rounds.. 10, 12 or 14 - * %st+0: decryption table + * r0: keysched, CTX + * r1: dst + * r2: src + * r3: number of rounds.. 10, 12 or 14 + * st+0: decryption table */ - push {%r4-%r11, %ip, %lr}; + push {r4-r11, ip, lr}; /* read input block */ /* test if src is unaligned */ - tst %r2, #3; + tst r2, #3; beq 1f; /* unaligned load */ - ldr_unaligned_le(RA, %r2, 0, RNA); - ldr_unaligned_le(RB, %r2, 4, RNB); - ldr_unaligned_le(RC, %r2, 8, RNA); - ldr_unaligned_le(RD, %r2, 12, RNB); + ldr_unaligned_le(RA, r2, 0, RNA); + ldr_unaligned_le(RB, r2, 4, RNB); + ldr_unaligned_le(RC, r2, 8, RNA); + ldr_unaligned_le(RD, r2, 12, RNB); b 2f; .ltorg 1: /* aligned load */ - ldm %r2, {RA, RB, RC, RD}; + ldm r2, {RA, RB, RC, RD}; #ifndef __ARMEL__ rev RA, RA; rev RB, RB; rev RC, RC; rev RD, RD; #endif 2: - ldr RTAB, [%sp, #40]; - sub %sp, #16; + ldr RTAB, [sp, #40]; + sub sp, #16; mov RMASK, #0xff; - str %r1, [%sp, #4]; /* dst */ + str r1, [sp, #4]; /* dst */ mov RMASK, RMASK, lsl#2; /* byte mask */ - cmp %r3, #12; + cmp r3, #12; bge .Ldec_256; firstdecround(9, RA, RB, RC, RD, RNA, RNB, RNC, RND); .Ldec_tail: decround(8, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key); decround(7, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key); decround(6, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key); decround(5, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key); decround(4, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key); decround(3, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key); decround(2, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key); decround(1, RA, RB, RC, RD, RNA, RNB, RNC, RND, set_last_round_rmask); lastdecround(0, RNA, RNB, RNC, RND, RA, RB, RC, RD); - ldr RT0, [%sp, #4]; /* dst */ - add %sp, #16; + ldr RT0, [sp, #4]; /* dst */ + add sp, #16; /* store output block */ /* test if dst is unaligned */ tst RT0, #3; beq 1f; /* unaligned store */ str_unaligned_le(RA, RT0, 0, RNA, RNB); str_unaligned_le(RB, RT0, 4, RNA, RNB); str_unaligned_le(RC, RT0, 8, RNA, RNB); str_unaligned_le(RD, RT0, 12, RNA, RNB); b 2f; .ltorg 1: /* aligned store */ #ifndef __ARMEL__ rev RA, RA; rev RB, RB; rev RC, RC; rev RD, RD; #endif /* write output block */ stm RT0, {RA, RB, RC, RD}; 2: mov r0, #(10 * 4); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .ltorg .Ldec_256: beq .Ldec_192; firstdecround(13, RA, RB, RC, RD, RNA, RNB, RNC, RND); decround(12, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key); decround(11, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key); decround(10, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key); decround(9, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key); b .Ldec_tail; .ltorg .Ldec_192: firstdecround(11, RA, RB, RC, RD, RNA, RNB, RNC, RND); decround(10, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key); decround(9, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key); b .Ldec_tail; .size _gcry_aes_arm_encrypt_block,.-_gcry_aes_arm_encrypt_block; #endif /*HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS*/ #endif /*__ARMEL__ */ diff --git a/cipher/rijndael-armv8-aarch32-ce.S b/cipher/rijndael-armv8-aarch32-ce.S index 6208652b..3c4149b3 100644 --- a/cipher/rijndael-armv8-aarch32-ce.S +++ b/cipher/rijndael-armv8-aarch32-ce.S @@ -1,2134 +1,2134 @@ /* rijndael-armv8-aarch32-ce.S - ARMv8/CE accelerated AES * Copyright (C) 2016 Jussi Kivilinna * * This file is part of Libgcrypt. 
* * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #include #if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO) .syntax unified .arch armv8-a .fpu crypto-neon-fp-armv8 .arm .text #ifdef __PIC__ # define GET_DATA_POINTER(reg, name, rtmp) \ ldr reg, 1f; \ ldr rtmp, 2f; \ b 3f; \ 1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \ 2: .word name(GOT); \ 3: add reg, pc, reg; \ ldr reg, [reg, rtmp]; #else # define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name #endif /* AES macros */ #define aes_preload_keys(keysched, rekeysched) \ vldmia keysched!, {q5-q7}; \ mov rekeysched, keysched; \ vldmialo keysched!, {q8-q15}; /* 128-bit */ \ addeq keysched, #(2*16); \ vldmiaeq keysched!, {q10-q15}; /* 192-bit */ \ addhi keysched, #(4*16); \ vldmiahi keysched!, {q12-q15}; /* 256-bit */ \ #define do_aes_one128(ed, mcimc, qo, qb) \ aes##ed.8 qb, q5; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q6; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q7; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q8; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q9; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q10; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q11; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q12; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q13; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q14; \ veor qo, qb, q15; #define do_aes_one128re(ed, mcimc, qo, qb, keysched, rekeysched) \ vldm rekeysched, {q8-q9}; \ do_aes_one128(ed, mcimc, qo, qb); #define do_aes_one192(ed, mcimc, qo, qb, keysched, rekeysched) \ vldm rekeysched!, {q8}; \ aes##ed.8 qb, q5; \ aes##mcimc.8 qb, qb; \ vldm rekeysched, {q9}; \ aes##ed.8 qb, q6; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q7; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q8; \ aes##mcimc.8 qb, qb; \ vldmia keysched!, {q8}; \ aes##ed.8 qb, q9; \ aes##mcimc.8 qb, qb; \ sub rekeysched, #(1*16); \ aes##ed.8 qb, q10; \ aes##mcimc.8 qb, qb; \ vldm keysched, {q9}; \ aes##ed.8 qb, q11; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q12; \ aes##mcimc.8 qb, qb; \ sub keysched, #16; \ aes##ed.8 qb, q13; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q14; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q15; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q8; \ veor qo, qb, q9; \ #define do_aes_one256(ed, mcimc, qo, qb, keysched, rekeysched) \ vldmia rekeysched!, {q8}; \ aes##ed.8 qb, q5; \ aes##mcimc.8 qb, qb; \ vldmia rekeysched!, {q9}; \ aes##ed.8 qb, q6; \ aes##mcimc.8 qb, qb; \ vldmia rekeysched!, {q10}; \ aes##ed.8 qb, q7; \ aes##mcimc.8 qb, qb; \ vldm rekeysched, {q11}; \ aes##ed.8 qb, q8; \ aes##mcimc.8 qb, qb; \ vldmia keysched!, {q8}; \ aes##ed.8 qb, q9; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q10; \ aes##mcimc.8 qb, qb; \ vldmia keysched!, {q9}; \ aes##ed.8 qb, q11; \ aes##mcimc.8 qb, qb; \ sub rekeysched, #(3*16); \ aes##ed.8 qb, q12; \ aes##mcimc.8 qb, qb; \ vldmia keysched!, {q10}; \ aes##ed.8 qb, q13; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q14; \ aes##mcimc.8 qb, qb; \ vldm keysched, {q11}; \ aes##ed.8 qb, q15; \ aes##mcimc.8 
qb, qb; \ aes##ed.8 qb, q8; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q9; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q10; \ veor qo, qb, q11; \ sub keysched, #(3*16); \ #define aes_round_4(ed, mcimc, b0, b1, b2, b3, key) \ aes##ed.8 b0, key; \ aes##mcimc.8 b0, b0; \ aes##ed.8 b1, key; \ aes##mcimc.8 b1, b1; \ aes##ed.8 b2, key; \ aes##mcimc.8 b2, b2; \ aes##ed.8 b3, key; \ aes##mcimc.8 b3, b3; #define do_aes_4_128(ed, mcimc, b0, b1, b2, b3) \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q5); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q6); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q7); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q8); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q9); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q10); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q11); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q12); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q13); \ aes##ed.8 b0, q14; \ veor b0, b0, q15; \ aes##ed.8 b1, q14; \ veor b1, b1, q15; \ aes##ed.8 b2, q14; \ veor b2, b2, q15; \ aes##ed.8 b3, q14; \ veor b3, b3, q15; #define do_aes_4_128re(ed, mcimc, b0, b1, b2, b3, keysched, rekeysched) \ vldm rekeysched, {q8-q9}; \ do_aes_4_128(ed, mcimc, b0, b1, b2, b3); #define do_aes_4_192(ed, mcimc, b0, b1, b2, b3, keysched, rekeysched) \ vldm rekeysched!, {q8}; \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q5); \ vldm rekeysched, {q9}; \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q6); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q7); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q8); \ vldmia keysched!, {q8}; \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q9); \ sub rekeysched, #(1*16); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q10); \ vldm keysched, {q9}; \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q11); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q12); \ sub keysched, #16; \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q13); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q14); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q15); \ aes##ed.8 b0, q8; \ veor b0, b0, q9; \ aes##ed.8 b1, q8; \ veor b1, b1, q9; \ aes##ed.8 b2, q8; \ veor b2, b2, q9; \ aes##ed.8 b3, q8; \ veor b3, b3, q9; #define do_aes_4_256(ed, mcimc, b0, b1, b2, b3, keysched, rekeysched) \ vldmia rekeysched!, {q8}; \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q5); \ vldmia rekeysched!, {q9}; \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q6); \ vldmia rekeysched!, {q10}; \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q7); \ vldm rekeysched, {q11}; \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q8); \ vldmia keysched!, {q8}; \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q9); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q10); \ vldmia keysched!, {q9}; \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q11); \ sub rekeysched, #(3*16); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q12); \ vldmia keysched!, {q10}; \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q13); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q14); \ vldm keysched, {q11}; \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q15); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q8); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q9); \ sub keysched, #(3*16); \ aes##ed.8 b0, q10; \ veor b0, b0, q11; \ aes##ed.8 b1, q10; \ veor b1, b1, q11; \ aes##ed.8 b2, q10; \ veor b2, b2, q11; \ aes##ed.8 b3, q10; \ veor b3, b3, q11; /* Other functional macros */ #define CLEAR_REG(reg) vmov.i8 reg, #0; /* * unsigned int _gcry_aes_enc_armv8_ce(void *keysched, byte *dst, * const byte *src, * unsigned int nrounds); */ .align 3 .globl _gcry_aes_enc_armv8_ce .type _gcry_aes_enc_armv8_ce,%function; _gcry_aes_enc_armv8_ce: /* input: * r0: keysched * r1: dst * r2: src * r3: nrounds */ vldmia r0!, {q1-q3} /* load 3 round keys */ cmp r3, 
#12 vld1.8 {q0}, [r2] bhi .Lenc1_256 beq .Lenc1_192 .Lenc1_128: .Lenc1_tail: vldmia r0, {q8-q15} /* load 8 round keys */ aese.8 q0, q1 aesmc.8 q0, q0 CLEAR_REG(q1) aese.8 q0, q2 aesmc.8 q0, q0 CLEAR_REG(q2) aese.8 q0, q3 aesmc.8 q0, q0 CLEAR_REG(q3) aese.8 q0, q8 aesmc.8 q0, q0 CLEAR_REG(q8) aese.8 q0, q9 aesmc.8 q0, q0 CLEAR_REG(q9) aese.8 q0, q10 aesmc.8 q0, q0 CLEAR_REG(q10) aese.8 q0, q11 aesmc.8 q0, q0 CLEAR_REG(q11) aese.8 q0, q12 aesmc.8 q0, q0 CLEAR_REG(q12) aese.8 q0, q13 aesmc.8 q0, q0 CLEAR_REG(q13) aese.8 q0, q14 veor q0, q15 CLEAR_REG(q14) CLEAR_REG(q15) vst1.8 {q0}, [r1] CLEAR_REG(q0) mov r0, #0 bx lr .Lenc1_192: aese.8 q0, q1 aesmc.8 q0, q0 vmov q1, q3 aese.8 q0, q2 aesmc.8 q0, q0 vldm r0!, {q2-q3} /* load 3 round keys */ b .Lenc1_tail .Lenc1_256: vldm r0!, {q15} /* load 1 round key */ aese.8 q0, q1 aesmc.8 q0, q0 aese.8 q0, q2 aesmc.8 q0, q0 aese.8 q0, q3 aesmc.8 q0, q0 vldm r0!, {q1-q3} /* load 3 round keys */ aese.8 q0, q15 aesmc.8 q0, q0 b .Lenc1_tail .size _gcry_aes_enc_armv8_ce,.-_gcry_aes_enc_armv8_ce; /* * unsigned int _gcry_aes_dec_armv8_ce(void *keysched, byte *dst, * const byte *src, * unsigned int nrounds); */ .align 3 .globl _gcry_aes_dec_armv8_ce .type _gcry_aes_dec_armv8_ce,%function; _gcry_aes_dec_armv8_ce: /* input: * r0: keysched * r1: dst * r2: src * r3: nrounds */ vldmia r0!, {q1-q3} /* load 3 round keys */ cmp r3, #12 vld1.8 {q0}, [r2] bhi .Ldec1_256 beq .Ldec1_192 .Ldec1_128: .Ldec1_tail: vldmia r0, {q8-q15} /* load 8 round keys */ aesd.8 q0, q1 aesimc.8 q0, q0 CLEAR_REG(q1) aesd.8 q0, q2 aesimc.8 q0, q0 CLEAR_REG(q2) aesd.8 q0, q3 aesimc.8 q0, q0 CLEAR_REG(q3) aesd.8 q0, q8 aesimc.8 q0, q0 CLEAR_REG(q8) aesd.8 q0, q9 aesimc.8 q0, q0 CLEAR_REG(q9) aesd.8 q0, q10 aesimc.8 q0, q0 CLEAR_REG(q10) aesd.8 q0, q11 aesimc.8 q0, q0 CLEAR_REG(q11) aesd.8 q0, q12 aesimc.8 q0, q0 CLEAR_REG(q12) aesd.8 q0, q13 aesimc.8 q0, q0 CLEAR_REG(q13) aesd.8 q0, q14 veor q0, q15 CLEAR_REG(q14) CLEAR_REG(q15) vst1.8 {q0}, [r1] CLEAR_REG(q0) mov r0, #0 bx lr .Ldec1_192: aesd.8 q0, q1 aesimc.8 q0, q0 vmov q1, q3 aesd.8 q0, q2 aesimc.8 q0, q0 vldm r0!, {q2-q3} /* load 3 round keys */ b .Ldec1_tail .Ldec1_256: vldm r0!, {q15} /* load 1 round key */ aesd.8 q0, q1 aesimc.8 q0, q0 aesd.8 q0, q2 aesimc.8 q0, q0 aesd.8 q0, q3 aesimc.8 q0, q0 vldm r0!, {q1-q3} /* load 3 round keys */ aesd.8 q0, q15 aesimc.8 q0, q0 b .Ldec1_tail .size _gcry_aes_dec_armv8_ce,.-_gcry_aes_dec_armv8_ce; /* * void _gcry_aes_cbc_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *iv, size_t nblocks, * int cbc_mac, unsigned int nrounds); */ .align 3 .globl _gcry_aes_cbc_enc_armv8_ce .type _gcry_aes_cbc_enc_armv8_ce,%function; _gcry_aes_cbc_enc_armv8_ce: /* input: * r0: keysched * r1: outbuf * r2: inbuf * r3: iv - * %st+0: nblocks => r4 - * %st+4: cbc_mac => r5 - * %st+8: nrounds => r6 + * st+0: nblocks => r4 + * st+4: cbc_mac => r5 + * st+8: nrounds => r6 */ push {r4-r6,lr} /* 4*4 = 16b */ ldr r4, [sp, #(16+0)] ldr r5, [sp, #(16+4)] cmp r4, #0 ldr r6, [sp, #(16+8)] beq .Lcbc_enc_skip cmp r5, #0 vpush {q4-q7} moveq r5, #16 movne r5, #0 cmp r6, #12 vld1.8 {q1}, [r3] /* load IV */ aes_preload_keys(r0, lr); beq .Lcbc_enc_loop192 bhi .Lcbc_enc_loop256 #define CBC_ENC(bits, ...) 
\ .Lcbc_enc_loop##bits: \ vld1.8 {q0}, [r2]!; /* load plaintext */ \ veor q1, q0, q1; \ subs r4, r4, #1; \ \ do_aes_one##bits(e, mc, q1, q1, ##__VA_ARGS__); \ \ vst1.8 {q1}, [r1], r5; /* store ciphertext */ \ \ bne .Lcbc_enc_loop##bits; \ b .Lcbc_enc_done; CBC_ENC(128) CBC_ENC(192, r0, lr) CBC_ENC(256, r0, lr) #undef CBC_ENC .Lcbc_enc_done: vst1.8 {q1}, [r3] /* store IV */ CLEAR_REG(q0) CLEAR_REG(q1) CLEAR_REG(q2) CLEAR_REG(q3) CLEAR_REG(q8) CLEAR_REG(q9) vpop {q4-q7} CLEAR_REG(q10) CLEAR_REG(q11) CLEAR_REG(q12) CLEAR_REG(q13) CLEAR_REG(q14) .Lcbc_enc_skip: pop {r4-r6,pc} .size _gcry_aes_cbc_enc_armv8_ce,.-_gcry_aes_cbc_enc_armv8_ce; /* * void _gcry_aes_cbc_dec_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *iv, unsigned int nrounds); */ .align 3 .globl _gcry_aes_cbc_dec_armv8_ce .type _gcry_aes_cbc_dec_armv8_ce,%function; _gcry_aes_cbc_dec_armv8_ce: /* input: * r0: keysched * r1: outbuf * r2: inbuf * r3: iv - * %st+0: nblocks => r4 - * %st+4: nrounds => r5 + * st+0: nblocks => r4 + * st+4: nrounds => r5 */ push {r4-r6,lr} /* 4*4 = 16b */ ldr r4, [sp, #(16+0)] ldr r5, [sp, #(16+4)] cmp r4, #0 beq .Lcbc_dec_skip vpush {q4-q7} cmp r5, #12 vld1.8 {q0}, [r3] /* load IV */ aes_preload_keys(r0, r6); beq .Lcbc_dec_entry_192 bhi .Lcbc_dec_entry_256 #define CBC_DEC(bits, ...) \ .Lcbc_dec_entry_##bits: \ cmp r4, #4; \ blo .Lcbc_dec_loop_##bits; \ \ .Lcbc_dec_loop4_##bits: \ \ vld1.8 {q1-q2}, [r2]!; /* load ciphertext */ \ sub r4, r4, #4; \ vld1.8 {q3-q4}, [r2]; /* load ciphertext */ \ cmp r4, #4; \ sub r2, #32; \ \ do_aes_4_##bits(d, imc, q1, q2, q3, q4, ##__VA_ARGS__); \ \ veor q1, q1, q0; \ vld1.8 {q0}, [r2]!; /* load next IV */ \ veor q2, q2, q0; \ vld1.8 {q0}, [r2]!; /* load next IV */ \ vst1.8 {q1-q2}, [r1]!; /* store plaintext */ \ veor q3, q3, q0; \ vld1.8 {q0}, [r2]!; /* load next IV */ \ veor q4, q4, q0; \ vld1.8 {q0}, [r2]!; /* load next IV */ \ vst1.8 {q3-q4}, [r1]!; /* store plaintext */ \ \ bhs .Lcbc_dec_loop4_##bits; \ cmp r4, #0; \ beq .Lcbc_dec_done; \ \ .Lcbc_dec_loop_##bits: \ vld1.8 {q1}, [r2]!; /* load ciphertext */ \ subs r4, r4, #1; \ vmov q2, q1; \ \ do_aes_one##bits(d, imc, q1, q1, ##__VA_ARGS__); \ \ veor q1, q1, q0; \ vmov q0, q2; \ vst1.8 {q1}, [r1]!; /* store plaintext */ \ \ bne .Lcbc_dec_loop_##bits; \ b .Lcbc_dec_done; CBC_DEC(128) CBC_DEC(192, r0, r6) CBC_DEC(256, r0, r6) #undef CBC_DEC .Lcbc_dec_done: vst1.8 {q0}, [r3] /* store IV */ CLEAR_REG(q0) CLEAR_REG(q1) CLEAR_REG(q2) CLEAR_REG(q3) CLEAR_REG(q8) CLEAR_REG(q9) vpop {q4-q7} CLEAR_REG(q10) CLEAR_REG(q11) CLEAR_REG(q12) CLEAR_REG(q13) CLEAR_REG(q14) .Lcbc_dec_skip: pop {r4-r6,pc} .size _gcry_aes_cbc_dec_armv8_ce,.-_gcry_aes_cbc_dec_armv8_ce; /* * void _gcry_aes_ecb_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * size_t nblocks, * unsigned int nrounds); */ .align 3 .globl _gcry_aes_ecb_enc_armv8_ce .type _gcry_aes_ecb_enc_armv8_ce,%function; _gcry_aes_ecb_enc_armv8_ce: /* input: * r0: keysched * r1: outbuf * r2: inbuf * r3: nblocks - * %st+0: nrounds => r4 + * st+0: nrounds => r4 */ push {r4-r6,lr} /* 4*4 = 16b */ cmp r3, #0 beq .Lecb_enc_skip ldr r4, [sp, #(16+0)] vpush {q4-q7} cmp r4, #12 aes_preload_keys(r0, lr); beq .Lecb_entry_192e bhi .Lecb_entry_256e #define ECB_CRYPT(bits, e_d, mc_imc, ...) 
\ .Lecb_entry_##bits##e_d: \ cmp r3, #4; \ blo .Lecb_loop_##bits##e_d; \ \ .Lecb_loop4_##bits##e_d: \ vld1.8 {q1-q2}, [r2]!; /* load plaintext */ \ sub r3, r3, #4; \ vld1.8 {q3-q4}, [r2]!; /* load plaintext */ \ cmp r3, #4; \ \ do_aes_4_##bits(e_d, mc_imc, q1, q2, q3, q4, ##__VA_ARGS__); \ \ vst1.8 {q1-q2}, [r1]!; /* store ciphertext */ \ vst1.8 {q3-q4}, [r1]!; /* store ciphertext */ \ \ bhs .Lecb_loop4_##bits##e_d; \ cmp r3, #0; \ beq .Lecb_done_##e_d; \ \ .Lecb_loop_##bits##e_d: \ vld1.8 {q1}, [r2]!; /* load ciphertext */ \ subs r3, r3, #1; \ \ do_aes_one##bits(e_d, mc_imc, q1, q1, ##__VA_ARGS__); \ \ vst1.8 {q1}, [r1]!; /* store plaintext */ \ bne .Lecb_loop_##bits##e_d; \ b .Lecb_done_##e_d; ECB_CRYPT(128, e, mc) ECB_CRYPT(192, e, mc, r0, lr) ECB_CRYPT(256, e, mc, r0, lr) .Lecb_done_e: CLEAR_REG(q0) CLEAR_REG(q1) CLEAR_REG(q2) CLEAR_REG(q3) CLEAR_REG(q8) CLEAR_REG(q9) vpop {q4-q7} CLEAR_REG(q10) CLEAR_REG(q11) CLEAR_REG(q12) CLEAR_REG(q13) CLEAR_REG(q14) .Lecb_enc_skip: pop {r4-r6,pc} .size _gcry_aes_ecb_enc_armv8_ce,.-_gcry_aes_ecb_enc_armv8_ce; /* * void _gcry_aes_ecb_dec_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * size_t nblocks, * unsigned int nrounds); */ .align 3 .globl _gcry_aes_ecb_dec_armv8_ce .type _gcry_aes_ecb_dec_armv8_ce,%function; _gcry_aes_ecb_dec_armv8_ce: /* input: * r0: keysched * r1: outbuf * r2: inbuf * r3: nblocks - * %st+0: nrounds => r4 + * st+0: nrounds => r4 */ push {r4-r6,lr} /* 4*4 = 16b */ cmp r3, #0 beq .Lecb_enc_skip ldr r4, [sp, #(16+0)] vpush {q4-q7} cmp r4, #12 aes_preload_keys(r0, lr); beq .Lecb_entry_192d bhi .Lecb_entry_256d ECB_CRYPT(128, d, imc) ECB_CRYPT(192, d, imc, r0, lr) ECB_CRYPT(256, d, imc, r0, lr) #undef ECB_CRYPT .Lecb_done_d: CLEAR_REG(q0) CLEAR_REG(q1) CLEAR_REG(q2) CLEAR_REG(q3) CLEAR_REG(q8) CLEAR_REG(q9) vpop {q4-q7} CLEAR_REG(q10) CLEAR_REG(q11) CLEAR_REG(q12) CLEAR_REG(q13) CLEAR_REG(q14) .Lecb_dec_skip: pop {r4-r6,pc} .size _gcry_aes_ecb_dec_armv8_ce,.-_gcry_aes_ecb_dec_armv8_ce; /* * void _gcry_aes_cfb_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *iv, unsigned int nrounds); */ .align 3 .globl _gcry_aes_cfb_enc_armv8_ce .type _gcry_aes_cfb_enc_armv8_ce,%function; _gcry_aes_cfb_enc_armv8_ce: /* input: * r0: keysched * r1: outbuf * r2: inbuf * r3: iv - * %st+0: nblocks => r4 - * %st+4: nrounds => r5 + * st+0: nblocks => r4 + * st+4: nrounds => r5 */ push {r4-r6,lr} /* 4*4 = 16b */ ldr r4, [sp, #(16+0)] ldr r5, [sp, #(16+4)] cmp r4, #0 beq .Lcfb_enc_skip vpush {q4-q7} cmp r5, #12 vld1.8 {q0}, [r3] /* load IV */ aes_preload_keys(r0, r6); beq .Lcfb_enc_entry_192 bhi .Lcfb_enc_entry_256 #define CFB_ENC(bits, ...) 
\ .Lcfb_enc_entry_##bits: \ .Lcfb_enc_loop_##bits: \ vld1.8 {q1}, [r2]!; /* load plaintext */ \ subs r4, r4, #1; \ \ do_aes_one##bits(e, mc, q0, q0, ##__VA_ARGS__); \ \ veor q0, q1, q0; \ vst1.8 {q0}, [r1]!; /* store ciphertext */ \ \ bne .Lcfb_enc_loop_##bits; \ b .Lcfb_enc_done; CFB_ENC(128) CFB_ENC(192, r0, r6) CFB_ENC(256, r0, r6) #undef CFB_ENC .Lcfb_enc_done: vst1.8 {q0}, [r3] /* store IV */ CLEAR_REG(q0) CLEAR_REG(q1) CLEAR_REG(q2) CLEAR_REG(q3) CLEAR_REG(q8) CLEAR_REG(q9) vpop {q4-q7} CLEAR_REG(q10) CLEAR_REG(q11) CLEAR_REG(q12) CLEAR_REG(q13) CLEAR_REG(q14) .Lcfb_enc_skip: pop {r4-r6,pc} .size _gcry_aes_cfb_enc_armv8_ce,.-_gcry_aes_cfb_enc_armv8_ce; /* * void _gcry_aes_cfb_dec_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *iv, unsigned int nrounds); */ .align 3 .globl _gcry_aes_cfb_dec_armv8_ce .type _gcry_aes_cfb_dec_armv8_ce,%function; _gcry_aes_cfb_dec_armv8_ce: /* input: * r0: keysched * r1: outbuf * r2: inbuf * r3: iv - * %st+0: nblocks => r4 - * %st+4: nrounds => r5 + * st+0: nblocks => r4 + * st+4: nrounds => r5 */ push {r4-r6,lr} /* 4*4 = 16b */ ldr r4, [sp, #(16+0)] ldr r5, [sp, #(16+4)] cmp r4, #0 beq .Lcfb_dec_skip vpush {q4-q7} cmp r5, #12 vld1.8 {q0}, [r3] /* load IV */ aes_preload_keys(r0, r6); beq .Lcfb_dec_entry_192 bhi .Lcfb_dec_entry_256 #define CFB_DEC(bits, ...) \ .Lcfb_dec_entry_##bits: \ cmp r4, #4; \ blo .Lcfb_dec_loop_##bits; \ \ .Lcfb_dec_loop4_##bits: \ \ vld1.8 {q2-q3}, [r2]!; /* load ciphertext */ \ vmov q1, q0; \ sub r4, r4, #4; \ vld1.8 {q4}, [r2]; /* load ciphertext */ \ sub r2, #32; \ cmp r4, #4; \ \ do_aes_4_##bits(e, mc, q1, q2, q3, q4, ##__VA_ARGS__); \ \ vld1.8 {q0}, [r2]!; /* load ciphertext */ \ veor q1, q1, q0; \ vld1.8 {q0}, [r2]!; /* load ciphertext */ \ veor q2, q2, q0; \ vst1.8 {q1-q2}, [r1]!; /* store plaintext */ \ vld1.8 {q0}, [r2]!; \ veor q3, q3, q0; \ vld1.8 {q0}, [r2]!; /* load next IV / ciphertext */ \ veor q4, q4, q0; \ vst1.8 {q3-q4}, [r1]!; /* store plaintext */ \ \ bhs .Lcfb_dec_loop4_##bits; \ cmp r4, #0; \ beq .Lcfb_dec_done; \ \ .Lcfb_dec_loop_##bits: \ \ vld1.8 {q1}, [r2]!; /* load ciphertext */ \ \ subs r4, r4, #1; \ \ do_aes_one##bits(e, mc, q0, q0, ##__VA_ARGS__); \ \ veor q2, q1, q0; \ vmov q0, q1; \ vst1.8 {q2}, [r1]!; /* store plaintext */ \ \ bne .Lcfb_dec_loop_##bits; \ b .Lcfb_dec_done; CFB_DEC(128) CFB_DEC(192, r0, r6) CFB_DEC(256, r0, r6) #undef CFB_DEC .Lcfb_dec_done: vst1.8 {q0}, [r3] /* store IV */ CLEAR_REG(q0) CLEAR_REG(q1) CLEAR_REG(q2) CLEAR_REG(q3) CLEAR_REG(q8) CLEAR_REG(q9) vpop {q4-q7} CLEAR_REG(q10) CLEAR_REG(q11) CLEAR_REG(q12) CLEAR_REG(q13) CLEAR_REG(q14) .Lcfb_dec_skip: pop {r4-r6,pc} .size _gcry_aes_cfb_dec_armv8_ce,.-_gcry_aes_cfb_dec_armv8_ce; /* * void _gcry_aes_ctr_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *iv, unsigned int nrounds); */ .align 3 .globl _gcry_aes_ctr_enc_armv8_ce .type _gcry_aes_ctr_enc_armv8_ce,%function; _gcry_aes_ctr_enc_armv8_ce: /* input: * r0: keysched * r1: outbuf * r2: inbuf * r3: iv - * %st+0: nblocks => r4 - * %st+4: nrounds => r5 + * st+0: nblocks => r4 + * st+4: nrounds => r5 */ vpush {q4-q7} push {r4-r12,lr} /* 4*16 + 4*10 = 104b */ ldr r4, [sp, #(104+0)] ldr r5, [sp, #(104+4)] cmp r4, #0 beq .Lctr_enc_skip cmp r5, #12 ldm r3, {r7-r10} vld1.8 {q0}, [r3] /* load IV */ rev r7, r7 rev r8, r8 rev r9, r9 rev r10, r10 aes_preload_keys(r0, r6); beq .Lctr_enc_entry_192 bhi .Lctr_enc_entry_256 #define CTR_ENC(bits, ...) 
\ .Lctr_enc_entry_##bits: \ cmp r4, #4; \ blo .Lctr_enc_loop_##bits; \ \ .Lctr_enc_loop4_##bits: \ cmp r10, #0xfffffffc; \ sub r4, r4, #4; \ blo .Lctr_enc_loop4_##bits##_nocarry; \ cmp r9, #0xffffffff; \ bne .Lctr_enc_loop4_##bits##_nocarry; \ \ adds r10, #1; \ vmov q1, q0; \ blcs .Lctr_overflow_one; \ rev r11, r10; \ vmov.32 d1[1], r11; \ \ adds r10, #1; \ vmov q2, q0; \ blcs .Lctr_overflow_one; \ rev r11, r10; \ vmov.32 d1[1], r11; \ \ adds r10, #1; \ vmov q3, q0; \ blcs .Lctr_overflow_one; \ rev r11, r10; \ vmov.32 d1[1], r11; \ \ adds r10, #1; \ vmov q4, q0; \ blcs .Lctr_overflow_one; \ rev r11, r10; \ vmov.32 d1[1], r11; \ \ b .Lctr_enc_loop4_##bits##_store_ctr; \ \ .Lctr_enc_loop4_##bits##_nocarry: \ \ veor q2, q2; \ vrev64.8 q1, q0; \ vceq.u32 d5, d5; \ vadd.u64 q3, q2, q2; \ vadd.u64 q4, q3, q2; \ vadd.u64 q0, q3, q3; \ vsub.u64 q2, q1, q2; \ vsub.u64 q3, q1, q3; \ vsub.u64 q4, q1, q4; \ vsub.u64 q0, q1, q0; \ vrev64.8 q1, q1; \ vrev64.8 q2, q2; \ vrev64.8 q3, q3; \ vrev64.8 q0, q0; \ vrev64.8 q4, q4; \ add r10, #4; \ \ .Lctr_enc_loop4_##bits##_store_ctr: \ \ vst1.8 {q0}, [r3]; \ cmp r4, #4; \ vld1.8 {q0}, [r2]!; /* load ciphertext */ \ \ do_aes_4_##bits(e, mc, q1, q2, q3, q4, ##__VA_ARGS__); \ \ veor q1, q1, q0; \ vld1.8 {q0}, [r2]!; /* load ciphertext */ \ vst1.8 {q1}, [r1]!; /* store plaintext */ \ vld1.8 {q1}, [r2]!; /* load ciphertext */ \ veor q2, q2, q0; \ veor q3, q3, q1; \ vld1.8 {q0}, [r2]!; /* load ciphertext */ \ vst1.8 {q2}, [r1]!; /* store plaintext */ \ veor q4, q4, q0; \ vld1.8 {q0}, [r3]; /* reload IV */ \ vst1.8 {q3-q4}, [r1]!; /* store plaintext */ \ \ bhs .Lctr_enc_loop4_##bits; \ cmp r4, #0; \ beq .Lctr_enc_done; \ \ .Lctr_enc_loop_##bits: \ \ adds r10, #1; \ vmov q1, q0; \ blcs .Lctr_overflow_one; \ rev r11, r10; \ subs r4, r4, #1; \ vld1.8 {q2}, [r2]!; /* load ciphertext */ \ vmov.32 d1[1], r11; \ \ do_aes_one##bits(e, mc, q1, q1, ##__VA_ARGS__); \ \ veor q1, q2, q1; \ vst1.8 {q1}, [r1]!; /* store plaintext */ \ \ bne .Lctr_enc_loop_##bits; \ b .Lctr_enc_done; CTR_ENC(128) CTR_ENC(192, r0, r6) CTR_ENC(256, r0, r6) #undef CTR_ENC .Lctr_enc_done: vst1.8 {q0}, [r3] /* store IV */ CLEAR_REG(q0) CLEAR_REG(q1) CLEAR_REG(q2) CLEAR_REG(q3) CLEAR_REG(q8) CLEAR_REG(q9) CLEAR_REG(q10) CLEAR_REG(q11) CLEAR_REG(q12) CLEAR_REG(q13) CLEAR_REG(q14) .Lctr_enc_skip: pop {r4-r12,lr} vpop {q4-q7} bx lr .Lctr_overflow_one: adcs r9, #0 adcs r8, #0 adc r7, #0 rev r11, r9 rev r12, r8 vmov.32 d1[0], r11 rev r11, r7 vmov.32 d0[1], r12 vmov.32 d0[0], r11 bx lr .size _gcry_aes_ctr_enc_armv8_ce,.-_gcry_aes_ctr_enc_armv8_ce; /* * void _gcry_aes_ctr32le_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *iv, * unsigned int nrounds); */ .align 3 .globl _gcry_aes_ctr32le_enc_armv8_ce .type _gcry_aes_ctr32le_enc_armv8_ce,%function; _gcry_aes_ctr32le_enc_armv8_ce: /* input: * r0: keysched * r1: outbuf * r2: inbuf * r3: iv - * %st+0: nblocks => r4 - * %st+4: nrounds => r5 + * st+0: nblocks => r4 + * st+4: nrounds => r5 */ vpush {q4-q7} push {r4-r12,lr} /* 4*16 + 4*10 = 104b */ ldr r4, [sp, #(104+0)] ldr r5, [sp, #(104+4)] cmp r4, #0 beq .Lctr32le_enc_skip cmp r5, #12 vld1.8 {q0}, [r3] /* load IV */ aes_preload_keys(r0, r6); beq .Lctr32le_enc_entry_192 bhi .Lctr32le_enc_entry_256 #define CTR_ENC(bits, ...) 
\ .Lctr32le_enc_entry_##bits: \ cmp r4, #4; \ blo .Lctr32le_enc_loop_##bits; \ \ .Lctr32le_enc_loop4_##bits: \ veor q2, q2; \ sub r4, r4, #4; \ vmov.i64 d4, #0xffffffff; /* q2 <= -1:0:0:0 */ \ vmov q1, q0; \ vadd.u32 q3, q2, q2; /* q3 <= -2:0:0:0 */ \ vadd.u32 q0, q3, q3; /* q0 <= -4:0:0:0 */ \ vadd.u32 q4, q3, q2; /* q4 <= -3:0:0:0 */ \ vsub.u32 q0, q1, q0; \ vsub.u32 q2, q1, q2; \ vst1.8 {q0}, [r3]; \ vsub.u32 q3, q1, q3; \ vsub.u32 q4, q1, q4; \ \ cmp r4, #4; \ vld1.8 {q0}, [r2]!; /* load ciphertext */ \ \ do_aes_4_##bits(e, mc, q1, q2, q3, q4, ##__VA_ARGS__); \ \ veor q1, q1, q0; \ vld1.8 {q0}, [r2]!; /* load ciphertext */ \ vst1.8 {q1}, [r1]!; /* store plaintext */ \ vld1.8 {q1}, [r2]!; /* load ciphertext */ \ veor q2, q2, q0; \ veor q3, q3, q1; \ vld1.8 {q0}, [r2]!; /* load ciphertext */ \ vst1.8 {q2}, [r1]!; /* store plaintext */ \ veor q4, q4, q0; \ vld1.8 {q0}, [r3]; /* reload IV */ \ vst1.8 {q3-q4}, [r1]!; /* store plaintext */ \ \ bhs .Lctr32le_enc_loop4_##bits; \ cmp r4, #0; \ beq .Lctr32le_enc_done; \ \ .Lctr32le_enc_loop_##bits: \ \ veor q2, q2; \ vmov q1, q0; \ vmov.i64 d4, #0xffffffff; /* q2 <= -1:0:0:0 */ \ subs r4, r4, #1; \ vsub.u32 q0, q0, q2; \ vld1.8 {q2}, [r2]!; /* load ciphertext */ \ \ do_aes_one##bits(e, mc, q1, q1, ##__VA_ARGS__); \ \ veor q1, q2, q1; \ vst1.8 {q1}, [r1]!; /* store plaintext */ \ \ bne .Lctr32le_enc_loop_##bits; \ b .Lctr32le_enc_done; CTR_ENC(128) CTR_ENC(192, r0, r6) CTR_ENC(256, r0, r6) #undef CTR_ENC .Lctr32le_enc_done: vst1.8 {q0}, [r3] /* store IV */ CLEAR_REG(q0) CLEAR_REG(q1) CLEAR_REG(q2) CLEAR_REG(q3) CLEAR_REG(q8) CLEAR_REG(q9) CLEAR_REG(q10) CLEAR_REG(q11) CLEAR_REG(q12) CLEAR_REG(q13) CLEAR_REG(q14) .Lctr32le_enc_skip: pop {r4-r12,lr} vpop {q4-q7} bx lr .size _gcry_aes_ctr32le_enc_armv8_ce,.-_gcry_aes_ctr32le_enc_armv8_ce; /* * long _gcry_aes_ocb_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *offset, * unsigned char *checksum, * unsigned char *L_table, * size_t nblocks, * unsigned int nrounds, * unsigned int blkn); */ .align 3 .globl _gcry_aes_ocb_enc_armv8_ce .type _gcry_aes_ocb_enc_armv8_ce,%function; _gcry_aes_ocb_enc_armv8_ce: /* input: * r0: keysched * r1: outbuf * r2: inbuf * r3: offset - * %st+0: checksum => r4 - * %st+4: Ls => r5 - * %st+8: nblocks => r6 (0 < nblocks <= 32) - * %st+12: nrounds => r7 - * %st+16: blkn => lr + * st+0: checksum => r4 + * st+4: Ls => r5 + * st+8: nblocks => r6 (0 < nblocks <= 32) + * st+12: nrounds => r7 + * st+16: blkn => lr */ vpush {q4-q7} push {r4-r12,lr} /* 4*16 + 4*10 = 104b */ ldr r7, [sp, #(104+12)] ldr r4, [sp, #(104+0)] ldr r5, [sp, #(104+4)] ldr r6, [sp, #(104+8)] ldr lr, [sp, #(104+16)] cmp r7, #12 vld1.8 {q0}, [r3] /* load offset */ aes_preload_keys(r0, r12); beq .Locb_enc_entry_192 bhi .Locb_enc_entry_256 #define OCB_ENC(bits, ...) 
\ .Locb_enc_entry_##bits: \ cmp r6, #4; \ add lr, #1; \ blo .Locb_enc_loop_##bits; \ \ .Locb_enc_loop4_##bits: \ \ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ /* Checksum_i = Checksum_{i-1} xor P_i */ \ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ \ \ add r9, lr, #1; \ add r10, lr, #2; \ add r11, lr, #3; \ rbit r8, lr; \ add lr, lr, #4; \ rbit r9, r9; \ rbit r10, r10; \ rbit r11, r11; \ clz r8, r8; /* ntz(i+0) */ \ clz r9, r9; /* ntz(i+1) */ \ clz r10, r10; /* ntz(i+2) */ \ clz r11, r11; /* ntz(i+3) */ \ add r8, r5, r8, lsl #4; \ add r9, r5, r9, lsl #4; \ add r10, r5, r10, lsl #4; \ add r11, r5, r11, lsl #4; \ \ sub r6, #4; \ \ vld1.8 {q9}, [r8]; /* load L_{ntz(i+0)} */ \ vld1.8 {q1-q2}, [r2]!; /* load P_i+<0-1> */ \ vld1.8 {q8}, [r4]; /* load Checksum_{i-1} */ \ veor q0, q0, q9; /* Offset_i+0 */ \ vld1.8 {q9}, [r9]; /* load L_{ntz(i+1)} */ \ veor q8, q8, q1; /* Checksum_i+0 */ \ veor q1, q1, q0; /* P_i+0 xor Offset_i+0 */\ vld1.8 {q3-q4}, [r2]!; /* load P_i+<2-3> */ \ vst1.8 {q0}, [r1]!; /* store Offset_i+0 */\ veor q0, q0, q9; /* Offset_i+1 */ \ vld1.8 {q9}, [r10]; /* load L_{ntz(i+2)} */ \ veor q8, q8, q2; /* Checksum_i+1 */ \ veor q2, q2, q0; /* P_i+1 xor Offset_i+1 */\ vst1.8 {q0}, [r1]!; /* store Offset_i+1 */\ veor q0, q0, q9; /* Offset_i+2 */ \ vld1.8 {q9}, [r11]; /* load L_{ntz(i+3)} */ \ veor q8, q8, q3; /* Checksum_i+2 */ \ veor q3, q3, q0; /* P_i+2 xor Offset_i+2 */\ vst1.8 {q0}, [r1]!; /* store Offset_i+2 */\ veor q0, q0, q9; /* Offset_i+3 */ \ veor q8, q8, q4; /* Checksum_i+3 */ \ veor q4, q4, q0; /* P_i+3 xor Offset_i+3 */\ vst1.8 {q0}, [r1]; /* store Offset_i+3 */\ sub r1, #(3*16); \ vst1.8 {q8}, [r4]; /* store Checksum_i+3 */\ \ cmp r6, #4; \ \ do_aes_4_##bits(e, mc, q1, q2, q3, q4, ##__VA_ARGS__); \ \ mov r8, r1; \ vld1.8 {q8-q9}, [r1]!; \ veor q1, q1, q8; \ veor q2, q2, q9; \ vld1.8 {q8-q9}, [r1]!; \ vst1.8 {q1-q2}, [r8]!; \ veor q3, q3, q8; \ veor q4, q4, q9; \ vst1.8 {q3-q4}, [r8]; \ \ bhs .Locb_enc_loop4_##bits; \ cmp r6, #0; \ beq .Locb_enc_done; \ \ .Locb_enc_loop_##bits: \ \ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ /* Checksum_i = Checksum_{i-1} xor P_i */ \ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ \ \ rbit r8, lr; \ add lr, #1; \ clz r8, r8; /* ntz(i) */ \ add r8, r5, r8, lsl #4; \ \ vld1.8 {q1}, [r2]!; /* load plaintext */ \ vld1.8 {q2}, [r8]; /* load L_{ntz(i)} */ \ vld1.8 {q3}, [r4]; /* load checksum */ \ subs r6, #1; \ veor q0, q0, q2; \ veor q3, q3, q1; \ veor q1, q1, q0; \ vst1.8 {q3}, [r4]; /* store checksum */ \ \ do_aes_one##bits(e, mc, q1, q1, ##__VA_ARGS__); \ \ veor q1, q1, q0; \ vst1.8 {q1}, [r1]!; /* store ciphertext */ \ \ bne .Locb_enc_loop_##bits; \ b .Locb_enc_done; OCB_ENC(128re, r0, r12) OCB_ENC(192, r0, r12) OCB_ENC(256, r0, r12) #undef OCB_ENC .Locb_enc_done: vst1.8 {q0}, [r3] /* store offset */ CLEAR_REG(q0) CLEAR_REG(q1) CLEAR_REG(q2) CLEAR_REG(q3) CLEAR_REG(q8) CLEAR_REG(q9) CLEAR_REG(q10) CLEAR_REG(q11) CLEAR_REG(q12) CLEAR_REG(q13) CLEAR_REG(q14) mov r0, #0 pop {r4-r12,lr} vpop {q4-q7} bx lr .size _gcry_aes_ocb_enc_armv8_ce,.-_gcry_aes_ocb_enc_armv8_ce; /* * long _gcry_aes_ocb_dec_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *offset, * unsigned char *checksum, * unsigned char *L_table, * size_t nblocks, * unsigned int nrounds, * unsigned int blkn); */ .align 3 .globl _gcry_aes_ocb_dec_armv8_ce .type _gcry_aes_ocb_dec_armv8_ce,%function; _gcry_aes_ocb_dec_armv8_ce: /* input: * r0: keysched * r1: outbuf * r2: inbuf * r3: offset - * %st+0: checksum => 
r4 - * %st+4: Ls => r5 - * %st+8: nblocks => r6 (0 < nblocks <= 32) - * %st+12: nrounds => r7 - * %st+16: blkn => lr + * st+0: checksum => r4 + * st+4: Ls => r5 + * st+8: nblocks => r6 (0 < nblocks <= 32) + * st+12: nrounds => r7 + * st+16: blkn => lr */ vpush {q4-q7} push {r4-r12,lr} /* 4*16 + 4*10 = 104b */ ldr r7, [sp, #(104+12)] ldr r4, [sp, #(104+0)] ldr r5, [sp, #(104+4)] ldr r6, [sp, #(104+8)] ldr lr, [sp, #(104+16)] cmp r7, #12 vld1.8 {q0}, [r3] /* load offset */ aes_preload_keys(r0, r12); beq .Locb_dec_entry_192 bhi .Locb_dec_entry_256 #define OCB_DEC(bits, ...) \ .Locb_dec_entry_##bits: \ cmp r6, #4; \ add lr, #1; \ blo .Locb_dec_loop_##bits; \ \ .Locb_dec_loop4_##bits: \ \ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ \ /* Checksum_i = Checksum_{i-1} xor P_i */ \ \ add r9, lr, #1; \ add r10, lr, #2; \ add r11, lr, #3; \ rbit r8, lr; \ add lr, lr, #4; \ rbit r9, r9; \ rbit r10, r10; \ rbit r11, r11; \ clz r8, r8; /* ntz(i+0) */ \ clz r9, r9; /* ntz(i+1) */ \ clz r10, r10; /* ntz(i+2) */ \ clz r11, r11; /* ntz(i+3) */ \ add r8, r5, r8, lsl #4; \ add r9, r5, r9, lsl #4; \ add r10, r5, r10, lsl #4; \ add r11, r5, r11, lsl #4; \ \ sub r6, #4; \ \ vld1.8 {q9}, [r8]; /* load L_{ntz(i+0)} */ \ vld1.8 {q1-q2}, [r2]!; /* load P_i+<0-1> */ \ veor q0, q0, q9; /* Offset_i+0 */ \ vld1.8 {q9}, [r9]; /* load L_{ntz(i+1)} */ \ veor q1, q1, q0; /* P_i+0 xor Offset_i+0 */\ vld1.8 {q3-q4}, [r2]!; /* load P_i+<2-3> */ \ vst1.8 {q0}, [r1]!; /* store Offset_i+0 */\ veor q0, q0, q9; /* Offset_i+1 */ \ vld1.8 {q9}, [r10]; /* load L_{ntz(i+2)} */ \ veor q2, q2, q0; /* P_i+1 xor Offset_i+1 */\ vst1.8 {q0}, [r1]!; /* store Offset_i+1 */\ veor q0, q0, q9; /* Offset_i+2 */ \ vld1.8 {q9}, [r11]; /* load L_{ntz(i+3)} */ \ veor q3, q3, q0; /* P_i+2 xor Offset_i+2 */\ vst1.8 {q0}, [r1]!; /* store Offset_i+2 */\ veor q0, q0, q9; /* Offset_i+3 */ \ veor q4, q4, q0; /* P_i+3 xor Offset_i+3 */\ vst1.8 {q0}, [r1]; /* store Offset_i+3 */\ sub r1, #(3*16); \ \ cmp r6, #4; \ \ do_aes_4_##bits(d, imc, q1, q2, q3, q4, ##__VA_ARGS__); \ \ mov r8, r1; \ vld1.8 {q8-q9}, [r1]!; \ veor q1, q1, q8; \ veor q2, q2, q9; \ vld1.8 {q8-q9}, [r1]!; \ vst1.8 {q1-q2}, [r8]!; \ veor q1, q1, q2; \ vld1.8 {q2}, [r4]; /* load Checksum_{i-1} */ \ veor q3, q3, q8; \ veor q1, q1, q3; \ veor q4, q4, q9; \ veor q1, q1, q4; \ vst1.8 {q3-q4}, [r8]; \ veor q2, q2, q1; \ vst1.8 {q2}, [r4]; /* store Checksum_i+3 */ \ \ bhs .Locb_dec_loop4_##bits; \ cmp r6, #0; \ beq .Locb_dec_done; \ \ .Locb_dec_loop_##bits: \ \ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ \ /* Checksum_i = Checksum_{i-1} xor P_i */ \ \ rbit r8, lr; \ add lr, #1; \ clz r8, r8; /* ntz(i) */ \ add r8, r5, r8, lsl #4; \ \ vld1.8 {q2}, [r8]; /* load L_{ntz(i)} */ \ vld1.8 {q1}, [r2]!; /* load ciphertext */ \ subs r6, #1; \ veor q0, q0, q2; \ veor q1, q1, q0; \ \ do_aes_one##bits(d, imc, q1, q1, ##__VA_ARGS__) \ \ vld1.8 {q2}, [r4]; /* load checksum */ \ veor q1, q1, q0; \ vst1.8 {q1}, [r1]!; /* store plaintext */ \ veor q2, q2, q1; \ vst1.8 {q2}, [r4]; /* store checksum */ \ \ bne .Locb_dec_loop_##bits; \ b .Locb_dec_done; OCB_DEC(128re, r0, r12) OCB_DEC(192, r0, r12) OCB_DEC(256, r0, r12) #undef OCB_DEC .Locb_dec_done: vst1.8 {q0}, [r3] /* store offset */ CLEAR_REG(q0) CLEAR_REG(q1) CLEAR_REG(q2) CLEAR_REG(q3) CLEAR_REG(q8) CLEAR_REG(q9) CLEAR_REG(q10) CLEAR_REG(q11) CLEAR_REG(q12) CLEAR_REG(q13) CLEAR_REG(q14) mov r0, #0 pop {r4-r12,lr} vpop {q4-q7} bx lr .size 
_gcry_aes_ocb_dec_armv8_ce,.-_gcry_aes_ocb_dec_armv8_ce; /* * long _gcry_aes_ocb_auth_armv8_ce (const void *keysched, * const unsigned char *abuf, * unsigned char *offset, * unsigned char *checksum, * unsigned char *L_table, * size_t nblocks, * unsigned int nrounds, * unsigned int blkn); */ .align 3 .globl _gcry_aes_ocb_auth_armv8_ce .type _gcry_aes_ocb_auth_armv8_ce,%function; _gcry_aes_ocb_auth_armv8_ce: /* input: * r0: keysched * r1: abuf * r2: offset * r3: checksum - * %st+0: Ls => r5 - * %st+4: nblocks => r6 (0 < nblocks <= 32) - * %st+8: nrounds => r7 - * %st+12: blkn => lr + * st+0: Ls => r5 + * st+4: nblocks => r6 (0 < nblocks <= 32) + * st+8: nrounds => r7 + * st+12: blkn => lr */ vpush {q4-q7} push {r4-r12,lr} /* 4*16 + 4*10 = 104b */ ldr r7, [sp, #(104+8)] ldr r5, [sp, #(104+0)] ldr r6, [sp, #(104+4)] ldr lr, [sp, #(104+12)] cmp r7, #12 vld1.8 {q0}, [r2] /* load offset */ aes_preload_keys(r0, r12); beq .Locb_auth_entry_192 bhi .Locb_auth_entry_256 #define OCB_AUTH(bits, ...) \ .Locb_auth_entry_##bits: \ cmp r6, #4; \ add lr, #1; \ blo .Locb_auth_loop_##bits; \ \ .Locb_auth_loop4_##bits: \ \ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ \ \ add r9, lr, #1; \ add r10, lr, #2; \ add r11, lr, #3; \ rbit r8, lr; \ add lr, lr, #4; \ rbit r9, r9; \ rbit r10, r10; \ rbit r11, r11; \ clz r8, r8; /* ntz(i+0) */ \ clz r9, r9; /* ntz(i+1) */ \ clz r10, r10; /* ntz(i+2) */ \ clz r11, r11; /* ntz(i+3) */ \ add r8, r5, r8, lsl #4; \ add r9, r5, r9, lsl #4; \ add r10, r5, r10, lsl #4; \ add r11, r5, r11, lsl #4; \ \ sub r6, #4; \ \ vld1.8 {q9}, [r8]; /* load L_{ntz(i+0)} */ \ vld1.8 {q1-q2}, [r1]!; /* load A_i+<0-1> */ \ veor q0, q0, q9; /* Offset_i+0 */ \ vld1.8 {q9}, [r9]; /* load L_{ntz(i+1)} */ \ veor q1, q1, q0; /* A_i+0 xor Offset_i+0 */\ vld1.8 {q3-q4}, [r1]!; /* load A_i+<2-3> */ \ veor q0, q0, q9; /* Offset_i+1 */ \ vld1.8 {q9}, [r10]; /* load L_{ntz(i+2)} */ \ veor q2, q2, q0; /* A_i+1 xor Offset_i+1 */\ veor q0, q0, q9; /* Offset_i+2 */ \ vld1.8 {q9}, [r11]; /* load L_{ntz(i+3)} */ \ veor q3, q3, q0; /* A_i+2 xor Offset_i+2 */\ veor q0, q0, q9; /* Offset_i+3 */ \ veor q4, q4, q0; /* A_i+3 xor Offset_i+3 */\ \ cmp r6, #4; \ \ do_aes_4_##bits(e, mc, q1, q2, q3, q4, ##__VA_ARGS__); \ \ veor q1, q1, q2; \ veor q3, q3, q4; \ vld1.8 {q2}, [r3]; \ veor q1, q1, q3; \ veor q2, q2, q1; \ vst1.8 {q2}, [r3]; \ \ bhs .Locb_auth_loop4_##bits; \ cmp r6, #0; \ beq .Locb_auth_done; \ \ .Locb_auth_loop_##bits: \ \ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ \ \ rbit r8, lr; \ add lr, #1; \ clz r8, r8; /* ntz(i) */ \ add r8, r5, r8, lsl #4; \ \ vld1.8 {q2}, [r8]; /* load L_{ntz(i)} */ \ vld1.8 {q1}, [r1]!; /* load aadtext */ \ subs r6, #1; \ veor q0, q0, q2; \ vld1.8 {q2}, [r3]; /* load checksum */ \ veor q1, q1, q0; \ \ do_aes_one##bits(e, mc, q1, q1, ##__VA_ARGS__) \ \ veor q2, q2, q1; \ vst1.8 {q2}, [r3]; /* store checksum */ \ \ bne .Locb_auth_loop_##bits; \ b .Locb_auth_done; OCB_AUTH(128re, r0, r12) OCB_AUTH(192, r0, r12) OCB_AUTH(256, r0, r12) #undef OCB_AUTH .Locb_auth_done: vst1.8 {q0}, [r2] /* store offset */ CLEAR_REG(q0) CLEAR_REG(q1) CLEAR_REG(q2) CLEAR_REG(q3) CLEAR_REG(q8) CLEAR_REG(q9) CLEAR_REG(q10) CLEAR_REG(q11) CLEAR_REG(q12) CLEAR_REG(q13) CLEAR_REG(q14) mov r0, #0 pop {r4-r12,lr} vpop {q4-q7} bx lr .size _gcry_aes_ocb_auth_armv8_ce,.-_gcry_aes_ocb_auth_armv8_ce; /* * void _gcry_aes_xts_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned 
char *inbuf, * unsigned char *iv, unsigned int nrounds); */ .align 3 .globl _gcry_aes_xts_enc_armv8_ce .type _gcry_aes_xts_enc_armv8_ce,%function; _gcry_aes_xts_enc_armv8_ce: /* input: * r0: keysched * r1: outbuf * r2: inbuf * r3: iv - * %st+0: nblocks => r4 - * %st+4: nrounds => r5 + * st+0: nblocks => r4 + * st+4: nrounds => r5 */ vpush {q4-q7} push {r4-r12,lr} /* 4*16 + 4*10 = 104b */ ldr r4, [sp, #(104+0)] ldr r5, [sp, #(104+4)] cmp r4, #0 beq .Lxts_enc_skip cmp r5, #12 vld1.8 {q0}, [r3] /* load tweak */ mov r7, #0x87; aes_preload_keys(r0, r6); beq .Lxts_enc_entry_192 bhi .Lxts_enc_entry_256 #define CTR_XTS(bits, ...) \ .Lxts_enc_entry_##bits: \ cmp r4, #4; \ blo .Lxts_enc_loop_##bits; \ \ .Lxts_enc_loop4_##bits: \ sub r4, r4, #4; \ veor q9, q9, q9; \ \ vld1.8 {q1-q2}, [r2]!; /* load plaintext */ \ veor q1, q1, q0; \ cmp r4, #4; \ vmov.u32 d18[0], r7; \ vst1.8 {q0}, [r1]!; /* store tweak0 to temp */ \ \ vshr.s64 d16, d1, #63; \ vshr.u64 d17, d0, #63; \ vadd.u64 q0, q0, q0; \ vand d16, d16, d18; \ veor q0, q0, q8; \ \ vld1.8 {q3-q4}, [r2]!; /* load plaintext */ \ veor q2, q2, q0; \ vst1.8 {q0}, [r1]!; /* store tweak1 to temp */ \ \ vshr.s64 d16, d1, #63; \ vshr.u64 d17, d0, #63; \ vadd.u64 q0, q0, q0; \ vand d16, d16, d18; \ veor q0, q0, q8; \ \ veor q3, q3, q0; \ vst1.8 {q0}, [r1]!; /* store tweak2 to temp */ \ \ vshr.s64 d16, d1, #63; \ vshr.u64 d17, d0, #63; \ vadd.u64 q0, q0, q0; \ vand d16, d16, d18; \ veor q0, q0, q8; \ \ veor q4, q4, q0; \ vst1.8 {q0}, [r1]; /* store tweak3 to temp */ \ sub r1, r1, #48; \ \ vshr.s64 d16, d1, #63; \ vshr.u64 d17, d0, #63; \ vadd.u64 q0, q0, q0; \ vand d16, d16, d18; \ veor q0, q0, q8; \ \ do_aes_4_##bits(e, mc, q1, q2, q3, q4, ##__VA_ARGS__); \ \ vld1.8 {q8-q9}, [r1]!; /* load tweak from temp */ \ veor q1, q1, q8; \ veor q2, q2, q9; \ vld1.8 {q8-q9}, [r1]; /* load tweak from temp */ \ sub r1, r1, #32; \ veor q3, q3, q8; \ veor q4, q4, q9; \ vst1.8 {q1-q2}, [r1]!; /* store plaintext */ \ vst1.8 {q3-q4}, [r1]!; /* store plaintext */ \ \ bhs .Lxts_enc_loop4_##bits; \ cmp r4, #0; \ beq .Lxts_enc_done; \ \ .Lxts_enc_loop_##bits: \ \ vld1.8 {q1}, [r2]!; /* load ciphertext */ \ \ veor q9, q9, q9; \ veor q1, q1, q0; \ vmov.u32 d18[0], r7; \ vmov q2, q0; \ \ vshr.s64 d16, d1, #63; \ vshr.u64 d17, d0, #63; \ vadd.u64 q0, q0, q0; \ vand d16, d16, d18; \ veor q0, q0, q8; \ subs r4, r4, #1; \ \ do_aes_one##bits(e, mc, q1, q1, ##__VA_ARGS__); \ \ veor q1, q1, q2; \ vst1.8 {q1}, [r1]!; /* store plaintext */ \ \ bne .Lxts_enc_loop_##bits; \ b .Lxts_enc_done; CTR_XTS(128re, r0, r6) CTR_XTS(192, r0, r6) CTR_XTS(256, r0, r6) #undef CTR_XTS .Lxts_enc_done: vst1.8 {q0}, [r3] /* store tweak */ CLEAR_REG(q0) CLEAR_REG(q1) CLEAR_REG(q2) CLEAR_REG(q3) CLEAR_REG(q8) CLEAR_REG(q9) CLEAR_REG(q10) CLEAR_REG(q11) CLEAR_REG(q12) CLEAR_REG(q13) CLEAR_REG(q14) .Lxts_enc_skip: pop {r4-r12,lr} vpop {q4-q7} bx lr .size _gcry_aes_xts_enc_armv8_ce,.-_gcry_aes_xts_enc_armv8_ce; /* * void _gcry_aes_xts_dec_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *iv, unsigned int nrounds); */ .align 3 .globl _gcry_aes_xts_dec_armv8_ce .type _gcry_aes_xts_dec_armv8_ce,%function; _gcry_aes_xts_dec_armv8_ce: /* input: * r0: keysched * r1: outbuf * r2: inbuf * r3: iv - * %st+0: nblocks => r4 - * %st+4: nrounds => r5 + * st+0: nblocks => r4 + * st+4: nrounds => r5 */ vpush {q4-q7} push {r4-r12,lr} /* 4*16 + 4*10 = 104b */ ldr r4, [sp, #(104+0)] ldr r5, [sp, #(104+4)] cmp r4, #0 beq .Lxts_dec_skip cmp r5, #12 vld1.8 {q0}, [r3] /* load tweak */ mov 
r7, #0x87; aes_preload_keys(r0, r6); beq .Lxts_dec_entry_192 bhi .Lxts_dec_entry_256 #define CTR_XTS(bits, ...) \ .Lxts_dec_entry_##bits: \ cmp r4, #4; \ blo .Lxts_dec_loop_##bits; \ \ .Lxts_dec_loop4_##bits: \ sub r4, r4, #4; \ veor q9, q9, q9; \ \ vld1.8 {q1-q2}, [r2]!; /* load plaintext */ \ veor q1, q1, q0; \ cmp r4, #4; \ vmov.u32 d18[0], r7; \ vst1.8 {q0}, [r1]!; /* store tweak0 to temp */ \ \ vshr.s64 d16, d1, #63; \ vshr.u64 d17, d0, #63; \ vadd.u64 q0, q0, q0; \ vand d16, d16, d18; \ veor q0, q0, q8; \ \ vld1.8 {q3-q4}, [r2]!; /* load plaintext */ \ veor q2, q2, q0; \ vst1.8 {q0}, [r1]!; /* store tweak1 to temp */ \ \ vshr.s64 d16, d1, #63; \ vshr.u64 d17, d0, #63; \ vadd.u64 q0, q0, q0; \ vand d16, d16, d18; \ veor q0, q0, q8; \ \ veor q3, q3, q0; \ vst1.8 {q0}, [r1]!; /* store tweak2 to temp */ \ \ vshr.s64 d16, d1, #63; \ vshr.u64 d17, d0, #63; \ vadd.u64 q0, q0, q0; \ vand d16, d16, d18; \ veor q0, q0, q8; \ \ veor q4, q4, q0; \ vst1.8 {q0}, [r1]; /* store tweak3 to temp */ \ sub r1, r1, #48; \ \ vshr.s64 d16, d1, #63; \ vshr.u64 d17, d0, #63; \ vadd.u64 q0, q0, q0; \ vand d16, d16, d18; \ veor q0, q0, q8; \ \ do_aes_4_##bits(d, imc, q1, q2, q3, q4, ##__VA_ARGS__); \ \ vld1.8 {q8-q9}, [r1]!; /* load tweak from temp */ \ veor q1, q1, q8; \ veor q2, q2, q9; \ vld1.8 {q8-q9}, [r1]; /* load tweak from temp */ \ sub r1, r1, #32; \ veor q3, q3, q8; \ veor q4, q4, q9; \ vst1.8 {q1-q2}, [r1]!; /* store plaintext */ \ vst1.8 {q3-q4}, [r1]!; /* store plaintext */ \ \ bhs .Lxts_dec_loop4_##bits; \ cmp r4, #0; \ beq .Lxts_dec_done; \ \ .Lxts_dec_loop_##bits: \ \ vld1.8 {q1}, [r2]!; /* load ciphertext */ \ \ veor q9, q9, q9; \ veor q1, q1, q0; \ vmov.u32 d18[0], r7; \ vmov q2, q0; \ \ vshr.s64 d16, d1, #63; \ vshr.u64 d17, d0, #63; \ vadd.u64 q0, q0, q0; \ vand d16, d16, d18; \ veor q0, q0, q8; \ subs r4, r4, #1; \ \ do_aes_one##bits(d, imc, q1, q1, ##__VA_ARGS__); \ \ veor q1, q1, q2; \ vst1.8 {q1}, [r1]!; /* store plaintext */ \ \ bne .Lxts_dec_loop_##bits; \ b .Lxts_dec_done; CTR_XTS(128re, r0, r6) CTR_XTS(192, r0, r6) CTR_XTS(256, r0, r6) #undef CTR_XTS .Lxts_dec_done: vst1.8 {q0}, [r3] /* store tweak */ CLEAR_REG(q0) CLEAR_REG(q1) CLEAR_REG(q2) CLEAR_REG(q3) CLEAR_REG(q8) CLEAR_REG(q9) CLEAR_REG(q10) CLEAR_REG(q11) CLEAR_REG(q12) CLEAR_REG(q13) CLEAR_REG(q14) .Lxts_dec_skip: pop {r4-r12,lr} vpop {q4-q7} bx lr .size _gcry_aes_xts_dec_armv8_ce,.-_gcry_aes_xts_dec_armv8_ce; /* * u32 _gcry_aes_sbox4_armv8_ce(u32 in4b); */ .align 3 .globl _gcry_aes_sbox4_armv8_ce .type _gcry_aes_sbox4_armv8_ce,%function; _gcry_aes_sbox4_armv8_ce: /* See "Gouvêa, C. P. L. & López, J. Implementing GCM on ARMv8. Topics in * Cryptology — CT-RSA 2015" for details. */ vmov.i8 q0, #0x52 vmov.i8 q1, #0 vmov s0, r0 aese.8 q0, q1 veor d0, d1 vpadd.i32 d0, d0, d1 vmov r0, s0 CLEAR_REG(q0) bx lr .size _gcry_aes_sbox4_armv8_ce,.-_gcry_aes_sbox4_armv8_ce; /* * void _gcry_aes_invmixcol_armv8_ce(void *dst, const void *src); */ .align 3 .globl _gcry_aes_invmixcol_armv8_ce .type _gcry_aes_invmixcol_armv8_ce,%function; _gcry_aes_invmixcol_armv8_ce: vld1.8 {q0}, [r1] aesimc.8 q0, q0 vst1.8 {q0}, [r0] CLEAR_REG(q0) bx lr .size _gcry_aes_invmixcol_armv8_ce,.-_gcry_aes_invmixcol_armv8_ce; #endif diff --git a/cipher/sha512-arm.S b/cipher/sha512-arm.S index 94ec0141..1e1d296f 100644 --- a/cipher/sha512-arm.S +++ b/cipher/sha512-arm.S @@ -1,464 +1,464 @@ /* sha512-arm.S - ARM assembly implementation of SHA-512 transform * * Copyright (C) 2016 Jussi Kivilinna * * This file is part of Libgcrypt. 
* * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include <config.h> #if defined(__ARMEL__) #ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS .text .syntax unified .arm /* structure of SHA512_CONTEXT */ #define hd_a 0 #define hd_b ((hd_a) + 8) #define hd_c ((hd_b) + 8) #define hd_d ((hd_c) + 8) #define hd_e ((hd_d) + 8) #define hd_f ((hd_e) + 8) #define hd_g ((hd_f) + 8) #define hd_h ((hd_g) + 8) /* register macros */ -#define RK %r2 +#define RK r2 -#define RElo %r0 -#define REhi %r1 +#define RElo r0 +#define REhi r1 -#define RT1lo %r3 -#define RT1hi %r4 -#define RT2lo %r5 -#define RT2hi %r6 -#define RWlo %r7 -#define RWhi %r8 -#define RT3lo %r9 -#define RT3hi %r10 -#define RT4lo %r11 -#define RT4hi %ip +#define RT1lo r3 +#define RT1hi r4 +#define RT2lo r5 +#define RT2hi r6 +#define RWlo r7 +#define RWhi r8 +#define RT3lo r9 +#define RT3hi r10 +#define RT4lo r11 +#define RT4hi ip -#define RRND %lr +#define RRND lr /* variable offsets in stack */ #define ctx (0) #define data ((ctx) + 4) #define nblks ((data) + 4) #define _a ((nblks) + 4) #define _b ((_a) + 8) #define _c ((_b) + 8) #define _d ((_c) + 8) #define _e ((_d) + 8) #define _f ((_e) + 8) #define _g ((_f) + 8) #define _h ((_g) + 8) #define w(i) ((_h) + 8 + ((i) % 16) * 8) #define STACK_MAX (w(15) + 8) /* helper macros */ #define ldr_unaligned_be(rout, rsrc, offs, rtmp) \ ldrb rout, [rsrc, #((offs) + 3)]; \ ldrb rtmp, [rsrc, #((offs) + 2)]; \ orr rout, rout, rtmp, lsl #8; \ ldrb rtmp, [rsrc, #((offs) + 1)]; \ orr rout, rout, rtmp, lsl #16; \ ldrb rtmp, [rsrc, #((offs) + 0)]; \ orr rout, rout, rtmp, lsl #24; #ifdef __ARMEL__ /* bswap on little-endian */ #ifdef HAVE_ARM_ARCH_V6 #define be_to_host(reg, rtmp) \ rev reg, reg; #else #define be_to_host(reg, rtmp) \ eor rtmp, reg, reg, ror #16; \ mov rtmp, rtmp, lsr #8; \ bic rtmp, rtmp, #65280; \ eor reg, rtmp, reg, ror #8; #endif #else /* nop on big-endian */ #define be_to_host(reg, rtmp) /*_*/ #endif #define host_to_host(x, y) /*_*/ #define read_u64_aligned_4(rin, offs, lo0, hi0, lo1, hi1, lo2, hi2, lo3, hi3, convert, rtmp) \ ldr lo0, [rin, #((offs) + 0 * 8 + 4)]; \ ldr hi0, [rin, #((offs) + 0 * 8 + 0)]; \ ldr lo1, [rin, #((offs) + 1 * 8 + 4)]; \ ldr hi1, [rin, #((offs) + 1 * 8 + 0)]; \ ldr lo2, [rin, #((offs) + 2 * 8 + 4)]; \ convert(lo0, rtmp); \ ldr hi2, [rin, #((offs) + 2 * 8 + 0)]; \ convert(hi0, rtmp); \ ldr lo3, [rin, #((offs) + 3 * 8 + 4)]; \ convert(lo1, rtmp); \ ldr hi3, [rin, #((offs) + 3 * 8 + 0)]; \ convert(hi1, rtmp); \ convert(lo2, rtmp); \ convert(hi2, rtmp); \ convert(lo3, rtmp); \ convert(hi3, rtmp); #define read_be64_aligned_4(rin, offs, lo0, hi0, lo1, hi1, lo2, hi2, lo3, hi3, rtmp0) \ read_u64_aligned_4(rin, offs, lo0, hi0, lo1, hi1, lo2, hi2, lo3, hi3, be_to_host, rtmp0) /* need to handle unaligned reads by byte reads */ #define read_be64_unaligned_4(rin, offs, lo0, hi0, lo1, hi1, lo2, hi2, lo3, hi3, rtmp0) \ ldr_unaligned_be(lo0, rin, (offs) + 0 * 8 + 4, rtmp0); \ ldr_unaligned_be(hi0, rin, (offs) + 0 * 8 + 0, rtmp0); \
ldr_unaligned_be(lo1, rin, (offs) + 1 * 8 + 4, rtmp0); \ ldr_unaligned_be(hi1, rin, (offs) + 1 * 8 + 0, rtmp0); \ ldr_unaligned_be(lo2, rin, (offs) + 2 * 8 + 4, rtmp0); \ ldr_unaligned_be(hi2, rin, (offs) + 2 * 8 + 0, rtmp0); \ ldr_unaligned_be(lo3, rin, (offs) + 3 * 8 + 4, rtmp0); \ ldr_unaligned_be(hi3, rin, (offs) + 3 * 8 + 0, rtmp0); /*********************************************************************** * ARM assembly implementation of sha512 transform ***********************************************************************/ /* Round function */ #define R(_a,_b,_c,_d,_e,_f,_g,_h,W,wi) \ /* Message expansion, t1 = _h + w[i] */ \ W(_a,_h,wi); \ \ /* w = Sum1(_e) */ \ mov RWlo, RElo, lsr#14; \ ldm RK!, {RT2lo-RT2hi}; \ mov RWhi, REhi, lsr#14; \ eor RWlo, RWlo, RElo, lsr#18; \ eor RWhi, RWhi, REhi, lsr#18; \ - ldr RT3lo, [%sp, #(_f)]; \ + ldr RT3lo, [sp, #(_f)]; \ adds RT1lo, RT2lo; /* t1 += K */ \ - ldr RT3hi, [%sp, #(_f) + 4]; \ + ldr RT3hi, [sp, #(_f) + 4]; \ adc RT1hi, RT2hi; \ - ldr RT4lo, [%sp, #(_g)]; \ + ldr RT4lo, [sp, #(_g)]; \ eor RWlo, RWlo, RElo, lsl#23; \ - ldr RT4hi, [%sp, #(_g) + 4]; \ + ldr RT4hi, [sp, #(_g) + 4]; \ eor RWhi, RWhi, REhi, lsl#23; \ eor RWlo, RWlo, REhi, lsl#18; \ eor RWhi, RWhi, RElo, lsl#18; \ eor RWlo, RWlo, REhi, lsl#14; \ eor RWhi, RWhi, RElo, lsl#14; \ eor RWlo, RWlo, REhi, lsr#9; \ eor RWhi, RWhi, RElo, lsr#9; \ \ /* Cho(_e,_f,_g) => (_e & _f) ^ (~_e & _g) */ \ adds RT1lo, RWlo; /* t1 += Sum1(_e) */ \ and RT3lo, RT3lo, RElo; \ adc RT1hi, RWhi; \ and RT3hi, RT3hi, REhi; \ bic RT4lo, RT4lo, RElo; \ bic RT4hi, RT4hi, REhi; \ eor RT3lo, RT3lo, RT4lo; \ eor RT3hi, RT3hi, RT4hi; \ \ /* Load D */ \ /* t1 += Cho(_e,_f,_g) */ \ - ldr RElo, [%sp, #(_d)]; \ + ldr RElo, [sp, #(_d)]; \ adds RT1lo, RT3lo; \ - ldr REhi, [%sp, #(_d) + 4]; \ + ldr REhi, [sp, #(_d) + 4]; \ adc RT1hi, RT3hi; \ \ /* Load A */ \ - ldr RT3lo, [%sp, #(_a)]; \ + ldr RT3lo, [sp, #(_a)]; \ \ /* _d += t1 */ \ adds RElo, RT1lo; \ - ldr RT3hi, [%sp, #(_a) + 4]; \ + ldr RT3hi, [sp, #(_a) + 4]; \ adc REhi, RT1hi; \ \ /* Store D */ \ - str RElo, [%sp, #(_d)]; \ + str RElo, [sp, #(_d)]; \ \ /* t2 = Sum0(_a) */ \ mov RT2lo, RT3lo, lsr#28; \ - str REhi, [%sp, #(_d) + 4]; \ + str REhi, [sp, #(_d) + 4]; \ mov RT2hi, RT3hi, lsr#28; \ - ldr RWlo, [%sp, #(_b)]; \ + ldr RWlo, [sp, #(_b)]; \ eor RT2lo, RT2lo, RT3lo, lsl#30; \ - ldr RWhi, [%sp, #(_b) + 4]; \ + ldr RWhi, [sp, #(_b) + 4]; \ eor RT2hi, RT2hi, RT3hi, lsl#30; \ eor RT2lo, RT2lo, RT3lo, lsl#25; \ eor RT2hi, RT2hi, RT3hi, lsl#25; \ eor RT2lo, RT2lo, RT3hi, lsl#4; \ eor RT2hi, RT2hi, RT3lo, lsl#4; \ eor RT2lo, RT2lo, RT3hi, lsr#2; \ eor RT2hi, RT2hi, RT3lo, lsr#2; \ eor RT2lo, RT2lo, RT3hi, lsr#7; \ eor RT2hi, RT2hi, RT3lo, lsr#7; \ \ /* t2 += t1 */ \ adds RT2lo, RT1lo; \ - ldr RT1lo, [%sp, #(_c)]; \ + ldr RT1lo, [sp, #(_c)]; \ adc RT2hi, RT1hi; \ \ /* Maj(_a,_b,_c) => ((_a & _b) ^ (_c & (_a ^ _b))) */ \ - ldr RT1hi, [%sp, #(_c) + 4]; \ + ldr RT1hi, [sp, #(_c) + 4]; \ and RT4lo, RWlo, RT3lo; \ and RT4hi, RWhi, RT3hi; \ eor RWlo, RWlo, RT3lo; \ eor RWhi, RWhi, RT3hi; \ and RWlo, RWlo, RT1lo; \ and RWhi, RWhi, RT1hi; \ eor RWlo, RWlo, RT4lo; \ eor RWhi, RWhi, RT4hi; \ /* Message expansion */ #define W_0_63(_a,_h,i) \ - ldr RT3lo, [%sp, #(w(i-2))]; \ + ldr RT3lo, [sp, #(w(i-2))]; \ adds RT2lo, RWlo; /* _h = t2 + Maj(_a,_b,_c) */ \ - ldr RT3hi, [%sp, #(w(i-2)) + 4]; \ + ldr RT3hi, [sp, #(w(i-2)) + 4]; \ adc RT2hi, RWhi; \ /* nw = S1(w[i-2]) */ \ - ldr RT1lo, [%sp, #(_h)]; /* Load H */ \ + ldr RT1lo, [sp, #(_h)]; /* Load H */ \ mov RWlo, RT3lo, lsr#19; \ 
- str RT2lo, [%sp, #(_a)]; \ + str RT2lo, [sp, #(_a)]; \ eor RWlo, RWlo, RT3lo, lsl#3; \ - ldr RT1hi, [%sp, #(_h) + 4]; \ + ldr RT1hi, [sp, #(_h) + 4]; \ mov RWhi, RT3hi, lsr#19; \ - ldr RT2lo, [%sp, #(w(i-7))]; \ + ldr RT2lo, [sp, #(w(i-7))]; \ eor RWhi, RWhi, RT3hi, lsl#3; \ - str RT2hi, [%sp, #(_a) + 4]; \ + str RT2hi, [sp, #(_a) + 4]; \ eor RWlo, RWlo, RT3lo, lsr#6; \ - ldr RT2hi, [%sp, #(w(i-7)) + 4]; \ + ldr RT2hi, [sp, #(w(i-7)) + 4]; \ eor RWhi, RWhi, RT3hi, lsr#6; \ eor RWlo, RWlo, RT3hi, lsl#13; \ eor RWhi, RWhi, RT3lo, lsl#13; \ eor RWlo, RWlo, RT3hi, lsr#29; \ eor RWhi, RWhi, RT3lo, lsr#29; \ - ldr RT3lo, [%sp, #(w(i-15))]; \ + ldr RT3lo, [sp, #(w(i-15))]; \ eor RWlo, RWlo, RT3hi, lsl#26; \ - ldr RT3hi, [%sp, #(w(i-15)) + 4]; \ + ldr RT3hi, [sp, #(w(i-15)) + 4]; \ \ adds RT2lo, RWlo; /* nw += w[i-7] */ \ - ldr RWlo, [%sp, #(w(i-16))]; \ + ldr RWlo, [sp, #(w(i-16))]; \ adc RT2hi, RWhi; \ mov RT4lo, RT3lo, lsr#1; /* S0(w[i-15]) */ \ - ldr RWhi, [%sp, #(w(i-16)) + 4]; \ + ldr RWhi, [sp, #(w(i-16)) + 4]; \ mov RT4hi, RT3hi, lsr#1; \ adds RT2lo, RWlo; /* nw += w[i-16] */ \ eor RT4lo, RT4lo, RT3lo, lsr#8; \ eor RT4hi, RT4hi, RT3hi, lsr#8; \ eor RT4lo, RT4lo, RT3lo, lsr#7; \ eor RT4hi, RT4hi, RT3hi, lsr#7; \ eor RT4lo, RT4lo, RT3hi, lsl#31; \ eor RT4hi, RT4hi, RT3lo, lsl#31; \ eor RT4lo, RT4lo, RT3hi, lsl#24; \ eor RT4hi, RT4hi, RT3lo, lsl#24; \ eor RT4lo, RT4lo, RT3hi, lsl#25; \ adc RT2hi, RWhi; \ \ /* nw += S0(w[i-15]) */ \ adds RT2lo, RT4lo; \ adc RT2hi, RT4hi; \ \ /* w[0] = nw */ \ - str RT2lo, [%sp, #(w(i))]; \ + str RT2lo, [sp, #(w(i))]; \ adds RT1lo, RWlo; \ - str RT2hi, [%sp, #(w(i)) + 4]; \ + str RT2hi, [sp, #(w(i)) + 4]; \ adc RT1hi, RWhi; #define W_64_79(_a,_h,i) \ adds RT2lo, RWlo; /* _h = t2 + Maj(_a,_b,_c) */ \ - ldr RWlo, [%sp, #(w(i-16))]; \ + ldr RWlo, [sp, #(w(i-16))]; \ adc RT2hi, RWhi; \ - ldr RWhi, [%sp, #(w(i-16)) + 4]; \ - ldr RT1lo, [%sp, #(_h)]; /* Load H */ \ - ldr RT1hi, [%sp, #(_h) + 4]; \ - str RT2lo, [%sp, #(_a)]; \ - str RT2hi, [%sp, #(_a) + 4]; \ + ldr RWhi, [sp, #(w(i-16)) + 4]; \ + ldr RT1lo, [sp, #(_h)]; /* Load H */ \ + ldr RT1hi, [sp, #(_h) + 4]; \ + str RT2lo, [sp, #(_a)]; \ + str RT2hi, [sp, #(_a) + 4]; \ adds RT1lo, RWlo; \ adc RT1hi, RWhi; .align 3 .globl _gcry_sha512_transform_arm .type _gcry_sha512_transform_arm,%function; _gcry_sha512_transform_arm: /* Input: - * %r0: SHA512_CONTEXT - * %r1: data - * %r2: u64 k[] constants - * %r3: nblks + * r0: SHA512_CONTEXT + * r1: data + * r2: u64 k[] constants + * r3: nblks */ - push {%r4-%r11, %ip, %lr}; - sub %sp, %sp, #STACK_MAX; - movs RWlo, %r3; - str %r0, [%sp, #(ctx)]; + push {r4-r11, ip, lr}; + sub sp, sp, #STACK_MAX; + movs RWlo, r3; + str r0, [sp, #(ctx)]; beq .Ldone; .Loop_blocks: - str RWlo, [%sp, #nblks]; + str RWlo, [sp, #nblks]; /* Load context to stack */ - add RWhi, %sp, #(_a); - ldm %r0!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} + add RWhi, sp, #(_a); + ldm r0!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} - ldm %r0, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} + ldm r0, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} stm RWhi, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} /* Load input to w[16] */ /* test if data is unaligned */ - tst %r1, #3; + tst r1, #3; beq 1f; /* unaligned load */ - add RWhi, %sp, #(w(0)); - read_be64_unaligned_4(%r1, 0 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); + add RWhi, sp, #(w(0)); + read_be64_unaligned_4(r1, 0 * 8, RT1lo, RT1hi, RT2lo, RT2hi, 
RT3lo, RT3hi, RT4lo, RT4hi, RWlo); stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} - read_be64_unaligned_4(%r1, 4 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); + read_be64_unaligned_4(r1, 4 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} - read_be64_unaligned_4(%r1, 8 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); + read_be64_unaligned_4(r1, 8 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} - read_be64_unaligned_4(%r1, 12 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); + read_be64_unaligned_4(r1, 12 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); b 2f; 1: /* aligned load */ - add RWhi, %sp, #(w(0)); - read_be64_aligned_4(%r1, 0 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); + add RWhi, sp, #(w(0)); + read_be64_aligned_4(r1, 0 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} - read_be64_aligned_4(%r1, 4 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); + read_be64_aligned_4(r1, 4 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} - read_be64_aligned_4(%r1, 8 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); + read_be64_aligned_4(r1, 8 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} - read_be64_aligned_4(%r1, 12 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); + read_be64_aligned_4(r1, 12 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); 2: - add %r1, #(16 * 8); + add r1, #(16 * 8); stm RWhi, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} - str %r1, [%sp, #(data)]; + str r1, [sp, #(data)]; /* preload E & A */ - ldr RElo, [%sp, #(_e)]; - ldr REhi, [%sp, #(_e) + 4]; + ldr RElo, [sp, #(_e)]; + ldr REhi, [sp, #(_e) + 4]; mov RWlo, #0; - ldr RT2lo, [%sp, #(_a)]; + ldr RT2lo, [sp, #(_a)]; mov RRND, #(80-16); - ldr RT2hi, [%sp, #(_a) + 4]; + ldr RT2hi, [sp, #(_a) + 4]; mov RWhi, #0; .Loop_rounds: R(_a, _b, _c, _d, _e, _f, _g, _h, W_0_63, 16); R(_h, _a, _b, _c, _d, _e, _f, _g, W_0_63, 17); R(_g, _h, _a, _b, _c, _d, _e, _f, W_0_63, 18); R(_f, _g, _h, _a, _b, _c, _d, _e, W_0_63, 19); R(_e, _f, _g, _h, _a, _b, _c, _d, W_0_63, 20); R(_d, _e, _f, _g, _h, _a, _b, _c, W_0_63, 21); R(_c, _d, _e, _f, _g, _h, _a, _b, W_0_63, 22); R(_b, _c, _d, _e, _f, _g, _h, _a, W_0_63, 23); R(_a, _b, _c, _d, _e, _f, _g, _h, W_0_63, 24); R(_h, _a, _b, _c, _d, _e, _f, _g, W_0_63, 25); R(_g, _h, _a, _b, _c, _d, _e, _f, W_0_63, 26); R(_f, _g, _h, _a, _b, _c, _d, _e, W_0_63, 27); R(_e, _f, _g, _h, _a, _b, _c, _d, W_0_63, 28); R(_d, _e, _f, _g, _h, _a, _b, _c, W_0_63, 29); R(_c, _d, _e, _f, _g, _h, _a, _b, W_0_63, 30); R(_b, _c, _d, _e, _f, _g, _h, _a, W_0_63, 31); subs RRND, #16; bne .Loop_rounds; R(_a, _b, _c, _d, _e, _f, _g, _h, W_64_79, 16); R(_h, _a, _b, _c, _d, _e, _f, _g, W_64_79, 17); R(_g, _h, _a, _b, _c, _d, _e, _f, W_64_79, 18); R(_f, _g, _h, _a, _b, _c, _d, _e, W_64_79, 19); R(_e, _f, _g, _h, _a, _b, _c, _d, W_64_79, 20); R(_d, _e, _f, _g, _h, _a, _b, _c, W_64_79, 21); R(_c, _d, _e, _f, _g, _h, _a, _b, W_64_79, 22); R(_b, _c, _d, _e, _f, _g, _h, _a, W_64_79, 23); R(_a, _b, _c, _d, _e, _f, _g, _h, W_64_79, 24); R(_h, _a, _b, _c, _d, _e, 
_f, _g, W_64_79, 25); R(_g, _h, _a, _b, _c, _d, _e, _f, W_64_79, 26); R(_f, _g, _h, _a, _b, _c, _d, _e, W_64_79, 27); R(_e, _f, _g, _h, _a, _b, _c, _d, W_64_79, 28); R(_d, _e, _f, _g, _h, _a, _b, _c, W_64_79, 29); R(_c, _d, _e, _f, _g, _h, _a, _b, W_64_79, 30); R(_b, _c, _d, _e, _f, _g, _h, _a, W_64_79, 31); - ldr %r0, [%sp, #(ctx)]; + ldr r0, [sp, #(ctx)]; adds RT2lo, RWlo; /* _h = t2 + Maj(_a,_b,_c) */ - ldr %r1, [%sp, #(data)]; + ldr r1, [sp, #(data)]; adc RT2hi, RWhi; - ldm %r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi} + ldm r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi} adds RT1lo, RT2lo; - ldr RT2lo, [%sp, #(_b + 0)]; + ldr RT2lo, [sp, #(_b + 0)]; adc RT1hi, RT2hi; - ldr RT2hi, [%sp, #(_b + 4)]; + ldr RT2hi, [sp, #(_b + 4)]; adds RWlo, RT2lo; - ldr RT2lo, [%sp, #(_c + 0)]; + ldr RT2lo, [sp, #(_c + 0)]; adc RWhi, RT2hi; - ldr RT2hi, [%sp, #(_c + 4)]; + ldr RT2hi, [sp, #(_c + 4)]; adds RT3lo, RT2lo; - ldr RT2lo, [%sp, #(_d + 0)]; + ldr RT2lo, [sp, #(_d + 0)]; adc RT3hi, RT2hi; - ldr RT2hi, [%sp, #(_d + 4)]; + ldr RT2hi, [sp, #(_d + 4)]; adds RT4lo, RT2lo; - ldr RT2lo, [%sp, #(_e + 0)]; + ldr RT2lo, [sp, #(_e + 0)]; adc RT4hi, RT2hi; - stm %r0!, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi} + stm r0!, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi} - ldr RT2hi, [%sp, #(_e + 4)]; - ldm %r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi} + ldr RT2hi, [sp, #(_e + 4)]; + ldm r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi} adds RT1lo, RT2lo; - ldr RT2lo, [%sp, #(_f + 0)]; + ldr RT2lo, [sp, #(_f + 0)]; adc RT1hi, RT2hi; - ldr RT2hi, [%sp, #(_f + 4)]; + ldr RT2hi, [sp, #(_f + 4)]; adds RWlo, RT2lo; - ldr RT2lo, [%sp, #(_g + 0)]; + ldr RT2lo, [sp, #(_g + 0)]; adc RWhi, RT2hi; - ldr RT2hi, [%sp, #(_g + 4)]; + ldr RT2hi, [sp, #(_g + 4)]; adds RT3lo, RT2lo; - ldr RT2lo, [%sp, #(_h + 0)]; + ldr RT2lo, [sp, #(_h + 0)]; adc RT3hi, RT2hi; - ldr RT2hi, [%sp, #(_h + 4)]; + ldr RT2hi, [sp, #(_h + 4)]; adds RT4lo, RT2lo; adc RT4hi, RT2hi; - stm %r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi} - sub %r0, %r0, #(4 * 8); - ldr RWlo, [%sp, #nblks]; + stm r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi} + sub r0, r0, #(4 * 8); + ldr RWlo, [sp, #nblks]; sub RK, #(80 * 8); subs RWlo, #1; bne .Loop_blocks; .Ldone: - mov %r0, #STACK_MAX; + mov r0, #STACK_MAX; __out: - add %sp, %sp, #STACK_MAX; - pop {%r4-%r11, %ip, %pc}; + add sp, sp, #STACK_MAX; + pop {r4-r11, ip, pc}; .size _gcry_sha512_transform_arm,.-_gcry_sha512_transform_arm; #endif #endif diff --git a/cipher/sha512-armv7-neon.S b/cipher/sha512-armv7-neon.S index 2b186b47..a1df73b8 100644 --- a/cipher/sha512-armv7-neon.S +++ b/cipher/sha512-armv7-neon.S @@ -1,452 +1,452 @@ /* sha512-armv7-neon.S - ARM/NEON assembly implementation of SHA-512 transform * * Copyright (C) 2013 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
*/ #include #if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_NEON) .text .syntax unified .fpu neon .arm /* structure of SHA512_CONTEXT */ #define hd_a 0 #define hd_b ((hd_a) + 8) #define hd_c ((hd_b) + 8) #define hd_d ((hd_c) + 8) #define hd_e ((hd_d) + 8) #define hd_f ((hd_e) + 8) #define hd_g ((hd_f) + 8) /* register macros */ -#define RK %r2 +#define RK r2 #define RA d0 #define RB d1 #define RC d2 #define RD d3 #define RE d4 #define RF d5 #define RG d6 #define RH d7 #define RT0 d8 #define RT1 d9 #define RT2 d10 #define RT3 d11 #define RT4 d12 #define RT5 d13 #define RT6 d14 #define RT7 d15 #define RT01q q4 #define RT23q q5 #define RT45q q6 #define RT67q q7 #define RW0 d16 #define RW1 d17 #define RW2 d18 #define RW3 d19 #define RW4 d20 #define RW5 d21 #define RW6 d22 #define RW7 d23 #define RW8 d24 #define RW9 d25 #define RW10 d26 #define RW11 d27 #define RW12 d28 #define RW13 d29 #define RW14 d30 #define RW15 d31 #define RW01q q8 #define RW23q q9 #define RW45q q10 #define RW67q q11 #define RW89q q12 #define RW1011q q13 #define RW1213q q14 #define RW1415q q15 #define CLEAR_REG(reg) vmov.i8 reg, #0; /*********************************************************************** * ARM assembly implementation of sha512 transform ***********************************************************************/ #define rounds2_0_63(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, rw01q, rw2, rw23q, rw1415q, rw9, rw10, interleave_op, arg1) \ /* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \ vshr.u64 RT2, re, #14; \ vshl.u64 RT3, re, #64 - 14; \ interleave_op(arg1); \ vshr.u64 RT4, re, #18; \ vshl.u64 RT5, re, #64 - 18; \ vld1.64 {RT0}, [RK]!; \ veor.64 RT23q, RT23q, RT45q; \ vshr.u64 RT4, re, #41; \ vshl.u64 RT5, re, #64 - 41; \ vadd.u64 RT0, RT0, rw0; \ veor.64 RT23q, RT23q, RT45q; \ vmov.64 RT7, re; \ veor.64 RT1, RT2, RT3; \ vbsl.64 RT7, rf, rg; \ \ vadd.u64 RT1, RT1, rh; \ vshr.u64 RT2, ra, #28; \ vshl.u64 RT3, ra, #64 - 28; \ vadd.u64 RT1, RT1, RT0; \ vshr.u64 RT4, ra, #34; \ vshl.u64 RT5, ra, #64 - 34; \ vadd.u64 RT1, RT1, RT7; \ \ /* h = Sum0 (a) + Maj (a, b, c); */ \ veor.64 RT23q, RT23q, RT45q; \ vshr.u64 RT4, ra, #39; \ vshl.u64 RT5, ra, #64 - 39; \ veor.64 RT0, ra, rb; \ veor.64 RT23q, RT23q, RT45q; \ vbsl.64 RT0, rc, rb; \ vadd.u64 rd, rd, RT1; /* d+=t1; */ \ veor.64 rh, RT2, RT3; \ \ /* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \ vshr.u64 RT2, rd, #14; \ vshl.u64 RT3, rd, #64 - 14; \ vadd.u64 rh, rh, RT0; \ vshr.u64 RT4, rd, #18; \ vshl.u64 RT5, rd, #64 - 18; \ vadd.u64 rh, rh, RT1; /* h+=t1; */ \ vld1.64 {RT0}, [RK]!; \ veor.64 RT23q, RT23q, RT45q; \ vshr.u64 RT4, rd, #41; \ vshl.u64 RT5, rd, #64 - 41; \ vadd.u64 RT0, RT0, rw1; \ veor.64 RT23q, RT23q, RT45q; \ vmov.64 RT7, rd; \ veor.64 RT1, RT2, RT3; \ vbsl.64 RT7, re, rf; \ \ vadd.u64 RT1, RT1, rg; \ vshr.u64 RT2, rh, #28; \ vshl.u64 RT3, rh, #64 - 28; \ vadd.u64 RT1, RT1, RT0; \ vshr.u64 RT4, rh, #34; \ vshl.u64 RT5, rh, #64 - 34; \ vadd.u64 RT1, RT1, RT7; \ \ /* g = Sum0 (h) + Maj (h, a, b); */ \ veor.64 RT23q, RT23q, RT45q; \ vshr.u64 RT4, rh, #39; \ vshl.u64 RT5, rh, #64 - 39; \ veor.64 RT0, rh, ra; \ veor.64 RT23q, RT23q, RT45q; \ vbsl.64 RT0, rb, ra; \ vadd.u64 rc, rc, RT1; /* c+=t1; */ \ veor.64 rg, RT2, RT3; \ \ /* w[0] += S1 (w[14]) + w[9] + S0 (w[1]); */ \ /* w[1] += S1 (w[15]) + w[10] + S0 (w[2]); */ \ \ /**** S0(w[1:2]) */ \ \ /* w[0:1] += w[9:10] */ \ /* RT23q = rw1:rw2 */ \ vext.u64 RT23q, rw01q, rw23q, #1; \ vadd.u64 rw0, rw9; \ 
vadd.u64 rg, rg, RT0; \ vadd.u64 rw1, rw10;\ vadd.u64 rg, rg, RT1; /* g+=t1; */ \ \ vshr.u64 RT45q, RT23q, #1; \ vshl.u64 RT67q, RT23q, #64 - 1; \ vshr.u64 RT01q, RT23q, #8; \ veor.u64 RT45q, RT45q, RT67q; \ vshl.u64 RT67q, RT23q, #64 - 8; \ veor.u64 RT45q, RT45q, RT01q; \ vshr.u64 RT01q, RT23q, #7; \ veor.u64 RT45q, RT45q, RT67q; \ \ /**** S1(w[14:15]) */ \ vshr.u64 RT23q, rw1415q, #6; \ veor.u64 RT01q, RT01q, RT45q; \ vshr.u64 RT45q, rw1415q, #19; \ vshl.u64 RT67q, rw1415q, #64 - 19; \ veor.u64 RT23q, RT23q, RT45q; \ vshr.u64 RT45q, rw1415q, #61; \ veor.u64 RT23q, RT23q, RT67q; \ vshl.u64 RT67q, rw1415q, #64 - 61; \ veor.u64 RT23q, RT23q, RT45q; \ vadd.u64 rw01q, RT01q; /* w[0:1] += S(w[1:2]) */ \ veor.u64 RT01q, RT23q, RT67q; #define vadd_RT01q(rw01q) \ /* w[0:1] += S(w[14:15]) */ \ vadd.u64 rw01q, RT01q; #define dummy(_) /*_*/ #define rounds2_64_79(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, interleave_op1, arg1, interleave_op2, arg2) \ /* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \ vshr.u64 RT2, re, #14; \ vshl.u64 RT3, re, #64 - 14; \ interleave_op1(arg1); \ vshr.u64 RT4, re, #18; \ vshl.u64 RT5, re, #64 - 18; \ interleave_op2(arg2); \ vld1.64 {RT0}, [RK]!; \ veor.64 RT23q, RT23q, RT45q; \ vshr.u64 RT4, re, #41; \ vshl.u64 RT5, re, #64 - 41; \ vadd.u64 RT0, RT0, rw0; \ veor.64 RT23q, RT23q, RT45q; \ vmov.64 RT7, re; \ veor.64 RT1, RT2, RT3; \ vbsl.64 RT7, rf, rg; \ \ vadd.u64 RT1, RT1, rh; \ vshr.u64 RT2, ra, #28; \ vshl.u64 RT3, ra, #64 - 28; \ vadd.u64 RT1, RT1, RT0; \ vshr.u64 RT4, ra, #34; \ vshl.u64 RT5, ra, #64 - 34; \ vadd.u64 RT1, RT1, RT7; \ \ /* h = Sum0 (a) + Maj (a, b, c); */ \ veor.64 RT23q, RT23q, RT45q; \ vshr.u64 RT4, ra, #39; \ vshl.u64 RT5, ra, #64 - 39; \ veor.64 RT0, ra, rb; \ veor.64 RT23q, RT23q, RT45q; \ vbsl.64 RT0, rc, rb; \ vadd.u64 rd, rd, RT1; /* d+=t1; */ \ veor.64 rh, RT2, RT3; \ \ /* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \ vshr.u64 RT2, rd, #14; \ vshl.u64 RT3, rd, #64 - 14; \ vadd.u64 rh, rh, RT0; \ vshr.u64 RT4, rd, #18; \ vshl.u64 RT5, rd, #64 - 18; \ vadd.u64 rh, rh, RT1; /* h+=t1; */ \ vld1.64 {RT0}, [RK]!; \ veor.64 RT23q, RT23q, RT45q; \ vshr.u64 RT4, rd, #41; \ vshl.u64 RT5, rd, #64 - 41; \ vadd.u64 RT0, RT0, rw1; \ veor.64 RT23q, RT23q, RT45q; \ vmov.64 RT7, rd; \ veor.64 RT1, RT2, RT3; \ vbsl.64 RT7, re, rf; \ \ vadd.u64 RT1, RT1, rg; \ vshr.u64 RT2, rh, #28; \ vshl.u64 RT3, rh, #64 - 28; \ vadd.u64 RT1, RT1, RT0; \ vshr.u64 RT4, rh, #34; \ vshl.u64 RT5, rh, #64 - 34; \ vadd.u64 RT1, RT1, RT7; \ \ /* g = Sum0 (h) + Maj (h, a, b); */ \ veor.64 RT23q, RT23q, RT45q; \ vshr.u64 RT4, rh, #39; \ vshl.u64 RT5, rh, #64 - 39; \ veor.64 RT0, rh, ra; \ veor.64 RT23q, RT23q, RT45q; \ vbsl.64 RT0, rb, ra; \ vadd.u64 rc, rc, RT1; /* c+=t1; */ \ veor.64 rg, RT2, RT3; #define vadd_rg_RT0(rg) \ vadd.u64 rg, rg, RT0; #define vadd_rg_RT1(rg) \ vadd.u64 rg, rg, RT1; /* g+=t1; */ .align 3 .globl _gcry_sha512_transform_armv7_neon .type _gcry_sha512_transform_armv7_neon,%function; _gcry_sha512_transform_armv7_neon: /* Input: - * %r0: SHA512_CONTEXT - * %r1: data - * %r2: u64 k[] constants - * %r3: nblks + * r0: SHA512_CONTEXT + * r1: data + * r2: u64 k[] constants + * r3: nblks */ - push {%lr}; + push {lr}; - mov %lr, #0; + mov lr, #0; /* Load context to d0-d7 */ - vld1.64 {RA-RD}, [%r0]!; - vld1.64 {RE-RH}, [%r0]; - sub %r0, #(4*8); + vld1.64 {RA-RD}, [r0]!; + vld1.64 {RE-RH}, [r0]; + sub r0, #(4*8); /* Load input to w[16], d16-d31 */ /* NOTE: Assumes that on ARMv7 unaligned accesses are always allowed. 
*/ - vld1.64 {RW0-RW3}, [%r1]!; - vld1.64 {RW4-RW7}, [%r1]!; - vld1.64 {RW8-RW11}, [%r1]!; - vld1.64 {RW12-RW15}, [%r1]!; + vld1.64 {RW0-RW3}, [r1]!; + vld1.64 {RW4-RW7}, [r1]!; + vld1.64 {RW8-RW11}, [r1]!; + vld1.64 {RW12-RW15}, [r1]!; #ifdef __ARMEL__ /* byteswap */ vrev64.8 RW01q, RW01q; vrev64.8 RW23q, RW23q; vrev64.8 RW45q, RW45q; vrev64.8 RW67q, RW67q; vrev64.8 RW89q, RW89q; vrev64.8 RW1011q, RW1011q; vrev64.8 RW1213q, RW1213q; vrev64.8 RW1415q, RW1415q; #endif /* EABI says that d8-d15 must be preserved by callee. */ vpush {RT0-RT7}; .Loop: rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2, RW23q, RW1415q, RW9, RW10, dummy, _); b .Lenter_rounds; .Loop_rounds: rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2, RW23q, RW1415q, RW9, RW10, vadd_RT01q, RW1415q); .Lenter_rounds: rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3, RW23q, RW4, RW45q, RW01q, RW11, RW12, vadd_RT01q, RW01q); rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, RW45q, RW6, RW67q, RW23q, RW13, RW14, vadd_RT01q, RW23q); rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, RW67q, RW8, RW89q, RW45q, RW15, RW0, vadd_RT01q, RW45q); rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, RW89q, RW10, RW1011q, RW67q, RW1, RW2, vadd_RT01q, RW67q); rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, RW1011q, RW12, RW1213q, RW89q, RW3, RW4, vadd_RT01q, RW89q); - add %lr, #16; + add lr, #16; rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, RW1213q, RW14, RW1415q, RW1011q, RW5, RW6, vadd_RT01q, RW1011q); - cmp %lr, #64; + cmp lr, #64; rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, RW1415q, RW0, RW01q, RW1213q, RW7, RW8, vadd_RT01q, RW1213q); bne .Loop_rounds; - subs %r3, #1; + subs r3, #1; rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, vadd_RT01q, RW1415q, dummy, _); rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3, vadd_rg_RT0, RG, vadd_rg_RT1, RG); beq .Lhandle_tail; - vld1.64 {RW0-RW3}, [%r1]!; + vld1.64 {RW0-RW3}, [r1]!; rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, vadd_rg_RT0, RE, vadd_rg_RT1, RE); rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, vadd_rg_RT0, RC, vadd_rg_RT1, RC); #ifdef __ARMEL__ vrev64.8 RW01q, RW01q; vrev64.8 RW23q, RW23q; #endif - vld1.64 {RW4-RW7}, [%r1]!; + vld1.64 {RW4-RW7}, [r1]!; rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, vadd_rg_RT0, RA, vadd_rg_RT1, RA); rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, vadd_rg_RT0, RG, vadd_rg_RT1, RG); #ifdef __ARMEL__ vrev64.8 RW45q, RW45q; vrev64.8 RW67q, RW67q; #endif - vld1.64 {RW8-RW11}, [%r1]!; + vld1.64 {RW8-RW11}, [r1]!; rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, vadd_rg_RT0, RE, vadd_rg_RT1, RE); rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, vadd_rg_RT0, RC, vadd_rg_RT1, RC); #ifdef __ARMEL__ vrev64.8 RW89q, RW89q; vrev64.8 RW1011q, RW1011q; #endif - vld1.64 {RW12-RW15}, [%r1]!; + vld1.64 {RW12-RW15}, [r1]!; vadd_rg_RT0(RA); vadd_rg_RT1(RA); /* Load context */ - vld1.64 {RT0-RT3}, [%r0]!; - vld1.64 {RT4-RT7}, [%r0]; - sub %r0, #(4*8); + vld1.64 {RT0-RT3}, [r0]!; + vld1.64 {RT4-RT7}, [r0]; + sub r0, #(4*8); #ifdef __ARMEL__ vrev64.8 RW1213q, RW1213q; vrev64.8 RW1415q, RW1415q; #endif vadd.u64 RA, RT0; vadd.u64 RB, RT1; vadd.u64 RC, RT2; vadd.u64 RD, RT3; vadd.u64 RE, RT4; vadd.u64 RF, RT5; vadd.u64 RG, RT6; vadd.u64 RH, RT7; /* Store the first half of context */ - vst1.64 {RA-RD}, [%r0]!; + vst1.64 {RA-RD}, [r0]!; sub RK, $(8*80); - vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */ - mov %lr, #0; - 
sub %r0, #(4*8); + vst1.64 {RE-RH}, [r0]; /* Store the last half of context */ + mov lr, #0; + sub r0, #(4*8); b .Loop; .ltorg .Lhandle_tail: rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, vadd_rg_RT0, RE, vadd_rg_RT1, RE); rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, vadd_rg_RT0, RC, vadd_rg_RT1, RC); rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, vadd_rg_RT0, RA, vadd_rg_RT1, RA); rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, vadd_rg_RT0, RG, vadd_rg_RT1, RG); rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, vadd_rg_RT0, RE, vadd_rg_RT1, RE); rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, vadd_rg_RT0, RC, vadd_rg_RT1, RC); /* Load context to d16-d23 */ - vld1.64 {RW0-RW3}, [%r0]!; + vld1.64 {RW0-RW3}, [r0]!; vadd_rg_RT0(RA); - vld1.64 {RW4-RW7}, [%r0]; + vld1.64 {RW4-RW7}, [r0]; vadd_rg_RT1(RA); - sub %r0, #(4*8); + sub r0, #(4*8); vadd.u64 RA, RW0; vadd.u64 RB, RW1; vadd.u64 RC, RW2; vadd.u64 RD, RW3; vadd.u64 RE, RW4; vadd.u64 RF, RW5; vadd.u64 RG, RW6; vadd.u64 RH, RW7; /* Store the first half of context */ - vst1.64 {RA-RD}, [%r0]!; + vst1.64 {RA-RD}, [r0]!; /* Clear used registers */ /* d16-d31 */ CLEAR_REG(RW01q); CLEAR_REG(RW23q); CLEAR_REG(RW45q); CLEAR_REG(RW67q); - vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */ + vst1.64 {RE-RH}, [r0]; /* Store the last half of context */ CLEAR_REG(RW89q); CLEAR_REG(RW1011q); CLEAR_REG(RW1213q); CLEAR_REG(RW1415q); /* d8-d15 */ vpop {RT0-RT7}; /* d0-d7 (q0-q3) */ - CLEAR_REG(%q0); - CLEAR_REG(%q1); - CLEAR_REG(%q2); - CLEAR_REG(%q3); + CLEAR_REG(q0); + CLEAR_REG(q1); + CLEAR_REG(q2); + CLEAR_REG(q3); - eor %r0, %r0; - pop {%pc}; + eor r0, r0; + pop {pc}; .size _gcry_sha512_transform_armv7_neon,.-_gcry_sha512_transform_armv7_neon; #endif diff --git a/cipher/twofish-arm.S b/cipher/twofish-arm.S index 2e1da6cd..b381e546 100644 --- a/cipher/twofish-arm.S +++ b/cipher/twofish-arm.S @@ -1,363 +1,363 @@ /* twofish-arm.S - ARM assembly implementation of Twofish cipher * * Copyright (C) 2013 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
*/ #include <config.h> #if defined(__ARMEL__) #ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS .text .syntax unified .arm /* structure of TWOFISH_context: */ #define s0 0 #define s1 ((s0) + 4 * 256) #define s2 ((s1) + 4 * 256) #define s3 ((s2) + 4 * 256) #define w ((s3) + 4 * 256) #define k ((w) + 4 * 8) /* register macros */ -#define CTX %r0 -#define CTXs0 %r0 -#define CTXs1 %r1 -#define CTXs3 %r7 +#define CTX r0 +#define CTXs0 r0 +#define CTXs1 r1 +#define CTXs3 r7 -#define RA %r3 -#define RB %r4 -#define RC %r5 -#define RD %r6 +#define RA r3 +#define RB r4 +#define RC r5 +#define RD r6 -#define RX %r2 -#define RY %ip +#define RX r2 +#define RY ip -#define RMASK %lr +#define RMASK lr -#define RT0 %r8 -#define RT1 %r9 -#define RT2 %r10 -#define RT3 %r11 +#define RT0 r8 +#define RT1 r9 +#define RT2 r10 +#define RT3 r11 /* helper macros */ #define ldr_unaligned_le(rout, rsrc, offs, rtmp) \ ldrb rout, [rsrc, #((offs) + 0)]; \ ldrb rtmp, [rsrc, #((offs) + 1)]; \ orr rout, rout, rtmp, lsl #8; \ ldrb rtmp, [rsrc, #((offs) + 2)]; \ orr rout, rout, rtmp, lsl #16; \ ldrb rtmp, [rsrc, #((offs) + 3)]; \ orr rout, rout, rtmp, lsl #24; #define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \ mov rtmp0, rin, lsr #8; \ strb rin, [rdst, #((offs) + 0)]; \ mov rtmp1, rin, lsr #16; \ strb rtmp0, [rdst, #((offs) + 1)]; \ mov rtmp0, rin, lsr #24; \ strb rtmp1, [rdst, #((offs) + 2)]; \ strb rtmp0, [rdst, #((offs) + 3)]; #ifndef __ARMEL__ /* bswap on big-endian */ #define host_to_le(reg) \ rev reg, reg; #define le_to_host(reg) \ rev reg, reg; #else /* nop on little-endian */ #define host_to_le(reg) /*_*/ #define le_to_host(reg) /*_*/ #endif #define ldr_input_aligned_le(rin, a, b, c, d) \ ldr a, [rin, #0]; \ ldr b, [rin, #4]; \ le_to_host(a); \ ldr c, [rin, #8]; \ le_to_host(b); \ ldr d, [rin, #12]; \ le_to_host(c); \ le_to_host(d); #define str_output_aligned_le(rout, a, b, c, d) \ le_to_host(a); \ le_to_host(b); \ str a, [rout, #0]; \ le_to_host(c); \ str b, [rout, #4]; \ le_to_host(d); \ str c, [rout, #8]; \ str d, [rout, #12]; #ifdef __ARM_FEATURE_UNALIGNED /* unaligned word reads/writes allowed */ #define ldr_input_le(rin, ra, rb, rc, rd, rtmp) \ ldr_input_aligned_le(rin, ra, rb, rc, rd) #define str_output_le(rout, ra, rb, rc, rd, rtmp0, rtmp1) \ str_output_aligned_le(rout, ra, rb, rc, rd) #else /* need to handle unaligned reads/writes by byte reads */ #define ldr_input_le(rin, ra, rb, rc, rd, rtmp0) \ tst rin, #3; \ beq 1f; \ ldr_unaligned_le(ra, rin, 0, rtmp0); \ ldr_unaligned_le(rb, rin, 4, rtmp0); \ ldr_unaligned_le(rc, rin, 8, rtmp0); \ ldr_unaligned_le(rd, rin, 12, rtmp0); \ b 2f; \ 1:;\ ldr_input_aligned_le(rin, ra, rb, rc, rd); \ 2:; #define str_output_le(rout, ra, rb, rc, rd, rtmp0, rtmp1) \ tst rout, #3; \ beq 1f; \ str_unaligned_le(ra, rout, 0, rtmp0, rtmp1); \ str_unaligned_le(rb, rout, 4, rtmp0, rtmp1); \ str_unaligned_le(rc, rout, 8, rtmp0, rtmp1); \ str_unaligned_le(rd, rout, 12, rtmp0, rtmp1); \ b 2f; \ 1:;\ str_output_aligned_le(rout, ra, rb, rc, rd); \ 2:; #endif /********************************************************************** 1-way twofish **********************************************************************/ #define encrypt_round(a, b, rc, rd, n, ror_a, adj_a) \ and RT0, RMASK, b, lsr#(8 - 2); \ and RY, RMASK, b, lsr#(16 - 2); \ add RT0, RT0, #(s2 - s1); \ and RT1, RMASK, b, lsr#(24 - 2); \ ldr RY, [CTXs3, RY]; \ and RT2, RMASK, b, lsl#(2); \ ldr RT0, [CTXs1, RT0]; \ and RT3, RMASK, a, lsr#(16 - 2 + (adj_a)); \ ldr RT1, [CTXs0, RT1]; \ and RX, RMASK, a, lsr#(8 - 2 + (adj_a)); \ ldr RT2, [CTXs1,
RT2]; \ add RT3, RT3, #(s2 - s1); \ ldr RX, [CTXs1, RX]; \ ror_a(a); \ \ eor RY, RY, RT0; \ ldr RT3, [CTXs1, RT3]; \ and RT0, RMASK, a, lsl#(2); \ eor RY, RY, RT1; \ and RT1, RMASK, a, lsr#(24 - 2); \ eor RY, RY, RT2; \ ldr RT0, [CTXs0, RT0]; \ eor RX, RX, RT3; \ ldr RT1, [CTXs3, RT1]; \ eor RX, RX, RT0; \ \ ldr RT3, [CTXs3, #(k - s3 + 8 * (n) + 4)]; \ eor RX, RX, RT1; \ ldr RT2, [CTXs3, #(k - s3 + 8 * (n))]; \ \ add RT0, RX, RY, lsl #1; \ add RX, RX, RY; \ add RT0, RT0, RT3; \ add RX, RX, RT2; \ eor rd, RT0, rd, ror #31; \ eor rc, rc, RX; #define dummy(x) /*_*/ #define ror1(r) \ ror r, r, #1; #define decrypt_round(a, b, rc, rd, n, ror_b, adj_b) \ and RT3, RMASK, b, lsl#(2 - (adj_b)); \ and RT1, RMASK, b, lsr#(8 - 2 + (adj_b)); \ ror_b(b); \ and RT2, RMASK, a, lsl#(2); \ and RT0, RMASK, a, lsr#(8 - 2); \ \ ldr RY, [CTXs1, RT3]; \ add RT1, RT1, #(s2 - s1); \ ldr RX, [CTXs0, RT2]; \ and RT3, RMASK, b, lsr#(16 - 2); \ ldr RT1, [CTXs1, RT1]; \ and RT2, RMASK, a, lsr#(16 - 2); \ ldr RT0, [CTXs1, RT0]; \ \ add RT2, RT2, #(s2 - s1); \ ldr RT3, [CTXs3, RT3]; \ eor RY, RY, RT1; \ \ and RT1, RMASK, b, lsr#(24 - 2); \ eor RX, RX, RT0; \ ldr RT2, [CTXs1, RT2]; \ and RT0, RMASK, a, lsr#(24 - 2); \ \ ldr RT1, [CTXs0, RT1]; \ \ eor RY, RY, RT3; \ ldr RT0, [CTXs3, RT0]; \ eor RX, RX, RT2; \ eor RY, RY, RT1; \ \ ldr RT1, [CTXs3, #(k - s3 + 8 * (n) + 4)]; \ eor RX, RX, RT0; \ ldr RT2, [CTXs3, #(k - s3 + 8 * (n))]; \ \ add RT0, RX, RY, lsl #1; \ add RX, RX, RY; \ add RT0, RT0, RT1; \ add RX, RX, RT2; \ eor rd, rd, RT0; \ eor rc, RX, rc, ror #31; #define first_encrypt_cycle(nc) \ encrypt_round(RA, RB, RC, RD, (nc) * 2, dummy, 0); \ encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); #define encrypt_cycle(nc) \ encrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \ encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); #define last_encrypt_cycle(nc) \ encrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \ encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \ ror1(RA); #define first_decrypt_cycle(nc) \ decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, dummy, 0); \ decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); #define decrypt_cycle(nc) \ decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \ decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); #define last_decrypt_cycle(nc) \ decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \ decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \ ror1(RD); .align 3 .globl _gcry_twofish_arm_encrypt_block .type _gcry_twofish_arm_encrypt_block,%function; _gcry_twofish_arm_encrypt_block: /* input: - * %r0: ctx - * %r1: dst - * %r2: src + * r0: ctx + * r1: dst + * r2: src */ - push {%r1, %r4-%r11, %ip, %lr}; + push {r1, r4-r11, ip, lr}; add RY, CTXs0, #w; - ldr_input_le(%r2, RA, RB, RC, RD, RT0); + ldr_input_le(r2, RA, RB, RC, RD, RT0); /* Input whitening */ ldm RY, {RT0, RT1, RT2, RT3}; add CTXs3, CTXs0, #(s3 - s0); add CTXs1, CTXs0, #(s1 - s0); mov RMASK, #(0xff << 2); eor RA, RA, RT0; eor RB, RB, RT1; eor RC, RC, RT2; eor RD, RD, RT3; first_encrypt_cycle(0); encrypt_cycle(1); encrypt_cycle(2); encrypt_cycle(3); encrypt_cycle(4); encrypt_cycle(5); encrypt_cycle(6); last_encrypt_cycle(7); add RY, CTXs3, #(w + 4*4 - s3); - pop {%r1}; /* dst */ + pop {r1}; /* dst */ /* Output whitening */ ldm RY, {RT0, RT1, RT2, RT3}; eor RC, RC, RT0; eor RD, RD, RT1; eor RA, RA, RT2; eor RB, RB, RT3; - str_output_le(%r1, RC, RD, RA, RB, RT0, RT1); + str_output_le(r1, RC, RD, RA, RB, RT0, RT1); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .ltorg .size 
_gcry_twofish_arm_encrypt_block,.-_gcry_twofish_arm_encrypt_block; .align 3 .globl _gcry_twofish_arm_decrypt_block .type _gcry_twofish_arm_decrypt_block,%function; _gcry_twofish_arm_decrypt_block: /* input: - * %r0: ctx - * %r1: dst - * %r2: src + * r0: ctx + * r1: dst + * r2: src */ - push {%r1, %r4-%r11, %ip, %lr}; + push {r1, r4-r11, ip, lr}; add CTXs3, CTXs0, #(s3 - s0); - ldr_input_le(%r2, RC, RD, RA, RB, RT0); + ldr_input_le(r2, RC, RD, RA, RB, RT0); add RY, CTXs3, #(w + 4*4 - s3); add CTXs3, CTXs0, #(s3 - s0); /* Input whitening */ ldm RY, {RT0, RT1, RT2, RT3}; add CTXs1, CTXs0, #(s1 - s0); mov RMASK, #(0xff << 2); eor RC, RC, RT0; eor RD, RD, RT1; eor RA, RA, RT2; eor RB, RB, RT3; first_decrypt_cycle(7); decrypt_cycle(6); decrypt_cycle(5); decrypt_cycle(4); decrypt_cycle(3); decrypt_cycle(2); decrypt_cycle(1); last_decrypt_cycle(0); add RY, CTXs0, #w; - pop {%r1}; /* dst */ + pop {r1}; /* dst */ /* Output whitening */ ldm RY, {RT0, RT1, RT2, RT3}; eor RA, RA, RT0; eor RB, RB, RT1; eor RC, RC, RT2; eor RD, RD, RT3; - str_output_le(%r1, RA, RB, RC, RD, RT0, RT1); + str_output_le(r1, RA, RB, RC, RD, RT0, RT1); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .size _gcry_twofish_arm_decrypt_block,.-_gcry_twofish_arm_decrypt_block; #endif /*HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS*/ #endif /*__ARMEL__*/ diff --git a/configure.ac b/configure.ac index cd804305..cc1104ca 100644 --- a/configure.ac +++ b/configure.ac @@ -1,3596 +1,3596 @@ # Configure.ac script for Libgcrypt # Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2006, # 2007, 2008, 2009, 2011 Free Software Foundation, Inc. # Copyright (C) 2012-2021 g10 Code GmbH # # This file is part of Libgcrypt. # # Libgcrypt is free software; you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation; either version 2.1 of # the License, or (at your option) any later version. # # Libgcrypt is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this program; if not, see . # (Process this file with autoconf to produce a configure script.) AC_REVISION($Revision$) AC_PREREQ([2.69]) min_automake_version="1.14" # To build a release you need to create a tag with the version number # (git tag -s libgcrypt-n.m.k) and run "./autogen.sh --force". Please # bump the version number immediately after the release and do another # commit and push so that the git magic is able to work. See below # for the LT versions. m4_define([mym4_package],[libgcrypt]) m4_define([mym4_major], [1]) m4_define([mym4_minor], [11]) m4_define([mym4_micro], [0]) # Below is m4 magic to extract and compute the git revision number, # the decimalized short revision number, a beta version string and a # flag indicating a development version (mym4_isbeta). Note that the # m4 processing is done by autoconf and not during the configure run. 
m4_define([mym4_verslist], m4_split(m4_esyscmd([./autogen.sh --find-version] \ mym4_package mym4_major mym4_minor mym4_micro),[:])) m4_define([mym4_isbeta], m4_argn(2, mym4_verslist)) m4_define([mym4_version], m4_argn(4, mym4_verslist)) m4_define([mym4_revision], m4_argn(7, mym4_verslist)) m4_define([mym4_revision_dec], m4_argn(8, mym4_verslist)) m4_esyscmd([echo ]mym4_version[>VERSION]) AC_INIT([mym4_package],[mym4_version],[https://bugs.gnupg.org]) # LT Version numbers, remember to change them just *before* a release. # NOTE NOTE - Already updated for a 1.11 series - NOTE NOTE # (Code changed: REVISION++) # (Interfaces added/removed/changed: CURRENT++, REVISION=0) # (Interfaces added: AGE++) # (Interfaces removed: AGE=0) # # (Interfaces removed: CURRENT++, AGE=0, REVISION=0) # (Interfaces added: CURRENT++, AGE++, REVISION=0) # (No interfaces changed: REVISION++) LIBGCRYPT_LT_CURRENT=25 LIBGCRYPT_LT_AGE=5 LIBGCRYPT_LT_REVISION=0 ################################################ AC_SUBST(LIBGCRYPT_LT_CURRENT) AC_SUBST(LIBGCRYPT_LT_AGE) AC_SUBST(LIBGCRYPT_LT_REVISION) # If the API is changed in an incompatible way: increment the next counter. # # 1.6: ABI and API change but the change is to most users irrelevant # and thus the API version number has not been incremented. LIBGCRYPT_CONFIG_API_VERSION=1 # If you change the required gpg-error version, please remove # unnecessary error code defines in src/gcrypt-int.h. NEED_GPG_ERROR_VERSION=1.27 AC_CONFIG_AUX_DIR([build-aux]) AC_CONFIG_SRCDIR([src/libgcrypt.vers]) AM_INIT_AUTOMAKE([serial-tests dist-bzip2]) AC_CONFIG_HEADERS([config.h]) AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_LIBOBJ_DIR([compat]) AC_CANONICAL_HOST AM_MAINTAINER_MODE AM_SILENT_RULES AC_USE_SYSTEM_EXTENSIONS AC_ARG_VAR(SYSROOT,[locate config scripts also below that directory]) AH_TOP([ #ifndef _GCRYPT_CONFIG_H_INCLUDED #define _GCRYPT_CONFIG_H_INCLUDED /* Enable gpg-error's strerror macro for W32CE. */ #define GPG_ERR_ENABLE_ERRNO_MACROS 1 ]) AH_BOTTOM([ #define _GCRYPT_IN_LIBGCRYPT 1 /* Add .note.gnu.property section for Intel CET in assembler sources when CET is enabled. */ #if defined(__ASSEMBLER__) && defined(__CET__) # include #endif /* If the configure check for endianness has been disabled, get it from OS macros. This is intended for making fat binary builds on OS X. */ #ifdef DISABLED_ENDIAN_CHECK # if defined(__BIG_ENDIAN__) # define WORDS_BIGENDIAN 1 # elif defined(__LITTLE_ENDIAN__) # undef WORDS_BIGENDIAN # else # error "No endianness found" # endif #endif /*DISABLED_ENDIAN_CHECK*/ /* We basically use the original Camellia source. Make sure the symbols are properly prefixed. */ #define CAMELLIA_EXT_SYM_PREFIX _gcry_ #endif /*_GCRYPT_CONFIG_H_INCLUDED*/ ]) AH_VERBATIM([_REENTRANT], [/* To allow the use of Libgcrypt in multithreaded programs we have to use special features from the library. */ #ifndef _REENTRANT # define _REENTRANT 1 #endif ]) ###################### ## Basic checks. ### (we need some results later on (e.g.
$GCC) ###################### AC_PROG_MAKE_SET missing_dir=`cd $ac_aux_dir && pwd` AM_MISSING_PROG(ACLOCAL, aclocal, $missing_dir) AM_MISSING_PROG(AUTOCONF, autoconf, $missing_dir) AM_MISSING_PROG(AUTOMAKE, automake, $missing_dir) AM_MISSING_PROG(AUTOHEADER, autoheader, $missing_dir) # AM_MISSING_PROG(MAKEINFO, makeinfo, $missing_dir) AC_PROG_CC AC_PROG_CPP AM_PROG_CC_C_O AM_PROG_AS AC_SEARCH_LIBS([strerror],[cposix]) AC_PROG_INSTALL AC_PROG_AWK # Taken from mpfr-4.0.1, then modified for LDADD_FOR_TESTS_KLUDGE dnl Under Linux, make sure that the old dtags are used if LD_LIBRARY_PATH dnl is defined. The issue is that with the new dtags, LD_LIBRARY_PATH has dnl the precedence over the run path, so that if a compatible MPFR library dnl is installed in some directory from $LD_LIBRARY_PATH, then the tested dnl MPFR library will be this library instead of the MPFR library from the dnl build tree. Other OS with the same issue might be added later. dnl dnl References: dnl https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=859732 dnl http://lists.gnu.org/archive/html/libtool/2017-05/msg00000.html dnl dnl We need to check whether --disable-new-dtags is supported as alternate dnl linkers may be used (e.g., with tcc: CC=tcc LD=tcc). dnl case $host in *-*-linux*) if test -n "$LD_LIBRARY_PATH"; then saved_LDFLAGS="$LDFLAGS" LDADD_FOR_TESTS_KLUDGE="-Wl,--disable-new-dtags" LDFLAGS="$LDFLAGS $LDADD_FOR_TESTS_KLUDGE" AC_MSG_CHECKING(whether --disable-new-dtags is supported by the linker) AC_LINK_IFELSE([AC_LANG_SOURCE([[ int main (void) { return 0; } ]])], [AC_MSG_RESULT(yes (use it since LD_LIBRARY_PATH is set))], [AC_MSG_RESULT(no) LDADD_FOR_TESTS_KLUDGE="" ]) LDFLAGS="$saved_LDFLAGS" fi ;; esac AC_SUBST([LDADD_FOR_TESTS_KLUDGE]) VERSION_NUMBER=m4_esyscmd(printf "0x%02x%02x%02x" mym4_major \ mym4_minor mym4_micro) AC_SUBST(VERSION_NUMBER) # We need to compile and run a program on the build machine. AX_CC_FOR_BUILD LT_PREREQ([2.2.6]) LT_INIT([win32-dll disable-static]) LT_LANG([Windows Resource]) ########################## ## General definitions. ## ########################## # Used by libgcrypt-config LIBGCRYPT_CONFIG_LIBS="-lgcrypt" LIBGCRYPT_CONFIG_CFLAGS="" LIBGCRYPT_CONFIG_HOST="$host" # Definitions for symmetric ciphers. available_ciphers="arcfour blowfish cast5 des aes twofish serpent rfc2268 seed" available_ciphers="$available_ciphers camellia idea salsa20 gost28147 chacha20" available_ciphers="$available_ciphers sm4" enabled_ciphers="" # Definitions for public-key ciphers. available_pubkey_ciphers="dsa elgamal rsa ecc" enabled_pubkey_ciphers="" # Definitions for message digests. available_digests="crc gostr3411-94 md2 md4 md5 rmd160 sha1 sha256 sha512" available_digests="$available_digests sha3 tiger whirlpool stribog blake2" available_digests="$available_digests sm3" enabled_digests="" # Definitions for kdfs (optional ones) available_kdfs="s2k pkdf2 scrypt" enabled_kdfs="" # Definitions for random modules. available_random_modules="getentropy linux egd unix" auto_random_modules="$available_random_modules" # Supported thread backends. LIBGCRYPT_THREAD_MODULES="" # Other definitions. have_w32_system=no have_w32ce_system=no have_pthread=no # Setup some stuff depending on host. 
case "${host}" in *-*-mingw32*) ac_cv_have_dev_random=no have_w32_system=yes case "${host}" in *-mingw32ce*) have_w32ce_system=yes available_random_modules="w32ce" ;; *) available_random_modules="w32" ;; esac AC_DEFINE(USE_ONLY_8DOT3,1, [set this to limit filenames to the 8.3 format]) AC_DEFINE(HAVE_DRIVE_LETTERS,1, [defined if we must run on a stupid file system]) AC_DEFINE(HAVE_DOSISH_SYSTEM,1, [defined if we run on some of the PCDOS like systems (DOS, Windoze. OS/2) with special properties like no file modes]) ;; i?86-emx-os2 | i?86-*-os2*emx) # OS/2 with the EMX environment ac_cv_have_dev_random=no AC_DEFINE(HAVE_DRIVE_LETTERS) AC_DEFINE(HAVE_DOSISH_SYSTEM) ;; i?86-*-msdosdjgpp*) # DOS with the DJGPP environment ac_cv_have_dev_random=no AC_DEFINE(HAVE_DRIVE_LETTERS) AC_DEFINE(HAVE_DOSISH_SYSTEM) ;; *-*-hpux*) if test -z "$GCC" ; then CFLAGS="$CFLAGS -Ae -D_HPUX_SOURCE" fi ;; *-dec-osf4*) if test -z "$GCC" ; then # Suppress all warnings # to get rid of the unsigned/signed char mismatch warnings. CFLAGS="$CFLAGS -w" fi ;; m68k-atari-mint) ;; *-apple-darwin*) AC_DEFINE(_DARWIN_C_SOURCE, 1, Expose all libc features (__DARWIN_C_FULL).) AC_DEFINE(USE_POSIX_SPAWN_FOR_TESTS, 1, [defined if we use posix_spawn in test program]) AC_CHECK_HEADERS(spawn.h) ;; *) ;; esac if test "$have_w32_system" = yes; then AC_DEFINE(HAVE_W32_SYSTEM,1, [Defined if we run on a W32 API based system]) if test "$have_w32ce_system" = yes; then AC_DEFINE(HAVE_W32CE_SYSTEM,1,[Defined if we run on WindowsCE]) fi fi AM_CONDITIONAL(HAVE_W32_SYSTEM, test "$have_w32_system" = yes) AM_CONDITIONAL(HAVE_W32CE_SYSTEM, test "$have_w32ce_system" = yes) # A printable OS Name is sometimes useful. case "${host}" in *-*-mingw32ce*) PRINTABLE_OS_NAME="W32CE" ;; *-*-mingw32*) PRINTABLE_OS_NAME="W32" ;; i?86-emx-os2 | i?86-*-os2*emx ) PRINTABLE_OS_NAME="OS/2" ;; i?86-*-msdosdjgpp*) PRINTABLE_OS_NAME="MSDOS/DJGPP" ;; *-linux*) PRINTABLE_OS_NAME="GNU/Linux" ;; *) PRINTABLE_OS_NAME=`uname -s || echo "Unknown"` ;; esac NAME_OF_DEV_RANDOM="/dev/random" NAME_OF_DEV_URANDOM="/dev/urandom" AC_ARG_ENABLE(endian-check, AS_HELP_STRING([--disable-endian-check], [disable the endian check and trust the OS provided macros]), endiancheck=$enableval,endiancheck=yes) if test x"$endiancheck" = xyes ; then AC_C_BIGENDIAN else AC_DEFINE(DISABLED_ENDIAN_CHECK,1,[configure did not test for endianness]) fi AC_CHECK_SIZEOF(unsigned short, 2) AC_CHECK_SIZEOF(unsigned int, 4) AC_CHECK_SIZEOF(unsigned long, 4) AC_CHECK_SIZEOF(unsigned long long, 0) AC_CHECK_SIZEOF(unsigned __int128, 0) AC_CHECK_SIZEOF(void *, 0) AC_TYPE_UINTPTR_T if test "$ac_cv_sizeof_unsigned_short" = "0" \ || test "$ac_cv_sizeof_unsigned_int" = "0" \ || test "$ac_cv_sizeof_unsigned_long" = "0"; then AC_MSG_WARN([Hmmm, something is wrong with the sizes - using defaults]); fi # Ensure that we have UINT64_C before we bother to check for uint64_t AC_CACHE_CHECK([for UINT64_C],[gnupg_cv_uint64_c_works], AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include ]], [[uint64_t foo=UINT64_C(42);]])], gnupg_cv_uint64_c_works=yes,gnupg_cv_uint64_c_works=no)) if test "$gnupg_cv_uint64_c_works" = "yes" ; then AC_CHECK_SIZEOF(uint64_t) fi # Do we have any 64-bit data types? if test "$ac_cv_sizeof_unsigned_int" != "8" \ && test "$ac_cv_sizeof_unsigned_long" != "8" \ && test "$ac_cv_sizeof_unsigned_long_long" != "8" \ && test "$ac_cv_sizeof_uint64_t" != "8"; then AC_MSG_ERROR([[ *** *** No 64-bit integer type available. *** It is not possible to build Libgcrypt on this platform. 
***]]) fi # If not specified otherwise, all available algorithms will be # included. default_ciphers="$available_ciphers" default_pubkey_ciphers="$available_pubkey_ciphers" default_digests="$available_digests" default_kdfs="$available_kdfs" # Blacklist MD2 by default default_digests=`echo $default_digests | sed -e 's/md2//g'` # Substitutions to set generated files in an Emacs buffer to read-only. AC_SUBST(emacs_local_vars_begin, ['Local Variables:']) AC_SUBST(emacs_local_vars_read_only, ['buffer-read-only: t']) AC_SUBST(emacs_local_vars_end, ['End:']) ############################ ## Command line switches. ## ############################ # Implementation of the --enable-ciphers switch. AC_ARG_ENABLE(ciphers, AS_HELP_STRING([--enable-ciphers=ciphers], [select the symmetric ciphers to include]), [enabled_ciphers=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`], [enabled_ciphers=""]) if test "x$enabled_ciphers" = "x" \ -o "$enabled_ciphers" = "yes" \ -o "$enabled_ciphers" = "no"; then enabled_ciphers=$default_ciphers fi AC_MSG_CHECKING([which symmetric ciphers to include]) for cipher in $enabled_ciphers; do LIST_MEMBER($cipher, $available_ciphers) if test "$found" = "0"; then AC_MSG_ERROR([unsupported cipher "$cipher" specified]) fi done AC_MSG_RESULT([$enabled_ciphers]) # Implementation of the --enable-pubkey-ciphers switch. AC_ARG_ENABLE(pubkey-ciphers, AS_HELP_STRING([--enable-pubkey-ciphers=ciphers], [select the public-key ciphers to include]), [enabled_pubkey_ciphers=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`], [enabled_pubkey_ciphers=""]) if test "x$enabled_pubkey_ciphers" = "x" \ -o "$enabled_pubkey_ciphers" = "yes" \ -o "$enabled_pubkey_ciphers" = "no"; then enabled_pubkey_ciphers=$default_pubkey_ciphers fi AC_MSG_CHECKING([which public-key ciphers to include]) for cipher in $enabled_pubkey_ciphers; do LIST_MEMBER($cipher, $available_pubkey_ciphers) if test "$found" = "0"; then AC_MSG_ERROR([unsupported public-key cipher specified]) fi done AC_MSG_RESULT([$enabled_pubkey_ciphers]) # Implementation of the --enable-digests switch. AC_ARG_ENABLE(digests, AS_HELP_STRING([--enable-digests=digests], [select the message digests to include]), [enabled_digests=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`], [enabled_digests=""]) if test "x$enabled_digests" = "x" \ -o "$enabled_digests" = "yes" \ -o "$enabled_digests" = "no"; then enabled_digests=$default_digests fi AC_MSG_CHECKING([which message digests to include]) for digest in $enabled_digests; do LIST_MEMBER($digest, $available_digests) if test "$found" = "0"; then AC_MSG_ERROR([unsupported message digest specified]) fi done AC_MSG_RESULT([$enabled_digests]) # Implementation of the --enable-kdfs switch. AC_ARG_ENABLE(kdfs, AS_HELP_STRING([--enable-kdfs=kdfs], [select the KDFs to include]), [enabled_kdfs=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`], [enabled_kdfs=""]) if test "x$enabled_kdfs" = "x" \ -o "$enabled_kdfs" = "yes" \ -o "$enabled_kdfs" = "no"; then enabled_kdfs=$default_kdfs fi AC_MSG_CHECKING([which key derivation functions to include]) for kdf in $enabled_kdfs; do LIST_MEMBER($kdf, $available_kdfs) if test "$found" = "0"; then AC_MSG_ERROR([unsupported key derivation function specified]) fi done AC_MSG_RESULT([$enabled_kdfs]) # Implementation of the --enable-random switch.
AC_ARG_ENABLE(random, AS_HELP_STRING([--enable-random=name], [select which random number generator to use]), [random=`echo $enableval | tr '[A-Z]' '[a-z]'`], []) if test "x$random" = "x" -o "$random" = "yes" -o "$random" = "no"; then random=default fi AC_MSG_CHECKING([which random module to use]) if test "$random" != "default" -a "$random" != "auto"; then LIST_MEMBER($random, $available_random_modules) if test "$found" = "0"; then AC_MSG_ERROR([unsupported random module specified]) fi fi AC_MSG_RESULT($random) # Implementation of the --disable-dev-random switch. AC_MSG_CHECKING([whether use of /dev/random is requested]) AC_ARG_ENABLE(dev-random, [ --disable-dev-random disable the use of dev random], try_dev_random=$enableval, try_dev_random=yes) AC_MSG_RESULT($try_dev_random) # Implementation of the --with-egd-socket switch. AC_ARG_WITH(egd-socket, [ --with-egd-socket=NAME Use NAME for the EGD socket], egd_socket_name="$withval", egd_socket_name="" ) AC_DEFINE_UNQUOTED(EGD_SOCKET_NAME, "$egd_socket_name", [Define if you don't want the default EGD socket name. For details see cipher/rndegd.c]) # Implementation of --disable-asm. AC_MSG_CHECKING([whether MPI and cipher assembler modules are requested]) AC_ARG_ENABLE([asm], AS_HELP_STRING([--disable-asm], [Disable MPI and cipher assembler modules]), [try_asm_modules=$enableval], [try_asm_modules=yes]) AC_MSG_RESULT($try_asm_modules) if test "$try_asm_modules" != yes ; then AC_DEFINE(ASM_DISABLED,1,[Defined if --disable-asm was used to configure]) fi # Implementation of the --enable-large-data-tests switch. AC_MSG_CHECKING([whether to run large data tests]) AC_ARG_ENABLE(large-data-tests, AS_HELP_STRING([--enable-large-data-tests], [Enable the real long running large data tests]), large_data_tests=$enableval,large_data_tests=no) AC_MSG_RESULT($large_data_tests) AC_SUBST(RUN_LARGE_DATA_TESTS, $large_data_tests) # Implementation of --enable-force-soft-hwfeatures AC_MSG_CHECKING([whether 'soft' HW feature bits are forced on]) AC_ARG_ENABLE([force-soft-hwfeatures], AS_HELP_STRING([--enable-force-soft-hwfeatures], [Enable forcing 'soft' HW feature bits on]), [force_soft_hwfeatures=$enableval], [force_soft_hwfeatures=no]) AC_MSG_RESULT($force_soft_hwfeatures) # Implementation of the --with-capabilities switch. # Check whether we want to use Linux capabilities AC_MSG_CHECKING([whether use of capabilities is requested]) AC_ARG_WITH(capabilities, AS_HELP_STRING([--with-capabilities], [Use linux capabilities [default=no]]), [use_capabilities="$withval"],[use_capabilities=no]) AC_MSG_RESULT($use_capabilities) # Implementation of the --enable-hmac-binary-check. AC_MSG_CHECKING([whether an HMAC binary check is requested]) AC_ARG_ENABLE(hmac-binary-check, AS_HELP_STRING([--enable-hmac-binary-check], [Enable library integrity check]), [use_hmac_binary_check="$enableval"], [use_hmac_binary_check=no]) AC_MSG_RESULT($use_hmac_binary_check) if test "$use_hmac_binary_check" = no ; then DEF_HMAC_BINARY_CHECK='' else AC_DEFINE(ENABLE_HMAC_BINARY_CHECK,1, [Define to support an HMAC based integrity check]) AC_CHECK_TOOL(OBJCOPY, [objcopy]) AC_CHECK_TOOL(READELF, [readelf]) if test "$use_hmac_binary_check" != yes ; then DEF_HMAC_BINARY_CHECK=-DKEY_FOR_BINARY_CHECK="'\"$use_hmac_binary_check\"'" fi fi AM_CONDITIONAL(USE_HMAC_BINARY_CHECK, test "x$use_hmac_binary_check" != xno) AC_SUBST(DEF_HMAC_BINARY_CHECK) # Implementation of the --with-fips-module-version.
AC_ARG_WITH(fips-module-version, AS_HELP_STRING([--with-fips-module-version=VERSION], [Specify the FIPS module version for the build]), fips_module_version="$withval", fips_module_version="" ) AC_DEFINE_UNQUOTED(FIPS_MODULE_VERSION, "$fips_module_version", [Define FIPS module version for certification]) # Implementation of the --disable-jent-support switch. AC_MSG_CHECKING([whether jitter entropy support is requested]) AC_ARG_ENABLE(jent-support, AS_HELP_STRING([--disable-jent-support], [Disable support for the Jitter entropy collector]), jentsupport=$enableval,jentsupport=yes) AC_MSG_RESULT($jentsupport) # Implementation of the --disable-padlock-support switch. AC_MSG_CHECKING([whether padlock support is requested]) AC_ARG_ENABLE(padlock-support, AS_HELP_STRING([--disable-padlock-support], [Disable support for the PadLock Engine of VIA processors]), padlocksupport=$enableval,padlocksupport=yes) AC_MSG_RESULT($padlocksupport) # Implementation of the --disable-aesni-support switch. AC_MSG_CHECKING([whether AESNI support is requested]) AC_ARG_ENABLE(aesni-support, AS_HELP_STRING([--disable-aesni-support], [Disable support for the Intel AES-NI instructions]), aesnisupport=$enableval,aesnisupport=yes) AC_MSG_RESULT($aesnisupport) # Implementation of the --disable-shaext-support switch. AC_MSG_CHECKING([whether SHAEXT support is requested]) AC_ARG_ENABLE(shaext-support, AS_HELP_STRING([--disable-shaext-support], [Disable support for the Intel SHAEXT instructions]), shaextsupport=$enableval,shaextsupport=yes) AC_MSG_RESULT($shaextsupport) # Implementation of the --disable-pclmul-support switch. AC_MSG_CHECKING([whether PCLMUL support is requested]) AC_ARG_ENABLE(pclmul-support, AS_HELP_STRING([--disable-pclmul-support], [Disable support for the Intel PCLMUL instructions]), pclmulsupport=$enableval,pclmulsupport=yes) AC_MSG_RESULT($pclmulsupport) # Implementation of the --disable-sse41-support switch. AC_MSG_CHECKING([whether SSE4.1 support is requested]) AC_ARG_ENABLE(sse41-support, AS_HELP_STRING([--disable-sse41-support], [Disable support for the Intel SSE4.1 instructions]), sse41support=$enableval,sse41support=yes) AC_MSG_RESULT($sse41support) # Implementation of the --disable-drng-support switch. AC_MSG_CHECKING([whether DRNG support is requested]) AC_ARG_ENABLE(drng-support, AS_HELP_STRING([--disable-drng-support], [Disable support for the Intel DRNG (RDRAND instruction)]), drngsupport=$enableval,drngsupport=yes) AC_MSG_RESULT($drngsupport) # Implementation of the --disable-avx-support switch. AC_MSG_CHECKING([whether AVX support is requested]) AC_ARG_ENABLE(avx-support, AS_HELP_STRING([--disable-avx-support], [Disable support for the Intel AVX instructions]), avxsupport=$enableval,avxsupport=yes) AC_MSG_RESULT($avxsupport) # Implementation of the --disable-avx2-support switch. AC_MSG_CHECKING([whether AVX2 support is requested]) AC_ARG_ENABLE(avx2-support, AS_HELP_STRING([--disable-avx2-support], [Disable support for the Intel AVX2 instructions]), avx2support=$enableval,avx2support=yes) AC_MSG_RESULT($avx2support) # Implementation of the --disable-avx512-support switch. AC_MSG_CHECKING([whether AVX512 support is requested]) AC_ARG_ENABLE(avx512-support, AS_HELP_STRING([--disable-avx512-support], [Disable support for the Intel AVX512 instructions]), avx512support=$enableval,avx512support=yes) AC_MSG_RESULT($avx512support) # Implementation of the --disable-gfni-support switch. 
AC_MSG_CHECKING([whether GFNI support is requested]) AC_ARG_ENABLE(gfni-support, AS_HELP_STRING([--disable-gfni-support], [Disable support for the Intel GFNI instructions]), gfnisupport=$enableval,gfnisupport=yes) AC_MSG_RESULT($gfnisupport) # Implementation of the --disable-neon-support switch. AC_MSG_CHECKING([whether NEON support is requested]) AC_ARG_ENABLE(neon-support, AS_HELP_STRING([--disable-neon-support], [Disable support for the ARM NEON instructions]), neonsupport=$enableval,neonsupport=yes) AC_MSG_RESULT($neonsupport) # Implementation of the --disable-arm-crypto-support switch. AC_MSG_CHECKING([whether ARMv8 Crypto Extension support is requested]) AC_ARG_ENABLE(arm-crypto-support, AS_HELP_STRING([--disable-arm-crypto-support], [Disable support for the ARMv8 Crypto Extension instructions]), armcryptosupport=$enableval,armcryptosupport=yes) AC_MSG_RESULT($armcryptosupport) # Implementation of the --disable-sve-support switch. AC_MSG_CHECKING([whether SVE support is requested]) AC_ARG_ENABLE(sve-support, AS_HELP_STRING([--disable-sve-support], [Disable support for the ARMv8 SVE instructions]), svesupport=$enableval,svesupport=yes) AC_MSG_RESULT($svesupport) # Implementation of the --disable-sve2-support switch. AC_MSG_CHECKING([whether SVE2 support is requested]) AC_ARG_ENABLE(sve2-support, AS_HELP_STRING([--disable-sve2-support], [Disable support for the ARMv9 SVE2 instructions]), sve2support=$enableval,sve2support=yes) AC_MSG_RESULT($sve2support) # Implementation of the --disable-ppc-crypto-support switch. AC_MSG_CHECKING([whether PPC crypto support is requested]) AC_ARG_ENABLE(ppc-crypto-support, AS_HELP_STRING([--disable-ppc-crypto-support], [Disable support for the PPC crypto instructions introduced in POWER 8 (PowerISA 2.07)]), ppccryptosupport=$enableval,ppccryptosupport=yes) AC_MSG_RESULT($ppccryptosupport) # Implementation of the --disable-O-flag-munging switch. AC_MSG_CHECKING([whether a -O flag munging is requested]) AC_ARG_ENABLE([O-flag-munging], AS_HELP_STRING([--disable-O-flag-munging], [Disable modification of the cc -O flag]), [enable_o_flag_munging=$enableval], [enable_o_flag_munging=yes]) AC_MSG_RESULT($enable_o_flag_munging) AM_CONDITIONAL(ENABLE_O_FLAG_MUNGING, test "$enable_o_flag_munging" = "yes") # Implementation of the --disable-instrumentation-munging switch. AC_MSG_CHECKING([whether an instrumentation (-fprofile, -fsanitize) munging is requested]) AC_ARG_ENABLE([instrumentation-munging], AS_HELP_STRING([--disable-instrumentation-munging], [Disable modification of the cc instrumentation options]), [enable_instrumentation_munging=$enableval], [enable_instrumentation_munging=yes]) AC_MSG_RESULT($enable_instrumentation_munging) AM_CONDITIONAL(ENABLE_INSTRUMENTATION_MUNGING, test "$enable_instrumentation_munging" = "yes") # Implementation of the --disable-amd64-as-feature-detection switch. AC_MSG_CHECKING([whether to enable AMD64 as(1) feature detection]) AC_ARG_ENABLE(amd64-as-feature-detection, AS_HELP_STRING([--disable-amd64-as-feature-detection], [Disable the auto-detection of AMD64 as(1) features]), amd64_as_feature_detection=$enableval, amd64_as_feature_detection=yes) AC_MSG_RESULT($amd64_as_feature_detection) AC_DEFINE_UNQUOTED(PRINTABLE_OS_NAME, "$PRINTABLE_OS_NAME", [A human readable text with the name of the OS]) # For some systems we know that we have ld_version scripts. # Use it then as default.
have_ld_version_script=no case "${host}" in *-*-linux*) have_ld_version_script=yes ;; *-*-gnu*) have_ld_version_script=yes ;; esac AC_ARG_ENABLE([ld-version-script], AS_HELP_STRING([--enable-ld-version-script], [enable/disable use of linker version script. (default is system dependent)]), [have_ld_version_script=$enableval], [ : ] ) AM_CONDITIONAL(HAVE_LD_VERSION_SCRIPT, test "$have_ld_version_script" = "yes") AC_DEFINE_UNQUOTED(NAME_OF_DEV_RANDOM, "$NAME_OF_DEV_RANDOM", [defined to the name of the strong random device]) AC_DEFINE_UNQUOTED(NAME_OF_DEV_URANDOM, "$NAME_OF_DEV_URANDOM", [defined to the name of the weaker random device]) ############################### #### Checks for libraries. #### ############################### # # gpg-error is required. # AM_PATH_GPG_ERROR("$NEED_GPG_ERROR_VERSION") if test "x$GPG_ERROR_LIBS" = "x"; then AC_MSG_ERROR([libgpg-error is needed. See ftp://ftp.gnupg.org/gcrypt/libgpg-error/ .]) fi AC_DEFINE(GPG_ERR_SOURCE_DEFAULT, GPG_ERR_SOURCE_GCRYPT, [The default error source for libgcrypt.]) AM_CONDITIONAL(USE_GPGRT_CONFIG, [test -n "$GPGRT_CONFIG" \ -a "$ac_cv_path_GPG_ERROR_CONFIG" = no]) # # Check whether pthreads is available # if test "$have_w32_system" != yes; then AC_CHECK_LIB(pthread,pthread_create,have_pthread=yes) if test "$have_pthread" = yes; then AC_DEFINE(HAVE_PTHREAD, 1 ,[Define if we have pthread.]) fi fi # Solaris needs -lsocket and -lnsl. Unisys system includes # gethostbyname in libsocket but needs libnsl for socket. AC_SEARCH_LIBS(setsockopt, [socket], , [AC_SEARCH_LIBS(setsockopt, [socket], , , [-lnsl])]) AC_SEARCH_LIBS(setsockopt, [nsl]) ################################## #### Checks for header files. #### ################################## AC_CHECK_HEADERS(unistd.h sys/auxv.h sys/random.h sys/sysctl.h) ########################################## #### Checks for typedefs, structures, #### #### and compiler characteristics. #### ########################################## AC_C_CONST AC_C_INLINE AC_TYPE_SIZE_T AC_TYPE_PID_T AC_CHECK_TYPES([byte, ushort, u16, u32, u64]) # # Check for __builtin_bswap32 intrinsic. # AC_CACHE_CHECK(for __builtin_bswap32, [gcry_cv_have_builtin_bswap32], [gcry_cv_have_builtin_bswap32=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [int x = 0; int y = __builtin_bswap32(x); return y;])], [gcry_cv_have_builtin_bswap32=yes])]) if test "$gcry_cv_have_builtin_bswap32" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_BSWAP32,1, [Defined if compiler has '__builtin_bswap32' intrinsic]) fi # # Check for __builtin_bswap64 intrinsic. # AC_CACHE_CHECK(for __builtin_bswap64, [gcry_cv_have_builtin_bswap64], [gcry_cv_have_builtin_bswap64=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [long long x = 0; long long y = __builtin_bswap64(x); return y;])], [gcry_cv_have_builtin_bswap64=yes])]) if test "$gcry_cv_have_builtin_bswap64" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_BSWAP64,1, [Defined if compiler has '__builtin_bswap64' intrinsic]) fi # # Check for __builtin_ctz intrinsic. # AC_CACHE_CHECK(for __builtin_ctz, [gcry_cv_have_builtin_ctz], [gcry_cv_have_builtin_ctz=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [unsigned int x = 0; int y = __builtin_ctz(x); return y;])], [gcry_cv_have_builtin_ctz=yes])]) if test "$gcry_cv_have_builtin_ctz" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_CTZ, 1, [Defined if compiler has '__builtin_ctz' intrinsic]) fi # # Check for __builtin_ctzl intrinsic. 
# AC_CACHE_CHECK(for __builtin_ctzl, [gcry_cv_have_builtin_ctzl], [gcry_cv_have_builtin_ctzl=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [unsigned long x = 0; long y = __builtin_ctzl(x); return y;])], [gcry_cv_have_builtin_ctzl=yes])]) if test "$gcry_cv_have_builtin_ctzl" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_CTZL, 1, [Defined if compiler has '__builtin_ctzl' intrinsic]) fi # # Check for __builtin_clz intrinsic. # AC_CACHE_CHECK(for __builtin_clz, [gcry_cv_have_builtin_clz], [gcry_cv_have_builtin_clz=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [unsigned int x = 0; int y = __builtin_clz(x); return y;])], [gcry_cv_have_builtin_clz=yes])]) if test "$gcry_cv_have_builtin_clz" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_CLZ, 1, [Defined if compiler has '__builtin_clz' intrinsic]) fi # # Check for __builtin_clzl intrinsic. # AC_CACHE_CHECK(for __builtin_clzl, [gcry_cv_have_builtin_clzl], [gcry_cv_have_builtin_clzl=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [unsigned long x = 0; long y = __builtin_clzl(x); return y;])], [gcry_cv_have_builtin_clzl=yes])]) if test "$gcry_cv_have_builtin_clzl" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_CLZL, 1, [Defined if compiler has '__builtin_clzl' intrinsic]) fi # # Check for __sync_synchronize intrinsic. # AC_CACHE_CHECK(for __sync_synchronize, [gcry_cv_have_sync_synchronize], [gcry_cv_have_sync_synchronize=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [__sync_synchronize(); return 0;])], [gcry_cv_have_sync_synchronize=yes])]) if test "$gcry_cv_have_sync_synchronize" = "yes" ; then AC_DEFINE(HAVE_SYNC_SYNCHRONIZE, 1, [Defined if compiler has '__sync_synchronize' intrinsic]) fi # # Check for VLA support (variable length arrays). # AC_CACHE_CHECK(whether the variable length arrays are supported, [gcry_cv_have_vla], [gcry_cv_have_vla=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void f1(char *, int); char foo(int i) { char b[(i < 0 ? 0 : i) + 1]; f1(b, sizeof b); return b[0];}]])], [gcry_cv_have_vla=yes])]) if test "$gcry_cv_have_vla" = "yes" ; then AC_DEFINE(HAVE_VLA,1, [Defined if variable length arrays are supported]) fi # # Check for ELF visibility support. 
# AC_CACHE_CHECK(whether the visibility attribute is supported, gcry_cv_visibility_attribute, [gcry_cv_visibility_attribute=no AC_LANG_CONFTEST([AC_LANG_SOURCE( [[int foo __attribute__ ((visibility ("hidden"))) = 1; int bar __attribute__ ((visibility ("protected"))) = 1; ]])]) if ${CC-cc} -Werror -S conftest.c -o conftest.s \ 1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD ; then if grep '\.hidden.*foo' conftest.s >/dev/null 2>&1 ; then if grep '\.protected.*bar' conftest.s >/dev/null 2>&1; then gcry_cv_visibility_attribute=yes fi fi fi ]) if test "$gcry_cv_visibility_attribute" = "yes"; then AC_CACHE_CHECK(for broken visibility attribute, gcry_cv_broken_visibility_attribute, [gcry_cv_broken_visibility_attribute=yes AC_LANG_CONFTEST([AC_LANG_SOURCE( [[int foo (int x); int bar (int x) __asm__ ("foo") __attribute__ ((visibility ("hidden"))); int bar (int x) { return x; } ]])]) if ${CC-cc} -Werror -S conftest.c -o conftest.s \ 1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD ; then if grep '\.hidden@<:@ _@:>@foo' conftest.s >/dev/null 2>&1; then gcry_cv_broken_visibility_attribute=no fi fi ]) fi if test "$gcry_cv_visibility_attribute" = "yes"; then AC_CACHE_CHECK(for broken alias attribute, gcry_cv_broken_alias_attribute, [gcry_cv_broken_alias_attribute=yes AC_LANG_CONFTEST([AC_LANG_SOURCE( [[extern int foo (int x) __asm ("xyzzy"); int bar (int x) { return x; } extern __typeof (bar) foo __attribute ((weak, alias ("bar"))); extern int dfoo; extern __typeof (dfoo) dfoo __asm ("abccb"); int dfoo = 1; ]])]) if ${CC-cc} -Werror -S conftest.c -o conftest.s \ 1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD ; then if grep 'xyzzy' conftest.s >/dev/null 2>&1 && \ grep 'abccb' conftest.s >/dev/null 2>&1; then gcry_cv_broken_alias_attribute=no fi fi ]) fi if test "$gcry_cv_visibility_attribute" = "yes"; then AC_CACHE_CHECK(if gcc supports -fvisibility=hidden, gcry_cv_gcc_has_f_visibility, [gcry_cv_gcc_has_f_visibility=no _gcc_cflags_save=$CFLAGS CFLAGS="-fvisibility=hidden" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])], gcry_cv_gcc_has_f_visibility=yes) CFLAGS=$_gcc_cflags_save; ]) fi if test "$gcry_cv_visibility_attribute" = "yes" \ && test "$gcry_cv_broken_visibility_attribute" != "yes" \ && test "$gcry_cv_broken_alias_attribute" != "yes" \ && test "$gcry_cv_gcc_has_f_visibility" = "yes" then AC_DEFINE(GCRY_USE_VISIBILITY, 1, [Define to use the GNU C visibility attribute.]) CFLAGS="$CFLAGS -fvisibility=hidden" fi # Following attribute tests depend on warnings to cause compile to fail, # so set -Werror temporarily. 
_gcc_cflags_save=$CFLAGS CFLAGS="$CFLAGS -Werror" # # Check whether the compiler supports the GCC style aligned attribute # AC_CACHE_CHECK([whether the GCC style aligned attribute is supported], [gcry_cv_gcc_attribute_aligned], [gcry_cv_gcc_attribute_aligned=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[struct { int a; } foo __attribute__ ((aligned (16)));]])], [gcry_cv_gcc_attribute_aligned=yes])]) if test "$gcry_cv_gcc_attribute_aligned" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_ALIGNED,1, [Defined if a GCC style "__attribute__ ((aligned (n))" is supported]) fi # # Check whether the compiler supports the GCC style packed attribute # AC_CACHE_CHECK([whether the GCC style packed attribute is supported], [gcry_cv_gcc_attribute_packed], [gcry_cv_gcc_attribute_packed=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[struct foolong_s { long b; } __attribute__ ((packed)); struct foo_s { char a; struct foolong_s b; } __attribute__ ((packed)); enum bar { FOO = 1 / (sizeof(struct foo_s) == (sizeof(char) + sizeof(long))), };]])], [gcry_cv_gcc_attribute_packed=yes])]) if test "$gcry_cv_gcc_attribute_packed" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_PACKED,1, [Defined if a GCC style "__attribute__ ((packed))" is supported]) fi # # Check whether the compiler supports the GCC style may_alias attribute # AC_CACHE_CHECK([whether the GCC style may_alias attribute is supported], [gcry_cv_gcc_attribute_may_alias], [gcry_cv_gcc_attribute_may_alias=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[typedef struct foo_s { int a; } __attribute__ ((may_alias)) foo_t;]])], [gcry_cv_gcc_attribute_may_alias=yes])]) if test "$gcry_cv_gcc_attribute_may_alias" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_MAY_ALIAS,1, [Defined if a GCC style "__attribute__ ((may_alias))" is supported]) fi # Restore flags. CFLAGS=$_gcc_cflags_save; # # Check whether the compiler supports 'asm' or '__asm__' keyword for # assembler blocks. # AC_CACHE_CHECK([whether 'asm' assembler keyword is supported], [gcry_cv_have_asm], [gcry_cv_have_asm=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { asm("":::"memory"); }]])], [gcry_cv_have_asm=yes])]) AC_CACHE_CHECK([whether '__asm__' assembler keyword is supported], [gcry_cv_have___asm__], [gcry_cv_have___asm__=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { __asm__("":::"memory"); }]])], [gcry_cv_have___asm__=yes])]) if test "$gcry_cv_have_asm" = "no" ; then if test "$gcry_cv_have___asm__" = "yes" ; then AC_DEFINE(asm,__asm__, [Define to supported assembler block keyword, if plain 'asm' was not supported]) fi fi # # Check whether the compiler supports inline assembly memory barrier. 
# if test "$gcry_cv_have_asm" = "no" ; then if test "$gcry_cv_have___asm__" = "yes" ; then AC_CACHE_CHECK([whether inline assembly memory barrier is supported], [gcry_cv_have_asm_volatile_memory], [gcry_cv_have_asm_volatile_memory=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(int x) { __asm__ volatile("":::"memory"); __asm__ volatile("":"+r"(x)::"memory"); }]])], [gcry_cv_have_asm_volatile_memory=yes])]) fi else AC_CACHE_CHECK([whether inline assembly memory barrier is supported], [gcry_cv_have_asm_volatile_memory], [gcry_cv_have_asm_volatile_memory=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(int x) { asm volatile("":::"memory"); asm volatile("":"+r"(x)::"memory"); }]])], [gcry_cv_have_asm_volatile_memory=yes])]) fi if test "$gcry_cv_have_asm_volatile_memory" = "yes" ; then AC_DEFINE(HAVE_GCC_ASM_VOLATILE_MEMORY,1, [Define if inline asm memory barrier is supported]) fi # # Check whether GCC assembler supports features needed for our ARM # implementations. This needs to be done before setting up the # assembler stuff. # AC_CACHE_CHECK([whether GCC assembler is compatible for ARM assembly implementations], [gcry_cv_gcc_arm_platform_as_ok], [if test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_arm_platform_as_ok="n/a" else gcry_cv_gcc_arm_platform_as_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( /* Test if assembler supports UAL syntax. */ ".syntax unified\n\t" ".arm\n\t" /* our assembly code is in ARM mode */ ".text\n\t" /* Following causes error if assembler ignored '.syntax unified'. */ "asmfunc:\n\t" - "add %r0, %r0, %r4, ror #12;\n\t" + "add r0, r0, r4, ror #12;\n\t" /* Test if '.type' and '.size' are supported. */ ".size asmfunc,.-asmfunc;\n\t" ".type asmfunc,%function;\n\t" ); void asmfunc(void);]], [ asmfunc(); ] )], [gcry_cv_gcc_arm_platform_as_ok=yes]) fi]) if test "$gcry_cv_gcc_arm_platform_as_ok" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS,1, [Defined if underlying assembler is compatible with ARM assembly implementations]) fi # # Check whether GCC assembler supports features needed for our ARMv8/Aarch64 # implementations. This needs to be done before setting up the # assembler stuff. # AC_CACHE_CHECK([whether GCC assembler is compatible for ARMv8/Aarch64 assembly implementations], [gcry_cv_gcc_aarch64_platform_as_ok], [if test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_aarch64_platform_as_ok="n/a" else gcry_cv_gcc_aarch64_platform_as_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".text\n\t" "asmfunc:\n\t" "eor x0, x0, x30, ror #12;\n\t" "add x0, x0, x30, asr #12;\n\t" "eor v0.16b, v0.16b, v31.16b;\n\t" ); void asmfunc(void);]], [ asmfunc(); ] )], [gcry_cv_gcc_aarch64_platform_as_ok=yes]) fi]) if test "$gcry_cv_gcc_aarch64_platform_as_ok" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS,1, [Defined if underlying assembler is compatible with ARMv8/Aarch64 assembly implementations]) fi # # Check whether GCC assembler supports for CFI directives. 
# AC_CACHE_CHECK([whether GCC assembler supports for CFI directives], [gcry_cv_gcc_asm_cfi_directives], [gcry_cv_gcc_asm_cfi_directives=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".text\n\t" "ac_test:\n\t" ".cfi_startproc\n\t" ".cfi_remember_state\n\t" ".cfi_adjust_cfa_offset 8\n\t" ".cfi_rel_offset 0, 8\n\t" ".cfi_def_cfa_register 1\n\t" ".cfi_register 2, 3\n\t" ".cfi_restore 2\n\t" ".cfi_escape 0x0f, 0x02, 0x11, 0x00\n\t" ".cfi_restore_state\n\t" ".long 0\n\t" ".cfi_endproc\n\t" ); void asmfunc(void)]])], [gcry_cv_gcc_asm_cfi_directives=yes])]) if test "$gcry_cv_gcc_asm_cfi_directives" = "yes" ; then AC_DEFINE(HAVE_GCC_ASM_CFI_DIRECTIVES,1, [Defined if underlying assembler supports for CFI directives]) fi # # Check whether GCC assembler supports for ELF directives. # AC_CACHE_CHECK([whether GCC assembler supports for ELF directives], [gcry_cv_gcc_asm_elf_directives], [gcry_cv_gcc_asm_elf_directives=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( /* Test if ELF directives '.type' and '.size' are supported. */ ".text\n\t" "asmfunc:\n\t" ".size asmfunc,.-asmfunc;\n\t" ".type asmfunc,STT_FUNC;\n\t" );]])], [gcry_cv_gcc_asm_elf_directives=yes])]) if test "$gcry_cv_gcc_asm_elf_directives" = "yes" ; then AC_DEFINE(HAVE_GCC_ASM_ELF_DIRECTIVES,1, [Defined if underlying assembler supports for ELF directives]) fi # # Check whether underscores in symbols are required. This needs to be # done before setting up the assembler stuff. # GNUPG_SYS_SYMBOL_UNDERSCORE() ################################# #### #### #### Setup assembler stuff. #### #### Define mpi_cpu_arch. #### #### #### ################################# AC_ARG_ENABLE(mpi-path, AS_HELP_STRING([--enable-mpi-path=EXTRA_PATH], [prepend EXTRA_PATH to list of CPU specific optimizations]), mpi_extra_path="$enableval",mpi_extra_path="") AC_MSG_CHECKING(architecture and mpi assembler functions) if test -f $srcdir/mpi/config.links ; then . $srcdir/mpi/config.links AC_CONFIG_LINKS("$mpi_ln_list") ac_cv_mpi_sflags="$mpi_sflags" AC_MSG_RESULT($mpi_cpu_arch) else AC_MSG_RESULT(failed) AC_MSG_ERROR([mpi/config.links missing!]) fi MPI_SFLAGS="$ac_cv_mpi_sflags" AC_SUBST(MPI_SFLAGS) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_ADD1, test "$mpi_mod_asm_mpih_add1" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_SUB1, test "$mpi_mod_asm_mpih_sub1" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_MUL1, test "$mpi_mod_asm_mpih_mul1" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_MUL2, test "$mpi_mod_asm_mpih_mul2" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_MUL3, test "$mpi_mod_asm_mpih_mul3" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_LSHIFT, test "$mpi_mod_asm_mpih_lshift" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_RSHIFT, test "$mpi_mod_asm_mpih_rshift" = yes) AM_CONDITIONAL(MPI_MOD_ASM_UDIV, test "$mpi_mod_asm_udiv" = yes) AM_CONDITIONAL(MPI_MOD_ASM_UDIV_QRNND, test "$mpi_mod_asm_udiv_qrnnd" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_ADD1, test "$mpi_mod_c_mpih_add1" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_SUB1, test "$mpi_mod_c_mpih_sub1" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_MUL1, test "$mpi_mod_c_mpih_mul1" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_MUL2, test "$mpi_mod_c_mpih_mul2" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_MUL3, test "$mpi_mod_c_mpih_mul3" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_LSHIFT, test "$mpi_mod_c_mpih_lshift" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_RSHIFT, test "$mpi_mod_c_mpih_rshift" = yes) AM_CONDITIONAL(MPI_MOD_C_UDIV, test "$mpi_mod_c_udiv" = yes) AM_CONDITIONAL(MPI_MOD_C_UDIV_QRNND, test "$mpi_mod_c_udiv_qrnnd" = yes) # Reset non applicable feature flags. 
if test "$mpi_cpu_arch" != "x86" ; then aesnisupport="n/a" shaextsupport="n/a" pclmulsupport="n/a" sse41support="n/a" avxsupport="n/a" avx2support="n/a" avx512support="n/a" gfnisupport="n/a" padlocksupport="n/a" drngsupport="n/a" fi if test "$mpi_cpu_arch" != "arm" ; then if test "$mpi_cpu_arch" != "aarch64" ; then neonsupport="n/a" armcryptosupport="n/a" svesupport="n/a" sve2support="n/a" fi fi if test "$mpi_cpu_arch" != "ppc"; then ppccryptosupport="n/a" fi ############################################# #### #### #### Platform specific compiler checks. #### #### #### ############################################# # Following tests depend on warnings to cause compile to fail, so set -Werror # temporarily. _gcc_cflags_save=$CFLAGS CFLAGS="$CFLAGS -Werror" # # Check whether compiler supports 'ms_abi' function attribute. # AC_CACHE_CHECK([whether compiler supports 'ms_abi' function attribute], [gcry_cv_gcc_attribute_ms_abi], [gcry_cv_gcc_attribute_ms_abi=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[int __attribute__ ((ms_abi)) proto(int);]])], [gcry_cv_gcc_attribute_ms_abi=yes])]) if test "$gcry_cv_gcc_attribute_ms_abi" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_MS_ABI,1, [Defined if compiler supports "__attribute__ ((ms_abi))" function attribute]) fi # # Check whether compiler supports 'sysv_abi' function attribute. # AC_CACHE_CHECK([whether compiler supports 'sysv_abi' function attribute], [gcry_cv_gcc_attribute_sysv_abi], [gcry_cv_gcc_attribute_sysv_abi=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[int __attribute__ ((sysv_abi)) proto(int);]])], [gcry_cv_gcc_attribute_sysv_abi=yes])]) if test "$gcry_cv_gcc_attribute_sysv_abi" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_SYSV_ABI,1, [Defined if compiler supports "__attribute__ ((sysv_abi))" function attribute]) fi # # Check whether default calling convention is 'ms_abi'. # if test "$gcry_cv_gcc_attribute_ms_abi" = "yes" ; then AC_CACHE_CHECK([whether default calling convention is 'ms_abi'], [gcry_cv_gcc_default_abi_is_ms_abi], [gcry_cv_gcc_default_abi_is_ms_abi=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void *test(void) { void *(*def_func)(void) = test; void *__attribute__((ms_abi))(*msabi_func)(void); /* warning on SysV abi targets, passes on Windows based targets */ msabi_func = def_func; return msabi_func; }]])], [gcry_cv_gcc_default_abi_is_ms_abi=yes])]) if test "$gcry_cv_gcc_default_abi_is_ms_abi" = "yes" ; then AC_DEFINE(HAVE_GCC_DEFAULT_ABI_IS_MS_ABI,1, [Defined if default calling convention is 'ms_abi']) fi fi # # Check whether default calling convention is 'sysv_abi'. # if test "$gcry_cv_gcc_attribute_sysv_abi" = "yes" ; then AC_CACHE_CHECK([whether default calling convention is 'sysv_abi'], [gcry_cv_gcc_default_abi_is_sysv_abi], [gcry_cv_gcc_default_abi_is_sysv_abi=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void *test(void) { void *(*def_func)(void) = test; void *__attribute__((sysv_abi))(*sysvabi_func)(void); /* warning on MS ABI targets, passes on SysV ABI targets */ sysvabi_func = def_func; return sysvabi_func; }]])], [gcry_cv_gcc_default_abi_is_sysv_abi=yes])]) if test "$gcry_cv_gcc_default_abi_is_sysv_abi" = "yes" ; then AC_DEFINE(HAVE_GCC_DEFAULT_ABI_IS_SYSV_ABI,1, [Defined if default calling convention is 'sysv_abi']) fi fi # Restore flags. CFLAGS=$_gcc_cflags_save; # # Check whether GCC inline assembler supports SSSE3 instructions # This is required for the AES-NI instructions. 
# AC_CACHE_CHECK([whether GCC inline assembler supports SSSE3 instructions], [gcry_cv_gcc_inline_asm_ssse3], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_ssse3="n/a" else gcry_cv_gcc_inline_asm_ssse3=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[static unsigned char be_mask[16] __attribute__ ((aligned (16))) = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; void a(void) { __asm__("pshufb %[mask], %%xmm2\n\t"::[mask]"m"(*be_mask):); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_ssse3=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_ssse3" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_SSSE3,1, [Defined if inline assembler supports SSSE3 instructions]) fi # # Check whether GCC inline assembler supports PCLMUL instructions. # AC_CACHE_CHECK([whether GCC inline assembler supports PCLMUL instructions], [gcry_cv_gcc_inline_asm_pclmul], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_pclmul="n/a" else gcry_cv_gcc_inline_asm_pclmul=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { __asm__("pclmulqdq \$0, %%xmm1, %%xmm3\n\t":::"cc"); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_pclmul=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_pclmul" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_PCLMUL,1, [Defined if inline assembler supports PCLMUL instructions]) fi # # Check whether GCC inline assembler supports SHA Extensions instructions. # AC_CACHE_CHECK([whether GCC inline assembler supports SHA Extensions instructions], [gcry_cv_gcc_inline_asm_shaext], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_shaext="n/a" else gcry_cv_gcc_inline_asm_shaext=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { __asm__("sha1rnds4 \$0, %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha1nexte %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha1msg1 %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha1msg2 %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha256rnds2 %%xmm0, %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha256msg1 %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha256msg2 %%xmm1, %%xmm3\n\t":::"cc"); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_shaext=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_shaext" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_SHAEXT,1, [Defined if inline assembler supports SHA Extensions instructions]) fi # # Check whether GCC inline assembler supports SSE4.1 instructions. 
# AC_CACHE_CHECK([whether GCC inline assembler supports SSE4.1 instructions], [gcry_cv_gcc_inline_asm_sse41], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_sse41="n/a" else gcry_cv_gcc_inline_asm_sse41=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { int i; __asm__("pextrd \$2, %%xmm0, %[out]\n\t" : [out] "=m" (i)); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_sse41=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_sse41" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_SSE41,1, [Defined if inline assembler supports SSE4.1 instructions]) fi # # Check whether GCC inline assembler supports AVX instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AVX instructions], [gcry_cv_gcc_inline_asm_avx], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_avx="n/a" else gcry_cv_gcc_inline_asm_avx=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { __asm__("xgetbv; vaesdeclast (%[mem]),%%xmm0,%%xmm7\n\t"::[mem]"r"(0):); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_avx=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_avx" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AVX,1, [Defined if inline assembler supports AVX instructions]) fi # # Check whether GCC inline assembler supports AVX2 instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AVX2 instructions], [gcry_cv_gcc_inline_asm_avx2], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_avx2="n/a" else gcry_cv_gcc_inline_asm_avx2=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { __asm__("xgetbv; vpbroadcastb %%xmm7,%%ymm1\n\t":::"cc"); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_avx2=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_avx2" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AVX2,1, [Defined if inline assembler supports AVX2 instructions]) fi # # Check whether GCC inline assembler supports AVX512 instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AVX512 instructions], [gcry_cv_gcc_inline_asm_avx512], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_avx512="n/a" else gcry_cv_gcc_inline_asm_avx512=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { __asm__("xgetbv; vpopcntq %%zmm7, %%zmm1%{%%k1%}%{z%};\n\t":::"cc"); __asm__("vpexpandb %%zmm3, %%zmm1;\n\t":::"cc"); __asm__("vpxorq %%xmm7, %%xmm7, %%xmm7;\n\t":::"cc"); __asm__("vpxorq %%ymm7, %%ymm7, %%ymm7;\n\t":::"cc"); __asm__("vpxorq (%%eax)%{1to8%}, %%zmm7, %%zmm7;\n\t":::"cc"); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_avx512=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_avx512" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AVX512,1, [Defined if inline assembler supports AVX512 instructions]) fi # # Check whether GCC inline assembler supports VAES and VPCLMUL instructions # AC_CACHE_CHECK([whether GCC inline assembler supports VAES and VPCLMUL instructions], [gcry_cv_gcc_inline_asm_vaes_vpclmul], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_vaes_vpclmul="n/a" else gcry_cv_gcc_inline_asm_vaes_vpclmul=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { __asm__("vaesenclast %%ymm7,%%ymm7,%%ymm1\n\t":::"cc");/*256-bit*/ __asm__("vaesenclast %%zmm7,%%zmm7,%%zmm1\n\t":::"cc");/*512-bit*/ __asm__("vpclmulqdq \$0,%%ymm7,%%ymm7,%%ymm1\n\t":::"cc");/*256-bit*/ __asm__("vpclmulqdq \$0,%%zmm7,%%zmm7,%%zmm1\n\t":::"cc");/*512-bit*/ }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_vaes_vpclmul=yes]) fi]) if test 
"$gcry_cv_gcc_inline_asm_vaes_vpclmul" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_VAES_VPCLMUL,1, [Defined if inline assembler supports VAES and VPCLMUL instructions]) fi # # Check whether GCC inline assembler supports GFNI instructions # AC_CACHE_CHECK([whether GCC inline assembler supports GFNI instructions], [gcry_cv_gcc_inline_asm_gfni], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_gfni="n/a" else gcry_cv_gcc_inline_asm_gfni=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { __asm__("gf2p8affineqb \$123, %%xmm0, %%xmm0;\n\t":::"cc"); /* SSE */ __asm__("vgf2p8affineinvqb \$234, %%ymm1, %%ymm1, %%ymm1;\n\t":::"cc"); /* AVX */ __asm__("vgf2p8mulb (%%eax), %%zmm2, %%zmm2;\n\t":::"cc"); /* AVX512 */ }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_gfni=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_gfni" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_GFNI,1, [Defined if inline assembler supports GFNI instructions]) fi # # Check whether GCC inline assembler supports BMI2 instructions # AC_CACHE_CHECK([whether GCC inline assembler supports BMI2 instructions], [gcry_cv_gcc_inline_asm_bmi2], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_bmi2="n/a" else gcry_cv_gcc_inline_asm_bmi2=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[unsigned int a(unsigned int x, unsigned int y) { unsigned int tmp1, tmp2; asm ("rorxl %2, %1, %0" : "=r" (tmp1) : "rm0" (x), "J" (32 - ((23) & 31))); asm ("andnl %2, %1, %0" : "=r" (tmp2) : "r0" (x), "rm" (y)); return tmp1 + tmp2; }]], [ a(1, 2); ] )], [gcry_cv_gcc_inline_asm_bmi2=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_bmi2" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_BMI2,1, [Defined if inline assembler supports BMI2 instructions]) fi # # Check whether GCC assembler needs "-Wa,--divide" to correctly handle # constant division # if test $amd64_as_feature_detection = yes; then AC_CACHE_CHECK([whether GCC assembler handles division correctly], [gcry_cv_gcc_as_const_division_ok], [gcry_cv_gcc_as_const_division_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__(".text\n\tfn:\n\t xorl \$(123456789/12345678), %ebp;\n\t"); void fn(void);]], [fn();])], [gcry_cv_gcc_as_const_division_ok=yes])]) if test "$gcry_cv_gcc_as_const_division_ok" = "no" ; then # # Add '-Wa,--divide' to CPPFLAGS and try check again. # _gcc_cppflags_save="$CPPFLAGS" CPPFLAGS="$CPPFLAGS -Wa,--divide" AC_CACHE_CHECK([whether GCC assembler handles division correctly with "-Wa,--divide"], [gcry_cv_gcc_as_const_division_with_wadivide_ok], [gcry_cv_gcc_as_const_division_with_wadivide_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__(".text\n\tfn:\n\t xorl \$(123456789/12345678), %ebp;\n\t"); void fn(void);]], [fn();])], [gcry_cv_gcc_as_const_division_with_wadivide_ok=yes])]) if test "$gcry_cv_gcc_as_const_division_with_wadivide_ok" = "no" ; then # '-Wa,--divide' did not work, restore old flags. CPPFLAGS="$_gcc_cppflags_save" fi fi fi # # Check whether GCC assembler supports features needed for our amd64 # implementations # if test $amd64_as_feature_detection = yes; then AC_CACHE_CHECK([whether GCC assembler is compatible for amd64 assembly implementations], [gcry_cv_gcc_amd64_platform_as_ok], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_amd64_platform_as_ok="n/a" else gcry_cv_gcc_amd64_platform_as_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( /* Test if '.type' and '.size' are supported. */ /* These work only on ELF targets. 
*/ ".text\n\t" "asmfunc:\n\t" ".size asmfunc,.-asmfunc;\n\t" ".type asmfunc,@function;\n\t" /* Test if assembler allows use of '/' for constant division * (Solaris/x86 issue). If previous constant division check * and "-Wa,--divide" workaround failed, this causes assembly * to be disable on this machine. */ "xorl \$(123456789/12345678), %ebp;\n\t" ); void asmfunc(void);]], [ asmfunc(); ])], [gcry_cv_gcc_amd64_platform_as_ok=yes]) fi]) if test "$gcry_cv_gcc_amd64_platform_as_ok" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS,1, [Defined if underlying assembler is compatible with amd64 assembly implementations]) fi if test "$gcry_cv_gcc_amd64_platform_as_ok" = "no" && test "$gcry_cv_gcc_attribute_sysv_abi" = "yes" && test "$gcry_cv_gcc_default_abi_is_ms_abi" = "yes"; then AC_CACHE_CHECK([whether GCC assembler is compatible for WIN64 assembly implementations], [gcry_cv_gcc_win64_platform_as_ok], [gcry_cv_gcc_win64_platform_as_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".text\n\t" ".globl asmfunc\n\t" "asmfunc:\n\t" "xorq \$(1234), %rbp;\n\t" ); void asmfunc(void);]], [ asmfunc(); ])], [gcry_cv_gcc_win64_platform_as_ok=yes])]) if test "$gcry_cv_gcc_win64_platform_as_ok" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS,1, [Defined if underlying assembler is compatible with WIN64 assembly implementations]) fi fi fi # # Check whether GCC assembler supports features needed for assembly # implementations that use Intel syntax # AC_CACHE_CHECK([whether GCC assembler is compatible for Intel syntax assembly implementations], [gcry_cv_gcc_platform_as_ok_for_intel_syntax], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_platform_as_ok_for_intel_syntax="n/a" else gcry_cv_gcc_platform_as_ok_for_intel_syntax=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".intel_syntax noprefix\n\t" ".text\n\t" "actest:\n\t" "pxor xmm1, xmm7;\n\t" "vperm2i128 ymm2, ymm3, ymm0, 1;\n\t" "add eax, ebp;\n\t" "rorx eax, ebp, 1;\n\t" "sub eax, [esp + 4];\n\t" "add dword ptr [esp + eax], 0b10101;\n\t" ".att_syntax prefix\n\t" ); void actest(void);]], [ actest(); ])], [gcry_cv_gcc_platform_as_ok_for_intel_syntax=yes]) fi]) if test "$gcry_cv_gcc_platform_as_ok_for_intel_syntax" = "yes" ; then AC_DEFINE(HAVE_INTEL_SYNTAX_PLATFORM_AS,1, [Defined if underlying assembler is compatible with Intel syntax assembly implementations]) fi # # Check whether compiler is configured for ARMv6 or newer architecture # AC_CACHE_CHECK([whether compiler is configured for ARMv6 or newer architecture], [gcry_cv_cc_arm_arch_is_v6], [if test "$mpi_cpu_arch" != "arm" || test "$try_asm_modules" != "yes" ; then gcry_cv_cc_arm_arch_is_v6="n/a" else gcry_cv_cc_arm_arch_is_v6=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[ #if defined(__arm__) && \ ((defined(__ARM_ARCH) && __ARM_ARCH >= 6) \ || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \ || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) \ || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \ || defined(__ARM_ARCH_7EM__)) /* empty */ #else /* fail compile if not ARMv6. 
*/ not_armv6 not_armv6 = (not_armv6)not_armv6; #endif ]])], [gcry_cv_cc_arm_arch_is_v6=yes]) fi]) if test "$gcry_cv_cc_arm_arch_is_v6" = "yes" ; then AC_DEFINE(HAVE_ARM_ARCH_V6,1, [Defined if ARM architecture is v6 or newer]) fi # # Check whether GCC inline assembler supports NEON instructions # AC_CACHE_CHECK([whether GCC inline assembler supports NEON instructions], [gcry_cv_gcc_inline_asm_neon], [if test "$mpi_cpu_arch" != "arm" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_neon="n/a" else gcry_cv_gcc_inline_asm_neon=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".syntax unified\n\t" ".arm\n\t" ".fpu neon\n\t" ".text\n\t" "testfn:\n\t" - "vld1.64 {%q0-%q1}, [%r0]!;\n\t" - "vrev64.8 %q0, %q3;\n\t" - "vadd.u64 %q0, %q1;\n\t" - "vadd.s64 %d3, %d2, %d3;\n\t" + "vld1.64 {q0-q1}, [r0]!;\n\t" + "vrev64.8 q0, q3;\n\t" + "vadd.u64 q0, q1;\n\t" + "vadd.s64 d3, d2, d3;\n\t" ); void testfn(void); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_neon=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_neon" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_NEON,1, [Defined if inline assembler supports NEON instructions]) fi # # Check whether GCC inline assembler supports AArch32 Crypto Extension instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AArch32 Crypto Extension instructions], [gcry_cv_gcc_inline_asm_aarch32_crypto], [if test "$mpi_cpu_arch" != "arm" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_aarch32_crypto="n/a" else gcry_cv_gcc_inline_asm_aarch32_crypto=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".syntax unified\n\t" ".arch armv8-a\n\t" ".arm\n\t" ".fpu crypto-neon-fp-armv8\n\t" ".text\n\t" "testfn:\n\t" "sha1h.32 q0, q0;\n\t" "sha1c.32 q0, q0, q0;\n\t" "sha1p.32 q0, q0, q0;\n\t" "sha1su0.32 q0, q0, q0;\n\t" "sha1su1.32 q0, q0;\n\t" "sha256h.32 q0, q0, q0;\n\t" "sha256h2.32 q0, q0, q0;\n\t" "sha1p.32 q0, q0, q0;\n\t" "sha256su0.32 q0, q0;\n\t" "sha256su1.32 q0, q0, q15;\n\t" "aese.8 q0, q0;\n\t" "aesd.8 q0, q0;\n\t" "aesmc.8 q0, q0;\n\t" "aesimc.8 q0, q0;\n\t" "vmull.p64 q0, d0, d0;\n\t" ); void testfn(void); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_aarch32_crypto=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_aarch32_crypto" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO,1, [Defined if inline assembler supports AArch32 Crypto Extension instructions]) fi # # Check whether GCC inline assembler supports AArch64 NEON instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 NEON instructions], [gcry_cv_gcc_inline_asm_aarch64_neon], [if test "$mpi_cpu_arch" != "aarch64" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_aarch64_neon="n/a" else gcry_cv_gcc_inline_asm_aarch64_neon=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".cpu generic+simd\n\t" ".text\n\t" "testfn:\n\t" "mov w0, \#42;\n\t" "dup v0.8b, w0;\n\t" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b},[x0],\#32;\n\t" ); void testfn(void); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_aarch64_neon=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_aarch64_neon" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH64_NEON,1, [Defined if inline assembler supports AArch64 NEON instructions]) fi # # Check whether GCC inline assembler supports AArch64 Crypto Extension instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 Crypto Extension instructions], [gcry_cv_gcc_inline_asm_aarch64_crypto], [if test "$mpi_cpu_arch" != "aarch64" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_aarch64_crypto="n/a" else 
gcry_cv_gcc_inline_asm_aarch64_crypto=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".cpu generic+simd+crypto\n\t" ".text\n\t" "testfn:\n\t" "mov w0, \#42;\n\t" "dup v0.8b, w0;\n\t" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b},[x0],\#32;\n\t" "sha1h s0, s0;\n\t" "sha1c q0, s0, v0.4s;\n\t" "sha1p q0, s0, v0.4s;\n\t" "sha1su0 v0.4s, v0.4s, v0.4s;\n\t" "sha1su1 v0.4s, v0.4s;\n\t" "sha256h q0, q0, v0.4s;\n\t" "sha256h2 q0, q0, v0.4s;\n\t" "sha1p q0, s0, v0.4s;\n\t" "sha256su0 v0.4s, v0.4s;\n\t" "sha256su1 v0.4s, v0.4s, v31.4s;\n\t" "aese v0.16b, v0.16b;\n\t" "aesd v0.16b, v0.16b;\n\t" "aesmc v0.16b, v0.16b;\n\t" "aesimc v0.16b, v0.16b;\n\t" "pmull v0.1q, v0.1d, v31.1d;\n\t" "pmull2 v0.1q, v0.2d, v31.2d;\n\t" ); void testfn(void); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_aarch64_crypto=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_aarch64_crypto" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO,1, [Defined if inline assembler supports AArch64 Crypto Extension instructions]) fi # # Check whether GCC inline assembler supports AArch64 SVE instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 SVE instructions], [gcry_cv_gcc_inline_asm_aarch64_sve], [if test "$mpi_cpu_arch" != "aarch64" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_aarch64_sve="n/a" else gcry_cv_gcc_inline_asm_aarch64_sve=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".cpu generic+simd+sve\n\t" ".text\n\t" "testfn:\n\t" "mov x0, \#60;\n\t" "whilelo p0.s, xzr, x0;\n\t" "mov z0.s, p0/z, \#55;\n\t" "ld1b {z0.b}, p0/z, [x1];\n\t" ); void testfn(void); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_aarch64_sve=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_aarch64_sve" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH64_SVE,1, [Defined if inline assembler supports AArch64 SVE instructions]) fi # # Check whether GCC inline assembler supports AArch64 SVE2 instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 SVE2 instructions], [gcry_cv_gcc_inline_asm_aarch64_sve2], [if test "$mpi_cpu_arch" != "aarch64" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_aarch64_sve2="n/a" else gcry_cv_gcc_inline_asm_aarch64_sve2=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".cpu generic+simd+sve2\n\t" ".text\n\t" "testfn:\n\t" ";\n\t" "eor3 z0.d, z0.d, z1.d, z2.d;\n\t" "ext z8.b, {z20.b, z21.b}, \#3;\n\t" "adclt z0.d, z1.d, z2.d;\n\t" "tbl z0.b, {z8.b, z9.b}, z1.b;\n\t" "addhnb z16.s, z17.d, z18.d;\n\t" "mov z0.s, p0/z, \#55;\n\t" "ld1b {z0.b}, p0/z, [x1];\n\t" ); void testfn(void); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_aarch64_sve2=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_aarch64_sve2" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH64_SVE2,1, [Defined if inline assembler supports AArch64 SVE2 instructions]) fi # # Check whether GCC inline assembler supports AArch64 SHA3/SHA512/SM3/SM4 instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 SHA3/SHA512/SM3/SM4 instructions], [gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4], [if test "$mpi_cpu_arch" != "aarch64" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4="n/a" else gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".arch armv8.2-a+sha3+sm4\n\t" ".text\n\t" "testfn:\n\t" /* Test for SHA512 instructions */ "sha512h q0, q0, v0.2d;\n\t" "sha512h2 q0, q0, v0.2d;\n\t" "sha512su0 v0.2d, v0.2d;\n\t" "sha512su1 v0.2d, v0.2d, v31.2d;\n\t" /* Test for SHA3 instructions */ "bcax v0.16b, v1.16b, v2.16b, 
v3.16b;\n\t" "eor3 v0.16b, v1.16b, v2.16b, v3.16b;\n\t" "rax1 v0.2d, v1.2d, v2.2d;\n\t" "xar v0.2d, v1.2d, v2.2d, \#1;\n\t" /* Test for SM3 instructions */ "sm3partw1 v0.4s, v1.4s, v2.4s;\n\t" "sm3partw2 v0.4s, v1.4s, v2.4s;\n\t" "sm3ss1 v0.4s, v1.4s, v2.4s, v3.4s;\n\t" "sm3tt1a v0.4s, v1.4s, v2.s[0];\n\t" "sm3tt1b v0.4s, v1.4s, v2.s[0];\n\t" "sm3tt2a v0.4s, v1.4s, v2.s[0];\n\t" "sm3tt2b v0.4s, v1.4s, v2.s[0];\n\t" /* Test for SM4 instructions */ "sm4e v0.4s, v1.4s;\n\t" "sm4ekey v0.4s, v1.4s, v2.4s;\n\t" ); void testfn(void); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH64_SHA3_SHA512_SM3_SM4,1, [Defined if inline assembler supports AArch64 SHA3/SHA512/SM3/SM4 instructions]) fi # # Check whether PowerPC AltiVec/VSX intrinsics # AC_CACHE_CHECK([whether compiler supports PowerPC AltiVec/VSX/crypto intrinsics], [gcry_cv_cc_ppc_altivec], [if test "$mpi_cpu_arch" != "ppc" || test "$try_asm_modules" != "yes" ; then gcry_cv_cc_ppc_altivec="n/a" else gcry_cv_cc_ppc_altivec=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[#include typedef vector unsigned char block; typedef vector unsigned int vecu32; static inline __attribute__((always_inline)) vecu32 vec_sld_u32(vecu32 a, vecu32 b, unsigned int idx) { return vec_sld (a, b, (4 * idx) & 15); } block fn(block in) { block t = vec_perm (in, in, vec_vsx_ld (0, (unsigned char*)0)); vecu32 y = vec_vsx_ld (0, (unsigned int*)0); y = vec_sld_u32 (y, y, 3); return vec_cipher_be (t, in) ^ (block)y; } ]])], [gcry_cv_cc_ppc_altivec=yes]) fi]) if test "$gcry_cv_cc_ppc_altivec" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_CC_PPC_ALTIVEC,1, [Defined if underlying compiler supports PowerPC AltiVec/VSX/crypto intrinsics]) fi _gcc_cflags_save=$CFLAGS CFLAGS="$CFLAGS -O2 -maltivec -mvsx -mcrypto" if test "$gcry_cv_cc_ppc_altivec" = "no" && test "$mpi_cpu_arch" = "ppc" && test "$try_asm_modules" == "yes" ; then AC_CACHE_CHECK([whether compiler supports PowerPC AltiVec/VSX/crypto intrinsics with extra GCC flags], [gcry_cv_cc_ppc_altivec_cflags], [gcry_cv_cc_ppc_altivec_cflags=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[#include typedef vector unsigned char block; typedef vector unsigned int vecu32; static inline __attribute__((always_inline)) vecu32 vec_sld_u32(vecu32 a, vecu32 b, unsigned int idx) { return vec_sld (a, b, (4 * idx) & 15); } block fn(block in) { block t = vec_perm (in, in, vec_vsx_ld (0, (unsigned char*)0)); vecu32 y = vec_vsx_ld (0, (unsigned int*)0); y = vec_sld_u32 (y, y, 3); return vec_cipher_be (t, in) ^ (block)y; }]])], [gcry_cv_cc_ppc_altivec_cflags=yes])]) if test "$gcry_cv_cc_ppc_altivec_cflags" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_CC_PPC_ALTIVEC,1, [Defined if underlying compiler supports PowerPC AltiVec/VSX/crypto intrinsics]) AC_DEFINE(HAVE_COMPATIBLE_CC_PPC_ALTIVEC_WITH_CFLAGS,1, [Defined if underlying compiler supports PowerPC AltiVec/VSX/crypto intrinsics with extra GCC flags]) fi fi AM_CONDITIONAL(ENABLE_PPC_VCRYPTO_EXTRA_CFLAGS, test "$gcry_cv_cc_ppc_altivec_cflags" = "yes") # Restore flags. 
CFLAGS=$_gcc_cflags_save; # # Check whether GCC inline assembler supports PowerPC AltiVec/VSX/crypto instructions # AC_CACHE_CHECK([whether GCC inline assembler supports PowerPC AltiVec/VSX/crypto instructions], [gcry_cv_gcc_inline_asm_ppc_altivec], [if test "$mpi_cpu_arch" != "ppc" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_ppc_altivec="n/a" else gcry_cv_gcc_inline_asm_ppc_altivec=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__(".globl testfn;\n" ".text\n\t" "testfn:\n" "stvx %v31,%r12,%r0;\n" "lvx %v20,%r12,%r0;\n" "vcipher %v0, %v1, %v22;\n" "lxvw4x %vs32, %r0, %r1;\n" "vadduwm %v0, %v1, %v22;\n" "vshasigmaw %v0, %v1, 0, 15;\n" "vshasigmad %v0, %v1, 0, 15;\n" "vpmsumd %v11, %v11, %v11;\n" ); void testfn(void); ]], [ testfn(); ] )], [gcry_cv_gcc_inline_asm_ppc_altivec=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_ppc_altivec" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC,1, [Defined if inline assembler supports PowerPC AltiVec/VSX/crypto instructions]) fi # # Check whether GCC inline assembler supports PowerISA 3.00 instructions # AC_CACHE_CHECK([whether GCC inline assembler supports PowerISA 3.00 instructions], [gcry_cv_gcc_inline_asm_ppc_arch_3_00], [if test "$mpi_cpu_arch" != "ppc" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_ppc_arch_3_00="n/a" else gcry_cv_gcc_inline_asm_ppc_arch_3_00=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__(".text\n\t" ".globl testfn;\n" "testfn:\n" "stxvb16x %r1,%v12,%v30;\n" ); void testfn(void); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_ppc_arch_3_00=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_ppc_arch_3_00" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_PPC_ARCH_3_00,1, [Defined if inline assembler supports PowerISA 3.00 instructions]) fi # # Check whether GCC inline assembler supports zSeries instructions # AC_CACHE_CHECK([whether GCC inline assembler supports zSeries instructions], [gcry_cv_gcc_inline_asm_s390x], [if test "$mpi_cpu_arch" != "s390x" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_s390x="n/a" else gcry_cv_gcc_inline_asm_s390x=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[typedef unsigned int u128_t __attribute__ ((mode (TI))); unsigned int testfunc(unsigned int x, void *y, unsigned int z) { unsigned long fac[8]; register unsigned long reg0 asm("0") = 0; register unsigned long reg1 asm("1") = x; u128_t r1 = ((u128_t)(unsigned long)y << 64) | (unsigned long)z; u128_t r2 = 0; u128_t r3 = 0; asm volatile (".insn rre,0xb92e << 16, %[r1], %[r2]\n\t" : [r1] "+a" (r1), [r2] "+a" (r2) : "r" (reg0), "r" (reg1) : "cc", "memory"); asm volatile (".insn rrf,0xb929 << 16, %[r1], %[r2], %[r3], 0\n\t" : [r1] "+a" (r1), [r2] "+a" (r2), [r3] "+a" (r3) : "r" (reg0), "r" (reg1) : "cc", "memory"); reg0 = 8 - 1; asm ("stfle %1\n\t" : "+d" (reg0), "=Q" (fac[0]) : : "cc", "memory"); asm volatile ("mvc 0(16, %0), 0(%1)\n\t" : : "a" (y), "a" (fac) : "memory"); asm volatile ("xc 0(16, %0), 0(%0)\n\t" : : "a" (fac) : "memory"); asm volatile ("risbgn %%r11, %%r11, 0, 129, 0\n\t" : : : "memory", "r11"); asm volatile ("algrk %%r14, %%r14, %%r14\n\t" : : : "memory", "r14"); return (unsigned int)r1 ^ reg0; } ]] , [ testfunc(0, 0, 0); ])], [gcry_cv_gcc_inline_asm_s390x=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_s390x" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_S390X,1, [Defined if inline assembler supports zSeries instructions]) fi # # Check whether GCC inline assembler supports zSeries vector instructions # AC_CACHE_CHECK([whether GCC inline assembler supports zSeries vector instructions], 
[gcry_cv_gcc_inline_asm_s390x_vx], [if test "$mpi_cpu_arch" != "s390x" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_s390x_vx="n/a" else gcry_cv_gcc_inline_asm_s390x_vx=no if test "$gcry_cv_gcc_inline_asm_s390x" = "yes" ; then AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void testfunc(void) { asm volatile (".machine \"z13+vx\"\n\t" "vx %%v0, %%v1, %%v31\n\t" "verllf %%v11, %%v11, (16)(0)\n\t" : : : "memory"); } ]], [ testfunc(); ])], [gcry_cv_gcc_inline_asm_s390x_vx=yes]) fi fi]) if test "$gcry_cv_gcc_inline_asm_s390x_vx" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_S390X_VX,1, [Defined if inline assembler supports zSeries vector instructions]) fi ####################################### #### Checks for library functions. #### ####################################### AC_FUNC_VPRINTF # We have replacements for these in src/missing-string.c AC_CHECK_FUNCS(stpcpy strcasecmp) # We have replacements for these in src/g10lib.h AC_CHECK_FUNCS(strtoul memmove stricmp atexit raise) # Other checks AC_CHECK_FUNCS(strerror rand mmap getpagesize sysconf waitpid wait4) AC_CHECK_FUNCS(gettimeofday getrusage gethrtime clock_gettime syslog) AC_CHECK_FUNCS(syscall fcntl ftruncate flockfile getauxval elf_aux_info) AC_CHECK_FUNCS(explicit_bzero explicit_memset getentropy sysctlbyname) GNUPG_CHECK_MLOCK # # Replacement functions. # AC_REPLACE_FUNCS([getpid clock]) # # Check whether it is necessary to link against libdl. # DL_LIBS="" if test "$use_hmac_binary_check" != no ; then _gcry_save_libs="$LIBS" LIBS="" AC_SEARCH_LIBS(dlopen, c dl,,,) DL_LIBS=$LIBS LIBS="$_gcry_save_libs" fi AC_SUBST(DL_LIBS) # # Check whether we can use Linux capabilities as requested. # if test "$use_capabilities" = "yes" ; then use_capabilities=no AC_CHECK_HEADERS(sys/capability.h) if test "$ac_cv_header_sys_capability_h" = "yes" ; then AC_CHECK_LIB(cap, cap_init, ac_need_libcap=1) if test "$ac_cv_lib_cap_cap_init" = "yes"; then AC_DEFINE(USE_CAPABILITIES,1, [define if capabilities should be used]) LIBS="$LIBS -lcap" use_capabilities=yes fi fi if test "$use_capabilities" = "no" ; then AC_MSG_WARN([[ *** *** The use of capabilities on this system is not possible. *** You need a recent Linux kernel and some patches: *** fcaps-2.2.9-990610.patch (kernel patch for 2.2.9) *** fcap-module-990613.tar.gz (kernel module) *** libcap-1.92.tar.gz (user mode library and utilities) *** And you have to configure the kernel with CONFIG_VFS_CAP_PLUGIN *** set (filesystems menu). Be warned: This code is *really* ALPHA. ***]]) fi fi # Check whether a random device is available. if test "$try_dev_random" = yes ; then AC_CACHE_CHECK(for random device, ac_cv_have_dev_random, [if test -r "$NAME_OF_DEV_RANDOM" && test -r "$NAME_OF_DEV_URANDOM" ; then ac_cv_have_dev_random=yes; else ac_cv_have_dev_random=no; fi]) if test "$ac_cv_have_dev_random" = yes; then AC_DEFINE(HAVE_DEV_RANDOM,1, [defined if the system supports a random device] ) fi else AC_MSG_CHECKING(for random device) ac_cv_have_dev_random=no AC_MSG_RESULT(has been disabled) fi # Figure out the random modules for this configuration. if test "$random" = "default"; then # Select default value. if test "$ac_cv_func_getentropy" = yes; then random_modules="getentropy" elif test "$ac_cv_have_dev_random" = yes; then # Try Linuxish random device. random_modules="linux" else case "${host}" in *-*-mingw32ce*) # WindowsCE random device. random_modules="w32ce" ;; *-*-mingw32*|*-*-cygwin*) # Windows random device. random_modules="w32" ;; *) # Build everything, allow to select at runtime. 
random_modules="$auto_random_modules" ;; esac fi else if test "$random" = "auto"; then # Build everything, allow to select at runtime. random_modules="$auto_random_modules" else random_modules="$random" fi fi # # Other defines # if test mym4_isgit = "yes"; then AC_DEFINE(IS_DEVELOPMENT_VERSION,1, [Defined if this is not a regular release]) fi AM_CONDITIONAL(CROSS_COMPILING, test x$cross_compiling = xyes) # This is handy for debugging so the compiler doesn't rearrange # things and eliminate variables. AC_ARG_ENABLE(optimization, AS_HELP_STRING([--disable-optimization], [disable compiler optimization]), [if test $enableval = no ; then CFLAGS=`echo $CFLAGS | sed 's/-O[[0-9]]//'` fi]) AC_MSG_NOTICE([checking for cc features]) # CFLAGS mangling when using gcc. if test "$GCC" = yes; then AC_MSG_CHECKING([if gcc supports -fno-delete-null-pointer-checks]) _gcc_cflags_save=$CFLAGS CFLAGS="-fno-delete-null-pointer-checks" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])],_gcc_wopt=yes,_gcc_wopt=no) AC_MSG_RESULT($_gcc_wopt) CFLAGS=$_gcc_cflags_save; if test x"$_gcc_wopt" = xyes ; then CFLAGS="$CFLAGS -fno-delete-null-pointer-checks" fi CFLAGS="$CFLAGS -Wall" if test "$USE_MAINTAINER_MODE" = "yes"; then CFLAGS="$CFLAGS -Wcast-align -Wshadow -Wstrict-prototypes" CFLAGS="$CFLAGS -Wformat -Wno-format-y2k -Wformat-security" # If -Wno-missing-field-initializers is supported we can enable a # a bunch of really useful warnings. AC_MSG_CHECKING([if gcc supports -Wno-missing-field-initializers]) _gcc_cflags_save=$CFLAGS CFLAGS="-Wno-missing-field-initializers" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])],_gcc_wopt=yes,_gcc_wopt=no) AC_MSG_RESULT($_gcc_wopt) CFLAGS=$_gcc_cflags_save; if test x"$_gcc_wopt" = xyes ; then CFLAGS="$CFLAGS -W -Wextra -Wbad-function-cast" CFLAGS="$CFLAGS -Wwrite-strings" CFLAGS="$CFLAGS -Wdeclaration-after-statement" CFLAGS="$CFLAGS -Wno-missing-field-initializers" CFLAGS="$CFLAGS -Wno-sign-compare" fi AC_MSG_CHECKING([if gcc supports -Wpointer-arith]) _gcc_cflags_save=$CFLAGS CFLAGS="-Wpointer-arith" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])],_gcc_wopt=yes,_gcc_wopt=no) AC_MSG_RESULT($_gcc_wopt) CFLAGS=$_gcc_cflags_save; if test x"$_gcc_wopt" = xyes ; then CFLAGS="$CFLAGS -Wpointer-arith" fi fi fi # Check whether as(1) supports a noeexecstack feature. This test # includes an override option. CL_AS_NOEXECSTACK AC_SUBST(LIBGCRYPT_CONFIG_API_VERSION) AC_SUBST(LIBGCRYPT_CONFIG_LIBS) AC_SUBST(LIBGCRYPT_CONFIG_CFLAGS) AC_SUBST(LIBGCRYPT_CONFIG_HOST) AC_SUBST(LIBGCRYPT_THREAD_MODULES) AC_CONFIG_COMMANDS([gcrypt-conf],[[ chmod +x src/libgcrypt-config ]],[[ prefix=$prefix exec_prefix=$exec_prefix libdir=$libdir datadir=$datadir DATADIRNAME=$DATADIRNAME ]]) ##################### #### Conclusion. #### ##################### # Check that requested feature can actually be used and define # ENABLE_foo_SUPPORT macros. 
if test x"$aesnisupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_ssse3" != "yes" ; then aesnisupport="no (unsupported by compiler)" fi fi if test x"$shaextsupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_shaext" != "yes" ; then shaextsupport="no (unsupported by compiler)" fi fi if test x"$pclmulsupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_pclmul" != "yes" ; then pclmulsupport="no (unsupported by compiler)" fi fi if test x"$sse41support" = xyes ; then if test "$gcry_cv_gcc_inline_asm_sse41" != "yes" ; then sse41support="no (unsupported by compiler)" fi fi if test x"$avxsupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_avx" != "yes" ; then avxsupport="no (unsupported by compiler)" fi fi if test x"$avx2support" = xyes ; then if test "$gcry_cv_gcc_inline_asm_avx2" != "yes" ; then avx2support="no (unsupported by compiler)" fi fi if test x"$avx512support" = xyes ; then if test "$gcry_cv_gcc_inline_asm_avx512" != "yes" ; then avx512support="no (unsupported by compiler)" fi fi if test x"$gfnisupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_gfni" != "yes" ; then gfnisupport="no (unsupported by compiler)" fi fi if test x"$neonsupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_neon" != "yes" ; then if test "$gcry_cv_gcc_inline_asm_aarch64_neon" != "yes" ; then neonsupport="no (unsupported by compiler)" fi fi fi if test x"$armcryptosupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_aarch32_crypto" != "yes" ; then if test "$gcry_cv_gcc_inline_asm_aarch64_crypto" != "yes" ; then armcryptosupport="no (unsupported by compiler)" fi fi fi if test x"$svesupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_sve" != "yes" ; then if test "$gcry_cv_gcc_inline_asm_aarch64_sve" != "yes" ; then svesupport="no (unsupported by compiler)" fi fi fi if test x"$sve2support" = xyes ; then if test "$gcry_cv_gcc_inline_asm_sve2" != "yes" ; then if test "$gcry_cv_gcc_inline_asm_aarch64_sve2" != "yes" ; then sve2support="no (unsupported by compiler)" fi fi fi if test x"$aesnisupport" = xyes ; then AC_DEFINE(ENABLE_AESNI_SUPPORT, 1, [Enable support for Intel AES-NI instructions.]) fi if test x"$shaextsupport" = xyes ; then AC_DEFINE(ENABLE_SHAEXT_SUPPORT, 1, [Enable support for Intel SHAEXT instructions.]) fi if test x"$pclmulsupport" = xyes ; then AC_DEFINE(ENABLE_PCLMUL_SUPPORT, 1, [Enable support for Intel PCLMUL instructions.]) fi if test x"$sse41support" = xyes ; then AC_DEFINE(ENABLE_SSE41_SUPPORT, 1, [Enable support for Intel SSE4.1 instructions.]) fi if test x"$avxsupport" = xyes ; then AC_DEFINE(ENABLE_AVX_SUPPORT,1, [Enable support for Intel AVX instructions.]) fi if test x"$avx2support" = xyes ; then AC_DEFINE(ENABLE_AVX2_SUPPORT,1, [Enable support for Intel AVX2 instructions.]) fi if test x"$avx512support" = xyes ; then AC_DEFINE(ENABLE_AVX512_SUPPORT,1, [Enable support for Intel AVX512 instructions.]) fi if test x"$gfnisupport" = xyes ; then AC_DEFINE(ENABLE_GFNI_SUPPORT,1, [Enable support for Intel GFNI instructions.]) fi if test x"$neonsupport" = xyes ; then AC_DEFINE(ENABLE_NEON_SUPPORT,1, [Enable support for ARM NEON instructions.]) fi if test x"$armcryptosupport" = xyes ; then AC_DEFINE(ENABLE_ARM_CRYPTO_SUPPORT,1, [Enable support for ARMv8 Crypto Extension instructions.]) fi if test x"$svesupport" = xyes ; then AC_DEFINE(ENABLE_SVE_SUPPORT,1, [Enable support for ARMv8 SVE instructions.]) fi if test x"$sve2support" = xyes ; then AC_DEFINE(ENABLE_SVE2_SUPPORT,1, [Enable support for ARMv9 SVE2 instructions.]) fi if test x"$ppccryptosupport" = xyes ; then 
AC_DEFINE(ENABLE_PPC_CRYPTO_SUPPORT,1, [Enable support for POWER 8 (PowerISA 2.07) crypto extension.]) fi if test x"$jentsupport" = xyes ; then AC_DEFINE(ENABLE_JENT_SUPPORT, 1, [Enable support for the jitter entropy collector.]) fi if test x"$padlocksupport" = xyes ; then AC_DEFINE(ENABLE_PADLOCK_SUPPORT, 1, [Enable support for the PadLock engine.]) fi if test x"$drngsupport" = xyes ; then AC_DEFINE(ENABLE_DRNG_SUPPORT, 1, [Enable support for Intel DRNG (RDRAND instruction).]) fi if test x"$force_soft_hwfeatures" = xyes ; then AC_DEFINE(ENABLE_FORCE_SOFT_HWFEATURES, 1, [Enable forcing 'soft' HW feature bits on (for testing).]) fi # Define conditional sources and config.h symbols depending on the # selected ciphers, pubkey-ciphers, digests, kdfs, and random modules. LIST_MEMBER(arcfour, $enabled_ciphers) if test "$found" = "1"; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS arcfour.lo" AC_DEFINE(USE_ARCFOUR, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS arcfour-amd64.lo" ;; esac fi LIST_MEMBER(blowfish, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS blowfish.lo" AC_DEFINE(USE_BLOWFISH, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS blowfish-amd64.lo" ;; arm*-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS blowfish-arm.lo" ;; esac fi LIST_MEMBER(cast5, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS cast5.lo" AC_DEFINE(USE_CAST5, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS cast5-amd64.lo" ;; arm*-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS cast5-arm.lo" ;; esac fi LIST_MEMBER(des, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS des.lo" AC_DEFINE(USE_DES, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS des-amd64.lo" ;; esac fi LIST_MEMBER(aes, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael.lo" AC_DEFINE(USE_AES, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-amd64.lo" # Build with the SSSE3 implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-ssse3-amd64.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-ssse3-amd64-asm.lo" # Build with the VAES/AVX2 implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-vaes.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-vaes-avx2-amd64.lo" ;; arm*-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-arm.lo" # Build with the ARMv8/AArch32 CE implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-armv8-ce.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-armv8-aarch32-ce.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-aarch64.lo" # Build with the ARMv8/AArch64 CE implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-armv8-ce.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-armv8-aarch64-ce.lo" ;; powerpc64le-*-*) # Build with the crypto 
extension implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-ppc.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-ppc9le.lo" if test "$gcry_cv_gcc_inline_asm_ppc_altivec" = "yes" && test "$gcry_cv_gcc_inline_asm_ppc_arch_3_00" = "yes" ; then # Build with AES-GCM bulk implementation for P10 GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-gcm-p10le.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-p10le.lo" fi ;; powerpc64-*-*) # Big-Endian. # Build with the crypto extension implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-ppc.lo" ;; powerpc-*-*) # Big-Endian. # Build with the crypto extension implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-ppc.lo" ;; s390x-*-*) # Big-Endian. # Build with the crypto extension implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-s390x.lo" ;; esac case "$mpi_cpu_arch" in x86) # Build with the AES-NI implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-aesni.lo" # Build with the Padlock implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-padlock.lo" ;; esac fi LIST_MEMBER(twofish, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish.lo" AC_DEFINE(USE_TWOFISH, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS twofish-amd64.lo" if test x"$avx2support" = xyes ; then # Build with the AVX2 implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS twofish-avx2-amd64.lo" fi ;; arm*-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS twofish-arm.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS twofish-aarch64.lo" ;; esac fi LIST_MEMBER(serpent, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS serpent.lo" AC_DEFINE(USE_SERPENT, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the SSE2 implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS serpent-sse2-amd64.lo" ;; esac if test x"$avx2support" = xyes ; then # Build with the AVX2 implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS serpent-avx2-amd64.lo" fi if test x"$neonsupport" = xyes ; then # Build with the NEON implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS serpent-armv7-neon.lo" fi fi LIST_MEMBER(rfc2268, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS rfc2268.lo" AC_DEFINE(USE_RFC2268, 1, [Defined if this module should be included]) fi LIST_MEMBER(seed, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS seed.lo" AC_DEFINE(USE_SEED, 1, [Defined if this module should be included]) fi LIST_MEMBER(camellia, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia.lo camellia-glue.lo" AC_DEFINE(USE_CAMELLIA, 1, [Defined if this module should be included]) case "${host}" in arm*-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-arm.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-aarch64.lo" ;; esac if test x"$avxsupport" = xyes ; then if test x"$aesnisupport" = xyes ; then # Build with the AES-NI/AVX implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-aesni-avx-amd64.lo" fi fi if test x"$avx2support" = xyes ; then if test x"$aesnisupport" = xyes ; then # Build with the AES-NI/AVX2 implementation 
GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-aesni-avx2-amd64.lo" # Build with the VAES/AVX2 implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-vaes-avx2-amd64.lo" # Build with the GFNI/AVX2 implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-gfni-avx2-amd64.lo" # Build with the GFNI/AVX512 implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-gfni-avx512-amd64.lo" fi fi fi LIST_MEMBER(idea, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS idea.lo" AC_DEFINE(USE_IDEA, 1, [Defined if this module should be included]) fi LIST_MEMBER(salsa20, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS salsa20.lo" AC_DEFINE(USE_SALSA20, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS salsa20-amd64.lo" ;; esac if test x"$neonsupport" = xyes ; then # Build with the NEON implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS salsa20-armv7-neon.lo" fi fi LIST_MEMBER(gost28147, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS gost28147.lo" AC_DEFINE(USE_GOST28147, 1, [Defined if this module should be included]) fi LIST_MEMBER(chacha20, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20.lo" AC_DEFINE(USE_CHACHA20, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-amd64-ssse3.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-amd64-avx2.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-amd64-avx512.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-aarch64.lo" ;; powerpc64le-*-*) # Build with the ppc8 vector implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-ppc.lo" # Build with the assembly implementation if test "$gcry_cv_gcc_inline_asm_ppc_altivec" = "yes" && test "$gcry_cv_gcc_inline_asm_ppc_arch_3_00" = "yes" ; then GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-p10le-8x.lo" fi ;; powerpc64-*-*) # Build with the ppc8 vector implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-ppc.lo" ;; powerpc-*-*) # Build with the ppc8 vector implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-ppc.lo" ;; s390x-*-*) # Build with the s390x/zSeries vector implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-s390x.lo" ;; esac if test x"$neonsupport" = xyes ; then # Build with the NEON implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-armv7-neon.lo" fi fi LIST_MEMBER(sm4, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS sm4.lo" AC_DEFINE(USE_SM4, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-aesni-avx-amd64.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-aesni-avx2-amd64.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-gfni-avx2-amd64.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-gfni-avx512-amd64.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-aarch64.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-armv8-aarch64-ce.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-armv9-aarch64-sve-ce.lo" esac fi LIST_MEMBER(dsa, $enabled_pubkey_ciphers) AM_CONDITIONAL(USE_DSA, [test "$found" = "1"]) if test "$found" = "1" ; 
then GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS dsa.lo" AC_DEFINE(USE_DSA, 1, [Defined if this module should be included]) fi LIST_MEMBER(rsa, $enabled_pubkey_ciphers) AM_CONDITIONAL(USE_RSA, [test "$found" = "1"]) if test "$found" = "1" ; then GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS rsa.lo" AC_DEFINE(USE_RSA, 1, [Defined if this module should be included]) fi LIST_MEMBER(elgamal, $enabled_pubkey_ciphers) AM_CONDITIONAL(USE_ELGAMAL, [test "$found" = "1"]) if test "$found" = "1" ; then GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS elgamal.lo" AC_DEFINE(USE_ELGAMAL, 1, [Defined if this module should be included]) fi LIST_MEMBER(ecc, $enabled_pubkey_ciphers) AM_CONDITIONAL(USE_ECC, [test "$found" = "1"]) if test "$found" = "1" ; then GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS \ ecc.lo ecc-curves.lo ecc-misc.lo \ ecc-ecdh.lo ecc-ecdsa.lo ecc-eddsa.lo ecc-gost.lo \ ecc-sm2.lo" AC_DEFINE(USE_ECC, 1, [Defined if this module should be included]) fi LIST_MEMBER(crc, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc.lo" AC_DEFINE(USE_CRC, 1, [Defined if this module should be included]) case "${host}" in i?86-*-* | x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS crc-intel-pclmul.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS crc-armv8-ce.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS crc-armv8-aarch64-ce.lo" ;; powerpc64le-*-*) GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS crc-ppc.lo" ;; powerpc64-*-*) GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS crc-ppc.lo" ;; powerpc-*-*) GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS crc-ppc.lo" ;; esac fi LIST_MEMBER(gostr3411-94, $enabled_digests) if test "$found" = "1" ; then # GOST R 34.11-94 internally uses GOST 28147-89 LIST_MEMBER(gost28147, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS gostr3411-94.lo" AC_DEFINE(USE_GOST_R_3411_94, 1, [Defined if this module should be included]) fi fi LIST_MEMBER(stribog, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS stribog.lo" AC_DEFINE(USE_GOST_R_3411_12, 1, [Defined if this module should be included]) fi LIST_MEMBER(md2, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS md2.lo" AC_DEFINE(USE_MD2, 1, [Defined if this module should be included]) fi LIST_MEMBER(md4, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS md4.lo" AC_DEFINE(USE_MD4, 1, [Defined if this module should be included]) fi LIST_MEMBER(md5, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS md5.lo" AC_DEFINE(USE_MD5, 1, [Defined if this module should be included]) fi LIST_MEMBER(rmd160, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS rmd160.lo" AC_DEFINE(USE_RMD160, 1, [Defined if this module should be included]) fi LIST_MEMBER(sha256, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256.lo" AC_DEFINE(USE_SHA256, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-ssse3-amd64.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-avx-amd64.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-avx2-bmi2-amd64.lo" ;; arm*-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-armv8-aarch32-ce.lo" ;; aarch64-*-*) # Build with the assembly implementation 
GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-armv8-aarch64-ce.lo" ;; powerpc64le-*-*) # Build with the crypto extension implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-ppc.lo" ;; powerpc64-*-*) # Big-Endian. # Build with the crypto extension implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-ppc.lo" ;; powerpc-*-*) # Big-Endian. # Build with the crypto extension implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-ppc.lo" esac case "$mpi_cpu_arch" in x86) # Build with the SHAEXT implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-intel-shaext.lo" ;; esac fi LIST_MEMBER(sha512, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512.lo" AC_DEFINE(USE_SHA512, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-ssse3-amd64.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-avx-amd64.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-avx2-bmi2-amd64.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-avx512-amd64.lo" ;; i?86-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-ssse3-i386.lo" ;; arm*-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-arm.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-armv8-aarch64-ce.lo" ;; powerpc64le-*-*) # Build with the crypto extension implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-ppc.lo" ;; powerpc64-*-*) # Big-Endian. # Build with the crypto extension implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-ppc.lo" ;; powerpc-*-*) # Big-Endian. # Build with the crypto extension implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-ppc.lo" esac if test x"$neonsupport" = xyes ; then # Build with the NEON implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-armv7-neon.lo" fi fi LIST_MEMBER(sha3, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS keccak.lo" AC_DEFINE(USE_SHA3, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS keccak-amd64-avx512.lo" ;; esac if test x"$neonsupport" = xyes ; then # Build with the NEON implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS keccak-armv7-neon.lo" fi fi LIST_MEMBER(tiger, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS tiger.lo" AC_DEFINE(USE_TIGER, 1, [Defined if this module should be included]) fi LIST_MEMBER(whirlpool, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS whirlpool.lo" AC_DEFINE(USE_WHIRLPOOL, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS whirlpool-sse2-amd64.lo" ;; esac fi LIST_MEMBER(blake2, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS blake2.lo" AC_DEFINE(USE_BLAKE2, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS blake2b-amd64-avx2.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS blake2b-amd64-avx512.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS blake2s-amd64-avx.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS blake2s-amd64-avx512.lo" ;; esac fi LIST_MEMBER(sm3, $enabled_digests) if test 
"$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS sm3.lo" AC_DEFINE(USE_SM3, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sm3-avx-bmi2-amd64.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sm3-aarch64.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sm3-armv8-aarch64-ce.lo" ;; esac fi # SHA-1 needs to be included always for example because it is used by # random-csprng.c. GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1.lo" AC_DEFINE(USE_SHA1, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-ssse3-amd64.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-avx-amd64.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-avx-bmi2-amd64.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-avx2-bmi2-amd64.lo" ;; arm*-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-armv7-neon.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-armv8-aarch32-ce.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-armv8-aarch64-ce.lo" ;; esac case "$mpi_cpu_arch" in x86) # Build with the SHAEXT implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha1-intel-shaext.lo" ;; esac # Arch specific GCM implementations case "${host}" in i?86-*-* | x86_64-*-*) GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS cipher-gcm-intel-pclmul.lo" ;; arm*-*-*) GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS cipher-gcm-armv7-neon.lo" GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS cipher-gcm-armv8-aarch32-ce.lo" ;; aarch64-*-*) GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS cipher-gcm-armv8-aarch64-ce.lo" ;; powerpc64le-*-* | powerpc64-*-* | powerpc-*-*) GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS cipher-gcm-ppc.lo" ;; esac # Arch specific MAC implementations case "${host}" in s390x-*-*) GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS poly1305-s390x.lo" ;; x86_64-*-*) GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS poly1305-amd64-avx512.lo" ;; powerpc64le-*-*) # Build with the assembly implementation if test "$gcry_cv_gcc_inline_asm_ppc_altivec" = "yes" && test "$gcry_cv_gcc_inline_asm_ppc_arch_3_00" = "yes" ; then GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS poly1305-p10le.lo" fi ;; esac LIST_MEMBER(scrypt, $enabled_kdfs) if test "$found" = "1" ; then GCRYPT_KDFS="$GCRYPT_KDFS scrypt.lo" AC_DEFINE(USE_SCRYPT, 1, [Defined if this module should be included]) fi LIST_MEMBER(getentropy, $random_modules) if test "$found" = "1" ; then GCRYPT_RANDOM="$GCRYPT_RANDOM rndgetentropy.lo" AC_DEFINE(USE_RNDGETENTROPY, 1, [Defined if the getentropy RNG should be used.]) fi LIST_MEMBER(linux, $random_modules) if test "$found" = "1" ; then GCRYPT_RANDOM="$GCRYPT_RANDOM rndoldlinux.lo" AC_DEFINE(USE_RNDOLDLINUX, 1, [Defined if the /dev/random RNG should be used.]) fi LIST_MEMBER(unix, $random_modules) if test "$found" = "1" ; then GCRYPT_RANDOM="$GCRYPT_RANDOM rndunix.lo" AC_DEFINE(USE_RNDUNIX, 1, [Defined if the default Unix RNG should be used.]) fi LIST_MEMBER(egd, $random_modules) if test "$found" = "1" ; then GCRYPT_RANDOM="$GCRYPT_RANDOM rndegd.lo" AC_DEFINE(USE_RNDEGD, 1, [Defined if the EGD based RNG should be used.]) fi LIST_MEMBER(w32, $random_modules) if test "$found" = "1" ; then GCRYPT_RANDOM="$GCRYPT_RANDOM rndw32.lo" AC_DEFINE(USE_RNDW32, 1, [Defined if the Windows specific RNG should be used.]) fi LIST_MEMBER(w32ce, $random_modules) if test "$found" 
= "1" ; then GCRYPT_RANDOM="$GCRYPT_RANDOM rndw32ce.lo" AC_DEFINE(USE_RNDW32CE, 1, [Defined if the WindowsCE specific RNG should be used.]) fi if test "$try_asm_modules" = yes ; then # Build with assembly implementations GCRYPT_CIPHERS="$GCRYPT_CIPHERS $GCRYPT_ASM_CIPHERS" GCRYPT_DIGESTS="$GCRYPT_DIGESTS $GCRYPT_ASM_DIGESTS" fi AC_SUBST([GCRYPT_CIPHERS]) AC_SUBST([GCRYPT_PUBKEY_CIPHERS]) AC_SUBST([GCRYPT_DIGESTS]) AC_SUBST([GCRYPT_KDFS]) AC_SUBST([GCRYPT_RANDOM]) AC_SUBST(LIBGCRYPT_CIPHERS, $enabled_ciphers) AC_SUBST(LIBGCRYPT_PUBKEY_CIPHERS, $enabled_pubkey_ciphers) AC_SUBST(LIBGCRYPT_DIGESTS, $enabled_digests) # For printing the configuration we need a colon separated list of # algorithm names. tmp=`echo "$enabled_ciphers" | tr ' ' : ` AC_DEFINE_UNQUOTED(LIBGCRYPT_CIPHERS, "$tmp", [List of available cipher algorithms]) tmp=`echo "$enabled_pubkey_ciphers" | tr ' ' : ` AC_DEFINE_UNQUOTED(LIBGCRYPT_PUBKEY_CIPHERS, "$tmp", [List of available public key cipher algorithms]) tmp=`echo "$enabled_digests" | tr ' ' : ` AC_DEFINE_UNQUOTED(LIBGCRYPT_DIGESTS, "$tmp", [List of available digest algorithms]) tmp=`echo "$enabled_kdfs" | tr ' ' : ` AC_DEFINE_UNQUOTED(LIBGCRYPT_KDFS, "$tmp", [List of available KDF algorithms]) # # Define conditional sources depending on the used hardware platform. # Note that all possible modules must also be listed in # src/Makefile.am (EXTRA_libgcrypt_la_SOURCES). # GCRYPT_HWF_MODULES= case "$mpi_cpu_arch" in x86) AC_DEFINE(HAVE_CPU_ARCH_X86, 1, [Defined for the x86 platforms]) GCRYPT_HWF_MODULES="libgcrypt_la-hwf-x86.lo" ;; alpha) AC_DEFINE(HAVE_CPU_ARCH_ALPHA, 1, [Defined for Alpha platforms]) ;; sparc) AC_DEFINE(HAVE_CPU_ARCH_SPARC, 1, [Defined for SPARC platforms]) ;; mips) AC_DEFINE(HAVE_CPU_ARCH_MIPS, 1, [Defined for MIPS platforms]) ;; m68k) AC_DEFINE(HAVE_CPU_ARCH_M68K, 1, [Defined for M68k platforms]) ;; ppc) AC_DEFINE(HAVE_CPU_ARCH_PPC, 1, [Defined for PPC platforms]) GCRYPT_HWF_MODULES="libgcrypt_la-hwf-ppc.lo" ;; arm) AC_DEFINE(HAVE_CPU_ARCH_ARM, 1, [Defined for ARM platforms]) GCRYPT_HWF_MODULES="libgcrypt_la-hwf-arm.lo" ;; aarch64) AC_DEFINE(HAVE_CPU_ARCH_ARM, 1, [Defined for ARM AArch64 platforms]) GCRYPT_HWF_MODULES="libgcrypt_la-hwf-arm.lo" ;; s390x) AC_DEFINE(HAVE_CPU_ARCH_S390X, 1, [Defined for s390x/zSeries platforms]) GCRYPT_HWF_MODULES="libgcrypt_la-hwf-s390x.lo" ;; esac AC_SUBST([GCRYPT_HWF_MODULES]) # # Option to disable building of doc file # build_doc=yes AC_ARG_ENABLE([doc], AS_HELP_STRING([--disable-doc], [do not build the documentation]), build_doc=$enableval, build_doc=yes) AM_CONDITIONAL([BUILD_DOC], [test "x$build_doc" != xno]) # # Provide information about the build. # BUILD_REVISION="mym4_revision" AC_SUBST(BUILD_REVISION) AC_DEFINE_UNQUOTED(BUILD_REVISION, "$BUILD_REVISION", [GIT commit id revision used to build this package]) changequote(,)dnl BUILD_VERSION=`echo "$PACKAGE_VERSION" | sed 's/\([0-9.]*\).*/\1./'` changequote([,])dnl BUILD_VERSION="${BUILD_VERSION}mym4_revision_dec" BUILD_FILEVERSION=`echo "${BUILD_VERSION}" | tr . ,` AC_SUBST(BUILD_VERSION) AC_SUBST(BUILD_FILEVERSION) AC_ARG_ENABLE([build-timestamp], AS_HELP_STRING([--enable-build-timestamp], [set an explicit build timestamp for reproducibility. 
(default is the current time in ISO-8601 format)]), [if test "$enableval" = "yes"; then BUILD_TIMESTAMP=`date -u +%Y-%m-%dT%H:%M+0000 2>/dev/null || date` else BUILD_TIMESTAMP="$enableval" fi], [BUILD_TIMESTAMP=""]) AC_SUBST(BUILD_TIMESTAMP) AC_DEFINE_UNQUOTED(BUILD_TIMESTAMP, "$BUILD_TIMESTAMP", [The time this package was configured for a build]) # And create the files. AC_CONFIG_FILES([ Makefile m4/Makefile compat/Makefile mpi/Makefile cipher/Makefile random/Makefile doc/Makefile src/Makefile src/gcrypt.h src/libgcrypt-config src/libgcrypt.pc src/versioninfo.rc tests/Makefile ]) AC_CONFIG_FILES([tests/hashtest-6g], [chmod +x tests/hashtest-6g]) AC_CONFIG_FILES([tests/hashtest-256g], [chmod +x tests/hashtest-256g]) AC_CONFIG_FILES([tests/basic-disable-all-hwf], [chmod +x tests/basic-disable-all-hwf]) AC_OUTPUT detection_module="${GCRYPT_HWF_MODULES%.lo}" test -n "$detection_module" || detection_module="none" # Give some feedback GCRY_MSG_SHOW([],[]) GCRY_MSG_SHOW([Libgcrypt],[v${VERSION} has been configured as follows:]) GCRY_MSG_SHOW([],[]) GCRY_MSG_SHOW([Platform: ],[$PRINTABLE_OS_NAME ($host)]) GCRY_MSG_SHOW([Hardware detection module:],[$detection_module]) GCRY_MSG_WRAP([Enabled cipher algorithms:],[$enabled_ciphers]) GCRY_MSG_WRAP([Enabled digest algorithms:],[$enabled_digests]) GCRY_MSG_WRAP([Enabled kdf algorithms: ],[$enabled_kdfs]) GCRY_MSG_WRAP([Enabled pubkey algorithms:],[$enabled_pubkey_ciphers]) GCRY_MSG_SHOW([Random number generator: ],[$random]) GCRY_MSG_SHOW([Try using jitter entropy: ],[$jentsupport]) GCRY_MSG_SHOW([Using linux capabilities: ],[$use_capabilities]) GCRY_MSG_SHOW([FIPS module version: ],[$fips_module_version]) GCRY_MSG_SHOW([Try using Padlock crypto: ],[$padlocksupport]) GCRY_MSG_SHOW([Try using AES-NI crypto: ],[$aesnisupport]) GCRY_MSG_SHOW([Try using Intel SHAEXT: ],[$shaextsupport]) GCRY_MSG_SHOW([Try using Intel PCLMUL: ],[$pclmulsupport]) GCRY_MSG_SHOW([Try using Intel SSE4.1: ],[$sse41support]) GCRY_MSG_SHOW([Try using DRNG (RDRAND): ],[$drngsupport]) GCRY_MSG_SHOW([Try using Intel AVX: ],[$avxsupport]) GCRY_MSG_SHOW([Try using Intel AVX2: ],[$avx2support]) GCRY_MSG_SHOW([Try using Intel AVX512: ],[$avx512support]) GCRY_MSG_SHOW([Try using Intel GFNI: ],[$gfnisupport]) GCRY_MSG_SHOW([Try using ARM NEON: ],[$neonsupport]) GCRY_MSG_SHOW([Try using ARMv8 crypto: ],[$armcryptosupport]) GCRY_MSG_SHOW([Try using ARMv8 SVE: ],[$svesupport]) GCRY_MSG_SHOW([Try using ARMv9 SVE2: ],[$sve2support]) GCRY_MSG_SHOW([Try using PPC crypto: ],[$ppccryptosupport]) GCRY_MSG_SHOW([],[]) if test "x${gpg_config_script_warn}" != x; then cat < 0 and store * sum in a third limb vector. * * Copyright (C) 2013 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . * * Note: This code is heavily based on the GNU MP Library (version 4.2.1). 
*/ #include "sysdep.h" #include "asm-syntax.h" .syntax unified .arm /******************* * mpi_limb_t - * _gcry_mpih_add_n( mpi_ptr_t res_ptr, %r0 - * mpi_ptr_t s1_ptr, %r1 - * mpi_ptr_t s2_ptr, %r2 - * mpi_size_t size) %r3 + * _gcry_mpih_add_n( mpi_ptr_t res_ptr, r0 + * mpi_ptr_t s1_ptr, r1 + * mpi_ptr_t s2_ptr, r2 + * mpi_size_t size) r3 */ .text .globl _gcry_mpih_add_n .type _gcry_mpih_add_n,%function _gcry_mpih_add_n: - push {%r4, %r5, %r6, %r7, %r8, %r9, %r10, %lr}; - cmn %r0, #0; /* clear carry flag */ + push {r4, r5, r6, r7, r8, r9, r10, lr}; + cmn r0, #0; /* clear carry flag */ - tst %r3, #3; + tst r3, #3; beq .Large_loop; .Loop: - ldr %r4, [%r1], #4; - sub %r3, #1; - ldr %lr, [%r2], #4; - adcs %r4, %lr; - tst %r3, #3; - str %r4, [%r0], #4; + ldr r4, [r1], #4; + sub r3, #1; + ldr lr, [r2], #4; + adcs r4, lr; + tst r3, #3; + str r4, [r0], #4; bne .Loop; - teq %r3, #0; + teq r3, #0; beq .Lend; .Large_loop: - ldm %r1!, {%r4, %r6, %r8, %r10}; - ldm %r2!, {%r5, %r7, %r9, %lr}; - sub %r3, #4; - adcs %r4, %r5; - adcs %r6, %r7; - adcs %r8, %r9; - adcs %r10, %lr; - teq %r3, #0; - stm %r0!, {%r4, %r6, %r8, %r10}; + ldm r1!, {r4, r6, r8, r10}; + ldm r2!, {r5, r7, r9, lr}; + sub r3, #4; + adcs r4, r5; + adcs r6, r7; + adcs r8, r9; + adcs r10, lr; + teq r3, #0; + stm r0!, {r4, r6, r8, r10}; bne .Large_loop; .Lend: - adc %r0, %r3, #0; - pop {%r4, %r5, %r6, %r7, %r8, %r9, %r10, %pc}; + adc r0, r3, #0; + pop {r4, r5, r6, r7, r8, r9, r10, pc}; .size _gcry_mpih_add_n,.-_gcry_mpih_add_n; diff --git a/mpi/arm/mpih-mul1.S b/mpi/arm/mpih-mul1.S index c2e2854b..ea196e8b 100644 --- a/mpi/arm/mpih-mul1.S +++ b/mpi/arm/mpih-mul1.S @@ -1,80 +1,80 @@ /* ARM mul_1 -- Multiply a limb vector with a limb and store the result in * a second limb vector. * * Copyright (C) 2013 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . * * Note: This code is heavily based on the GNU MP Library (version 4.2.1). 
*/ #include "sysdep.h" #include "asm-syntax.h" .syntax unified .arm /******************* * mpi_limb_t - * _gcry_mpih_mul_1( mpi_ptr_t res_ptr, %r0 - * mpi_ptr_t s1_ptr, %r1 - * mpi_size_t s1_size, %r2 - * mpi_limb_t s2_limb) %r3 + * _gcry_mpih_mul_1( mpi_ptr_t res_ptr, r0 + * mpi_ptr_t s1_ptr, r1 + * mpi_size_t s1_size, r2 + * mpi_limb_t s2_limb) r3 */ .text .globl _gcry_mpih_mul_1 .type _gcry_mpih_mul_1,%function _gcry_mpih_mul_1: - push {%r4, %r5, %r6, %r7, %r8, %r9, %r10, %r11, %lr}; - mov %r4, #0; + push {r4, r5, r6, r7, r8, r9, r10, r11, lr}; + mov r4, #0; - tst %r2, #3; + tst r2, #3; beq .Large_loop; .Loop: - ldr %r5, [%r1], #4; - mov %lr, #0; - umlal %r4, %lr, %r5, %r3; - sub %r2, #1; - str %r4, [%r0], #4; - tst %r2, #3; - mov %r4, %lr; + ldr r5, [r1], #4; + mov lr, #0; + umlal r4, lr, r5, r3; + sub r2, #1; + str r4, [r0], #4; + tst r2, #3; + mov r4, lr; bne .Loop; - teq %r2, #0; + teq r2, #0; beq .Lend; .Large_loop: - ldm %r1!, {%r5, %r6, %r7, %r8}; - mov %r9, #0; - mov %r10, #0; - umlal %r4, %r9, %r5, %r3; - mov %r11, #0; - umlal %r9, %r10, %r6, %r3; - str %r4, [%r0], #4; - mov %r4, #0; - umlal %r10, %r11, %r7, %r3; - subs %r2, #4; - umlal %r11, %r4, %r8, %r3; - stm %r0!, {%r9, %r10, %r11}; + ldm r1!, {r5, r6, r7, r8}; + mov r9, #0; + mov r10, #0; + umlal r4, r9, r5, r3; + mov r11, #0; + umlal r9, r10, r6, r3; + str r4, [r0], #4; + mov r4, #0; + umlal r10, r11, r7, r3; + subs r2, #4; + umlal r11, r4, r8, r3; + stm r0!, {r9, r10, r11}; bne .Large_loop; .Lend: - mov %r0, %r4; - pop {%r4, %r5, %r6, %r7, %r8, %r9, %r10, %r11, %pc}; + mov r0, r4; + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}; .size _gcry_mpih_mul_1,.-_gcry_mpih_mul_1; diff --git a/mpi/arm/mpih-mul2.S b/mpi/arm/mpih-mul2.S index bce932e9..8793b20f 100644 --- a/mpi/arm/mpih-mul2.S +++ b/mpi/arm/mpih-mul2.S @@ -1,94 +1,94 @@ /* ARM mul_2 -- Multiply a limb vector with a limb and add the result to * a second limb vector. * * Copyright (C) 2013 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . * * Note: This code is heavily based on the GNU MP Library (version 4.2.1). 
*/ #include "sysdep.h" #include "asm-syntax.h" .syntax unified .arm /******************* * mpi_limb_t - * _gcry_mpih_addmul_1( mpi_ptr_t res_ptr, %r0 - * mpi_ptr_t s1_ptr, %r1 - * mpi_size_t s1_size, %r2 - * mpi_limb_t s2_limb) %r3 + * _gcry_mpih_addmul_1( mpi_ptr_t res_ptr, r0 + * mpi_ptr_t s1_ptr, r1 + * mpi_size_t s1_size, r2 + * mpi_limb_t s2_limb) r3 */ .text .globl _gcry_mpih_addmul_1 .type _gcry_mpih_addmul_1,%function _gcry_mpih_addmul_1: - push {%r4, %r5, %r6, %r8, %r10, %lr}; - mov %lr, #0; - cmn %r0, #0; /* clear carry flag */ + push {r4, r5, r6, r8, r10, lr}; + mov lr, #0; + cmn r0, #0; /* clear carry flag */ - tst %r2, #3; + tst r2, #3; beq .Large_loop; .Loop: - ldr %r5, [%r1], #4; - ldr %r4, [%r0]; - sub %r2, #1; - adcs %r4, %lr; - mov %lr, #0; - umlal %r4, %lr, %r5, %r3; - tst %r2, #3; - str %r4, [%r0], #4; + ldr r5, [r1], #4; + ldr r4, [r0]; + sub r2, #1; + adcs r4, lr; + mov lr, #0; + umlal r4, lr, r5, r3; + tst r2, #3; + str r4, [r0], #4; bne .Loop; - teq %r2, #0; + teq r2, #0; beq .Lend; .Large_loop: - ldr %r5, [%r1], #4; - ldm %r0, {%r4, %r6, %r8, %r10}; + ldr r5, [r1], #4; + ldm r0, {r4, r6, r8, r10}; - sub %r2, #4; - adcs %r4, %lr; - mov %lr, #0; - umlal %r4, %lr, %r5, %r3; + sub r2, #4; + adcs r4, lr; + mov lr, #0; + umlal r4, lr, r5, r3; - ldr %r5, [%r1], #4; - adcs %r6, %lr; - mov %lr, #0; - umlal %r6, %lr, %r5, %r3; + ldr r5, [r1], #4; + adcs r6, lr; + mov lr, #0; + umlal r6, lr, r5, r3; - ldr %r5, [%r1], #4; - adcs %r8, %lr; - mov %lr, #0; - umlal %r8, %lr, %r5, %r3; + ldr r5, [r1], #4; + adcs r8, lr; + mov lr, #0; + umlal r8, lr, r5, r3; - ldr %r5, [%r1], #4; - adcs %r10, %lr; - mov %lr, #0; - umlal %r10, %lr, %r5, %r3; + ldr r5, [r1], #4; + adcs r10, lr; + mov lr, #0; + umlal r10, lr, r5, r3; - teq %r2, #0; - stm %r0!, {%r4, %r6, %r8, %r10}; + teq r2, #0; + stm r0!, {r4, r6, r8, r10}; bne .Large_loop; .Lend: - adc %r0, %lr, #0; - pop {%r4, %r5, %r6, %r8, %r10, %pc}; + adc r0, lr, #0; + pop {r4, r5, r6, r8, r10, pc}; .size _gcry_mpih_addmul_1,.-_gcry_mpih_addmul_1; diff --git a/mpi/arm/mpih-mul3.S b/mpi/arm/mpih-mul3.S index 33326c78..2477c089 100644 --- a/mpi/arm/mpih-mul3.S +++ b/mpi/arm/mpih-mul3.S @@ -1,100 +1,100 @@ /* ARM mul_3 -- Multiply a limb vector with a limb and subtract the result * from a second limb vector. * * Copyright (C) 2013 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . * * Note: This code is heavily based on the GNU MP Library (version 4.2.1). 
*/ #include "sysdep.h" #include "asm-syntax.h" .syntax unified .arm /******************* * mpi_limb_t - * _gcry_mpih_submul_1( mpi_ptr_t res_ptr, %r0 - * mpi_ptr_t s1_ptr, %r1 - * mpi_size_t s1_size, %r2 - * mpi_limb_t s2_limb) %r3 + * _gcry_mpih_submul_1( mpi_ptr_t res_ptr, r0 + * mpi_ptr_t s1_ptr, r1 + * mpi_size_t s1_size, r2 + * mpi_limb_t s2_limb) r3 */ .text .globl _gcry_mpih_submul_1 .type _gcry_mpih_submul_1,%function _gcry_mpih_submul_1: - push {%r4, %r5, %r6, %r8, %r9, %r10, %lr}; - mov %lr, #0; - cmp %r0, #0; /* prepare carry flag for sbc */ + push {r4, r5, r6, r8, r9, r10, lr}; + mov lr, #0; + cmp r0, #0; /* prepare carry flag for sbc */ - tst %r2, #3; + tst r2, #3; beq .Large_loop; .Loop: - ldr %r5, [%r1], #4; - mov %r4, %lr; - mov %lr, #0; - ldr %r6, [%r0]; - umlal %r4, %lr, %r5, %r3; - sub %r2, #1; - sbcs %r4, %r6, %r4; - tst %r2, #3; - str %r4, [%r0], #4; + ldr r5, [r1], #4; + mov r4, lr; + mov lr, #0; + ldr r6, [r0]; + umlal r4, lr, r5, r3; + sub r2, #1; + sbcs r4, r6, r4; + tst r2, #3; + str r4, [r0], #4; bne .Loop; - teq %r2, #0; + teq r2, #0; beq .Lend; .Large_loop: - ldr %r5, [%r1], #4; - mov %r9, #0; - ldr %r4, [%r0, #0]; + ldr r5, [r1], #4; + mov r9, #0; + ldr r4, [r0, #0]; - umlal %lr, %r9, %r5, %r3; - ldr %r6, [%r0, #4]; - ldr %r5, [%r1], #4; - sbcs %r4, %r4, %lr; + umlal lr, r9, r5, r3; + ldr r6, [r0, #4]; + ldr r5, [r1], #4; + sbcs r4, r4, lr; - mov %lr, #0; - umlal %r9, %lr, %r5, %r3; - ldr %r8, [%r0, #8]; - ldr %r5, [%r1], #4; - sbcs %r6, %r6, %r9; + mov lr, #0; + umlal r9, lr, r5, r3; + ldr r8, [r0, #8]; + ldr r5, [r1], #4; + sbcs r6, r6, r9; - mov %r9, #0; - umlal %lr, %r9, %r5, %r3; - ldr %r10, [%r0, #12]; - ldr %r5, [%r1], #4; - sbcs %r8, %r8, %lr; + mov r9, #0; + umlal lr, r9, r5, r3; + ldr r10, [r0, #12]; + ldr r5, [r1], #4; + sbcs r8, r8, lr; - mov %lr, #0; - umlal %r9, %lr, %r5, %r3; - sub %r2, #4; - sbcs %r10, %r10, %r9; + mov lr, #0; + umlal r9, lr, r5, r3; + sub r2, #4; + sbcs r10, r10, r9; - teq %r2, #0; - stm %r0!, {%r4, %r6, %r8, %r10}; + teq r2, #0; + stm r0!, {r4, r6, r8, r10}; bne .Large_loop; .Lend: it cc - movcc %r2, #1; - add %r0, %lr, %r2; - pop {%r4, %r5, %r6, %r8, %r9, %r10, %pc}; + movcc r2, #1; + add r0, lr, r2; + pop {r4, r5, r6, r8, r9, r10, pc}; .size _gcry_mpih_submul_1,.-_gcry_mpih_submul_1; diff --git a/mpi/arm/mpih-sub1.S b/mpi/arm/mpih-sub1.S index 593e3cde..476d8a33 100644 --- a/mpi/arm/mpih-sub1.S +++ b/mpi/arm/mpih-sub1.S @@ -1,77 +1,77 @@ /* ARM sub_n -- Subtract two limb vectors of the same length > 0 and store * sum in a third limb vector. * * Copyright (C) 2013 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . * * Note: This code is heavily based on the GNU MP Library (version 4.2.1). 
*/ #include "sysdep.h" #include "asm-syntax.h" .syntax unified .arm /******************* * mpi_limb_t - * _gcry_mpih_sub_n( mpi_ptr_t res_ptr, %r0 - * mpi_ptr_t s1_ptr, %r1 - * mpi_ptr_t s2_ptr, %r2 - * mpi_size_t size) %r3 + * _gcry_mpih_sub_n( mpi_ptr_t res_ptr, r0 + * mpi_ptr_t s1_ptr, r1 + * mpi_ptr_t s2_ptr, r2 + * mpi_size_t size) r3 */ .text .globl _gcry_mpih_sub_n .type _gcry_mpih_sub_n,%function _gcry_mpih_sub_n: - push {%r4, %r5, %r6, %r7, %r8, %r9, %r10, %lr}; - cmp %r0, #0; /* prepare carry flag for sub */ + push {r4, r5, r6, r7, r8, r9, r10, lr}; + cmp r0, #0; /* prepare carry flag for sub */ - tst %r3, #3; + tst r3, #3; beq .Large_loop; .Loop: - ldr %r4, [%r1], #4; - sub %r3, #1; - ldr %lr, [%r2], #4; - sbcs %r4, %lr; - tst %r3, #3; - str %r4, [%r0], #4; + ldr r4, [r1], #4; + sub r3, #1; + ldr lr, [r2], #4; + sbcs r4, lr; + tst r3, #3; + str r4, [r0], #4; bne .Loop; - teq %r3, #0; + teq r3, #0; beq .Lend; .Large_loop: - ldm %r1!, {%r4, %r6, %r8, %r10}; - sub %r3, #4; - ldm %r2!, {%r5, %r7, %r9, %lr}; - sbcs %r4, %r5; - sbcs %r6, %r7; - sbcs %r8, %r9; - sbcs %r10, %lr; - teq %r3, #0; - stm %r0!, {%r4, %r6, %r8, %r10}; + ldm r1!, {r4, r6, r8, r10}; + sub r3, #4; + ldm r2!, {r5, r7, r9, lr}; + sbcs r4, r5; + sbcs r6, r7; + sbcs r8, r9; + sbcs r10, lr; + teq r3, #0; + stm r0!, {r4, r6, r8, r10}; bne .Large_loop; .Lend: - sbc %r0, %r3, #0; - neg %r0, %r0; - pop {%r4, %r5, %r6, %r7, %r8, %r9, %r10, %pc}; + sbc r0, r3, #0; + neg r0, r0; + pop {r4, r5, r6, r7, r8, r9, r10, pc}; .size _gcry_mpih_sub_n,.-_gcry_mpih_sub_n;
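
Editorial note on the assembler-capability checks earlier in this patch: each AC_LINK_IFELSE/AC_LANG_PROGRAM pair turns the quoted prologue (a top-level __asm__ block plus a declaration) and the body into a small conftest program that only has to link, never run; if the toolchain accepts the instruction spellings, the corresponding HAVE_* macro gets defined. Below is a rough C model of what the NEON probe expands to, using the unprefixed register names introduced by the hunk in this patch. This is illustrative only; the real conftest.c is generated by autoconf.

/* Sketch of the generated NEON conftest program (link-only probe, never executed). */
__asm__(
    ".syntax unified\n\t"
    ".arm\n\t"
    ".fpu neon\n\t"
    ".text\n\t"
    "testfn:\n\t"
    "vld1.64 {q0-q1}, [r0]!;\n\t"   /* unprefixed q/r names, as in the '+' lines */
    "vrev64.8 q0, q3;\n\t"
    "vadd.u64 q0, q1;\n\t"
    "vadd.s64 d3, d2, d3;\n\t"
);
void testfn(void);

int
main (void)
{
  testfn ();   /* forces the symbol reference to resolve at link time */
  return 0;
}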
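
For reference, _gcry_mpih_add_n (patched above) adds two limb vectors of the same length and returns the carry out of the top limb; the .Loop label consumes the size % 4 head one limb at a time, and .Large_loop then moves four limbs per iteration with ldm/stm while adcs propagates the carry. A minimal portable C model of the same computation, assuming the 32-bit limb type used on this target (the names below are illustrative, not part of libgcrypt):

#include <stdint.h>
#include <stddef.h>

/* Illustrative reference model, not part of the patch: assumes 32-bit mpi_limb_t.
 * res = s1 + s2 over n limbs; returns the carry out of the top limb. */
static uint32_t
ref_add_n (uint32_t *res, const uint32_t *s1, const uint32_t *s2, size_t n)
{
  uint32_t cy = 0;
  for (size_t i = 0; i < n; i++)
    {
      uint64_t t = (uint64_t) s1[i] + s2[i] + cy;
      res[i] = (uint32_t) t;
      cy = (uint32_t) (t >> 32);   /* 0 or 1 */
    }
  return cy;
}

A model like this can also double as a test oracle for the unrolled assembly: sizes 1 through 8 exercise both the one-limb head loop and the four-limb main loop.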
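
_gcry_mpih_sub_n is the mirror image: res = s1 - s2 over n limbs, returning 1 if the subtraction borrowed out of the top limb and 0 otherwise (the trailing sbc/neg pair converts ARM's inverted carry flag into that 0/1 result). A sketch of the same computation in C, under the same 32-bit-limb assumption:

#include <stdint.h>
#include <stddef.h>

/* Illustrative reference model, not part of the patch: assumes 32-bit mpi_limb_t.
 * res = s1 - s2 over n limbs; returns 1 on borrow out of the top limb, else 0. */
static uint32_t
ref_sub_n (uint32_t *res, const uint32_t *s1, const uint32_t *s2, size_t n)
{
  uint32_t borrow = 0;
  for (size_t i = 0; i < n; i++)
    {
      uint64_t t = (uint64_t) s1[i] - s2[i] - borrow;
      res[i] = (uint32_t) t;
      borrow = (uint32_t) ((t >> 32) & 1);   /* upper half is all-ones iff the limb wrapped */
    }
  return borrow;
}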
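
The two umlal-based routines follow the usual GMP shapes (the file headers note they are based on GNU MP): _gcry_mpih_mul_1 stores s1 * limb into res and returns the leftover high limb, while _gcry_mpih_addmul_1 adds s1 * limb into res in place and returns the carry limb, with the final adc r0, lr, #0 folding the last flag into the result. A compact C model of both, again assuming 32-bit limbs and offered purely for illustration:

#include <stdint.h>
#include <stddef.h>

/* Illustrative reference models, not part of the patch: assume 32-bit mpi_limb_t. */

/* res = s1 * v over n limbs; returns the high limb that did not fit. */
static uint32_t
ref_mul_1 (uint32_t *res, const uint32_t *s1, size_t n, uint32_t v)
{
  uint32_t cy = 0;
  for (size_t i = 0; i < n; i++)
    {
      uint64_t t = (uint64_t) s1[i] * v + cy;
      res[i] = (uint32_t) t;
      cy = (uint32_t) (t >> 32);
    }
  return cy;
}

/* res += s1 * v over n limbs; returns the carry limb out of the top. */
static uint32_t
ref_addmul_1 (uint32_t *res, const uint32_t *s1, size_t n, uint32_t v)
{
  uint32_t cy = 0;
  for (size_t i = 0; i < n; i++)
    {
      uint64_t t = (uint64_t) s1[i] * v + res[i] + cy;   /* never overflows 64 bits */
      res[i] = (uint32_t) t;
      cy = (uint32_t) (t >> 32);
    }
  return cy;
}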
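
_gcry_mpih_submul_1 is the trickiest of the set because two borrow sources have to be merged per limb: the high half of the product and the borrow from subtracting the low half out of res, which is why the assembly ends with the conditional movcc r2, #1 before forming the return value. A sketch of the same bookkeeping in C, with the usual 32-bit-limb caveat:

#include <stdint.h>
#include <stddef.h>

/* Illustrative reference model, not part of the patch: assumes 32-bit mpi_limb_t.
 * res -= s1 * v over n limbs; returns the borrow limb out of the top. */
static uint32_t
ref_submul_1 (uint32_t *res, const uint32_t *s1, size_t n, uint32_t v)
{
  uint32_t cy = 0;                      /* running high/borrow limb */
  for (size_t i = 0; i < n; i++)
    {
      uint64_t prod = (uint64_t) s1[i] * v + cy;   /* never overflows 64 bits */
      uint32_t plo = (uint32_t) prod;
      cy = (uint32_t) (prod >> 32);
      uint32_t r = res[i];
      res[i] = r - plo;
      if (r < plo)
        cy++;                           /* subtracting the low half borrowed */
    }
  return cy;
}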