diff --git a/cipher/asm-common-aarch64.h b/cipher/asm-common-aarch64.h index 502c35ae..4ffc1b71 100644 --- a/cipher/asm-common-aarch64.h +++ b/cipher/asm-common-aarch64.h @@ -1,90 +1,100 @@ /* asm-common-aarch64.h - Common macros for AArch64 assembly * * Copyright (C) 2018 Martin Storsjö * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #ifndef GCRY_ASM_COMMON_AARCH64_H #define GCRY_ASM_COMMON_AARCH64_H #include #ifdef __ELF__ # define ELF(...) __VA_ARGS__ #else # define ELF(...) /*_*/ #endif +#ifdef _WIN32 +#define GET_DATA_POINTER(reg, name) \ + adrp reg, name ; \ + add reg, reg, #:lo12:name ; +#else +#define GET_DATA_POINTER(reg, name) \ + adrp reg, :got:name ; \ + ldr reg, [reg, #:got_lo12:name] ; +#endif + #ifdef HAVE_GCC_ASM_CFI_DIRECTIVES /* CFI directives to emit DWARF stack unwinding information. */ # define CFI_STARTPROC() .cfi_startproc # define CFI_ENDPROC() .cfi_endproc # define CFI_REMEMBER_STATE() .cfi_remember_state # define CFI_RESTORE_STATE() .cfi_restore_state # define CFI_ADJUST_CFA_OFFSET(off) .cfi_adjust_cfa_offset off # define CFI_REL_OFFSET(reg,off) .cfi_rel_offset reg, off # define CFI_DEF_CFA_REGISTER(reg) .cfi_def_cfa_register reg # define CFI_REGISTER(ro,rn) .cfi_register ro, rn # define CFI_RESTORE(reg) .cfi_restore reg /* CFA expressions are used for pointing CFA and registers to * SP relative offsets. */ # define DW_REGNO_SP 31 /* Fixed length encoding used for integers for now. */ # define DW_SLEB128_7BIT(value) \ 0x00|((value) & 0x7f) # define DW_SLEB128_28BIT(value) \ 0x80|((value)&0x7f), \ 0x80|(((value)>>7)&0x7f), \ 0x80|(((value)>>14)&0x7f), \ 0x00|(((value)>>21)&0x7f) # define CFI_CFA_ON_STACK(rsp_offs,cfa_depth) \ .cfi_escape \ 0x0f, /* DW_CFA_def_cfa_expression */ \ DW_SLEB128_7BIT(11), /* length */ \ 0x8f, /* DW_OP_breg31, rsp + constant */ \ DW_SLEB128_28BIT(rsp_offs), \ 0x06, /* DW_OP_deref */ \ 0x23, /* DW_OP_plus_constu */ \ DW_SLEB128_28BIT((cfa_depth)+8) # define CFI_REG_ON_STACK(regno,rsp_offs) \ .cfi_escape \ 0x10, /* DW_CFA_expression */ \ DW_SLEB128_7BIT(regno), \ DW_SLEB128_7BIT(5), /* length */ \ 0x8f, /* DW_OP_breg31, rsp + constant */ \ DW_SLEB128_28BIT(rsp_offs) #else # define CFI_STARTPROC() # define CFI_ENDPROC() # define CFI_REMEMBER_STATE() # define CFI_RESTORE_STATE() # define CFI_ADJUST_CFA_OFFSET(off) # define CFI_REL_OFFSET(reg,off) # define CFI_DEF_CFA_REGISTER(reg) # define CFI_REGISTER(ro,rn) # define CFI_RESTORE(reg) # define CFI_CFA_ON_STACK(rsp_offs,cfa_depth) # define CFI_REG_ON_STACK(reg,rsp_offs) #endif #endif /* GCRY_ASM_COMMON_AARCH64_H */ diff --git a/cipher/chacha20-aarch64.S b/cipher/chacha20-aarch64.S index adb9b1f2..07b4bb5c 100644 --- a/cipher/chacha20-aarch64.S +++ b/cipher/chacha20-aarch64.S @@ -1,316 +1,307 @@ /* chacha20-aarch64.S - ARMv8/AArch64 accelerated chacha20 blocks function * * Copyright (C) 2017,2018 Jussi Kivilinna * * This file is part of Libgcrypt. 
* * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ /* * Based on D. J. Bernstein reference implementation at * http://cr.yp.to/chacha.html: * * chacha-regs.c version 20080118 * D. J. Bernstein * Public domain. */ #include "asm-common-aarch64.h" #if defined(__AARCH64EL__) && \ defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON) && \ defined(USE_CHACHA20) .cpu generic+simd .text -#ifdef _WIN32 -#define GET_DATA_POINTER(reg, name) \ - adrp reg, name ; \ - add reg, reg, #:lo12:name ; -#else -#define GET_DATA_POINTER(reg, name) \ - adrp reg, :got:name ; \ - ldr reg, [reg, #:got_lo12:name] ; -#endif /* register macros */ #define INPUT x0 #define DST x1 #define SRC x2 #define NBLKS x3 #define ROUND x4 #define INPUT_CTR x5 #define INPUT_POS x6 #define CTR x7 /* vector registers */ #define X0 v16 #define X1 v17 #define X2 v18 #define X3 v19 #define X4 v20 #define X5 v21 #define X6 v22 #define X7 v23 #define X8 v24 #define X9 v25 #define X10 v26 #define X11 v27 #define X12 v28 #define X13 v29 #define X14 v30 #define X15 v31 #define VCTR v0 #define VTMP0 v1 #define VTMP1 v2 #define VTMP2 v3 #define VTMP3 v4 #define X12_TMP v5 #define X13_TMP v6 /********************************************************************** helper macros **********************************************************************/ #define vpunpckldq(s1, s2, dst) \ zip1 dst.4s, s2.4s, s1.4s; #define vpunpckhdq(s1, s2, dst) \ zip2 dst.4s, s2.4s, s1.4s; #define vpunpcklqdq(s1, s2, dst) \ zip1 dst.2d, s2.2d, s1.2d; #define vpunpckhqdq(s1, s2, dst) \ zip2 dst.2d, s2.2d, s1.2d; /* 4x4 32-bit integer matrix transpose */ #define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \ vpunpckhdq(x1, x0, t2); \ vpunpckldq(x1, x0, x0); \ \ vpunpckldq(x3, x2, t1); \ vpunpckhdq(x3, x2, x2); \ \ vpunpckhqdq(t1, x0, x1); \ vpunpcklqdq(t1, x0, x0); \ \ vpunpckhqdq(x2, t2, x3); \ vpunpcklqdq(x2, t2, x2); #define clear(x) \ eor x.16b, x.16b, x.16b; /********************************************************************** 4-way chacha20 **********************************************************************/ #define ROTATE2(dst1,dst2,c,src1,src2) \ shl dst1.4s, src1.4s, #(c); \ shl dst2.4s, src2.4s, #(c); \ sri dst1.4s, src1.4s, #(32 - (c)); \ sri dst2.4s, src2.4s, #(32 - (c)); #define ROTATE2_16(dst1,dst2,src1,src2) \ rev32 dst1.8h, src1.8h; \ rev32 dst2.8h, src2.8h; #define XOR(d,s1,s2) \ eor d.16b, s2.16b, s1.16b; #define PLUS(ds,s) \ add ds.4s, ds.4s, s.4s; #define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2) \ PLUS(a1,b1); PLUS(a2,b2); XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); \ ROTATE2_16(d1, d2, tmp1, tmp2); \ PLUS(c1,d1); PLUS(c2,d2); XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); \ ROTATE2(b1, b2, 12, tmp1, tmp2); \ PLUS(a1,b1); PLUS(a2,b2); XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); \ ROTATE2(d1, d2, 8, tmp1, tmp2); \ PLUS(c1,d1); PLUS(c2,d2); XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); \ ROTATE2(b1, b2, 7, tmp1, tmp2); chacha20_data: .align 4 .Linc_counter: .long 0,1,2,3 .align 3 
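/* Reference sketch (illustrative only, not part of the generated code):
 * QUARTERROUND2 above applies the standard ChaCha20 quarter round to two
 * column/diagonal groups at once, with four blocks packed per 32-bit lane;
 * ROTATE2_16 implements the rotate-by-16 as a half-word swap via rev32.
 * A minimal scalar C equivalent is shown below, using illustrative helper
 * names (rotl32, quarterround) that do not exist in libgcrypt:
 *
 *   #include <stdint.h>
 *
 *   static inline uint32_t rotl32(uint32_t v, int c)
 *   {
 *     return (v << c) | (v >> (32 - c));
 *   }
 *
 *   static void quarterround(uint32_t *a, uint32_t *b,
 *                            uint32_t *c, uint32_t *d)
 *   {
 *     *a += *b; *d ^= *a; *d = rotl32(*d, 16);
 *     *c += *d; *b ^= *c; *b = rotl32(*b, 12);
 *     *a += *b; *d ^= *a; *d = rotl32(*d, 8);
 *     *c += *d; *b ^= *c; *b = rotl32(*b, 7);
 *   }
 */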
.globl _gcry_chacha20_aarch64_blocks4 ELF(.type _gcry_chacha20_aarch64_blocks4,%function;) _gcry_chacha20_aarch64_blocks4: /* input: * x0: input * x1: dst * x2: src * x3: nblks (multiple of 4) */ CFI_STARTPROC() GET_DATA_POINTER(CTR, .Linc_counter); add INPUT_CTR, INPUT, #(12*4); mov INPUT_POS, INPUT; ld1 {VCTR.16b}, [CTR]; .Loop4: /* Construct counter vectors X12 and X13 */ ld1 {X15.16b}, [INPUT_CTR]; mov ROUND, #20; ld1 {VTMP1.16b-VTMP3.16b}, [INPUT_POS]; dup X12.4s, X15.s[0]; dup X13.4s, X15.s[1]; ldr CTR, [INPUT_CTR]; add X12.4s, X12.4s, VCTR.4s; dup X0.4s, VTMP1.s[0]; dup X1.4s, VTMP1.s[1]; dup X2.4s, VTMP1.s[2]; dup X3.4s, VTMP1.s[3]; dup X14.4s, X15.s[2]; cmhi VTMP0.4s, VCTR.4s, X12.4s; dup X15.4s, X15.s[3]; add CTR, CTR, #4; /* Update counter */ dup X4.4s, VTMP2.s[0]; dup X5.4s, VTMP2.s[1]; dup X6.4s, VTMP2.s[2]; dup X7.4s, VTMP2.s[3]; sub X13.4s, X13.4s, VTMP0.4s; dup X8.4s, VTMP3.s[0]; dup X9.4s, VTMP3.s[1]; dup X10.4s, VTMP3.s[2]; dup X11.4s, VTMP3.s[3]; mov X12_TMP.16b, X12.16b; mov X13_TMP.16b, X13.16b; str CTR, [INPUT_CTR]; .Lround2: subs ROUND, ROUND, #2 QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,VTMP0,VTMP1) QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,VTMP0,VTMP1) QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,VTMP0,VTMP1) QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,VTMP0,VTMP1) b.ne .Lround2; ld1 {VTMP0.16b, VTMP1.16b}, [INPUT_POS], #32; PLUS(X12, X12_TMP); /* INPUT + 12 * 4 + counter */ PLUS(X13, X13_TMP); /* INPUT + 13 * 4 + counter */ dup VTMP2.4s, VTMP0.s[0]; /* INPUT + 0 * 4 */ dup VTMP3.4s, VTMP0.s[1]; /* INPUT + 1 * 4 */ dup X12_TMP.4s, VTMP0.s[2]; /* INPUT + 2 * 4 */ dup X13_TMP.4s, VTMP0.s[3]; /* INPUT + 3 * 4 */ PLUS(X0, VTMP2); PLUS(X1, VTMP3); PLUS(X2, X12_TMP); PLUS(X3, X13_TMP); dup VTMP2.4s, VTMP1.s[0]; /* INPUT + 4 * 4 */ dup VTMP3.4s, VTMP1.s[1]; /* INPUT + 5 * 4 */ dup X12_TMP.4s, VTMP1.s[2]; /* INPUT + 6 * 4 */ dup X13_TMP.4s, VTMP1.s[3]; /* INPUT + 7 * 4 */ ld1 {VTMP0.16b, VTMP1.16b}, [INPUT_POS]; mov INPUT_POS, INPUT; PLUS(X4, VTMP2); PLUS(X5, VTMP3); PLUS(X6, X12_TMP); PLUS(X7, X13_TMP); dup VTMP2.4s, VTMP0.s[0]; /* INPUT + 8 * 4 */ dup VTMP3.4s, VTMP0.s[1]; /* INPUT + 9 * 4 */ dup X12_TMP.4s, VTMP0.s[2]; /* INPUT + 10 * 4 */ dup X13_TMP.4s, VTMP0.s[3]; /* INPUT + 11 * 4 */ dup VTMP0.4s, VTMP1.s[2]; /* INPUT + 14 * 4 */ dup VTMP1.4s, VTMP1.s[3]; /* INPUT + 15 * 4 */ PLUS(X8, VTMP2); PLUS(X9, VTMP3); PLUS(X10, X12_TMP); PLUS(X11, X13_TMP); PLUS(X14, VTMP0); PLUS(X15, VTMP1); transpose_4x4(X0, X1, X2, X3, VTMP0, VTMP1, VTMP2); transpose_4x4(X4, X5, X6, X7, VTMP0, VTMP1, VTMP2); transpose_4x4(X8, X9, X10, X11, VTMP0, VTMP1, VTMP2); transpose_4x4(X12, X13, X14, X15, VTMP0, VTMP1, VTMP2); subs NBLKS, NBLKS, #4; ld1 {VTMP0.16b-VTMP3.16b}, [SRC], #64; ld1 {X12_TMP.16b-X13_TMP.16b}, [SRC], #32; eor VTMP0.16b, X0.16b, VTMP0.16b; eor VTMP1.16b, X4.16b, VTMP1.16b; eor VTMP2.16b, X8.16b, VTMP2.16b; eor VTMP3.16b, X12.16b, VTMP3.16b; eor X12_TMP.16b, X1.16b, X12_TMP.16b; eor X13_TMP.16b, X5.16b, X13_TMP.16b; st1 {VTMP0.16b-VTMP3.16b}, [DST], #64; ld1 {VTMP0.16b-VTMP3.16b}, [SRC], #64; st1 {X12_TMP.16b-X13_TMP.16b}, [DST], #32; ld1 {X12_TMP.16b-X13_TMP.16b}, [SRC], #32; eor VTMP0.16b, X9.16b, VTMP0.16b; eor VTMP1.16b, X13.16b, VTMP1.16b; eor VTMP2.16b, X2.16b, VTMP2.16b; eor VTMP3.16b, X6.16b, VTMP3.16b; eor X12_TMP.16b, X10.16b, X12_TMP.16b; eor X13_TMP.16b, X14.16b, X13_TMP.16b; st1 {VTMP0.16b-VTMP3.16b}, [DST], #64; ld1 {VTMP0.16b-VTMP3.16b}, [SRC], #64; st1 {X12_TMP.16b-X13_TMP.16b}, [DST], #32; eor VTMP0.16b, X3.16b, 
VTMP0.16b; eor VTMP1.16b, X7.16b, VTMP1.16b; eor VTMP2.16b, X11.16b, VTMP2.16b; eor VTMP3.16b, X15.16b, VTMP3.16b; st1 {VTMP0.16b-VTMP3.16b}, [DST], #64; b.ne .Loop4; /* clear the used vector registers and stack */ clear(VTMP0); clear(VTMP1); clear(VTMP2); clear(VTMP3); clear(X12_TMP); clear(X13_TMP); clear(X0); clear(X1); clear(X2); clear(X3); clear(X4); clear(X5); clear(X6); clear(X7); clear(X8); clear(X9); clear(X10); clear(X11); clear(X12); clear(X13); clear(X14); clear(X15); eor x0, x0, x0 ret CFI_ENDPROC() ELF(.size _gcry_chacha20_aarch64_blocks4, .-_gcry_chacha20_aarch64_blocks4;) #endif diff --git a/cipher/cipher-gcm-armv8-aarch64-ce.S b/cipher/cipher-gcm-armv8-aarch64-ce.S index 7c6be94e..b0c2cccc 100644 --- a/cipher/cipher-gcm-armv8-aarch64-ce.S +++ b/cipher/cipher-gcm-armv8-aarch64-ce.S @@ -1,426 +1,422 @@ /* cipher-gcm-armv8-aarch64-ce.S - ARM/CE accelerated GHASH * Copyright (C) 2016 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #include "asm-common-aarch64.h" #if defined(__AARCH64EL__) && \ defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) .cpu generic+simd+crypto .text -#define GET_DATA_POINTER(reg, name) \ - adrp reg, :got:name ; \ - ldr reg, [reg, #:got_lo12:name] ; - /* Constants */ .align 4 gcry_gcm_reduction_constant: .Lrconst: .quad 0x87 /* Register macros */ #define rhash v0 #define rr0 v1 #define rr1 v2 #define rbuf v3 #define rbuf1 v4 #define rbuf2 v5 #define rbuf3 v6 #define rbuf4 v7 #define rbuf5 v8 #define rr2 v9 #define rr3 v10 #define rr4 v11 #define rr5 v12 #define rr6 v13 #define rr7 v14 #define rr8 v15 #define rr9 v16 #define rrconst v18 #define rh1 v19 #define rh2 v20 #define rh3 v21 #define rh4 v22 #define rh5 v23 #define rh6 v24 #define t0 v25 #define t1 v26 #define t2 v27 #define t3 v28 #define t4 v29 #define t5 v30 #define vZZ v31 /* GHASH macros */ /* See "Gouvêa, C. P. L. & López, J. Implementing GCM on ARMv8. Topics in * Cryptology — CT-RSA 2015" for details. 
*/ /* Input: 'a' and 'b', Output: 'r0:r1' (low 128-bits in r0, high in r1) */ #define PMUL_128x128(r0, r1, a, b, T0, T1, interleave_op) \ ext T0.16b, b.16b, b.16b, #8; \ pmull r0.1q, a.1d, b.1d; \ pmull2 r1.1q, a.2d, b.2d; \ pmull T1.1q, a.1d, T0.1d; \ pmull2 T0.1q, a.2d, T0.2d; \ interleave_op; \ eor T0.16b, T0.16b, T1.16b; \ ext T1.16b, vZZ.16b, T0.16b, #8; \ ext T0.16b, T0.16b, vZZ.16b, #8; \ eor r0.16b, r0.16b, T1.16b; \ eor r1.16b, r1.16b, T0.16b; /* Input: 'aA' and 'bA', Output: 'r0A:r1A' (low 128-bits in r0A, high in r1A) * Input: 'aB' and 'bB', Output: 'r0B:r1B' (low 128-bits in r0B, high in r1B) * Input: 'aC' and 'bC', Output: 'r0C:r1C' (low 128-bits in r0C, high in r1C) */ #define PMUL_128x128_3(r0A, r1A, aA, bA, t0A, t1A, \ r0B, r1B, aB, bB, t0B, t1B, \ r0C, r1C, aC, bC, t0C, t1C, interleave_op) \ ext t0A.16b, bA.16b, bA.16b, #8; \ pmull r0A.1q, aA.1d, bA.1d; \ pmull2 r1A.1q, aA.2d, bA.2d; \ ext t0B.16b, bB.16b, bB.16b, #8; \ pmull r0B.1q, aB.1d, bB.1d; \ pmull2 r1B.1q, aB.2d, bB.2d; \ ext t0C.16b, bC.16b, bC.16b, #8; \ pmull r0C.1q, aC.1d, bC.1d; \ pmull2 r1C.1q, aC.2d, bC.2d; \ pmull t1A.1q, aA.1d, t0A.1d; \ pmull2 t0A.1q, aA.2d, t0A.2d; \ pmull t1B.1q, aB.1d, t0B.1d; \ pmull2 t0B.1q, aB.2d, t0B.2d; \ pmull t1C.1q, aC.1d, t0C.1d; \ pmull2 t0C.1q, aC.2d, t0C.2d; \ eor t0A.16b, t0A.16b, t1A.16b; \ eor t0B.16b, t0B.16b, t1B.16b; \ eor t0C.16b, t0C.16b, t1C.16b; \ interleave_op; \ ext t1A.16b, vZZ.16b, t0A.16b, #8; \ ext t0A.16b, t0A.16b, vZZ.16b, #8; \ ext t1B.16b, vZZ.16b, t0B.16b, #8; \ ext t0B.16b, t0B.16b, vZZ.16b, #8; \ ext t1C.16b, vZZ.16b, t0C.16b, #8; \ ext t0C.16b, t0C.16b, vZZ.16b, #8; \ eor r0A.16b, r0A.16b, t1A.16b; \ eor r1A.16b, r1A.16b, t0A.16b; \ eor r0B.16b, r0B.16b, t1B.16b; \ eor r1B.16b, r1B.16b, t0B.16b; \ eor r0C.16b, r0C.16b, t1C.16b; \ eor r1C.16b, r1C.16b, t0C.16b; \ /* Input: 'r0:r1', Output: 'a' */ #define REDUCTION(a, r0, r1, rconst, T0, T1, interleave_op1, interleave_op2, \ interleave_op3) \ pmull2 T0.1q, r1.2d, rconst.2d; \ interleave_op1; \ ext T1.16b, T0.16b, vZZ.16b, #8; \ ext T0.16b, vZZ.16b, T0.16b, #8; \ interleave_op2; \ eor r1.16b, r1.16b, T1.16b; \ eor r0.16b, r0.16b, T0.16b; \ pmull T0.1q, r1.1d, rconst.1d; \ interleave_op3; \ eor a.16b, r0.16b, T0.16b; /* Other functional macros */ #define _(...) 
__VA_ARGS__ #define __ _() #define CLEAR_REG(reg) eor reg.16b, reg.16b, reg.16b; #define VPUSH_ABI \ stp d8, d9, [sp, #-16]!; \ CFI_ADJUST_CFA_OFFSET(16); \ stp d10, d11, [sp, #-16]!; \ CFI_ADJUST_CFA_OFFSET(16); \ stp d12, d13, [sp, #-16]!; \ CFI_ADJUST_CFA_OFFSET(16); \ stp d14, d15, [sp, #-16]!; \ CFI_ADJUST_CFA_OFFSET(16); #define VPOP_ABI \ ldp d14, d15, [sp], #16; \ CFI_ADJUST_CFA_OFFSET(-16); \ ldp d12, d13, [sp], #16; \ CFI_ADJUST_CFA_OFFSET(-16); \ ldp d10, d11, [sp], #16; \ CFI_ADJUST_CFA_OFFSET(-16); \ ldp d8, d9, [sp], #16; \ CFI_ADJUST_CFA_OFFSET(-16); /* * unsigned int _gcry_ghash_armv8_ce_pmull (void *gcm_key, byte *result, * const byte *buf, size_t nblocks, * void *gcm_table); */ .align 3 .globl _gcry_ghash_armv8_ce_pmull ELF(.type _gcry_ghash_armv8_ce_pmull,%function;) _gcry_ghash_armv8_ce_pmull: /* input: * x0: gcm_key * x1: result/hash * x2: buf * x3: nblocks * x4: gcm_table */ CFI_STARTPROC(); cbz x3, .Ldo_nothing; GET_DATA_POINTER(x5, .Lrconst) eor vZZ.16b, vZZ.16b, vZZ.16b ld1 {rhash.16b}, [x1] ld1 {rh1.16b}, [x0] rbit rhash.16b, rhash.16b /* bit-swap */ ld1r {rrconst.2d}, [x5] cmp x3, #6 b.lo .Less_than_6 add x6, x4, #64 VPUSH_ABI ld1 {rh2.16b-rh5.16b}, [x4] ld1 {rh6.16b}, [x6] sub x3, x3, #6 ld1 {rbuf.16b-rbuf2.16b}, [x2], #(3*16) ld1 {rbuf3.16b-rbuf5.16b}, [x2], #(3*16) rbit rbuf.16b, rbuf.16b /* bit-swap */ rbit rbuf1.16b, rbuf1.16b /* bit-swap */ rbit rbuf2.16b, rbuf2.16b /* bit-swap */ rbit rbuf3.16b, rbuf3.16b /* bit-swap */ rbit rbuf4.16b, rbuf4.16b /* bit-swap */ rbit rbuf5.16b, rbuf5.16b /* bit-swap */ eor rhash.16b, rhash.16b, rbuf.16b cmp x3, #6 b.lo .Lend_6 .Loop_6: /* (in1) * H⁵ => rr0:rr1 */ /* (in2) * H⁴ => rr2:rr3 */ /* (in0 ^ hash) * H⁶ => rr4:rr5 */ PMUL_128x128_3(rr0, rr1, rbuf1, rh5, t0, t1, rr2, rr3, rbuf2, rh4, t2, t3, rr4, rr5, rhash, rh6, t4, t5, _(sub x3, x3, #6)) ld1 {rbuf.16b-rbuf2.16b}, [x2], #(3*16) cmp x3, #6 eor rr0.16b, rr0.16b, rr2.16b eor rr1.16b, rr1.16b, rr3.16b /* (in3) * H³ => rr2:rr3 */ /* (in4) * H² => rr6:rr7 */ /* (in5) * H¹ => rr8:rr9 */ PMUL_128x128_3(rr2, rr3, rbuf3, rh3, t0, t1, rr6, rr7, rbuf4, rh2, t2, t3, rr8, rr9, rbuf5, rh1, t4, t5, _(eor rr0.16b, rr0.16b, rr4.16b; eor rr1.16b, rr1.16b, rr5.16b)) eor rr0.16b, rr0.16b, rr2.16b eor rr1.16b, rr1.16b, rr3.16b rbit rbuf.16b, rbuf.16b eor rr0.16b, rr0.16b, rr6.16b eor rr1.16b, rr1.16b, rr7.16b rbit rbuf1.16b, rbuf1.16b eor rr0.16b, rr0.16b, rr8.16b eor rr1.16b, rr1.16b, rr9.16b ld1 {rbuf3.16b-rbuf5.16b}, [x2], #(3*16) REDUCTION(rhash, rr0, rr1, rrconst, t0, t1, _(rbit rbuf2.16b, rbuf2.16b), _(rbit rbuf3.16b, rbuf3.16b), _(rbit rbuf4.16b, rbuf4.16b)) rbit rbuf5.16b, rbuf5.16b eor rhash.16b, rhash.16b, rbuf.16b b.hs .Loop_6 .Lend_6: /* (in1) * H⁵ => rr0:rr1 */ /* (in0 ^ hash) * H⁶ => rr2:rr3 */ /* (in2) * H⁴ => rr4:rr5 */ PMUL_128x128_3(rr0, rr1, rbuf1, rh5, t0, t1, rr2, rr3, rhash, rh6, t2, t3, rr4, rr5, rbuf2, rh4, t4, t5, __) eor rr0.16b, rr0.16b, rr2.16b eor rr1.16b, rr1.16b, rr3.16b eor rr0.16b, rr0.16b, rr4.16b eor rr1.16b, rr1.16b, rr5.16b /* (in3) * H³ => rhash:rbuf */ /* (in4) * H² => rr6:rr7 */ /* (in5) * H¹ => rr8:rr9 */ PMUL_128x128_3(rhash, rbuf, rbuf3, rh3, t0, t1, rr6, rr7, rbuf4, rh2, t2, t3, rr8, rr9, rbuf5, rh1, t4, t5, _(CLEAR_REG(rh4); CLEAR_REG(rh5); CLEAR_REG(rh6))) eor rr0.16b, rr0.16b, rhash.16b eor rr1.16b, rr1.16b, rbuf.16b eor rr0.16b, rr0.16b, rr6.16b eor rr1.16b, rr1.16b, rr7.16b eor rr0.16b, rr0.16b, rr8.16b eor rr1.16b, rr1.16b, rr9.16b REDUCTION(rhash, rr0, rr1, rrconst, t0, t1, _(CLEAR_REG(rh2); CLEAR_REG(rh3); CLEAR_REG(rr2); 
CLEAR_REG(rbuf2); CLEAR_REG(rbuf3)), _(CLEAR_REG(rr3); CLEAR_REG(rr4); CLEAR_REG(rr5); CLEAR_REG(rr6); CLEAR_REG(rr7)), _(CLEAR_REG(rr8); CLEAR_REG(rr9); CLEAR_REG(rbuf1); CLEAR_REG(rbuf2))) CLEAR_REG(rbuf4) CLEAR_REG(rbuf5) CLEAR_REG(t2) CLEAR_REG(t3) CLEAR_REG(t4) CLEAR_REG(t5) VPOP_ABI cbz x3, .Ldone .Less_than_6: /* Handle remaining blocks. */ ld1 {rbuf.16b}, [x2], #16 sub x3, x3, #1 rbit rbuf.16b, rbuf.16b /* bit-swap */ eor rhash.16b, rhash.16b, rbuf.16b cbz x3, .Lend .Loop: PMUL_128x128(rr0, rr1, rh1, rhash, t0, t1, _(ld1 {rbuf.16b}, [x2], #16)) REDUCTION(rhash, rr0, rr1, rrconst, t0, t1, _(sub x3, x3, #1), _(rbit rbuf.16b, rbuf.16b), __) eor rhash.16b, rhash.16b, rbuf.16b cbnz x3, .Loop .Lend: PMUL_128x128(rr0, rr1, rh1, rhash, t0, t1, _(CLEAR_REG(rbuf))) REDUCTION(rhash, rr0, rr1, rrconst, t0, t1, __, _(CLEAR_REG(rh1)), __) .Ldone: CLEAR_REG(rr1) CLEAR_REG(rr0) rbit rhash.16b, rhash.16b /* bit-swap */ CLEAR_REG(t0) CLEAR_REG(t1) st1 {rhash.2d}, [x1] CLEAR_REG(rhash) .Ldo_nothing: mov x0, #0 ret CFI_ENDPROC() ELF(.size _gcry_ghash_armv8_ce_pmull,.-_gcry_ghash_armv8_ce_pmull;) /* * void _gcry_ghash_setup_armv8_ce_pmull (void *gcm_key, void *gcm_table); */ .align 3 .globl _gcry_ghash_setup_armv8_ce_pmull ELF(.type _gcry_ghash_setup_armv8_ce_pmull,%function;) _gcry_ghash_setup_armv8_ce_pmull: /* input: * x0: gcm_key * x1: gcm_table */ CFI_STARTPROC() GET_DATA_POINTER(x2, .Lrconst) /* H¹ */ ld1 {rh1.16b}, [x0] rbit rh1.16b, rh1.16b st1 {rh1.16b}, [x0] ld1r {rrconst.2d}, [x2] /* H² */ PMUL_128x128(rr0, rr1, rh1, rh1, t0, t1, __) REDUCTION(rh2, rr0, rr1, rrconst, t0, t1, __, __, __) /* H³ */ PMUL_128x128(rr0, rr1, rh2, rh1, t0, t1, __) REDUCTION(rh3, rr0, rr1, rrconst, t0, t1, __, __, __) /* H⁴ */ PMUL_128x128(rr0, rr1, rh2, rh2, t0, t1, __) REDUCTION(rh4, rr0, rr1, rrconst, t0, t1, __, __, __) /* H⁵ */ PMUL_128x128(rr0, rr1, rh2, rh3, t0, t1, __) REDUCTION(rh5, rr0, rr1, rrconst, t0, t1, __, __, __) /* H⁶ */ PMUL_128x128(rr0, rr1, rh3, rh3, t0, t1, __) REDUCTION(rh6, rr0, rr1, rrconst, t0, t1, __, __, __) st1 {rh2.16b-rh4.16b}, [x1], #(3*16) st1 {rh5.16b-rh6.16b}, [x1] ret CFI_ENDPROC() ELF(.size _gcry_ghash_setup_armv8_ce_pmull,.-_gcry_ghash_setup_armv8_ce_pmull;) #endif diff --git a/cipher/crc-armv8-aarch64-ce.S b/cipher/crc-armv8-aarch64-ce.S index f269b74a..060abdfe 100644 --- a/cipher/crc-armv8-aarch64-ce.S +++ b/cipher/crc-armv8-aarch64-ce.S @@ -1,500 +1,497 @@ /* crc-armv8-aarch64-ce.S - ARMv8/CE PMULL accelerated CRC implementation * Copyright (C) 2019 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
*/ #include "asm-common-aarch64.h" #if defined(__AARCH64EL__) && \ defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) .cpu generic+simd+crypto .text -#define GET_DATA_POINTER(reg, name) \ - adrp reg, :got:name ; \ - ldr reg, [reg, #:got_lo12:name] ; /* Structure of crc32_consts_s */ #define consts_k(idx) ((idx) * 8) #define consts_my_p(idx) (consts_k(6) + (idx) * 8) /* Constants */ .align 6 .Lcrc32_constants: .Lcrc32_partial_fold_input_mask: .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .Lcrc32_refl_shuf_shift: .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f .Lcrc32_shuf_shift: .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .Lcrc32_bswap_shuf: .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08 .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff /* * void _gcry_crc32r_armv8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen, * const struct crc32_consts_s *consts); */ .align 3 .globl _gcry_crc32r_armv8_ce_bulk ELF(.type _gcry_crc32r_armv8_ce_bulk,%function;) _gcry_crc32r_armv8_ce_bulk: /* input: * x0: pcrc * x1: inbuf * x2: inlen * x3: consts */ CFI_STARTPROC() GET_DATA_POINTER(x7, .Lcrc32_constants) add x9, x3, #consts_k(5 - 1) cmp x2, #128 b.lo .Lcrc32r_fold_by_one_setup eor v4.16b, v4.16b, v4.16b add x4, x3, #consts_k(1 - 1) ld1 {v4.s}[0], [x0] /* load pcrc */ ld1 {v0.16b-v3.16b}, [x1], #64 /* load 64 bytes of input */ sub x2, x2, #64 ld1 {v6.16b}, [x4] eor v0.16b, v0.16b, v4.16b add x4, x3, #consts_k(3 - 1) add x5, x3, #consts_my_p(0) .Lcrc32r_fold_by_four: /* Fold by 4. */ ld1 {v16.16b-v19.16b}, [x1], #64 /* load 64 bytes of input */ sub x2, x2, #64 pmull v20.1q, v0.1d, v6.1d pmull v21.1q, v1.1d, v6.1d pmull v22.1q, v2.1d, v6.1d pmull v23.1q, v3.1d, v6.1d cmp x2, #64 pmull2 v24.1q, v0.2d, v6.2d pmull2 v25.1q, v1.2d, v6.2d pmull2 v26.1q, v2.2d, v6.2d pmull2 v27.1q, v3.2d, v6.2d eor v0.16b, v20.16b, v16.16b eor v1.16b, v21.16b, v17.16b eor v2.16b, v22.16b, v18.16b eor v3.16b, v23.16b, v19.16b eor v0.16b, v0.16b, v24.16b eor v1.16b, v1.16b, v25.16b eor v2.16b, v2.16b, v26.16b eor v3.16b, v3.16b, v27.16b b.hs .Lcrc32r_fold_by_four ld1 {v6.16b}, [x4] ld1 {v5.16b}, [x5] cmp x2, #16 /* Fold 4 to 1. 
*/ pmull v16.1q, v0.1d, v6.1d pmull2 v4.1q, v0.2d, v6.2d eor v0.16b, v16.16b, v1.16b eor v0.16b, v0.16b, v4.16b pmull v16.1q, v0.1d, v6.1d pmull2 v4.1q, v0.2d, v6.2d eor v0.16b, v16.16b, v2.16b eor v0.16b, v0.16b, v4.16b pmull v16.1q, v0.1d, v6.1d pmull2 v4.1q, v0.2d, v6.2d eor v0.16b, v16.16b, v3.16b eor v0.16b, v0.16b, v4.16b b.lo .Lcrc32r_fold_by_one_done b .Lcrc32r_fold_by_one .Lcrc32r_fold_by_one_setup: eor v1.16b, v1.16b, v1.16b add x4, x3, #consts_k(3 - 1) add x5, x3, #consts_my_p(0) sub x2, x2, #16 ld1 {v1.s}[0], [x0] /* load pcrc */ ld1 {v0.16b}, [x1], #16 /* load 16 bytes of input */ cmp x2, #16 ld1 {v6.16b}, [x4] /* load k3k4 */ ld1 {v5.16b}, [x5] /* load my_p */ eor v0.16b, v0.16b, v1.16b b.lo .Lcrc32r_fold_by_one_done .Lcrc32r_fold_by_one: sub x2, x2, #16 ld1 {v2.16b}, [x1], #16 /* load 16 bytes of input */ pmull v3.1q, v0.1d, v6.1d pmull2 v1.1q, v0.2d, v6.2d cmp x2, #16 eor v0.16b, v3.16b, v2.16b eor v0.16b, v0.16b, v1.16b b.hs .Lcrc32r_fold_by_one .Lcrc32r_fold_by_one_done: cmp x2, #0 b.eq .Lcrc32r_final_fold /* Partial fold. */ add x4, x7, #.Lcrc32_refl_shuf_shift - .Lcrc32_constants add x5, x7, #.Lcrc32_refl_shuf_shift - .Lcrc32_constants + 16 add x6, x7, #.Lcrc32_partial_fold_input_mask - .Lcrc32_constants sub x8, x2, #16 add x4, x4, x2 add x5, x5, x2 add x6, x6, x2 add x8, x1, x8 /* Load last input and add padding zeros. */ ld1 {v4.16b}, [x4] eor x2, x2, x2 ld1 {v3.16b}, [x5] ld1 {v2.16b}, [x6] tbl v30.16b, {v0.16b}, v4.16b ld1 {v4.16b}, [x8] tbl v1.16b, {v0.16b}, v3.16b pmull v0.1q, v30.1d, v6.1d and v2.16b, v2.16b, v4.16b pmull2 v31.1q, v30.2d, v6.2d orr v2.16b, v2.16b, v1.16b eor v0.16b, v0.16b, v31.16b eor v0.16b, v0.16b, v2.16b .Lcrc32r_final_fold: /* Final fold. */ eor v2.16b, v2.16b, v2.16b /* zero reg */ ld1 {v7.16b}, [x9] /* reduce 128-bits to 96-bits */ ext v6.16b, v6.16b, v6.16b, #8 /* swap high and low parts */ mov v1.16b, v0.16b pmull v0.1q, v0.1d, v6.1d ext v6.16b, v5.16b, v5.16b, #8 /* swap high and low parts */ ext v1.16b, v1.16b, v2.16b, #8 /* high to low, high zeroed */ eor v3.16b, v0.16b, v1.16b /* reduce 96-bits to 64-bits */ eor v1.16b, v1.16b, v1.16b ext v0.16b, v3.16b, v2.16b, #4 /* [00][00][x2][x1] */ mov v1.s[0], v3.s[0] /* [00][00][00][x0] */ eor v3.16b, v3.16b, v3.16b pmull v1.1q, v1.1d, v7.1d /* [00][00][xx][xx] */ eor v0.16b, v0.16b, v1.16b /* top 64-bit are zero */ /* barrett reduction */ mov v3.s[1], v0.s[0] /* [00][00][x1][00] */ ext v0.16b, v2.16b, v0.16b, #12 /* [??][x1][??][00] */ pmull v1.1q, v3.1d, v5.1d /* [00][xx][xx][00] */ pmull v1.1q, v1.1d, v6.1d /* [00][xx][xx][00] */ eor v0.16b, v0.16b, v1.16b /* store CRC */ st1 {v0.s}[2], [x0] ret CFI_ENDPROC() ELF(.size _gcry_crc32r_armv8_ce_bulk,.-_gcry_crc32r_armv8_ce_bulk;) /* * void _gcry_crc32r_armv8_ce_reduction_4 (u32 *pcrc, u32 data, u32 crc, * const struct crc32_consts_s *consts); */ .align 3 .globl _gcry_crc32r_armv8_ce_reduction_4 ELF(.type _gcry_crc32r_armv8_ce_reduction_4,%function;) _gcry_crc32r_armv8_ce_reduction_4: /* input: * w0: data * w1: crc * x2: crc32 constants */ CFI_STARTPROC() eor v0.16b, v0.16b, v0.16b add x2, x2, #consts_my_p(0) eor v1.16b, v1.16b, v1.16b ld1 {v5.16b}, [x2] mov v0.s[0], w0 pmull v0.1q, v0.1d, v5.1d /* [00][00][xx][xx] */ mov v1.s[1], w1 mov v0.s[2], v0.s[0] /* [00][x0][x1][x0] */ pmull2 v0.1q, v0.2d, v5.2d /* [00][00][xx][xx] */ eor v0.16b, v0.16b, v1.16b mov w0, v0.s[1] ret CFI_ENDPROC() ELF(.size _gcry_crc32r_armv8_ce_reduction_4,.-_gcry_crc32r_armv8_ce_reduction_4;) /* * void _gcry_crc32_armv8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t 
inlen, * const struct crc32_consts_s *consts); */ .align 3 .globl _gcry_crc32_armv8_ce_bulk ELF(.type _gcry_crc32_armv8_ce_bulk,%function;) _gcry_crc32_armv8_ce_bulk: /* input: * x0: pcrc * x1: inbuf * x2: inlen * x3: consts */ CFI_STARTPROC() GET_DATA_POINTER(x7, .Lcrc32_constants) add x4, x7, #.Lcrc32_bswap_shuf - .Lcrc32_constants cmp x2, #128 ld1 {v7.16b}, [x4] b.lo .Lcrc32_fold_by_one_setup eor v4.16b, v4.16b, v4.16b add x4, x3, #consts_k(1 - 1) ld1 {v4.s}[0], [x0] /* load pcrc */ ld1 {v0.16b-v3.16b}, [x1], #64 /* load 64 bytes of input */ sub x2, x2, #64 ld1 {v6.16b}, [x4] eor v0.16b, v0.16b, v4.16b ext v4.16b, v6.16b, v6.16b, #8 tbl v0.16b, { v0.16b }, v7.16b /* byte swap */ tbl v1.16b, { v1.16b }, v7.16b /* byte swap */ tbl v2.16b, { v2.16b }, v7.16b /* byte swap */ tbl v3.16b, { v3.16b }, v7.16b /* byte swap */ add x4, x3, #consts_k(3 - 1) add x5, x3, #consts_my_p(0) .Lcrc32_fold_by_four: /* Fold by 4. */ ld1 {v16.16b-v19.16b}, [x1], #64 /* load 64 bytes of input */ sub x2, x2, #64 tbl v16.16b, { v16.16b }, v7.16b /* byte swap */ tbl v17.16b, { v17.16b }, v7.16b /* byte swap */ tbl v18.16b, { v18.16b }, v7.16b /* byte swap */ tbl v19.16b, { v19.16b }, v7.16b /* byte swap */ cmp x2, #64 pmull2 v20.1q, v0.2d, v4.2d pmull2 v21.1q, v1.2d, v4.2d pmull2 v22.1q, v2.2d, v4.2d pmull2 v23.1q, v3.2d, v4.2d pmull v24.1q, v0.1d, v4.1d pmull v25.1q, v1.1d, v4.1d pmull v26.1q, v2.1d, v4.1d pmull v27.1q, v3.1d, v4.1d eor v0.16b, v20.16b, v16.16b eor v1.16b, v21.16b, v17.16b eor v2.16b, v22.16b, v18.16b eor v3.16b, v23.16b, v19.16b eor v0.16b, v0.16b, v24.16b eor v1.16b, v1.16b, v25.16b eor v2.16b, v2.16b, v26.16b eor v3.16b, v3.16b, v27.16b b.hs .Lcrc32_fold_by_four ld1 {v6.16b}, [x4] ld1 {v5.16b}, [x5] ext v6.16b, v6.16b, v6.16b, #8 ext v5.16b, v5.16b, v5.16b, #8 cmp x2, #16 /* Fold 4 to 1. */ pmull2 v16.1q, v0.2d, v6.2d pmull v4.1q, v0.1d, v6.1d eor v0.16b, v16.16b, v1.16b eor v0.16b, v0.16b, v4.16b pmull2 v16.1q, v0.2d, v6.2d pmull v4.1q, v0.1d, v6.1d eor v0.16b, v16.16b, v2.16b eor v0.16b, v0.16b, v4.16b pmull2 v16.1q, v0.2d, v6.2d pmull v4.1q, v0.1d, v6.1d eor v0.16b, v16.16b, v3.16b eor v0.16b, v0.16b, v4.16b b.lo .Lcrc32_fold_by_one_done b .Lcrc32_fold_by_one .Lcrc32_fold_by_one_setup: eor v1.16b, v1.16b, v1.16b add x4, x3, #consts_k(3 - 1) add x5, x3, #consts_my_p(0) ld1 {v1.s}[0], [x0] /* load pcrc */ sub x2, x2, #16 ld1 {v0.16b}, [x1], #16 /* load 16 bytes of input */ ld1 {v6.16b}, [x4] /* load k3k4 */ ld1 {v5.16b}, [x5] /* load my_p */ eor v0.16b, v0.16b, v1.16b cmp x2, #16 ext v6.16b, v6.16b, v6.16b, #8 /* swap high and low parts */ ext v5.16b, v5.16b, v5.16b, #8 /* swap high and low parts */ tbl v0.16b, { v0.16b }, v7.16b /* byte swap */ b.lo .Lcrc32_fold_by_one_done .Lcrc32_fold_by_one: sub x2, x2, #16 ld1 {v2.16b}, [x1], #16 /* load 16 bytes of input */ pmull2 v3.1q, v0.2d, v6.2d tbl v2.16b, { v2.16b }, v7.16b /* byte swap */ pmull v1.1q, v0.1d, v6.1d cmp x2, #16 eor v0.16b, v3.16b, v2.16b eor v0.16b, v0.16b, v1.16b b.hs .Lcrc32_fold_by_one .Lcrc32_fold_by_one_done: cmp x2, #0 b.eq .Lcrc32_final_fold /* Partial fold. */ add x4, x7, #.Lcrc32_refl_shuf_shift - .Lcrc32_constants + 32 add x5, x7, #.Lcrc32_shuf_shift - .Lcrc32_constants + 16 add x6, x7, #.Lcrc32_partial_fold_input_mask - .Lcrc32_constants sub x8, x2, #16 sub x4, x4, x2 add x5, x5, x2 add x6, x6, x2 add x8, x1, x8 /* Load last input and add padding zeros. 
*/ ld1 {v4.16b}, [x4] eor x2, x2, x2 ld1 {v3.16b}, [x5] ld1 {v2.16b}, [x6] tbl v30.16b, {v0.16b}, v4.16b ld1 {v4.16b}, [x8] tbl v1.16b, {v0.16b}, v3.16b and v2.16b, v2.16b, v4.16b pmull2 v0.1q, v30.2d, v6.2d orr v2.16b, v2.16b, v1.16b pmull v1.1q, v30.1d, v6.1d tbl v2.16b, {v2.16b}, v7.16b /* byte swap */ eor v0.16b, v0.16b, v1.16b eor v0.16b, v0.16b, v2.16b .Lcrc32_final_fold: /* Final fold. */ eor v2.16b, v2.16b, v2.16b /* zero reg */ /* reduce 128-bits to 96-bits */ add x4, x3, #consts_k(4) ext v3.16b, v6.16b, v6.16b, #8 /* swap high and low parts */ eor v6.16b, v6.16b, v6.16b mov v1.16b, v0.16b pmull2 v0.1q, v0.2d, v3.2d ld1 {v6.d}[1], [x4] /* load k4 */ ext v1.16b, v2.16b, v1.16b, #8 /* low to high, low zeroed */ eor v3.16b, v0.16b, v1.16b /* bottom 32-bit are zero */ /* reduce 96-bits to 64-bits */ eor v0.16b, v0.16b, v0.16b eor v1.16b, v1.16b, v1.16b mov v0.s[1], v3.s[1] /* [00][00][x1][00] */ mov v1.s[2], v3.s[3] /* [00][x3][00][00] */ mov v0.s[2], v3.s[2] /* [00][x2][x1][00] */ eor v3.16b, v3.16b, v3.16b pmull2 v1.1q, v1.2d, v6.2d /* [00][xx][xx][00] */ eor v0.16b, v0.16b, v1.16b /* top and bottom 32-bit are zero */ /* barrett reduction */ mov v3.s[0], v0.s[1] /* [00][00][00][x1] */ pmull2 v0.1q, v0.2d, v5.2d /* [00][xx][xx][xx] */ ext v0.16b, v0.16b, v2.16b, #4 /* [00][00][xx][xx] */ pmull v0.1q, v0.1d, v5.1d eor v0.16b, v0.16b, v3.16b /* store CRC in input endian */ rev32 v0.8b, v0.8b /* byte swap */ st1 {v0.s}[0], [x0] ret CFI_ENDPROC() ELF(.size _gcry_crc32_armv8_ce_bulk,.-_gcry_crc32_armv8_ce_bulk;) /* * void _gcry_crc32_armv8_ce_reduction_4 (u32 *pcrc, u32 data, u32 crc, * const struct crc32_consts_s *consts); */ .align 3 .globl _gcry_crc32_armv8_ce_reduction_4 ELF(.type _gcry_crc32_armv8_ce_reduction_4,%function;) _gcry_crc32_armv8_ce_reduction_4: /* input: * w0: data * w1: crc * x2: crc32 constants */ CFI_STARTPROC() eor v0.16b, v0.16b, v0.16b add x2, x2, #consts_my_p(0) eor v1.16b, v1.16b, v1.16b ld1 {v5.16b}, [x2] mov v0.s[1], w0 pmull v0.1q, v0.1d, v5.1d /* [00][xx][xx][00] */ mov v1.s[0], w1 pmull2 v0.1q, v0.2d, v5.2d /* [00][00][xx][xx] */ eor v0.16b, v0.16b, v1.16b rev32 v0.8b, v0.8b /* Return in input endian */ mov w0, v0.s[0] ret CFI_ENDPROC() ELF(.size _gcry_crc32_armv8_ce_reduction_4,.-_gcry_crc32_armv8_ce_reduction_4;) #endif diff --git a/cipher/rijndael-armv8-aarch64-ce.S b/cipher/rijndael-armv8-aarch64-ce.S index 71b45b85..3af29e0d 100644 --- a/cipher/rijndael-armv8-aarch64-ce.S +++ b/cipher/rijndael-armv8-aarch64-ce.S @@ -1,1618 +1,1613 @@ /* rijndael-armv8-aarch64-ce.S - ARMv8/CE accelerated AES * Copyright (C) 2016 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
*/ #include "asm-common-aarch64.h" #if defined(__AARCH64EL__) && \ defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) .cpu generic+simd+crypto .text -#define GET_DATA_POINTER(reg, name) \ - adrp reg, :got:name ; \ - ldr reg, [reg, #:got_lo12:name] ; - - /* Register macros */ #define vk0 v17 #define vk1 v18 #define vk2 v19 #define vk3 v20 #define vk4 v21 #define vk5 v22 #define vk6 v23 #define vk7 v24 #define vk8 v25 #define vk9 v26 #define vk10 v27 #define vk11 v28 #define vk12 v29 #define vk13 v30 #define vk14 v31 /* AES macros */ #define aes_preload_keys(keysched, nrounds) \ cmp nrounds, #12; \ ld1 {vk0.16b-vk3.16b}, [keysched], #64; \ ld1 {vk4.16b-vk7.16b}, [keysched], #64; \ ld1 {vk8.16b-vk10.16b}, [keysched], #48; \ b.lo 1f; \ ld1 {vk11.16b-vk12.16b}, [keysched], #32; \ b.eq 1f; \ ld1 {vk13.16b-vk14.16b}, [keysched]; \ 1: ; #define do_aes_one128(ed, mcimc, vo, vb) \ aes##ed vb.16b, vk0.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk1.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk2.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk3.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk4.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk5.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk6.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk7.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk8.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk9.16b; \ eor vo.16b, vb.16b, vk10.16b; #define do_aes_one192(ed, mcimc, vo, vb) \ aes##ed vb.16b, vk0.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk1.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk2.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk3.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk4.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk5.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk6.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk7.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk8.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk9.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk10.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk11.16b; \ eor vo.16b, vb.16b, vk12.16b; #define do_aes_one256(ed, mcimc, vo, vb) \ aes##ed vb.16b, vk0.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk1.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk2.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk3.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk4.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk5.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk6.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk7.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk8.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk9.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk10.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk11.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk12.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk13.16b; \ eor vo.16b, vb.16b, vk14.16b; #define aes_round_4(ed, mcimc, b0, b1, b2, b3, key) \ aes##ed b0.16b, key.16b; \ aes##mcimc b0.16b, b0.16b; \ aes##ed b1.16b, key.16b; \ aes##mcimc b1.16b, b1.16b; \ aes##ed b2.16b, key.16b; \ aes##mcimc b2.16b, b2.16b; \ aes##ed b3.16b, key.16b; \ aes##mcimc b3.16b, b3.16b; #define aes_lastround_4(ed, b0, b1, b2, b3, key1, key2) \ aes##ed b0.16b, key1.16b; \ eor b0.16b, b0.16b, key2.16b; \ aes##ed b1.16b, key1.16b; \ eor b1.16b, b1.16b, key2.16b; \ aes##ed b2.16b, key1.16b; \ eor b2.16b, b2.16b, key2.16b; 
\ aes##ed b3.16b, key1.16b; \ eor b3.16b, b3.16b, key2.16b; #define do_aes_4_128(ed, mcimc, b0, b1, b2, b3) \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk0); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk1); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk2); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk3); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk4); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk5); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk6); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk7); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk8); \ aes_lastround_4(ed, b0, b1, b2, b3, vk9, vk10); #define do_aes_4_192(ed, mcimc, b0, b1, b2, b3) \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk0); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk1); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk2); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk3); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk4); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk5); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk6); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk7); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk8); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk9); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk10); \ aes_lastround_4(ed, b0, b1, b2, b3, vk11, vk12); #define do_aes_4_256(ed, mcimc, b0, b1, b2, b3) \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk0); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk1); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk2); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk3); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk4); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk5); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk6); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk7); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk8); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk9); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk10); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk11); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk12); \ aes_lastround_4(ed, b0, b1, b2, b3, vk13, vk14); /* Other functional macros */ #define CLEAR_REG(reg) eor reg.16b, reg.16b, reg.16b; #define aes_clear_keys(nrounds) \ cmp nrounds, #12; \ CLEAR_REG(vk0); \ CLEAR_REG(vk1); \ CLEAR_REG(vk2); \ CLEAR_REG(vk3); \ CLEAR_REG(vk4); \ CLEAR_REG(vk5); \ CLEAR_REG(vk6); \ CLEAR_REG(vk7); \ CLEAR_REG(vk9); \ CLEAR_REG(vk8); \ CLEAR_REG(vk10); \ b.lo 1f; \ CLEAR_REG(vk11); \ CLEAR_REG(vk12); \ b.eq 1f; \ CLEAR_REG(vk13); \ CLEAR_REG(vk14); \ 1: ; /* * unsigned int _gcry_aes_enc_armv8_ce(void *keysched, byte *dst, * const byte *src, * unsigned int nrounds); */ .align 3 .globl _gcry_aes_enc_armv8_ce ELF(.type _gcry_aes_enc_armv8_ce,%function;) _gcry_aes_enc_armv8_ce: /* input: * x0: keysched * x1: dst * x2: src * w3: nrounds */ CFI_STARTPROC(); aes_preload_keys(x0, w3); ld1 {v0.16b}, [x2] b.hi .Lenc1_256 b.eq .Lenc1_192 .Lenc1_128: do_aes_one128(e, mc, v0, v0); .Lenc1_tail: CLEAR_REG(vk0) CLEAR_REG(vk1) CLEAR_REG(vk2) CLEAR_REG(vk3) CLEAR_REG(vk4) CLEAR_REG(vk5) CLEAR_REG(vk6) CLEAR_REG(vk7) CLEAR_REG(vk8) CLEAR_REG(vk9) CLEAR_REG(vk10) st1 {v0.16b}, [x1] CLEAR_REG(v0) mov x0, #0 ret .Lenc1_192: do_aes_one192(e, mc, v0, v0); CLEAR_REG(vk11) CLEAR_REG(vk12) b .Lenc1_tail .Lenc1_256: do_aes_one256(e, mc, v0, v0); CLEAR_REG(vk11) CLEAR_REG(vk12) CLEAR_REG(vk13) CLEAR_REG(vk14) b .Lenc1_tail CFI_ENDPROC(); ELF(.size _gcry_aes_enc_armv8_ce,.-_gcry_aes_enc_armv8_ce;) /* * unsigned int _gcry_aes_dec_armv8_ce(void *keysched, byte *dst, * const byte *src, * unsigned int nrounds); */ .align 3 .globl _gcry_aes_dec_armv8_ce ELF(.type _gcry_aes_dec_armv8_ce,%function;) _gcry_aes_dec_armv8_ce: /* input: * x0: keysched * x1: dst * x2: src * 
w3: nrounds */ CFI_STARTPROC(); aes_preload_keys(x0, w3); ld1 {v0.16b}, [x2] b.hi .Ldec1_256 b.eq .Ldec1_192 .Ldec1_128: do_aes_one128(d, imc, v0, v0); .Ldec1_tail: CLEAR_REG(vk0) CLEAR_REG(vk1) CLEAR_REG(vk2) CLEAR_REG(vk3) CLEAR_REG(vk4) CLEAR_REG(vk5) CLEAR_REG(vk6) CLEAR_REG(vk7) CLEAR_REG(vk8) CLEAR_REG(vk9) CLEAR_REG(vk10) st1 {v0.16b}, [x1] CLEAR_REG(v0) mov x0, #0 ret .Ldec1_192: do_aes_one192(d, imc, v0, v0); CLEAR_REG(vk11) CLEAR_REG(vk12) b .Ldec1_tail .Ldec1_256: do_aes_one256(d, imc, v0, v0); CLEAR_REG(vk11) CLEAR_REG(vk12) CLEAR_REG(vk13) CLEAR_REG(vk14) b .Ldec1_tail CFI_ENDPROC(); ELF(.size _gcry_aes_dec_armv8_ce,.-_gcry_aes_dec_armv8_ce;) /* * void _gcry_aes_cbc_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *iv, size_t nblocks, * int cbc_mac, unsigned int nrounds); */ .align 3 .globl _gcry_aes_cbc_enc_armv8_ce ELF(.type _gcry_aes_cbc_enc_armv8_ce,%function;) _gcry_aes_cbc_enc_armv8_ce: /* input: * x0: keysched * x1: outbuf * x2: inbuf * x3: iv * x4: nblocks * w5: cbc_mac * w6: nrounds */ CFI_STARTPROC(); cbz x4, .Lcbc_enc_skip cmp w5, #0 ld1 {v1.16b}, [x3] /* load IV */ cset x5, eq aes_preload_keys(x0, w6); lsl x5, x5, #4 b.eq .Lcbc_enc_loop192 b.hi .Lcbc_enc_loop256 #define CBC_ENC(bits) \ .Lcbc_enc_loop##bits: \ ld1 {v0.16b}, [x2], #16; /* load plaintext */ \ eor v1.16b, v0.16b, v1.16b; \ sub x4, x4, #1; \ \ do_aes_one##bits(e, mc, v1, v1); \ \ st1 {v1.16b}, [x1], x5; /* store ciphertext */ \ \ cbnz x4, .Lcbc_enc_loop##bits; \ b .Lcbc_enc_done; CBC_ENC(128) CBC_ENC(192) CBC_ENC(256) #undef CBC_ENC .Lcbc_enc_done: aes_clear_keys(w6) st1 {v1.16b}, [x3] /* store IV */ CLEAR_REG(v1) CLEAR_REG(v0) .Lcbc_enc_skip: ret CFI_ENDPROC(); ELF(.size _gcry_aes_cbc_enc_armv8_ce,.-_gcry_aes_cbc_enc_armv8_ce;) /* * void _gcry_aes_cbc_dec_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *iv, unsigned int nrounds); */ .align 3 .globl _gcry_aes_cbc_dec_armv8_ce ELF(.type _gcry_aes_cbc_dec_armv8_ce,%function;) _gcry_aes_cbc_dec_armv8_ce: /* input: * x0: keysched * x1: outbuf * x2: inbuf * x3: iv * x4: nblocks * w5: nrounds */ CFI_STARTPROC(); cbz x4, .Lcbc_dec_skip ld1 {v0.16b}, [x3] /* load IV */ aes_preload_keys(x0, w5); b.eq .Lcbc_dec_entry_192 b.hi .Lcbc_dec_entry_256 #define CBC_DEC(bits) \ .Lcbc_dec_entry_##bits: \ cmp x4, #4; \ b.lo .Lcbc_dec_loop_##bits; \ \ .Lcbc_dec_loop4_##bits: \ \ ld1 {v1.16b-v4.16b}, [x2], #64; /* load ciphertext */ \ sub x4, x4, #4; \ mov v5.16b, v1.16b; \ mov v6.16b, v2.16b; \ mov v7.16b, v3.16b; \ mov v16.16b, v4.16b; \ cmp x4, #4; \ \ do_aes_4_##bits(d, imc, v1, v2, v3, v4); \ \ eor v1.16b, v1.16b, v0.16b; \ eor v2.16b, v2.16b, v5.16b; \ st1 {v1.16b-v2.16b}, [x1], #32; /* store plaintext */ \ eor v3.16b, v3.16b, v6.16b; \ eor v4.16b, v4.16b, v7.16b; \ mov v0.16b, v16.16b; /* next IV */ \ st1 {v3.16b-v4.16b}, [x1], #32; /* store plaintext */ \ \ b.hs .Lcbc_dec_loop4_##bits; \ CLEAR_REG(v3); \ CLEAR_REG(v4); \ CLEAR_REG(v5); \ CLEAR_REG(v6); \ CLEAR_REG(v7); \ CLEAR_REG(v16); \ cbz x4, .Lcbc_dec_done; \ \ .Lcbc_dec_loop_##bits: \ ld1 {v1.16b}, [x2], #16; /* load ciphertext */ \ sub x4, x4, #1; \ mov v2.16b, v1.16b; \ \ do_aes_one##bits(d, imc, v1, v1); \ \ eor v1.16b, v1.16b, v0.16b; \ mov v0.16b, v2.16b; \ st1 {v1.16b}, [x1], #16; /* store plaintext */ \ \ cbnz x4, .Lcbc_dec_loop_##bits; \ b .Lcbc_dec_done; CBC_DEC(128) CBC_DEC(192) CBC_DEC(256) #undef CBC_DEC .Lcbc_dec_done: aes_clear_keys(w5) st1 {v0.16b}, [x3] /* store IV */ CLEAR_REG(v0) 
CLEAR_REG(v1) CLEAR_REG(v2) .Lcbc_dec_skip: ret CFI_ENDPROC(); ELF(.size _gcry_aes_cbc_dec_armv8_ce,.-_gcry_aes_cbc_dec_armv8_ce;) /* * void _gcry_aes_ctr_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *iv, unsigned int nrounds); */ .align 3 .globl _gcry_aes_ctr_enc_armv8_ce ELF(.type _gcry_aes_ctr_enc_armv8_ce,%function;) _gcry_aes_ctr_enc_armv8_ce: /* input: * r0: keysched * r1: outbuf * r2: inbuf * r3: iv * x4: nblocks * w5: nrounds */ CFI_STARTPROC(); cbz x4, .Lctr_enc_skip mov x6, #1 movi v16.16b, #0 mov v16.D[1], x6 /* load IV */ ldp x9, x10, [x3] ld1 {v0.16b}, [x3] rev x9, x9 rev x10, x10 aes_preload_keys(x0, w5); b.eq .Lctr_enc_entry_192 b.hi .Lctr_enc_entry_256 #define CTR_ENC(bits) \ .Lctr_enc_entry_##bits: \ cmp x4, #4; \ b.lo .Lctr_enc_loop_##bits; \ \ .Lctr_enc_loop4_##bits: \ cmp x10, #0xfffffffffffffffc; \ sub x4, x4, #4; \ b.lo .Lctr_enc_loop4_##bits##_nocarry; \ \ adds x10, x10, #1; \ mov v1.16b, v0.16b; \ adc x9, x9, xzr; \ mov v2.D[1], x10; \ mov v2.D[0], x9; \ \ adds x10, x10, #1; \ rev64 v2.16b, v2.16b; \ adc x9, x9, xzr; \ mov v3.D[1], x10; \ mov v3.D[0], x9; \ \ adds x10, x10, #1; \ rev64 v3.16b, v3.16b; \ adc x9, x9, xzr; \ mov v4.D[1], x10; \ mov v4.D[0], x9; \ \ adds x10, x10, #1; \ rev64 v4.16b, v4.16b; \ adc x9, x9, xzr; \ mov v0.D[1], x10; \ mov v0.D[0], x9; \ rev64 v0.16b, v0.16b; \ \ b .Lctr_enc_loop4_##bits##_store_ctr; \ \ .Lctr_enc_loop4_##bits##_nocarry: \ \ add v3.2d, v16.2d, v16.2d; /* 2 */ \ rev64 v6.16b, v0.16b; \ add x10, x10, #4; \ add v4.2d, v3.2d, v16.2d; /* 3 */ \ add v0.2d, v3.2d, v3.2d; /* 4 */ \ rev64 v1.16b, v6.16b; \ add v2.2d, v6.2d, v16.2d; \ add v3.2d, v6.2d, v3.2d; \ add v4.2d, v6.2d, v4.2d; \ add v0.2d, v6.2d, v0.2d; \ rev64 v2.16b, v2.16b; \ rev64 v3.16b, v3.16b; \ rev64 v0.16b, v0.16b; \ rev64 v4.16b, v4.16b; \ \ .Lctr_enc_loop4_##bits##_store_ctr: \ \ st1 {v0.16b}, [x3]; \ cmp x4, #4; \ ld1 {v5.16b-v7.16b}, [x2], #48; /* preload ciphertext */ \ \ do_aes_4_##bits(e, mc, v1, v2, v3, v4); \ \ eor v1.16b, v1.16b, v5.16b; \ ld1 {v5.16b}, [x2], #16; /* load ciphertext */ \ eor v2.16b, v2.16b, v6.16b; \ eor v3.16b, v3.16b, v7.16b; \ eor v4.16b, v4.16b, v5.16b; \ st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \ \ b.hs .Lctr_enc_loop4_##bits; \ CLEAR_REG(v3); \ CLEAR_REG(v4); \ CLEAR_REG(v5); \ CLEAR_REG(v6); \ CLEAR_REG(v7); \ cbz x4, .Lctr_enc_done; \ \ .Lctr_enc_loop_##bits: \ \ adds x10, x10, #1; \ mov v1.16b, v0.16b; \ adc x9, x9, xzr; \ mov v0.D[1], x10; \ mov v0.D[0], x9; \ sub x4, x4, #1; \ ld1 {v2.16b}, [x2], #16; /* load ciphertext */ \ rev64 v0.16b, v0.16b; \ \ do_aes_one##bits(e, mc, v1, v1); \ \ eor v1.16b, v2.16b, v1.16b; \ st1 {v1.16b}, [x1], #16; /* store plaintext */ \ \ cbnz x4, .Lctr_enc_loop_##bits; \ b .Lctr_enc_done; CTR_ENC(128) CTR_ENC(192) CTR_ENC(256) #undef CTR_ENC .Lctr_enc_done: aes_clear_keys(w5) st1 {v0.16b}, [x3] /* store IV */ CLEAR_REG(v0) CLEAR_REG(v1) CLEAR_REG(v2) .Lctr_enc_skip: ret CFI_ENDPROC(); ELF(.size _gcry_aes_ctr_enc_armv8_ce,.-_gcry_aes_ctr_enc_armv8_ce;) /* * void _gcry_aes_cfb_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *iv, unsigned int nrounds); */ .align 3 .globl _gcry_aes_cfb_enc_armv8_ce ELF(.type _gcry_aes_cfb_enc_armv8_ce,%function;) _gcry_aes_cfb_enc_armv8_ce: /* input: * r0: keysched * r1: outbuf * r2: inbuf * r3: iv * x4: nblocks * w5: nrounds */ CFI_STARTPROC(); cbz x4, .Lcfb_enc_skip /* load IV */ ld1 {v0.16b}, [x3] aes_preload_keys(x0, w5); b.eq 
.Lcfb_enc_entry_192 b.hi .Lcfb_enc_entry_256 #define CFB_ENC(bits) \ .Lcfb_enc_entry_##bits: \ .Lcfb_enc_loop_##bits: \ ld1 {v1.16b}, [x2], #16; /* load plaintext */ \ sub x4, x4, #1; \ \ do_aes_one##bits(e, mc, v0, v0); \ \ eor v0.16b, v1.16b, v0.16b; \ st1 {v0.16b}, [x1], #16; /* store ciphertext */ \ \ cbnz x4, .Lcfb_enc_loop_##bits; \ b .Lcfb_enc_done; CFB_ENC(128) CFB_ENC(192) CFB_ENC(256) #undef CFB_ENC .Lcfb_enc_done: aes_clear_keys(w5) st1 {v0.16b}, [x3] /* store IV */ CLEAR_REG(v0) CLEAR_REG(v1) .Lcfb_enc_skip: ret CFI_ENDPROC(); ELF(.size _gcry_aes_cfb_enc_armv8_ce,.-_gcry_aes_cfb_enc_armv8_ce;) /* * void _gcry_aes_cfb_dec_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *iv, unsigned int nrounds); */ .align 3 .globl _gcry_aes_cfb_dec_armv8_ce ELF(.type _gcry_aes_cfb_dec_armv8_ce,%function;) _gcry_aes_cfb_dec_armv8_ce: /* input: * r0: keysched * r1: outbuf * r2: inbuf * r3: iv * x4: nblocks * w5: nrounds */ CFI_STARTPROC(); cbz x4, .Lcfb_dec_skip /* load IV */ ld1 {v0.16b}, [x3] aes_preload_keys(x0, w5); b.eq .Lcfb_dec_entry_192 b.hi .Lcfb_dec_entry_256 #define CFB_DEC(bits) \ .Lcfb_dec_entry_##bits: \ cmp x4, #4; \ b.lo .Lcfb_dec_loop_##bits; \ \ .Lcfb_dec_loop4_##bits: \ \ ld1 {v2.16b-v4.16b}, [x2], #48; /* load ciphertext */ \ mov v1.16b, v0.16b; \ sub x4, x4, #4; \ cmp x4, #4; \ mov v5.16b, v2.16b; \ mov v6.16b, v3.16b; \ mov v7.16b, v4.16b; \ ld1 {v0.16b}, [x2], #16; /* load next IV / ciphertext */ \ \ do_aes_4_##bits(e, mc, v1, v2, v3, v4); \ \ eor v1.16b, v1.16b, v5.16b; \ eor v2.16b, v2.16b, v6.16b; \ eor v3.16b, v3.16b, v7.16b; \ eor v4.16b, v4.16b, v0.16b; \ st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \ \ b.hs .Lcfb_dec_loop4_##bits; \ CLEAR_REG(v3); \ CLEAR_REG(v4); \ CLEAR_REG(v5); \ CLEAR_REG(v6); \ CLEAR_REG(v7); \ cbz x4, .Lcfb_dec_done; \ \ .Lcfb_dec_loop_##bits: \ \ ld1 {v1.16b}, [x2], #16; /* load ciphertext */ \ \ sub x4, x4, #1; \ \ do_aes_one##bits(e, mc, v0, v0); \ \ eor v2.16b, v1.16b, v0.16b; \ mov v0.16b, v1.16b; \ st1 {v2.16b}, [x1], #16; /* store plaintext */ \ \ cbnz x4, .Lcfb_dec_loop_##bits; \ b .Lcfb_dec_done; CFB_DEC(128) CFB_DEC(192) CFB_DEC(256) #undef CFB_DEC .Lcfb_dec_done: aes_clear_keys(w5) st1 {v0.16b}, [x3] /* store IV */ CLEAR_REG(v0) CLEAR_REG(v1) CLEAR_REG(v2) .Lcfb_dec_skip: ret CFI_ENDPROC(); ELF(.size _gcry_aes_cfb_dec_armv8_ce,.-_gcry_aes_cfb_dec_armv8_ce;) /* * void _gcry_aes_ocb_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *offset, * unsigned char *checksum, * unsigned char *L_table, * size_t nblocks, * unsigned int nrounds, * unsigned int blkn); */ .align 3 .globl _gcry_aes_ocb_enc_armv8_ce ELF(.type _gcry_aes_ocb_enc_armv8_ce,%function;) _gcry_aes_ocb_enc_armv8_ce: /* input: * x0: keysched * x1: outbuf * x2: inbuf * x3: offset * x4: checksum * x5: Ltable * x6: nblocks (0 < nblocks <= 32) * w7: nrounds * %st+0: blkn => w12 */ CFI_STARTPROC(); ldr w12, [sp] ld1 {v0.16b}, [x3] /* load offset */ ld1 {v16.16b}, [x4] /* load checksum */ aes_preload_keys(x0, w7); b.eq .Locb_enc_entry_192 b.hi .Locb_enc_entry_256 #define OCB_ENC(bits, ...) 
\ .Locb_enc_entry_##bits: \ cmp x6, #4; \ add x12, x12, #1; \ b.lo .Locb_enc_loop_##bits; \ \ .Locb_enc_loop4_##bits: \ \ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ /* Checksum_i = Checksum_{i-1} xor P_i */ \ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ \ \ add w9, w12, #1; \ add w10, w12, #2; \ add w11, w12, #3; \ rbit w8, w12; \ add w12, w12, #4; \ rbit w9, w9; \ rbit w10, w10; \ rbit w11, w11; \ clz w8, w8; /* ntz(i+0) */ \ clz w9, w9; /* ntz(i+1) */ \ clz w10, w10; /* ntz(i+2) */ \ clz w11, w11; /* ntz(i+3) */ \ add x8, x5, x8, lsl #4; \ ld1 {v1.16b-v4.16b}, [x2], #64; /* load P_i+<0-3> */ \ add x9, x5, x9, lsl #4; \ add x10, x5, x10, lsl #4; \ add x11, x5, x11, lsl #4; \ \ sub x6, x6, #4; \ \ ld1 {v5.16b}, [x8]; /* load L_{ntz(i+0)} */ \ eor v16.16b, v16.16b, v1.16b; /* Checksum_i+0 */ \ ld1 {v6.16b}, [x9]; /* load L_{ntz(i+1)} */ \ eor v16.16b, v16.16b, v2.16b; /* Checksum_i+1 */ \ ld1 {v7.16b}, [x10]; /* load L_{ntz(i+2)} */ \ eor v16.16b, v16.16b, v3.16b; /* Checksum_i+2 */ \ eor v5.16b, v5.16b, v0.16b; /* Offset_i+0 */ \ ld1 {v0.16b}, [x11]; /* load L_{ntz(i+3)} */ \ eor v16.16b, v16.16b, v4.16b; /* Checksum_i+3 */ \ eor v6.16b, v6.16b, v5.16b; /* Offset_i+1 */ \ eor v1.16b, v1.16b, v5.16b; /* P_i+0 xor Offset_i+0 */ \ eor v7.16b, v7.16b, v6.16b; /* Offset_i+2 */ \ eor v2.16b, v2.16b, v6.16b; /* P_i+1 xor Offset_i+1 */ \ eor v0.16b, v0.16b, v7.16b; /* Offset_i+3 */ \ cmp x6, #4; \ eor v3.16b, v3.16b, v7.16b; /* P_i+2 xor Offset_i+2 */ \ eor v4.16b, v4.16b, v0.16b; /* P_i+3 xor Offset_i+3 */ \ \ do_aes_4_##bits(e, mc, v1, v2, v3, v4); \ \ eor v1.16b, v1.16b, v5.16b; /* xor Offset_i+0 */ \ eor v2.16b, v2.16b, v6.16b; /* xor Offset_i+1 */ \ eor v3.16b, v3.16b, v7.16b; /* xor Offset_i+2 */ \ eor v4.16b, v4.16b, v0.16b; /* xor Offset_i+3 */ \ st1 {v1.16b-v4.16b}, [x1], #64; \ \ b.hs .Locb_enc_loop4_##bits; \ CLEAR_REG(v3); \ CLEAR_REG(v4); \ CLEAR_REG(v5); \ CLEAR_REG(v6); \ CLEAR_REG(v7); \ cbz x6, .Locb_enc_done; \ \ .Locb_enc_loop_##bits: \ \ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ /* Checksum_i = Checksum_{i-1} xor P_i */ \ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ \ \ rbit x8, x12; \ add x12, x12, #1; \ clz x8, x8; /* ntz(i) */ \ add x8, x5, x8, lsl #4; \ \ ld1 {v1.16b}, [x2], #16; /* load plaintext */ \ ld1 {v2.16b}, [x8]; /* load L_{ntz(i)} */ \ sub x6, x6, #1; \ eor v0.16b, v0.16b, v2.16b; \ eor v16.16b, v16.16b, v1.16b; \ eor v1.16b, v1.16b, v0.16b; \ \ do_aes_one##bits(e, mc, v1, v1); \ \ eor v1.16b, v1.16b, v0.16b; \ st1 {v1.16b}, [x1], #16; /* store ciphertext */ \ \ cbnz x6, .Locb_enc_loop_##bits; \ b .Locb_enc_done; OCB_ENC(128) OCB_ENC(192) OCB_ENC(256) #undef OCB_ENC .Locb_enc_done: aes_clear_keys(w7) st1 {v16.16b}, [x4] /* store checksum */ st1 {v0.16b}, [x3] /* store offset */ CLEAR_REG(v0) CLEAR_REG(v1) CLEAR_REG(v2) CLEAR_REG(v16) ret CFI_ENDPROC(); ELF(.size _gcry_aes_ocb_enc_armv8_ce,.-_gcry_aes_ocb_enc_armv8_ce;) /* * void _gcry_aes_ocb_dec_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *offset, * unsigned char *checksum, * unsigned char *L_table, * size_t nblocks, * unsigned int nrounds, * unsigned int blkn); */ .align 3 .globl _gcry_aes_ocb_dec_armv8_ce ELF(.type _gcry_aes_ocb_dec_armv8_ce,%function;) _gcry_aes_ocb_dec_armv8_ce: /* input: * x0: keysched * x1: outbuf * x2: inbuf * x3: offset * x4: checksum * x5: Ltable * x6: nblocks (0 < nblocks <= 32) * w7: nrounds * %st+0: blkn => w12 */ CFI_STARTPROC(); ldr w12, [sp] ld1 {v0.16b}, [x3] /* load offset */ ld1 {v16.16b}, 
[x4] /* load checksum */ aes_preload_keys(x0, w7); b.eq .Locb_dec_entry_192 b.hi .Locb_dec_entry_256 #define OCB_DEC(bits) \ .Locb_dec_entry_##bits: \ cmp x6, #4; \ add w12, w12, #1; \ b.lo .Locb_dec_loop_##bits; \ \ .Locb_dec_loop4_##bits: \ \ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ \ /* Checksum_i = Checksum_{i-1} xor P_i */ \ \ add w9, w12, #1; \ add w10, w12, #2; \ add w11, w12, #3; \ rbit w8, w12; \ add w12, w12, #4; \ rbit w9, w9; \ rbit w10, w10; \ rbit w11, w11; \ clz w8, w8; /* ntz(i+0) */ \ clz w9, w9; /* ntz(i+1) */ \ clz w10, w10; /* ntz(i+2) */ \ clz w11, w11; /* ntz(i+3) */ \ add x8, x5, x8, lsl #4; \ ld1 {v1.16b-v4.16b}, [x2], #64; /* load C_i+<0-3> */ \ add x9, x5, x9, lsl #4; \ add x10, x5, x10, lsl #4; \ add x11, x5, x11, lsl #4; \ \ sub x6, x6, #4; \ \ ld1 {v5.16b}, [x8]; /* load L_{ntz(i+0)} */ \ ld1 {v6.16b}, [x9]; /* load L_{ntz(i+1)} */ \ ld1 {v7.16b}, [x10]; /* load L_{ntz(i+2)} */ \ eor v5.16b, v5.16b, v0.16b; /* Offset_i+0 */ \ ld1 {v0.16b}, [x11]; /* load L_{ntz(i+3)} */ \ eor v6.16b, v6.16b, v5.16b; /* Offset_i+1 */ \ eor v1.16b, v1.16b, v5.16b; /* C_i+0 xor Offset_i+0 */ \ eor v7.16b, v7.16b, v6.16b; /* Offset_i+2 */ \ eor v2.16b, v2.16b, v6.16b; /* C_i+1 xor Offset_i+1 */ \ eor v0.16b, v0.16b, v7.16b; /* Offset_i+3 */ \ cmp x6, #4; \ eor v3.16b, v3.16b, v7.16b; /* C_i+2 xor Offset_i+2 */ \ eor v4.16b, v4.16b, v0.16b; /* C_i+3 xor Offset_i+3 */ \ \ do_aes_4_##bits(d, imc, v1, v2, v3, v4); \ \ eor v1.16b, v1.16b, v5.16b; /* xor Offset_i+0 */ \ eor v2.16b, v2.16b, v6.16b; /* xor Offset_i+1 */ \ eor v16.16b, v16.16b, v1.16b; /* Checksum_i+0 */ \ eor v3.16b, v3.16b, v7.16b; /* xor Offset_i+2 */ \ eor v16.16b, v16.16b, v2.16b; /* Checksum_i+1 */ \ eor v4.16b, v4.16b, v0.16b; /* xor Offset_i+3 */ \ eor v16.16b, v16.16b, v3.16b; /* Checksum_i+2 */ \ eor v16.16b, v16.16b, v4.16b; /* Checksum_i+3 */ \ st1 {v1.16b-v4.16b}, [x1], #64; \ \ b.hs .Locb_dec_loop4_##bits; \ CLEAR_REG(v3); \ CLEAR_REG(v4); \ CLEAR_REG(v5); \ CLEAR_REG(v6); \ CLEAR_REG(v7); \ cbz x6, .Locb_dec_done; \ \ .Locb_dec_loop_##bits: \ \ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ \ /* Checksum_i = Checksum_{i-1} xor P_i */ \ \ rbit w8, w12; \ add w12, w12, #1; \ clz w8, w8; /* ntz(i) */ \ add x8, x5, x8, lsl #4; \ \ ld1 {v1.16b}, [x2], #16; /* load ciphertext */ \ ld1 {v2.16b}, [x8]; /* load L_{ntz(i)} */ \ sub x6, x6, #1; \ eor v0.16b, v0.16b, v2.16b; \ eor v1.16b, v1.16b, v0.16b; \ \ do_aes_one##bits(d, imc, v1, v1) \ \ eor v1.16b, v1.16b, v0.16b; \ st1 {v1.16b}, [x1], #16; /* store plaintext */ \ eor v16.16b, v16.16b, v1.16b; \ \ cbnz x6, .Locb_dec_loop_##bits; \ b .Locb_dec_done; OCB_DEC(128) OCB_DEC(192) OCB_DEC(256) #undef OCB_DEC .Locb_dec_done: aes_clear_keys(w7) st1 {v16.16b}, [x4] /* store checksum */ st1 {v0.16b}, [x3] /* store offset */ CLEAR_REG(v0) CLEAR_REG(v1) CLEAR_REG(v2) CLEAR_REG(v16) ret CFI_ENDPROC(); ELF(.size _gcry_aes_ocb_dec_armv8_ce,.-_gcry_aes_ocb_dec_armv8_ce;) /* * void _gcry_aes_ocb_auth_armv8_ce (const void *keysched, * const unsigned char *abuf, * unsigned char *offset, * unsigned char *checksum, * unsigned char *L_table, * size_t nblocks, * unsigned int nrounds, * unsigned int blkn); */ .align 3 .globl _gcry_aes_ocb_auth_armv8_ce ELF(.type _gcry_aes_ocb_auth_armv8_ce,%function;) _gcry_aes_ocb_auth_armv8_ce: /* input: * x0: keysched * x1: abuf * x2: offset => x3 * x3: checksum => x4 * x4: Ltable => x5 * x5: nblocks => x6 (0 < nblocks <= 32) * w6: 
nrounds => w7 * w7: blkn => w12 */ CFI_STARTPROC(); mov w12, w7 mov w7, w6 mov x6, x5 mov x5, x4 mov x4, x3 mov x3, x2 aes_preload_keys(x0, w7); ld1 {v0.16b}, [x3] /* load offset */ ld1 {v16.16b}, [x4] /* load checksum */ beq .Locb_auth_entry_192 bhi .Locb_auth_entry_256 #define OCB_AUTH(bits) \ .Locb_auth_entry_##bits: \ cmp x6, #4; \ add w12, w12, #1; \ b.lo .Locb_auth_loop_##bits; \ \ .Locb_auth_loop4_##bits: \ \ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ \ \ add w9, w12, #1; \ add w10, w12, #2; \ add w11, w12, #3; \ rbit w8, w12; \ add w12, w12, #4; \ rbit w9, w9; \ rbit w10, w10; \ rbit w11, w11; \ clz w8, w8; /* ntz(i+0) */ \ clz w9, w9; /* ntz(i+1) */ \ clz w10, w10; /* ntz(i+2) */ \ clz w11, w11; /* ntz(i+3) */ \ add x8, x5, x8, lsl #4; \ ld1 {v1.16b-v4.16b}, [x1], #64; /* load A_i+<0-3> */ \ add x9, x5, x9, lsl #4; \ add x10, x5, x10, lsl #4; \ add x11, x5, x11, lsl #4; \ \ sub x6, x6, #4; \ \ ld1 {v5.16b}, [x8]; /* load L_{ntz(i+0)} */ \ ld1 {v6.16b}, [x9]; /* load L_{ntz(i+1)} */ \ ld1 {v7.16b}, [x10]; /* load L_{ntz(i+2)} */ \ eor v5.16b, v5.16b, v0.16b; /* Offset_i+0 */ \ ld1 {v0.16b}, [x11]; /* load L_{ntz(i+3)} */ \ eor v6.16b, v6.16b, v5.16b; /* Offset_i+1 */ \ eor v1.16b, v1.16b, v5.16b; /* A_i+0 xor Offset_i+0 */ \ eor v7.16b, v7.16b, v6.16b; /* Offset_i+2 */ \ eor v2.16b, v2.16b, v6.16b; /* A_i+1 xor Offset_i+1 */ \ eor v0.16b, v0.16b, v7.16b; /* Offset_i+3 */ \ cmp x6, #4; \ eor v3.16b, v3.16b, v7.16b; /* A_i+2 xor Offset_i+2 */ \ eor v4.16b, v4.16b, v0.16b; /* A_i+3 xor Offset_i+3 */ \ \ do_aes_4_##bits(e, mc, v1, v2, v3, v4); \ \ eor v1.16b, v1.16b, v2.16b; \ eor v16.16b, v16.16b, v3.16b; \ eor v1.16b, v1.16b, v4.16b; \ eor v16.16b, v16.16b, v1.16b; \ \ b.hs .Locb_auth_loop4_##bits; \ CLEAR_REG(v3); \ CLEAR_REG(v4); \ CLEAR_REG(v5); \ CLEAR_REG(v6); \ CLEAR_REG(v7); \ cbz x6, .Locb_auth_done; \ \ .Locb_auth_loop_##bits: \ \ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ \ \ rbit w8, w12; \ add w12, w12, #1; \ clz w8, w8; /* ntz(i) */ \ add x8, x5, x8, lsl #4; \ \ ld1 {v1.16b}, [x1], #16; /* load aadtext */ \ ld1 {v2.16b}, [x8]; /* load L_{ntz(i)} */ \ sub x6, x6, #1; \ eor v0.16b, v0.16b, v2.16b; \ eor v1.16b, v1.16b, v0.16b; \ \ do_aes_one##bits(e, mc, v1, v1) \ \ eor v16.16b, v16.16b, v1.16b; \ \ cbnz x6, .Locb_auth_loop_##bits; \ b .Locb_auth_done; OCB_AUTH(128) OCB_AUTH(192) OCB_AUTH(256) #undef OCB_AUTH .Locb_auth_done: aes_clear_keys(w7) st1 {v16.16b}, [x4] /* store checksum */ st1 {v0.16b}, [x3] /* store offset */ CLEAR_REG(v0) CLEAR_REG(v1) CLEAR_REG(v2) CLEAR_REG(v16) ret CFI_ENDPROC(); ELF(.size _gcry_aes_ocb_auth_armv8_ce,.-_gcry_aes_ocb_auth_armv8_ce;) /* * void _gcry_aes_xts_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *tweak, * size_t nblocks, * unsigned int nrounds); */ .align 3 .globl _gcry_aes_xts_enc_armv8_ce ELF(.type _gcry_aes_xts_enc_armv8_ce,%function;) _gcry_aes_xts_enc_armv8_ce: /* input: * r0: keysched * r1: outbuf * r2: inbuf * r3: tweak * x4: nblocks * w5: nrounds */ CFI_STARTPROC(); cbz x4, .Lxts_enc_skip /* load tweak */ ld1 {v0.16b}, [x3] /* load gfmul mask */ mov x6, #0x87 mov x7, #0x01 mov v16.D[0], x6 mov v16.D[1], x7 aes_preload_keys(x0, w5); b.eq .Lxts_enc_entry_192 b.hi .Lxts_enc_entry_256 #define XTS_ENC(bits) \ .Lxts_enc_entry_##bits: \ cmp x4, #4; \ b.lo .Lxts_enc_loop_##bits; \ \ .Lxts_enc_loop4_##bits: \ \ ext v4.16b, v0.16b, v0.16b, #8; \ \ sshr 
v2.2d, v4.2d, #63; \ add v5.2d, v0.2d, v0.2d; \ and v2.16b, v2.16b, v16.16b; \ add v4.2d, v4.2d, v4.2d; \ eor v5.16b, v5.16b, v2.16b; \ \ sshr v2.2d, v4.2d, #63; \ add v6.2d, v5.2d, v5.2d; \ and v2.16b, v2.16b, v16.16b; \ add v4.2d, v4.2d, v4.2d; \ eor v6.16b, v6.16b, v2.16b; \ \ sshr v2.2d, v4.2d, #63; \ add v7.2d, v6.2d, v6.2d; \ and v2.16b, v2.16b, v16.16b; \ add v4.2d, v4.2d, v4.2d; \ eor v7.16b, v7.16b, v2.16b; \ \ sshr v2.2d, v4.2d, #63; \ add v3.2d, v7.2d, v7.2d; \ and v2.16b, v2.16b, v16.16b; \ add v4.2d, v4.2d, v4.2d; \ eor v3.16b, v3.16b, v2.16b; \ ld1 {v1.16b-v2.16b}, [x2], #32; /* load plaintext */ \ st1 {v3.16b}, [x3]; \ sub x4, x4, #4; \ eor v1.16b, v1.16b, v0.16b; \ \ ld1 {v3.16b-v4.16b}, [x2], #32; /* load plaintext */ \ cmp x4, #4; \ eor v2.16b, v2.16b, v5.16b; \ eor v3.16b, v3.16b, v6.16b; \ eor v4.16b, v4.16b, v7.16b; \ \ do_aes_4_##bits(e, mc, v1, v2, v3, v4); \ \ eor v1.16b, v1.16b, v0.16b; \ ld1 {v0.16b}, [x3]; \ eor v2.16b, v2.16b, v5.16b; \ eor v3.16b, v3.16b, v6.16b; \ eor v4.16b, v4.16b, v7.16b; \ st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \ \ b.hs .Lxts_enc_loop4_##bits; \ CLEAR_REG(v3); \ CLEAR_REG(v4); \ CLEAR_REG(v5); \ CLEAR_REG(v6); \ CLEAR_REG(v7); \ cbz x4, .Lxts_enc_done; \ \ .Lxts_enc_loop_##bits: \ \ ld1 {v1.16b}, [x2], #16; /* load plaintext */ \ ext v3.16b, v0.16b, v0.16b, #8; \ mov v2.16b, v0.16b; \ sshr v3.2d, v3.2d, #63; \ add v0.2d, v0.2d, v0.2d; \ and v3.16b, v3.16b, v16.16b; \ eor v1.16b, v1.16b, v2.16b; \ eor v0.16b, v0.16b, v3.16b; \ sub x4, x4, #1; \ \ do_aes_one##bits(e, mc, v1, v1); \ \ eor v1.16b, v1.16b, v2.16b; \ st1 {v1.16b}, [x1], #16; /* store ciphertext */ \ \ cbnz x4, .Lxts_enc_loop_##bits; \ b .Lxts_enc_done; XTS_ENC(128) XTS_ENC(192) XTS_ENC(256) #undef XTS_ENC .Lxts_enc_done: aes_clear_keys(w5) st1 {v0.16b}, [x3] /* store tweak */ CLEAR_REG(v0) CLEAR_REG(v1) CLEAR_REG(v2) .Lxts_enc_skip: ret CFI_ENDPROC(); ELF(.size _gcry_aes_xts_enc_armv8_ce,.-_gcry_aes_xts_enc_armv8_ce;) /* * void _gcry_aes_xts_dec_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *tweak, * size_t nblocks, * unsigned int nrounds); */ .align 3 .globl _gcry_aes_xts_dec_armv8_ce ELF(.type _gcry_aes_xts_dec_armv8_ce,%function;) _gcry_aes_xts_dec_armv8_ce: /* input: * r0: keysched * r1: outbuf * r2: inbuf * r3: tweak * x4: nblocks * w5: nrounds */ CFI_STARTPROC(); cbz x4, .Lxts_dec_skip /* load tweak */ ld1 {v0.16b}, [x3] /* load gfmul mask */ mov x6, #0x87 mov x7, #0x01 mov v16.D[0], x6 mov v16.D[1], x7 aes_preload_keys(x0, w5); b.eq .Lxts_dec_entry_192 b.hi .Lxts_dec_entry_256 #define XTS_DEC(bits) \ .Lxts_dec_entry_##bits: \ cmp x4, #4; \ b.lo .Lxts_dec_loop_##bits; \ \ .Lxts_dec_loop4_##bits: \ \ ext v4.16b, v0.16b, v0.16b, #8; \ \ sshr v2.2d, v4.2d, #63; \ add v5.2d, v0.2d, v0.2d; \ and v2.16b, v2.16b, v16.16b; \ add v4.2d, v4.2d, v4.2d; \ eor v5.16b, v5.16b, v2.16b; \ \ sshr v2.2d, v4.2d, #63; \ add v6.2d, v5.2d, v5.2d; \ and v2.16b, v2.16b, v16.16b; \ add v4.2d, v4.2d, v4.2d; \ eor v6.16b, v6.16b, v2.16b; \ \ sshr v2.2d, v4.2d, #63; \ add v7.2d, v6.2d, v6.2d; \ and v2.16b, v2.16b, v16.16b; \ add v4.2d, v4.2d, v4.2d; \ eor v7.16b, v7.16b, v2.16b; \ \ sshr v2.2d, v4.2d, #63; \ add v3.2d, v7.2d, v7.2d; \ and v2.16b, v2.16b, v16.16b; \ add v4.2d, v4.2d, v4.2d; \ eor v3.16b, v3.16b, v2.16b; \ ld1 {v1.16b-v2.16b}, [x2], #32; /* load plaintext */ \ st1 {v3.16b}, [x3]; \ sub x4, x4, #4; \ eor v1.16b, v1.16b, v0.16b; \ \ ld1 {v3.16b-v4.16b}, [x2], #32; /* load plaintext */ \ cmp x4, #4; \ eor v2.16b, 
v2.16b, v5.16b; \ eor v3.16b, v3.16b, v6.16b; \ eor v4.16b, v4.16b, v7.16b; \ \ do_aes_4_##bits(d, imc, v1, v2, v3, v4); \ \ eor v1.16b, v1.16b, v0.16b; \ ld1 {v0.16b}, [x3]; \ eor v2.16b, v2.16b, v5.16b; \ eor v3.16b, v3.16b, v6.16b; \ eor v4.16b, v4.16b, v7.16b; \ st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \ \ b.hs .Lxts_dec_loop4_##bits; \ CLEAR_REG(v3); \ CLEAR_REG(v4); \ CLEAR_REG(v5); \ CLEAR_REG(v6); \ CLEAR_REG(v7); \ cbz x4, .Lxts_dec_done; \ \ .Lxts_dec_loop_##bits: \ \ ld1 {v1.16b}, [x2], #16; /* load plaintext */ \ ext v3.16b, v0.16b, v0.16b, #8; \ mov v2.16b, v0.16b; \ sshr v3.2d, v3.2d, #63; \ add v0.2d, v0.2d, v0.2d; \ and v3.16b, v3.16b, v16.16b; \ eor v1.16b, v1.16b, v2.16b; \ eor v0.16b, v0.16b, v3.16b; \ sub x4, x4, #1; \ \ do_aes_one##bits(d, imc, v1, v1); \ \ eor v1.16b, v1.16b, v2.16b; \ st1 {v1.16b}, [x1], #16; /* store ciphertext */ \ \ cbnz x4, .Lxts_dec_loop_##bits; \ b .Lxts_dec_done; XTS_DEC(128) XTS_DEC(192) XTS_DEC(256) #undef XTS_DEC .Lxts_dec_done: aes_clear_keys(w5) st1 {v0.16b}, [x3] /* store tweak */ CLEAR_REG(v0) CLEAR_REG(v1) CLEAR_REG(v2) .Lxts_dec_skip: ret CFI_ENDPROC(); ELF(.size _gcry_aes_xts_dec_armv8_ce,.-_gcry_aes_xts_dec_armv8_ce;) /* * u32 _gcry_aes_sbox4_armv8_ce(u32 in4b); */ .align 3 .globl _gcry_aes_sbox4_armv8_ce ELF(.type _gcry_aes_sbox4_armv8_ce,%function;) _gcry_aes_sbox4_armv8_ce: /* See "Gouvêa, C. P. L. & López, J. Implementing GCM on ARMv8. Topics in * Cryptology — CT-RSA 2015" for details. */ CFI_STARTPROC(); movi v0.16b, #0x52 movi v1.16b, #0 mov v0.S[0], w0 aese v0.16b, v1.16b addv s0, v0.4s mov w0, v0.S[0] CLEAR_REG(v0) ret CFI_ENDPROC(); ELF(.size _gcry_aes_sbox4_armv8_ce,.-_gcry_aes_sbox4_armv8_ce;) /* * void _gcry_aes_invmixcol_armv8_ce(void *dst, const void *src); */ .align 3 .globl _gcry_aes_invmixcol_armv8_ce ELF(.type _gcry_aes_invmixcol_armv8_ce,%function;) _gcry_aes_invmixcol_armv8_ce: CFI_STARTPROC(); ld1 {v0.16b}, [x1] aesimc v0.16b, v0.16b st1 {v0.16b}, [x0] CLEAR_REG(v0) ret CFI_ENDPROC(); ELF(.size _gcry_aes_invmixcol_armv8_ce,.-_gcry_aes_invmixcol_armv8_ce;) #endif diff --git a/cipher/sha1-armv8-aarch64-ce.S b/cipher/sha1-armv8-aarch64-ce.S index 7dc26c0f..223268ca 100644 --- a/cipher/sha1-armv8-aarch64-ce.S +++ b/cipher/sha1-armv8-aarch64-ce.S @@ -1,206 +1,201 @@ /* sha1-armv8-aarch64-ce.S - ARM/CE accelerated SHA-1 transform function * Copyright (C) 2016 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
*/ #include "asm-common-aarch64.h" #if defined(__AARCH64EL__) && \ defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && defined(USE_SHA1) .cpu generic+simd+crypto .text -#define GET_DATA_POINTER(reg, name) \ - adrp reg, :got:name ; \ - ldr reg, [reg, #:got_lo12:name] ; - - /* Constants */ #define K1 0x5A827999 #define K2 0x6ED9EBA1 #define K3 0x8F1BBCDC #define K4 0xCA62C1D6 .align 4 gcry_sha1_aarch64_ce_K_VEC: .LK_VEC: .LK1: .long K1, K1, K1, K1 .LK2: .long K2, K2, K2, K2 .LK3: .long K3, K3, K3, K3 .LK4: .long K4, K4, K4, K4 /* Register macros */ #define sH4 s0 #define vH4 v0 #define vH0123 v1 #define qABCD q2 #define sABCD s2 #define vABCD v2 #define sE0 s3 #define vE0 v3 #define sE1 s4 #define vE1 v4 #define vT0 v5 #define vT1 v6 #define vW0 v16 #define vW1 v17 #define vW2 v18 #define vW3 v19 #define vK1 v20 #define vK2 v21 #define vK3 v22 #define vK4 v23 /* Round macros */ #define _(...) /*_*/ #define do_add(dst, src0, src1) add dst.4s, src0.4s, src1.4s; #define do_sha1su0(w0,w1,w2) sha1su0 w0.4s,w1.4s,w2.4s; #define do_sha1su1(w0,w3) sha1su1 w0.4s,w3.4s; #define do_rounds(f, e0, e1, t, k, w0, w1, w2, w3, add_fn, sha1su0_fn, sha1su1_fn) \ sha1su1_fn( v##w3, v##w2 ); \ sha1h e0, sABCD; \ sha1##f qABCD, e1, v##t.4s; \ add_fn( v##t, v##w2, v##k ); \ sha1su0_fn( v##w0, v##w1, v##w2 ); /* Other functional macros */ #define CLEAR_REG(reg) eor reg.16b, reg.16b, reg.16b; /* * unsigned int * _gcry_sha1_transform_armv8_ce (void *ctx, const unsigned char *data, * size_t nblks) */ .align 3 .globl _gcry_sha1_transform_armv8_ce ELF(.type _gcry_sha1_transform_armv8_ce,%function;) _gcry_sha1_transform_armv8_ce: /* input: * x0: ctx, CTX * x1: data (64*nblks bytes) * x2: nblks */ CFI_STARTPROC(); cbz x2, .Ldo_nothing; GET_DATA_POINTER(x4, .LK_VEC); ld1 {vH0123.4s}, [x0] /* load h0,h1,h2,h3 */ ld1 {vK1.4s-vK4.4s}, [x4] /* load K1,K2,K3,K4 */ ldr sH4, [x0, #16] /* load h4 */ ld1 {vW0.16b-vW3.16b}, [x1], #64 mov vABCD.16b, vH0123.16b rev32 vW0.16b, vW0.16b rev32 vW1.16b, vW1.16b rev32 vW2.16b, vW2.16b do_add(vT0, vW0, vK1) rev32 vW3.16b, vW3.16b do_add(vT1, vW1, vK1) .Loop: do_rounds(c, sE1, sH4, T0, K1, W0, W1, W2, W3, do_add, do_sha1su0, _) sub x2, x2, #1 do_rounds(c, sE0, sE1, T1, K1, W1, W2, W3, W0, do_add, do_sha1su0, do_sha1su1) do_rounds(c, sE1, sE0, T0, K1, W2, W3, W0, W1, do_add, do_sha1su0, do_sha1su1) do_rounds(c, sE0, sE1, T1, K2, W3, W0, W1, W2, do_add, do_sha1su0, do_sha1su1) do_rounds(c, sE1, sE0, T0, K2, W0, W1, W2, W3, do_add, do_sha1su0, do_sha1su1) do_rounds(p, sE0, sE1, T1, K2, W1, W2, W3, W0, do_add, do_sha1su0, do_sha1su1) do_rounds(p, sE1, sE0, T0, K2, W2, W3, W0, W1, do_add, do_sha1su0, do_sha1su1) do_rounds(p, sE0, sE1, T1, K2, W3, W0, W1, W2, do_add, do_sha1su0, do_sha1su1) do_rounds(p, sE1, sE0, T0, K3, W0, W1, W2, W3, do_add, do_sha1su0, do_sha1su1) do_rounds(p, sE0, sE1, T1, K3, W1, W2, W3, W0, do_add, do_sha1su0, do_sha1su1) do_rounds(m, sE1, sE0, T0, K3, W2, W3, W0, W1, do_add, do_sha1su0, do_sha1su1) do_rounds(m, sE0, sE1, T1, K3, W3, W0, W1, W2, do_add, do_sha1su0, do_sha1su1) do_rounds(m, sE1, sE0, T0, K3, W0, W1, W2, W3, do_add, do_sha1su0, do_sha1su1) do_rounds(m, sE0, sE1, T1, K4, W1, W2, W3, W0, do_add, do_sha1su0, do_sha1su1) do_rounds(m, sE1, sE0, T0, K4, W2, W3, W0, W1, do_add, do_sha1su0, do_sha1su1) do_rounds(p, sE0, sE1, T1, K4, W3, W0, W1, W2, do_add, do_sha1su0, do_sha1su1) cbz x2, .Lend ld1 {vW0.16b-vW1.16b}, [x1], #32 /* preload */ do_rounds(p, sE1, sE0, T0, K4, _ , _ , W2, W3, do_add, _, do_sha1su1) rev32 
vW0.16b, vW0.16b ld1 {vW2.16b}, [x1], #16 rev32 vW1.16b, vW1.16b do_rounds(p, sE0, sE1, T1, K4, _ , _ , W3, _ , do_add, _, _) ld1 {vW3.16b}, [x1], #16 rev32 vW2.16b, vW2.16b do_rounds(p, sE1, sE0, T0, _, _, _, _, _, _, _, _) rev32 vW3.16b, vW3.16b do_rounds(p, sE0, sE1, T1, _, _, _, _, _, _, _, _) do_add(vT0, vW0, vK1) add vH4.2s, vH4.2s, vE0.2s add vABCD.4s, vABCD.4s, vH0123.4s do_add(vT1, vW1, vK1) mov vH0123.16b, vABCD.16b b .Loop .Lend: do_rounds(p, sE1, sE0, T0, K4, _ , _ , W2, W3, do_add, _, do_sha1su1) do_rounds(p, sE0, sE1, T1, K4, _ , _ , W3, _ , do_add, _, _) do_rounds(p, sE1, sE0, T0, _, _, _, _, _, _, _, _) do_rounds(p, sE0, sE1, T1, _, _, _, _, _, _, _, _) add vH4.2s, vH4.2s, vE0.2s add vH0123.4s, vH0123.4s, vABCD.4s CLEAR_REG(vW0) CLEAR_REG(vW1) CLEAR_REG(vW2) CLEAR_REG(vW3) CLEAR_REG(vABCD) CLEAR_REG(vE1) CLEAR_REG(vE0) str sH4, [x0, #16] /* store h4 */ st1 {vH0123.4s}, [x0] /* store h0,h1,h2,h3 */ CLEAR_REG(vH0123) CLEAR_REG(vH4) .Ldo_nothing: mov x0, #0 ret CFI_ENDPROC(); ELF(.size _gcry_sha1_transform_armv8_ce,.-_gcry_sha1_transform_armv8_ce;) #endif diff --git a/cipher/sha256-armv8-aarch64-ce.S b/cipher/sha256-armv8-aarch64-ce.S index 706e0dfd..f57cae29 100644 --- a/cipher/sha256-armv8-aarch64-ce.S +++ b/cipher/sha256-armv8-aarch64-ce.S @@ -1,220 +1,215 @@ /* sha256-armv8-aarch64-ce.S - ARM/CE accelerated SHA-256 transform function * Copyright (C) 2016 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
*/ #include "asm-common-aarch64.h" #if defined(__AARCH64EL__) && \ defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && defined(USE_SHA256) .cpu generic+simd+crypto .text -#define GET_DATA_POINTER(reg, name) \ - adrp reg, :got:name ; \ - ldr reg, [reg, #:got_lo12:name] ; - - /* Constants */ .align 4 gcry_sha256_aarch64_ce_K: .LK: .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 /* Register macros */ #define vH0123 v0 #define vH4567 v1 #define vABCD0 v2 #define qABCD0 q2 #define vABCD1 v3 #define qABCD1 q3 #define vEFGH v4 #define qEFGH q4 #define vT0 v5 #define vT1 v6 #define vW0 v16 #define vW1 v17 #define vW2 v18 #define vW3 v19 #define vK0 v20 #define vK1 v21 #define vK2 v22 #define vK3 v23 /* Round macros */ #define _(...) /*_*/ #define do_loadk(nk0, nk1) ld1 {nk0.16b-nk1.16b},[x3],#32; #define do_add(a, b) add a.4s, a.4s, b.4s; #define do_sha256su0(w0, w1) sha256su0 w0.4s, w1.4s; #define do_sha256su1(w0, w2, w3) sha256su1 w0.4s, w2.4s, w3.4s; #define do_rounds(k, nk0, nk1, w0, w1, w2, w3, loadk_fn, add_fn, su0_fn, su1_fn) \ loadk_fn( v##nk0, v##nk1 ); \ su0_fn( v##w0, v##w1 ); \ mov vABCD1.16b, vABCD0.16b; \ sha256h qABCD0, qEFGH, v##k.4s; \ sha256h2 qEFGH, qABCD1, v##k.4s; \ add_fn( v##nk0, v##w2 ); \ su1_fn( v##w0, v##w2, v##w3 ); /* Other functional macros */ #define CLEAR_REG(reg) eor reg.16b, reg.16b, reg.16b; /* * unsigned int * _gcry_sha256_transform_armv8_ce (u32 state[8], const void *input_data, * size_t num_blks) */ .align 3 .globl _gcry_sha256_transform_armv8_ce ELF(.type _gcry_sha256_transform_armv8_ce,%function;) _gcry_sha256_transform_armv8_ce: /* input: * r0: ctx, CTX * r1: data (64*nblks bytes) * r2: nblks */ CFI_STARTPROC(); cbz x2, .Ldo_nothing; GET_DATA_POINTER(x3, .LK); mov x4, x3 ld1 {vH0123.4s-vH4567.4s}, [x0] /* load state */ ld1 {vW0.16b-vW1.16b}, [x1], #32 do_loadk(vK0, vK1) ld1 {vW2.16b-vW3.16b}, [x1], #32 mov vABCD0.16b, vH0123.16b mov vEFGH.16b, vH4567.16b rev32 vW0.16b, vW0.16b rev32 vW1.16b, vW1.16b rev32 vW2.16b, vW2.16b do_add(vK0, vW0) rev32 vW3.16b, vW3.16b do_add(vK1, vW1) .Loop: do_rounds(K0, K2, K3, W0, W1, W2, W3, do_loadk, do_add, do_sha256su0, do_sha256su1) sub x2,x2,#1 do_rounds(K1, K3, _ , W1, W2, W3, W0, _ , do_add, do_sha256su0, do_sha256su1) do_rounds(K2, K0, K1, W2, W3, W0, W1, do_loadk, do_add, do_sha256su0, do_sha256su1) do_rounds(K3, K1, _ , W3, W0, W1, W2, _ , do_add, do_sha256su0, do_sha256su1) do_rounds(K0, K2, K3, W0, W1, W2, W3, do_loadk, do_add, do_sha256su0, do_sha256su1) do_rounds(K1, K3, _ , W1, W2, W3, W0, _ , do_add, do_sha256su0, do_sha256su1) do_rounds(K2, K0, K1, W2, W3, W0, W1, do_loadk, do_add, do_sha256su0, do_sha256su1) do_rounds(K3, K1, _ , W3, W0, W1, W2, _ , do_add, do_sha256su0, 
do_sha256su1) do_rounds(K0, K2, K3, W0, W1, W2, W3, do_loadk, do_add, do_sha256su0, do_sha256su1) do_rounds(K1, K3, _ , W1, W2, W3, W0, _ , do_add, do_sha256su0, do_sha256su1) do_rounds(K2, K0, K1, W2, W3, W0, W1, do_loadk, do_add, do_sha256su0, do_sha256su1) do_rounds(K3, K1, _ , W3, W0, W1, W2, _ , do_add, do_sha256su0, do_sha256su1) cbz x2, .Lend do_rounds(K0, K2, K3, W0, _ , W2, W3, do_loadk, do_add, _, _) ld1 {vW0.16b}, [x1], #16 mov x3, x4 do_rounds(K1, K3, _ , W1, _ , W3, _ , _ , do_add, _, _) ld1 {vW1.16b}, [x1], #16 rev32 vW0.16b, vW0.16b do_rounds(K2, K0, K1, W2, _ , W0, _ , do_loadk, do_add, _, _) rev32 vW1.16b, vW1.16b ld1 {vW2.16b}, [x1], #16 do_rounds(K3, K1, _ , W3, _ , W1, _ , _ , do_add, _, _) ld1 {vW3.16b}, [x1], #16 do_add(vH0123, vABCD0) do_add(vH4567, vEFGH) rev32 vW2.16b, vW2.16b mov vABCD0.16b, vH0123.16b rev32 vW3.16b, vW3.16b mov vEFGH.16b, vH4567.16b b .Loop .Lend: do_rounds(K0, K2, K3, W0, _ , W2, W3, do_loadk, do_add, _, _) do_rounds(K1, K3, _ , W1, _ , W3, _ , _ , do_add, _, _) do_rounds(K2, _ , _ , W2, _ , _ , _ , _ , _, _, _) do_rounds(K3, _ , _ , W3, _ , _ , _ , _ , _, _, _) CLEAR_REG(vW0) CLEAR_REG(vW1) CLEAR_REG(vW2) CLEAR_REG(vW3) CLEAR_REG(vK0) CLEAR_REG(vK1) CLEAR_REG(vK2) CLEAR_REG(vK3) do_add(vH0123, vABCD0) do_add(vH4567, vEFGH) CLEAR_REG(vABCD0) CLEAR_REG(vABCD1) CLEAR_REG(vEFGH) st1 {vH0123.4s-vH4567.4s}, [x0] /* store state */ CLEAR_REG(vH0123) CLEAR_REG(vH4567) .Ldo_nothing: mov x0, #0 ret CFI_ENDPROC(); ELF(.size _gcry_sha256_transform_armv8_ce,.-_gcry_sha256_transform_armv8_ce;) #endif
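
For readers following the OCB and XTS paths above, the per-block bookkeeping that the vectorized code implements can be restated in portable C. The sketch below is illustrative only and not libgcrypt code: the helper names ntz(), ocb_update() and xts_mult_x() are made up here, and the logic is taken directly from the comments in the assembly (Offset_i = Offset_{i-1} xor L_{ntz(i)}, Checksum_i = Checksum_{i-1} xor P_i) and from the {0x87, 0x01} reduction mask loaded into v16.

/* Illustrative reference sketch (not libgcrypt code); helper names are
 * hypothetical.  It restates, in portable C, the per-block bookkeeping that
 * the OCB and XTS assembly above vectorizes. */
#include <stdint.h>

/* ntz(i): number of trailing zero bits of i (i >= 1).  The assembly gets the
 * same value with rbit+clz: bit-reversing i turns trailing zeros into leading
 * zeros, which clz then counts. */
unsigned int ntz (uint32_t i)
{
  unsigned int n = 0;
  while ((i & 1) == 0)
    {
      i >>= 1;
      n++;
    }
  return n;
}

/* One step of the OCB bookkeeping from the comments above:
 *   Offset_i   = Offset_{i-1} xor L_{ntz(i)}
 *   Checksum_i = Checksum_{i-1} xor P_i
 * The cipher call (C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)) is left
 * out; only the offset/checksum updates are shown. */
void ocb_update (unsigned char offset[16], unsigned char checksum[16],
                 const unsigned char L_table[][16], uint32_t i,
                 const unsigned char block[16])
{
  const unsigned char *L = L_table[ntz (i)];
  int j;

  for (j = 0; j < 16; j++)
    {
      offset[j] ^= L[j];
      checksum[j] ^= block[j];
    }
}

/* XTS tweak update: multiply the 128-bit tweak (little-endian byte order) by
 * x in GF(2^128).  This is the scalar equivalent of the sshr/add/and/eor
 * sequence above that uses the {0x87, 0x01} mask in v16: shift the whole
 * tweak left by one bit and, if a bit fell off the top, xor 0x87 into the
 * low byte. */
void xts_mult_x (unsigned char tweak[16])
{
  unsigned char carry = 0;
  int j;

  for (j = 0; j < 16; j++)
    {
      unsigned char next_carry = tweak[j] >> 7;

      tweak[j] = (unsigned char)((tweak[j] << 1) | carry);
      carry = next_carry;
    }

  if (carry)
    tweak[0] ^= 0x87;
}

In the four-way loops the same updates are applied to blocks i..i+3 at once, which is why the assembly pre-computes ntz(i+0) through ntz(i+3) with rbit/clz and chains Offset_i+0 through Offset_i+3 before each do_aes_4_##bits call.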