diff --git a/cipher/arcfour-amd64.S b/cipher/arcfour-amd64.S index c08f3453..221dfeff 100644 --- a/cipher/arcfour-amd64.S +++ b/cipher/arcfour-amd64.S @@ -1,102 +1,108 @@ /* ** RC4 implementation optimized for AMD64. ** ** Author: Marc Bevand ** Licence: I hereby disclaim the copyright on this code and place it ** in the public domain. ** ** The throughput achieved by this code is about 320 MBytes/sec, on ** a 1.8 GHz AMD Opteron (rev C0) processor. ** ** 2013/12/20 : ** - Integrated to libgcrypt ** - 4.18 cycles/byte on Intel i5-4570 */ #ifdef __x86_64__ #include #if defined(USE_ARCFOUR) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) #include "asm-common-amd64.h" .text .align 16 .globl _gcry_arcfour_amd64 ELF(.type _gcry_arcfour_amd64,@function) _gcry_arcfour_amd64: + CFI_STARTPROC() ENTER_SYSV_FUNC_PARAMS_0_4 push %rbp + CFI_PUSH(%rbp) push %rbx + CFI_PUSH(%rbx) mov %rdi, %rbp # key = ARG(key) mov %rsi, %rbx # rbx = ARG(len) mov %rdx, %rsi # in = ARG(in) mov %rcx, %rdi # out = ARG(out) mov (4*256)(%rbp), %ecx # x = key->x mov (4*256+4)(%rbp),%edx # y = key->y inc %rcx # x++ and $255, %rcx # x &= 0xff lea -8(%rbx,%rsi), %rbx # rbx = in+len-8 mov %rbx, %r9 # tmp = in+len-8 mov (%rbp,%rcx,4), %eax # tx = d[x] cmp %rsi, %rbx # cmp in with in+len-8 jl .Lend # jump if (in+len-8 < in) .Lstart: add $8, %rsi # increment in add $8, %rdi # increment out # generate the next 8 bytes of the rc4 stream into %r8 mov $8, %r11 # byte counter 1: add %al, %dl # y += tx mov (%rbp,%rdx,4), %ebx # ty = d[y] mov %ebx, (%rbp,%rcx,4) # d[x] = ty add %al, %bl # val = ty + tx mov %eax, (%rbp,%rdx,4) # d[y] = tx inc %cl # x++ (NEXT ROUND) mov (%rbp,%rcx,4), %eax # tx = d[x] (NEXT ROUND) shl $8, %r8 movb (%rbp,%rbx,4), %r8b # val = d[val] dec %r11b jnz 1b # xor 8 bytes bswap %r8 xor -8(%rsi), %r8 cmp %r9, %rsi # cmp in+len-8 with in mov %r8, -8(%rdi) jle .Lstart # jump if (in <= in+len-8) .Lend: add $8, %r9 # tmp = in+len # handle the last bytes, one by one 1: cmp %rsi, %r9 # cmp in with in+len jle .Lfinished # jump if (in+len <= in) add %al, %dl # y += tx mov (%rbp,%rdx,4), %ebx # ty = d[y] mov %ebx, (%rbp,%rcx,4) # d[x] = ty add %al, %bl # val = ty + tx mov %eax, (%rbp,%rdx,4) # d[y] = tx inc %cl # x++ (NEXT ROUND) mov (%rbp,%rcx,4), %eax # tx = d[x] (NEXT ROUND) movb (%rbp,%rbx,4), %r8b # val = d[val] xor (%rsi), %r8b # xor 1 byte movb %r8b, (%rdi) inc %rsi # in++ inc %rdi # out++ jmp 1b .Lfinished: dec %rcx # x-- movb %cl, (4*256)(%rbp) # key->y = y movb %dl, (4*256+4)(%rbp) # key->x = x pop %rbx + CFI_POP(%rbx) pop %rbp + CFI_POP(%rbp) EXIT_SYSV_FUNC ret + CFI_ENDPROC() .L__gcry_arcfour_amd64_end: ELF(.size _gcry_arcfour_amd64,.L__gcry_arcfour_amd64_end-_gcry_arcfour_amd64) #endif #endif diff --git a/cipher/asm-common-amd64.h b/cipher/asm-common-amd64.h index 7eb42649..9d4a028a 100644 --- a/cipher/asm-common-amd64.h +++ b/cipher/asm-common-amd64.h @@ -1,90 +1,189 @@ /* asm-common-amd64.h - Common macros for AMD64 assembly * * Copyright (C) 2018 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #ifndef GCRY_ASM_COMMON_AMD64_H #define GCRY_ASM_COMMON_AMD64_H #include #ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS # define ELF(...) __VA_ARGS__ #else # define ELF(...) /*_*/ #endif #ifdef __PIC__ # define rRIP (%rip) #else # define rRIP #endif #ifdef __PIC__ # define RIP %rip #else # define RIP #endif +#ifdef __PIC__ +# define ADD_RIP +rip +#else +# define ADD_RIP +#endif + #if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) || !defined(__PIC__) # define GET_EXTERN_POINTER(name, reg) movabsq $name, reg #else # ifdef __code_model_large__ # define GET_EXTERN_POINTER(name, reg) \ pushq %r15; \ pushq %r14; \ 1: leaq 1b(%rip), reg; \ movabsq $_GLOBAL_OFFSET_TABLE_-1b, %r14; \ movabsq $name@GOT, %r15; \ addq %r14, reg; \ popq %r14; \ movq (reg, %r15), reg; \ popq %r15; # else # define GET_EXTERN_POINTER(name, reg) movq name@GOTPCREL(%rip), reg # endif #endif +#ifdef HAVE_GCC_ASM_CFI_DIRECTIVES +/* CFI directives to emit DWARF stack unwinding information. */ +# define CFI_STARTPROC() .cfi_startproc +# define CFI_ENDPROC() .cfi_endproc +# define CFI_REMEMBER_STATE() .cfi_remember_state +# define CFI_RESTORE_STATE() .cfi_restore_state +# define CFI_ADJUST_CFA_OFFSET(off) .cfi_adjust_cfa_offset off +# define CFI_REL_OFFSET(reg,off) .cfi_rel_offset reg, off +# define CFI_DEF_CFA_REGISTER(reg) .cfi_def_cfa_register reg +# define CFI_REGISTER(ro,rn) .cfi_register ro, rn +# define CFI_RESTORE(reg) .cfi_restore reg + +# define CFI_PUSH(reg) \ + CFI_ADJUST_CFA_OFFSET(8); CFI_REL_OFFSET(reg, 0) +# define CFI_POP(reg) \ + CFI_ADJUST_CFA_OFFSET(-8); CFI_RESTORE(reg) +# define CFI_POP_TMP_REG() \ + CFI_ADJUST_CFA_OFFSET(-8); +# define CFI_LEAVE() \ + CFI_ADJUST_CFA_OFFSET(-8); CFI_DEF_CFA_REGISTER(%rsp) + +/* CFA expressions are used for pointing CFA and registers to + * %rsp relative offsets. */ +# define DW_REGNO_rax 0 +# define DW_REGNO_rdx 1 +# define DW_REGNO_rcx 2 +# define DW_REGNO_rbx 3 +# define DW_REGNO_rsi 4 +# define DW_REGNO_rdi 5 +# define DW_REGNO_rbp 6 +# define DW_REGNO_rsp 7 +# define DW_REGNO_r8 8 +# define DW_REGNO_r9 9 +# define DW_REGNO_r10 10 +# define DW_REGNO_r11 11 +# define DW_REGNO_r12 12 +# define DW_REGNO_r13 13 +# define DW_REGNO_r14 14 +# define DW_REGNO_r15 15 + +# define DW_REGNO(reg) DW_REGNO_ ## reg + +/* Fixed length encoding used for integers for now. 
*/ +# define DW_SLEB128_7BIT(value) \ + 0x00|((value) & 0x7f) +# define DW_SLEB128_28BIT(value) \ + 0x80|((value)&0x7f), \ + 0x80|(((value)>>7)&0x7f), \ + 0x80|(((value)>>14)&0x7f), \ + 0x00|(((value)>>21)&0x7f) + +# define CFI_CFA_ON_STACK(rsp_offs,cfa_depth) \ + .cfi_escape \ + 0x0f, /* DW_CFA_def_cfa_expression */ \ + DW_SLEB128_7BIT(11), /* length */ \ + 0x77, /* DW_OP_breg7, rsp + constant */ \ + DW_SLEB128_28BIT(rsp_offs), \ + 0x06, /* DW_OP_deref */ \ + 0x23, /* DW_OP_plus_constu */ \ + DW_SLEB128_28BIT((cfa_depth)+8) + +# define CFI_REG_ON_STACK(reg,rsp_offs) \ + .cfi_escape \ + 0x10, /* DW_CFA_expression */ \ + DW_SLEB128_7BIT(DW_REGNO(reg)), \ + DW_SLEB128_7BIT(5), /* length */ \ + 0x77, /* DW_OP_breg7, rsp + constant */ \ + DW_SLEB128_28BIT(rsp_offs) + +#else +# define CFI_STARTPROC() +# define CFI_ENDPROC() +# define CFI_REMEMBER_STATE() +# define CFI_RESTORE_STATE() +# define CFI_ADJUST_CFA_OFFSET(off) +# define CFI_REL_OFFSET(reg,off) +# define CFI_DEF_CFA_REGISTER(reg) +# define CFI_REGISTER(ro,rn) +# define CFI_RESTORE(reg) + +# define CFI_PUSH(reg) +# define CFI_POP(reg) +# define CFI_POP_TMP_REG() +# define CFI_LEAVE() + +# define CFI_CFA_ON_STACK(rsp_offs,cfa_depth) +# define CFI_REG_ON_STACK(reg,rsp_offs) +#endif + #ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS # define ENTER_SYSV_FUNC_PARAMS_0_4 \ pushq %rdi; \ + CFI_PUSH(%rdi); \ pushq %rsi; \ + CFI_PUSH(%rsi); \ movq %rcx, %rdi; \ movq %rdx, %rsi; \ movq %r8, %rdx; \ movq %r9, %rcx; \ # define ENTER_SYSV_FUNC_PARAMS_5 \ ENTER_SYSV_FUNC_PARAMS_0_4; \ movq 0x38(%rsp), %r8; # define ENTER_SYSV_FUNC_PARAMS_6 \ ENTER_SYSV_FUNC_PARAMS_5; \ movq 0x40(%rsp), %r9; # define EXIT_SYSV_FUNC \ popq %rsi; \ - popq %rdi; + CFI_POP(%rsi); \ + popq %rdi; \ + CFI_POP(%rdi); #else # define ENTER_SYSV_FUNC_PARAMS_0_4 # define ENTER_SYSV_FUNC_PARAMS_5 # define ENTER_SYSV_FUNC_PARAMS_6 # define EXIT_SYSV_FUNC #endif #endif /* GCRY_ASM_COMMON_AMD64_H */ diff --git a/cipher/blake2b-amd64-avx2.S b/cipher/blake2b-amd64-avx2.S index 6bcc5652..08c816cd 100644 --- a/cipher/blake2b-amd64-avx2.S +++ b/cipher/blake2b-amd64-avx2.S @@ -1,298 +1,300 @@ /* blake2b-amd64-avx2.S - AVX2 implementation of BLAKE2b * * Copyright (C) 2018 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
*/ /* The code is based on public-domain/CC0 BLAKE2 reference implementation * by Samual Neves, at https://github.com/BLAKE2/BLAKE2/tree/master/sse * Copyright 2012, Samuel Neves */ #ifdef __x86_64 #include #if defined(HAVE_GCC_INLINE_ASM_AVX2) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) #include "asm-common-amd64.h" .text /* register macros */ #define RSTATE %rdi #define RINBLKS %rsi #define RNBLKS %rdx #define RIV %rcx /* state structure */ #define STATE_H 0 #define STATE_T (STATE_H + 8 * 8) #define STATE_F (STATE_T + 2 * 8) /* vector registers */ #define ROW1 %ymm0 #define ROW2 %ymm1 #define ROW3 %ymm2 #define ROW4 %ymm3 #define TMP1 %ymm4 #define TMP1x %xmm4 #define R16 %ymm5 #define R24 %ymm6 #define MA1 %ymm8 #define MA2 %ymm9 #define MA3 %ymm10 #define MA4 %ymm11 #define MA1x %xmm8 #define MA2x %xmm9 #define MA3x %xmm10 #define MA4x %xmm11 #define MB1 %ymm12 #define MB2 %ymm13 #define MB3 %ymm14 #define MB4 %ymm15 #define MB1x %xmm12 #define MB2x %xmm13 #define MB3x %xmm14 #define MB4x %xmm15 /********************************************************************** blake2b/AVX2 **********************************************************************/ #define GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \ s0, s1, s2, s3, s4, s5, s6, s7, s8, \ s9, s10, s11, s12, s13, s14, s15) \ vmovq (s0)*8(RINBLKS), m1x; \ vmovq (s4)*8(RINBLKS), TMP1x; \ vpinsrq $1, (s2)*8(RINBLKS), m1x, m1x; \ vpinsrq $1, (s6)*8(RINBLKS), TMP1x, TMP1x; \ vinserti128 $1, TMP1x, m1, m1; \ vmovq (s1)*8(RINBLKS), m2x; \ vmovq (s5)*8(RINBLKS), TMP1x; \ vpinsrq $1, (s3)*8(RINBLKS), m2x, m2x; \ vpinsrq $1, (s7)*8(RINBLKS), TMP1x, TMP1x; \ vinserti128 $1, TMP1x, m2, m2; \ vmovq (s8)*8(RINBLKS), m3x; \ vmovq (s12)*8(RINBLKS), TMP1x; \ vpinsrq $1, (s10)*8(RINBLKS), m3x, m3x; \ vpinsrq $1, (s14)*8(RINBLKS), TMP1x, TMP1x; \ vinserti128 $1, TMP1x, m3, m3; \ vmovq (s9)*8(RINBLKS), m4x; \ vmovq (s13)*8(RINBLKS), TMP1x; \ vpinsrq $1, (s11)*8(RINBLKS), m4x, m4x; \ vpinsrq $1, (s15)*8(RINBLKS), TMP1x, TMP1x; \ vinserti128 $1, TMP1x, m4, m4; #define LOAD_MSG_0(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) #define LOAD_MSG_1(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3) #define LOAD_MSG_2(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4) #define LOAD_MSG_3(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8) #define LOAD_MSG_4(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13) #define LOAD_MSG_5(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9) #define LOAD_MSG_6(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11) #define LOAD_MSG_7(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10) #define LOAD_MSG_8(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5) #define 
LOAD_MSG_9(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0) #define LOAD_MSG_10(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ LOAD_MSG_0(m1, m2, m3, m4, m1x, m2x, m3x, m4x) #define LOAD_MSG_11(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ LOAD_MSG_1(m1, m2, m3, m4, m1x, m2x, m3x, m4x) #define LOAD_MSG(r, m1, m2, m3, m4) \ LOAD_MSG_##r(m1, m2, m3, m4, m1##x, m2##x, m3##x, m4##x) #define ROR_32(in, out) vpshufd $0xb1, in, out; #define ROR_24(in, out) vpshufb R24, in, out; #define ROR_16(in, out) vpshufb R16, in, out; #define ROR_63(in, out) \ vpsrlq $63, in, TMP1; \ vpaddq in, in, out; \ vpxor TMP1, out, out; #define G(r1, r2, r3, r4, m, ROR_A, ROR_B) \ vpaddq m, r1, r1; \ vpaddq r2, r1, r1; \ vpxor r1, r4, r4; \ ROR_A(r4, r4); \ vpaddq r4, r3, r3; \ vpxor r3, r2, r2; \ ROR_B(r2, r2); #define G1(r1, r2, r3, r4, m) \ G(r1, r2, r3, r4, m, ROR_32, ROR_24); #define G2(r1, r2, r3, r4, m) \ G(r1, r2, r3, r4, m, ROR_16, ROR_63); #define MM_SHUFFLE(z,y,x,w) \ (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) #define DIAGONALIZE(r1, r2, r3, r4) \ vpermq $MM_SHUFFLE(0,3,2,1), r2, r2; \ vpermq $MM_SHUFFLE(1,0,3,2), r3, r3; \ vpermq $MM_SHUFFLE(2,1,0,3), r4, r4; #define UNDIAGONALIZE(r1, r2, r3, r4) \ vpermq $MM_SHUFFLE(2,1,0,3), r2, r2; \ vpermq $MM_SHUFFLE(1,0,3,2), r3, r3; \ vpermq $MM_SHUFFLE(0,3,2,1), r4, r4; #define ROUND(r, m1, m2, m3, m4) \ G1(ROW1, ROW2, ROW3, ROW4, m1); \ G2(ROW1, ROW2, ROW3, ROW4, m2); \ DIAGONALIZE(ROW1, ROW2, ROW3, ROW4); \ G1(ROW1, ROW2, ROW3, ROW4, m3); \ G2(ROW1, ROW2, ROW3, ROW4, m4); \ UNDIAGONALIZE(ROW1, ROW2, ROW3, ROW4); blake2b_data: .align 32 .Liv: .quad 0x6a09e667f3bcc908, 0xbb67ae8584caa73b .quad 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1 .quad 0x510e527fade682d1, 0x9b05688c2b3e6c1f .quad 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179 .Lshuf_ror16: .byte 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9 .Lshuf_ror24: .byte 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10 .align 64 .globl _gcry_blake2b_transform_amd64_avx2 ELF(.type _gcry_blake2b_transform_amd64_avx2,@function;) _gcry_blake2b_transform_amd64_avx2: /* input: * %rdi: state * %rsi: blks * %rdx: num_blks */ + CFI_STARTPROC(); vzeroupper; addq $128, (STATE_T + 0)(RSTATE); adcq $0, (STATE_T + 8)(RSTATE); vbroadcasti128 .Lshuf_ror16 (RIP), R16; vbroadcasti128 .Lshuf_ror24 (RIP), R24; vmovdqa .Liv+(0 * 8) (RIP), ROW3; vmovdqa .Liv+(4 * 8) (RIP), ROW4; vmovdqu (STATE_H + 0 * 8)(RSTATE), ROW1; vmovdqu (STATE_H + 4 * 8)(RSTATE), ROW2; vpxor (STATE_T)(RSTATE), ROW4, ROW4; LOAD_MSG(0, MA1, MA2, MA3, MA4); LOAD_MSG(1, MB1, MB2, MB3, MB4); .Loop: ROUND(0, MA1, MA2, MA3, MA4); LOAD_MSG(2, MA1, MA2, MA3, MA4); ROUND(1, MB1, MB2, MB3, MB4); LOAD_MSG(3, MB1, MB2, MB3, MB4); ROUND(2, MA1, MA2, MA3, MA4); LOAD_MSG(4, MA1, MA2, MA3, MA4); ROUND(3, MB1, MB2, MB3, MB4); LOAD_MSG(5, MB1, MB2, MB3, MB4); ROUND(4, MA1, MA2, MA3, MA4); LOAD_MSG(6, MA1, MA2, MA3, MA4); ROUND(5, MB1, MB2, MB3, MB4); LOAD_MSG(7, MB1, MB2, MB3, MB4); ROUND(6, MA1, MA2, MA3, MA4); LOAD_MSG(8, MA1, MA2, MA3, MA4); ROUND(7, MB1, MB2, MB3, MB4); LOAD_MSG(9, MB1, MB2, MB3, MB4); ROUND(8, MA1, MA2, MA3, MA4); LOAD_MSG(10, MA1, MA2, MA3, MA4); ROUND(9, MB1, MB2, MB3, MB4); LOAD_MSG(11, MB1, MB2, MB3, MB4); sub $1, RNBLKS; jz .Loop_end; lea 128(RINBLKS), RINBLKS; addq $128, (STATE_T + 0)(RSTATE); adcq $0, (STATE_T + 8)(RSTATE); ROUND(10, MA1, MA2, MA3, MA4); LOAD_MSG(0, MA1, MA2, MA3, MA4); ROUND(11, MB1, MB2, MB3, MB4); LOAD_MSG(1, MB1, MB2, MB3, MB4); vpxor ROW3, ROW1, ROW1; vpxor 
ROW4, ROW2, ROW2; vmovdqa .Liv+(0 * 8) (RIP), ROW3; vmovdqa .Liv+(4 * 8) (RIP), ROW4; vpxor (STATE_H + 0 * 8)(RSTATE), ROW1, ROW1; vpxor (STATE_H + 4 * 8)(RSTATE), ROW2, ROW2; vmovdqu ROW1, (STATE_H + 0 * 8)(RSTATE); vmovdqu ROW2, (STATE_H + 4 * 8)(RSTATE); vpxor (STATE_T)(RSTATE), ROW4, ROW4; jmp .Loop; .Loop_end: ROUND(10, MA1, MA2, MA3, MA4); ROUND(11, MB1, MB2, MB3, MB4); vpxor ROW3, ROW1, ROW1; vpxor ROW4, ROW2, ROW2; vpxor (STATE_H + 0 * 8)(RSTATE), ROW1, ROW1; vpxor (STATE_H + 4 * 8)(RSTATE), ROW2, ROW2; vmovdqu ROW1, (STATE_H + 0 * 8)(RSTATE); vmovdqu ROW2, (STATE_H + 4 * 8)(RSTATE); xor %eax, %eax; vzeroall; ret; + CFI_ENDPROC(); ELF(.size _gcry_blake2b_transform_amd64_avx2, .-_gcry_blake2b_transform_amd64_avx2;) #endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/ #endif /*__x86_64*/ diff --git a/cipher/blake2s-amd64-avx.S b/cipher/blake2s-amd64-avx.S index f7312dbd..19837326 100644 --- a/cipher/blake2s-amd64-avx.S +++ b/cipher/blake2s-amd64-avx.S @@ -1,276 +1,278 @@ /* blake2s-amd64-avx.S - AVX implementation of BLAKE2s * * Copyright (C) 2018 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ /* The code is based on public-domain/CC0 BLAKE2 reference implementation * by Samual Neves, at https://github.com/BLAKE2/BLAKE2/tree/master/sse * Copyright 2012, Samuel Neves */ #ifdef __x86_64 #include #if defined(HAVE_GCC_INLINE_ASM_AVX) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) #include "asm-common-amd64.h" .text /* register macros */ #define RSTATE %rdi #define RINBLKS %rsi #define RNBLKS %rdx #define RIV %rcx /* state structure */ #define STATE_H 0 #define STATE_T (STATE_H + 8 * 4) #define STATE_F (STATE_T + 2 * 4) /* vector registers */ #define ROW1 %xmm0 #define ROW2 %xmm1 #define ROW3 %xmm2 #define ROW4 %xmm3 #define TMP1 %xmm4 #define TMP1x %xmm4 #define R16 %xmm5 #define R8 %xmm6 #define MA1 %xmm8 #define MA2 %xmm9 #define MA3 %xmm10 #define MA4 %xmm11 #define MB1 %xmm12 #define MB2 %xmm13 #define MB3 %xmm14 #define MB4 %xmm15 /********************************************************************** blake2s/AVX **********************************************************************/ #define GATHER_MSG(m1, m2, m3, m4, \ s0, s1, s2, s3, s4, s5, s6, s7, s8, \ s9, s10, s11, s12, s13, s14, s15) \ vmovd (s0)*4(RINBLKS), m1; \ vmovd (s1)*4(RINBLKS), m2; \ vmovd (s8)*4(RINBLKS), m3; \ vmovd (s9)*4(RINBLKS), m4; \ vpinsrd $1, (s2)*4(RINBLKS), m1, m1; \ vpinsrd $1, (s3)*4(RINBLKS), m2, m2; \ vpinsrd $1, (s10)*4(RINBLKS), m3, m3; \ vpinsrd $1, (s11)*4(RINBLKS), m4, m4; \ vpinsrd $2, (s4)*4(RINBLKS), m1, m1; \ vpinsrd $2, (s5)*4(RINBLKS), m2, m2; \ vpinsrd $2, (s12)*4(RINBLKS), m3, m3; \ vpinsrd $2, (s13)*4(RINBLKS), m4, m4; \ vpinsrd $3, (s6)*4(RINBLKS), m1, m1; \ vpinsrd $3, (s7)*4(RINBLKS), m2, m2; \ vpinsrd $3, (s14)*4(RINBLKS), m3, m3; \ vpinsrd $3, (s15)*4(RINBLKS), m4, m4; #define LOAD_MSG_0(m1, m2, m3, m4) 
\ GATHER_MSG(m1, m2, m3, m4, \ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) #define LOAD_MSG_1(m1, m2, m3, m4) \ GATHER_MSG(m1, m2, m3, m4, \ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3) #define LOAD_MSG_2(m1, m2, m3, m4) \ GATHER_MSG(m1, m2, m3, m4, \ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4) #define LOAD_MSG_3(m1, m2, m3, m4) \ GATHER_MSG(m1, m2, m3, m4, \ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8) #define LOAD_MSG_4(m1, m2, m3, m4) \ GATHER_MSG(m1, m2, m3, m4, \ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13) #define LOAD_MSG_5(m1, m2, m3, m4) \ GATHER_MSG(m1, m2, m3, m4, \ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9) #define LOAD_MSG_6(m1, m2, m3, m4) \ GATHER_MSG(m1, m2, m3, m4, \ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11) #define LOAD_MSG_7(m1, m2, m3, m4) \ GATHER_MSG(m1, m2, m3, m4, \ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10) #define LOAD_MSG_8(m1, m2, m3, m4) \ GATHER_MSG(m1, m2, m3, m4, \ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5) #define LOAD_MSG_9(m1, m2, m3, m4) \ GATHER_MSG(m1, m2, m3, m4, \ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0) #define LOAD_MSG(r, m1, m2, m3, m4) LOAD_MSG_##r(m1, m2, m3, m4) #define ROR_16(in, out) vpshufb R16, in, out; #define ROR_8(in, out) vpshufb R8, in, out; #define ROR_12(in, out) \ vpsrld $12, in, TMP1; \ vpslld $(32 - 12), in, out; \ vpxor TMP1, out, out; #define ROR_7(in, out) \ vpsrld $7, in, TMP1; \ vpslld $(32 - 7), in, out; \ vpxor TMP1, out, out; #define G(r1, r2, r3, r4, m, ROR_A, ROR_B) \ vpaddd m, r1, r1; \ vpaddd r2, r1, r1; \ vpxor r1, r4, r4; \ ROR_A(r4, r4); \ vpaddd r4, r3, r3; \ vpxor r3, r2, r2; \ ROR_B(r2, r2); #define G1(r1, r2, r3, r4, m) \ G(r1, r2, r3, r4, m, ROR_16, ROR_12); #define G2(r1, r2, r3, r4, m) \ G(r1, r2, r3, r4, m, ROR_8, ROR_7); #define MM_SHUFFLE(z,y,x,w) \ (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) #define DIAGONALIZE(r1, r2, r3, r4) \ vpshufd $MM_SHUFFLE(0,3,2,1), r2, r2; \ vpshufd $MM_SHUFFLE(1,0,3,2), r3, r3; \ vpshufd $MM_SHUFFLE(2,1,0,3), r4, r4; #define UNDIAGONALIZE(r1, r2, r3, r4) \ vpshufd $MM_SHUFFLE(2,1,0,3), r2, r2; \ vpshufd $MM_SHUFFLE(1,0,3,2), r3, r3; \ vpshufd $MM_SHUFFLE(0,3,2,1), r4, r4; #define ROUND(r, m1, m2, m3, m4) \ G1(ROW1, ROW2, ROW3, ROW4, m1); \ G2(ROW1, ROW2, ROW3, ROW4, m2); \ DIAGONALIZE(ROW1, ROW2, ROW3, ROW4); \ G1(ROW1, ROW2, ROW3, ROW4, m3); \ G2(ROW1, ROW2, ROW3, ROW4, m4); \ UNDIAGONALIZE(ROW1, ROW2, ROW3, ROW4); blake2s_data: .align 16 .Liv: .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A .long 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 .Lshuf_ror16: .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 .Lshuf_ror8: .byte 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12 .align 64 .globl _gcry_blake2s_transform_amd64_avx ELF(.type _gcry_blake2s_transform_amd64_avx,@function;) _gcry_blake2s_transform_amd64_avx: /* input: * %rdi: state * %rsi: blks * %rdx: num_blks */ + CFI_STARTPROC(); vzeroupper; addq $64, (STATE_T + 0)(RSTATE); vmovdqa .Lshuf_ror16 (RIP), R16; vmovdqa .Lshuf_ror8 (RIP), R8; vmovdqa .Liv+(0 * 4) (RIP), ROW3; vmovdqa .Liv+(4 * 4) (RIP), ROW4; vmovdqu (STATE_H + 0 * 4)(RSTATE), ROW1; vmovdqu (STATE_H + 4 * 4)(RSTATE), ROW2; vpxor (STATE_T)(RSTATE), ROW4, ROW4; LOAD_MSG(0, MA1, MA2, MA3, MA4); LOAD_MSG(1, MB1, MB2, MB3, MB4); .Loop: ROUND(0, MA1, MA2, MA3, MA4); LOAD_MSG(2, MA1, MA2, MA3, MA4); ROUND(1, MB1, MB2, MB3, MB4); LOAD_MSG(3, MB1, MB2, MB3, MB4); ROUND(2, MA1, MA2, MA3, MA4); LOAD_MSG(4, MA1, MA2, MA3, MA4); ROUND(3, MB1, MB2, MB3, MB4); LOAD_MSG(5, MB1, 
MB2, MB3, MB4); ROUND(4, MA1, MA2, MA3, MA4); LOAD_MSG(6, MA1, MA2, MA3, MA4); ROUND(5, MB1, MB2, MB3, MB4); LOAD_MSG(7, MB1, MB2, MB3, MB4); ROUND(6, MA1, MA2, MA3, MA4); LOAD_MSG(8, MA1, MA2, MA3, MA4); ROUND(7, MB1, MB2, MB3, MB4); LOAD_MSG(9, MB1, MB2, MB3, MB4); sub $1, RNBLKS; jz .Loop_end; lea 64(RINBLKS), RINBLKS; addq $64, (STATE_T + 0)(RSTATE); ROUND(8, MA1, MA2, MA3, MA4); LOAD_MSG(0, MA1, MA2, MA3, MA4); ROUND(9, MB1, MB2, MB3, MB4); LOAD_MSG(1, MB1, MB2, MB3, MB4); vpxor ROW3, ROW1, ROW1; vpxor ROW4, ROW2, ROW2; vmovdqa .Liv+(0 * 4) (RIP), ROW3; vmovdqa .Liv+(4 * 4) (RIP), ROW4; vpxor (STATE_H + 0 * 4)(RSTATE), ROW1, ROW1; vpxor (STATE_H + 4 * 4)(RSTATE), ROW2, ROW2; vmovdqu ROW1, (STATE_H + 0 * 4)(RSTATE); vmovdqu ROW2, (STATE_H + 4 * 4)(RSTATE); vpxor (STATE_T)(RSTATE), ROW4, ROW4; jmp .Loop; .Loop_end: ROUND(8, MA1, MA2, MA3, MA4); ROUND(9, MB1, MB2, MB3, MB4); vpxor ROW3, ROW1, ROW1; vpxor ROW4, ROW2, ROW2; vpxor (STATE_H + 0 * 4)(RSTATE), ROW1, ROW1; vpxor (STATE_H + 4 * 4)(RSTATE), ROW2, ROW2; vmovdqu ROW1, (STATE_H + 0 * 4)(RSTATE); vmovdqu ROW2, (STATE_H + 4 * 4)(RSTATE); xor %eax, %eax; vzeroall; ret; + CFI_ENDPROC(); ELF(.size _gcry_blake2s_transform_amd64_avx, .-_gcry_blake2s_transform_amd64_avx;) #endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/ #endif /*__x86_64*/ diff --git a/cipher/blowfish-amd64.S b/cipher/blowfish-amd64.S index 02d3b710..bdb361d7 100644 --- a/cipher/blowfish-amd64.S +++ b/cipher/blowfish-amd64.S @@ -1,555 +1,601 @@ /* blowfish-amd64.S - AMD64 assembly implementation of Blowfish cipher * * Copyright (C) 2013 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
*/ #ifdef __x86_64 #include #if defined(USE_BLOWFISH) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) #include "asm-common-amd64.h" .text /* structure of BLOWFISH_context: */ #define s0 0 #define s1 ((s0) + 256 * 4) #define s2 ((s1) + 256 * 4) #define s3 ((s2) + 256 * 4) #define p ((s3) + 256 * 4) /* register macros */ #define CTX %rdi #define RIO %rsi #define RX0 %rax #define RX1 %rbx #define RX2 %rcx #define RX3 %rdx #define RX0d %eax #define RX1d %ebx #define RX2d %ecx #define RX3d %edx #define RX0bl %al #define RX1bl %bl #define RX2bl %cl #define RX3bl %dl #define RX0bh %ah #define RX1bh %bh #define RX2bh %ch #define RX3bh %dh #define RT0 %rbp #define RT1 %rsi #define RT2 %r8 #define RT3 %r9 #define RT0d %ebp #define RT1d %esi #define RT2d %r8d #define RT3d %r9d #define RKEY %r10 /*********************************************************************** * 1-way blowfish ***********************************************************************/ #define F() \ movzbl RX0bh, RT1d; \ movzbl RX0bl, RT3d; \ rorq $16, RX0; \ movzbl RX0bh, RT0d; \ movzbl RX0bl, RT2d; \ rorq $16, RX0; \ movl s0(CTX,RT0,4), RT0d; \ addl s1(CTX,RT2,4), RT0d; \ xorl s2(CTX,RT1,4), RT0d; \ addl s3(CTX,RT3,4), RT0d; \ xorq RT0, RX0; #define load_roundkey_enc(n) \ movq p+4*(n)(CTX), RX3; #define add_roundkey_enc() \ xorq RX3, RX0; #define round_enc(n) \ add_roundkey_enc(); \ load_roundkey_enc(n); \ \ F(); \ F(); #define load_roundkey_dec(n) \ movq p+4*(n-1)(CTX), RX3; \ rorq $32, RX3; #define add_roundkey_dec() \ xorq RX3, RX0; #define round_dec(n) \ add_roundkey_dec(); \ load_roundkey_dec(n); \ \ F(); \ F(); #define read_block() \ movq (RIO), RX0; \ rorq $32, RX0; \ bswapq RX0; #define write_block() \ bswapq RX0; \ movq RX0, (RIO); .align 8 ELF(.type __blowfish_enc_blk1,@function;) __blowfish_enc_blk1: /* input: * %rdi: ctx, CTX * RX0: input plaintext block * output: * RX0: output plaintext block */ + CFI_STARTPROC(); movq %rbp, %r11; + CFI_REGISTER(%rbp, %r11); load_roundkey_enc(0); round_enc(2); round_enc(4); round_enc(6); round_enc(8); round_enc(10); round_enc(12); round_enc(14); round_enc(16); add_roundkey_enc(); movq %r11, %rbp; + CFI_RESTORE(%rbp) ret; + CFI_ENDPROC(); ELF(.size __blowfish_enc_blk1,.-__blowfish_enc_blk1;) .align 8 .globl _gcry_blowfish_amd64_do_encrypt ELF(.type _gcry_blowfish_amd64_do_encrypt,@function;) _gcry_blowfish_amd64_do_encrypt: /* input: * %rdi: ctx, CTX * %rsi: u32 *ret_xl * %rdx: u32 *ret_xr */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 movl (%rdx), RX0d; shlq $32, RX0; movl (%rsi), RT3d; movq %rdx, %r10; orq RT3, RX0; movq %rsi, RX2; call __blowfish_enc_blk1; movl RX0d, (%r10); shrq $32, RX0; movl RX0d, (RX2); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_do_encrypt,.-_gcry_blowfish_amd64_do_encrypt;) .align 8 .globl _gcry_blowfish_amd64_encrypt_block ELF(.type _gcry_blowfish_amd64_encrypt_block,@function;) _gcry_blowfish_amd64_encrypt_block: /* input: * %rdi: ctx, CTX * %rsi: dst * %rdx: src */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 movq %rsi, %r10; movq %rdx, RIO; read_block(); call __blowfish_enc_blk1; movq %r10, RIO; write_block(); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_encrypt_block,.-_gcry_blowfish_amd64_encrypt_block;) .align 8 .globl _gcry_blowfish_amd64_decrypt_block ELF(.type _gcry_blowfish_amd64_decrypt_block,@function;) _gcry_blowfish_amd64_decrypt_block: /* input: * %rdi: ctx, CTX * %rsi: dst * %rdx: src */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 
movq %rbp, %r11; + CFI_REGISTER(%rbp, %r11); movq %rsi, %r10; movq %rdx, RIO; read_block(); load_roundkey_dec(17); round_dec(15); round_dec(13); round_dec(11); round_dec(9); round_dec(7); round_dec(5); round_dec(3); round_dec(1); add_roundkey_dec(); movq %r10, RIO; write_block(); movq %r11, %rbp; + CFI_RESTORE(%rbp); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_decrypt_block,.-_gcry_blowfish_amd64_decrypt_block;) /********************************************************************** 4-way blowfish, four blocks parallel **********************************************************************/ #define F4(x) \ movzbl x ## bh, RT1d; \ movzbl x ## bl, RT3d; \ rorq $16, x; \ movzbl x ## bh, RT0d; \ movzbl x ## bl, RT2d; \ rorq $16, x; \ movl s0(CTX,RT0,4), RT0d; \ addl s1(CTX,RT2,4), RT0d; \ xorl s2(CTX,RT1,4), RT0d; \ addl s3(CTX,RT3,4), RT0d; \ xorq RT0, x; #define add_preloaded_roundkey4() \ xorq RKEY, RX0; \ xorq RKEY, RX1; \ xorq RKEY, RX2; \ xorq RKEY, RX3; #define preload_roundkey_enc(n) \ movq p+4*(n)(CTX), RKEY; #define add_roundkey_enc4(n) \ add_preloaded_roundkey4(); \ preload_roundkey_enc(n + 2); #define round_enc4(n) \ add_roundkey_enc4(n); \ \ F4(RX0); \ F4(RX1); \ F4(RX2); \ F4(RX3); \ \ F4(RX0); \ F4(RX1); \ F4(RX2); \ F4(RX3); #define preload_roundkey_dec(n) \ movq p+4*((n)-1)(CTX), RKEY; \ rorq $32, RKEY; #define add_roundkey_dec4(n) \ add_preloaded_roundkey4(); \ preload_roundkey_dec(n - 2); #define round_dec4(n) \ add_roundkey_dec4(n); \ \ F4(RX0); \ F4(RX1); \ F4(RX2); \ F4(RX3); \ \ F4(RX0); \ F4(RX1); \ F4(RX2); \ F4(RX3); #define inbswap_block4() \ rorq $32, RX0; \ bswapq RX0; \ rorq $32, RX1; \ bswapq RX1; \ rorq $32, RX2; \ bswapq RX2; \ rorq $32, RX3; \ bswapq RX3; #define inctrswap_block4() \ rorq $32, RX0; \ rorq $32, RX1; \ rorq $32, RX2; \ rorq $32, RX3; #define outbswap_block4() \ bswapq RX0; \ bswapq RX1; \ bswapq RX2; \ bswapq RX3; .align 8 ELF(.type __blowfish_enc_blk4,@function;) __blowfish_enc_blk4: /* input: * %rdi: ctx, CTX * RX0,RX1,RX2,RX3: four input inbswapped plaintext blocks * output: * RX0,RX1,RX2,RX3: four output ciphertext blocks */ + CFI_STARTPROC(); preload_roundkey_enc(0); round_enc4(0); round_enc4(2); round_enc4(4); round_enc4(6); round_enc4(8); round_enc4(10); round_enc4(12); round_enc4(14); add_preloaded_roundkey4(); outbswap_block4(); ret; + CFI_ENDPROC(); ELF(.size __blowfish_enc_blk4,.-__blowfish_enc_blk4;) .align 8 ELF(.type __blowfish_dec_blk4,@function;) __blowfish_dec_blk4: /* input: * %rdi: ctx, CTX * RX0,RX1,RX2,RX3: four input ciphertext blocks * output: * RX0,RX1,RX2,RX3: four output plaintext blocks */ + CFI_STARTPROC(); preload_roundkey_dec(17); inbswap_block4(); round_dec4(17); round_dec4(15); round_dec4(13); round_dec4(11); round_dec4(9); round_dec4(7); round_dec4(5); round_dec4(3); add_preloaded_roundkey4(); outbswap_block4(); ret; + CFI_ENDPROC(); ELF(.size __blowfish_dec_blk4,.-__blowfish_dec_blk4;) .align 8 .globl _gcry_blowfish_amd64_ctr_enc ELF(.type _gcry_blowfish_amd64_ctr_enc,@function;) _gcry_blowfish_amd64_ctr_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (4 blocks) * %rdx: src (4 blocks) * %rcx: iv (big endian, 64bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); pushq %r12; + CFI_PUSH(%r12); pushq %r13; + CFI_PUSH(%r13); /* %r11-%r13 are not used by __blowfish_enc_blk4 */ movq %rcx, %r13; /*iv*/ movq %rdx, %r12; /*src*/ movq %rsi, %r11; /*dst*/ /* load IV and byteswap */ movq (%r13), RT0; bswapq RT0; movq RT0, RX0; /* construct IVs 
*/ leaq 1(RT0), RX1; leaq 2(RT0), RX2; leaq 3(RT0), RX3; leaq 4(RT0), RT0; bswapq RT0; inctrswap_block4(); /* store new IV */ movq RT0, (%r13); call __blowfish_enc_blk4; /* XOR key-stream with plaintext */ xorq 0 * 8(%r12), RX0; xorq 1 * 8(%r12), RX1; xorq 2 * 8(%r12), RX2; xorq 3 * 8(%r12), RX3; movq RX0, 0 * 8(%r11); movq RX1, 1 * 8(%r11); movq RX2, 2 * 8(%r11); movq RX3, 3 * 8(%r11); popq %r13; + CFI_POP(%r13); popq %r12; + CFI_POP(%r12); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_ctr_enc,.-_gcry_blowfish_amd64_ctr_enc;) .align 8 .globl _gcry_blowfish_amd64_cbc_dec ELF(.type _gcry_blowfish_amd64_cbc_dec,@function;) _gcry_blowfish_amd64_cbc_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (4 blocks) * %rdx: src (4 blocks) * %rcx: iv (64bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); pushq %r12; + CFI_PUSH(%r12); pushq %r13; + CFI_PUSH(%r13); /* %r11-%r13 are not used by __blowfish_dec_blk4 */ movq %rsi, %r11; /*dst*/ movq %rdx, %r12; /*src*/ movq %rcx, %r13; /*iv*/ /* load input */ movq 0 * 8(%r12), RX0; movq 1 * 8(%r12), RX1; movq 2 * 8(%r12), RX2; movq 3 * 8(%r12), RX3; call __blowfish_dec_blk4; movq 3 * 8(%r12), RT0; xorq (%r13), RX0; xorq 0 * 8(%r12), RX1; xorq 1 * 8(%r12), RX2; xorq 2 * 8(%r12), RX3; movq RT0, (%r13); /* store new IV */ movq RX0, 0 * 8(%r11); movq RX1, 1 * 8(%r11); movq RX2, 2 * 8(%r11); movq RX3, 3 * 8(%r11); popq %r13; + CFI_POP(%r13); popq %r12; + CFI_POP(%r12); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_cbc_dec,.-_gcry_blowfish_amd64_cbc_dec;) .align 8 .globl _gcry_blowfish_amd64_cfb_dec ELF(.type _gcry_blowfish_amd64_cfb_dec,@function;) _gcry_blowfish_amd64_cfb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (4 blocks) * %rdx: src (4 blocks) * %rcx: iv (64bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); pushq %r12; + CFI_PUSH(%r12); pushq %r13; + CFI_PUSH(%r13); /* %r11-%r13 are not used by __blowfish_enc_blk4 */ movq %rcx, %r13; /*iv*/ movq %rdx, %r12; /*src*/ movq %rsi, %r11; /*dst*/ /* Load input */ movq (%r13), RX0; movq 0 * 8(%r12), RX1; movq 1 * 8(%r12), RX2; movq 2 * 8(%r12), RX3; inbswap_block4(); /* Update IV */ movq 3 * 8(%r12), RT0; movq RT0, (%r13); call __blowfish_enc_blk4; xorq 0 * 8(%r12), RX0; xorq 1 * 8(%r12), RX1; xorq 2 * 8(%r12), RX2; xorq 3 * 8(%r12), RX3; movq RX0, 0 * 8(%r11); movq RX1, 1 * 8(%r11); movq RX2, 2 * 8(%r11); movq RX3, 3 * 8(%r11); popq %r13; + CFI_POP(%r13); popq %r12; + CFI_POP(%r12); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_cfb_dec,.-_gcry_blowfish_amd64_cfb_dec;) #endif /*defined(USE_BLOWFISH)*/ #endif /*__x86_64*/ diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S index 8022934f..e16d4f61 100644 --- a/cipher/camellia-aesni-avx-amd64.S +++ b/cipher/camellia-aesni-avx-amd64.S @@ -1,2591 +1,2647 @@ /* camellia-avx-aesni-amd64.S - AES-NI/AVX implementation of Camellia cipher * * Copyright (C) 2013-2015 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. 
* * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT) -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "asm-common-amd64.h" #define CAMELLIA_TABLE_BYTE_LEN 272 /* struct CAMELLIA_context: */ #define key_table 0 #define key_bitlength CAMELLIA_TABLE_BYTE_LEN /* register macros */ #define CTX %rdi #define RIO %r8 /********************************************************************** helper macros **********************************************************************/ #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ vpand x, mask4bit, tmp0; \ vpandn x, mask4bit, x; \ vpsrld $4, x, x; \ \ vpshufb tmp0, lo_t, tmp0; \ vpshufb x, hi_t, x; \ vpxor tmp0, x, x; /********************************************************************** 16-way camellia **********************************************************************/ /* * IN: * x0..x7: byte-sliced AB state * mem_cd: register pointer storing CD state * key: index for key material * OUT: * x0..x7: new byte-sliced CD state */ #define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \ t7, mem_cd, key) \ /* \ * S-function with AES subbytes \ */ \ - vmovdqa .Linv_shift_row RIP, t4; \ - vbroadcastss .L0f0f0f0f RIP, t7; \ - vmovdqa .Lpre_tf_lo_s1 RIP, t0; \ - vmovdqa .Lpre_tf_hi_s1 RIP, t1; \ + vmovdqa .Linv_shift_row rRIP, t4; \ + vbroadcastss .L0f0f0f0f rRIP, t7; \ + vmovdqa .Lpre_tf_lo_s1 rRIP, t0; \ + vmovdqa .Lpre_tf_hi_s1 rRIP, t1; \ \ /* AES inverse shift rows */ \ vpshufb t4, x0, x0; \ vpshufb t4, x7, x7; \ vpshufb t4, x1, x1; \ vpshufb t4, x4, x4; \ vpshufb t4, x2, x2; \ vpshufb t4, x5, x5; \ vpshufb t4, x3, x3; \ vpshufb t4, x6, x6; \ \ /* prefilter sboxes 1, 2 and 3 */ \ - vmovdqa .Lpre_tf_lo_s4 RIP, t2; \ - vmovdqa .Lpre_tf_hi_s4 RIP, t3; \ + vmovdqa .Lpre_tf_lo_s4 rRIP, t2; \ + vmovdqa .Lpre_tf_hi_s4 rRIP, t3; \ filter_8bit(x0, t0, t1, t7, t6); \ filter_8bit(x7, t0, t1, t7, t6); \ filter_8bit(x1, t0, t1, t7, t6); \ filter_8bit(x4, t0, t1, t7, t6); \ filter_8bit(x2, t0, t1, t7, t6); \ filter_8bit(x5, t0, t1, t7, t6); \ \ /* prefilter sbox 4 */ \ vpxor t4, t4, t4; \ filter_8bit(x3, t2, t3, t7, t6); \ filter_8bit(x6, t2, t3, t7, t6); \ \ /* AES subbytes + AES shift rows */ \ - vmovdqa .Lpost_tf_lo_s1 RIP, t0; \ - vmovdqa .Lpost_tf_hi_s1 RIP, t1; \ + vmovdqa .Lpost_tf_lo_s1 rRIP, t0; \ + vmovdqa .Lpost_tf_hi_s1 rRIP, t1; \ vaesenclast t4, x0, x0; \ vaesenclast t4, x7, x7; \ vaesenclast t4, x1, x1; \ vaesenclast t4, x4, x4; \ vaesenclast t4, x2, x2; \ vaesenclast t4, x5, x5; \ vaesenclast t4, x3, x3; \ vaesenclast t4, x6, x6; \ \ /* postfilter sboxes 1 and 4 */ \ - vmovdqa .Lpost_tf_lo_s3 RIP, t2; \ - vmovdqa .Lpost_tf_hi_s3 RIP, t3; \ + vmovdqa .Lpost_tf_lo_s3 rRIP, t2; \ + vmovdqa .Lpost_tf_hi_s3 rRIP, t3; \ filter_8bit(x0, t0, t1, t7, t6); \ filter_8bit(x7, t0, t1, t7, t6); \ filter_8bit(x3, t0, t1, t7, t6); \ filter_8bit(x6, t0, t1, t7, t6); \ \ /* postfilter sbox 3 */ \ - 
vmovdqa .Lpost_tf_lo_s2 RIP, t4; \ - vmovdqa .Lpost_tf_hi_s2 RIP, t5; \ + vmovdqa .Lpost_tf_lo_s2 rRIP, t4; \ + vmovdqa .Lpost_tf_hi_s2 rRIP, t5; \ filter_8bit(x2, t2, t3, t7, t6); \ filter_8bit(x5, t2, t3, t7, t6); \ \ vpxor t6, t6, t6; \ vmovq key, t0; \ \ /* postfilter sbox 2 */ \ filter_8bit(x1, t4, t5, t7, t2); \ filter_8bit(x4, t4, t5, t7, t2); \ \ vpsrldq $5, t0, t5; \ vpsrldq $1, t0, t1; \ vpsrldq $2, t0, t2; \ vpsrldq $3, t0, t3; \ vpsrldq $4, t0, t4; \ vpshufb t6, t0, t0; \ vpshufb t6, t1, t1; \ vpshufb t6, t2, t2; \ vpshufb t6, t3, t3; \ vpshufb t6, t4, t4; \ vpsrldq $2, t5, t7; \ vpshufb t6, t7, t7; \ \ /* P-function */ \ vpxor x5, x0, x0; \ vpxor x6, x1, x1; \ vpxor x7, x2, x2; \ vpxor x4, x3, x3; \ \ vpxor x2, x4, x4; \ vpxor x3, x5, x5; \ vpxor x0, x6, x6; \ vpxor x1, x7, x7; \ \ vpxor x7, x0, x0; \ vpxor x4, x1, x1; \ vpxor x5, x2, x2; \ vpxor x6, x3, x3; \ \ vpxor x3, x4, x4; \ vpxor x0, x5, x5; \ vpxor x1, x6, x6; \ vpxor x2, x7, x7; /* note: high and low parts swapped */ \ \ /* Add key material and result to CD (x becomes new CD) */ \ \ vpxor t3, x4, x4; \ vpxor 0 * 16(mem_cd), x4, x4; \ \ vpxor t2, x5, x5; \ vpxor 1 * 16(mem_cd), x5, x5; \ \ vpsrldq $1, t5, t3; \ vpshufb t6, t5, t5; \ vpshufb t6, t3, t6; \ \ vpxor t1, x6, x6; \ vpxor 2 * 16(mem_cd), x6, x6; \ \ vpxor t0, x7, x7; \ vpxor 3 * 16(mem_cd), x7, x7; \ \ vpxor t7, x0, x0; \ vpxor 4 * 16(mem_cd), x0, x0; \ \ vpxor t6, x1, x1; \ vpxor 5 * 16(mem_cd), x1, x1; \ \ vpxor t5, x2, x2; \ vpxor 6 * 16(mem_cd), x2, x2; \ \ vpxor t4, x3, x3; \ vpxor 7 * 16(mem_cd), x3, x3; /* * IN/OUT: * x0..x7: byte-sliced AB state preloaded * mem_ab: byte-sliced AB state in memory * mem_cb: byte-sliced CD state in memory */ #define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, i, dir, store_ab) \ roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_cd, (key_table + (i) * 8)(CTX)); \ \ vmovdqu x4, 0 * 16(mem_cd); \ vmovdqu x5, 1 * 16(mem_cd); \ vmovdqu x6, 2 * 16(mem_cd); \ vmovdqu x7, 3 * 16(mem_cd); \ vmovdqu x0, 4 * 16(mem_cd); \ vmovdqu x1, 5 * 16(mem_cd); \ vmovdqu x2, 6 * 16(mem_cd); \ vmovdqu x3, 7 * 16(mem_cd); \ \ roundsm16(x4, x5, x6, x7, x0, x1, x2, x3, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, (key_table + ((i) + (dir)) * 8)(CTX)); \ \ store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab); #define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */ #define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \ /* Store new AB state */ \ vmovdqu x0, 0 * 16(mem_ab); \ vmovdqu x1, 1 * 16(mem_ab); \ vmovdqu x2, 2 * 16(mem_ab); \ vmovdqu x3, 3 * 16(mem_ab); \ vmovdqu x4, 4 * 16(mem_ab); \ vmovdqu x5, 5 * 16(mem_ab); \ vmovdqu x6, 6 * 16(mem_ab); \ vmovdqu x7, 7 * 16(mem_ab); #define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, i) \ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store); #define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, i) \ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, 
y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store); /* * IN: * v0..3: byte-sliced 32-bit integers * OUT: * v0..3: (IN <<< 1) */ #define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \ vpcmpgtb v0, zero, t0; \ vpaddb v0, v0, v0; \ vpabsb t0, t0; \ \ vpcmpgtb v1, zero, t1; \ vpaddb v1, v1, v1; \ vpabsb t1, t1; \ \ vpcmpgtb v2, zero, t2; \ vpaddb v2, v2, v2; \ vpabsb t2, t2; \ \ vpor t0, v1, v1; \ \ vpcmpgtb v3, zero, t0; \ vpaddb v3, v3, v3; \ vpabsb t0, t0; \ \ vpor t1, v2, v2; \ vpor t2, v3, v3; \ vpor t0, v0, v0; /* * IN: * r: byte-sliced AB state in memory * l: byte-sliced CD state in memory * OUT: * x0..x7: new byte-sliced CD state */ #define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \ tt1, tt2, tt3, kll, klr, krl, krr) \ /* \ * t0 = kll; \ * t0 &= ll; \ * lr ^= rol32(t0, 1); \ */ \ vpxor tt0, tt0, tt0; \ vmovd kll, t0; \ vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t0; \ \ vpand l0, t0, t0; \ vpand l1, t1, t1; \ vpand l2, t2, t2; \ vpand l3, t3, t3; \ \ rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ \ vpxor l4, t0, l4; \ vmovdqu l4, 4 * 16(l); \ vpxor l5, t1, l5; \ vmovdqu l5, 5 * 16(l); \ vpxor l6, t2, l6; \ vmovdqu l6, 6 * 16(l); \ vpxor l7, t3, l7; \ vmovdqu l7, 7 * 16(l); \ \ /* \ * t2 = krr; \ * t2 |= rr; \ * rl ^= t2; \ */ \ \ vmovd krr, t0; \ vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t0; \ \ vpor 4 * 16(r), t0, t0; \ vpor 5 * 16(r), t1, t1; \ vpor 6 * 16(r), t2, t2; \ vpor 7 * 16(r), t3, t3; \ \ vpxor 0 * 16(r), t0, t0; \ vpxor 1 * 16(r), t1, t1; \ vpxor 2 * 16(r), t2, t2; \ vpxor 3 * 16(r), t3, t3; \ vmovdqu t0, 0 * 16(r); \ vmovdqu t1, 1 * 16(r); \ vmovdqu t2, 2 * 16(r); \ vmovdqu t3, 3 * 16(r); \ \ /* \ * t2 = krl; \ * t2 &= rl; \ * rr ^= rol32(t2, 1); \ */ \ vmovd krl, t0; \ vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t0; \ \ vpand 0 * 16(r), t0, t0; \ vpand 1 * 16(r), t1, t1; \ vpand 2 * 16(r), t2, t2; \ vpand 3 * 16(r), t3, t3; \ \ rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ \ vpxor 4 * 16(r), t0, t0; \ vpxor 5 * 16(r), t1, t1; \ vpxor 6 * 16(r), t2, t2; \ vpxor 7 * 16(r), t3, t3; \ vmovdqu t0, 4 * 16(r); \ vmovdqu t1, 5 * 16(r); \ vmovdqu t2, 6 * 16(r); \ vmovdqu t3, 7 * 16(r); \ \ /* \ * t0 = klr; \ * t0 |= lr; \ * ll ^= t0; \ */ \ \ vmovd klr, t0; \ vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t0; \ \ vpor l4, t0, t0; \ vpor l5, t1, t1; \ vpor l6, t2, t2; \ vpor l7, t3, t3; \ \ vpxor l0, t0, l0; \ vmovdqu l0, 0 * 16(l); \ vpxor l1, t1, l1; \ vmovdqu l1, 1 * 16(l); \ vpxor l2, t2, l2; \ vmovdqu l2, 2 * 16(l); \ vpxor l3, t3, l3; \ vmovdqu l3, 3 * 16(l); #define transpose_4x4(x0, x1, x2, x3, t1, t2) \ vpunpckhdq x1, x0, t2; \ vpunpckldq x1, x0, x0; \ \ vpunpckldq x3, x2, t1; \ vpunpckhdq x3, x2, x2; \ \ vpunpckhqdq t1, x0, x1; \ vpunpcklqdq t1, x0, x0; \ \ vpunpckhqdq x2, t2, x3; \ vpunpcklqdq x2, t2, x2; #define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \ a3, b3, c3, d3, st0, st1) \ vmovdqu d2, st0; \ vmovdqu d3, st1; \ transpose_4x4(a0, a1, a2, a3, 
d2, d3); \ transpose_4x4(b0, b1, b2, b3, d2, d3); \ vmovdqu st0, d2; \ vmovdqu st1, d3; \ \ vmovdqu a0, st0; \ vmovdqu a1, st1; \ transpose_4x4(c0, c1, c2, c3, a0, a1); \ transpose_4x4(d0, d1, d2, d3, a0, a1); \ \ - vmovdqu .Lshufb_16x16b RIP, a0; \ + vmovdqu .Lshufb_16x16b rRIP, a0; \ vmovdqu st1, a1; \ vpshufb a0, a2, a2; \ vpshufb a0, a3, a3; \ vpshufb a0, b0, b0; \ vpshufb a0, b1, b1; \ vpshufb a0, b2, b2; \ vpshufb a0, b3, b3; \ vpshufb a0, a1, a1; \ vpshufb a0, c0, c0; \ vpshufb a0, c1, c1; \ vpshufb a0, c2, c2; \ vpshufb a0, c3, c3; \ vpshufb a0, d0, d0; \ vpshufb a0, d1, d1; \ vpshufb a0, d2, d2; \ vpshufb a0, d3, d3; \ vmovdqu d3, st1; \ vmovdqu st0, d3; \ vpshufb a0, d3, a0; \ vmovdqu d2, st0; \ \ transpose_4x4(a0, b0, c0, d0, d2, d3); \ transpose_4x4(a1, b1, c1, d1, d2, d3); \ vmovdqu st0, d2; \ vmovdqu st1, d3; \ \ vmovdqu b0, st0; \ vmovdqu b1, st1; \ transpose_4x4(a2, b2, c2, d2, b0, b1); \ transpose_4x4(a3, b3, c3, d3, b0, b1); \ vmovdqu st0, b0; \ vmovdqu st1, b1; \ /* does not adjust output bytes inside vectors */ #define transpose_8x8b(a, b, c, d, e, f, g, h, t0, t1, t2, t3, t4) \ vpunpcklbw a, b, t0; \ vpunpckhbw a, b, b; \ \ vpunpcklbw c, d, t1; \ vpunpckhbw c, d, d; \ \ vpunpcklbw e, f, t2; \ vpunpckhbw e, f, f; \ \ vpunpcklbw g, h, t3; \ vpunpckhbw g, h, h; \ \ vpunpcklwd t0, t1, g; \ vpunpckhwd t0, t1, t0; \ \ vpunpcklwd b, d, t1; \ vpunpckhwd b, d, e; \ \ vpunpcklwd t2, t3, c; \ vpunpckhwd t2, t3, t2; \ \ vpunpcklwd f, h, t3; \ vpunpckhwd f, h, b; \ \ vpunpcklwd e, b, t4; \ vpunpckhwd e, b, b; \ \ vpunpcklwd t1, t3, e; \ vpunpckhwd t1, t3, f; \ \ - vmovdqa .Ltranspose_8x8_shuf RIP, t3; \ + vmovdqa .Ltranspose_8x8_shuf rRIP, t3; \ \ vpunpcklwd g, c, d; \ vpunpckhwd g, c, c; \ \ vpunpcklwd t0, t2, t1; \ vpunpckhwd t0, t2, h; \ \ vpunpckhqdq b, h, a; \ vpshufb t3, a, a; \ vpunpcklqdq b, h, b; \ vpshufb t3, b, b; \ \ vpunpckhqdq e, d, g; \ vpshufb t3, g, g; \ vpunpcklqdq e, d, h; \ vpshufb t3, h, h; \ \ vpunpckhqdq f, c, e; \ vpshufb t3, e, e; \ vpunpcklqdq f, c, f; \ vpshufb t3, f, f; \ \ vpunpckhqdq t4, t1, c; \ vpshufb t3, c, c; \ vpunpcklqdq t4, t1, d; \ vpshufb t3, d, d; /* load blocks to registers and apply pre-whitening */ #define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, rio, key) \ vmovq key, x0; \ - vpshufb .Lpack_bswap RIP, x0, x0; \ + vpshufb .Lpack_bswap rRIP, x0, x0; \ \ vpxor 0 * 16(rio), x0, y7; \ vpxor 1 * 16(rio), x0, y6; \ vpxor 2 * 16(rio), x0, y5; \ vpxor 3 * 16(rio), x0, y4; \ vpxor 4 * 16(rio), x0, y3; \ vpxor 5 * 16(rio), x0, y2; \ vpxor 6 * 16(rio), x0, y1; \ vpxor 7 * 16(rio), x0, y0; \ vpxor 8 * 16(rio), x0, x7; \ vpxor 9 * 16(rio), x0, x6; \ vpxor 10 * 16(rio), x0, x5; \ vpxor 11 * 16(rio), x0, x4; \ vpxor 12 * 16(rio), x0, x3; \ vpxor 13 * 16(rio), x0, x2; \ vpxor 14 * 16(rio), x0, x1; \ vpxor 15 * 16(rio), x0, x0; /* byteslice pre-whitened blocks and store to temporary memory */ #define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd) \ byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \ y4, y5, y6, y7, (mem_ab), (mem_cd)); \ \ vmovdqu x0, 0 * 16(mem_ab); \ vmovdqu x1, 1 * 16(mem_ab); \ vmovdqu x2, 2 * 16(mem_ab); \ vmovdqu x3, 3 * 16(mem_ab); \ vmovdqu x4, 4 * 16(mem_ab); \ vmovdqu x5, 5 * 16(mem_ab); \ vmovdqu x6, 6 * 16(mem_ab); \ vmovdqu x7, 7 * 16(mem_ab); \ vmovdqu y0, 0 * 16(mem_cd); \ vmovdqu y1, 1 * 16(mem_cd); \ vmovdqu y2, 2 * 16(mem_cd); \ vmovdqu y3, 3 * 16(mem_cd); \ vmovdqu y4, 4 * 16(mem_cd); \ vmovdqu y5, 5 * 16(mem_cd); 
\ vmovdqu y6, 6 * 16(mem_cd); \ vmovdqu y7, 7 * 16(mem_cd); /* de-byteslice, apply post-whitening and store blocks */ #define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \ y5, y6, y7, key, stack_tmp0, stack_tmp1) \ byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \ y3, y7, x3, x7, stack_tmp0, stack_tmp1); \ \ vmovdqu x0, stack_tmp0; \ \ vmovq key, x0; \ - vpshufb .Lpack_bswap RIP, x0, x0; \ + vpshufb .Lpack_bswap rRIP, x0, x0; \ \ vpxor x0, y7, y7; \ vpxor x0, y6, y6; \ vpxor x0, y5, y5; \ vpxor x0, y4, y4; \ vpxor x0, y3, y3; \ vpxor x0, y2, y2; \ vpxor x0, y1, y1; \ vpxor x0, y0, y0; \ vpxor x0, x7, x7; \ vpxor x0, x6, x6; \ vpxor x0, x5, x5; \ vpxor x0, x4, x4; \ vpxor x0, x3, x3; \ vpxor x0, x2, x2; \ vpxor x0, x1, x1; \ vpxor stack_tmp0, x0, x0; #define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, rio) \ vmovdqu x0, 0 * 16(rio); \ vmovdqu x1, 1 * 16(rio); \ vmovdqu x2, 2 * 16(rio); \ vmovdqu x3, 3 * 16(rio); \ vmovdqu x4, 4 * 16(rio); \ vmovdqu x5, 5 * 16(rio); \ vmovdqu x6, 6 * 16(rio); \ vmovdqu x7, 7 * 16(rio); \ vmovdqu y0, 8 * 16(rio); \ vmovdqu y1, 9 * 16(rio); \ vmovdqu y2, 10 * 16(rio); \ vmovdqu y3, 11 * 16(rio); \ vmovdqu y4, 12 * 16(rio); \ vmovdqu y5, 13 * 16(rio); \ vmovdqu y6, 14 * 16(rio); \ vmovdqu y7, 15 * 16(rio); .text .align 16 #define SHUFB_BYTES(idx) \ 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) .Lshufb_16x16b: .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3); .Lpack_bswap: .long 0x00010203 .long 0x04050607 .long 0x80808080 .long 0x80808080 /* For CTR-mode IV byteswap */ .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 /* * pre-SubByte transform * * pre-lookup for sbox1, sbox2, sbox3: * swap_bitendianness( * isom_map_camellia_to_aes( * camellia_f( * swap_bitendianess(in) * ) * ) * ) * * (note: '⊕ 0xc5' inside camellia_f()) */ .Lpre_tf_lo_s1: .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86 .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88 .Lpre_tf_hi_s1: .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23 /* * pre-SubByte transform * * pre-lookup for sbox4: * swap_bitendianness( * isom_map_camellia_to_aes( * camellia_f( * swap_bitendianess(in <<< 1) * ) * ) * ) * * (note: '⊕ 0xc5' inside camellia_f()) */ .Lpre_tf_lo_s4: .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25 .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74 .Lpre_tf_hi_s4: .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72 .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf /* * post-SubByte transform * * post-lookup for sbox1, sbox4: * swap_bitendianness( * camellia_h( * isom_map_aes_to_camellia( * swap_bitendianness( * aes_inverse_affine_transform(in) * ) * ) * ) * ) * * (note: '⊕ 0x6e' inside camellia_h()) */ .Lpost_tf_lo_s1: .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31 .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1 .Lpost_tf_hi_s1: .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8 .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c /* * post-SubByte transform * * post-lookup for sbox2: * swap_bitendianness( * camellia_h( * isom_map_aes_to_camellia( * swap_bitendianness( * aes_inverse_affine_transform(in) * ) * ) * ) * ) <<< 1 * * (note: '⊕ 0x6e' inside camellia_h()) */ .Lpost_tf_lo_s2: .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62 .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3 .Lpost_tf_hi_s2: .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51 .byte 0x49, 0xba, 0x44, 0xb7, 
0xe6, 0x15, 0xeb, 0x18 /* * post-SubByte transform * * post-lookup for sbox3: * swap_bitendianness( * camellia_h( * isom_map_aes_to_camellia( * swap_bitendianness( * aes_inverse_affine_transform(in) * ) * ) * ) * ) >>> 1 * * (note: '⊕ 0x6e' inside camellia_h()) */ .Lpost_tf_lo_s3: .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98 .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8 .Lpost_tf_hi_s3: .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54 .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06 /* For isolating SubBytes from AESENCLAST, inverse shift row */ .Linv_shift_row: .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 /* shuffle mask for 8x8 byte transpose */ .Ltranspose_8x8_shuf: .byte 0, 1, 4, 5, 2, 3, 6, 7, 8+0, 8+1, 8+4, 8+5, 8+2, 8+3, 8+6, 8+7 .align 4 /* 4-bit mask */ .L0f0f0f0f: .long 0x0f0f0f0f .align 8 ELF(.type __camellia_enc_blk16,@function;) __camellia_enc_blk16: /* input: * %rdi: ctx, CTX * %rax: temporary storage, 256 bytes * %xmm0..%xmm15: 16 plaintext blocks * output: * %xmm0..%xmm15: 16 encrypted blocks, order swapped: * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 */ + CFI_STARTPROC(); leaq 8 * 16(%rax), %rcx; inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %rax, %rcx); enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %rax, %rcx, 0); fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, ((key_table + (8) * 8) + 0)(CTX), ((key_table + (8) * 8) + 4)(CTX), ((key_table + (8) * 8) + 8)(CTX), ((key_table + (8) * 8) + 12)(CTX)); enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %rax, %rcx, 8); fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, ((key_table + (16) * 8) + 0)(CTX), ((key_table + (16) * 8) + 4)(CTX), ((key_table + (16) * 8) + 8)(CTX), ((key_table + (16) * 8) + 12)(CTX)); enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %rax, %rcx, 16); movl $24, %r8d; cmpl $128, key_bitlength(CTX); jne .Lenc_max32; .Lenc_done: /* load CD for output */ vmovdqu 0 * 16(%rcx), %xmm8; vmovdqu 1 * 16(%rcx), %xmm9; vmovdqu 2 * 16(%rcx), %xmm10; vmovdqu 3 * 16(%rcx), %xmm11; vmovdqu 4 * 16(%rcx), %xmm12; vmovdqu 5 * 16(%rcx), %xmm13; vmovdqu 6 * 16(%rcx), %xmm14; vmovdqu 7 * 16(%rcx), %xmm15; outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax)); ret; .align 8 .Lenc_max32: movl $32, %r8d; fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, ((key_table + (24) * 8) + 0)(CTX), ((key_table + (24) * 8) + 4)(CTX), ((key_table + (24) * 8) + 8)(CTX), ((key_table + (24) * 8) + 12)(CTX)); enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %rax, %rcx, 24); jmp .Lenc_done; + CFI_ENDPROC(); ELF(.size __camellia_enc_blk16,.-__camellia_enc_blk16;) .align 8 ELF(.type __camellia_dec_blk16,@function;) __camellia_dec_blk16: /* input: * %rdi: ctx, CTX * %rax: 
temporary storage, 256 bytes * %r8d: 24 for 16 byte key, 32 for larger * %xmm0..%xmm15: 16 encrypted blocks * output: * %xmm0..%xmm15: 16 plaintext blocks, order swapped: * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 */ + CFI_STARTPROC(); leaq 8 * 16(%rax), %rcx; inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %rax, %rcx); cmpl $32, %r8d; je .Ldec_max32; .Ldec_max24: dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %rax, %rcx, 16); fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, ((key_table + (16) * 8) + 8)(CTX), ((key_table + (16) * 8) + 12)(CTX), ((key_table + (16) * 8) + 0)(CTX), ((key_table + (16) * 8) + 4)(CTX)); dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %rax, %rcx, 8); fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, ((key_table + (8) * 8) + 8)(CTX), ((key_table + (8) * 8) + 12)(CTX), ((key_table + (8) * 8) + 0)(CTX), ((key_table + (8) * 8) + 4)(CTX)); dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %rax, %rcx, 0); /* load CD for output */ vmovdqu 0 * 16(%rcx), %xmm8; vmovdqu 1 * 16(%rcx), %xmm9; vmovdqu 2 * 16(%rcx), %xmm10; vmovdqu 3 * 16(%rcx), %xmm11; vmovdqu 4 * 16(%rcx), %xmm12; vmovdqu 5 * 16(%rcx), %xmm13; vmovdqu 6 * 16(%rcx), %xmm14; vmovdqu 7 * 16(%rcx), %xmm15; outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax)); ret; .align 8 .Ldec_max32: dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %rax, %rcx, 24); fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, ((key_table + (24) * 8) + 8)(CTX), ((key_table + (24) * 8) + 12)(CTX), ((key_table + (24) * 8) + 0)(CTX), ((key_table + (24) * 8) + 4)(CTX)); jmp .Ldec_max24; + CFI_ENDPROC(); ELF(.size __camellia_dec_blk16,.-__camellia_dec_blk16;) #define inc_le128(x, minus_one, tmp) \ vpcmpeqq minus_one, x, tmp; \ vpsubq minus_one, x, x; \ vpslldq $8, tmp, tmp; \ vpsubq tmp, x, x; .align 8 .globl _gcry_camellia_aesni_avx_ctr_enc ELF(.type _gcry_camellia_aesni_avx_ctr_enc,@function;) _gcry_camellia_aesni_avx_ctr_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: iv (big endian, 128bit) */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; subq $(16 * 16), %rsp; andq $~31, %rsp; movq %rsp, %rax; - vmovdqa .Lbswap128_mask RIP, %xmm14; + vmovdqa .Lbswap128_mask rRIP, %xmm14; /* load IV and byteswap */ vmovdqu (%rcx), %xmm15; vmovdqu %xmm15, 15 * 16(%rax); vpshufb %xmm14, %xmm15, %xmm0; /* be => le */ vpcmpeqd %xmm15, %xmm15, %xmm15; vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */ /* construct IVs */ inc_le128(%xmm0, %xmm15, %xmm13); vpshufb %xmm14, %xmm0, %xmm13; vmovdqu %xmm13, 14 * 16(%rax); inc_le128(%xmm0, %xmm15, %xmm13); vpshufb %xmm14, %xmm0, %xmm13; vmovdqu %xmm13, 13 * 16(%rax); inc_le128(%xmm0, %xmm15, %xmm13); vpshufb %xmm14, %xmm0, %xmm12; 
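/* Note: each inc_le128() in this sequence adds 1 to the 128-bit counter
 * kept little endian in %xmm0 without a 128-bit add:
 *   carry = (x.lo == ~0) ? 1 : 0;  x.lo += 1;  x.hi += carry;
 * vpcmpeqq flags the all-ones low qword, vpsubq of the minus-one
 * constant performs the +1, and the byte-shifted compare mask
 * propagates the carry into the high qword.  Each incremented value is
 * shuffled back to big endian with .Lbswap128_mask (%xmm14) to form the
 * next CTR input block. */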
inc_le128(%xmm0, %xmm15, %xmm13); vpshufb %xmm14, %xmm0, %xmm11; inc_le128(%xmm0, %xmm15, %xmm13); vpshufb %xmm14, %xmm0, %xmm10; inc_le128(%xmm0, %xmm15, %xmm13); vpshufb %xmm14, %xmm0, %xmm9; inc_le128(%xmm0, %xmm15, %xmm13); vpshufb %xmm14, %xmm0, %xmm8; inc_le128(%xmm0, %xmm15, %xmm13); vpshufb %xmm14, %xmm0, %xmm7; inc_le128(%xmm0, %xmm15, %xmm13); vpshufb %xmm14, %xmm0, %xmm6; inc_le128(%xmm0, %xmm15, %xmm13); vpshufb %xmm14, %xmm0, %xmm5; inc_le128(%xmm0, %xmm15, %xmm13); vpshufb %xmm14, %xmm0, %xmm4; inc_le128(%xmm0, %xmm15, %xmm13); vpshufb %xmm14, %xmm0, %xmm3; inc_le128(%xmm0, %xmm15, %xmm13); vpshufb %xmm14, %xmm0, %xmm2; inc_le128(%xmm0, %xmm15, %xmm13); vpshufb %xmm14, %xmm0, %xmm1; inc_le128(%xmm0, %xmm15, %xmm13); vmovdqa %xmm0, %xmm13; vpshufb %xmm14, %xmm0, %xmm0; inc_le128(%xmm13, %xmm15, %xmm14); - vpshufb .Lbswap128_mask RIP, %xmm13, %xmm13; /* le => be */ + vpshufb .Lbswap128_mask rRIP, %xmm13, %xmm13; /* le => be */ vmovdqu %xmm13, (%rcx); /* inpack16_pre: */ vmovq (key_table)(CTX), %xmm15; - vpshufb .Lpack_bswap RIP, %xmm15, %xmm15; + vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15; vpxor %xmm0, %xmm15, %xmm0; vpxor %xmm1, %xmm15, %xmm1; vpxor %xmm2, %xmm15, %xmm2; vpxor %xmm3, %xmm15, %xmm3; vpxor %xmm4, %xmm15, %xmm4; vpxor %xmm5, %xmm15, %xmm5; vpxor %xmm6, %xmm15, %xmm6; vpxor %xmm7, %xmm15, %xmm7; vpxor %xmm8, %xmm15, %xmm8; vpxor %xmm9, %xmm15, %xmm9; vpxor %xmm10, %xmm15, %xmm10; vpxor %xmm11, %xmm15, %xmm11; vpxor %xmm12, %xmm15, %xmm12; vpxor 13 * 16(%rax), %xmm15, %xmm13; vpxor 14 * 16(%rax), %xmm15, %xmm14; vpxor 15 * 16(%rax), %xmm15, %xmm15; call __camellia_enc_blk16; vpxor 0 * 16(%rdx), %xmm7, %xmm7; vpxor 1 * 16(%rdx), %xmm6, %xmm6; vpxor 2 * 16(%rdx), %xmm5, %xmm5; vpxor 3 * 16(%rdx), %xmm4, %xmm4; vpxor 4 * 16(%rdx), %xmm3, %xmm3; vpxor 5 * 16(%rdx), %xmm2, %xmm2; vpxor 6 * 16(%rdx), %xmm1, %xmm1; vpxor 7 * 16(%rdx), %xmm0, %xmm0; vpxor 8 * 16(%rdx), %xmm15, %xmm15; vpxor 9 * 16(%rdx), %xmm14, %xmm14; vpxor 10 * 16(%rdx), %xmm13, %xmm13; vpxor 11 * 16(%rdx), %xmm12, %xmm12; vpxor 12 * 16(%rdx), %xmm11, %xmm11; vpxor 13 * 16(%rdx), %xmm10, %xmm10; vpxor 14 * 16(%rdx), %xmm9, %xmm9; vpxor 15 * 16(%rdx), %xmm8, %xmm8; write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, %xmm8, %rsi); vzeroall; leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_ctr_enc,.-_gcry_camellia_aesni_avx_ctr_enc;) .align 8 .globl _gcry_camellia_aesni_avx_cbc_dec ELF(.type _gcry_camellia_aesni_avx_cbc_dec,@function;) _gcry_camellia_aesni_avx_cbc_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: iv */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; movq %rcx, %r9; cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %eax; cmovel %eax, %r8d; /* max */ inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %rdx, (key_table)(CTX, %r8, 8)); subq $(16 * 16), %rsp; andq $~31, %rsp; movq %rsp, %rax; call __camellia_dec_blk16; /* XOR output with IV */ vpxor (%r9), %xmm7, %xmm7; vpxor (0 * 16)(%rdx), %xmm6, %xmm6; vpxor (1 * 16)(%rdx), %xmm5, %xmm5; vpxor (2 * 16)(%rdx), %xmm4, %xmm4; vpxor (3 * 16)(%rdx), %xmm3, %xmm3; vpxor (4 * 16)(%rdx), %xmm2, %xmm2; vpxor (5 * 16)(%rdx), %xmm1, %xmm1; vpxor (6 * 16)(%rdx), %xmm0, %xmm0; vpxor (7 * 16)(%rdx), %xmm15, %xmm15; vpxor (8 * 16)(%rdx), %xmm14, %xmm14; vpxor (9 * 16)(%rdx), 
%xmm13, %xmm13; vpxor (10 * 16)(%rdx), %xmm12, %xmm12; vpxor (11 * 16)(%rdx), %xmm11, %xmm11; vpxor (12 * 16)(%rdx), %xmm10, %xmm10; vpxor (13 * 16)(%rdx), %xmm9, %xmm9; vpxor (14 * 16)(%rdx), %xmm8, %xmm8; movq (15 * 16 + 0)(%rdx), %r10; movq (15 * 16 + 8)(%rdx), %r11; write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, %xmm8, %rsi); /* store new IV */ movq %r10, (0)(%r9); movq %r11, (8)(%r9); vzeroall; leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec;) .align 8 .globl _gcry_camellia_aesni_avx_cfb_dec ELF(.type _gcry_camellia_aesni_avx_cfb_dec,@function;) _gcry_camellia_aesni_avx_cfb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: iv */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; subq $(16 * 16), %rsp; andq $~31, %rsp; movq %rsp, %rax; /* inpack16_pre: */ vmovq (key_table)(CTX), %xmm0; - vpshufb .Lpack_bswap RIP, %xmm0, %xmm0; + vpshufb .Lpack_bswap rRIP, %xmm0, %xmm0; vpxor (%rcx), %xmm0, %xmm15; vmovdqu 15 * 16(%rdx), %xmm1; vmovdqu %xmm1, (%rcx); /* store new IV */ vpxor 0 * 16(%rdx), %xmm0, %xmm14; vpxor 1 * 16(%rdx), %xmm0, %xmm13; vpxor 2 * 16(%rdx), %xmm0, %xmm12; vpxor 3 * 16(%rdx), %xmm0, %xmm11; vpxor 4 * 16(%rdx), %xmm0, %xmm10; vpxor 5 * 16(%rdx), %xmm0, %xmm9; vpxor 6 * 16(%rdx), %xmm0, %xmm8; vpxor 7 * 16(%rdx), %xmm0, %xmm7; vpxor 8 * 16(%rdx), %xmm0, %xmm6; vpxor 9 * 16(%rdx), %xmm0, %xmm5; vpxor 10 * 16(%rdx), %xmm0, %xmm4; vpxor 11 * 16(%rdx), %xmm0, %xmm3; vpxor 12 * 16(%rdx), %xmm0, %xmm2; vpxor 13 * 16(%rdx), %xmm0, %xmm1; vpxor 14 * 16(%rdx), %xmm0, %xmm0; call __camellia_enc_blk16; vpxor 0 * 16(%rdx), %xmm7, %xmm7; vpxor 1 * 16(%rdx), %xmm6, %xmm6; vpxor 2 * 16(%rdx), %xmm5, %xmm5; vpxor 3 * 16(%rdx), %xmm4, %xmm4; vpxor 4 * 16(%rdx), %xmm3, %xmm3; vpxor 5 * 16(%rdx), %xmm2, %xmm2; vpxor 6 * 16(%rdx), %xmm1, %xmm1; vpxor 7 * 16(%rdx), %xmm0, %xmm0; vpxor 8 * 16(%rdx), %xmm15, %xmm15; vpxor 9 * 16(%rdx), %xmm14, %xmm14; vpxor 10 * 16(%rdx), %xmm13, %xmm13; vpxor 11 * 16(%rdx), %xmm12, %xmm12; vpxor 12 * 16(%rdx), %xmm11, %xmm11; vpxor 13 * 16(%rdx), %xmm10, %xmm10; vpxor 14 * 16(%rdx), %xmm9, %xmm9; vpxor 15 * 16(%rdx), %xmm8, %xmm8; write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, %xmm8, %rsi); vzeroall; leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec;) .align 8 .globl _gcry_camellia_aesni_avx_ocb_enc ELF(.type _gcry_camellia_aesni_avx_ocb_enc,@function;) _gcry_camellia_aesni_avx_ocb_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[16]) */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; subq $(16 * 16 + 4 * 8), %rsp; andq $~31, %rsp; movq %rsp, %rax; - movq %r10, (16 * 16 + 0 * 8)(%rax); - movq %r11, (16 * 16 + 1 * 8)(%rax); - movq %r12, (16 * 16 + 2 * 8)(%rax); - movq %r13, (16 * 16 + 3 * 8)(%rax); + movq %r10, (16 * 16 + 0 * 8)(%rsp); + movq %r11, (16 * 16 + 1 * 8)(%rsp); + movq %r12, (16 * 16 + 2 * 8)(%rsp); + movq %r13, (16 * 16 + 3 * 8)(%rsp); + CFI_REG_ON_STACK(r10, 16 * 16 + 0 * 8); + CFI_REG_ON_STACK(r11, 16 * 16 + 1 * 8); + CFI_REG_ON_STACK(r12, 16 * 16 + 2 * 8); + CFI_REG_ON_STACK(r13, 16 * 16 + 3 * 8); vmovdqu 
(%rcx), %xmm14; vmovdqu (%r8), %xmm15; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Checksum_i = Checksum_{i-1} xor P_i */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ #define OCB_INPUT(n, lreg, xreg) \ vmovdqu (n * 16)(%rdx), xreg; \ vpxor (lreg), %xmm14, %xmm14; \ vpxor xreg, %xmm15, %xmm15; \ vpxor xreg, %xmm14, xreg; \ vmovdqu %xmm14, (n * 16)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq (2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, %xmm0); vmovdqu %xmm0, (15 * 16)(%rax); OCB_INPUT(1, %r11, %xmm0); vmovdqu %xmm0, (14 * 16)(%rax); OCB_INPUT(2, %r12, %xmm13); OCB_INPUT(3, %r13, %xmm12); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(4, %r10, %xmm11); OCB_INPUT(5, %r11, %xmm10); OCB_INPUT(6, %r12, %xmm9); OCB_INPUT(7, %r13, %xmm8); movq (8 * 8)(%r9), %r10; movq (9 * 8)(%r9), %r11; movq (10 * 8)(%r9), %r12; movq (11 * 8)(%r9), %r13; OCB_INPUT(8, %r10, %xmm7); OCB_INPUT(9, %r11, %xmm6); OCB_INPUT(10, %r12, %xmm5); OCB_INPUT(11, %r13, %xmm4); movq (12 * 8)(%r9), %r10; movq (13 * 8)(%r9), %r11; movq (14 * 8)(%r9), %r12; movq (15 * 8)(%r9), %r13; OCB_INPUT(12, %r10, %xmm3); OCB_INPUT(13, %r11, %xmm2); OCB_INPUT(14, %r12, %xmm1); OCB_INPUT(15, %r13, %xmm0); #undef OCB_INPUT vmovdqu %xmm14, (%rcx); vmovdqu %xmm15, (%r8); /* inpack16_pre: */ vmovq (key_table)(CTX), %xmm15; - vpshufb .Lpack_bswap RIP, %xmm15, %xmm15; + vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15; vpxor %xmm0, %xmm15, %xmm0; vpxor %xmm1, %xmm15, %xmm1; vpxor %xmm2, %xmm15, %xmm2; vpxor %xmm3, %xmm15, %xmm3; vpxor %xmm4, %xmm15, %xmm4; vpxor %xmm5, %xmm15, %xmm5; vpxor %xmm6, %xmm15, %xmm6; vpxor %xmm7, %xmm15, %xmm7; vpxor %xmm8, %xmm15, %xmm8; vpxor %xmm9, %xmm15, %xmm9; vpxor %xmm10, %xmm15, %xmm10; vpxor %xmm11, %xmm15, %xmm11; vpxor %xmm12, %xmm15, %xmm12; vpxor %xmm13, %xmm15, %xmm13; vpxor 14 * 16(%rax), %xmm15, %xmm14; vpxor 15 * 16(%rax), %xmm15, %xmm15; call __camellia_enc_blk16; vpxor 0 * 16(%rsi), %xmm7, %xmm7; vpxor 1 * 16(%rsi), %xmm6, %xmm6; vpxor 2 * 16(%rsi), %xmm5, %xmm5; vpxor 3 * 16(%rsi), %xmm4, %xmm4; vpxor 4 * 16(%rsi), %xmm3, %xmm3; vpxor 5 * 16(%rsi), %xmm2, %xmm2; vpxor 6 * 16(%rsi), %xmm1, %xmm1; vpxor 7 * 16(%rsi), %xmm0, %xmm0; vpxor 8 * 16(%rsi), %xmm15, %xmm15; vpxor 9 * 16(%rsi), %xmm14, %xmm14; vpxor 10 * 16(%rsi), %xmm13, %xmm13; vpxor 11 * 16(%rsi), %xmm12, %xmm12; vpxor 12 * 16(%rsi), %xmm11, %xmm11; vpxor 13 * 16(%rsi), %xmm10, %xmm10; vpxor 14 * 16(%rsi), %xmm9, %xmm9; vpxor 15 * 16(%rsi), %xmm8, %xmm8; write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, %xmm8, %rsi); vzeroall; - movq (16 * 16 + 0 * 8)(%rax), %r10; - movq (16 * 16 + 1 * 8)(%rax), %r11; - movq (16 * 16 + 2 * 8)(%rax), %r12; - movq (16 * 16 + 3 * 8)(%rax), %r13; + movq (16 * 16 + 0 * 8)(%rsp), %r10; + movq (16 * 16 + 1 * 8)(%rsp), %r11; + movq (16 * 16 + 2 * 8)(%rsp), %r12; + movq (16 * 16 + 3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_ocb_enc,.-_gcry_camellia_aesni_avx_ocb_enc;) .align 8 .globl _gcry_camellia_aesni_avx_ocb_dec ELF(.type _gcry_camellia_aesni_avx_ocb_dec,@function;) _gcry_camellia_aesni_avx_ocb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[16]) */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, 
%rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; subq $(16 * 16 + 4 * 8), %rsp; andq $~31, %rsp; movq %rsp, %rax; - movq %r10, (16 * 16 + 0 * 8)(%rax); - movq %r11, (16 * 16 + 1 * 8)(%rax); - movq %r12, (16 * 16 + 2 * 8)(%rax); - movq %r13, (16 * 16 + 3 * 8)(%rax); + movq %r10, (16 * 16 + 0 * 8)(%rsp); + movq %r11, (16 * 16 + 1 * 8)(%rsp); + movq %r12, (16 * 16 + 2 * 8)(%rsp); + movq %r13, (16 * 16 + 3 * 8)(%rsp); + CFI_REG_ON_STACK(r10, 16 * 16 + 0 * 8); + CFI_REG_ON_STACK(r11, 16 * 16 + 1 * 8); + CFI_REG_ON_STACK(r12, 16 * 16 + 2 * 8); + CFI_REG_ON_STACK(r13, 16 * 16 + 3 * 8); vmovdqu (%rcx), %xmm15; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ #define OCB_INPUT(n, lreg, xreg) \ vmovdqu (n * 16)(%rdx), xreg; \ vpxor (lreg), %xmm15, %xmm15; \ vpxor xreg, %xmm15, xreg; \ vmovdqu %xmm15, (n * 16)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq (2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, %xmm0); vmovdqu %xmm0, (15 * 16)(%rax); OCB_INPUT(1, %r11, %xmm14); OCB_INPUT(2, %r12, %xmm13); OCB_INPUT(3, %r13, %xmm12); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(4, %r10, %xmm11); OCB_INPUT(5, %r11, %xmm10); OCB_INPUT(6, %r12, %xmm9); OCB_INPUT(7, %r13, %xmm8); movq (8 * 8)(%r9), %r10; movq (9 * 8)(%r9), %r11; movq (10 * 8)(%r9), %r12; movq (11 * 8)(%r9), %r13; OCB_INPUT(8, %r10, %xmm7); OCB_INPUT(9, %r11, %xmm6); OCB_INPUT(10, %r12, %xmm5); OCB_INPUT(11, %r13, %xmm4); movq (12 * 8)(%r9), %r10; movq (13 * 8)(%r9), %r11; movq (14 * 8)(%r9), %r12; movq (15 * 8)(%r9), %r13; OCB_INPUT(12, %r10, %xmm3); OCB_INPUT(13, %r11, %xmm2); OCB_INPUT(14, %r12, %xmm1); OCB_INPUT(15, %r13, %xmm0); #undef OCB_INPUT vmovdqu %xmm15, (%rcx); movq %r8, %r10; cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %r9d; cmovel %r9d, %r8d; /* max */ /* inpack16_pre: */ vmovq (key_table)(CTX, %r8, 8), %xmm15; - vpshufb .Lpack_bswap RIP, %xmm15, %xmm15; + vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15; vpxor %xmm0, %xmm15, %xmm0; vpxor %xmm1, %xmm15, %xmm1; vpxor %xmm2, %xmm15, %xmm2; vpxor %xmm3, %xmm15, %xmm3; vpxor %xmm4, %xmm15, %xmm4; vpxor %xmm5, %xmm15, %xmm5; vpxor %xmm6, %xmm15, %xmm6; vpxor %xmm7, %xmm15, %xmm7; vpxor %xmm8, %xmm15, %xmm8; vpxor %xmm9, %xmm15, %xmm9; vpxor %xmm10, %xmm15, %xmm10; vpxor %xmm11, %xmm15, %xmm11; vpxor %xmm12, %xmm15, %xmm12; vpxor %xmm13, %xmm15, %xmm13; vpxor %xmm14, %xmm15, %xmm14; vpxor 15 * 16(%rax), %xmm15, %xmm15; call __camellia_dec_blk16; vpxor 0 * 16(%rsi), %xmm7, %xmm7; vpxor 1 * 16(%rsi), %xmm6, %xmm6; vpxor 2 * 16(%rsi), %xmm5, %xmm5; vpxor 3 * 16(%rsi), %xmm4, %xmm4; vpxor 4 * 16(%rsi), %xmm3, %xmm3; vpxor 5 * 16(%rsi), %xmm2, %xmm2; vpxor 6 * 16(%rsi), %xmm1, %xmm1; vpxor 7 * 16(%rsi), %xmm0, %xmm0; vmovdqu %xmm7, (7 * 16)(%rax); vpxor 8 * 16(%rsi), %xmm15, %xmm15; vpxor 9 * 16(%rsi), %xmm14, %xmm14; vpxor 10 * 16(%rsi), %xmm13, %xmm13; vpxor 11 * 16(%rsi), %xmm12, %xmm12; vpxor 12 * 16(%rsi), %xmm11, %xmm11; vpxor 13 * 16(%rsi), %xmm10, %xmm10; vpxor 14 * 16(%rsi), %xmm9, %xmm9; vpxor 15 * 16(%rsi), %xmm8, %xmm8; /* Checksum_i = Checksum_{i-1} xor P_i */ vpxor (%r10), %xmm7, %xmm7; vpxor %xmm6, %xmm7, %xmm7; vpxor %xmm5, %xmm7, %xmm7; vpxor %xmm4, %xmm7, %xmm7; vpxor %xmm3, %xmm7, %xmm7; vpxor %xmm2, %xmm7, %xmm7; vpxor %xmm1, %xmm7, %xmm7; vpxor %xmm0, %xmm7, %xmm7; vpxor %xmm15, %xmm7, %xmm7; vpxor %xmm14, %xmm7, %xmm7; vpxor %xmm13, %xmm7, %xmm7; vpxor %xmm12, %xmm7, %xmm7; vpxor %xmm11, %xmm7, %xmm7; vpxor %xmm10, %xmm7, 
%xmm7; vpxor %xmm9, %xmm7, %xmm7; vpxor %xmm8, %xmm7, %xmm7; vmovdqu %xmm7, (%r10); vmovdqu (7 * 16)(%rax), %xmm7; write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, %xmm8, %rsi); vzeroall; - movq (16 * 16 + 0 * 8)(%rax), %r10; - movq (16 * 16 + 1 * 8)(%rax), %r11; - movq (16 * 16 + 2 * 8)(%rax), %r12; - movq (16 * 16 + 3 * 8)(%rax), %r13; + movq (16 * 16 + 0 * 8)(%rsp), %r10; + movq (16 * 16 + 1 * 8)(%rsp), %r11; + movq (16 * 16 + 2 * 8)(%rsp), %r12; + movq (16 * 16 + 3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_ocb_dec,.-_gcry_camellia_aesni_avx_ocb_dec;) .align 8 .globl _gcry_camellia_aesni_avx_ocb_auth ELF(.type _gcry_camellia_aesni_avx_ocb_auth,@function;) _gcry_camellia_aesni_avx_ocb_auth: /* input: * %rdi: ctx, CTX * %rsi: abuf (16 blocks) * %rdx: offset * %rcx: checksum * %r8 : L pointers (void *L[16]) */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; subq $(16 * 16 + 4 * 8), %rsp; andq $~31, %rsp; movq %rsp, %rax; - movq %r10, (16 * 16 + 0 * 8)(%rax); - movq %r11, (16 * 16 + 1 * 8)(%rax); - movq %r12, (16 * 16 + 2 * 8)(%rax); - movq %r13, (16 * 16 + 3 * 8)(%rax); + movq %r10, (16 * 16 + 0 * 8)(%rsp); + movq %r11, (16 * 16 + 1 * 8)(%rsp); + movq %r12, (16 * 16 + 2 * 8)(%rsp); + movq %r13, (16 * 16 + 3 * 8)(%rsp); + CFI_REG_ON_STACK(r10, 16 * 16 + 0 * 8); + CFI_REG_ON_STACK(r11, 16 * 16 + 1 * 8); + CFI_REG_ON_STACK(r12, 16 * 16 + 2 * 8); + CFI_REG_ON_STACK(r13, 16 * 16 + 3 * 8); vmovdqu (%rdx), %xmm15; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ #define OCB_INPUT(n, lreg, xreg) \ vmovdqu (n * 16)(%rsi), xreg; \ vpxor (lreg), %xmm15, %xmm15; \ vpxor xreg, %xmm15, xreg; movq (0 * 8)(%r8), %r10; movq (1 * 8)(%r8), %r11; movq (2 * 8)(%r8), %r12; movq (3 * 8)(%r8), %r13; OCB_INPUT(0, %r10, %xmm0); vmovdqu %xmm0, (15 * 16)(%rax); OCB_INPUT(1, %r11, %xmm14); OCB_INPUT(2, %r12, %xmm13); OCB_INPUT(3, %r13, %xmm12); movq (4 * 8)(%r8), %r10; movq (5 * 8)(%r8), %r11; movq (6 * 8)(%r8), %r12; movq (7 * 8)(%r8), %r13; OCB_INPUT(4, %r10, %xmm11); OCB_INPUT(5, %r11, %xmm10); OCB_INPUT(6, %r12, %xmm9); OCB_INPUT(7, %r13, %xmm8); movq (8 * 8)(%r8), %r10; movq (9 * 8)(%r8), %r11; movq (10 * 8)(%r8), %r12; movq (11 * 8)(%r8), %r13; OCB_INPUT(8, %r10, %xmm7); OCB_INPUT(9, %r11, %xmm6); OCB_INPUT(10, %r12, %xmm5); OCB_INPUT(11, %r13, %xmm4); movq (12 * 8)(%r8), %r10; movq (13 * 8)(%r8), %r11; movq (14 * 8)(%r8), %r12; movq (15 * 8)(%r8), %r13; OCB_INPUT(12, %r10, %xmm3); OCB_INPUT(13, %r11, %xmm2); OCB_INPUT(14, %r12, %xmm1); OCB_INPUT(15, %r13, %xmm0); #undef OCB_INPUT vmovdqu %xmm15, (%rdx); movq %rcx, %r10; /* inpack16_pre: */ vmovq (key_table)(CTX), %xmm15; - vpshufb .Lpack_bswap RIP, %xmm15, %xmm15; + vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15; vpxor %xmm0, %xmm15, %xmm0; vpxor %xmm1, %xmm15, %xmm1; vpxor %xmm2, %xmm15, %xmm2; vpxor %xmm3, %xmm15, %xmm3; vpxor %xmm4, %xmm15, %xmm4; vpxor %xmm5, %xmm15, %xmm5; vpxor %xmm6, %xmm15, %xmm6; vpxor %xmm7, %xmm15, %xmm7; vpxor %xmm8, %xmm15, %xmm8; vpxor %xmm9, %xmm15, %xmm9; vpxor %xmm10, %xmm15, %xmm10; vpxor %xmm11, %xmm15, %xmm11; vpxor %xmm12, %xmm15, %xmm12; vpxor %xmm13, %xmm15, %xmm13; vpxor %xmm14, %xmm15, %xmm14; vpxor 15 * 16(%rax), %xmm15, %xmm15; call __camellia_enc_blk16; vpxor %xmm7, %xmm6, %xmm6; vpxor %xmm5, %xmm4, 
%xmm4; vpxor %xmm3, %xmm2, %xmm2; vpxor %xmm1, %xmm0, %xmm0; vpxor %xmm15, %xmm14, %xmm14; vpxor %xmm13, %xmm12, %xmm12; vpxor %xmm11, %xmm10, %xmm10; vpxor %xmm9, %xmm8, %xmm8; vpxor %xmm6, %xmm4, %xmm4; vpxor %xmm2, %xmm0, %xmm0; vpxor %xmm14, %xmm12, %xmm12; vpxor %xmm10, %xmm8, %xmm8; vpxor %xmm4, %xmm0, %xmm0; vpxor %xmm12, %xmm8, %xmm8; vpxor %xmm0, %xmm8, %xmm0; vpxor (%r10), %xmm0, %xmm0; vmovdqu %xmm0, (%r10); vzeroall; - movq (16 * 16 + 0 * 8)(%rax), %r10; - movq (16 * 16 + 1 * 8)(%rax), %r11; - movq (16 * 16 + 2 * 8)(%rax), %r12; - movq (16 * 16 + 3 * 8)(%rax), %r13; + movq (16 * 16 + 0 * 8)(%rsp), %r10; + movq (16 * 16 + 1 * 8)(%rsp), %r11; + movq (16 * 16 + 2 * 8)(%rsp), %r12; + movq (16 * 16 + 3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_ocb_auth,.-_gcry_camellia_aesni_avx_ocb_auth;) /* * IN: * ab: 64-bit AB state * cd: 64-bit CD state */ #define camellia_f(ab, x, t0, t1, t2, t3, t4, inv_shift_row, sbox4mask, \ _0f0f0f0fmask, pre_s1lo_mask, pre_s1hi_mask, key) \ vmovq key, t0; \ vpxor x, x, t3; \ \ vpxor ab, t0, x; \ \ /* \ * S-function with AES subbytes \ */ \ \ /* input rotation for sbox4 (<<< 1) */ \ vpand x, sbox4mask, t0; \ vpandn x, sbox4mask, x; \ vpaddw t0, t0, t1; \ vpsrlw $7, t0, t0; \ vpor t0, t1, t0; \ vpand sbox4mask, t0, t0; \ vpor t0, x, x; \ \ - vmovdqa .Lpost_tf_lo_s1 RIP, t0; \ - vmovdqa .Lpost_tf_hi_s1 RIP, t1; \ + vmovdqa .Lpost_tf_lo_s1 rRIP, t0; \ + vmovdqa .Lpost_tf_hi_s1 rRIP, t1; \ \ /* prefilter sboxes */ \ filter_8bit(x, pre_s1lo_mask, pre_s1hi_mask, _0f0f0f0fmask, t2); \ \ /* AES subbytes + AES shift rows + AES inv shift rows */ \ vaesenclast t3, x, x; \ \ /* postfilter sboxes */ \ filter_8bit(x, t0, t1, _0f0f0f0fmask, t2); \ \ /* output rotation for sbox2 (<<< 1) */ \ /* output rotation for sbox3 (>>> 1) */ \ vpshufb inv_shift_row, x, t1; \ - vpshufb .Lsp0044440444044404mask RIP, x, t4; \ - vpshufb .Lsp1110111010011110mask RIP, x, x; \ + vpshufb .Lsp0044440444044404mask rRIP, x, t4; \ + vpshufb .Lsp1110111010011110mask rRIP, x, x; \ vpaddb t1, t1, t2; \ vpsrlw $7, t1, t0; \ vpsllw $7, t1, t3; \ vpor t0, t2, t0; \ vpsrlw $1, t1, t1; \ - vpshufb .Lsp0222022222000222mask RIP, t0, t0; \ + vpshufb .Lsp0222022222000222mask rRIP, t0, t0; \ vpor t1, t3, t1; \ \ vpxor x, t4, t4; \ - vpshufb .Lsp3033303303303033mask RIP, t1, t1; \ + vpshufb .Lsp3033303303303033mask rRIP, t1, t1; \ vpxor t4, t0, t0; \ vpxor t1, t0, t0; \ vpsrldq $8, t0, x; \ vpxor t0, x, x; #define vec_rol128(in, out, nrol, t0) \ vpshufd $0x4e, in, out; \ vpsllq $(nrol), in, t0; \ vpsrlq $(64-(nrol)), out, out; \ vpaddd t0, out, out; #define vec_ror128(in, out, nror, t0) \ vpshufd $0x4e, in, out; \ vpsrlq $(nror), in, t0; \ vpsllq $(64-(nror)), out, out; \ vpaddd t0, out, out; .align 16 .Linv_shift_row_and_unpcklbw: .byte 0x00, 0xff, 0x0d, 0xff, 0x0a, 0xff, 0x07, 0xff .byte 0x04, 0xff, 0x01, 0xff, 0x0e, 0xff, 0x0b, 0xff .Lsp0044440444044404mask: .long 0xffff0404, 0x0404ff04; .long 0x0d0dff0d, 0x0d0dff0d; .Lsp1110111010011110mask: .long 0x000000ff, 0x000000ff; .long 0x0bffff0b, 0x0b0b0bff; .Lsp0222022222000222mask: .long 0xff060606, 0xff060606; .long 0x0c0cffff, 0xff0c0c0c; .Lsp3033303303303033mask: .long 0x04ff0404, 0x04ff0404; .long 0xff0a0aff, 0x0aff0a0a; .Lsbox4_input_mask: .byte 0x00, 0xff, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00; .Lsigma1: .long 0x3BCC908B, 0xA09E667F; .Lsigma2: .long 0x4CAA73B2, 0xB67AE858; .Lsigma3: .long 0xE94F82BE, 0xC6EF372F; 
.Lsigma4: .long 0xF1D36F1C, 0x54FF53A5; .Lsigma5: .long 0xDE682D1D, 0x10E527FA; .Lsigma6: .long 0xB3E6C1FD, 0xB05688C2; .align 8 ELF(.type __camellia_avx_setup128,@function;) __camellia_avx_setup128: /* input: * %rdi: ctx, CTX; subkey storage at key_table(CTX) * %xmm0: key */ + CFI_STARTPROC(); + #define cmll_sub(n, ctx) (key_table+((n)*8))(ctx) #define KL128 %xmm0 #define KA128 %xmm2 - vpshufb .Lbswap128_mask RIP, KL128, KL128; + vpshufb .Lbswap128_mask rRIP, KL128, KL128; - vmovdqa .Linv_shift_row_and_unpcklbw RIP, %xmm11; - vmovq .Lsbox4_input_mask RIP, %xmm12; - vbroadcastss .L0f0f0f0f RIP, %xmm13; - vmovdqa .Lpre_tf_lo_s1 RIP, %xmm14; - vmovdqa .Lpre_tf_hi_s1 RIP, %xmm15; + vmovdqa .Linv_shift_row_and_unpcklbw rRIP, %xmm11; + vmovq .Lsbox4_input_mask rRIP, %xmm12; + vbroadcastss .L0f0f0f0f rRIP, %xmm13; + vmovdqa .Lpre_tf_lo_s1 rRIP, %xmm14; + vmovdqa .Lpre_tf_hi_s1 rRIP, %xmm15; /* * Generate KA */ vpsrldq $8, KL128, %xmm2; vmovdqa KL128, %xmm3; vpslldq $8, %xmm3, %xmm3; vpsrldq $8, %xmm3, %xmm3; camellia_f(%xmm2, %xmm4, %xmm1, %xmm5, %xmm6, %xmm7, %xmm8, - %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma1 RIP); + %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma1 rRIP); vpxor %xmm4, %xmm3, %xmm3; camellia_f(%xmm3, %xmm2, %xmm1, %xmm5, %xmm6, %xmm7, %xmm8, - %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma2 RIP); + %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma2 rRIP); camellia_f(%xmm2, %xmm3, %xmm1, %xmm5, %xmm6, %xmm7, %xmm8, - %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma3 RIP); + %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma3 rRIP); vpxor %xmm4, %xmm3, %xmm3; camellia_f(%xmm3, %xmm4, %xmm1, %xmm5, %xmm6, %xmm7, %xmm8, - %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma4 RIP); + %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma4 rRIP); vpslldq $8, %xmm3, %xmm3; vpxor %xmm4, %xmm2, %xmm2; vpsrldq $8, %xmm3, %xmm3; vpslldq $8, %xmm2, KA128; vpor %xmm3, KA128, KA128; /* * Generate subkeys */ vmovdqu KA128, cmll_sub(24, CTX); vec_rol128(KL128, %xmm3, 15, %xmm15); vec_rol128(KA128, %xmm4, 15, %xmm15); vec_rol128(KA128, %xmm5, 30, %xmm15); vec_rol128(KL128, %xmm6, 45, %xmm15); vec_rol128(KA128, %xmm7, 45, %xmm15); vec_rol128(KL128, %xmm8, 60, %xmm15); vec_rol128(KA128, %xmm9, 60, %xmm15); vec_ror128(KL128, %xmm10, 128-77, %xmm15); /* absorb kw2 to other subkeys */ vpslldq $8, KL128, %xmm15; vpsrldq $8, %xmm15, %xmm15; vpxor %xmm15, KA128, KA128; vpxor %xmm15, %xmm3, %xmm3; vpxor %xmm15, %xmm4, %xmm4; /* subl(1) ^= subr(1) & ~subr(9); */ vpandn %xmm15, %xmm5, %xmm13; vpslldq $12, %xmm13, %xmm13; vpsrldq $8, %xmm13, %xmm13; vpxor %xmm13, %xmm15, %xmm15; /* dw = subl(1) & subl(9), subr(1) ^= CAMELLIA_RL1(dw); */ vpand %xmm15, %xmm5, %xmm14; vpslld $1, %xmm14, %xmm11; vpsrld $31, %xmm14, %xmm14; vpaddd %xmm11, %xmm14, %xmm14; vpslldq $8, %xmm14, %xmm14; vpsrldq $12, %xmm14, %xmm14; vpxor %xmm14, %xmm15, %xmm15; vpxor %xmm15, %xmm6, %xmm6; vpxor %xmm15, %xmm8, %xmm8; vpxor %xmm15, %xmm9, %xmm9; /* subl(1) ^= subr(1) & ~subr(17); */ vpandn %xmm15, %xmm10, %xmm13; vpslldq $12, %xmm13, %xmm13; vpsrldq $8, %xmm13, %xmm13; vpxor %xmm13, %xmm15, %xmm15; /* dw = subl(1) & subl(17), subr(1) ^= CAMELLIA_RL1(dw); */ vpand %xmm15, %xmm10, %xmm14; vpslld $1, %xmm14, %xmm11; vpsrld $31, %xmm14, %xmm14; vpaddd %xmm11, %xmm14, %xmm14; vpslldq $8, %xmm14, %xmm14; vpsrldq $12, %xmm14, %xmm14; vpxor %xmm14, %xmm15, %xmm15; vpshufd $0x1b, KL128, KL128; vpshufd $0x1b, KA128, KA128; vpshufd $0x1b, %xmm3, %xmm3; vpshufd $0x1b, %xmm4, %xmm4; vpshufd $0x1b, %xmm5, %xmm5; vpshufd $0x1b, %xmm6, %xmm6; vpshufd $0x1b, 
%xmm7, %xmm7; vpshufd $0x1b, %xmm8, %xmm8; vpshufd $0x1b, %xmm9, %xmm9; vpshufd $0x1b, %xmm10, %xmm10; vmovdqu KL128, cmll_sub(0, CTX); vpshufd $0x1b, KL128, KL128; vmovdqu KA128, cmll_sub(2, CTX); vmovdqu %xmm3, cmll_sub(4, CTX); vmovdqu %xmm4, cmll_sub(6, CTX); vmovdqu %xmm5, cmll_sub(8, CTX); vmovdqu %xmm6, cmll_sub(10, CTX); vpsrldq $8, %xmm8, %xmm8; vmovq %xmm7, cmll_sub(12, CTX); vmovq %xmm8, cmll_sub(13, CTX); vmovdqu %xmm9, cmll_sub(14, CTX); vmovdqu %xmm10, cmll_sub(16, CTX); vmovdqu cmll_sub(24, CTX), KA128; vec_ror128(KL128, %xmm3, 128 - 94, %xmm7); vec_ror128(KA128, %xmm4, 128 - 94, %xmm7); vec_ror128(KL128, %xmm5, 128 - 111, %xmm7); vec_ror128(KA128, %xmm6, 128 - 111, %xmm7); vpxor %xmm15, %xmm3, %xmm3; vpxor %xmm15, %xmm4, %xmm4; vpxor %xmm15, %xmm5, %xmm5; vpslldq $8, %xmm15, %xmm15; vpxor %xmm15, %xmm6, %xmm6; /* absorb kw4 to other subkeys */ vpslldq $8, %xmm6, %xmm15; vpxor %xmm15, %xmm5, %xmm5; vpxor %xmm15, %xmm4, %xmm4; vpxor %xmm15, %xmm3, %xmm3; /* subl(25) ^= subr(25) & ~subr(16); */ vpshufd $0x1b, cmll_sub(16, CTX), %xmm10; vpandn %xmm15, %xmm10, %xmm13; vpslldq $4, %xmm13, %xmm13; vpxor %xmm13, %xmm15, %xmm15; /* dw = subl(25) & subl(16), subr(25) ^= CAMELLIA_RL1(dw); */ vpand %xmm15, %xmm10, %xmm14; vpslld $1, %xmm14, %xmm11; vpsrld $31, %xmm14, %xmm14; vpaddd %xmm11, %xmm14, %xmm14; vpsrldq $12, %xmm14, %xmm14; vpslldq $8, %xmm14, %xmm14; vpxor %xmm14, %xmm15, %xmm15; vpshufd $0x1b, %xmm3, %xmm3; vpshufd $0x1b, %xmm4, %xmm4; vpshufd $0x1b, %xmm5, %xmm5; vpshufd $0x1b, %xmm6, %xmm6; vmovdqu %xmm3, cmll_sub(18, CTX); vmovdqu %xmm4, cmll_sub(20, CTX); vmovdqu %xmm5, cmll_sub(22, CTX); vmovdqu %xmm6, cmll_sub(24, CTX); vpshufd $0x1b, cmll_sub(14, CTX), %xmm3; vpshufd $0x1b, cmll_sub(12, CTX), %xmm4; vpshufd $0x1b, cmll_sub(10, CTX), %xmm5; vpshufd $0x1b, cmll_sub(8, CTX), %xmm6; vpxor %xmm15, %xmm3, %xmm3; vpxor %xmm15, %xmm4, %xmm4; vpxor %xmm15, %xmm5, %xmm5; /* subl(25) ^= subr(25) & ~subr(8); */ vpandn %xmm15, %xmm6, %xmm13; vpslldq $4, %xmm13, %xmm13; vpxor %xmm13, %xmm15, %xmm15; /* dw = subl(25) & subl(8), subr(25) ^= CAMELLIA_RL1(dw); */ vpand %xmm15, %xmm6, %xmm14; vpslld $1, %xmm14, %xmm11; vpsrld $31, %xmm14, %xmm14; vpaddd %xmm11, %xmm14, %xmm14; vpsrldq $12, %xmm14, %xmm14; vpslldq $8, %xmm14, %xmm14; vpxor %xmm14, %xmm15, %xmm15; vpshufd $0x1b, %xmm3, %xmm3; vpshufd $0x1b, %xmm4, %xmm4; vpshufd $0x1b, %xmm5, %xmm5; vmovdqu %xmm3, cmll_sub(14, CTX); vmovdqu %xmm4, cmll_sub(12, CTX); vmovdqu %xmm5, cmll_sub(10, CTX); vpshufd $0x1b, cmll_sub(6, CTX), %xmm6; vpshufd $0x1b, cmll_sub(4, CTX), %xmm4; vpshufd $0x1b, cmll_sub(2, CTX), %xmm2; vpshufd $0x1b, cmll_sub(0, CTX), %xmm0; vpxor %xmm15, %xmm6, %xmm6; vpxor %xmm15, %xmm4, %xmm4; vpxor %xmm15, %xmm2, %xmm2; vpxor %xmm15, %xmm0, %xmm0; vpshufd $0x1b, %xmm6, %xmm6; vpshufd $0x1b, %xmm4, %xmm4; vpshufd $0x1b, %xmm2, %xmm2; vpshufd $0x1b, %xmm0, %xmm0; vpsrldq $8, %xmm2, %xmm3; vpsrldq $8, %xmm4, %xmm5; vpsrldq $8, %xmm6, %xmm7; /* * key XOR is end of F-function. 
*/ vpxor %xmm2, %xmm0, %xmm0; vpxor %xmm4, %xmm2, %xmm2; vmovq %xmm0, cmll_sub(0, CTX); vmovq %xmm3, cmll_sub(2, CTX); vpxor %xmm5, %xmm3, %xmm3; vpxor %xmm6, %xmm4, %xmm4; vpxor %xmm7, %xmm5, %xmm5; vmovq %xmm2, cmll_sub(3, CTX); vmovq %xmm3, cmll_sub(4, CTX); vmovq %xmm4, cmll_sub(5, CTX); vmovq %xmm5, cmll_sub(6, CTX); vmovq cmll_sub(7, CTX), %xmm7; vmovq cmll_sub(8, CTX), %xmm8; vmovq cmll_sub(9, CTX), %xmm9; vmovq cmll_sub(10, CTX), %xmm10; /* tl = subl(10) ^ (subr(10) & ~subr(8)); */ vpandn %xmm10, %xmm8, %xmm15; vpsrldq $4, %xmm15, %xmm15; vpxor %xmm15, %xmm10, %xmm0; /* dw = tl & subl(8), tr = subr(10) ^ CAMELLIA_RL1(dw); */ vpand %xmm8, %xmm0, %xmm15; vpslld $1, %xmm15, %xmm14; vpsrld $31, %xmm15, %xmm15; vpaddd %xmm14, %xmm15, %xmm15; vpslldq $12, %xmm15, %xmm15; vpsrldq $8, %xmm15, %xmm15; vpxor %xmm15, %xmm0, %xmm0; vpxor %xmm0, %xmm6, %xmm6; vmovq %xmm6, cmll_sub(7, CTX); vmovq cmll_sub(11, CTX), %xmm11; vmovq cmll_sub(12, CTX), %xmm12; vmovq cmll_sub(13, CTX), %xmm13; vmovq cmll_sub(14, CTX), %xmm14; vmovq cmll_sub(15, CTX), %xmm15; /* tl = subl(7) ^ (subr(7) & ~subr(9)); */ vpandn %xmm7, %xmm9, %xmm1; vpsrldq $4, %xmm1, %xmm1; vpxor %xmm1, %xmm7, %xmm0; /* dw = tl & subl(9), tr = subr(7) ^ CAMELLIA_RL1(dw); */ vpand %xmm9, %xmm0, %xmm1; vpslld $1, %xmm1, %xmm2; vpsrld $31, %xmm1, %xmm1; vpaddd %xmm2, %xmm1, %xmm1; vpslldq $12, %xmm1, %xmm1; vpsrldq $8, %xmm1, %xmm1; vpxor %xmm1, %xmm0, %xmm0; vpxor %xmm11, %xmm0, %xmm0; vpxor %xmm12, %xmm10, %xmm10; vpxor %xmm13, %xmm11, %xmm11; vpxor %xmm14, %xmm12, %xmm12; vpxor %xmm15, %xmm13, %xmm13; vmovq %xmm0, cmll_sub(10, CTX); vmovq %xmm10, cmll_sub(11, CTX); vmovq %xmm11, cmll_sub(12, CTX); vmovq %xmm12, cmll_sub(13, CTX); vmovq %xmm13, cmll_sub(14, CTX); vmovq cmll_sub(16, CTX), %xmm6; vmovq cmll_sub(17, CTX), %xmm7; vmovq cmll_sub(18, CTX), %xmm8; vmovq cmll_sub(19, CTX), %xmm9; vmovq cmll_sub(20, CTX), %xmm10; /* tl = subl(18) ^ (subr(18) & ~subr(16)); */ vpandn %xmm8, %xmm6, %xmm1; vpsrldq $4, %xmm1, %xmm1; vpxor %xmm1, %xmm8, %xmm0; /* dw = tl & subl(16), tr = subr(18) ^ CAMELLIA_RL1(dw); */ vpand %xmm6, %xmm0, %xmm1; vpslld $1, %xmm1, %xmm2; vpsrld $31, %xmm1, %xmm1; vpaddd %xmm2, %xmm1, %xmm1; vpslldq $12, %xmm1, %xmm1; vpsrldq $8, %xmm1, %xmm1; vpxor %xmm1, %xmm0, %xmm0; vpxor %xmm14, %xmm0, %xmm0; vmovq %xmm0, cmll_sub(15, CTX); /* tl = subl(15) ^ (subr(15) & ~subr(17)); */ vpandn %xmm15, %xmm7, %xmm1; vpsrldq $4, %xmm1, %xmm1; vpxor %xmm1, %xmm15, %xmm0; /* dw = tl & subl(17), tr = subr(15) ^ CAMELLIA_RL1(dw); */ vpand %xmm7, %xmm0, %xmm1; vpslld $1, %xmm1, %xmm2; vpsrld $31, %xmm1, %xmm1; vpaddd %xmm2, %xmm1, %xmm1; vpslldq $12, %xmm1, %xmm1; vpsrldq $8, %xmm1, %xmm1; vpxor %xmm1, %xmm0, %xmm0; vmovq cmll_sub(21, CTX), %xmm1; vmovq cmll_sub(22, CTX), %xmm2; vmovq cmll_sub(23, CTX), %xmm3; vmovq cmll_sub(24, CTX), %xmm4; vpxor %xmm9, %xmm0, %xmm0; vpxor %xmm10, %xmm8, %xmm8; vpxor %xmm1, %xmm9, %xmm9; vpxor %xmm2, %xmm10, %xmm10; vpxor %xmm3, %xmm1, %xmm1; vpxor %xmm4, %xmm3, %xmm3; vmovq %xmm0, cmll_sub(18, CTX); vmovq %xmm8, cmll_sub(19, CTX); vmovq %xmm9, cmll_sub(20, CTX); vmovq %xmm10, cmll_sub(21, CTX); vmovq %xmm1, cmll_sub(22, CTX); vmovq %xmm2, cmll_sub(23, CTX); vmovq %xmm3, cmll_sub(24, CTX); /* kw2 and kw4 are unused now. 
*/ movq $0, cmll_sub(1, CTX); movq $0, cmll_sub(25, CTX); vzeroall; ret; + CFI_ENDPROC(); ELF(.size __camellia_avx_setup128,.-__camellia_avx_setup128;) .align 8 ELF(.type __camellia_avx_setup256,@function;) __camellia_avx_setup256: /* input: * %rdi: ctx, CTX; subkey storage at key_table(CTX) * %xmm0 & %xmm1: key */ + CFI_STARTPROC(); + #define KL128 %xmm0 #define KR128 %xmm1 #define KA128 %xmm2 #define KB128 %xmm3 - vpshufb .Lbswap128_mask RIP, KL128, KL128; - vpshufb .Lbswap128_mask RIP, KR128, KR128; + vpshufb .Lbswap128_mask rRIP, KL128, KL128; + vpshufb .Lbswap128_mask rRIP, KR128, KR128; - vmovdqa .Linv_shift_row_and_unpcklbw RIP, %xmm11; - vmovq .Lsbox4_input_mask RIP, %xmm12; - vbroadcastss .L0f0f0f0f RIP, %xmm13; - vmovdqa .Lpre_tf_lo_s1 RIP, %xmm14; - vmovdqa .Lpre_tf_hi_s1 RIP, %xmm15; + vmovdqa .Linv_shift_row_and_unpcklbw rRIP, %xmm11; + vmovq .Lsbox4_input_mask rRIP, %xmm12; + vbroadcastss .L0f0f0f0f rRIP, %xmm13; + vmovdqa .Lpre_tf_lo_s1 rRIP, %xmm14; + vmovdqa .Lpre_tf_hi_s1 rRIP, %xmm15; /* * Generate KA */ vpxor KL128, KR128, %xmm3; vpsrldq $8, KR128, %xmm6; vpsrldq $8, %xmm3, %xmm2; vpslldq $8, %xmm3, %xmm3; vpsrldq $8, %xmm3, %xmm3; camellia_f(%xmm2, %xmm4, %xmm5, %xmm7, %xmm8, %xmm9, %xmm10, - %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma1 RIP); + %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma1 rRIP); vpxor %xmm4, %xmm3, %xmm3; camellia_f(%xmm3, %xmm2, %xmm5, %xmm7, %xmm8, %xmm9, %xmm10, - %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma2 RIP); + %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma2 rRIP); vpxor %xmm6, %xmm2, %xmm2; camellia_f(%xmm2, %xmm3, %xmm5, %xmm7, %xmm8, %xmm9, %xmm10, - %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma3 RIP); + %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma3 rRIP); vpxor %xmm4, %xmm3, %xmm3; vpxor KR128, %xmm3, %xmm3; camellia_f(%xmm3, %xmm4, %xmm5, %xmm7, %xmm8, %xmm9, %xmm10, - %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma4 RIP); + %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma4 rRIP); vpslldq $8, %xmm3, %xmm3; vpxor %xmm4, %xmm2, %xmm2; vpsrldq $8, %xmm3, %xmm3; vpslldq $8, %xmm2, KA128; vpor %xmm3, KA128, KA128; /* * Generate KB */ vpxor KA128, KR128, %xmm3; vpsrldq $8, %xmm3, %xmm4; vpslldq $8, %xmm3, %xmm3; vpsrldq $8, %xmm3, %xmm3; camellia_f(%xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, - %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma5 RIP); + %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma5 rRIP); vpxor %xmm5, %xmm3, %xmm3; camellia_f(%xmm3, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, - %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma6 RIP); + %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma6 rRIP); vpslldq $8, %xmm3, %xmm3; vpxor %xmm5, %xmm4, %xmm4; vpsrldq $8, %xmm3, %xmm3; vpslldq $8, %xmm4, %xmm4; vpor %xmm3, %xmm4, KB128; /* * Generate subkeys */ vmovdqu KB128, cmll_sub(32, CTX); vec_rol128(KR128, %xmm4, 15, %xmm15); vec_rol128(KA128, %xmm5, 15, %xmm15); vec_rol128(KR128, %xmm6, 30, %xmm15); vec_rol128(KB128, %xmm7, 30, %xmm15); vec_rol128(KL128, %xmm8, 45, %xmm15); vec_rol128(KA128, %xmm9, 45, %xmm15); vec_rol128(KL128, %xmm10, 60, %xmm15); vec_rol128(KR128, %xmm11, 60, %xmm15); vec_rol128(KB128, %xmm12, 60, %xmm15); /* absorb kw2 to other subkeys */ vpslldq $8, KL128, %xmm15; vpsrldq $8, %xmm15, %xmm15; vpxor %xmm15, KB128, KB128; vpxor %xmm15, %xmm4, %xmm4; vpxor %xmm15, %xmm5, %xmm5; /* subl(1) ^= subr(1) & ~subr(9); */ vpandn %xmm15, %xmm6, %xmm13; vpslldq $12, %xmm13, %xmm13; vpsrldq $8, %xmm13, %xmm13; vpxor %xmm13, %xmm15, %xmm15; /* dw = subl(1) & subl(9), subr(1) ^= CAMELLIA_RL1(dw); */ vpand %xmm15, 
%xmm6, %xmm14; vpslld $1, %xmm14, %xmm13; vpsrld $31, %xmm14, %xmm14; vpaddd %xmm13, %xmm14, %xmm14; vpslldq $8, %xmm14, %xmm14; vpsrldq $12, %xmm14, %xmm14; vpxor %xmm14, %xmm15, %xmm15; vpxor %xmm15, %xmm7, %xmm7; vpxor %xmm15, %xmm8, %xmm8; vpxor %xmm15, %xmm9, %xmm9; vpshufd $0x1b, KL128, KL128; vpshufd $0x1b, KB128, KB128; vpshufd $0x1b, %xmm4, %xmm4; vpshufd $0x1b, %xmm5, %xmm5; vpshufd $0x1b, %xmm6, %xmm6; vpshufd $0x1b, %xmm7, %xmm7; vpshufd $0x1b, %xmm8, %xmm8; vpshufd $0x1b, %xmm9, %xmm9; vmovdqu KL128, cmll_sub(0, CTX); vpshufd $0x1b, KL128, KL128; vmovdqu KB128, cmll_sub(2, CTX); vmovdqu %xmm4, cmll_sub(4, CTX); vmovdqu %xmm5, cmll_sub(6, CTX); vmovdqu %xmm6, cmll_sub(8, CTX); vmovdqu %xmm7, cmll_sub(10, CTX); vmovdqu %xmm8, cmll_sub(12, CTX); vmovdqu %xmm9, cmll_sub(14, CTX); vmovdqu cmll_sub(32, CTX), KB128; /* subl(1) ^= subr(1) & ~subr(17); */ vpandn %xmm15, %xmm10, %xmm13; vpslldq $12, %xmm13, %xmm13; vpsrldq $8, %xmm13, %xmm13; vpxor %xmm13, %xmm15, %xmm15; /* dw = subl(1) & subl(17), subr(1) ^= CAMELLIA_RL1(dw); */ vpand %xmm15, %xmm10, %xmm14; vpslld $1, %xmm14, %xmm13; vpsrld $31, %xmm14, %xmm14; vpaddd %xmm13, %xmm14, %xmm14; vpslldq $8, %xmm14, %xmm14; vpsrldq $12, %xmm14, %xmm14; vpxor %xmm14, %xmm15, %xmm15; vpxor %xmm15, %xmm11, %xmm11; vpxor %xmm15, %xmm12, %xmm12; vec_ror128(KL128, %xmm4, 128-77, %xmm14); vec_ror128(KA128, %xmm5, 128-77, %xmm14); vec_ror128(KR128, %xmm6, 128-94, %xmm14); vec_ror128(KA128, %xmm7, 128-94, %xmm14); vec_ror128(KL128, %xmm8, 128-111, %xmm14); vec_ror128(KB128, %xmm9, 128-111, %xmm14); vpxor %xmm15, %xmm4, %xmm4; vpshufd $0x1b, %xmm10, %xmm10; vpshufd $0x1b, %xmm11, %xmm11; vpshufd $0x1b, %xmm12, %xmm12; vpshufd $0x1b, %xmm4, %xmm4; vmovdqu %xmm10, cmll_sub(16, CTX); vmovdqu %xmm11, cmll_sub(18, CTX); vmovdqu %xmm12, cmll_sub(20, CTX); vmovdqu %xmm4, cmll_sub(22, CTX); /* subl(1) ^= subr(1) & ~subr(25); */ vpandn %xmm15, %xmm5, %xmm13; vpslldq $12, %xmm13, %xmm13; vpsrldq $8, %xmm13, %xmm13; vpxor %xmm13, %xmm15, %xmm15; /* dw = subl(1) & subl(25), subr(1) ^= CAMELLIA_RL1(dw); */ vpand %xmm15, %xmm5, %xmm14; vpslld $1, %xmm14, %xmm13; vpsrld $31, %xmm14, %xmm14; vpaddd %xmm13, %xmm14, %xmm14; vpslldq $8, %xmm14, %xmm14; vpsrldq $12, %xmm14, %xmm14; vpxor %xmm14, %xmm15, %xmm15; vpxor %xmm15, %xmm6, %xmm6; vpxor %xmm15, %xmm7, %xmm7; vpxor %xmm15, %xmm8, %xmm8; vpslldq $8, %xmm15, %xmm15; vpxor %xmm15, %xmm9, %xmm9; /* absorb kw4 to other subkeys */ vpslldq $8, %xmm9, %xmm15; vpxor %xmm15, %xmm8, %xmm8; vpxor %xmm15, %xmm7, %xmm7; vpxor %xmm15, %xmm6, %xmm6; /* subl(33) ^= subr(33) & ~subr(24); */ vpandn %xmm15, %xmm5, %xmm14; vpslldq $4, %xmm14, %xmm14; vpxor %xmm14, %xmm15, %xmm15; /* dw = subl(33) & subl(24), subr(33) ^= CAMELLIA_RL1(dw); */ vpand %xmm15, %xmm5, %xmm14; vpslld $1, %xmm14, %xmm13; vpsrld $31, %xmm14, %xmm14; vpaddd %xmm13, %xmm14, %xmm14; vpsrldq $12, %xmm14, %xmm14; vpslldq $8, %xmm14, %xmm14; vpxor %xmm14, %xmm15, %xmm15; vpshufd $0x1b, %xmm5, %xmm5; vpshufd $0x1b, %xmm6, %xmm6; vpshufd $0x1b, %xmm7, %xmm7; vpshufd $0x1b, %xmm8, %xmm8; vpshufd $0x1b, %xmm9, %xmm9; vmovdqu %xmm5, cmll_sub(24, CTX); vmovdqu %xmm6, cmll_sub(26, CTX); vmovdqu %xmm7, cmll_sub(28, CTX); vmovdqu %xmm8, cmll_sub(30, CTX); vmovdqu %xmm9, cmll_sub(32, CTX); vpshufd $0x1b, cmll_sub(22, CTX), %xmm0; vpshufd $0x1b, cmll_sub(20, CTX), %xmm1; vpshufd $0x1b, cmll_sub(18, CTX), %xmm2; vpshufd $0x1b, cmll_sub(16, CTX), %xmm3; vpshufd $0x1b, cmll_sub(14, CTX), %xmm4; vpshufd $0x1b, cmll_sub(12, CTX), %xmm5; vpshufd $0x1b, cmll_sub(10, CTX), %xmm6; 
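/* The vpshufd $0x1b reloads around this point undo the word-reversed
 * order the middle subkeys were stored in, so that the running kw4
 * value in %xmm15 can be XORed into them ("absorb kw4 to other
 * subkeys", as in the 128-bit setup above).  In the subl()/subr()
 * comments that follow, subl(n)/subr(n) are the left/right 32-bit
 * halves of subkey n, and CAMELLIA_RL1(dw) is a 32-bit rotate left by
 * one, implemented below with vpslld $1, vpsrld $31 and vpaddd. */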
vpshufd $0x1b, cmll_sub(8, CTX), %xmm7; vpxor %xmm15, %xmm0, %xmm0; vpxor %xmm15, %xmm1, %xmm1; vpxor %xmm15, %xmm2, %xmm2; /* subl(33) ^= subr(33) & ~subr(24); */ vpandn %xmm15, %xmm3, %xmm14; vpslldq $4, %xmm14, %xmm14; vpxor %xmm14, %xmm15, %xmm15; /* dw = subl(33) & subl(24), subr(33) ^= CAMELLIA_RL1(dw); */ vpand %xmm15, %xmm3, %xmm14; vpslld $1, %xmm14, %xmm13; vpsrld $31, %xmm14, %xmm14; vpaddd %xmm13, %xmm14, %xmm14; vpsrldq $12, %xmm14, %xmm14; vpslldq $8, %xmm14, %xmm14; vpxor %xmm14, %xmm15, %xmm15; vpxor %xmm15, %xmm4, %xmm4; vpxor %xmm15, %xmm5, %xmm5; vpxor %xmm15, %xmm6, %xmm6; vpshufd $0x1b, %xmm0, %xmm0; vpshufd $0x1b, %xmm1, %xmm1; vpshufd $0x1b, %xmm2, %xmm2; vpshufd $0x1b, %xmm4, %xmm4; vpshufd $0x1b, %xmm5, %xmm5; vpshufd $0x1b, %xmm6, %xmm6; vmovdqu %xmm0, cmll_sub(22, CTX); vmovdqu %xmm1, cmll_sub(20, CTX); vmovdqu %xmm2, cmll_sub(18, CTX); vmovdqu %xmm4, cmll_sub(14, CTX); vmovdqu %xmm5, cmll_sub(12, CTX); vmovdqu %xmm6, cmll_sub(10, CTX); vpshufd $0x1b, cmll_sub(6, CTX), %xmm6; vpshufd $0x1b, cmll_sub(4, CTX), %xmm4; vpshufd $0x1b, cmll_sub(2, CTX), %xmm2; vpshufd $0x1b, cmll_sub(0, CTX), %xmm0; /* subl(33) ^= subr(33) & ~subr(24); */ vpandn %xmm15, %xmm7, %xmm14; vpslldq $4, %xmm14, %xmm14; vpxor %xmm14, %xmm15, %xmm15; /* dw = subl(33) & subl(24), subr(33) ^= CAMELLIA_RL1(dw); */ vpand %xmm15, %xmm7, %xmm14; vpslld $1, %xmm14, %xmm13; vpsrld $31, %xmm14, %xmm14; vpaddd %xmm13, %xmm14, %xmm14; vpsrldq $12, %xmm14, %xmm14; vpslldq $8, %xmm14, %xmm14; vpxor %xmm14, %xmm15, %xmm15; vpxor %xmm15, %xmm6, %xmm6; vpxor %xmm15, %xmm4, %xmm4; vpxor %xmm15, %xmm2, %xmm2; vpxor %xmm15, %xmm0, %xmm0; vpshufd $0x1b, %xmm6, %xmm6; vpshufd $0x1b, %xmm4, %xmm4; vpshufd $0x1b, %xmm2, %xmm2; vpshufd $0x1b, %xmm0, %xmm0; vpsrldq $8, %xmm2, %xmm3; vpsrldq $8, %xmm4, %xmm5; vpsrldq $8, %xmm6, %xmm7; /* * key XOR is end of F-function. 
*/ vpxor %xmm2, %xmm0, %xmm0; vpxor %xmm4, %xmm2, %xmm2; vmovq %xmm0, cmll_sub(0, CTX); vmovq %xmm3, cmll_sub(2, CTX); vpxor %xmm5, %xmm3, %xmm3; vpxor %xmm6, %xmm4, %xmm4; vpxor %xmm7, %xmm5, %xmm5; vmovq %xmm2, cmll_sub(3, CTX); vmovq %xmm3, cmll_sub(4, CTX); vmovq %xmm4, cmll_sub(5, CTX); vmovq %xmm5, cmll_sub(6, CTX); vmovq cmll_sub(7, CTX), %xmm7; vmovq cmll_sub(8, CTX), %xmm8; vmovq cmll_sub(9, CTX), %xmm9; vmovq cmll_sub(10, CTX), %xmm10; /* tl = subl(10) ^ (subr(10) & ~subr(8)); */ vpandn %xmm10, %xmm8, %xmm15; vpsrldq $4, %xmm15, %xmm15; vpxor %xmm15, %xmm10, %xmm0; /* dw = tl & subl(8), tr = subr(10) ^ CAMELLIA_RL1(dw); */ vpand %xmm8, %xmm0, %xmm15; vpslld $1, %xmm15, %xmm14; vpsrld $31, %xmm15, %xmm15; vpaddd %xmm14, %xmm15, %xmm15; vpslldq $12, %xmm15, %xmm15; vpsrldq $8, %xmm15, %xmm15; vpxor %xmm15, %xmm0, %xmm0; vpxor %xmm0, %xmm6, %xmm6; vmovq %xmm6, cmll_sub(7, CTX); vmovq cmll_sub(11, CTX), %xmm11; vmovq cmll_sub(12, CTX), %xmm12; vmovq cmll_sub(13, CTX), %xmm13; vmovq cmll_sub(14, CTX), %xmm14; vmovq cmll_sub(15, CTX), %xmm15; /* tl = subl(7) ^ (subr(7) & ~subr(9)); */ vpandn %xmm7, %xmm9, %xmm1; vpsrldq $4, %xmm1, %xmm1; vpxor %xmm1, %xmm7, %xmm0; /* dw = tl & subl(9), tr = subr(7) ^ CAMELLIA_RL1(dw); */ vpand %xmm9, %xmm0, %xmm1; vpslld $1, %xmm1, %xmm2; vpsrld $31, %xmm1, %xmm1; vpaddd %xmm2, %xmm1, %xmm1; vpslldq $12, %xmm1, %xmm1; vpsrldq $8, %xmm1, %xmm1; vpxor %xmm1, %xmm0, %xmm0; vpxor %xmm11, %xmm0, %xmm0; vpxor %xmm12, %xmm10, %xmm10; vpxor %xmm13, %xmm11, %xmm11; vpxor %xmm14, %xmm12, %xmm12; vpxor %xmm15, %xmm13, %xmm13; vmovq %xmm0, cmll_sub(10, CTX); vmovq %xmm10, cmll_sub(11, CTX); vmovq %xmm11, cmll_sub(12, CTX); vmovq %xmm12, cmll_sub(13, CTX); vmovq %xmm13, cmll_sub(14, CTX); vmovq cmll_sub(16, CTX), %xmm6; vmovq cmll_sub(17, CTX), %xmm7; vmovq cmll_sub(18, CTX), %xmm8; vmovq cmll_sub(19, CTX), %xmm9; vmovq cmll_sub(20, CTX), %xmm10; /* tl = subl(18) ^ (subr(18) & ~subr(16)); */ vpandn %xmm8, %xmm6, %xmm1; vpsrldq $4, %xmm1, %xmm1; vpxor %xmm1, %xmm8, %xmm0; /* dw = tl & subl(16), tr = subr(18) ^ CAMELLIA_RL1(dw); */ vpand %xmm6, %xmm0, %xmm1; vpslld $1, %xmm1, %xmm2; vpsrld $31, %xmm1, %xmm1; vpaddd %xmm2, %xmm1, %xmm1; vpslldq $12, %xmm1, %xmm1; vpsrldq $8, %xmm1, %xmm1; vpxor %xmm1, %xmm0, %xmm0; vpxor %xmm14, %xmm0, %xmm0; vmovq %xmm0, cmll_sub(15, CTX); /* tl = subl(15) ^ (subr(15) & ~subr(17)); */ vpandn %xmm15, %xmm7, %xmm1; vpsrldq $4, %xmm1, %xmm1; vpxor %xmm1, %xmm15, %xmm0; /* dw = tl & subl(17), tr = subr(15) ^ CAMELLIA_RL1(dw); */ vpand %xmm7, %xmm0, %xmm1; vpslld $1, %xmm1, %xmm2; vpsrld $31, %xmm1, %xmm1; vpaddd %xmm2, %xmm1, %xmm1; vpslldq $12, %xmm1, %xmm1; vpsrldq $8, %xmm1, %xmm1; vpxor %xmm1, %xmm0, %xmm0; vmovq cmll_sub(21, CTX), %xmm1; vmovq cmll_sub(22, CTX), %xmm2; vmovq cmll_sub(23, CTX), %xmm3; vmovq cmll_sub(24, CTX), %xmm4; vpxor %xmm9, %xmm0, %xmm0; vpxor %xmm10, %xmm8, %xmm8; vpxor %xmm1, %xmm9, %xmm9; vpxor %xmm2, %xmm10, %xmm10; vpxor %xmm3, %xmm1, %xmm1; vmovq %xmm0, cmll_sub(18, CTX); vmovq %xmm8, cmll_sub(19, CTX); vmovq %xmm9, cmll_sub(20, CTX); vmovq %xmm10, cmll_sub(21, CTX); vmovq %xmm1, cmll_sub(22, CTX); vmovq cmll_sub(25, CTX), %xmm5; vmovq cmll_sub(26, CTX), %xmm6; vmovq cmll_sub(27, CTX), %xmm7; vmovq cmll_sub(28, CTX), %xmm8; vmovq cmll_sub(29, CTX), %xmm9; vmovq cmll_sub(30, CTX), %xmm10; vmovq cmll_sub(31, CTX), %xmm11; vmovq cmll_sub(32, CTX), %xmm12; /* tl = subl(26) ^ (subr(26) & ~subr(24)); */ vpandn %xmm6, %xmm4, %xmm15; vpsrldq $4, %xmm15, %xmm15; vpxor %xmm15, %xmm6, %xmm0; /* dw = tl & subl(26), tr 
= subr(24) ^ CAMELLIA_RL1(dw); */ vpand %xmm4, %xmm0, %xmm15; vpslld $1, %xmm15, %xmm14; vpsrld $31, %xmm15, %xmm15; vpaddd %xmm14, %xmm15, %xmm15; vpslldq $12, %xmm15, %xmm15; vpsrldq $8, %xmm15, %xmm15; vpxor %xmm15, %xmm0, %xmm0; vpxor %xmm0, %xmm2, %xmm2; vmovq %xmm2, cmll_sub(23, CTX); /* tl = subl(23) ^ (subr(23) & ~subr(25)); */ vpandn %xmm3, %xmm5, %xmm15; vpsrldq $4, %xmm15, %xmm15; vpxor %xmm15, %xmm3, %xmm0; /* dw = tl & subl(26), tr = subr(24) ^ CAMELLIA_RL1(dw); */ vpand %xmm5, %xmm0, %xmm15; vpslld $1, %xmm15, %xmm14; vpsrld $31, %xmm15, %xmm15; vpaddd %xmm14, %xmm15, %xmm15; vpslldq $12, %xmm15, %xmm15; vpsrldq $8, %xmm15, %xmm15; vpxor %xmm15, %xmm0, %xmm0; vpxor %xmm7, %xmm0, %xmm0; vpxor %xmm8, %xmm6, %xmm6; vpxor %xmm9, %xmm7, %xmm7; vpxor %xmm10, %xmm8, %xmm8; vpxor %xmm11, %xmm9, %xmm9; vpxor %xmm12, %xmm11, %xmm11; vmovq %xmm0, cmll_sub(26, CTX); vmovq %xmm6, cmll_sub(27, CTX); vmovq %xmm7, cmll_sub(28, CTX); vmovq %xmm8, cmll_sub(29, CTX); vmovq %xmm9, cmll_sub(30, CTX); vmovq %xmm10, cmll_sub(31, CTX); vmovq %xmm11, cmll_sub(32, CTX); /* kw2 and kw4 are unused now. */ movq $0, cmll_sub(1, CTX); movq $0, cmll_sub(33, CTX); vzeroall; ret; + CFI_ENDPROC(); ELF(.size __camellia_avx_setup256,.-__camellia_avx_setup256;) .align 8 .globl _gcry_camellia_aesni_avx_keygen ELF(.type _gcry_camellia_aesni_avx_keygen,@function;) _gcry_camellia_aesni_avx_keygen: /* input: * %rdi: ctx, CTX * %rsi: key * %rdx: keylen */ + CFI_STARTPROC(); vzeroupper; vmovdqu (%rsi), %xmm0; cmpl $24, %edx; jb __camellia_avx_setup128; je .Lprepare_key192; vmovdqu 16(%rsi), %xmm1; jmp __camellia_avx_setup256; .Lprepare_key192: vpcmpeqd %xmm2, %xmm2, %xmm2; vmovq 16(%rsi), %xmm1; vpxor %xmm1, %xmm2, %xmm2; vpslldq $8, %xmm2, %xmm2; vpor %xmm2, %xmm1, %xmm1; jmp __camellia_avx_setup256; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_keygen,.-_gcry_camellia_aesni_avx_keygen;) #endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)*/ #endif /*__x86_64*/ diff --git a/cipher/camellia-aesni-avx2-amd64.S b/cipher/camellia-aesni-avx2-amd64.S index 897e4aee..cc01c774 100644 --- a/cipher/camellia-aesni-avx2-amd64.S +++ b/cipher/camellia-aesni-avx2-amd64.S @@ -1,1762 +1,1810 @@ /* camellia-avx2-aesni-amd64.S - AES-NI/AVX2 implementation of Camellia cipher * * Copyright (C) 2013-2015 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT) -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif +#include "asm-common-amd64.h" #define CAMELLIA_TABLE_BYTE_LEN 272 /* struct CAMELLIA_context: */ #define key_table 0 #define key_bitlength CAMELLIA_TABLE_BYTE_LEN /* register macros */ #define CTX %rdi #define RIO %r8 /********************************************************************** helper macros **********************************************************************/ #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ vpand x, mask4bit, tmp0; \ vpandn x, mask4bit, x; \ vpsrld $4, x, x; \ \ vpshufb tmp0, lo_t, tmp0; \ vpshufb x, hi_t, x; \ vpxor tmp0, x, x; #define ymm0_x xmm0 #define ymm1_x xmm1 #define ymm2_x xmm2 #define ymm3_x xmm3 #define ymm4_x xmm4 #define ymm5_x xmm5 #define ymm6_x xmm6 #define ymm7_x xmm7 #define ymm8_x xmm8 #define ymm9_x xmm9 #define ymm10_x xmm10 #define ymm11_x xmm11 #define ymm12_x xmm12 #define ymm13_x xmm13 #define ymm14_x xmm14 #define ymm15_x xmm15 /********************************************************************** 32-way camellia **********************************************************************/ /* * IN: * x0..x7: byte-sliced AB state * mem_cd: register pointer storing CD state * key: index for key material * OUT: * x0..x7: new byte-sliced CD state */ #define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \ t7, mem_cd, key) \ /* \ * S-function with AES subbytes \ */ \ - vbroadcasti128 .Linv_shift_row RIP, t4; \ - vpbroadcastd .L0f0f0f0f RIP, t7; \ - vbroadcasti128 .Lpre_tf_lo_s1 RIP, t5; \ - vbroadcasti128 .Lpre_tf_hi_s1 RIP, t6; \ - vbroadcasti128 .Lpre_tf_lo_s4 RIP, t2; \ - vbroadcasti128 .Lpre_tf_hi_s4 RIP, t3; \ + vbroadcasti128 .Linv_shift_row rRIP, t4; \ + vpbroadcastd .L0f0f0f0f rRIP, t7; \ + vbroadcasti128 .Lpre_tf_lo_s1 rRIP, t5; \ + vbroadcasti128 .Lpre_tf_hi_s1 rRIP, t6; \ + vbroadcasti128 .Lpre_tf_lo_s4 rRIP, t2; \ + vbroadcasti128 .Lpre_tf_hi_s4 rRIP, t3; \ \ /* AES inverse shift rows */ \ vpshufb t4, x0, x0; \ vpshufb t4, x7, x7; \ vpshufb t4, x3, x3; \ vpshufb t4, x6, x6; \ vpshufb t4, x2, x2; \ vpshufb t4, x5, x5; \ vpshufb t4, x1, x1; \ vpshufb t4, x4, x4; \ \ /* prefilter sboxes 1, 2 and 3 */ \ /* prefilter sbox 4 */ \ filter_8bit(x0, t5, t6, t7, t4); \ filter_8bit(x7, t5, t6, t7, t4); \ vextracti128 $1, x0, t0##_x; \ vextracti128 $1, x7, t1##_x; \ filter_8bit(x3, t2, t3, t7, t4); \ filter_8bit(x6, t2, t3, t7, t4); \ vextracti128 $1, x3, t3##_x; \ vextracti128 $1, x6, t2##_x; \ filter_8bit(x2, t5, t6, t7, t4); \ filter_8bit(x5, t5, t6, t7, t4); \ filter_8bit(x1, t5, t6, t7, t4); \ filter_8bit(x4, t5, t6, t7, t4); \ \ vpxor t4##_x, t4##_x, t4##_x; \ \ /* AES subbytes + AES shift rows */ \ vextracti128 $1, x2, t6##_x; \ vextracti128 $1, x5, t5##_x; \ vaesenclast t4##_x, x0##_x, x0##_x; \ vaesenclast t4##_x, t0##_x, t0##_x; \ vaesenclast t4##_x, x7##_x, x7##_x; \ vaesenclast t4##_x, t1##_x, t1##_x; \ vaesenclast t4##_x, x3##_x, x3##_x; \ vaesenclast t4##_x, t3##_x, t3##_x; \ vaesenclast t4##_x, x6##_x, x6##_x; \ vaesenclast t4##_x, t2##_x, t2##_x; \ vinserti128 $1, t0##_x, x0, x0; \ vinserti128 $1, t1##_x, x7, x7; \ vinserti128 $1, t3##_x, x3, x3; \ vinserti128 $1, t2##_x, x6, x6; \ vextracti128 $1, x1, t3##_x; \ vextracti128 $1, x4, t2##_x; \ - vbroadcasti128 .Lpost_tf_lo_s1 RIP, t0; \ - vbroadcasti128 .Lpost_tf_hi_s1 RIP, t1; \ + vbroadcasti128 .Lpost_tf_lo_s1 rRIP, t0; \ + vbroadcasti128 .Lpost_tf_hi_s1 rRIP, t1; \ vaesenclast t4##_x, x2##_x, x2##_x; \ vaesenclast t4##_x, t6##_x, t6##_x; \ vaesenclast t4##_x, x5##_x, x5##_x; \ vaesenclast t4##_x, t5##_x, t5##_x; \ vaesenclast t4##_x, 
x1##_x, x1##_x; \ vaesenclast t4##_x, t3##_x, t3##_x; \ vaesenclast t4##_x, x4##_x, x4##_x; \ vaesenclast t4##_x, t2##_x, t2##_x; \ vinserti128 $1, t6##_x, x2, x2; \ vinserti128 $1, t5##_x, x5, x5; \ vinserti128 $1, t3##_x, x1, x1; \ vinserti128 $1, t2##_x, x4, x4; \ \ /* postfilter sboxes 1 and 4 */ \ - vbroadcasti128 .Lpost_tf_lo_s3 RIP, t2; \ - vbroadcasti128 .Lpost_tf_hi_s3 RIP, t3; \ + vbroadcasti128 .Lpost_tf_lo_s3 rRIP, t2; \ + vbroadcasti128 .Lpost_tf_hi_s3 rRIP, t3; \ filter_8bit(x0, t0, t1, t7, t4); \ filter_8bit(x7, t0, t1, t7, t4); \ filter_8bit(x3, t0, t1, t7, t6); \ filter_8bit(x6, t0, t1, t7, t6); \ \ /* postfilter sbox 3 */ \ - vbroadcasti128 .Lpost_tf_lo_s2 RIP, t4; \ - vbroadcasti128 .Lpost_tf_hi_s2 RIP, t5; \ + vbroadcasti128 .Lpost_tf_lo_s2 rRIP, t4; \ + vbroadcasti128 .Lpost_tf_hi_s2 rRIP, t5; \ filter_8bit(x2, t2, t3, t7, t6); \ filter_8bit(x5, t2, t3, t7, t6); \ \ vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \ \ /* postfilter sbox 2 */ \ filter_8bit(x1, t4, t5, t7, t2); \ filter_8bit(x4, t4, t5, t7, t2); \ vpxor t7, t7, t7; \ \ vpsrldq $1, t0, t1; \ vpsrldq $2, t0, t2; \ vpshufb t7, t1, t1; \ vpsrldq $3, t0, t3; \ \ /* P-function */ \ vpxor x5, x0, x0; \ vpxor x6, x1, x1; \ vpxor x7, x2, x2; \ vpxor x4, x3, x3; \ \ vpshufb t7, t2, t2; \ vpsrldq $4, t0, t4; \ vpshufb t7, t3, t3; \ vpsrldq $5, t0, t5; \ vpshufb t7, t4, t4; \ \ vpxor x2, x4, x4; \ vpxor x3, x5, x5; \ vpxor x0, x6, x6; \ vpxor x1, x7, x7; \ \ vpsrldq $6, t0, t6; \ vpshufb t7, t5, t5; \ vpshufb t7, t6, t6; \ \ vpxor x7, x0, x0; \ vpxor x4, x1, x1; \ vpxor x5, x2, x2; \ vpxor x6, x3, x3; \ \ vpxor x3, x4, x4; \ vpxor x0, x5, x5; \ vpxor x1, x6, x6; \ vpxor x2, x7, x7; /* note: high and low parts swapped */ \ \ /* Add key material and result to CD (x becomes new CD) */ \ \ vpxor t6, x1, x1; \ vpxor 5 * 32(mem_cd), x1, x1; \ \ vpsrldq $7, t0, t6; \ vpshufb t7, t0, t0; \ vpshufb t7, t6, t7; \ \ vpxor t7, x0, x0; \ vpxor 4 * 32(mem_cd), x0, x0; \ \ vpxor t5, x2, x2; \ vpxor 6 * 32(mem_cd), x2, x2; \ \ vpxor t4, x3, x3; \ vpxor 7 * 32(mem_cd), x3, x3; \ \ vpxor t3, x4, x4; \ vpxor 0 * 32(mem_cd), x4, x4; \ \ vpxor t2, x5, x5; \ vpxor 1 * 32(mem_cd), x5, x5; \ \ vpxor t1, x6, x6; \ vpxor 2 * 32(mem_cd), x6, x6; \ \ vpxor t0, x7, x7; \ vpxor 3 * 32(mem_cd), x7, x7; /* * IN/OUT: * x0..x7: byte-sliced AB state preloaded * mem_ab: byte-sliced AB state in memory * mem_cb: byte-sliced CD state in memory */ #define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, i, dir, store_ab) \ roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_cd, (key_table + (i) * 8)(CTX)); \ \ vmovdqu x0, 4 * 32(mem_cd); \ vmovdqu x1, 5 * 32(mem_cd); \ vmovdqu x2, 6 * 32(mem_cd); \ vmovdqu x3, 7 * 32(mem_cd); \ vmovdqu x4, 0 * 32(mem_cd); \ vmovdqu x5, 1 * 32(mem_cd); \ vmovdqu x6, 2 * 32(mem_cd); \ vmovdqu x7, 3 * 32(mem_cd); \ \ roundsm32(x4, x5, x6, x7, x0, x1, x2, x3, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, (key_table + ((i) + (dir)) * 8)(CTX)); \ \ store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab); #define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */ #define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \ /* Store new AB state */ \ vmovdqu x4, 4 * 32(mem_ab); \ vmovdqu x5, 5 * 32(mem_ab); \ vmovdqu x6, 6 * 32(mem_ab); \ vmovdqu x7, 7 * 32(mem_ab); \ vmovdqu x0, 0 * 32(mem_ab); \ vmovdqu x1, 1 * 32(mem_ab); \ vmovdqu x2, 2 * 32(mem_ab); \ vmovdqu x3, 3 * 32(mem_ab); #define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, 
x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, i) \ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store); #define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, i) \ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store); /* * IN: * v0..3: byte-sliced 32-bit integers * OUT: * v0..3: (IN <<< 1) */ #define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \ vpcmpgtb v0, zero, t0; \ vpaddb v0, v0, v0; \ vpabsb t0, t0; \ \ vpcmpgtb v1, zero, t1; \ vpaddb v1, v1, v1; \ vpabsb t1, t1; \ \ vpcmpgtb v2, zero, t2; \ vpaddb v2, v2, v2; \ vpabsb t2, t2; \ \ vpor t0, v1, v1; \ \ vpcmpgtb v3, zero, t0; \ vpaddb v3, v3, v3; \ vpabsb t0, t0; \ \ vpor t1, v2, v2; \ vpor t2, v3, v3; \ vpor t0, v0, v0; /* * IN: * r: byte-sliced AB state in memory * l: byte-sliced CD state in memory * OUT: * x0..x7: new byte-sliced CD state */ #define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \ tt1, tt2, tt3, kll, klr, krl, krr) \ /* \ * t0 = kll; \ * t0 &= ll; \ * lr ^= rol32(t0, 1); \ */ \ vpbroadcastd kll, t0; /* only lowest 32-bit used */ \ vpxor tt0, tt0, tt0; \ vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t0; \ \ vpand l0, t0, t0; \ vpand l1, t1, t1; \ vpand l2, t2, t2; \ vpand l3, t3, t3; \ \ rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ \ vpxor l4, t0, l4; \ vpbroadcastd krr, t0; /* only lowest 32-bit used */ \ vmovdqu l4, 4 * 32(l); \ vpxor l5, t1, l5; \ vmovdqu l5, 5 * 32(l); \ vpxor l6, t2, l6; \ vmovdqu l6, 6 * 32(l); \ vpxor l7, t3, l7; \ vmovdqu l7, 7 * 32(l); \ \ /* \ * t2 = krr; \ * t2 |= rr; \ * rl ^= t2; \ */ \ \ vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t0; \ \ vpor 4 * 32(r), t0, t0; \ vpor 5 * 32(r), t1, t1; \ vpor 6 * 32(r), t2, t2; \ vpor 7 * 32(r), t3, t3; \ \ vpxor 0 * 32(r), t0, t0; \ vpxor 1 * 32(r), t1, t1; \ vpxor 2 * 32(r), t2, t2; \ vpxor 3 * 32(r), t3, t3; \ vmovdqu t0, 0 * 32(r); \ vpbroadcastd krl, t0; /* only lowest 32-bit used */ \ vmovdqu t1, 1 * 32(r); \ vmovdqu t2, 2 * 32(r); \ vmovdqu t3, 3 * 32(r); \ \ /* \ * t2 = krl; \ * t2 &= rl; \ * rr ^= rol32(t2, 1); \ */ \ vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t0; \ \ vpand 0 * 32(r), t0, t0; \ vpand 1 * 32(r), t1, t1; \ vpand 2 * 32(r), t2, t2; \ vpand 3 * 32(r), t3, t3; \ \ rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ \ vpxor 4 * 32(r), t0, t0; \ vpxor 5 * 32(r), t1, t1; \ vpxor 6 * 32(r), t2, t2; \ vpxor 7 * 32(r), t3, t3; \ vmovdqu t0, 4 * 32(r); \ vpbroadcastd klr, t0; /* only lowest 32-bit used */ \ vmovdqu t1, 5 * 32(r); \ vmovdqu t2, 6 * 32(r); \ vmovdqu t3, 7 * 32(r); \ \ /* \ * t0 = klr; \ * t0 |= 
lr; \ * ll ^= t0; \ */ \ \ vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t0; \ \ vpor l4, t0, t0; \ vpor l5, t1, t1; \ vpor l6, t2, t2; \ vpor l7, t3, t3; \ \ vpxor l0, t0, l0; \ vmovdqu l0, 0 * 32(l); \ vpxor l1, t1, l1; \ vmovdqu l1, 1 * 32(l); \ vpxor l2, t2, l2; \ vmovdqu l2, 2 * 32(l); \ vpxor l3, t3, l3; \ vmovdqu l3, 3 * 32(l); #define transpose_4x4(x0, x1, x2, x3, t1, t2) \ vpunpckhdq x1, x0, t2; \ vpunpckldq x1, x0, x0; \ \ vpunpckldq x3, x2, t1; \ vpunpckhdq x3, x2, x2; \ \ vpunpckhqdq t1, x0, x1; \ vpunpcklqdq t1, x0, x0; \ \ vpunpckhqdq x2, t2, x3; \ vpunpcklqdq x2, t2, x2; #define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \ a3, b3, c3, d3, st0, st1) \ vmovdqu d2, st0; \ vmovdqu d3, st1; \ transpose_4x4(a0, a1, a2, a3, d2, d3); \ transpose_4x4(b0, b1, b2, b3, d2, d3); \ vmovdqu st0, d2; \ vmovdqu st1, d3; \ \ vmovdqu a0, st0; \ vmovdqu a1, st1; \ transpose_4x4(c0, c1, c2, c3, a0, a1); \ transpose_4x4(d0, d1, d2, d3, a0, a1); \ \ - vbroadcasti128 .Lshufb_16x16b RIP, a0; \ + vbroadcasti128 .Lshufb_16x16b rRIP, a0; \ vmovdqu st1, a1; \ vpshufb a0, a2, a2; \ vpshufb a0, a3, a3; \ vpshufb a0, b0, b0; \ vpshufb a0, b1, b1; \ vpshufb a0, b2, b2; \ vpshufb a0, b3, b3; \ vpshufb a0, a1, a1; \ vpshufb a0, c0, c0; \ vpshufb a0, c1, c1; \ vpshufb a0, c2, c2; \ vpshufb a0, c3, c3; \ vpshufb a0, d0, d0; \ vpshufb a0, d1, d1; \ vpshufb a0, d2, d2; \ vpshufb a0, d3, d3; \ vmovdqu d3, st1; \ vmovdqu st0, d3; \ vpshufb a0, d3, a0; \ vmovdqu d2, st0; \ \ transpose_4x4(a0, b0, c0, d0, d2, d3); \ transpose_4x4(a1, b1, c1, d1, d2, d3); \ vmovdqu st0, d2; \ vmovdqu st1, d3; \ \ vmovdqu b0, st0; \ vmovdqu b1, st1; \ transpose_4x4(a2, b2, c2, d2, b0, b1); \ transpose_4x4(a3, b3, c3, d3, b0, b1); \ vmovdqu st0, b0; \ vmovdqu st1, b1; \ /* does not adjust output bytes inside vectors */ /* load blocks to registers and apply pre-whitening */ #define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, rio, key) \ vpbroadcastq key, x0; \ - vpshufb .Lpack_bswap RIP, x0, x0; \ + vpshufb .Lpack_bswap rRIP, x0, x0; \ \ vpxor 0 * 32(rio), x0, y7; \ vpxor 1 * 32(rio), x0, y6; \ vpxor 2 * 32(rio), x0, y5; \ vpxor 3 * 32(rio), x0, y4; \ vpxor 4 * 32(rio), x0, y3; \ vpxor 5 * 32(rio), x0, y2; \ vpxor 6 * 32(rio), x0, y1; \ vpxor 7 * 32(rio), x0, y0; \ vpxor 8 * 32(rio), x0, x7; \ vpxor 9 * 32(rio), x0, x6; \ vpxor 10 * 32(rio), x0, x5; \ vpxor 11 * 32(rio), x0, x4; \ vpxor 12 * 32(rio), x0, x3; \ vpxor 13 * 32(rio), x0, x2; \ vpxor 14 * 32(rio), x0, x1; \ vpxor 15 * 32(rio), x0, x0; /* byteslice pre-whitened blocks and store to temporary memory */ #define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd) \ byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \ y4, y5, y6, y7, (mem_ab), (mem_cd)); \ \ vmovdqu x0, 0 * 32(mem_ab); \ vmovdqu x1, 1 * 32(mem_ab); \ vmovdqu x2, 2 * 32(mem_ab); \ vmovdqu x3, 3 * 32(mem_ab); \ vmovdqu x4, 4 * 32(mem_ab); \ vmovdqu x5, 5 * 32(mem_ab); \ vmovdqu x6, 6 * 32(mem_ab); \ vmovdqu x7, 7 * 32(mem_ab); \ vmovdqu y0, 0 * 32(mem_cd); \ vmovdqu y1, 1 * 32(mem_cd); \ vmovdqu y2, 2 * 32(mem_cd); \ vmovdqu y3, 3 * 32(mem_cd); \ vmovdqu y4, 4 * 32(mem_cd); \ vmovdqu y5, 5 * 32(mem_cd); \ vmovdqu y6, 6 * 32(mem_cd); \ vmovdqu y7, 7 * 32(mem_cd); /* de-byteslice, apply post-whitening and store blocks */ #define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, 
\ y5, y6, y7, key, stack_tmp0, stack_tmp1) \ byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \ y3, y7, x3, x7, stack_tmp0, stack_tmp1); \ \ vmovdqu x0, stack_tmp0; \ \ vpbroadcastq key, x0; \ - vpshufb .Lpack_bswap RIP, x0, x0; \ + vpshufb .Lpack_bswap rRIP, x0, x0; \ \ vpxor x0, y7, y7; \ vpxor x0, y6, y6; \ vpxor x0, y5, y5; \ vpxor x0, y4, y4; \ vpxor x0, y3, y3; \ vpxor x0, y2, y2; \ vpxor x0, y1, y1; \ vpxor x0, y0, y0; \ vpxor x0, x7, x7; \ vpxor x0, x6, x6; \ vpxor x0, x5, x5; \ vpxor x0, x4, x4; \ vpxor x0, x3, x3; \ vpxor x0, x2, x2; \ vpxor x0, x1, x1; \ vpxor stack_tmp0, x0, x0; #define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, rio) \ vmovdqu x0, 0 * 32(rio); \ vmovdqu x1, 1 * 32(rio); \ vmovdqu x2, 2 * 32(rio); \ vmovdqu x3, 3 * 32(rio); \ vmovdqu x4, 4 * 32(rio); \ vmovdqu x5, 5 * 32(rio); \ vmovdqu x6, 6 * 32(rio); \ vmovdqu x7, 7 * 32(rio); \ vmovdqu y0, 8 * 32(rio); \ vmovdqu y1, 9 * 32(rio); \ vmovdqu y2, 10 * 32(rio); \ vmovdqu y3, 11 * 32(rio); \ vmovdqu y4, 12 * 32(rio); \ vmovdqu y5, 13 * 32(rio); \ vmovdqu y6, 14 * 32(rio); \ vmovdqu y7, 15 * 32(rio); .text .align 32 #define SHUFB_BYTES(idx) \ 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) .Lshufb_16x16b: .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) .Lpack_bswap: .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 /* For CTR-mode IV byteswap */ .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 /* * pre-SubByte transform * * pre-lookup for sbox1, sbox2, sbox3: * swap_bitendianness( * isom_map_camellia_to_aes( * camellia_f( * swap_bitendianess(in) * ) * ) * ) * * (note: '⊕ 0xc5' inside camellia_f()) */ .Lpre_tf_lo_s1: .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86 .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88 .Lpre_tf_hi_s1: .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23 /* * pre-SubByte transform * * pre-lookup for sbox4: * swap_bitendianness( * isom_map_camellia_to_aes( * camellia_f( * swap_bitendianess(in <<< 1) * ) * ) * ) * * (note: '⊕ 0xc5' inside camellia_f()) */ .Lpre_tf_lo_s4: .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25 .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74 .Lpre_tf_hi_s4: .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72 .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf /* * post-SubByte transform * * post-lookup for sbox1, sbox4: * swap_bitendianness( * camellia_h( * isom_map_aes_to_camellia( * swap_bitendianness( * aes_inverse_affine_transform(in) * ) * ) * ) * ) * * (note: '⊕ 0x6e' inside camellia_h()) */ .Lpost_tf_lo_s1: .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31 .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1 .Lpost_tf_hi_s1: .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8 .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c /* * post-SubByte transform * * post-lookup for sbox2: * swap_bitendianness( * camellia_h( * isom_map_aes_to_camellia( * swap_bitendianness( * aes_inverse_affine_transform(in) * ) * ) * ) * ) <<< 1 * * (note: '⊕ 0x6e' inside camellia_h()) */ .Lpost_tf_lo_s2: .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62 .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3 .Lpost_tf_hi_s2: .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51 .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18 /* * post-SubByte transform * * post-lookup for sbox3: * 
swap_bitendianness( * camellia_h( * isom_map_aes_to_camellia( * swap_bitendianness( * aes_inverse_affine_transform(in) * ) * ) * ) * ) >>> 1 * * (note: '⊕ 0x6e' inside camellia_h()) */ .Lpost_tf_lo_s3: .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98 .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8 .Lpost_tf_hi_s3: .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54 .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06 /* For isolating SubBytes from AESENCLAST, inverse shift row */ .Linv_shift_row: .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 .align 4 /* 4-bit mask */ .L0f0f0f0f: .long 0x0f0f0f0f .align 8 ELF(.type __camellia_enc_blk32,@function;) __camellia_enc_blk32: /* input: * %rdi: ctx, CTX * %rax: temporary storage, 512 bytes * %ymm0..%ymm15: 32 plaintext blocks * output: * %ymm0..%ymm15: 32 encrypted blocks, order swapped: * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 */ + CFI_STARTPROC(); leaq 8 * 32(%rax), %rcx; inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rax, %rcx); enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rax, %rcx, 0); fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, ((key_table + (8) * 8) + 0)(CTX), ((key_table + (8) * 8) + 4)(CTX), ((key_table + (8) * 8) + 8)(CTX), ((key_table + (8) * 8) + 12)(CTX)); enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rax, %rcx, 8); fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, ((key_table + (16) * 8) + 0)(CTX), ((key_table + (16) * 8) + 4)(CTX), ((key_table + (16) * 8) + 8)(CTX), ((key_table + (16) * 8) + 12)(CTX)); enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rax, %rcx, 16); movl $24, %r8d; cmpl $128, key_bitlength(CTX); jne .Lenc_max32; .Lenc_done: /* load CD for output */ vmovdqu 0 * 32(%rcx), %ymm8; vmovdqu 1 * 32(%rcx), %ymm9; vmovdqu 2 * 32(%rcx), %ymm10; vmovdqu 3 * 32(%rcx), %ymm11; vmovdqu 4 * 32(%rcx), %ymm12; vmovdqu 5 * 32(%rcx), %ymm13; vmovdqu 6 * 32(%rcx), %ymm14; vmovdqu 7 * 32(%rcx), %ymm15; outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax)); ret; .align 8 .Lenc_max32: movl $32, %r8d; fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, ((key_table + (24) * 8) + 0)(CTX), ((key_table + (24) * 8) + 4)(CTX), ((key_table + (24) * 8) + 8)(CTX), ((key_table + (24) * 8) + 12)(CTX)); enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rax, %rcx, 24); jmp .Lenc_done; + CFI_ENDPROC(); ELF(.size __camellia_enc_blk32,.-__camellia_enc_blk32;) .align 8 ELF(.type __camellia_dec_blk32,@function;) __camellia_dec_blk32: /* input: * %rdi: ctx, CTX * %rax: temporary storage, 512 bytes * %r8d: 24 for 16 byte key, 32 for larger * %ymm0..%ymm15: 16 encrypted blocks * output: * %ymm0..%ymm15: 16 plaintext blocks, order swapped: * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 
12, 11, 10, 9, 8 */ + CFI_STARTPROC(); leaq 8 * 32(%rax), %rcx; inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rax, %rcx); cmpl $32, %r8d; je .Ldec_max32; .Ldec_max24: dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rax, %rcx, 16); fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, ((key_table + (16) * 8) + 8)(CTX), ((key_table + (16) * 8) + 12)(CTX), ((key_table + (16) * 8) + 0)(CTX), ((key_table + (16) * 8) + 4)(CTX)); dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rax, %rcx, 8); fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, ((key_table + (8) * 8) + 8)(CTX), ((key_table + (8) * 8) + 12)(CTX), ((key_table + (8) * 8) + 0)(CTX), ((key_table + (8) * 8) + 4)(CTX)); dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rax, %rcx, 0); /* load CD for output */ vmovdqu 0 * 32(%rcx), %ymm8; vmovdqu 1 * 32(%rcx), %ymm9; vmovdqu 2 * 32(%rcx), %ymm10; vmovdqu 3 * 32(%rcx), %ymm11; vmovdqu 4 * 32(%rcx), %ymm12; vmovdqu 5 * 32(%rcx), %ymm13; vmovdqu 6 * 32(%rcx), %ymm14; vmovdqu 7 * 32(%rcx), %ymm15; outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax)); ret; .align 8 .Ldec_max32: dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rax, %rcx, 24); fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, ((key_table + (24) * 8) + 8)(CTX), ((key_table + (24) * 8) + 12)(CTX), ((key_table + (24) * 8) + 0)(CTX), ((key_table + (24) * 8) + 4)(CTX)); jmp .Ldec_max24; + CFI_ENDPROC(); ELF(.size __camellia_dec_blk32,.-__camellia_dec_blk32;) #define inc_le128(x, minus_one, tmp) \ vpcmpeqq minus_one, x, tmp; \ vpsubq minus_one, x, x; \ vpslldq $8, tmp, tmp; \ vpsubq tmp, x, x; .align 8 .globl _gcry_camellia_aesni_avx2_ctr_enc ELF(.type _gcry_camellia_aesni_avx2_ctr_enc,@function;) _gcry_camellia_aesni_avx2_ctr_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) * %rdx: src (32 blocks) * %rcx: iv (big endian, 128bit) */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); movq 8(%rcx), %r11; bswapq %r11; vzeroupper; subq $(16 * 32), %rsp; andq $~63, %rsp; movq %rsp, %rax; vpcmpeqd %ymm15, %ymm15, %ymm15; vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */ /* load IV and byteswap */ vmovdqu (%rcx), %xmm0; - vpshufb .Lbswap128_mask RIP, %xmm0, %xmm0; + vpshufb .Lbswap128_mask rRIP, %xmm0, %xmm0; vmovdqa %xmm0, %xmm1; inc_le128(%xmm0, %xmm15, %xmm14); - vbroadcasti128 .Lbswap128_mask RIP, %ymm14; + vbroadcasti128 .Lbswap128_mask rRIP, %ymm14; vinserti128 $1, %xmm0, %ymm1, %ymm0; vpshufb %ymm14, %ymm0, %ymm13; vmovdqu %ymm13, 15 * 32(%rax); /* check need for handling 64-bit overflow and carry */ cmpq $(0xffffffffffffffff - 32), %r11; ja .Lload_ctr_carry; /* construct IVs */ vpaddq %ymm15, %ymm15, %ymm15; /* ab: -2:0 ; cd: -2:0 */ vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm13; vmovdqu 
%ymm13, 14 * 32(%rax); vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm13; vmovdqu %ymm13, 13 * 32(%rax); vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm12; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm11; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm10; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm9; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm8; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm7; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm6; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm5; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm4; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm3; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm2; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm1; vpsubq %ymm15, %ymm0, %ymm0; /* +30 ; +31 */ vpsubq %xmm15, %xmm0, %xmm13; /* +32 */ vpshufb %ymm14, %ymm0, %ymm0; vpshufb %xmm14, %xmm13, %xmm13; vmovdqu %xmm13, (%rcx); jmp .Lload_ctr_done; .align 4 .Lload_ctr_carry: /* construct IVs */ inc_le128(%ymm0, %ymm15, %ymm13); /* ab: le1 ; cd: le2 */ inc_le128(%ymm0, %ymm15, %ymm13); /* ab: le2 ; cd: le3 */ vpshufb %ymm14, %ymm0, %ymm13; vmovdqu %ymm13, 14 * 32(%rax); inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm13; vmovdqu %ymm13, 13 * 32(%rax); inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm12; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm11; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm10; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm9; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm8; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm7; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm6; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm5; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm4; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm3; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm2; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm1; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vextracti128 $1, %ymm0, %xmm13; vpshufb %ymm14, %ymm0, %ymm0; inc_le128(%xmm13, %xmm15, %xmm14); - vpshufb .Lbswap128_mask RIP, %xmm13, %xmm13; + vpshufb .Lbswap128_mask rRIP, %xmm13, %xmm13; vmovdqu %xmm13, (%rcx); .align 4 .Lload_ctr_done: /* inpack16_pre: */ vpbroadcastq (key_table)(CTX), %ymm15; - vpshufb .Lpack_bswap RIP, %ymm15, %ymm15; + vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; vpxor %ymm0, %ymm15, %ymm0; vpxor %ymm1, %ymm15, %ymm1; vpxor %ymm2, %ymm15, %ymm2; vpxor %ymm3, %ymm15, %ymm3; vpxor %ymm4, %ymm15, %ymm4; vpxor %ymm5, %ymm15, %ymm5; vpxor %ymm6, %ymm15, %ymm6; vpxor %ymm7, %ymm15, %ymm7; vpxor %ymm8, %ymm15, %ymm8; vpxor %ymm9, %ymm15, %ymm9; vpxor %ymm10, %ymm15, %ymm10; vpxor %ymm11, %ymm15, %ymm11; vpxor %ymm12, %ymm15, %ymm12; vpxor 13 * 32(%rax), %ymm15, %ymm13; vpxor 14 * 32(%rax), %ymm15, %ymm14; vpxor 15 * 32(%rax), %ymm15, %ymm15; call __camellia_enc_blk32; vpxor 0 * 32(%rdx), %ymm7, %ymm7; 
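
The counter-vector construction above is the least obvious part of the CTR path: the code only falls back to full 128-bit increments (the .Lload_ctr_carry path with inc_le128) when adding 32 to the low 64 bits of the IV could wrap, and otherwise uses plain 64-bit adds. Below is a rough scalar C sketch of the equivalent logic; it is illustrative only, the helper names (be64_load, be64_store, ctr_build32) are invented, and the real code keeps all 32 counters byte-sliced in %ymm registers while updating the stored IV the same way (old IV + 32).

    #include <stdint.h>

    /* Load/store a 64-bit value in big-endian byte order. */
    static uint64_t be64_load(const uint8_t *p)
    {
      uint64_t v = 0;
      for (int i = 0; i < 8; i++)
        v = (v << 8) | p[i];
      return v;
    }

    static void be64_store(uint8_t *p, uint64_t v)
    {
      for (int i = 7; i >= 0; i--)
        {
          p[i] = (uint8_t)v;
          v >>= 8;
        }
    }

    /* Produce 32 consecutive big-endian 128-bit counter blocks from iv
     * (iv, iv+1, ..., iv+31) and write back iv+32, as the assembly does.
     * The assembly additionally takes a fast path with no carry handling
     * when lo <= 0xffffffffffffffff - 32. */
    static void ctr_build32(uint8_t ctrs[32][16], uint8_t iv[16])
    {
      uint64_t hi = be64_load(iv);      /* high 64 bits of counter */
      uint64_t lo = be64_load(iv + 8);  /* low 64 bits of counter  */

      for (int i = 0; i < 32; i++)
        {
          be64_store(ctrs[i], hi);
          be64_store(ctrs[i] + 8, lo);
          if (++lo == 0)   /* 128-bit increment with carry */
            hi++;
        }
      be64_store(iv, hi);
      be64_store(iv + 8, lo);
    }
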
vpxor 1 * 32(%rdx), %ymm6, %ymm6; vpxor 2 * 32(%rdx), %ymm5, %ymm5; vpxor 3 * 32(%rdx), %ymm4, %ymm4; vpxor 4 * 32(%rdx), %ymm3, %ymm3; vpxor 5 * 32(%rdx), %ymm2, %ymm2; vpxor 6 * 32(%rdx), %ymm1, %ymm1; vpxor 7 * 32(%rdx), %ymm0, %ymm0; vpxor 8 * 32(%rdx), %ymm15, %ymm15; vpxor 9 * 32(%rdx), %ymm14, %ymm14; vpxor 10 * 32(%rdx), %ymm13, %ymm13; vpxor 11 * 32(%rdx), %ymm12, %ymm12; vpxor 12 * 32(%rdx), %ymm11, %ymm11; vpxor 13 * 32(%rdx), %ymm10, %ymm10; vpxor 14 * 32(%rdx), %ymm9, %ymm9; vpxor 15 * 32(%rdx), %ymm8, %ymm8; leaq 32 * 16(%rdx), %rdx; write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, %ymm8, %rsi); vzeroall; leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx2_ctr_enc,.-_gcry_camellia_aesni_avx2_ctr_enc;) .align 8 .globl _gcry_camellia_aesni_avx2_cbc_dec ELF(.type _gcry_camellia_aesni_avx2_cbc_dec,@function;) _gcry_camellia_aesni_avx2_cbc_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) * %rdx: src (32 blocks) * %rcx: iv */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; movq %rcx, %r9; cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %eax; cmovel %eax, %r8d; /* max */ subq $(16 * 32), %rsp; andq $~63, %rsp; movq %rsp, %rax; inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rdx, (key_table)(CTX, %r8, 8)); call __camellia_dec_blk32; /* XOR output with IV */ vmovdqu %ymm8, (%rax); vmovdqu (%r9), %xmm8; vinserti128 $1, (%rdx), %ymm8, %ymm8; vpxor %ymm8, %ymm7, %ymm7; vmovdqu (%rax), %ymm8; vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6; vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5; vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4; vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3; vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2; vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1; vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0; vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15; vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14; vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13; vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12; vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11; vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10; vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9; vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8; movq (15 * 32 + 16 + 0)(%rdx), %rax; movq (15 * 32 + 16 + 8)(%rdx), %rcx; write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, %ymm8, %rsi); /* store new IV */ movq %rax, (0)(%r9); movq %rcx, (8)(%r9); vzeroall; leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx2_cbc_dec,.-_gcry_camellia_aesni_avx2_cbc_dec;) .align 8 .globl _gcry_camellia_aesni_avx2_cfb_dec ELF(.type _gcry_camellia_aesni_avx2_cfb_dec,@function;) _gcry_camellia_aesni_avx2_cfb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) * %rdx: src (32 blocks) * %rcx: iv */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; subq $(16 * 32), %rsp; andq $~63, %rsp; movq %rsp, %rax; /* inpack16_pre: */ vpbroadcastq (key_table)(CTX), %ymm0; - vpshufb .Lpack_bswap RIP, %ymm0, %ymm0; + vpshufb .Lpack_bswap rRIP, %ymm0, %ymm0; vmovdqu (%rcx), %xmm15; vinserti128 $1, (%rdx), %ymm15, %ymm15; vpxor %ymm15, %ymm0, %ymm15; vmovdqu (15 * 32 + 16)(%rdx), %xmm1; vmovdqu %xmm1, (%rcx); /* store new IV */ vpxor (0 * 32 + 16)(%rdx), %ymm0, %ymm14; vpxor (1 * 32 + 16)(%rdx), %ymm0, %ymm13; vpxor (2 * 32 + 16)(%rdx), %ymm0, 
%ymm12; vpxor (3 * 32 + 16)(%rdx), %ymm0, %ymm11; vpxor (4 * 32 + 16)(%rdx), %ymm0, %ymm10; vpxor (5 * 32 + 16)(%rdx), %ymm0, %ymm9; vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm8; vpxor (7 * 32 + 16)(%rdx), %ymm0, %ymm7; vpxor (8 * 32 + 16)(%rdx), %ymm0, %ymm6; vpxor (9 * 32 + 16)(%rdx), %ymm0, %ymm5; vpxor (10 * 32 + 16)(%rdx), %ymm0, %ymm4; vpxor (11 * 32 + 16)(%rdx), %ymm0, %ymm3; vpxor (12 * 32 + 16)(%rdx), %ymm0, %ymm2; vpxor (13 * 32 + 16)(%rdx), %ymm0, %ymm1; vpxor (14 * 32 + 16)(%rdx), %ymm0, %ymm0; call __camellia_enc_blk32; vpxor 0 * 32(%rdx), %ymm7, %ymm7; vpxor 1 * 32(%rdx), %ymm6, %ymm6; vpxor 2 * 32(%rdx), %ymm5, %ymm5; vpxor 3 * 32(%rdx), %ymm4, %ymm4; vpxor 4 * 32(%rdx), %ymm3, %ymm3; vpxor 5 * 32(%rdx), %ymm2, %ymm2; vpxor 6 * 32(%rdx), %ymm1, %ymm1; vpxor 7 * 32(%rdx), %ymm0, %ymm0; vpxor 8 * 32(%rdx), %ymm15, %ymm15; vpxor 9 * 32(%rdx), %ymm14, %ymm14; vpxor 10 * 32(%rdx), %ymm13, %ymm13; vpxor 11 * 32(%rdx), %ymm12, %ymm12; vpxor 12 * 32(%rdx), %ymm11, %ymm11; vpxor 13 * 32(%rdx), %ymm10, %ymm10; vpxor 14 * 32(%rdx), %ymm9, %ymm9; vpxor 15 * 32(%rdx), %ymm8, %ymm8; write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, %ymm8, %rsi); vzeroall; leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx2_cfb_dec,.-_gcry_camellia_aesni_avx2_cfb_dec;) .align 8 .globl _gcry_camellia_aesni_avx2_ocb_enc ELF(.type _gcry_camellia_aesni_avx2_ocb_enc,@function;) _gcry_camellia_aesni_avx2_ocb_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) * %rdx: src (32 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[32]) */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; subq $(16 * 32 + 4 * 8), %rsp; andq $~63, %rsp; movq %rsp, %rax; - movq %r10, (16 * 32 + 0 * 8)(%rax); - movq %r11, (16 * 32 + 1 * 8)(%rax); - movq %r12, (16 * 32 + 2 * 8)(%rax); - movq %r13, (16 * 32 + 3 * 8)(%rax); + movq %r10, (16 * 32 + 0 * 8)(%rsp); + movq %r11, (16 * 32 + 1 * 8)(%rsp); + movq %r12, (16 * 32 + 2 * 8)(%rsp); + movq %r13, (16 * 32 + 3 * 8)(%rsp); + CFI_REG_ON_STACK(r10, 16 * 32 + 0 * 8); + CFI_REG_ON_STACK(r11, 16 * 32 + 1 * 8); + CFI_REG_ON_STACK(r12, 16 * 32 + 2 * 8); + CFI_REG_ON_STACK(r13, 16 * 32 + 3 * 8); vmovdqu (%rcx), %xmm14; vmovdqu (%r8), %xmm13; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Checksum_i = Checksum_{i-1} xor P_i */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ #define OCB_INPUT(n, l0reg, l1reg, yreg) \ vmovdqu (n * 32)(%rdx), yreg; \ vpxor (l0reg), %xmm14, %xmm15; \ vpxor (l1reg), %xmm15, %xmm14; \ vinserti128 $1, %xmm14, %ymm15, %ymm15; \ vpxor yreg, %ymm13, %ymm13; \ vpxor yreg, %ymm15, yreg; \ vmovdqu %ymm15, (n * 32)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq (2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, %r11, %ymm0); vmovdqu %ymm0, (15 * 32)(%rax); OCB_INPUT(1, %r12, %r13, %ymm0); vmovdqu %ymm0, (14 * 32)(%rax); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(2, %r10, %r11, %ymm0); vmovdqu %ymm0, (13 * 32)(%rax); OCB_INPUT(3, %r12, %r13, %ymm12); movq (8 * 8)(%r9), %r10; movq (9 * 8)(%r9), %r11; movq (10 * 8)(%r9), %r12; movq (11 * 8)(%r9), %r13; OCB_INPUT(4, %r10, %r11, %ymm11); OCB_INPUT(5, %r12, %r13, %ymm10); movq (12 * 8)(%r9), %r10; movq (13 * 8)(%r9), %r11; movq (14 * 8)(%r9), %r12; movq (15 * 8)(%r9), %r13; OCB_INPUT(6, %r10, %r11, %ymm9); OCB_INPUT(7, %r12, %r13, %ymm8); movq (16 * 8)(%r9), 
%r10; movq (17 * 8)(%r9), %r11; movq (18 * 8)(%r9), %r12; movq (19 * 8)(%r9), %r13; OCB_INPUT(8, %r10, %r11, %ymm7); OCB_INPUT(9, %r12, %r13, %ymm6); movq (20 * 8)(%r9), %r10; movq (21 * 8)(%r9), %r11; movq (22 * 8)(%r9), %r12; movq (23 * 8)(%r9), %r13; OCB_INPUT(10, %r10, %r11, %ymm5); OCB_INPUT(11, %r12, %r13, %ymm4); movq (24 * 8)(%r9), %r10; movq (25 * 8)(%r9), %r11; movq (26 * 8)(%r9), %r12; movq (27 * 8)(%r9), %r13; OCB_INPUT(12, %r10, %r11, %ymm3); OCB_INPUT(13, %r12, %r13, %ymm2); movq (28 * 8)(%r9), %r10; movq (29 * 8)(%r9), %r11; movq (30 * 8)(%r9), %r12; movq (31 * 8)(%r9), %r13; OCB_INPUT(14, %r10, %r11, %ymm1); OCB_INPUT(15, %r12, %r13, %ymm0); #undef OCB_INPUT vextracti128 $1, %ymm13, %xmm15; vmovdqu %xmm14, (%rcx); vpxor %xmm13, %xmm15, %xmm15; vmovdqu %xmm15, (%r8); /* inpack16_pre: */ vpbroadcastq (key_table)(CTX), %ymm15; - vpshufb .Lpack_bswap RIP, %ymm15, %ymm15; + vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; vpxor %ymm0, %ymm15, %ymm0; vpxor %ymm1, %ymm15, %ymm1; vpxor %ymm2, %ymm15, %ymm2; vpxor %ymm3, %ymm15, %ymm3; vpxor %ymm4, %ymm15, %ymm4; vpxor %ymm5, %ymm15, %ymm5; vpxor %ymm6, %ymm15, %ymm6; vpxor %ymm7, %ymm15, %ymm7; vpxor %ymm8, %ymm15, %ymm8; vpxor %ymm9, %ymm15, %ymm9; vpxor %ymm10, %ymm15, %ymm10; vpxor %ymm11, %ymm15, %ymm11; vpxor %ymm12, %ymm15, %ymm12; vpxor 13 * 32(%rax), %ymm15, %ymm13; vpxor 14 * 32(%rax), %ymm15, %ymm14; vpxor 15 * 32(%rax), %ymm15, %ymm15; call __camellia_enc_blk32; vpxor 0 * 32(%rsi), %ymm7, %ymm7; vpxor 1 * 32(%rsi), %ymm6, %ymm6; vpxor 2 * 32(%rsi), %ymm5, %ymm5; vpxor 3 * 32(%rsi), %ymm4, %ymm4; vpxor 4 * 32(%rsi), %ymm3, %ymm3; vpxor 5 * 32(%rsi), %ymm2, %ymm2; vpxor 6 * 32(%rsi), %ymm1, %ymm1; vpxor 7 * 32(%rsi), %ymm0, %ymm0; vpxor 8 * 32(%rsi), %ymm15, %ymm15; vpxor 9 * 32(%rsi), %ymm14, %ymm14; vpxor 10 * 32(%rsi), %ymm13, %ymm13; vpxor 11 * 32(%rsi), %ymm12, %ymm12; vpxor 12 * 32(%rsi), %ymm11, %ymm11; vpxor 13 * 32(%rsi), %ymm10, %ymm10; vpxor 14 * 32(%rsi), %ymm9, %ymm9; vpxor 15 * 32(%rsi), %ymm8, %ymm8; write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, %ymm8, %rsi); vzeroall; - movq (16 * 32 + 0 * 8)(%rax), %r10; - movq (16 * 32 + 1 * 8)(%rax), %r11; - movq (16 * 32 + 2 * 8)(%rax), %r12; - movq (16 * 32 + 3 * 8)(%rax), %r13; + movq (16 * 32 + 0 * 8)(%rsp), %r10; + movq (16 * 32 + 1 * 8)(%rsp), %r11; + movq (16 * 32 + 2 * 8)(%rsp), %r12; + movq (16 * 32 + 3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx2_ocb_enc,.-_gcry_camellia_aesni_avx2_ocb_enc;) .align 8 .globl _gcry_camellia_aesni_avx2_ocb_dec ELF(.type _gcry_camellia_aesni_avx2_ocb_dec,@function;) _gcry_camellia_aesni_avx2_ocb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) * %rdx: src (32 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[32]) */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; subq $(16 * 32 + 4 * 8), %rsp; andq $~63, %rsp; movq %rsp, %rax; - movq %r10, (16 * 32 + 0 * 8)(%rax); - movq %r11, (16 * 32 + 1 * 8)(%rax); - movq %r12, (16 * 32 + 2 * 8)(%rax); - movq %r13, (16 * 32 + 3 * 8)(%rax); + movq %r10, (16 * 32 + 0 * 8)(%rsp); + movq %r11, (16 * 32 + 1 * 8)(%rsp); + movq %r12, (16 * 32 + 2 * 8)(%rsp); + movq %r13, (16 * 32 + 3 * 8)(%rsp); + CFI_REG_ON_STACK(r10, 16 * 32 + 0 * 8); + CFI_REG_ON_STACK(r11, 16 * 32 + 1 * 8); + CFI_REG_ON_STACK(r12, 16 * 32 + 2 
* 8); + CFI_REG_ON_STACK(r13, 16 * 32 + 3 * 8); vmovdqu (%rcx), %xmm14; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ #define OCB_INPUT(n, l0reg, l1reg, yreg) \ vmovdqu (n * 32)(%rdx), yreg; \ vpxor (l0reg), %xmm14, %xmm15; \ vpxor (l1reg), %xmm15, %xmm14; \ vinserti128 $1, %xmm14, %ymm15, %ymm15; \ vpxor yreg, %ymm15, yreg; \ vmovdqu %ymm15, (n * 32)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq (2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, %r11, %ymm0); vmovdqu %ymm0, (15 * 32)(%rax); OCB_INPUT(1, %r12, %r13, %ymm0); vmovdqu %ymm0, (14 * 32)(%rax); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(2, %r10, %r11, %ymm13); OCB_INPUT(3, %r12, %r13, %ymm12); movq (8 * 8)(%r9), %r10; movq (9 * 8)(%r9), %r11; movq (10 * 8)(%r9), %r12; movq (11 * 8)(%r9), %r13; OCB_INPUT(4, %r10, %r11, %ymm11); OCB_INPUT(5, %r12, %r13, %ymm10); movq (12 * 8)(%r9), %r10; movq (13 * 8)(%r9), %r11; movq (14 * 8)(%r9), %r12; movq (15 * 8)(%r9), %r13; OCB_INPUT(6, %r10, %r11, %ymm9); OCB_INPUT(7, %r12, %r13, %ymm8); movq (16 * 8)(%r9), %r10; movq (17 * 8)(%r9), %r11; movq (18 * 8)(%r9), %r12; movq (19 * 8)(%r9), %r13; OCB_INPUT(8, %r10, %r11, %ymm7); OCB_INPUT(9, %r12, %r13, %ymm6); movq (20 * 8)(%r9), %r10; movq (21 * 8)(%r9), %r11; movq (22 * 8)(%r9), %r12; movq (23 * 8)(%r9), %r13; OCB_INPUT(10, %r10, %r11, %ymm5); OCB_INPUT(11, %r12, %r13, %ymm4); movq (24 * 8)(%r9), %r10; movq (25 * 8)(%r9), %r11; movq (26 * 8)(%r9), %r12; movq (27 * 8)(%r9), %r13; OCB_INPUT(12, %r10, %r11, %ymm3); OCB_INPUT(13, %r12, %r13, %ymm2); movq (28 * 8)(%r9), %r10; movq (29 * 8)(%r9), %r11; movq (30 * 8)(%r9), %r12; movq (31 * 8)(%r9), %r13; OCB_INPUT(14, %r10, %r11, %ymm1); OCB_INPUT(15, %r12, %r13, %ymm0); #undef OCB_INPUT vmovdqu %xmm14, (%rcx); movq %r8, %r10; cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %r9d; cmovel %r9d, %r8d; /* max */ /* inpack16_pre: */ vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15; - vpshufb .Lpack_bswap RIP, %ymm15, %ymm15; + vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; vpxor %ymm0, %ymm15, %ymm0; vpxor %ymm1, %ymm15, %ymm1; vpxor %ymm2, %ymm15, %ymm2; vpxor %ymm3, %ymm15, %ymm3; vpxor %ymm4, %ymm15, %ymm4; vpxor %ymm5, %ymm15, %ymm5; vpxor %ymm6, %ymm15, %ymm6; vpxor %ymm7, %ymm15, %ymm7; vpxor %ymm8, %ymm15, %ymm8; vpxor %ymm9, %ymm15, %ymm9; vpxor %ymm10, %ymm15, %ymm10; vpxor %ymm11, %ymm15, %ymm11; vpxor %ymm12, %ymm15, %ymm12; vpxor %ymm13, %ymm15, %ymm13; vpxor 14 * 32(%rax), %ymm15, %ymm14; vpxor 15 * 32(%rax), %ymm15, %ymm15; call __camellia_dec_blk32; vpxor 0 * 32(%rsi), %ymm7, %ymm7; vpxor 1 * 32(%rsi), %ymm6, %ymm6; vpxor 2 * 32(%rsi), %ymm5, %ymm5; vpxor 3 * 32(%rsi), %ymm4, %ymm4; vpxor 4 * 32(%rsi), %ymm3, %ymm3; vpxor 5 * 32(%rsi), %ymm2, %ymm2; vpxor 6 * 32(%rsi), %ymm1, %ymm1; vpxor 7 * 32(%rsi), %ymm0, %ymm0; vmovdqu %ymm7, (7 * 32)(%rax); vmovdqu %ymm6, (6 * 32)(%rax); vpxor 8 * 32(%rsi), %ymm15, %ymm15; vpxor 9 * 32(%rsi), %ymm14, %ymm14; vpxor 10 * 32(%rsi), %ymm13, %ymm13; vpxor 11 * 32(%rsi), %ymm12, %ymm12; vpxor 12 * 32(%rsi), %ymm11, %ymm11; vpxor 13 * 32(%rsi), %ymm10, %ymm10; vpxor 14 * 32(%rsi), %ymm9, %ymm9; vpxor 15 * 32(%rsi), %ymm8, %ymm8; /* Checksum_i = Checksum_{i-1} xor P_i */ vpxor %ymm5, %ymm7, %ymm7; vpxor %ymm4, %ymm6, %ymm6; vpxor %ymm3, %ymm7, %ymm7; vpxor %ymm2, %ymm6, %ymm6; vpxor %ymm1, %ymm7, %ymm7; vpxor %ymm0, %ymm6, %ymm6; vpxor %ymm15, %ymm7, %ymm7; vpxor %ymm14, %ymm6, %ymm6; vpxor %ymm13, %ymm7, 
%ymm7; vpxor %ymm12, %ymm6, %ymm6; vpxor %ymm11, %ymm7, %ymm7; vpxor %ymm10, %ymm6, %ymm6; vpxor %ymm9, %ymm7, %ymm7; vpxor %ymm8, %ymm6, %ymm6; vpxor %ymm7, %ymm6, %ymm7; vextracti128 $1, %ymm7, %xmm6; vpxor %xmm6, %xmm7, %xmm7; vpxor (%r10), %xmm7, %xmm7; vmovdqu %xmm7, (%r10); vmovdqu 7 * 32(%rax), %ymm7; vmovdqu 6 * 32(%rax), %ymm6; write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, %ymm8, %rsi); vzeroall; - movq (16 * 32 + 0 * 8)(%rax), %r10; - movq (16 * 32 + 1 * 8)(%rax), %r11; - movq (16 * 32 + 2 * 8)(%rax), %r12; - movq (16 * 32 + 3 * 8)(%rax), %r13; + movq (16 * 32 + 0 * 8)(%rsp), %r10; + movq (16 * 32 + 1 * 8)(%rsp), %r11; + movq (16 * 32 + 2 * 8)(%rsp), %r12; + movq (16 * 32 + 3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx2_ocb_dec,.-_gcry_camellia_aesni_avx2_ocb_dec;) .align 8 .globl _gcry_camellia_aesni_avx2_ocb_auth ELF(.type _gcry_camellia_aesni_avx2_ocb_auth,@function;) _gcry_camellia_aesni_avx2_ocb_auth: /* input: * %rdi: ctx, CTX * %rsi: abuf (16 blocks) * %rdx: offset * %rcx: checksum * %r8 : L pointers (void *L[16]) */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; subq $(16 * 32 + 4 * 8), %rsp; andq $~63, %rsp; movq %rsp, %rax; - movq %r10, (16 * 32 + 0 * 8)(%rax); - movq %r11, (16 * 32 + 1 * 8)(%rax); - movq %r12, (16 * 32 + 2 * 8)(%rax); - movq %r13, (16 * 32 + 3 * 8)(%rax); + movq %r10, (16 * 32 + 0 * 8)(%rsp); + movq %r11, (16 * 32 + 1 * 8)(%rsp); + movq %r12, (16 * 32 + 2 * 8)(%rsp); + movq %r13, (16 * 32 + 3 * 8)(%rsp); + CFI_REG_ON_STACK(r10, 16 * 32 + 0 * 8); + CFI_REG_ON_STACK(r11, 16 * 32 + 1 * 8); + CFI_REG_ON_STACK(r12, 16 * 32 + 2 * 8); + CFI_REG_ON_STACK(r13, 16 * 32 + 3 * 8); vmovdqu (%rdx), %xmm14; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Checksum_i = Checksum_{i-1} xor P_i */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ #define OCB_INPUT(n, l0reg, l1reg, yreg) \ vmovdqu (n * 32)(%rsi), yreg; \ vpxor (l0reg), %xmm14, %xmm15; \ vpxor (l1reg), %xmm15, %xmm14; \ vinserti128 $1, %xmm14, %ymm15, %ymm15; \ vpxor yreg, %ymm15, yreg; movq (0 * 8)(%r8), %r10; movq (1 * 8)(%r8), %r11; movq (2 * 8)(%r8), %r12; movq (3 * 8)(%r8), %r13; OCB_INPUT(0, %r10, %r11, %ymm0); vmovdqu %ymm0, (15 * 32)(%rax); OCB_INPUT(1, %r12, %r13, %ymm0); vmovdqu %ymm0, (14 * 32)(%rax); movq (4 * 8)(%r8), %r10; movq (5 * 8)(%r8), %r11; movq (6 * 8)(%r8), %r12; movq (7 * 8)(%r8), %r13; OCB_INPUT(2, %r10, %r11, %ymm13); OCB_INPUT(3, %r12, %r13, %ymm12); movq (8 * 8)(%r8), %r10; movq (9 * 8)(%r8), %r11; movq (10 * 8)(%r8), %r12; movq (11 * 8)(%r8), %r13; OCB_INPUT(4, %r10, %r11, %ymm11); OCB_INPUT(5, %r12, %r13, %ymm10); movq (12 * 8)(%r8), %r10; movq (13 * 8)(%r8), %r11; movq (14 * 8)(%r8), %r12; movq (15 * 8)(%r8), %r13; OCB_INPUT(6, %r10, %r11, %ymm9); OCB_INPUT(7, %r12, %r13, %ymm8); movq (16 * 8)(%r8), %r10; movq (17 * 8)(%r8), %r11; movq (18 * 8)(%r8), %r12; movq (19 * 8)(%r8), %r13; OCB_INPUT(8, %r10, %r11, %ymm7); OCB_INPUT(9, %r12, %r13, %ymm6); movq (20 * 8)(%r8), %r10; movq (21 * 8)(%r8), %r11; movq (22 * 8)(%r8), %r12; movq (23 * 8)(%r8), %r13; OCB_INPUT(10, %r10, %r11, %ymm5); OCB_INPUT(11, %r12, %r13, %ymm4); movq (24 * 8)(%r8), %r10; movq (25 * 8)(%r8), %r11; movq (26 * 8)(%r8), %r12; movq (27 * 8)(%r8), %r13; OCB_INPUT(12, %r10, %r11, %ymm3); OCB_INPUT(13, %r12, %r13, %ymm2); movq (28 * 
8)(%r8), %r10; movq (29 * 8)(%r8), %r11; movq (30 * 8)(%r8), %r12; movq (31 * 8)(%r8), %r13; OCB_INPUT(14, %r10, %r11, %ymm1); OCB_INPUT(15, %r12, %r13, %ymm0); #undef OCB_INPUT vmovdqu %xmm14, (%rdx); movq %rcx, %r10; /* inpack16_pre: */ vpbroadcastq (key_table)(CTX), %ymm15; - vpshufb .Lpack_bswap RIP, %ymm15, %ymm15; + vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; vpxor %ymm0, %ymm15, %ymm0; vpxor %ymm1, %ymm15, %ymm1; vpxor %ymm2, %ymm15, %ymm2; vpxor %ymm3, %ymm15, %ymm3; vpxor %ymm4, %ymm15, %ymm4; vpxor %ymm5, %ymm15, %ymm5; vpxor %ymm6, %ymm15, %ymm6; vpxor %ymm7, %ymm15, %ymm7; vpxor %ymm8, %ymm15, %ymm8; vpxor %ymm9, %ymm15, %ymm9; vpxor %ymm10, %ymm15, %ymm10; vpxor %ymm11, %ymm15, %ymm11; vpxor %ymm12, %ymm15, %ymm12; vpxor %ymm13, %ymm15, %ymm13; vpxor 14 * 32(%rax), %ymm15, %ymm14; vpxor 15 * 32(%rax), %ymm15, %ymm15; call __camellia_enc_blk32; vpxor %ymm7, %ymm6, %ymm6; vpxor %ymm5, %ymm4, %ymm4; vpxor %ymm3, %ymm2, %ymm2; vpxor %ymm1, %ymm0, %ymm0; vpxor %ymm15, %ymm14, %ymm14; vpxor %ymm13, %ymm12, %ymm12; vpxor %ymm11, %ymm10, %ymm10; vpxor %ymm9, %ymm8, %ymm8; vpxor %ymm6, %ymm4, %ymm4; vpxor %ymm2, %ymm0, %ymm0; vpxor %ymm14, %ymm12, %ymm12; vpxor %ymm10, %ymm8, %ymm8; vpxor %ymm4, %ymm0, %ymm0; vpxor %ymm12, %ymm8, %ymm8; vpxor %ymm0, %ymm8, %ymm0; vextracti128 $1, %ymm0, %xmm1; vpxor (%r10), %xmm0, %xmm0; vpxor %xmm0, %xmm1, %xmm0; vmovdqu %xmm0, (%r10); vzeroall; - movq (16 * 32 + 0 * 8)(%rax), %r10; - movq (16 * 32 + 1 * 8)(%rax), %r11; - movq (16 * 32 + 2 * 8)(%rax), %r12; - movq (16 * 32 + 3 * 8)(%rax), %r13; + movq (16 * 32 + 0 * 8)(%rsp), %r10; + movq (16 * 32 + 1 * 8)(%rsp), %r11; + movq (16 * 32 + 2 * 8)(%rsp), %r12; + movq (16 * 32 + 3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx2_ocb_auth,.-_gcry_camellia_aesni_avx2_ocb_auth;) #endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)*/ #endif /*__x86_64*/ diff --git a/cipher/cast5-amd64.S b/cipher/cast5-amd64.S index 1a1d43fd..82f67890 100644 --- a/cipher/cast5-amd64.S +++ b/cipher/cast5-amd64.S @@ -1,599 +1,663 @@ /* cast5-amd64.S - AMD64 assembly implementation of CAST5 cipher * * Copyright (C) 2013 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
*/ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_CAST5) #include "asm-common-amd64.h" .text .extern _gcry_cast5_s1to4; #define s1 0 #define s2 (s1 + (4 * 256)) #define s3 (s2 + (4 * 256)) #define s4 (s3 + (4 * 256)) /* structure of CAST5_context: */ #define Km 0 #define Kr (Km + (16 * 4)) /* register macros */ #define CTX %rdi #define RIO %rsi #define RTAB %r8 #define RLR0 %r9 #define RLR1 %r10 #define RLR2 %r11 #define RLR3 %r12 #define RLR0d %r9d #define RLR1d %r10d #define RLR2d %r11d #define RLR3d %r12d #define RX0 %rax #define RX1 %rbx #define RX2 %rdx #define RX0d %eax #define RX1d %ebx #define RX2d %edx #define RX0bl %al #define RX1bl %bl #define RX2bl %dl #define RX0bh %ah #define RX1bh %bh #define RX2bh %dh #define RKR %rcx #define RKRd %ecx #define RKRbl %cl #define RT0 %rbp #define RT1 %rsi #define RT0d %ebp #define RT1d %esi #define RKM0d %r13d #define RKM1d %r14d /*********************************************************************** * 1-way cast5 ***********************************************************************/ #define dummy(x) #define shr_kr(none) \ shrq $8, RKR; #define F(km, load_next_kr, op0, op1, op2, op3) \ op0 ## l RLR0d, km ## d; \ roll RKRbl, km ## d; \ rorq $32, RLR0; \ movzbl km ## bh, RT0d; \ movzbl km ## bl, RT1d; \ roll $16, km ## d; \ movl s1(RTAB,RT0,4), RT0d; \ op1 ## l s2(RTAB,RT1,4), RT0d; \ load_next_kr(kr_next); \ movzbl km ## bh, RT1d; \ movzbl km ## bl, km ## d; \ op2 ## l s3(RTAB,RT1,4), RT0d; \ op3 ## l s4(RTAB,km,4), RT0d; \ xorq RT0, RLR0; #define F1(km, load_next_kr) \ F(##km, load_next_kr, add, xor, sub, add) #define F2(km, load_next_kr) \ F(##km, load_next_kr, xor, sub, add, xor) #define F3(km, load_next_kr) \ F(##km, load_next_kr, sub, add, xor, sub) #define get_round_km(n, km) \ movl Km+4*(n)(CTX), km; #define get_round_kr_enc(n) \ movq $0x1010101010101010, RKR; \ \ /* merge rorl rk and rorl $16 */ \ xorq Kr+(n)(CTX), RKR; #define get_round_kr_dec(n) \ movq $0x1010101010101010, RKR; \ \ /* merge rorl rk and rorl $16 */ \ xorq Kr+(n - 7)(CTX), RKR; \ bswapq RKR; #define round_enc(n, FA, FB, fn1, fn2) \ get_round_km(n + 1, RX2d); \ FA(RX0, fn1); \ get_round_km(n + 2, RX0d); \ FB(RX2, fn2); #define round_enc_last(n, FXA, FXB) \ get_round_km(n + 1, RX2d); \ \ FXA(RX0, shr_kr); \ FXB(RX2, dummy); #define round_enc_1(n, FA, FB) \ round_enc(n, FA, FB, shr_kr, shr_kr) #define round_enc_2(n, FA, FB) \ round_enc(n, FA, FB, shr_kr, dummy) #define round_dec(n, FA, FB, fn1, fn2) \ get_round_km(n - 1, RX2d); \ FA(RX0, fn1); \ get_round_km(n - 2, RX0d); \ FB(RX2, fn2); #define round_dec_last(n, FXA, FXB) \ get_round_km(n - 1, RX2d); \ FXA(RX0, shr_kr); \ FXB(RX2, dummy); #define round_dec_1(n, FA, FB) \ round_dec(n, FA, FB, shr_kr, shr_kr) #define round_dec_2(n, FA, FB) \ round_dec(n, FA, FB, shr_kr, dummy) #define read_block() \ movq (RIO), RLR0; \ bswapq RLR0; #define write_block() \ bswapq RLR0; \ rorq $32, RLR0; \ movq RLR0, (RIO); .align 8 .globl _gcry_cast5_amd64_encrypt_block ELF(.type _gcry_cast5_amd64_encrypt_block,@function;) _gcry_cast5_amd64_encrypt_block: /* input: * %rdi: ctx, CTX * %rsi: dst * %rdx: src */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); movq %rsi, %r10; GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB); movq %rdx, RIO; read_block(); get_round_km(0, RX0d); get_round_kr_enc(0); round_enc_1(0, F1, F2); round_enc_1(2, F3, F1); round_enc_1(4, F2, F3); round_enc_2(6, F1, F2); 
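
The F1/F2/F3 macros driving the rounds above implement the three CAST5 round-function types. As a reference, here is a hedged C sketch (function and variable names are invented; only the _gcry_cast5_s1to4 tables and the type-1/2/3 structure come from the code itself). The assembly interleaves the key mixing, rotation, byte extraction and S-box lookups for scheduling reasons, but computes the same 32-bit value.

    #include <stdint.h>

    static uint32_t rol32(uint32_t x, unsigned r)
    {
      r &= 31;
      return r ? (x << r) | (x >> (32 - r)) : x;
    }

    /* d: half-block input, km/kr: round keys, s1..s4: the 8x32-bit
     * S-boxes (exported here as _gcry_cast5_s1to4). */
    static uint32_t cast5_f(int type, uint32_t d, uint32_t km, uint8_t kr,
                            const uint32_t s1[256], const uint32_t s2[256],
                            const uint32_t s3[256], const uint32_t s4[256])
    {
      uint32_t i, a, b, c, e;

      switch (type)
        {
        case 1:  i = rol32(km + d, kr); break;   /* F1 */
        case 2:  i = rol32(km ^ d, kr); break;   /* F2 */
        default: i = rol32(km - d, kr); break;   /* F3 */
        }

      a = s1[(i >> 24) & 0xff];
      b = s2[(i >> 16) & 0xff];
      c = s3[(i >> 8) & 0xff];
      e = s4[i & 0xff];

      switch (type)
        {
        case 1:  return ((a ^ b) - c) + e;
        case 2:  return ((a - b) + c) ^ e;
        default: return ((a + b) ^ c) - e;
        }
    }
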
get_round_kr_enc(8); round_enc_1(8, F3, F1); round_enc_1(10, F2, F3); round_enc_1(12, F1, F2); round_enc_last(14, F3, F1); movq %r10, RIO; write_block(); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_encrypt_block,.-_gcry_cast5_amd64_encrypt_block;) .align 8 .globl _gcry_cast5_amd64_decrypt_block ELF(.type _gcry_cast5_amd64_decrypt_block,@function;) _gcry_cast5_amd64_decrypt_block: /* input: * %rdi: ctx, CTX * %rsi: dst * %rdx: src */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); movq %rsi, %r10; GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB); movq %rdx, RIO; read_block(); get_round_km(15, RX0d); get_round_kr_dec(15); round_dec_1(15, F1, F3); round_dec_1(13, F2, F1); round_dec_1(11, F3, F2); round_dec_2(9, F1, F3); get_round_kr_dec(7); round_dec_1(7, F2, F1); round_dec_1(5, F3, F2); round_dec_1(3, F1, F3); round_dec_last(1, F2, F1); movq %r10, RIO; write_block(); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_decrypt_block,.-_gcry_cast5_amd64_decrypt_block;) /********************************************************************** 4-way cast5, four blocks parallel **********************************************************************/ #define F_tail(rlr, rx, op1, op2, op3) \ movzbl rx ## bh, RT0d; \ movzbl rx ## bl, RT1d; \ roll $16, rx ## d; \ movl s1(RTAB,RT0,4), RT0d; \ op1 ## l s2(RTAB,RT1,4), RT0d; \ movzbl rx ## bh, RT1d; \ movzbl rx ## bl, rx ## d; \ op2 ## l s3(RTAB,RT1,4), RT0d; \ op3 ## l s4(RTAB,rx,4), RT0d; \ xorq RT0, rlr; #define F4(km, load_next_kr, op0, op1, op2, op3) \ movl km, RX0d; \ op0 ## l RLR0d, RX0d; \ roll RKRbl, RX0d; \ rorq $32, RLR0; \ \ movl km, RX1d; \ op0 ## l RLR1d, RX1d; \ roll RKRbl, RX1d; \ rorq $32, RLR1; \ \ movl km, RX2d; \ op0 ## l RLR2d, RX2d; \ roll RKRbl, RX2d; \ rorq $32, RLR2; \ \ F_tail(RLR0, RX0, op1, op2, op3); \ F_tail(RLR1, RX1, op1, op2, op3); \ F_tail(RLR2, RX2, op1, op2, op3); \ \ movl km, RX0d; \ op0 ## l RLR3d, RX0d; \ roll RKRbl, RX0d; \ load_next_kr(); \ rorq $32, RLR3; \ \ F_tail(RLR3, RX0, op1, op2, op3); #define F4_1(km, load_next_kr) \ F4(km, load_next_kr, add, xor, sub, add) #define F4_2(km, load_next_kr) \ F4(km, load_next_kr, xor, sub, add, xor) #define F4_3(km, load_next_kr) \ F4(km, load_next_kr, sub, add, xor, sub) #define round_enc4(n, FA, FB, fn1, fn2) \ get_round_km(n + 1, RKM1d); \ FA(RKM0d, fn1); \ get_round_km(n + 2, RKM0d); \ FB(RKM1d, fn2); #define round_enc_last4(n, FXA, FXB) \ get_round_km(n + 1, RKM1d); \ FXA(RKM0d, shr_kr); \ FXB(RKM1d, dummy); #define round_enc4_1(n, FA, FB) \ round_enc4(n, FA, FB, shr_kr, shr_kr); #define round_enc4_2(n, FA, FB) \ round_enc4(n, FA, FB, shr_kr, dummy); #define round_dec4(n, FA, FB, fn1, fn2) \ get_round_km(n - 1, RKM1d); \ FA(RKM0d, fn1); \ get_round_km(n - 2, RKM0d); \ FB(RKM1d, fn2); #define round_dec_last4(n, FXA, FXB) \ get_round_km(n - 1, RKM1d); \ FXA(RKM0d, shr_kr); \ FXB(RKM1d, dummy); #define round_dec4_1(n, FA, FB) \ round_dec4(n, FA, FB, shr_kr, shr_kr); #define round_dec4_2(n, FA, FB) \ round_dec4(n, FA, FB, shr_kr, dummy); #define inbswap_block4(a, b, c, d) \ bswapq a; \ bswapq b; \ bswapq c; \ bswapq d; #define outbswap_block4(a, b, c, d) \ bswapq a; \ bswapq b; \ bswapq c; \ bswapq d; \ rorq $32, a; \ rorq $32, b; \ rorq $32, c; \ rorq $32, d; .align 8 ELF(.type __cast5_enc_blk4,@function;) __cast5_enc_blk4: /* input: * %rdi: ctx, CTX * RLR0,RLR1,RLR2,RLR3: four 
input plaintext blocks * output: * RLR0,RLR1,RLR2,RLR3: four output ciphertext blocks */ + CFI_STARTPROC(); GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB); get_round_km(0, RKM0d); get_round_kr_enc(0); round_enc4_1(0, F4_1, F4_2); round_enc4_1(2, F4_3, F4_1); round_enc4_1(4, F4_2, F4_3); round_enc4_2(6, F4_1, F4_2); get_round_kr_enc(8); round_enc4_1(8, F4_3, F4_1); round_enc4_1(10, F4_2, F4_3); round_enc4_1(12, F4_1, F4_2); round_enc_last4(14, F4_3, F4_1); outbswap_block4(RLR0, RLR1, RLR2, RLR3); ret; + CFI_ENDPROC(); ELF(.size __cast5_enc_blk4,.-__cast5_enc_blk4;) .align 8 ELF(.type __cast5_dec_blk4,@function;) __cast5_dec_blk4: /* input: * %rdi: ctx, CTX * RLR0,RLR1,RLR2,RLR3: four input ciphertext blocks * output: * RLR0,RLR1,RLR2,RLR3: four output plaintext blocks */ + CFI_STARTPROC(); GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB); inbswap_block4(RLR0, RLR1, RLR2, RLR3); get_round_km(15, RKM0d); get_round_kr_dec(15); round_dec4_1(15, F4_1, F4_3); round_dec4_1(13, F4_2, F4_1); round_dec4_1(11, F4_3, F4_2); round_dec4_2(9, F4_1, F4_3); get_round_kr_dec(7); round_dec4_1(7, F4_2, F4_1); round_dec4_1(5, F4_3, F4_2); round_dec4_1(3, F4_1, F4_3); round_dec_last4(1, F4_2, F4_1); outbswap_block4(RLR0, RLR1, RLR2, RLR3); + CFI_ENDPROC(); ret; ELF(.size __cast5_dec_blk4,.-__cast5_dec_blk4;) .align 8 .globl _gcry_cast5_amd64_ctr_enc ELF(.type _gcry_cast5_amd64_ctr_enc,@function;) _gcry_cast5_amd64_ctr_enc: /* input: * %rdi: ctx, CTX - * %rsi: dst (8 blocks) - * %rdx: src (8 blocks) + * %rsi: dst (4 blocks) + * %rdx: src (4 blocks) * %rcx: iv (big endian, 64bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); pushq %r12; + CFI_PUSH(%r12); pushq %r13; + CFI_PUSH(%r13); pushq %r14; + CFI_PUSH(%r14); pushq %rsi; + CFI_PUSH(%rsi); pushq %rdx; + CFI_PUSH(%rdx); /* load IV and byteswap */ movq (%rcx), RX0; bswapq RX0; movq RX0, RLR0; /* construct IVs */ leaq 1(RX0), RLR1; leaq 2(RX0), RLR2; leaq 3(RX0), RLR3; leaq 4(RX0), RX0; bswapq RX0; /* store new IV */ movq RX0, (%rcx); call __cast5_enc_blk4; popq %r14; /*src*/ + CFI_POP_TMP_REG(); popq %r13; /*dst*/ + CFI_POP_TMP_REG(); /* XOR key-stream with plaintext */ xorq 0 * 8(%r14), RLR0; xorq 1 * 8(%r14), RLR1; xorq 2 * 8(%r14), RLR2; xorq 3 * 8(%r14), RLR3; movq RLR0, 0 * 8(%r13); movq RLR1, 1 * 8(%r13); movq RLR2, 2 * 8(%r13); movq RLR3, 3 * 8(%r13); popq %r14; + CFI_POP(%r14); popq %r13; + CFI_POP(%r13); popq %r12; + CFI_POP(%r12); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret + CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_ctr_enc,.-_gcry_cast5_amd64_ctr_enc;) .align 8 .globl _gcry_cast5_amd64_cbc_dec ELF(.type _gcry_cast5_amd64_cbc_dec,@function;) _gcry_cast5_amd64_cbc_dec: /* input: * %rdi: ctx, CTX - * %rsi: dst (8 blocks) - * %rdx: src (8 blocks) + * %rsi: dst (4 blocks) + * %rdx: src (4 blocks) * %rcx: iv (64bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); pushq %r12; + CFI_PUSH(%r12); pushq %r13; + CFI_PUSH(%r13); pushq %r14; + CFI_PUSH(%r14); pushq %rcx; + CFI_PUSH(%rcx); pushq %rsi; + CFI_PUSH(%rsi); pushq %rdx; + CFI_PUSH(%rdx); /* load input */ movq 0 * 8(%rdx), RLR0; movq 1 * 8(%rdx), RLR1; movq 2 * 8(%rdx), RLR2; movq 3 * 8(%rdx), RLR3; call __cast5_dec_blk4; popq RX0; /*src*/ + CFI_POP_TMP_REG(); popq RX1; /*dst*/ + CFI_POP_TMP_REG(); popq RX2; /*iv*/ + CFI_POP_TMP_REG(); movq 3 * 8(RX0), %r14; xorq (RX2), RLR0; xorq 0 * 8(RX0), RLR1; xorq 1 * 8(RX0), RLR2; xorq 2 * 8(RX0), RLR3; movq %r14, (RX2); 
/* store new IV */ movq RLR0, 0 * 8(RX1); movq RLR1, 1 * 8(RX1); movq RLR2, 2 * 8(RX1); movq RLR3, 3 * 8(RX1); popq %r14; + CFI_POP(%r14); popq %r13; + CFI_POP(%r13); popq %r12; + CFI_POP(%r12); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; - + CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_cbc_dec,.-_gcry_cast5_amd64_cbc_dec;) .align 8 .globl _gcry_cast5_amd64_cfb_dec ELF(.type _gcry_cast5_amd64_cfb_dec,@function;) _gcry_cast5_amd64_cfb_dec: /* input: * %rdi: ctx, CTX - * %rsi: dst (8 blocks) - * %rdx: src (8 blocks) + * %rsi: dst (4 blocks) + * %rdx: src (4 blocks) * %rcx: iv (64bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); pushq %r12; + CFI_PUSH(%r12); pushq %r13; + CFI_PUSH(%r13); pushq %r14; + CFI_PUSH(%r14); pushq %rsi; + CFI_PUSH(%rsi); pushq %rdx; + CFI_PUSH(%rdx); /* Load input */ movq (%rcx), RLR0; movq 0 * 8(%rdx), RLR1; movq 1 * 8(%rdx), RLR2; movq 2 * 8(%rdx), RLR3; inbswap_block4(RLR0, RLR1, RLR2, RLR3); /* Update IV */ movq 3 * 8(%rdx), %rdx; movq %rdx, (%rcx); call __cast5_enc_blk4; popq %rdx; /*src*/ + CFI_POP_TMP_REG(); popq %rcx; /*dst*/ + CFI_POP_TMP_REG(); xorq 0 * 8(%rdx), RLR0; xorq 1 * 8(%rdx), RLR1; xorq 2 * 8(%rdx), RLR2; xorq 3 * 8(%rdx), RLR3; movq RLR0, 0 * 8(%rcx); movq RLR1, 1 * 8(%rcx); movq RLR2, 2 * 8(%rcx); movq RLR3, 3 * 8(%rcx); popq %r14; + CFI_POP(%r14); popq %r13; + CFI_POP(%r13); popq %r12; + CFI_POP(%r12); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; - + CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_cfb_dec,.-_gcry_cast5_amd64_cfb_dec;) #endif /*defined(USE_CAST5)*/ #endif /*__x86_64*/ diff --git a/cipher/chacha20-amd64-avx2.S b/cipher/chacha20-amd64-avx2.S index 94c8e8cf..de6263b6 100644 --- a/cipher/chacha20-amd64-avx2.S +++ b/cipher/chacha20-amd64-avx2.S @@ -1,763 +1,783 @@ /* chacha20-amd64-avx2.S - AVX2 implementation of ChaCha20 cipher * * Copyright (C) 2017-2019 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ /* * Based on D. J. Bernstein reference implementation at * http://cr.yp.to/chacha.html: * * chacha-regs.c version 20080118 * D. J. Bernstein * Public domain. 
*/ #ifdef __x86_64 #include #if defined(HAVE_GCC_INLINE_ASM_AVX2) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) .text #include "asm-common-amd64.h" #include "asm-poly1305-amd64.h" /* register macros */ #define INPUT %rdi #define DST %rsi #define SRC %rdx #define NBLKS %rcx #define ROUND %eax /* stack structure */ #define STACK_VEC_X12 (32) #define STACK_VEC_X13 (32 + STACK_VEC_X12) #define STACK_TMP (32 + STACK_VEC_X13) #define STACK_TMP1 (32 + STACK_TMP) #define STACK_MAX (32 + STACK_TMP1) /* vector registers */ #define X0 %ymm0 #define X1 %ymm1 #define X2 %ymm2 #define X3 %ymm3 #define X4 %ymm4 #define X5 %ymm5 #define X6 %ymm6 #define X7 %ymm7 #define X8 %ymm8 #define X9 %ymm9 #define X10 %ymm10 #define X11 %ymm11 #define X12 %ymm12 #define X13 %ymm13 #define X14 %ymm14 #define X15 %ymm15 #define X0h %xmm0 #define X1h %xmm1 #define X2h %xmm2 #define X3h %xmm3 #define X4h %xmm4 #define X5h %xmm5 #define X6h %xmm6 #define X7h %xmm7 #define X8h %xmm8 #define X9h %xmm9 #define X10h %xmm10 #define X11h %xmm11 #define X12h %xmm12 #define X13h %xmm13 #define X14h %xmm14 #define X15h %xmm15 /********************************************************************** helper macros **********************************************************************/ /* 4x4 32-bit integer matrix transpose */ #define transpose_4x4(x0,x1,x2,x3,t1,t2) \ vpunpckhdq x1, x0, t2; \ vpunpckldq x1, x0, x0; \ \ vpunpckldq x3, x2, t1; \ vpunpckhdq x3, x2, x2; \ \ vpunpckhqdq t1, x0, x1; \ vpunpcklqdq t1, x0, x0; \ \ vpunpckhqdq x2, t2, x3; \ vpunpcklqdq x2, t2, x2; /* 2x2 128-bit matrix transpose */ #define transpose_16byte_2x2(x0,x1,t1) \ vmovdqa x0, t1; \ vperm2i128 $0x20, x1, x0, x0; \ vperm2i128 $0x31, x1, t1, x1; /* xor register with unaligned src and save to unaligned dst */ #define xor_src_dst(dst, src, offset, xreg) \ vpxor offset(src), xreg, xreg; \ vmovdqu xreg, offset(dst); /********************************************************************** 8-way chacha20 **********************************************************************/ #define ROTATE2(v1,v2,c,tmp) \ vpsrld $(32 - (c)), v1, tmp; \ vpslld $(c), v1, v1; \ vpaddb tmp, v1, v1; \ vpsrld $(32 - (c)), v2, tmp; \ vpslld $(c), v2, v2; \ vpaddb tmp, v2, v2; #define ROTATE_SHUF_2(v1,v2,shuf) \ vpshufb shuf, v1, v1; \ vpshufb shuf, v2, v2; #define XOR(ds,s) \ vpxor s, ds, ds; #define PLUS(ds,s) \ vpaddd s, ds, ds; #define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,\ interleave_op1,interleave_op2,\ interleave_op3,interleave_op4) \ vbroadcasti128 .Lshuf_rol16 rRIP, tmp1; \ interleave_op1; \ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ ROTATE_SHUF_2(d1, d2, tmp1); \ interleave_op2; \ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ ROTATE2(b1, b2, 12, tmp1); \ vbroadcasti128 .Lshuf_rol8 rRIP, tmp1; \ interleave_op3; \ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ ROTATE_SHUF_2(d1, d2, tmp1); \ interleave_op4; \ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ ROTATE2(b1, b2, 7, tmp1); .align 32 chacha20_data: .Lshuf_rol16: .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 .Lshuf_rol8: .byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 .Linc_counter: .byte 0,1,2,3,4,5,6,7 .Lunsigned_cmp: .long 0x80000000 .align 8 .globl _gcry_chacha20_amd64_avx2_blocks8 ELF(.type _gcry_chacha20_amd64_avx2_blocks8,@function;) _gcry_chacha20_amd64_avx2_blocks8: /* input: * %rdi: input * %rsi: dst * %rdx: src * %rcx: nblks (multiple of 8) */ + CFI_STARTPROC(); vzeroupper; pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + 
CFI_DEF_CFA_REGISTER(%rbp); subq $STACK_MAX, %rsp; andq $~31, %rsp; .Loop8: mov $20, ROUND; /* Construct counter vectors X12 and X13 */ vpmovzxbd .Linc_counter rRIP, X0; vpbroadcastd .Lunsigned_cmp rRIP, X2; vpbroadcastd (12 * 4)(INPUT), X12; vpbroadcastd (13 * 4)(INPUT), X13; vpaddd X0, X12, X12; vpxor X2, X0, X0; vpxor X2, X12, X1; vpcmpgtd X1, X0, X0; vpsubd X0, X13, X13; vmovdqa X12, (STACK_VEC_X12)(%rsp); vmovdqa X13, (STACK_VEC_X13)(%rsp); /* Load vectors */ vpbroadcastd (0 * 4)(INPUT), X0; vpbroadcastd (1 * 4)(INPUT), X1; vpbroadcastd (2 * 4)(INPUT), X2; vpbroadcastd (3 * 4)(INPUT), X3; vpbroadcastd (4 * 4)(INPUT), X4; vpbroadcastd (5 * 4)(INPUT), X5; vpbroadcastd (6 * 4)(INPUT), X6; vpbroadcastd (7 * 4)(INPUT), X7; vpbroadcastd (8 * 4)(INPUT), X8; vpbroadcastd (9 * 4)(INPUT), X9; vpbroadcastd (10 * 4)(INPUT), X10; vpbroadcastd (11 * 4)(INPUT), X11; vpbroadcastd (14 * 4)(INPUT), X14; vpbroadcastd (15 * 4)(INPUT), X15; vmovdqa X15, (STACK_TMP)(%rsp); .Lround2: QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15,,,,) vmovdqa (STACK_TMP)(%rsp), X15; vmovdqa X8, (STACK_TMP)(%rsp); QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,,,,) QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,,,,) vmovdqa (STACK_TMP)(%rsp), X8; vmovdqa X15, (STACK_TMP)(%rsp); QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15,,,,) sub $2, ROUND; jnz .Lround2; vmovdqa X8, (STACK_TMP1)(%rsp); /* tmp := X15 */ vpbroadcastd (0 * 4)(INPUT), X15; PLUS(X0, X15); vpbroadcastd (1 * 4)(INPUT), X15; PLUS(X1, X15); vpbroadcastd (2 * 4)(INPUT), X15; PLUS(X2, X15); vpbroadcastd (3 * 4)(INPUT), X15; PLUS(X3, X15); vpbroadcastd (4 * 4)(INPUT), X15; PLUS(X4, X15); vpbroadcastd (5 * 4)(INPUT), X15; PLUS(X5, X15); vpbroadcastd (6 * 4)(INPUT), X15; PLUS(X6, X15); vpbroadcastd (7 * 4)(INPUT), X15; PLUS(X7, X15); transpose_4x4(X0, X1, X2, X3, X8, X15); transpose_4x4(X4, X5, X6, X7, X8, X15); vmovdqa (STACK_TMP1)(%rsp), X8; transpose_16byte_2x2(X0, X4, X15); transpose_16byte_2x2(X1, X5, X15); transpose_16byte_2x2(X2, X6, X15); transpose_16byte_2x2(X3, X7, X15); vmovdqa (STACK_TMP)(%rsp), X15; xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0); xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1); vpbroadcastd (8 * 4)(INPUT), X0; PLUS(X8, X0); vpbroadcastd (9 * 4)(INPUT), X0; PLUS(X9, X0); vpbroadcastd (10 * 4)(INPUT), X0; PLUS(X10, X0); vpbroadcastd (11 * 4)(INPUT), X0; PLUS(X11, X0); vmovdqa (STACK_VEC_X12)(%rsp), X0; PLUS(X12, X0); vmovdqa (STACK_VEC_X13)(%rsp), X0; PLUS(X13, X0); vpbroadcastd (14 * 4)(INPUT), X0; PLUS(X14, X0); vpbroadcastd (15 * 4)(INPUT), X0; PLUS(X15, X0); xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2); xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3); /* Update counter */ addq $8, (12 * 4)(INPUT); transpose_4x4(X8, X9, X10, X11, X0, X1); transpose_4x4(X12, X13, X14, X15, X0, X1); xor_src_dst(DST, SRC, (64 * 4 + 16 * 0), X4); xor_src_dst(DST, SRC, (64 * 5 + 16 * 0), X5); transpose_16byte_2x2(X8, X12, X0); transpose_16byte_2x2(X9, X13, X0); transpose_16byte_2x2(X10, X14, X0); transpose_16byte_2x2(X11, X15, X0); xor_src_dst(DST, SRC, (64 * 6 + 16 * 0), X6); xor_src_dst(DST, SRC, (64 * 7 + 16 * 0), X7); xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8); xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9); xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10); xor_src_dst(DST, SRC, (64 * 3 + 16 * 2), X11); xor_src_dst(DST, SRC, (64 * 4 + 16 * 2), X12); xor_src_dst(DST, SRC, (64 * 5 + 16 * 2), X13); xor_src_dst(DST, SRC, (64 * 6 + 16 * 2), X14); xor_src_dst(DST, SRC, (64 * 7 + 16 * 2), X15); sub $8, NBLKS; lea (8 * 64)(DST), 
DST; lea (8 * 64)(SRC), SRC; jnz .Loop8; /* clear the used vector registers and stack */ vpxor X0, X0, X0; vmovdqa X0, (STACK_VEC_X12)(%rsp); vmovdqa X0, (STACK_VEC_X13)(%rsp); vmovdqa X0, (STACK_TMP)(%rsp); vmovdqa X0, (STACK_TMP1)(%rsp); vzeroall; /* eax zeroed by round loop. */ leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_chacha20_amd64_avx2_blocks8, .-_gcry_chacha20_amd64_avx2_blocks8;) /********************************************************************** 8-way stitched chacha20-poly1305 **********************************************************************/ .align 8 .globl _gcry_chacha20_poly1305_amd64_avx2_blocks8 ELF(.type _gcry_chacha20_poly1305_amd64_avx2_blocks8,@function;) _gcry_chacha20_poly1305_amd64_avx2_blocks8: /* input: * %rdi: input * %rsi: dst * %rdx: src * %rcx: nblks (multiple of 8) * %r9: poly1305-state * %r8: poly1305-src */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; subq $(8 * 8) + STACK_MAX + 32, %rsp; andq $~31, %rsp; movq %rbx, (STACK_MAX + 0 * 8)(%rsp); movq %r12, (STACK_MAX + 1 * 8)(%rsp); movq %r13, (STACK_MAX + 2 * 8)(%rsp); movq %r14, (STACK_MAX + 3 * 8)(%rsp); movq %r15, (STACK_MAX + 4 * 8)(%rsp); + CFI_REG_ON_STACK(rbx, STACK_MAX + 0 * 8); + CFI_REG_ON_STACK(r12, STACK_MAX + 1 * 8); + CFI_REG_ON_STACK(r13, STACK_MAX + 2 * 8); + CFI_REG_ON_STACK(r14, STACK_MAX + 3 * 8); + CFI_REG_ON_STACK(r15, STACK_MAX + 4 * 8); movq %rdx, (STACK_MAX + 5 * 8)(%rsp); # SRC movq %rsi, (STACK_MAX + 6 * 8)(%rsp); # DST movq %rcx, (STACK_MAX + 7 * 8)(%rsp); # NBLKS /* Load state */ POLY1305_LOAD_STATE(); .Loop_poly8: /* Construct counter vectors X12 and X13 */ vpmovzxbd .Linc_counter rRIP, X0; vpbroadcastd .Lunsigned_cmp rRIP, X2; vpbroadcastd (12 * 4)(INPUT), X12; vpbroadcastd (13 * 4)(INPUT), X13; vpaddd X0, X12, X12; vpxor X2, X0, X0; vpxor X2, X12, X1; vpcmpgtd X1, X0, X0; vpsubd X0, X13, X13; vmovdqa X12, (STACK_VEC_X12)(%rsp); vmovdqa X13, (STACK_VEC_X13)(%rsp); /* Load vectors */ vpbroadcastd (0 * 4)(INPUT), X0; vpbroadcastd (1 * 4)(INPUT), X1; vpbroadcastd (2 * 4)(INPUT), X2; vpbroadcastd (3 * 4)(INPUT), X3; vpbroadcastd (4 * 4)(INPUT), X4; vpbroadcastd (5 * 4)(INPUT), X5; vpbroadcastd (6 * 4)(INPUT), X6; vpbroadcastd (7 * 4)(INPUT), X7; vpbroadcastd (8 * 4)(INPUT), X8; vpbroadcastd (9 * 4)(INPUT), X9; vpbroadcastd (10 * 4)(INPUT), X10; vpbroadcastd (11 * 4)(INPUT), X11; vpbroadcastd (14 * 4)(INPUT), X14; vpbroadcastd (15 * 4)(INPUT), X15; vmovdqa X15, (STACK_TMP)(%rsp); # rounds 0,1 QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, POLY1305_BLOCK_PART1(0 * 16), POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4()) vmovdqa (STACK_TMP)(%rsp), X15; vmovdqa X8, (STACK_TMP)(%rsp); QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8, POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(1 * 16), POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3()) QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(2 * 16), POLY1305_BLOCK_PART2()) vmovdqa (STACK_TMP)(%rsp), X8; vmovdqa X15, (STACK_TMP)(%rsp); QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15, POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(3 * 16)) # rounds 2,3 QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5()) vmovdqa (STACK_TMP)(%rsp), X15; vmovdqa X8, (STACK_TMP)(%rsp); QUARTERROUND2(X2, X6, 
X10, X14, X3, X7, X11, X15, tmp:=,X8, POLY1305_BLOCK_PART1(4 * 16), POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4()) QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(5 * 16), POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3()) vmovdqa (STACK_TMP)(%rsp), X8; vmovdqa X15, (STACK_TMP)(%rsp); QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15, POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(6 * 16), POLY1305_BLOCK_PART2()) # rounds 4,5 QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(7 * 16)) vmovdqa (STACK_TMP)(%rsp), X15; vmovdqa X8, (STACK_TMP)(%rsp); QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8, POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5()) QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, POLY1305_BLOCK_PART1(8 * 16), POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4()) vmovdqa (STACK_TMP)(%rsp), X8; vmovdqa X15, (STACK_TMP)(%rsp); QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15, POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(9 * 16), POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3()) # rounds 6,7 QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(10 * 16), POLY1305_BLOCK_PART2()) vmovdqa (STACK_TMP)(%rsp), X15; vmovdqa X8, (STACK_TMP)(%rsp); QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8, POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(11 * 16)) QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5()) vmovdqa (STACK_TMP)(%rsp), X8; vmovdqa X15, (STACK_TMP)(%rsp); QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15, POLY1305_BLOCK_PART1(12 * 16), POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4()) # rounds 8,9 QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(13 * 16), POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3()) vmovdqa (STACK_TMP)(%rsp), X15; vmovdqa X8, (STACK_TMP)(%rsp); QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8, POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(14 * 16), POLY1305_BLOCK_PART2()) QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(15 * 16)) vmovdqa (STACK_TMP)(%rsp), X8; vmovdqa X15, (STACK_TMP)(%rsp); QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15, POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5()) # rounds 10,11 QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, POLY1305_BLOCK_PART1(16 * 16), POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4()) vmovdqa (STACK_TMP)(%rsp), X15; vmovdqa X8, (STACK_TMP)(%rsp); QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8, POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(17 * 16), POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3()) QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(18 * 16), POLY1305_BLOCK_PART2()) vmovdqa (STACK_TMP)(%rsp), X8; vmovdqa X15, (STACK_TMP)(%rsp); QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, 
tmp:=,X15, POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(19 * 16)) # rounds 12,13 QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5()) vmovdqa (STACK_TMP)(%rsp), X15; vmovdqa X8, (STACK_TMP)(%rsp); QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8, POLY1305_BLOCK_PART1(20 * 16), POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4()) QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(21 * 16), POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3()) vmovdqa (STACK_TMP)(%rsp), X8; vmovdqa X15, (STACK_TMP)(%rsp); QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15, POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(22 * 16), POLY1305_BLOCK_PART2()) # rounds 14,15 QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(23 * 16)) vmovdqa (STACK_TMP)(%rsp), X15; vmovdqa X8, (STACK_TMP)(%rsp); QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8, POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5()) QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, POLY1305_BLOCK_PART1(24 * 16), POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4()) vmovdqa (STACK_TMP)(%rsp), X8; vmovdqa X15, (STACK_TMP)(%rsp); QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15, POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(25 * 16), POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3()) # rounds 16,17 QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(26 * 16), POLY1305_BLOCK_PART2()) vmovdqa (STACK_TMP)(%rsp), X15; vmovdqa X8, (STACK_TMP)(%rsp); QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8, POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(27 * 16)) QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5()) vmovdqa (STACK_TMP)(%rsp), X8; vmovdqa X15, (STACK_TMP)(%rsp); QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15, POLY1305_BLOCK_PART1(28 * 16), POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4()) # rounds 18,19 QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(29 * 16), POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3()) vmovdqa (STACK_TMP)(%rsp), X15; vmovdqa X8, (STACK_TMP)(%rsp); QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8, POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(30 * 16), POLY1305_BLOCK_PART2()) QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(31 * 16)) vmovdqa (STACK_TMP)(%rsp), X8; vmovdqa X15, (STACK_TMP)(%rsp); QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15, POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5()) movq (STACK_MAX + 5 * 8)(%rsp), SRC; movq (STACK_MAX + 6 * 8)(%rsp), DST; vmovdqa X8, (STACK_TMP1)(%rsp); /* tmp := X15 */ vpbroadcastd (0 * 4)(INPUT), X15; PLUS(X0, X15); vpbroadcastd (1 * 4)(INPUT), X15; PLUS(X1, X15); vpbroadcastd (2 * 4)(INPUT), X15; PLUS(X2, X15); vpbroadcastd (3 * 4)(INPUT), X15; 
PLUS(X3, X15); vpbroadcastd (4 * 4)(INPUT), X15; PLUS(X4, X15); vpbroadcastd (5 * 4)(INPUT), X15; PLUS(X5, X15); vpbroadcastd (6 * 4)(INPUT), X15; PLUS(X6, X15); vpbroadcastd (7 * 4)(INPUT), X15; PLUS(X7, X15); transpose_4x4(X0, X1, X2, X3, X8, X15); transpose_4x4(X4, X5, X6, X7, X8, X15); vmovdqa (STACK_TMP1)(%rsp), X8; transpose_16byte_2x2(X0, X4, X15); transpose_16byte_2x2(X1, X5, X15); transpose_16byte_2x2(X2, X6, X15); transpose_16byte_2x2(X3, X7, X15); vmovdqa (STACK_TMP)(%rsp), X15; xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0); xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1); vpbroadcastd (8 * 4)(INPUT), X0; PLUS(X8, X0); vpbroadcastd (9 * 4)(INPUT), X0; PLUS(X9, X0); vpbroadcastd (10 * 4)(INPUT), X0; PLUS(X10, X0); vpbroadcastd (11 * 4)(INPUT), X0; PLUS(X11, X0); vmovdqa (STACK_VEC_X12)(%rsp), X0; PLUS(X12, X0); vmovdqa (STACK_VEC_X13)(%rsp), X0; PLUS(X13, X0); vpbroadcastd (14 * 4)(INPUT), X0; PLUS(X14, X0); vpbroadcastd (15 * 4)(INPUT), X0; PLUS(X15, X0); xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2); xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3); /* Update counter */ addq $8, (12 * 4)(INPUT); transpose_4x4(X8, X9, X10, X11, X0, X1); transpose_4x4(X12, X13, X14, X15, X0, X1); xor_src_dst(DST, SRC, (64 * 4 + 16 * 0), X4); xor_src_dst(DST, SRC, (64 * 5 + 16 * 0), X5); transpose_16byte_2x2(X8, X12, X0); transpose_16byte_2x2(X9, X13, X0); transpose_16byte_2x2(X10, X14, X0); transpose_16byte_2x2(X11, X15, X0); xor_src_dst(DST, SRC, (64 * 6 + 16 * 0), X6); xor_src_dst(DST, SRC, (64 * 7 + 16 * 0), X7); xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8); xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9); xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10); xor_src_dst(DST, SRC, (64 * 3 + 16 * 2), X11); xor_src_dst(DST, SRC, (64 * 4 + 16 * 2), X12); xor_src_dst(DST, SRC, (64 * 5 + 16 * 2), X13); xor_src_dst(DST, SRC, (64 * 6 + 16 * 2), X14); xor_src_dst(DST, SRC, (64 * 7 + 16 * 2), X15); subq $8, (STACK_MAX + 7 * 8)(%rsp); # NBLKS lea (32 * 16)(POLY_RSRC), POLY_RSRC; lea (8 * 64)(DST), DST; lea (8 * 64)(SRC), SRC; movq SRC, (STACK_MAX + 5 * 8)(%rsp); movq DST, (STACK_MAX + 6 * 8)(%rsp); jnz .Loop_poly8; /* Store state */ POLY1305_STORE_STATE(); /* clear the used vector registers and stack */ vpxor X0, X0, X0; vmovdqa X0, (STACK_VEC_X12)(%rsp); vmovdqa X0, (STACK_VEC_X13)(%rsp); vmovdqa X0, (STACK_TMP)(%rsp); vmovdqa X0, (STACK_TMP1)(%rsp); vzeroall; movq (STACK_MAX + 0 * 8)(%rsp), %rbx; movq (STACK_MAX + 1 * 8)(%rsp), %r12; movq (STACK_MAX + 2 * 8)(%rsp), %r13; movq (STACK_MAX + 3 * 8)(%rsp), %r14; movq (STACK_MAX + 4 * 8)(%rsp), %r15; + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); xorl %eax, %eax; leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_chacha20_poly1305_amd64_avx2_blocks8, .-_gcry_chacha20_poly1305_amd64_avx2_blocks8;) #endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/ #endif /*__x86_64*/ diff --git a/cipher/chacha20-amd64-ssse3.S b/cipher/chacha20-amd64-ssse3.S index 1657f771..6bbf12fc 100644 --- a/cipher/chacha20-amd64-ssse3.S +++ b/cipher/chacha20-amd64-ssse3.S @@ -1,1217 +1,1255 @@ /* chacha20-amd64-ssse3.S - SSSE3 implementation of ChaCha20 cipher * * Copyright (C) 2017-2019 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. 
* * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ /* * Based on D. J. Bernstein reference implementation at * http://cr.yp.to/chacha.html: * * chacha-regs.c version 20080118 * D. J. Bernstein * Public domain. */ #ifdef __x86_64 #include #if defined(HAVE_GCC_INLINE_ASM_SSSE3) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) .text #include "asm-common-amd64.h" #include "asm-poly1305-amd64.h" /* register macros */ #define INPUT %rdi #define DST %rsi #define SRC %rdx #define NBLKS %rcx #define ROUND %eax /* stack structure */ #define STACK_VEC_X12 (16) #define STACK_VEC_X13 (16 + STACK_VEC_X12) #define STACK_TMP (16 + STACK_VEC_X13) #define STACK_TMP1 (16 + STACK_TMP) #define STACK_TMP2 (16 + STACK_TMP1) #define STACK_MAX (16 + STACK_TMP2) /* vector registers */ #define X0 %xmm0 #define X1 %xmm1 #define X2 %xmm2 #define X3 %xmm3 #define X4 %xmm4 #define X5 %xmm5 #define X6 %xmm6 #define X7 %xmm7 #define X8 %xmm8 #define X9 %xmm9 #define X10 %xmm10 #define X11 %xmm11 #define X12 %xmm12 #define X13 %xmm13 #define X14 %xmm14 #define X15 %xmm15 /********************************************************************** helper macros **********************************************************************/ /* 4x4 32-bit integer matrix transpose */ #define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \ movdqa x0, t2; \ punpckhdq x1, t2; \ punpckldq x1, x0; \ \ movdqa x2, t1; \ punpckldq x3, t1; \ punpckhdq x3, x2; \ \ movdqa x0, x1; \ punpckhqdq t1, x1; \ punpcklqdq t1, x0; \ \ movdqa t2, x3; \ punpckhqdq x2, x3; \ punpcklqdq x2, t2; \ movdqa t2, x2; /* fill xmm register with 32-bit value from memory */ #define pbroadcastd(mem32, xreg) \ movd mem32, xreg; \ pshufd $0, xreg, xreg; /* xor with unaligned memory operand */ #define pxor_u(umem128, xreg, t) \ movdqu umem128, t; \ pxor t, xreg; /* xor register with unaligned src and save to unaligned dst */ #define xor_src_dst(dst, src, offset, xreg, t) \ pxor_u(offset(src), xreg, t); \ movdqu xreg, offset(dst); #define clear(x) pxor x,x; /********************************************************************** 4-way chacha20 **********************************************************************/ #define ROTATE2(v1,v2,c,tmp1,tmp2) \ movdqa v1, tmp1; \ movdqa v2, tmp2; \ psrld $(32 - (c)), v1; \ pslld $(c), tmp1; \ paddb tmp1, v1; \ psrld $(32 - (c)), v2; \ pslld $(c), tmp2; \ paddb tmp2, v2; #define ROTATE_SHUF_2(v1,v2,shuf) \ pshufb shuf, v1; \ pshufb shuf, v2; #define XOR(ds,s) \ pxor s, ds; #define PLUS(ds,s) \ paddd s, ds; #define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2,\ interleave_op1,interleave_op2) \ movdqa .Lshuf_rol16 rRIP, tmp1; \ interleave_op1; \ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ ROTATE_SHUF_2(d1, d2, tmp1); \ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ ROTATE2(b1, b2, 12, tmp1, tmp2); \ movdqa .Lshuf_rol8 rRIP, tmp1; \ interleave_op2; \ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ ROTATE_SHUF_2(d1, d2, tmp1); \ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ ROTATE2(b1, b2, 7, tmp1, tmp2); chacha20_data: .align 16 .Lshuf_rol16: .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 .Lshuf_rol8: .byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 
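/* Note: the two masks above implement the 32-bit rotates by 16 and 8 as a
 * single pshufb; the remaining rotate amounts (12 and 7) use the shift+paddb
 * sequence in the ROTATE macros, which matches OR here because the two
 * shifted halves share no set bits. */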
.Lcounter1: .long 1,0,0,0 .Linc_counter: .long 0,1,2,3 .Lunsigned_cmp: .long 0x80000000,0x80000000,0x80000000,0x80000000 .align 8 .globl _gcry_chacha20_amd64_ssse3_blocks4 ELF(.type _gcry_chacha20_amd64_ssse3_blocks4,@function;) _gcry_chacha20_amd64_ssse3_blocks4: /* input: * %rdi: input * %rsi: dst * %rdx: src * %rcx: nblks (multiple of 4) */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); subq $STACK_MAX, %rsp; andq $~15, %rsp; .Loop4: mov $20, ROUND; /* Construct counter vectors X12 and X13 */ movdqa .Linc_counter rRIP, X0; movdqa .Lunsigned_cmp rRIP, X2; pbroadcastd((12 * 4)(INPUT), X12); pbroadcastd((13 * 4)(INPUT), X13); paddd X0, X12; movdqa X12, X1; pxor X2, X0; pxor X2, X1; pcmpgtd X1, X0; psubd X0, X13; movdqa X12, (STACK_VEC_X12)(%rsp); movdqa X13, (STACK_VEC_X13)(%rsp); /* Load vectors */ pbroadcastd((0 * 4)(INPUT), X0); pbroadcastd((1 * 4)(INPUT), X1); pbroadcastd((2 * 4)(INPUT), X2); pbroadcastd((3 * 4)(INPUT), X3); pbroadcastd((4 * 4)(INPUT), X4); pbroadcastd((5 * 4)(INPUT), X5); pbroadcastd((6 * 4)(INPUT), X6); pbroadcastd((7 * 4)(INPUT), X7); pbroadcastd((8 * 4)(INPUT), X8); pbroadcastd((9 * 4)(INPUT), X9); pbroadcastd((10 * 4)(INPUT), X10); pbroadcastd((11 * 4)(INPUT), X11); pbroadcastd((14 * 4)(INPUT), X14); pbroadcastd((15 * 4)(INPUT), X15); movdqa X11, (STACK_TMP)(%rsp); movdqa X15, (STACK_TMP1)(%rsp); .Lround2_4: QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15,,) movdqa (STACK_TMP)(%rsp), X11; movdqa (STACK_TMP1)(%rsp), X15; movdqa X8, (STACK_TMP)(%rsp); movdqa X9, (STACK_TMP1)(%rsp); QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9,,) QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9,,) movdqa (STACK_TMP)(%rsp), X8; movdqa (STACK_TMP1)(%rsp), X9; movdqa X11, (STACK_TMP)(%rsp); movdqa X15, (STACK_TMP1)(%rsp); QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15,,) sub $2, ROUND; jnz .Lround2_4; /* tmp := X15 */ movdqa (STACK_TMP)(%rsp), X11; pbroadcastd((0 * 4)(INPUT), X15); PLUS(X0, X15); pbroadcastd((1 * 4)(INPUT), X15); PLUS(X1, X15); pbroadcastd((2 * 4)(INPUT), X15); PLUS(X2, X15); pbroadcastd((3 * 4)(INPUT), X15); PLUS(X3, X15); pbroadcastd((4 * 4)(INPUT), X15); PLUS(X4, X15); pbroadcastd((5 * 4)(INPUT), X15); PLUS(X5, X15); pbroadcastd((6 * 4)(INPUT), X15); PLUS(X6, X15); pbroadcastd((7 * 4)(INPUT), X15); PLUS(X7, X15); pbroadcastd((8 * 4)(INPUT), X15); PLUS(X8, X15); pbroadcastd((9 * 4)(INPUT), X15); PLUS(X9, X15); pbroadcastd((10 * 4)(INPUT), X15); PLUS(X10, X15); pbroadcastd((11 * 4)(INPUT), X15); PLUS(X11, X15); movdqa (STACK_VEC_X12)(%rsp), X15; PLUS(X12, X15); movdqa (STACK_VEC_X13)(%rsp), X15; PLUS(X13, X15); movdqa X13, (STACK_TMP)(%rsp); pbroadcastd((14 * 4)(INPUT), X15); PLUS(X14, X15); movdqa (STACK_TMP1)(%rsp), X15; movdqa X14, (STACK_TMP1)(%rsp); pbroadcastd((15 * 4)(INPUT), X13); PLUS(X15, X13); movdqa X15, (STACK_TMP2)(%rsp); /* Update counter */ addq $4, (12 * 4)(INPUT); transpose_4x4(X0, X1, X2, X3, X13, X14, X15); xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0, X15); xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1, X15); xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2, X15); xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3, X15); transpose_4x4(X4, X5, X6, X7, X0, X1, X2); movdqa (STACK_TMP)(%rsp), X13; movdqa (STACK_TMP1)(%rsp), X14; movdqa (STACK_TMP2)(%rsp), X15; xor_src_dst(DST, SRC, (64 * 0 + 16 * 1), X4, X0); xor_src_dst(DST, SRC, (64 * 1 + 16 * 1), X5, X0); xor_src_dst(DST, SRC, (64 * 2 + 16 * 1), X6, X0); xor_src_dst(DST, SRC, (64 * 3 + 16 * 1), X7, X0); 
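/* Transpose the last two 4x4 word groups back to per-block order and xor
 * them into the output, as above. */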
transpose_4x4(X8, X9, X10, X11, X0, X1, X2); xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8, X0); xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9, X0); xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10, X0); xor_src_dst(DST, SRC, (64 * 3 + 16 * 2), X11, X0); transpose_4x4(X12, X13, X14, X15, X0, X1, X2); xor_src_dst(DST, SRC, (64 * 0 + 16 * 3), X12, X0); xor_src_dst(DST, SRC, (64 * 1 + 16 * 3), X13, X0); xor_src_dst(DST, SRC, (64 * 2 + 16 * 3), X14, X0); xor_src_dst(DST, SRC, (64 * 3 + 16 * 3), X15, X0); sub $4, NBLKS; lea (4 * 64)(DST), DST; lea (4 * 64)(SRC), SRC; jnz .Loop4; /* clear the used vector registers and stack */ clear(X0); movdqa X0, (STACK_VEC_X12)(%rsp); movdqa X0, (STACK_VEC_X13)(%rsp); movdqa X0, (STACK_TMP)(%rsp); movdqa X0, (STACK_TMP1)(%rsp); movdqa X0, (STACK_TMP2)(%rsp); clear(X1); clear(X2); clear(X3); clear(X4); clear(X5); clear(X6); clear(X7); clear(X8); clear(X9); clear(X10); clear(X11); clear(X12); clear(X13); clear(X14); clear(X15); /* eax zeroed by round loop. */ leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_chacha20_amd64_ssse3_blocks4, .-_gcry_chacha20_amd64_ssse3_blocks4;) /********************************************************************** 2-way && 1-way chacha20 **********************************************************************/ #define ROTATE_SHUF(v1,shuf) \ pshufb shuf, v1; #define ROTATE(v1,c,tmp1) \ movdqa v1, tmp1; \ psrld $(32 - (c)), v1; \ pslld $(c), tmp1; \ paddb tmp1, v1; #define WORD_SHUF(v1,shuf) \ pshufd $shuf, v1, v1; #define QUARTERROUND4(x0,x1,x2,x3,shuf_rol8,shuf_rol16,tmp1,shuf_x1,\ shuf_x2,shuf_x3) \ PLUS(x0, x1); XOR(x3, x0); ROTATE_SHUF(x3, shuf_rol16); \ PLUS(x2, x3); XOR(x1, x2); ROTATE(x1, 12, tmp1); \ PLUS(x0, x1); XOR(x3, x0); ROTATE_SHUF(x3, shuf_rol8); \ PLUS(x2, x3); \ WORD_SHUF(x3, shuf_x3); \ XOR(x1, x2); \ WORD_SHUF(x2, shuf_x2); \ ROTATE(x1, 7, tmp1); \ WORD_SHUF(x1, shuf_x1); .align 8 .globl _gcry_chacha20_amd64_ssse3_blocks1 ELF(.type _gcry_chacha20_amd64_ssse3_blocks1,@function;) _gcry_chacha20_amd64_ssse3_blocks1: /* input: * %rdi: input * %rsi: dst * %rdx: src * %rcx: nblks */ + CFI_STARTPROC(); /* Load constants */ movdqa .Lcounter1 rRIP, X4; movdqa .Lshuf_rol8 rRIP, X5; movdqa .Lshuf_rol16 rRIP, X6; /* Load state */ movdqu (0 * 4)(INPUT), X10; movdqu (4 * 4)(INPUT), X11; movdqu (8 * 4)(INPUT), X12; movdqu (12 * 4)(INPUT), X13; cmp $2, NBLKS; jb .Loop1; mov $20, ROUND; movdqa X10, X0; movdqa X11, X1; movdqa X12, X2; movdqa X13, X3; movdqa X10, X8; movdqa X11, X9; movdqa X12, X14; movdqa X13, X15; paddq X4, X15; .Lround2_2: QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); sub $2, ROUND; jnz .Lround2_2; PLUS(X0, X10); PLUS(X1, X11); PLUS(X2, X12); PLUS(X3, X13); /* Update counter */ paddq X4, X13; PLUS(X8, X10); PLUS(X9, X11); PLUS(X14, X12); PLUS(X15, X13); /* Update counter */ paddq X4, X13; xor_src_dst(DST, SRC, 0 * 4, X0, X7); xor_src_dst(DST, SRC, 4 * 4, X1, X7); xor_src_dst(DST, SRC, 8 * 4, X2, X7); xor_src_dst(DST, SRC, 12 * 4, X3, X7); xor_src_dst(DST, SRC, 16 * 4, X8, X7); xor_src_dst(DST, SRC, 20 * 4, X9, X7); xor_src_dst(DST, SRC, 24 * 4, X14, X7); xor_src_dst(DST, SRC, 28 * 4, X15, X7); lea (2 * 64)(DST), DST; lea (2 * 64)(SRC), SRC; clear(X8); clear(X9); clear(X14); clear(X15); sub $2, NBLKS; jz .Ldone1; .Loop1: mov $20, ROUND; movdqa X10, X0; movdqa X11, X1; movdqa X12, X2; movdqa X13, X3; .Lround2_1: 
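/* Each iteration performs one column round and one diagonal round; ROUND
 * counts down from 20 in steps of two. */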
QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); sub $2, ROUND; jnz .Lround2_1; PLUS(X0, X10); PLUS(X1, X11); PLUS(X2, X12); PLUS(X3, X13); /* Update counter */ paddq X4, X13; xor_src_dst(DST, SRC, 0 * 4, X0, X7); xor_src_dst(DST, SRC, 4 * 4, X1, X7); xor_src_dst(DST, SRC, 8 * 4, X2, X7); xor_src_dst(DST, SRC, 12 * 4, X3, X7); lea (64)(DST), DST; lea (64)(SRC), SRC; sub $1, NBLKS; jnz .Loop1; .Ldone1: /* Store counter */ movdqu X13, (12 * 4)(INPUT); /* clear the used vector registers */ clear(X0); clear(X1); clear(X2); clear(X3); clear(X4); clear(X5); clear(X6); clear(X7); clear(X10); clear(X11); clear(X12); clear(X13); /* eax zeroed by round loop. */ ret; + CFI_ENDPROC(); ELF(.size _gcry_chacha20_amd64_ssse3_blocks1, .-_gcry_chacha20_amd64_ssse3_blocks1;) /********************************************************************** 4-way stitched chacha20-poly1305 **********************************************************************/ .align 8 .globl _gcry_chacha20_poly1305_amd64_ssse3_blocks4 ELF(.type _gcry_chacha20_poly1305_amd64_ssse3_blocks4,@function;) _gcry_chacha20_poly1305_amd64_ssse3_blocks4: /* input: * %rdi: input * %rsi: dst * %rdx: src * %rcx: nblks (multiple of 4) * %r9: poly1305-state * %r8: poly1305-src */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); subq $(8 * 8) + STACK_MAX + 16, %rsp; andq $~15, %rsp; movq %rbx, (STACK_MAX + 0 * 8)(%rsp); movq %r12, (STACK_MAX + 1 * 8)(%rsp); movq %r13, (STACK_MAX + 2 * 8)(%rsp); movq %r14, (STACK_MAX + 3 * 8)(%rsp); movq %r15, (STACK_MAX + 4 * 8)(%rsp); + CFI_REG_ON_STACK(rbx, STACK_MAX + 0 * 8); + CFI_REG_ON_STACK(r12, STACK_MAX + 1 * 8); + CFI_REG_ON_STACK(r13, STACK_MAX + 2 * 8); + CFI_REG_ON_STACK(r14, STACK_MAX + 3 * 8); + CFI_REG_ON_STACK(r15, STACK_MAX + 4 * 8); movq %rdx, (STACK_MAX + 5 * 8)(%rsp); # SRC movq %rsi, (STACK_MAX + 6 * 8)(%rsp); # DST movq %rcx, (STACK_MAX + 7 * 8)(%rsp); # NBLKS /* Load state */ POLY1305_LOAD_STATE(); .Loop_poly4: /* Construct counter vectors X12 and X13 */ movdqa .Linc_counter rRIP, X0; movdqa .Lunsigned_cmp rRIP, X2; pbroadcastd((12 * 4)(INPUT), X12); pbroadcastd((13 * 4)(INPUT), X13); paddd X0, X12; movdqa X12, X1; pxor X2, X0; pxor X2, X1; pcmpgtd X1, X0; psubd X0, X13; movdqa X12, (STACK_VEC_X12)(%rsp); movdqa X13, (STACK_VEC_X13)(%rsp); /* Load vectors */ pbroadcastd((0 * 4)(INPUT), X0); pbroadcastd((1 * 4)(INPUT), X1); pbroadcastd((2 * 4)(INPUT), X2); pbroadcastd((3 * 4)(INPUT), X3); pbroadcastd((4 * 4)(INPUT), X4); pbroadcastd((5 * 4)(INPUT), X5); pbroadcastd((6 * 4)(INPUT), X6); pbroadcastd((7 * 4)(INPUT), X7); pbroadcastd((8 * 4)(INPUT), X8); pbroadcastd((9 * 4)(INPUT), X9); pbroadcastd((10 * 4)(INPUT), X10); pbroadcastd((11 * 4)(INPUT), X11); pbroadcastd((14 * 4)(INPUT), X14); pbroadcastd((15 * 4)(INPUT), X15); movdqa X11, (STACK_TMP)(%rsp); movdqa X15, (STACK_TMP1)(%rsp); /* rounds 0,1 */ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, POLY1305_BLOCK_PART1(0 * 16), POLY1305_BLOCK_PART2()) movdqa (STACK_TMP)(%rsp), X11; movdqa (STACK_TMP1)(%rsp), X15; movdqa X8, (STACK_TMP)(%rsp); movdqa X9, (STACK_TMP1)(%rsp); QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4()) QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(1 * 16)) movdqa (STACK_TMP)(%rsp), X8; movdqa (STACK_TMP1)(%rsp), X9; movdqa X11, (STACK_TMP)(%rsp); movdqa X15, 
(STACK_TMP1)(%rsp); QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3()) /* rounds 2,3 */ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5()) movdqa (STACK_TMP)(%rsp), X11; movdqa (STACK_TMP1)(%rsp), X15; movdqa X8, (STACK_TMP)(%rsp); movdqa X9, (STACK_TMP1)(%rsp); QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, POLY1305_BLOCK_PART1(2 * 16), POLY1305_BLOCK_PART2()) QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4()) movdqa (STACK_TMP)(%rsp), X8; movdqa (STACK_TMP1)(%rsp), X9; movdqa X11, (STACK_TMP)(%rsp); movdqa X15, (STACK_TMP1)(%rsp); QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(3 * 16)) /* rounds 4,5 */ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3()) movdqa (STACK_TMP)(%rsp), X11; movdqa (STACK_TMP1)(%rsp), X15; movdqa X8, (STACK_TMP)(%rsp); movdqa X9, (STACK_TMP1)(%rsp); QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5()) QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, POLY1305_BLOCK_PART1(4 * 16), POLY1305_BLOCK_PART2()) movdqa (STACK_TMP)(%rsp), X8; movdqa (STACK_TMP1)(%rsp), X9; movdqa X11, (STACK_TMP)(%rsp); movdqa X15, (STACK_TMP1)(%rsp); QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4()) /* rounds 6,7 */ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(5 * 16)) movdqa (STACK_TMP)(%rsp), X11; movdqa (STACK_TMP1)(%rsp), X15; movdqa X8, (STACK_TMP)(%rsp); movdqa X9, (STACK_TMP1)(%rsp); QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3()) QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5()) movdqa (STACK_TMP)(%rsp), X8; movdqa (STACK_TMP1)(%rsp), X9; movdqa X11, (STACK_TMP)(%rsp); movdqa X15, (STACK_TMP1)(%rsp); QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, POLY1305_BLOCK_PART1(6 * 16), POLY1305_BLOCK_PART2()) /* rounds 8,9 */ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4()) movdqa (STACK_TMP)(%rsp), X11; movdqa (STACK_TMP1)(%rsp), X15; movdqa X8, (STACK_TMP)(%rsp); movdqa X9, (STACK_TMP1)(%rsp); QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(7 * 16)) QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3()) movdqa (STACK_TMP)(%rsp), X8; movdqa (STACK_TMP1)(%rsp), X9; movdqa X11, (STACK_TMP)(%rsp); movdqa X15, (STACK_TMP1)(%rsp); QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5()) /* rounds 10,11 */ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, POLY1305_BLOCK_PART1(8 * 16), POLY1305_BLOCK_PART2()) movdqa (STACK_TMP)(%rsp), X11; movdqa (STACK_TMP1)(%rsp), X15; movdqa X8, (STACK_TMP)(%rsp); movdqa X9, (STACK_TMP1)(%rsp); QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4()) QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(9 * 16)) movdqa (STACK_TMP)(%rsp), X8; movdqa (STACK_TMP1)(%rsp), X9; 
movdqa X11, (STACK_TMP)(%rsp); movdqa X15, (STACK_TMP1)(%rsp); QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3()) /* rounds 12,13 */ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5()) movdqa (STACK_TMP)(%rsp), X11; movdqa (STACK_TMP1)(%rsp), X15; movdqa X8, (STACK_TMP)(%rsp); movdqa X9, (STACK_TMP1)(%rsp); QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, POLY1305_BLOCK_PART1(10 * 16), POLY1305_BLOCK_PART2()) QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4()) movdqa (STACK_TMP)(%rsp), X8; movdqa (STACK_TMP1)(%rsp), X9; movdqa X11, (STACK_TMP)(%rsp); movdqa X15, (STACK_TMP1)(%rsp); QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(11 * 16)) /* rounds 14,15 */ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3()) movdqa (STACK_TMP)(%rsp), X11; movdqa (STACK_TMP1)(%rsp), X15; movdqa X8, (STACK_TMP)(%rsp); movdqa X9, (STACK_TMP1)(%rsp); QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5()) QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, POLY1305_BLOCK_PART1(12 * 16), POLY1305_BLOCK_PART2()) movdqa (STACK_TMP)(%rsp), X8; movdqa (STACK_TMP1)(%rsp), X9; movdqa X11, (STACK_TMP)(%rsp); movdqa X15, (STACK_TMP1)(%rsp); QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4()) /* rounds 16,17 */ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(13 * 16)) movdqa (STACK_TMP)(%rsp), X11; movdqa (STACK_TMP1)(%rsp), X15; movdqa X8, (STACK_TMP)(%rsp); movdqa X9, (STACK_TMP1)(%rsp); QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3()) QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5()) movdqa (STACK_TMP)(%rsp), X8; movdqa (STACK_TMP1)(%rsp), X9; movdqa X11, (STACK_TMP)(%rsp); movdqa X15, (STACK_TMP1)(%rsp); QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, POLY1305_BLOCK_PART1(14 * 16), POLY1305_BLOCK_PART2()) /* rounds 18,19 */ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4()) movdqa (STACK_TMP)(%rsp), X11; movdqa (STACK_TMP1)(%rsp), X15; movdqa X8, (STACK_TMP)(%rsp); movdqa X9, (STACK_TMP1)(%rsp); QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, POLY1305_BLOCK_PART5(), POLY1305_BLOCK_PART1(15 * 16)) QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3()) movdqa (STACK_TMP)(%rsp), X8; movdqa (STACK_TMP1)(%rsp), X9; movdqa X11, (STACK_TMP)(%rsp); movdqa X15, (STACK_TMP1)(%rsp); QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5()) /* tmp := X15 */ movdqa (STACK_TMP)(%rsp), X11; pbroadcastd((0 * 4)(INPUT), X15); PLUS(X0, X15); pbroadcastd((1 * 4)(INPUT), X15); PLUS(X1, X15); pbroadcastd((2 * 4)(INPUT), X15); PLUS(X2, X15); pbroadcastd((3 * 4)(INPUT), X15); PLUS(X3, X15); pbroadcastd((4 * 4)(INPUT), X15); PLUS(X4, X15); pbroadcastd((5 * 4)(INPUT), X15); PLUS(X5, X15); pbroadcastd((6 * 4)(INPUT), X15); PLUS(X6, X15); pbroadcastd((7 * 4)(INPUT), X15); PLUS(X7, X15); pbroadcastd((8 * 4)(INPUT), X15); PLUS(X8, X15); 
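/* Feed-forward continues: broadcast the remaining input words into X15 and
 * add them to the working state. */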
pbroadcastd((9 * 4)(INPUT), X15); PLUS(X9, X15); pbroadcastd((10 * 4)(INPUT), X15); PLUS(X10, X15); pbroadcastd((11 * 4)(INPUT), X15); PLUS(X11, X15); movdqa (STACK_VEC_X12)(%rsp), X15; PLUS(X12, X15); movdqa (STACK_VEC_X13)(%rsp), X15; PLUS(X13, X15); movdqa X13, (STACK_TMP)(%rsp); pbroadcastd((14 * 4)(INPUT), X15); PLUS(X14, X15); movdqa (STACK_TMP1)(%rsp), X15; movdqa X14, (STACK_TMP1)(%rsp); pbroadcastd((15 * 4)(INPUT), X13); PLUS(X15, X13); movdqa X15, (STACK_TMP2)(%rsp); /* Update counter */ addq $4, (12 * 4)(INPUT); movq (STACK_MAX + 5 * 8)(%rsp), SRC; movq (STACK_MAX + 6 * 8)(%rsp), DST; transpose_4x4(X0, X1, X2, X3, X13, X14, X15); xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0, X15); xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1, X15); xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2, X15); xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3, X15); transpose_4x4(X4, X5, X6, X7, X0, X1, X2); movdqa (STACK_TMP)(%rsp), X13; movdqa (STACK_TMP1)(%rsp), X14; movdqa (STACK_TMP2)(%rsp), X15; xor_src_dst(DST, SRC, (64 * 0 + 16 * 1), X4, X0); xor_src_dst(DST, SRC, (64 * 1 + 16 * 1), X5, X0); xor_src_dst(DST, SRC, (64 * 2 + 16 * 1), X6, X0); xor_src_dst(DST, SRC, (64 * 3 + 16 * 1), X7, X0); transpose_4x4(X8, X9, X10, X11, X0, X1, X2); xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8, X0); xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9, X0); xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10, X0); xor_src_dst(DST, SRC, (64 * 3 + 16 * 2), X11, X0); transpose_4x4(X12, X13, X14, X15, X0, X1, X2); xor_src_dst(DST, SRC, (64 * 0 + 16 * 3), X12, X0); xor_src_dst(DST, SRC, (64 * 1 + 16 * 3), X13, X0); xor_src_dst(DST, SRC, (64 * 2 + 16 * 3), X14, X0); xor_src_dst(DST, SRC, (64 * 3 + 16 * 3), X15, X0); subq $4, (STACK_MAX + 7 * 8)(%rsp); # NBLKS lea (16 * 16)(POLY_RSRC), POLY_RSRC; lea (4 * 64)(DST), DST; lea (4 * 64)(SRC), SRC; movq SRC, (STACK_MAX + 5 * 8)(%rsp); movq DST, (STACK_MAX + 6 * 8)(%rsp); jnz .Loop_poly4; /* Store state */ POLY1305_STORE_STATE(); /* clear the used vector registers and stack */ clear(X0); movdqa X0, (STACK_VEC_X12)(%rsp); movdqa X0, (STACK_VEC_X13)(%rsp); movdqa X0, (STACK_TMP)(%rsp); movdqa X0, (STACK_TMP1)(%rsp); movdqa X0, (STACK_TMP2)(%rsp); clear(X1); clear(X2); clear(X3); clear(X4); clear(X5); clear(X6); clear(X7); clear(X8); clear(X9); clear(X10); clear(X11); clear(X12); clear(X13); clear(X14); clear(X15); movq (STACK_MAX + 0 * 8)(%rsp), %rbx; movq (STACK_MAX + 1 * 8)(%rsp), %r12; movq (STACK_MAX + 2 * 8)(%rsp), %r13; movq (STACK_MAX + 3 * 8)(%rsp), %r14; movq (STACK_MAX + 4 * 8)(%rsp), %r15; + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); xorl %eax, %eax; leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_chacha20_poly1305_amd64_ssse3_blocks4, .-_gcry_chacha20_poly1305_amd64_ssse3_blocks4;) /********************************************************************** 2-way && 1-way stitched chacha20-poly1305 **********************************************************************/ .align 8 .globl _gcry_chacha20_poly1305_amd64_ssse3_blocks1 ELF(.type _gcry_chacha20_poly1305_amd64_ssse3_blocks1,@function;) _gcry_chacha20_poly1305_amd64_ssse3_blocks1: /* input: * %rdi: chacha20-state * %rsi: dst * %rdx: src * %rcx: nblks * %r9: poly1305-state * %r8: poly1305-src */ + CFI_STARTPROC(); + pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); subq $(8 * 8), %rsp; movq %rbx, (0 * 8)(%rsp); movq %r12, (1 * 8)(%rsp); movq %r13, (2 * 8)(%rsp); movq %r14, (3 * 8)(%rsp); movq %r15, (4 * 8)(%rsp); + CFI_REG_ON_STACK(rbx, 0 * 8); 
+ CFI_REG_ON_STACK(r12, 1 * 8); + CFI_REG_ON_STACK(r13, 2 * 8); + CFI_REG_ON_STACK(r14, 3 * 8); + CFI_REG_ON_STACK(r15, 4 * 8); movq %rdx, (5 * 8)(%rsp); # SRC movq %rsi, (6 * 8)(%rsp); # DST movq %rcx, (7 * 8)(%rsp); # NBLKS /* Load constants */ movdqa .Lcounter1 rRIP, X4; movdqa .Lshuf_rol8 rRIP, X5; movdqa .Lshuf_rol16 rRIP, X6; /* Load state */ movdqu (0 * 4)(INPUT), X10; movdqu (4 * 4)(INPUT), X11; movdqu (8 * 4)(INPUT), X12; movdqu (12 * 4)(INPUT), X13; POLY1305_LOAD_STATE(); cmpq $2, (7 * 8)(%rsp); #NBLKS jb .Loop_poly1; movdqa X10, X0; movdqa X11, X1; movdqa X12, X2; movdqa X13, X3; movdqa X10, X8; movdqa X11, X9; movdqa X12, X14; movdqa X13, X15; paddq X4, X15; /* Process two ChaCha20 blocks and eight Poly1305 blocks. */ POLY1305_BLOCK_PART1(0 * 16); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART2(); QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART3(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); POLY1305_BLOCK_PART4(); QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); POLY1305_BLOCK_PART5(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART1(1 * 16); QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART2(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); POLY1305_BLOCK_PART3(); QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); POLY1305_BLOCK_PART4(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART5(); QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART1(2 * 16); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); POLY1305_BLOCK_PART2(); QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); POLY1305_BLOCK_PART3(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART4(); QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART5(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); POLY1305_BLOCK_PART1(3 * 16); QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); POLY1305_BLOCK_PART2(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART3(); QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART4(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); POLY1305_BLOCK_PART5(); QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); POLY1305_BLOCK_PART1(4 * 16); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART2(); QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART3(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); POLY1305_BLOCK_PART4(); QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); POLY1305_BLOCK_PART5(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART1(5 * 16); QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART2(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); POLY1305_BLOCK_PART3(); QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); POLY1305_BLOCK_PART4(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART5(); QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART1(6 * 16); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); POLY1305_BLOCK_PART2(); QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); 
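/* The Poly1305 block parts run on the general-purpose registers while the
 * ChaCha20 quarter rounds run on the XMM registers, so interleaving the two
 * lets the integer and vector units work in parallel. */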
POLY1305_BLOCK_PART3(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART4(); QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART5(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); POLY1305_BLOCK_PART1(7 * 16); QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); POLY1305_BLOCK_PART2(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART3(); QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART4(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); POLY1305_BLOCK_PART5(); QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); movq (5 * 8)(%rsp), SRC; movq (6 * 8)(%rsp), DST; PLUS(X0, X10); PLUS(X1, X11); PLUS(X2, X12); PLUS(X3, X13); /* Update counter */ paddq X4, X13; PLUS(X8, X10); PLUS(X9, X11); PLUS(X14, X12); PLUS(X15, X13); /* Update counter */ paddq X4, X13; xor_src_dst(DST, SRC, 0 * 4, X0, X7); xor_src_dst(DST, SRC, 4 * 4, X1, X7); xor_src_dst(DST, SRC, 8 * 4, X2, X7); xor_src_dst(DST, SRC, 12 * 4, X3, X7); xor_src_dst(DST, SRC, 16 * 4, X8, X7); xor_src_dst(DST, SRC, 20 * 4, X9, X7); xor_src_dst(DST, SRC, 24 * 4, X14, X7); xor_src_dst(DST, SRC, 28 * 4, X15, X7); clear(X8); clear(X9); clear(X14); clear(X15); subq $2, (7 * 8)(%rsp); # NBLKS lea (2 * 64)(POLY_RSRC), POLY_RSRC; lea (2 * 64)(SRC), SRC; lea (2 * 64)(DST), DST; movq SRC, (5 * 8)(%rsp); movq DST, (6 * 8)(%rsp); jz .Ldone_poly1; .Loop_poly1: movdqa X10, X0; movdqa X11, X1; movdqa X12, X2; movdqa X13, X3; /* Process one ChaCha20 block and four Poly1305 blocks. */ POLY1305_BLOCK_PART1(0 * 16); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART2(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); POLY1305_BLOCK_PART3(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART4(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); POLY1305_BLOCK_PART5(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART1(1 * 16); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); POLY1305_BLOCK_PART2(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART3(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); POLY1305_BLOCK_PART4(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART5(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); POLY1305_BLOCK_PART1(2 * 16); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART2(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); POLY1305_BLOCK_PART3(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART4(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); POLY1305_BLOCK_PART5(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART1(3 * 16); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); POLY1305_BLOCK_PART2(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART3(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); POLY1305_BLOCK_PART4(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART5(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); movq (5 * 8)(%rsp), SRC; movq (6 * 8)(%rsp), DST; PLUS(X0, X10); PLUS(X1, X11); PLUS(X2, X12); PLUS(X3, X13); /* Update counter */ paddq X4, X13; xor_src_dst(DST, SRC, 0 * 4, X0, X7); xor_src_dst(DST, SRC, 4 * 4, X1, 
X7); xor_src_dst(DST, SRC, 8 * 4, X2, X7); xor_src_dst(DST, SRC, 12 * 4, X3, X7); subq $1, (7 * 8)(%rsp); # NBLKS lea (64)(POLY_RSRC), POLY_RSRC; lea (64)(SRC), SRC; lea (64)(DST), DST; movq SRC, (5 * 8)(%rsp); movq DST, (6 * 8)(%rsp); jnz .Loop_poly1; .Ldone_poly1: /* Store state */ POLY1305_STORE_STATE(); movdqu X13, (12 * 4)(INPUT); /* clear the used vector registers */ clear(X0); clear(X1); clear(X2); clear(X3); clear(X4); clear(X5); clear(X6); clear(X7); clear(X10); clear(X11); clear(X12); clear(X13); movq (0 * 8)(%rsp), %rbx; movq (1 * 8)(%rsp), %r12; movq (2 * 8)(%rsp), %r13; movq (3 * 8)(%rsp), %r14; movq (4 * 8)(%rsp), %r15; + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); xorl %eax, %eax; leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_chacha20_poly1305_amd64_ssse3_blocks1, .-_gcry_chacha20_poly1305_amd64_ssse3_blocks1;) #endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/ #endif /*__x86_64*/ diff --git a/cipher/des-amd64.S b/cipher/des-amd64.S index f25573d9..a211dac3 100644 --- a/cipher/des-amd64.S +++ b/cipher/des-amd64.S @@ -1,1037 +1,1111 @@ /* des-amd64.S - AMD64 assembly implementation of 3DES cipher * * Copyright (C) 2014 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
*/ #ifdef __x86_64 #include #if defined(USE_DES) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) #include "asm-common-amd64.h" .text #define s1 0 #define s2 ((s1) + (64*8)) #define s3 ((s2) + (64*8)) #define s4 ((s3) + (64*8)) #define s5 ((s4) + (64*8)) #define s6 ((s5) + (64*8)) #define s7 ((s6) + (64*8)) #define s8 ((s7) + (64*8)) /* register macros */ #define CTX %rdi #define SBOXES %rbp #define RL0 %r8 #define RL1 %r9 #define RL2 %r10 #define RL0d %r8d #define RL1d %r9d #define RL2d %r10d #define RR0 %r11 #define RR1 %r12 #define RR2 %r13 #define RR0d %r11d #define RR1d %r12d #define RR2d %r13d #define RW0 %rax #define RW1 %rbx #define RW2 %rcx #define RW0d %eax #define RW1d %ebx #define RW2d %ecx #define RW0bl %al #define RW1bl %bl #define RW2bl %cl #define RW0bh %ah #define RW1bh %bh #define RW2bh %ch #define RT0 %r15 #define RT1 %rsi #define RT2 %r14 #define RT3 %rdx #define RT0d %r15d #define RT1d %esi #define RT2d %r14d #define RT3d %edx /*********************************************************************** * 1-way 3DES ***********************************************************************/ #define do_permutation(a, b, offset, mask) \ movl a, RT0d; \ shrl $(offset), RT0d; \ xorl b, RT0d; \ andl $(mask), RT0d; \ xorl RT0d, b; \ shll $(offset), RT0d; \ xorl RT0d, a; #define expand_to_64bits(val, mask) \ movl val##d, RT0d; \ rorl $4, RT0d; \ shlq $32, RT0; \ orq RT0, val; \ andq mask, val; #define compress_to_64bits(val) \ movq val, RT0; \ shrq $32, RT0; \ roll $4, RT0d; \ orl RT0d, val##d; #define initial_permutation(left, right) \ do_permutation(left##d, right##d, 4, 0x0f0f0f0f); \ do_permutation(left##d, right##d, 16, 0x0000ffff); \ do_permutation(right##d, left##d, 2, 0x33333333); \ do_permutation(right##d, left##d, 8, 0x00ff00ff); \ movabs $0x3f3f3f3f3f3f3f3f, RT3; \ movl left##d, RW0d; \ roll $1, right##d; \ xorl right##d, RW0d; \ andl $0xaaaaaaaa, RW0d; \ xorl RW0d, left##d; \ xorl RW0d, right##d; \ roll $1, left##d; \ expand_to_64bits(right, RT3); \ expand_to_64bits(left, RT3); #define final_permutation(left, right) \ compress_to_64bits(right); \ compress_to_64bits(left); \ movl right##d, RW0d; \ rorl $1, left##d; \ xorl left##d, RW0d; \ andl $0xaaaaaaaa, RW0d; \ xorl RW0d, right##d; \ xorl RW0d, left##d; \ rorl $1, right##d; \ do_permutation(right##d, left##d, 8, 0x00ff00ff); \ do_permutation(right##d, left##d, 2, 0x33333333); \ do_permutation(left##d, right##d, 16, 0x0000ffff); \ do_permutation(left##d, right##d, 4, 0x0f0f0f0f); #define round1(n, from, to, load_next_key) \ xorq from, RW0; \ \ movzbl RW0bl, RT0d; \ movzbl RW0bh, RT1d; \ shrq $16, RW0; \ movzbl RW0bl, RT2d; \ movzbl RW0bh, RT3d; \ shrq $16, RW0; \ movq s8(SBOXES, RT0, 8), RT0; \ xorq s6(SBOXES, RT1, 8), to; \ movzbl RW0bl, RL1d; \ movzbl RW0bh, RT1d; \ shrl $16, RW0d; \ xorq s4(SBOXES, RT2, 8), RT0; \ xorq s2(SBOXES, RT3, 8), to; \ movzbl RW0bl, RT2d; \ movzbl RW0bh, RT3d; \ xorq s7(SBOXES, RL1, 8), RT0; \ xorq s5(SBOXES, RT1, 8), to; \ xorq s3(SBOXES, RT2, 8), RT0; \ load_next_key(n, RW0); \ xorq RT0, to; \ xorq s1(SBOXES, RT3, 8), to; \ #define load_next_key(n, RWx) \ movq (((n) + 1) * 8)(CTX), RWx; #define dummy2(a, b) /*_*/ #define read_block(io, left, right) \ movl (io), left##d; \ movl 4(io), right##d; \ bswapl left##d; \ bswapl right##d; #define write_block(io, left, right) \ bswapl left##d; \ bswapl right##d; \ movl left##d, (io); \ movl right##d, 4(io); .align 8 .globl _gcry_3des_amd64_crypt_block ELF(.type 
_gcry_3des_amd64_crypt_block,@function;) _gcry_3des_amd64_crypt_block: /* input: * %rdi: round keys, CTX * %rsi: dst * %rdx: src */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); pushq %r12; + CFI_PUSH(%r12); pushq %r13; + CFI_PUSH(%r13); pushq %r14; + CFI_PUSH(%r14); pushq %r15; + CFI_PUSH(%r15); pushq %rsi; /*dst*/ + CFI_PUSH(%rsi); leaq .L_s1 rRIP, SBOXES; read_block(%rdx, RL0, RR0); initial_permutation(RL0, RR0); movq (CTX), RW0; round1(0, RR0, RL0, load_next_key); round1(1, RL0, RR0, load_next_key); round1(2, RR0, RL0, load_next_key); round1(3, RL0, RR0, load_next_key); round1(4, RR0, RL0, load_next_key); round1(5, RL0, RR0, load_next_key); round1(6, RR0, RL0, load_next_key); round1(7, RL0, RR0, load_next_key); round1(8, RR0, RL0, load_next_key); round1(9, RL0, RR0, load_next_key); round1(10, RR0, RL0, load_next_key); round1(11, RL0, RR0, load_next_key); round1(12, RR0, RL0, load_next_key); round1(13, RL0, RR0, load_next_key); round1(14, RR0, RL0, load_next_key); round1(15, RL0, RR0, load_next_key); round1(16+0, RL0, RR0, load_next_key); round1(16+1, RR0, RL0, load_next_key); round1(16+2, RL0, RR0, load_next_key); round1(16+3, RR0, RL0, load_next_key); round1(16+4, RL0, RR0, load_next_key); round1(16+5, RR0, RL0, load_next_key); round1(16+6, RL0, RR0, load_next_key); round1(16+7, RR0, RL0, load_next_key); round1(16+8, RL0, RR0, load_next_key); round1(16+9, RR0, RL0, load_next_key); round1(16+10, RL0, RR0, load_next_key); round1(16+11, RR0, RL0, load_next_key); round1(16+12, RL0, RR0, load_next_key); round1(16+13, RR0, RL0, load_next_key); round1(16+14, RL0, RR0, load_next_key); round1(16+15, RR0, RL0, load_next_key); round1(32+0, RR0, RL0, load_next_key); round1(32+1, RL0, RR0, load_next_key); round1(32+2, RR0, RL0, load_next_key); round1(32+3, RL0, RR0, load_next_key); round1(32+4, RR0, RL0, load_next_key); round1(32+5, RL0, RR0, load_next_key); round1(32+6, RR0, RL0, load_next_key); round1(32+7, RL0, RR0, load_next_key); round1(32+8, RR0, RL0, load_next_key); round1(32+9, RL0, RR0, load_next_key); round1(32+10, RR0, RL0, load_next_key); round1(32+11, RL0, RR0, load_next_key); round1(32+12, RR0, RL0, load_next_key); round1(32+13, RL0, RR0, load_next_key); round1(32+14, RR0, RL0, load_next_key); round1(32+15, RL0, RR0, dummy2); popq RW2; /*dst*/ + CFI_POP_TMP_REG(); final_permutation(RR0, RL0); write_block(RW2, RR0, RL0); popq %r15; + CFI_POP(%r15); popq %r14; + CFI_POP(%r14); popq %r13; + CFI_POP(%r13); popq %r12; + CFI_POP(%r12); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_crypt_block,.-_gcry_3des_amd64_crypt_block;) /*********************************************************************** * 3-way 3DES ***********************************************************************/ #define expand_to_64bits(val, mask) \ movl val##d, RT0d; \ rorl $4, RT0d; \ shlq $32, RT0; \ orq RT0, val; \ andq mask, val; #define compress_to_64bits(val) \ movq val, RT0; \ shrq $32, RT0; \ roll $4, RT0d; \ orl RT0d, val##d; #define initial_permutation3(left, right) \ do_permutation(left##0d, right##0d, 4, 0x0f0f0f0f); \ do_permutation(left##0d, right##0d, 16, 0x0000ffff); \ do_permutation(left##1d, right##1d, 4, 0x0f0f0f0f); \ do_permutation(left##1d, right##1d, 16, 0x0000ffff); \ do_permutation(left##2d, right##2d, 4, 0x0f0f0f0f); \ do_permutation(left##2d, right##2d, 16, 0x0000ffff); \ \ do_permutation(right##0d, left##0d, 2, 0x33333333); \ do_permutation(right##0d, left##0d, 
8, 0x00ff00ff); \ do_permutation(right##1d, left##1d, 2, 0x33333333); \ do_permutation(right##1d, left##1d, 8, 0x00ff00ff); \ do_permutation(right##2d, left##2d, 2, 0x33333333); \ do_permutation(right##2d, left##2d, 8, 0x00ff00ff); \ \ movabs $0x3f3f3f3f3f3f3f3f, RT3; \ \ movl left##0d, RW0d; \ roll $1, right##0d; \ xorl right##0d, RW0d; \ andl $0xaaaaaaaa, RW0d; \ xorl RW0d, left##0d; \ xorl RW0d, right##0d; \ roll $1, left##0d; \ expand_to_64bits(right##0, RT3); \ expand_to_64bits(left##0, RT3); \ movl left##1d, RW1d; \ roll $1, right##1d; \ xorl right##1d, RW1d; \ andl $0xaaaaaaaa, RW1d; \ xorl RW1d, left##1d; \ xorl RW1d, right##1d; \ roll $1, left##1d; \ expand_to_64bits(right##1, RT3); \ expand_to_64bits(left##1, RT3); \ movl left##2d, RW2d; \ roll $1, right##2d; \ xorl right##2d, RW2d; \ andl $0xaaaaaaaa, RW2d; \ xorl RW2d, left##2d; \ xorl RW2d, right##2d; \ roll $1, left##2d; \ expand_to_64bits(right##2, RT3); \ expand_to_64bits(left##2, RT3); #define final_permutation3(left, right) \ compress_to_64bits(right##0); \ compress_to_64bits(left##0); \ movl right##0d, RW0d; \ rorl $1, left##0d; \ xorl left##0d, RW0d; \ andl $0xaaaaaaaa, RW0d; \ xorl RW0d, right##0d; \ xorl RW0d, left##0d; \ rorl $1, right##0d; \ compress_to_64bits(right##1); \ compress_to_64bits(left##1); \ movl right##1d, RW1d; \ rorl $1, left##1d; \ xorl left##1d, RW1d; \ andl $0xaaaaaaaa, RW1d; \ xorl RW1d, right##1d; \ xorl RW1d, left##1d; \ rorl $1, right##1d; \ compress_to_64bits(right##2); \ compress_to_64bits(left##2); \ movl right##2d, RW2d; \ rorl $1, left##2d; \ xorl left##2d, RW2d; \ andl $0xaaaaaaaa, RW2d; \ xorl RW2d, right##2d; \ xorl RW2d, left##2d; \ rorl $1, right##2d; \ \ do_permutation(right##0d, left##0d, 8, 0x00ff00ff); \ do_permutation(right##0d, left##0d, 2, 0x33333333); \ do_permutation(right##1d, left##1d, 8, 0x00ff00ff); \ do_permutation(right##1d, left##1d, 2, 0x33333333); \ do_permutation(right##2d, left##2d, 8, 0x00ff00ff); \ do_permutation(right##2d, left##2d, 2, 0x33333333); \ \ do_permutation(left##0d, right##0d, 16, 0x0000ffff); \ do_permutation(left##0d, right##0d, 4, 0x0f0f0f0f); \ do_permutation(left##1d, right##1d, 16, 0x0000ffff); \ do_permutation(left##1d, right##1d, 4, 0x0f0f0f0f); \ do_permutation(left##2d, right##2d, 16, 0x0000ffff); \ do_permutation(left##2d, right##2d, 4, 0x0f0f0f0f); #define round3(n, from, to, load_next_key, do_movq) \ xorq from##0, RW0; \ movzbl RW0bl, RT3d; \ movzbl RW0bh, RT1d; \ shrq $16, RW0; \ xorq s8(SBOXES, RT3, 8), to##0; \ xorq s6(SBOXES, RT1, 8), to##0; \ movzbl RW0bl, RT3d; \ movzbl RW0bh, RT1d; \ shrq $16, RW0; \ xorq s4(SBOXES, RT3, 8), to##0; \ xorq s2(SBOXES, RT1, 8), to##0; \ movzbl RW0bl, RT3d; \ movzbl RW0bh, RT1d; \ shrl $16, RW0d; \ xorq s7(SBOXES, RT3, 8), to##0; \ xorq s5(SBOXES, RT1, 8), to##0; \ movzbl RW0bl, RT3d; \ movzbl RW0bh, RT1d; \ load_next_key(n, RW0); \ xorq s3(SBOXES, RT3, 8), to##0; \ xorq s1(SBOXES, RT1, 8), to##0; \ xorq from##1, RW1; \ movzbl RW1bl, RT3d; \ movzbl RW1bh, RT1d; \ shrq $16, RW1; \ xorq s8(SBOXES, RT3, 8), to##1; \ xorq s6(SBOXES, RT1, 8), to##1; \ movzbl RW1bl, RT3d; \ movzbl RW1bh, RT1d; \ shrq $16, RW1; \ xorq s4(SBOXES, RT3, 8), to##1; \ xorq s2(SBOXES, RT1, 8), to##1; \ movzbl RW1bl, RT3d; \ movzbl RW1bh, RT1d; \ shrl $16, RW1d; \ xorq s7(SBOXES, RT3, 8), to##1; \ xorq s5(SBOXES, RT1, 8), to##1; \ movzbl RW1bl, RT3d; \ movzbl RW1bh, RT1d; \ do_movq(RW0, RW1); \ xorq s3(SBOXES, RT3, 8), to##1; \ xorq s1(SBOXES, RT1, 8), to##1; \ xorq from##2, RW2; \ movzbl RW2bl, RT3d; \ movzbl RW2bh, RT1d; \ shrq 
$16, RW2; \ xorq s8(SBOXES, RT3, 8), to##2; \ xorq s6(SBOXES, RT1, 8), to##2; \ movzbl RW2bl, RT3d; \ movzbl RW2bh, RT1d; \ shrq $16, RW2; \ xorq s4(SBOXES, RT3, 8), to##2; \ xorq s2(SBOXES, RT1, 8), to##2; \ movzbl RW2bl, RT3d; \ movzbl RW2bh, RT1d; \ shrl $16, RW2d; \ xorq s7(SBOXES, RT3, 8), to##2; \ xorq s5(SBOXES, RT1, 8), to##2; \ movzbl RW2bl, RT3d; \ movzbl RW2bh, RT1d; \ do_movq(RW0, RW2); \ xorq s3(SBOXES, RT3, 8), to##2; \ xorq s1(SBOXES, RT1, 8), to##2; #define __movq(src, dst) \ movq src, dst; #define read_block(io, left, right) \ movl (io), left##d; \ movl 4(io), right##d; \ bswapl left##d; \ bswapl right##d; #define write_block(io, left, right) \ bswapl left##d; \ bswapl right##d; \ movl left##d, (io); \ movl right##d, 4(io); .align 8 ELF(.type _gcry_3des_amd64_crypt_blk3,@function;) _gcry_3des_amd64_crypt_blk3: /* input: * %rdi: round keys, CTX * RL0d, RR0d, RL1d, RR1d, RL2d, RR2d: 3 input blocks * RR0d, RL0d, RR1d, RL1d, RR2d, RL2d: 3 output blocks */ + CFI_STARTPROC(); leaq .L_s1 rRIP, SBOXES; initial_permutation3(RL, RR); movq 0(CTX), RW0; movq RW0, RW1; movq RW0, RW2; round3(0, RR, RL, load_next_key, __movq); round3(1, RL, RR, load_next_key, __movq); round3(2, RR, RL, load_next_key, __movq); round3(3, RL, RR, load_next_key, __movq); round3(4, RR, RL, load_next_key, __movq); round3(5, RL, RR, load_next_key, __movq); round3(6, RR, RL, load_next_key, __movq); round3(7, RL, RR, load_next_key, __movq); round3(8, RR, RL, load_next_key, __movq); round3(9, RL, RR, load_next_key, __movq); round3(10, RR, RL, load_next_key, __movq); round3(11, RL, RR, load_next_key, __movq); round3(12, RR, RL, load_next_key, __movq); round3(13, RL, RR, load_next_key, __movq); round3(14, RR, RL, load_next_key, __movq); round3(15, RL, RR, load_next_key, __movq); round3(16+0, RL, RR, load_next_key, __movq); round3(16+1, RR, RL, load_next_key, __movq); round3(16+2, RL, RR, load_next_key, __movq); round3(16+3, RR, RL, load_next_key, __movq); round3(16+4, RL, RR, load_next_key, __movq); round3(16+5, RR, RL, load_next_key, __movq); round3(16+6, RL, RR, load_next_key, __movq); round3(16+7, RR, RL, load_next_key, __movq); round3(16+8, RL, RR, load_next_key, __movq); round3(16+9, RR, RL, load_next_key, __movq); round3(16+10, RL, RR, load_next_key, __movq); round3(16+11, RR, RL, load_next_key, __movq); round3(16+12, RL, RR, load_next_key, __movq); round3(16+13, RR, RL, load_next_key, __movq); round3(16+14, RL, RR, load_next_key, __movq); round3(16+15, RR, RL, load_next_key, __movq); round3(32+0, RR, RL, load_next_key, __movq); round3(32+1, RL, RR, load_next_key, __movq); round3(32+2, RR, RL, load_next_key, __movq); round3(32+3, RL, RR, load_next_key, __movq); round3(32+4, RR, RL, load_next_key, __movq); round3(32+5, RL, RR, load_next_key, __movq); round3(32+6, RR, RL, load_next_key, __movq); round3(32+7, RL, RR, load_next_key, __movq); round3(32+8, RR, RL, load_next_key, __movq); round3(32+9, RL, RR, load_next_key, __movq); round3(32+10, RR, RL, load_next_key, __movq); round3(32+11, RL, RR, load_next_key, __movq); round3(32+12, RR, RL, load_next_key, __movq); round3(32+13, RL, RR, load_next_key, __movq); round3(32+14, RR, RL, load_next_key, __movq); round3(32+15, RL, RR, dummy2, dummy2); final_permutation3(RR, RL); ret; + CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_crypt_blk3,.-_gcry_3des_amd64_crypt_blk3;) .align 8 .globl _gcry_3des_amd64_cbc_dec ELF(.type _gcry_3des_amd64_cbc_dec,@function;) _gcry_3des_amd64_cbc_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (3 blocks) * %rdx: src (3 blocks) * %rcx: iv (64bit) 
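
The round1/round3 macros implement one DES round per block as eight 64-bit table lookups: the expanded half-block is XORed with the per-round subkey (stored pre-spread so the same byte layout applies), and each byte selects one entry of the merged S-box/permutation tables .L_s1 … .L_s8. A rough C equivalent — the sbox array and the byte-to-table mapping are our shorthand for the s1…s8 offset macros defined earlier in the file:

#include <stdint.h>

/* One DES round in the style of round3(): 'from' and 'subkey' are in the
 * expanded 6-bits-per-byte representation, sbox[i] stands for .L_s(i+1). */
static uint64_t des_round(uint64_t to, uint64_t from, uint64_t subkey,
                          const uint64_t sbox[8][64])
{
  /* Byte k of (from ^ subkey) indexes this table (low byte first),
   * matching the movzbl/shr sequence in the macro. */
  static const int box_for_byte[8] = { 8, 6, 4, 2, 7, 5, 3, 1 };
  uint64_t w = from ^ subkey;

  for (int i = 0; i < 8; i++, w >>= 8)
    to ^= sbox[box_for_byte[i] - 1][w & 0x3f]; /* the asm uses the whole byte,
                                                  which is < 64 by construction */
  return to;
}
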
*/ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); pushq %r12; + CFI_PUSH(%r12); pushq %r13; + CFI_PUSH(%r13); pushq %r14; + CFI_PUSH(%r14); pushq %r15; + CFI_PUSH(%r15); pushq %rsi; /*dst*/ + CFI_PUSH(%rsi); pushq %rdx; /*src*/ + CFI_PUSH(%rdx); pushq %rcx; /*iv*/ + CFI_PUSH(%rcx); /* load input */ movl 0 * 4(%rdx), RL0d; movl 1 * 4(%rdx), RR0d; movl 2 * 4(%rdx), RL1d; movl 3 * 4(%rdx), RR1d; movl 4 * 4(%rdx), RL2d; movl 5 * 4(%rdx), RR2d; bswapl RL0d; bswapl RR0d; bswapl RL1d; bswapl RR1d; bswapl RL2d; bswapl RR2d; call _gcry_3des_amd64_crypt_blk3; popq %rcx; /*iv*/ + CFI_POP_TMP_REG(); popq %rdx; /*src*/ + CFI_POP_TMP_REG(); popq %rsi; /*dst*/ + CFI_POP_TMP_REG(); bswapl RR0d; bswapl RL0d; bswapl RR1d; bswapl RL1d; bswapl RR2d; bswapl RL2d; movq 2 * 8(%rdx), RT0; xorl 0 * 4(%rcx), RR0d; xorl 1 * 4(%rcx), RL0d; xorl 0 * 4(%rdx), RR1d; xorl 1 * 4(%rdx), RL1d; xorl 2 * 4(%rdx), RR2d; xorl 3 * 4(%rdx), RL2d; movq RT0, (%rcx); /* store new IV */ movl RR0d, 0 * 4(%rsi); movl RL0d, 1 * 4(%rsi); movl RR1d, 2 * 4(%rsi); movl RL1d, 3 * 4(%rsi); movl RR2d, 4 * 4(%rsi); movl RL2d, 5 * 4(%rsi); popq %r15; + CFI_POP(%r15); popq %r14; + CFI_POP(%r14); popq %r13; + CFI_POP(%r13); popq %r12; + CFI_POP(%r12); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec;) .align 8 .globl _gcry_3des_amd64_ctr_enc ELF(.type _gcry_3des_amd64_ctr_enc,@function;) _gcry_3des_amd64_ctr_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (3 blocks) * %rdx: src (3 blocks) * %rcx: iv (64bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); pushq %r12; + CFI_PUSH(%r12); pushq %r13; + CFI_PUSH(%r13); pushq %r14; + CFI_PUSH(%r14); pushq %r15; + CFI_PUSH(%r15); pushq %rsi; /*dst*/ + CFI_PUSH(%rsi); pushq %rdx; /*src*/ + CFI_PUSH(%rdx); movq %rcx, RW2; /* load IV and byteswap */ movq (RW2), RT0; bswapq RT0; movq RT0, RR0; /* construct IVs */ leaq 1(RT0), RR1; leaq 2(RT0), RR2; leaq 3(RT0), RT0; movq RR0, RL0; movq RR1, RL1; movq RR2, RL2; bswapq RT0; shrq $32, RL0; shrq $32, RL1; shrq $32, RL2; /* store new IV */ movq RT0, (RW2); call _gcry_3des_amd64_crypt_blk3; popq %rdx; /*src*/ + CFI_POP_TMP_REG(); popq %rsi; /*dst*/ + CFI_POP_TMP_REG(); bswapl RR0d; bswapl RL0d; bswapl RR1d; bswapl RL1d; bswapl RR2d; bswapl RL2d; xorl 0 * 4(%rdx), RR0d; xorl 1 * 4(%rdx), RL0d; xorl 2 * 4(%rdx), RR1d; xorl 3 * 4(%rdx), RL1d; xorl 4 * 4(%rdx), RR2d; xorl 5 * 4(%rdx), RL2d; movl RR0d, 0 * 4(%rsi); movl RL0d, 1 * 4(%rsi); movl RR1d, 2 * 4(%rsi); movl RL1d, 3 * 4(%rsi); movl RR2d, 4 * 4(%rsi); movl RL2d, 5 * 4(%rsi); popq %r15; + CFI_POP(%r15); popq %r14; + CFI_POP(%r14); popq %r13; + CFI_POP(%r13); popq %r12; + CFI_POP(%r12); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec;) .align 8 .globl _gcry_3des_amd64_cfb_dec ELF(.type _gcry_3des_amd64_cfb_dec,@function;) _gcry_3des_amd64_cfb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (3 blocks) * %rdx: src (3 blocks) * %rcx: iv (64bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); pushq %r12; + CFI_PUSH(%r12); pushq %r13; + CFI_PUSH(%r13); pushq %r14; + CFI_PUSH(%r14); pushq %r15; + CFI_PUSH(%r15); pushq %rsi; /*dst*/ + CFI_PUSH(%rsi); pushq %rdx; /*src*/ + CFI_PUSH(%rdx); movq %rcx, RW2; /* Load input */ movl 0 * 4(RW2), RL0d; 
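
The counter handling in _gcry_3des_amd64_ctr_enc is compact: the 64-bit big-endian counter is loaded once, three consecutive values are handed to the 3-way core (split into 32-bit halves there), and counter+3 is written back as the new IV. A hedged C sketch of just that bookkeeping (load_be64/store_be64 are assumed helpers, not libgcrypt API):

#include <stdint.h>

static uint64_t load_be64(const uint8_t *p)
{
  uint64_t v = 0;
  for (int i = 0; i < 8; i++)
    v = (v << 8) | p[i];
  return v;
}

static void store_be64(uint8_t *p, uint64_t v)
{
  for (int i = 7; i >= 0; i--, v >>= 8)
    p[i] = (uint8_t)v;
}

/* Counter setup as in the assembly above: blocks[0..2] become the three
 * keystream-block inputs, and the stored IV is advanced by 3. */
static void ctr_setup3(uint8_t iv[8], uint64_t blocks[3])
{
  uint64_t ctr = load_be64(iv);
  blocks[0] = ctr;
  blocks[1] = ctr + 1;
  blocks[2] = ctr + 2;
  store_be64(iv, ctr + 3);
}
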
movl 1 * 4(RW2), RR0d; movl 0 * 4(%rdx), RL1d; movl 1 * 4(%rdx), RR1d; movl 2 * 4(%rdx), RL2d; movl 3 * 4(%rdx), RR2d; bswapl RL0d; bswapl RR0d; bswapl RL1d; bswapl RR1d; bswapl RL2d; bswapl RR2d; /* Update IV */ movq 4 * 4(%rdx), RW0; movq RW0, (RW2); call _gcry_3des_amd64_crypt_blk3; popq %rdx; /*src*/ + CFI_POP_TMP_REG(); popq %rsi; /*dst*/ + CFI_POP_TMP_REG(); bswapl RR0d; bswapl RL0d; bswapl RR1d; bswapl RL1d; bswapl RR2d; bswapl RL2d; xorl 0 * 4(%rdx), RR0d; xorl 1 * 4(%rdx), RL0d; xorl 2 * 4(%rdx), RR1d; xorl 3 * 4(%rdx), RL1d; xorl 4 * 4(%rdx), RR2d; xorl 5 * 4(%rdx), RL2d; movl RR0d, 0 * 4(%rsi); movl RL0d, 1 * 4(%rsi); movl RR1d, 2 * 4(%rsi); movl RL1d, 3 * 4(%rsi); movl RR2d, 4 * 4(%rsi); movl RL2d, 5 * 4(%rsi); popq %r15; + CFI_POP(%r15); popq %r14; + CFI_POP(%r14); popq %r13; + CFI_POP(%r13); popq %r12; + CFI_POP(%r12); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_cfb_dec,.-_gcry_3des_amd64_cfb_dec;) .align 16 .L_s1: .quad 0x0010100001010400, 0x0000000000000000 .quad 0x0000100000010000, 0x0010100001010404 .quad 0x0010100001010004, 0x0000100000010404 .quad 0x0000000000000004, 0x0000100000010000 .quad 0x0000000000000400, 0x0010100001010400 .quad 0x0010100001010404, 0x0000000000000400 .quad 0x0010000001000404, 0x0010100001010004 .quad 0x0010000001000000, 0x0000000000000004 .quad 0x0000000000000404, 0x0010000001000400 .quad 0x0010000001000400, 0x0000100000010400 .quad 0x0000100000010400, 0x0010100001010000 .quad 0x0010100001010000, 0x0010000001000404 .quad 0x0000100000010004, 0x0010000001000004 .quad 0x0010000001000004, 0x0000100000010004 .quad 0x0000000000000000, 0x0000000000000404 .quad 0x0000100000010404, 0x0010000001000000 .quad 0x0000100000010000, 0x0010100001010404 .quad 0x0000000000000004, 0x0010100001010000 .quad 0x0010100001010400, 0x0010000001000000 .quad 0x0010000001000000, 0x0000000000000400 .quad 0x0010100001010004, 0x0000100000010000 .quad 0x0000100000010400, 0x0010000001000004 .quad 0x0000000000000400, 0x0000000000000004 .quad 0x0010000001000404, 0x0000100000010404 .quad 0x0010100001010404, 0x0000100000010004 .quad 0x0010100001010000, 0x0010000001000404 .quad 0x0010000001000004, 0x0000000000000404 .quad 0x0000100000010404, 0x0010100001010400 .quad 0x0000000000000404, 0x0010000001000400 .quad 0x0010000001000400, 0x0000000000000000 .quad 0x0000100000010004, 0x0000100000010400 .quad 0x0000000000000000, 0x0010100001010004 .L_s2: .quad 0x0801080200100020, 0x0800080000000000 .quad 0x0000080000000000, 0x0001080200100020 .quad 0x0001000000100000, 0x0000000200000020 .quad 0x0801000200100020, 0x0800080200000020 .quad 0x0800000200000020, 0x0801080200100020 .quad 0x0801080000100000, 0x0800000000000000 .quad 0x0800080000000000, 0x0001000000100000 .quad 0x0000000200000020, 0x0801000200100020 .quad 0x0001080000100000, 0x0001000200100020 .quad 0x0800080200000020, 0x0000000000000000 .quad 0x0800000000000000, 0x0000080000000000 .quad 0x0001080200100020, 0x0801000000100000 .quad 0x0001000200100020, 0x0800000200000020 .quad 0x0000000000000000, 0x0001080000100000 .quad 0x0000080200000020, 0x0801080000100000 .quad 0x0801000000100000, 0x0000080200000020 .quad 0x0000000000000000, 0x0001080200100020 .quad 0x0801000200100020, 0x0001000000100000 .quad 0x0800080200000020, 0x0801000000100000 .quad 0x0801080000100000, 0x0000080000000000 .quad 0x0801000000100000, 0x0800080000000000 .quad 0x0000000200000020, 0x0801080200100020 .quad 0x0001080200100020, 0x0000000200000020 .quad 0x0000080000000000, 0x0800000000000000 .quad 
0x0000080200000020, 0x0801080000100000 .quad 0x0001000000100000, 0x0800000200000020 .quad 0x0001000200100020, 0x0800080200000020 .quad 0x0800000200000020, 0x0001000200100020 .quad 0x0001080000100000, 0x0000000000000000 .quad 0x0800080000000000, 0x0000080200000020 .quad 0x0800000000000000, 0x0801000200100020 .quad 0x0801080200100020, 0x0001080000100000 .L_s3: .quad 0x0000002000000208, 0x0000202008020200 .quad 0x0000000000000000, 0x0000200008020008 .quad 0x0000002008000200, 0x0000000000000000 .quad 0x0000202000020208, 0x0000002008000200 .quad 0x0000200000020008, 0x0000000008000008 .quad 0x0000000008000008, 0x0000200000020000 .quad 0x0000202008020208, 0x0000200000020008 .quad 0x0000200008020000, 0x0000002000000208 .quad 0x0000000008000000, 0x0000000000000008 .quad 0x0000202008020200, 0x0000002000000200 .quad 0x0000202000020200, 0x0000200008020000 .quad 0x0000200008020008, 0x0000202000020208 .quad 0x0000002008000208, 0x0000202000020200 .quad 0x0000200000020000, 0x0000002008000208 .quad 0x0000000000000008, 0x0000202008020208 .quad 0x0000002000000200, 0x0000000008000000 .quad 0x0000202008020200, 0x0000000008000000 .quad 0x0000200000020008, 0x0000002000000208 .quad 0x0000200000020000, 0x0000202008020200 .quad 0x0000002008000200, 0x0000000000000000 .quad 0x0000002000000200, 0x0000200000020008 .quad 0x0000202008020208, 0x0000002008000200 .quad 0x0000000008000008, 0x0000002000000200 .quad 0x0000000000000000, 0x0000200008020008 .quad 0x0000002008000208, 0x0000200000020000 .quad 0x0000000008000000, 0x0000202008020208 .quad 0x0000000000000008, 0x0000202000020208 .quad 0x0000202000020200, 0x0000000008000008 .quad 0x0000200008020000, 0x0000002008000208 .quad 0x0000002000000208, 0x0000200008020000 .quad 0x0000202000020208, 0x0000000000000008 .quad 0x0000200008020008, 0x0000202000020200 .L_s4: .quad 0x1008020000002001, 0x1000020800002001 .quad 0x1000020800002001, 0x0000000800000000 .quad 0x0008020800002000, 0x1008000800000001 .quad 0x1008000000000001, 0x1000020000002001 .quad 0x0000000000000000, 0x0008020000002000 .quad 0x0008020000002000, 0x1008020800002001 .quad 0x1000000800000001, 0x0000000000000000 .quad 0x0008000800000000, 0x1008000000000001 .quad 0x1000000000000001, 0x0000020000002000 .quad 0x0008000000000000, 0x1008020000002001 .quad 0x0000000800000000, 0x0008000000000000 .quad 0x1000020000002001, 0x0000020800002000 .quad 0x1008000800000001, 0x1000000000000001 .quad 0x0000020800002000, 0x0008000800000000 .quad 0x0000020000002000, 0x0008020800002000 .quad 0x1008020800002001, 0x1000000800000001 .quad 0x0008000800000000, 0x1008000000000001 .quad 0x0008020000002000, 0x1008020800002001 .quad 0x1000000800000001, 0x0000000000000000 .quad 0x0000000000000000, 0x0008020000002000 .quad 0x0000020800002000, 0x0008000800000000 .quad 0x1008000800000001, 0x1000000000000001 .quad 0x1008020000002001, 0x1000020800002001 .quad 0x1000020800002001, 0x0000000800000000 .quad 0x1008020800002001, 0x1000000800000001 .quad 0x1000000000000001, 0x0000020000002000 .quad 0x1008000000000001, 0x1000020000002001 .quad 0x0008020800002000, 0x1008000800000001 .quad 0x1000020000002001, 0x0000020800002000 .quad 0x0008000000000000, 0x1008020000002001 .quad 0x0000000800000000, 0x0008000000000000 .quad 0x0000020000002000, 0x0008020800002000 .L_s5: .quad 0x0000001000000100, 0x0020001002080100 .quad 0x0020000002080000, 0x0420001002000100 .quad 0x0000000000080000, 0x0000001000000100 .quad 0x0400000000000000, 0x0020000002080000 .quad 0x0400001000080100, 0x0000000000080000 .quad 0x0020001002000100, 0x0400001000080100 .quad 0x0420001002000100, 
0x0420000002080000 .quad 0x0000001000080100, 0x0400000000000000 .quad 0x0020000002000000, 0x0400000000080000 .quad 0x0400000000080000, 0x0000000000000000 .quad 0x0400001000000100, 0x0420001002080100 .quad 0x0420001002080100, 0x0020001002000100 .quad 0x0420000002080000, 0x0400001000000100 .quad 0x0000000000000000, 0x0420000002000000 .quad 0x0020001002080100, 0x0020000002000000 .quad 0x0420000002000000, 0x0000001000080100 .quad 0x0000000000080000, 0x0420001002000100 .quad 0x0000001000000100, 0x0020000002000000 .quad 0x0400000000000000, 0x0020000002080000 .quad 0x0420001002000100, 0x0400001000080100 .quad 0x0020001002000100, 0x0400000000000000 .quad 0x0420000002080000, 0x0020001002080100 .quad 0x0400001000080100, 0x0000001000000100 .quad 0x0020000002000000, 0x0420000002080000 .quad 0x0420001002080100, 0x0000001000080100 .quad 0x0420000002000000, 0x0420001002080100 .quad 0x0020000002080000, 0x0000000000000000 .quad 0x0400000000080000, 0x0420000002000000 .quad 0x0000001000080100, 0x0020001002000100 .quad 0x0400001000000100, 0x0000000000080000 .quad 0x0000000000000000, 0x0400000000080000 .quad 0x0020001002080100, 0x0400001000000100 .L_s6: .quad 0x0200000120000010, 0x0204000020000000 .quad 0x0000040000000000, 0x0204040120000010 .quad 0x0204000020000000, 0x0000000100000010 .quad 0x0204040120000010, 0x0004000000000000 .quad 0x0200040020000000, 0x0004040100000010 .quad 0x0004000000000000, 0x0200000120000010 .quad 0x0004000100000010, 0x0200040020000000 .quad 0x0200000020000000, 0x0000040100000010 .quad 0x0000000000000000, 0x0004000100000010 .quad 0x0200040120000010, 0x0000040000000000 .quad 0x0004040000000000, 0x0200040120000010 .quad 0x0000000100000010, 0x0204000120000010 .quad 0x0204000120000010, 0x0000000000000000 .quad 0x0004040100000010, 0x0204040020000000 .quad 0x0000040100000010, 0x0004040000000000 .quad 0x0204040020000000, 0x0200000020000000 .quad 0x0200040020000000, 0x0000000100000010 .quad 0x0204000120000010, 0x0004040000000000 .quad 0x0204040120000010, 0x0004000000000000 .quad 0x0000040100000010, 0x0200000120000010 .quad 0x0004000000000000, 0x0200040020000000 .quad 0x0200000020000000, 0x0000040100000010 .quad 0x0200000120000010, 0x0204040120000010 .quad 0x0004040000000000, 0x0204000020000000 .quad 0x0004040100000010, 0x0204040020000000 .quad 0x0000000000000000, 0x0204000120000010 .quad 0x0000000100000010, 0x0000040000000000 .quad 0x0204000020000000, 0x0004040100000010 .quad 0x0000040000000000, 0x0004000100000010 .quad 0x0200040120000010, 0x0000000000000000 .quad 0x0204040020000000, 0x0200000020000000 .quad 0x0004000100000010, 0x0200040120000010 .L_s7: .quad 0x0002000000200000, 0x2002000004200002 .quad 0x2000000004000802, 0x0000000000000000 .quad 0x0000000000000800, 0x2000000004000802 .quad 0x2002000000200802, 0x0002000004200800 .quad 0x2002000004200802, 0x0002000000200000 .quad 0x0000000000000000, 0x2000000004000002 .quad 0x2000000000000002, 0x0000000004000000 .quad 0x2002000004200002, 0x2000000000000802 .quad 0x0000000004000800, 0x2002000000200802 .quad 0x2002000000200002, 0x0000000004000800 .quad 0x2000000004000002, 0x0002000004200000 .quad 0x0002000004200800, 0x2002000000200002 .quad 0x0002000004200000, 0x0000000000000800 .quad 0x2000000000000802, 0x2002000004200802 .quad 0x0002000000200800, 0x2000000000000002 .quad 0x0000000004000000, 0x0002000000200800 .quad 0x0000000004000000, 0x0002000000200800 .quad 0x0002000000200000, 0x2000000004000802 .quad 0x2000000004000802, 0x2002000004200002 .quad 0x2002000004200002, 0x2000000000000002 .quad 0x2002000000200002, 0x0000000004000000 .quad 
0x0000000004000800, 0x0002000000200000 .quad 0x0002000004200800, 0x2000000000000802 .quad 0x2002000000200802, 0x0002000004200800 .quad 0x2000000000000802, 0x2000000004000002 .quad 0x2002000004200802, 0x0002000004200000 .quad 0x0002000000200800, 0x0000000000000000 .quad 0x2000000000000002, 0x2002000004200802 .quad 0x0000000000000000, 0x2002000000200802 .quad 0x0002000004200000, 0x0000000000000800 .quad 0x2000000004000002, 0x0000000004000800 .quad 0x0000000000000800, 0x2002000000200002 .L_s8: .quad 0x0100010410001000, 0x0000010000001000 .quad 0x0000000000040000, 0x0100010410041000 .quad 0x0100000010000000, 0x0100010410001000 .quad 0x0000000400000000, 0x0100000010000000 .quad 0x0000000400040000, 0x0100000010040000 .quad 0x0100010410041000, 0x0000010000041000 .quad 0x0100010010041000, 0x0000010400041000 .quad 0x0000010000001000, 0x0000000400000000 .quad 0x0100000010040000, 0x0100000410000000 .quad 0x0100010010001000, 0x0000010400001000 .quad 0x0000010000041000, 0x0000000400040000 .quad 0x0100000410040000, 0x0100010010041000 .quad 0x0000010400001000, 0x0000000000000000 .quad 0x0000000000000000, 0x0100000410040000 .quad 0x0100000410000000, 0x0100010010001000 .quad 0x0000010400041000, 0x0000000000040000 .quad 0x0000010400041000, 0x0000000000040000 .quad 0x0100010010041000, 0x0000010000001000 .quad 0x0000000400000000, 0x0100000410040000 .quad 0x0000010000001000, 0x0000010400041000 .quad 0x0100010010001000, 0x0000000400000000 .quad 0x0100000410000000, 0x0100000010040000 .quad 0x0100000410040000, 0x0100000010000000 .quad 0x0000000000040000, 0x0100010410001000 .quad 0x0000000000000000, 0x0100010410041000 .quad 0x0000000400040000, 0x0100000410000000 .quad 0x0100000010040000, 0x0100010010001000 .quad 0x0100010410001000, 0x0000000000000000 .quad 0x0100010410041000, 0x0000010000041000 .quad 0x0000010000041000, 0x0000010400001000 .quad 0x0000010400001000, 0x0000000400040000 .quad 0x0100000010000000, 0x0100010010041000 #endif #endif diff --git a/cipher/rijndael-amd64.S b/cipher/rijndael-amd64.S index 798ff51a..3dcaa856 100644 --- a/cipher/rijndael-amd64.S +++ b/cipher/rijndael-amd64.S @@ -1,451 +1,477 @@ /* rinjdael-amd64.S - AMD64 assembly implementation of AES cipher * * Copyright (C) 2013 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
*/ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_AES) #include "asm-common-amd64.h" .text /* table macros */ #define E0 (0) #define Es0 (1) #define Esize 4 #define Essize 4 #define D0 (0) #define Ds0 (4 * 256) #define Dsize 4 #define Dssize 1 /* register macros */ #define CTX %rdi #define RTAB %r12 #define RA %rax #define RB %rbx #define RC %rcx #define RD %rdx #define RAd %eax #define RBd %ebx #define RCd %ecx #define RDd %edx #define RAbl %al #define RBbl %bl #define RCbl %cl #define RDbl %dl #define RAbh %ah #define RBbh %bh #define RCbh %ch #define RDbh %dh #define RNA %r8 #define RNB %r9 #define RNC %r10 #define RND %r11 #define RNAd %r8d #define RNBd %r9d #define RNCd %r10d #define RNDd %r11d #define RT0 %rbp #define RT1 %rsi #define RT0d %ebp #define RT1d %esi /* helper macros */ #define do16bit(op, source, tablemul, table1, dest1, table2, dest2, t0, t1) \ movzbl source ## bl, t0 ## d; \ movzbl source ## bh, t1 ## d; \ op ## l table1(RTAB,t0,tablemul), dest1 ## d; \ op ## l table2(RTAB,t1,tablemul), dest2 ## d; #define do16bit_shr(shf, op, source, tablemul, table1, dest1, table2, dest2, t0, t1) \ movzbl source ## bl, t0 ## d; \ movzbl source ## bh, t1 ## d; \ shrl $(shf), source ## d; \ op ## l table1(RTAB,t0,tablemul), dest1 ## d; \ op ## l table2(RTAB,t1,tablemul), dest2 ## d; #define last_do16bit(op, source, tablemul, table1, dest1, table2, dest2, t0, t1) \ movzbl source ## bl, t0 ## d; \ movzbl source ## bh, t1 ## d; \ movzbl table1(RTAB,t0,tablemul), t0 ## d; \ movzbl table2(RTAB,t1,tablemul), t1 ## d; \ op ## l t0 ## d, dest1 ## d; \ op ## l t1 ## d, dest2 ## d; #define last_do16bit_shr(shf, op, source, tablemul, table1, dest1, table2, dest2, t0, t1) \ movzbl source ## bl, t0 ## d; \ movzbl source ## bh, t1 ## d; \ shrl $(shf), source ## d; \ movzbl table1(RTAB,t0,tablemul), t0 ## d; \ movzbl table2(RTAB,t1,tablemul), t1 ## d; \ op ## l t0 ## d, dest1 ## d; \ op ## l t1 ## d, dest2 ## d; /*********************************************************************** * AMD64 assembly implementation of the AES cipher ***********************************************************************/ #define addroundkey(round, ra, rb, rc, rd) \ xorl (((round) * 16) + 0 * 4)(CTX), ra ## d; \ xorl (((round) * 16) + 1 * 4)(CTX), rb ## d; \ xorl (((round) * 16) + 2 * 4)(CTX), rc ## d; \ xorl (((round) * 16) + 3 * 4)(CTX), rd ## d; #define do_encround(next_r) \ do16bit_shr(16, mov, RA, Esize, E0, RNA, E0, RND, RT0, RT1); \ do16bit( mov, RA, Esize, E0, RNC, E0, RNB, RT0, RT1); \ movl (((next_r) * 16) + 0 * 4)(CTX), RAd; \ roll $8, RNDd; \ xorl RNAd, RAd; \ roll $8, RNCd; \ roll $8, RNBd; \ roll $8, RAd; \ \ do16bit_shr(16, xor, RD, Esize, E0, RND, E0, RNC, RT0, RT1); \ do16bit( xor, RD, Esize, E0, RNB, E0, RA, RT0, RT1); \ movl (((next_r) * 16) + 3 * 4)(CTX), RDd; \ roll $8, RNCd; \ xorl RNDd, RDd; \ roll $8, RNBd; \ roll $8, RAd; \ roll $8, RDd; \ \ do16bit_shr(16, xor, RC, Esize, E0, RNC, E0, RNB, RT0, RT1); \ do16bit( xor, RC, Esize, E0, RA, E0, RD, RT0, RT1); \ movl (((next_r) * 16) + 2 * 4)(CTX), RCd; \ roll $8, RNBd; \ xorl RNCd, RCd; \ roll $8, RAd; \ roll $8, RDd; \ roll $8, RCd; \ \ do16bit_shr(16, xor, RB, Esize, E0, RNB, E0, RA, RT0, RT1); \ do16bit( xor, RB, Esize, E0, RD, E0, RC, RT0, RT1); \ movl (((next_r) * 16) + 1 * 4)(CTX), RBd; \ roll $8, RAd; \ xorl RNBd, RBd; \ roll $16, RDd; \ roll $24, RCd; #define do_lastencround(next_r) \ do16bit_shr(16, movzb, RA, Essize, Es0, RNA, Es0, RND, RT0, 
RT1); \ do16bit( movzb, RA, Essize, Es0, RNC, Es0, RNB, RT0, RT1); \ movl (((next_r) * 16) + 0 * 4)(CTX), RAd; \ roll $8, RNDd; \ xorl RNAd, RAd; \ roll $8, RNCd; \ roll $8, RNBd; \ roll $8, RAd; \ \ last_do16bit_shr(16, xor, RD, Essize, Es0, RND, Es0, RNC, RT0, RT1); \ last_do16bit( xor, RD, Essize, Es0, RNB, Es0, RA, RT0, RT1); \ movl (((next_r) * 16) + 3 * 4)(CTX), RDd; \ roll $8, RNCd; \ xorl RNDd, RDd; \ roll $8, RNBd; \ roll $8, RAd; \ roll $8, RDd; \ \ last_do16bit_shr(16, xor, RC, Essize, Es0, RNC, Es0, RNB, RT0, RT1); \ last_do16bit( xor, RC, Essize, Es0, RA, Es0, RD, RT0, RT1); \ movl (((next_r) * 16) + 2 * 4)(CTX), RCd; \ roll $8, RNBd; \ xorl RNCd, RCd; \ roll $8, RAd; \ roll $8, RDd; \ roll $8, RCd; \ \ last_do16bit_shr(16, xor, RB, Essize, Es0, RNB, Es0, RA, RT0, RT1); \ last_do16bit( xor, RB, Essize, Es0, RD, Es0, RC, RT0, RT1); \ movl (((next_r) * 16) + 1 * 4)(CTX), RBd; \ roll $8, RAd; \ xorl RNBd, RBd; \ roll $16, RDd; \ roll $24, RCd; #define firstencround(round) \ addroundkey(round, RA, RB, RC, RD); \ do_encround((round) + 1); #define encround(round) \ do_encround((round) + 1); #define lastencround(round) \ do_lastencround((round) + 1); .align 8 .globl _gcry_aes_amd64_encrypt_block ELF(.type _gcry_aes_amd64_encrypt_block,@function;) _gcry_aes_amd64_encrypt_block: /* input: * %rdi: keysched, CTX * %rsi: dst * %rdx: src * %ecx: number of rounds.. 10, 12 or 14 * %r8: encryption tables */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_5 subq $(5 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(5 * 8); movq %rsi, (0 * 8)(%rsp); movl %ecx, (1 * 8)(%rsp); movq %rbp, (2 * 8)(%rsp); movq %rbx, (3 * 8)(%rsp); movq %r12, (4 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 2 * 8); + CFI_REL_OFFSET(%rbx, 3 * 8); + CFI_REL_OFFSET(%r12, 4 * 8); leaq (%r8), RTAB; /* read input block */ movl 0 * 4(%rdx), RAd; movl 1 * 4(%rdx), RBd; movl 2 * 4(%rdx), RCd; movl 3 * 4(%rdx), RDd; firstencround(0); encround(1); encround(2); encround(3); encround(4); encround(5); encround(6); encround(7); encround(8); cmpl $12, (1 * 8)(%rsp); jnb .Lenc_not_128; lastencround(9); .align 4 .Lenc_done: /* write output block */ movq (0 * 8)(%rsp), %rsi; movl RAd, 0 * 4(%rsi); movl RBd, 1 * 4(%rsi); movl RCd, 2 * 4(%rsi); movl RDd, 3 * 4(%rsi); + CFI_REMEMBER_STATE(); + movq (4 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %rbx; movq (2 * 8)(%rsp), %rbp; + CFI_RESTORE(%r12); + CFI_RESTORE(%rbx); + CFI_RESTORE(%rbp); addq $(5 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-5 * 8); movl $(6 * 8), %eax; EXIT_SYSV_FUNC ret; + CFI_RESTORE_STATE(); .align 4 .Lenc_not_128: je .Lenc_192 encround(9); encround(10); encround(11); encround(12); lastencround(13); jmp .Lenc_done; .align 4 .Lenc_192: encround(9); encround(10); lastencround(11); jmp .Lenc_done; + CFI_ENDPROC(); ELF(.size _gcry_aes_amd64_encrypt_block,.-_gcry_aes_amd64_encrypt_block;) #define do_decround(next_r) \ do16bit_shr(16, mov, RA, Dsize, D0, RNA, D0, RNB, RT0, RT1); \ do16bit( mov, RA, Dsize, D0, RNC, D0, RND, RT0, RT1); \ movl (((next_r) * 16) + 0 * 4)(CTX), RAd; \ roll $8, RNBd; \ xorl RNAd, RAd; \ roll $8, RNCd; \ roll $8, RNDd; \ roll $8, RAd; \ \ do16bit_shr(16, xor, RB, Dsize, D0, RNB, D0, RNC, RT0, RT1); \ do16bit( xor, RB, Dsize, D0, RND, D0, RA, RT0, RT1); \ movl (((next_r) * 16) + 1 * 4)(CTX), RBd; \ roll $8, RNCd; \ xorl RNBd, RBd; \ roll $8, RNDd; \ roll $8, RAd; \ roll $8, RBd; \ \ do16bit_shr(16, xor, RC, Dsize, D0, RNC, D0, RND, RT0, RT1); \ do16bit( xor, RC, Dsize, D0, RA, D0, RB, RT0, RT1); \ movl (((next_r) * 16) + 2 * 4)(CTX), RCd; \ roll $8, RNDd; \ xorl RNCd, RCd; \ roll $8, RAd; \ 
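
do_encround() works from a single 1 KiB table (256 entries of Esize 4 bytes) and makes up for the missing rotated copies with roll instructions on the accumulators. In conventional T-table terms: if T1/T2/T3 are byte-rotations of T0, a round column can be computed from T0 alone. A small C sketch of that identity, assuming a little-endian column packing; it mirrors the idea, not the exact instruction scheduling of the macro:

#include <stdint.h>

static uint32_t rotl32(uint32_t x, unsigned n) { return (x << n) | (x >> (32 - n)); }

/* One AES encryption-round column from a single table T0, given that the
 * usual four tables satisfy T1[x] = rotl32(T0[x], 8), T2[x] = rotl32(T0[x], 16),
 * T3[x] = rotl32(T0[x], 24).  b0..b3 are the ShiftRows-selected state bytes
 * for this column and rk is the round-key word. */
static uint32_t aes_enc_column(uint8_t b0, uint8_t b1, uint8_t b2, uint8_t b3,
                               uint32_t rk, const uint32_t T0[256])
{
  return rk
       ^ T0[b0]
       ^ rotl32(T0[b1], 8)
       ^ rotl32(T0[b2], 16)
       ^ rotl32(T0[b3], 24);
}
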
roll $8, RBd; \ roll $8, RCd; \ \ do16bit_shr(16, xor, RD, Dsize, D0, RND, D0, RA, RT0, RT1); \ do16bit( xor, RD, Dsize, D0, RB, D0, RC, RT0, RT1); \ movl (((next_r) * 16) + 3 * 4)(CTX), RDd; \ roll $8, RAd; \ xorl RNDd, RDd; \ roll $16, RBd; \ roll $24, RCd; #define do_lastdecround(next_r) \ do16bit_shr(16, movzb, RA, Dssize, Ds0, RNA, Ds0, RNB, RT0, RT1); \ do16bit( movzb, RA, Dssize, Ds0, RNC, Ds0, RND, RT0, RT1); \ movl (((next_r) * 16) + 0 * 4)(CTX), RAd; \ roll $8, RNBd; \ xorl RNAd, RAd; \ roll $8, RNCd; \ roll $8, RNDd; \ roll $8, RAd; \ \ last_do16bit_shr(16, xor, RB, Dssize, Ds0, RNB, Ds0, RNC, RT0, RT1); \ last_do16bit( xor, RB, Dssize, Ds0, RND, Ds0, RA, RT0, RT1); \ movl (((next_r) * 16) + 1 * 4)(CTX), RBd; \ roll $8, RNCd; \ xorl RNBd, RBd; \ roll $8, RNDd; \ roll $8, RAd; \ roll $8, RBd; \ \ last_do16bit_shr(16, xor, RC, Dssize, Ds0, RNC, Ds0, RND, RT0, RT1); \ last_do16bit( xor, RC, Dssize, Ds0, RA, Ds0, RB, RT0, RT1); \ movl (((next_r) * 16) + 2 * 4)(CTX), RCd; \ roll $8, RNDd; \ xorl RNCd, RCd; \ roll $8, RAd; \ roll $8, RBd; \ roll $8, RCd; \ \ last_do16bit_shr(16, xor, RD, Dssize, Ds0, RND, Ds0, RA, RT0, RT1); \ last_do16bit( xor, RD, Dssize, Ds0, RB, Ds0, RC, RT0, RT1); \ movl (((next_r) * 16) + 3 * 4)(CTX), RDd; \ roll $8, RAd; \ xorl RNDd, RDd; \ roll $16, RBd; \ roll $24, RCd; #define firstdecround(round) \ addroundkey((round + 1), RA, RB, RC, RD); \ do_decround(round); #define decround(round) \ do_decround(round); #define lastdecround(round) \ do_lastdecround(round); .align 8 .globl _gcry_aes_amd64_decrypt_block ELF(.type _gcry_aes_amd64_decrypt_block,@function;) _gcry_aes_amd64_decrypt_block: /* input: * %rdi: keysched, CTX * %rsi: dst * %rdx: src * %ecx: number of rounds.. 10, 12 or 14 * %r8: decryption tables */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_5 subq $(5 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(5 * 8); movq %rsi, (0 * 8)(%rsp); movl %ecx, (1 * 8)(%rsp); movq %rbp, (2 * 8)(%rsp); movq %rbx, (3 * 8)(%rsp); movq %r12, (4 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 2 * 8); + CFI_REL_OFFSET(%rbx, 3 * 8); + CFI_REL_OFFSET(%r12, 4 * 8); leaq (%r8), RTAB; /* read input block */ movl 0 * 4(%rdx), RAd; movl 1 * 4(%rdx), RBd; movl 2 * 4(%rdx), RCd; movl 3 * 4(%rdx), RDd; cmpl $12, (1 * 8)(%rsp); jnb .Ldec_256; firstdecround(9); .align 4 .Ldec_tail: decround(8); decround(7); decround(6); decround(5); decround(4); decround(3); decround(2); decround(1); lastdecround(0); /* write output block */ movq (0 * 8)(%rsp), %rsi; movl RAd, 0 * 4(%rsi); movl RBd, 1 * 4(%rsi); movl RCd, 2 * 4(%rsi); movl RDd, 3 * 4(%rsi); + CFI_REMEMBER_STATE(); + movq (4 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %rbx; movq (2 * 8)(%rsp), %rbp; + CFI_RESTORE(%r12); + CFI_RESTORE(%rbx); + CFI_RESTORE(%rbp); addq $(5 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-5 * 8); movl $(6 * 8), %eax; EXIT_SYSV_FUNC ret; + CFI_RESTORE_STATE(); .align 4 .Ldec_256: je .Ldec_192; firstdecround(13); decround(12); decround(11); decround(10); decround(9); jmp .Ldec_tail; .align 4 .Ldec_192: firstdecround(11); decround(10); decround(9); jmp .Ldec_tail; + CFI_ENDPROC(); ELF(.size _gcry_aes_amd64_decrypt_block,.-_gcry_aes_amd64_decrypt_block;) #endif /*USE_AES*/ #endif /*__x86_64*/ diff --git a/cipher/rijndael-ssse3-amd64-asm.S b/cipher/rijndael-ssse3-amd64-asm.S index ffce5df2..8124eb21 100644 --- a/cipher/rijndael-ssse3-amd64-asm.S +++ b/cipher/rijndael-ssse3-amd64-asm.S @@ -1,864 +1,874 @@ /* SSSE3 vector permutation AES for Libgcrypt * Copyright (C) 2014-2017 Jussi Kivilinna * * This file is part of Libgcrypt. 
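
The last-round macros (do_lastencround/do_lastdecround) switch to movzb loads with the Es0/Ds0 offsets — presumably a plain S-box byte embedded in the 4-byte encryption entries, and a separate 256-byte inverse S-box placed after the decryption table — because the final AES round has no MixColumns. For reference, the textbook last round for one column, under the same little-endian packing assumption (S is the forward or inverse S-box as appropriate):

#include <stdint.h>

/* Final AES round for one column: SubBytes + ShiftRows + AddRoundKey only. */
static uint32_t aes_last_column(uint8_t b0, uint8_t b1, uint8_t b2, uint8_t b3,
                                uint32_t rk, const uint8_t S[256])
{
  return rk ^ ((uint32_t)S[b0]
            | ((uint32_t)S[b1] << 8)
            | ((uint32_t)S[b2] << 16)
            | ((uint32_t)S[b3] << 24));
}
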
* * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . * * * The code is based on the public domain library libvpaes version 0.5 * available at http://crypto.stanford.edu/vpaes/ and which carries * this notice: * * libvpaes: constant-time SSSE3 AES encryption and decryption. * version 0.5 * * By Mike Hamburg, Stanford University, 2009. Public domain. * I wrote essentially all of this code. I did not write the test * vectors; they are the NIST known answer tests. I hereby release all * the code and documentation here that I wrote into the public domain. * * This is an implementation of AES following my paper, * "Accelerating AES with Vector Permute Instructions * CHES 2009; http://shiftleft.org/papers/vector_aes/ */ #if defined(__x86_64__) #include #if defined(HAVE_GCC_INLINE_ASM_SSSE3) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) #include "asm-common-amd64.h" .text ## ## _gcry_aes_ssse3_enc_preload ## ELF(.type _gcry_aes_ssse3_enc_preload,@function) .globl _gcry_aes_ssse3_enc_preload _gcry_aes_ssse3_enc_preload: + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 lea .Laes_consts(%rip), %rax movdqa (%rax), %xmm9 # 0F movdqa .Lk_inv (%rax), %xmm10 # inv movdqa .Lk_inv+16(%rax), %xmm11 # inva movdqa .Lk_sb1 (%rax), %xmm13 # sb1u movdqa .Lk_sb1+16(%rax), %xmm12 # sb1t movdqa .Lk_sb2 (%rax), %xmm15 # sb2u movdqa .Lk_sb2+16(%rax), %xmm14 # sb2t EXIT_SYSV_FUNC ret + CFI_ENDPROC(); ELF(.size _gcry_aes_ssse3_enc_preload,.-_gcry_aes_ssse3_enc_preload) ## ## _gcry_aes_ssse3_dec_preload ## ELF(.type _gcry_aes_ssse3_dec_preload,@function) .globl _gcry_aes_ssse3_dec_preload _gcry_aes_ssse3_dec_preload: + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 lea .Laes_consts(%rip), %rax movdqa (%rax), %xmm9 # 0F movdqa .Lk_inv (%rax), %xmm10 # inv movdqa .Lk_inv+16(%rax), %xmm11 # inva movdqa .Lk_dsb9 (%rax), %xmm13 # sb9u movdqa .Lk_dsb9+16(%rax), %xmm12 # sb9t movdqa .Lk_dsbd (%rax), %xmm15 # sbdu movdqa .Lk_dsbb (%rax), %xmm14 # sbbu movdqa .Lk_dsbe (%rax), %xmm8 # sbeu EXIT_SYSV_FUNC ret + CFI_ENDPROC(); ELF(.size _gcry_aes_ssse3_dec_preload,.-_gcry_aes_ssse3_dec_preload) ## ## Constant-time SSSE3 AES core implementation. ## ## By Mike Hamburg (Stanford University), 2009 ## Public domain. ## ## ## _aes_encrypt_core ## ## AES-encrypt %xmm0. 
## ## Inputs: ## %xmm0 = input ## %xmm9-%xmm15 as in .Laes_preheat ## (%rdi) = scheduled keys ## %rsi = nrounds ## ## Output in %xmm0 ## Clobbers %xmm1-%xmm4, %r9, %r11, %rax, %rcx, %rdx ## Preserves %xmm6 - %xmm7 so you get some local vectors ## ## .align 16 ELF(.type _gcry_aes_ssse3_encrypt_core,@function) .globl _gcry_aes_ssse3_encrypt_core _gcry_aes_ssse3_encrypt_core: _aes_encrypt_core: + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 mov %rdi, %rdx leaq -1(%rsi), %rax lea .Laes_consts(%rip), %rcx leaq .Lk_mc_backward(%rcx), %rdi mov $16, %rsi movdqa .Lk_ipt (%rcx), %xmm2 # iptlo movdqa %xmm9, %xmm1 pandn %xmm0, %xmm1 psrld $4, %xmm1 pand %xmm9, %xmm0 pshufb %xmm0, %xmm2 movdqa .Lk_ipt+16(%rcx), %xmm0 # ipthi pshufb %xmm1, %xmm0 pxor (%rdx),%xmm2 pxor %xmm2, %xmm0 add $16, %rdx jmp .Laes_entry .align 8 .Laes_loop: # middle of middle round movdqa %xmm13, %xmm4 # 4 : sb1u pshufb %xmm2, %xmm4 # 4 = sb1u pxor (%rdx), %xmm4 # 4 = sb1u + k movdqa %xmm12, %xmm0 # 0 : sb1t pshufb %xmm3, %xmm0 # 0 = sb1t pxor %xmm4, %xmm0 # 0 = A movdqa %xmm15, %xmm4 # 4 : sb2u pshufb %xmm2, %xmm4 # 4 = sb2u movdqa .Lk_mc_forward-.Lk_mc_backward(%rsi,%rdi), %xmm1 movdqa %xmm14, %xmm2 # 2 : sb2t pshufb %xmm3, %xmm2 # 2 = sb2t pxor %xmm4, %xmm2 # 2 = 2A movdqa %xmm0, %xmm3 # 3 = A pshufb %xmm1, %xmm0 # 0 = B pxor %xmm2, %xmm0 # 0 = 2A+B pshufb (%rsi,%rdi), %xmm3 # 3 = D lea 16(%esi),%esi # next mc pxor %xmm0, %xmm3 # 3 = 2A+B+D lea 16(%rdx),%rdx # next key pshufb %xmm1, %xmm0 # 0 = 2B+C pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D and $48, %rsi # ... mod 4 dec %rax # nr-- .Laes_entry: # top of round movdqa %xmm9, %xmm1 # 1 : i pandn %xmm0, %xmm1 # 1 = i<<4 psrld $4, %xmm1 # 1 = i pand %xmm9, %xmm0 # 0 = k movdqa %xmm11, %xmm2 # 2 : a/k pshufb %xmm0, %xmm2 # 2 = a/k pxor %xmm1, %xmm0 # 0 = j movdqa %xmm10, %xmm3 # 3 : 1/i pshufb %xmm1, %xmm3 # 3 = 1/i pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k movdqa %xmm10, %xmm4 # 4 : 1/j pshufb %xmm0, %xmm4 # 4 = 1/j pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k movdqa %xmm10, %xmm2 # 2 : 1/iak pshufb %xmm3, %xmm2 # 2 = 1/iak pxor %xmm0, %xmm2 # 2 = io movdqa %xmm10, %xmm3 # 3 : 1/jak pshufb %xmm4, %xmm3 # 3 = 1/jak pxor %xmm1, %xmm3 # 3 = jo jnz .Laes_loop # middle of last round movdqa .Lk_sbo(%rcx), %xmm4 # 3 : sbou pshufb %xmm2, %xmm4 # 4 = sbou pxor (%rdx), %xmm4 # 4 = sb1u + k movdqa .Lk_sbo+16(%rcx), %xmm0 # 0 : sbot pshufb %xmm3, %xmm0 # 0 = sb1t pxor %xmm4, %xmm0 # 0 = A pshufb .Lk_sr(%rsi,%rcx), %xmm0 EXIT_SYSV_FUNC ret + CFI_ENDPROC(); ELF(.size _aes_encrypt_core,.-_aes_encrypt_core) ## ## Decryption core ## ## Same API as encryption core. 
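
Nearly every step of the vpaes cores is built from one primitive: pshufb used as sixteen parallel 4-bit table lookups, with the input split into low and high nibbles through the 0x0F mask kept in %xmm9. A plain-C model of that primitive (function names are ours):

#include <stdint.h>

/* pshufb table, idx: byte i of the result is 0 if idx[i] has its top bit
 * set, otherwise table[idx[i] & 0x0f].  The vpaes code only ever feeds it
 * nibble values, so it acts as 16 table lookups in parallel. */
static void pshufb_model(uint8_t out[16], const uint8_t table[16],
                         const uint8_t idx[16])
{
  for (int i = 0; i < 16; i++)
    out[i] = (idx[i] & 0x80) ? 0 : table[idx[i] & 0x0f];
}

/* The recurring prologue of each round: split the state into low and high
 * nibbles, as done with %xmm9 (0x0F..0F), pandn and psrld $4. */
static void split_nibbles(uint8_t lo[16], uint8_t hi[16], const uint8_t x[16])
{
  for (int i = 0; i < 16; i++) {
    lo[i] = x[i] & 0x0f;
    hi[i] = x[i] >> 4;
  }
}
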
## .align 16 .globl _gcry_aes_ssse3_decrypt_core ELF(.type _gcry_aes_ssse3_decrypt_core,@function) _gcry_aes_ssse3_decrypt_core: _aes_decrypt_core: + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 mov %rdi, %rdx lea .Laes_consts(%rip), %rcx subl $1, %esi movl %esi, %eax shll $4, %esi xorl $48, %esi andl $48, %esi movdqa .Lk_dipt (%rcx), %xmm2 # iptlo movdqa %xmm9, %xmm1 pandn %xmm0, %xmm1 psrld $4, %xmm1 pand %xmm9, %xmm0 pshufb %xmm0, %xmm2 movdqa .Lk_dipt+16(%rcx), %xmm0 # ipthi pshufb %xmm1, %xmm0 pxor (%rdx), %xmm2 pxor %xmm2, %xmm0 movdqa .Lk_mc_forward+48(%rcx), %xmm5 lea 16(%rdx), %rdx neg %rax jmp .Laes_dec_entry .align 16 .Laes_dec_loop: ## ## Inverse mix columns ## movdqa %xmm13, %xmm4 # 4 : sb9u pshufb %xmm2, %xmm4 # 4 = sb9u pxor (%rdx), %xmm4 movdqa %xmm12, %xmm0 # 0 : sb9t pshufb %xmm3, %xmm0 # 0 = sb9t movdqa .Lk_dsbd+16(%rcx),%xmm1 # 1 : sbdt pxor %xmm4, %xmm0 # 0 = ch lea 16(%rdx), %rdx # next round key pshufb %xmm5, %xmm0 # MC ch movdqa %xmm15, %xmm4 # 4 : sbdu pshufb %xmm2, %xmm4 # 4 = sbdu pxor %xmm0, %xmm4 # 4 = ch pshufb %xmm3, %xmm1 # 1 = sbdt pxor %xmm4, %xmm1 # 1 = ch pshufb %xmm5, %xmm1 # MC ch movdqa %xmm14, %xmm4 # 4 : sbbu pshufb %xmm2, %xmm4 # 4 = sbbu inc %rax # nr-- pxor %xmm1, %xmm4 # 4 = ch movdqa .Lk_dsbb+16(%rcx),%xmm0 # 0 : sbbt pshufb %xmm3, %xmm0 # 0 = sbbt pxor %xmm4, %xmm0 # 0 = ch pshufb %xmm5, %xmm0 # MC ch movdqa %xmm8, %xmm4 # 4 : sbeu pshufb %xmm2, %xmm4 # 4 = sbeu pshufd $0x93, %xmm5, %xmm5 pxor %xmm0, %xmm4 # 4 = ch movdqa .Lk_dsbe+16(%rcx),%xmm0 # 0 : sbet pshufb %xmm3, %xmm0 # 0 = sbet pxor %xmm4, %xmm0 # 0 = ch .Laes_dec_entry: # top of round movdqa %xmm9, %xmm1 # 1 : i pandn %xmm0, %xmm1 # 1 = i<<4 psrld $4, %xmm1 # 1 = i pand %xmm9, %xmm0 # 0 = k movdqa %xmm11, %xmm2 # 2 : a/k pshufb %xmm0, %xmm2 # 2 = a/k pxor %xmm1, %xmm0 # 0 = j movdqa %xmm10, %xmm3 # 3 : 1/i pshufb %xmm1, %xmm3 # 3 = 1/i pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k movdqa %xmm10, %xmm4 # 4 : 1/j pshufb %xmm0, %xmm4 # 4 = 1/j pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k movdqa %xmm10, %xmm2 # 2 : 1/iak pshufb %xmm3, %xmm2 # 2 = 1/iak pxor %xmm0, %xmm2 # 2 = io movdqa %xmm10, %xmm3 # 3 : 1/jak pshufb %xmm4, %xmm3 # 3 = 1/jak pxor %xmm1, %xmm3 # 3 = jo jnz .Laes_dec_loop # middle of last round movdqa .Lk_dsbo(%rcx), %xmm4 # 3 : sbou pshufb %xmm2, %xmm4 # 4 = sbou pxor (%rdx), %xmm4 # 4 = sb1u + k movdqa .Lk_dsbo+16(%rcx), %xmm0 # 0 : sbot pshufb %xmm3, %xmm0 # 0 = sb1t pxor %xmm4, %xmm0 # 0 = A pshufb .Lk_sr(%rsi,%rcx), %xmm0 EXIT_SYSV_FUNC ret + CFI_ENDPROC(); ELF(.size _aes_decrypt_core,.-_aes_decrypt_core) ######################################################## ## ## ## AES key schedule ## ## ## ######################################################## .align 16 .globl _gcry_aes_ssse3_schedule_core ELF(.type _gcry_aes_ssse3_schedule_core,@function) _gcry_aes_ssse3_schedule_core: _aes_schedule_core: # rdi = key # rsi = size in bits # rdx = buffer # rcx = direction. 
0=encrypt, 1=decrypt # r8 = rotoffs + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_5 # load the tables lea .Laes_consts(%rip), %r10 movdqa (%r10), %xmm9 # 0F movdqa .Lk_inv (%r10), %xmm10 # inv movdqa .Lk_inv+16(%r10), %xmm11 # inva movdqa .Lk_sb1 (%r10), %xmm13 # sb1u movdqa .Lk_sb1+16(%r10), %xmm12 # sb1t movdqa .Lk_sb2 (%r10), %xmm15 # sb2u movdqa .Lk_sb2+16(%r10), %xmm14 # sb2t movdqa .Lk_rcon(%r10), %xmm8 # load rcon movdqu (%rdi), %xmm0 # load key (unaligned) # input transform movdqu %xmm0, %xmm3 lea .Lk_ipt(%r10), %r11 call .Laes_schedule_transform movdqu %xmm0, %xmm7 test %rcx, %rcx jnz .Laes_schedule_am_decrypting # encrypting, output zeroth round key after transform movdqa %xmm0, (%rdx) jmp .Laes_schedule_go .Laes_schedule_am_decrypting: # decrypting, output zeroth round key after shiftrows pshufb .Lk_sr(%r8,%r10),%xmm3 movdqa %xmm3, (%rdx) xor $48, %r8 .Laes_schedule_go: cmp $192, %rsi je .Laes_schedule_192 cmp $256, %rsi je .Laes_schedule_256 # 128: fall though ## ## .Laes_schedule_128 ## ## 128-bit specific part of key schedule. ## ## This schedule is really simple, because all its parts ## are accomplished by the subroutines. ## .Laes_schedule_128: mov $10, %rsi .Laes_schedule_128_L: call .Laes_schedule_round dec %rsi jz .Laes_schedule_mangle_last call .Laes_schedule_mangle # write output jmp .Laes_schedule_128_L ## ## .Laes_schedule_192 ## ## 192-bit specific part of key schedule. ## ## The main body of this schedule is the same as the 128-bit ## schedule, but with more smearing. The long, high side is ## stored in %xmm7 as before, and the short, low side is in ## the high bits of %xmm6. ## ## This schedule is somewhat nastier, however, because each ## round produces 192 bits of key material, or 1.5 round keys. ## Therefore, on each cycle we do 2 rounds and produce 3 round ## keys. ## .Laes_schedule_192: movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) call .Laes_schedule_transform # input transform pshufd $0x0E, %xmm0, %xmm6 pslldq $8, %xmm6 # clobber low side with zeros mov $4, %rsi .Laes_schedule_192_L: call .Laes_schedule_round palignr $8,%xmm6,%xmm0 call .Laes_schedule_mangle # save key n call .Laes_schedule_192_smear call .Laes_schedule_mangle # save key n+1 call .Laes_schedule_round dec %rsi jz .Laes_schedule_mangle_last call .Laes_schedule_mangle # save key n+2 call .Laes_schedule_192_smear jmp .Laes_schedule_192_L ## ## .Laes_schedule_192_smear ## ## Smear the short, low side in the 192-bit key schedule. ## ## Inputs: ## %xmm7: high side, b a x y ## %xmm6: low side, d c 0 0 ## %xmm13: 0 ## ## Outputs: ## %xmm6: b+c+d b+c 0 0 ## %xmm0: b+c+d b+c b a ## .Laes_schedule_192_smear: pshufd $0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0 pxor %xmm0, %xmm6 # -> c+d c 0 0 pshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a pxor %xmm6, %xmm0 # -> b+c+d b+c b a pshufd $0x0E, %xmm0, %xmm6 pslldq $8, %xmm6 # clobber low side with zeros ret ## ## .Laes_schedule_256 ## ## 256-bit specific part of key schedule. ## ## The structure here is very similar to the 128-bit ## schedule, but with an additional 'low side' in ## %xmm6. The low side's rounds are the same as the ## high side's, except no rcon and no rotation. ## .Laes_schedule_256: movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) call .Laes_schedule_transform # input transform mov $7, %rsi .Laes_schedule_256_L: call .Laes_schedule_mangle # output low result movdqa %xmm0, %xmm6 # save cur_lo in xmm6 # high round call .Laes_schedule_round dec %rsi jz .Laes_schedule_mangle_last call .Laes_schedule_mangle # low round. 
swap xmm7 and xmm6 pshufd $0xFF, %xmm0, %xmm0 movdqa %xmm7, %xmm5 movdqa %xmm6, %xmm7 call .Laes_schedule_low_round movdqa %xmm5, %xmm7 jmp .Laes_schedule_256_L ## ## .Laes_schedule_round ## ## Runs one main round of the key schedule on %xmm0, %xmm7 ## ## Specifically, runs subbytes on the high dword of %xmm0 ## then rotates it by one byte and xors into the low dword of ## %xmm7. ## ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for ## next rcon. ## ## Smears the dwords of %xmm7 by xoring the low into the ## second low, result into third, result into highest. ## ## Returns results in %xmm7 = %xmm0. ## Clobbers %xmm1-%xmm4, %r11. ## .Laes_schedule_round: # extract rcon from xmm8 pxor %xmm1, %xmm1 palignr $15, %xmm8, %xmm1 palignr $15, %xmm8, %xmm8 pxor %xmm1, %xmm7 # rotate pshufd $0xFF, %xmm0, %xmm0 palignr $1, %xmm0, %xmm0 # fall through... # low round: same as high round, but no rotation and no rcon. .Laes_schedule_low_round: # smear xmm7 movdqa %xmm7, %xmm1 pslldq $4, %xmm7 pxor %xmm1, %xmm7 movdqa %xmm7, %xmm1 pslldq $8, %xmm7 pxor %xmm1, %xmm7 pxor .Lk_s63(%r10), %xmm7 # subbytes movdqa %xmm9, %xmm1 pandn %xmm0, %xmm1 psrld $4, %xmm1 # 1 = i pand %xmm9, %xmm0 # 0 = k movdqa %xmm11, %xmm2 # 2 : a/k pshufb %xmm0, %xmm2 # 2 = a/k pxor %xmm1, %xmm0 # 0 = j movdqa %xmm10, %xmm3 # 3 : 1/i pshufb %xmm1, %xmm3 # 3 = 1/i pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k movdqa %xmm10, %xmm4 # 4 : 1/j pshufb %xmm0, %xmm4 # 4 = 1/j pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k movdqa %xmm10, %xmm2 # 2 : 1/iak pshufb %xmm3, %xmm2 # 2 = 1/iak pxor %xmm0, %xmm2 # 2 = io movdqa %xmm10, %xmm3 # 3 : 1/jak pshufb %xmm4, %xmm3 # 3 = 1/jak pxor %xmm1, %xmm3 # 3 = jo movdqa .Lk_sb1(%r10), %xmm4 # 4 : sbou pshufb %xmm2, %xmm4 # 4 = sbou movdqa .Lk_sb1+16(%r10), %xmm0 # 0 : sbot pshufb %xmm3, %xmm0 # 0 = sb1t pxor %xmm4, %xmm0 # 0 = sbox output # add in smeared stuff pxor %xmm7, %xmm0 movdqa %xmm0, %xmm7 ret ## ## .Laes_schedule_transform ## ## Linear-transform %xmm0 according to tables at (%r11) ## ## Requires that %xmm9 = 0x0F0F... as in preheat ## Output in %xmm0 ## Clobbers %xmm1, %xmm2 ## .Laes_schedule_transform: movdqa %xmm9, %xmm1 pandn %xmm0, %xmm1 psrld $4, %xmm1 pand %xmm9, %xmm0 movdqa (%r11), %xmm2 # lo pshufb %xmm0, %xmm2 movdqa 16(%r11), %xmm0 # hi pshufb %xmm1, %xmm0 pxor %xmm2, %xmm0 ret ## ## .Laes_schedule_mangle ## ## Mangle xmm0 from (basis-transformed) standard version ## to our version. 
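
.Laes_schedule_round performs, in the vpaes transformed basis, the standard AES key-expansion step: rotate the last word, push it through the S-box, add the round constant, then xor-smear across the round key. For orientation, the untransformed textbook version of that step for AES-128, assuming little-endian word packing so the round constant lands in the first byte:

#include <stdint.h>

static uint32_t rotr32(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }

/* One AES-128 key-schedule step: prev[0..3] is round key i, w[0..3] becomes
 * round key i+1.  S is the AES forward S-box, rcon the round constant. */
static void aes128_key_step(uint32_t w[4], const uint32_t prev[4],
                            const uint8_t S[256], uint8_t rcon)
{
  uint32_t t = rotr32(prev[3], 8);                  /* RotWord */
  t =  (uint32_t)S[ t        & 0xff]
    | ((uint32_t)S[(t >>  8) & 0xff] <<  8)
    | ((uint32_t)S[(t >> 16) & 0xff] << 16)
    | ((uint32_t)S[(t >> 24) & 0xff] << 24);        /* SubWord */
  t ^= rcon;                                        /* Rcon into the first byte */

  w[0] = prev[0] ^ t;                               /* the "smear" */
  w[1] = prev[1] ^ w[0];
  w[2] = prev[2] ^ w[1];
  w[3] = prev[3] ^ w[2];
}
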
## ## On encrypt, ## xor with 0x63 ## multiply by circulant 0,1,1,1 ## apply shiftrows transform ## ## On decrypt, ## xor with 0x63 ## multiply by 'inverse mixcolumns' circulant E,B,D,9 ## deskew ## apply shiftrows transform ## ## ## Writes out to (%rdx), and increments or decrements it ## Keeps track of round number mod 4 in %r8 ## Preserves xmm0 ## Clobbers xmm1-xmm5 ## .Laes_schedule_mangle: movdqa %xmm0, %xmm4 # save xmm0 for later movdqa .Lk_mc_forward(%r10),%xmm5 test %rcx, %rcx jnz .Laes_schedule_mangle_dec # encrypting add $16, %rdx pxor .Lk_s63(%r10),%xmm4 pshufb %xmm5, %xmm4 movdqa %xmm4, %xmm3 pshufb %xmm5, %xmm4 pxor %xmm4, %xmm3 pshufb %xmm5, %xmm4 pxor %xmm4, %xmm3 jmp .Laes_schedule_mangle_both .Laes_schedule_mangle_dec: lea .Lk_dks_1(%r10), %r11 # first table: *9 call .Laes_schedule_transform movdqa %xmm0, %xmm3 pshufb %xmm5, %xmm3 add $32, %r11 # next table: *B call .Laes_schedule_transform pxor %xmm0, %xmm3 pshufb %xmm5, %xmm3 add $32, %r11 # next table: *D call .Laes_schedule_transform pxor %xmm0, %xmm3 pshufb %xmm5, %xmm3 add $32, %r11 # next table: *E call .Laes_schedule_transform pxor %xmm0, %xmm3 pshufb %xmm5, %xmm3 movdqa %xmm4, %xmm0 # restore %xmm0 add $-16, %rdx .Laes_schedule_mangle_both: pshufb .Lk_sr(%r8,%r10),%xmm3 add $-16, %r8 and $48, %r8 movdqa %xmm3, (%rdx) ret ## ## .Laes_schedule_mangle_last ## ## Mangler for last round of key schedule ## Mangles %xmm0 ## when encrypting, outputs out(%xmm0) ^ 63 ## when decrypting, outputs unskew(%xmm0) ## ## Always called right before return... jumps to cleanup and exits ## .Laes_schedule_mangle_last: # schedule last round key from xmm0 lea .Lk_deskew(%r10),%r11 # prepare to deskew test %rcx, %rcx jnz .Laes_schedule_mangle_last_dec # encrypting pshufb .Lk_sr(%r8,%r10),%xmm0 # output permute lea .Lk_opt(%r10), %r11 # prepare to output transform add $32, %rdx .Laes_schedule_mangle_last_dec: add $-16, %rdx pxor .Lk_s63(%r10), %xmm0 call .Laes_schedule_transform # output transform movdqa %xmm0, (%rdx) # save last key #_aes_cleanup pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 pxor %xmm8, %xmm8 EXIT_SYSV_FUNC ret + CFI_ENDPROC(); ELF(.size _gcry_aes_ssse3_schedule_core,.-_gcry_aes_ssse3_schedule_core) ######################################################## ## ## ## Constants ## ## ## ######################################################## .align 16 ELF(.type _aes_consts,@object) .Laes_consts: _aes_consts: # s0F .Lk_s0F = .-.Laes_consts .quad 0x0F0F0F0F0F0F0F0F .quad 0x0F0F0F0F0F0F0F0F # input transform (lo, hi) .Lk_ipt = .-.Laes_consts .quad 0xC2B2E8985A2A7000 .quad 0xCABAE09052227808 .quad 0x4C01307D317C4D00 .quad 0xCD80B1FCB0FDCC81 # inv, inva .Lk_inv = .-.Laes_consts .quad 0x0E05060F0D080180 .quad 0x040703090A0B0C02 .quad 0x01040A060F0B0780 .quad 0x030D0E0C02050809 # sb1u, sb1t .Lk_sb1 = .-.Laes_consts .quad 0xB19BE18FCB503E00 .quad 0xA5DF7A6E142AF544 .quad 0x3618D415FAE22300 .quad 0x3BF7CCC10D2ED9EF # sb2u, sb2t .Lk_sb2 = .-.Laes_consts .quad 0xE27A93C60B712400 .quad 0x5EB7E955BC982FCD .quad 0x69EB88400AE12900 .quad 0xC2A163C8AB82234A # sbou, sbot .Lk_sbo = .-.Laes_consts .quad 0xD0D26D176FBDC700 .quad 0x15AABF7AC502A878 .quad 0xCFE474A55FBB6A00 .quad 0x8E1E90D1412B35FA # mc_forward .Lk_mc_forward = .-.Laes_consts .quad 0x0407060500030201 .quad 0x0C0F0E0D080B0A09 .quad 0x080B0A0904070605 .quad 0x000302010C0F0E0D .quad 0x0C0F0E0D080B0A09 .quad 0x0407060500030201 .quad 0x000302010C0F0E0D .quad 0x080B0A0904070605 # 
mc_backward .Lk_mc_backward = .-.Laes_consts .quad 0x0605040702010003 .quad 0x0E0D0C0F0A09080B .quad 0x020100030E0D0C0F .quad 0x0A09080B06050407 .quad 0x0E0D0C0F0A09080B .quad 0x0605040702010003 .quad 0x0A09080B06050407 .quad 0x020100030E0D0C0F # sr .Lk_sr = .-.Laes_consts .quad 0x0706050403020100 .quad 0x0F0E0D0C0B0A0908 .quad 0x030E09040F0A0500 .quad 0x0B06010C07020D08 .quad 0x0F060D040B020900 .quad 0x070E050C030A0108 .quad 0x0B0E0104070A0D00 .quad 0x0306090C0F020508 # rcon .Lk_rcon = .-.Laes_consts .quad 0x1F8391B9AF9DEEB6 .quad 0x702A98084D7C7D81 # s63: all equal to 0x63 transformed .Lk_s63 = .-.Laes_consts .quad 0x5B5B5B5B5B5B5B5B .quad 0x5B5B5B5B5B5B5B5B # output transform .Lk_opt = .-.Laes_consts .quad 0xFF9F4929D6B66000 .quad 0xF7974121DEBE6808 .quad 0x01EDBD5150BCEC00 .quad 0xE10D5DB1B05C0CE0 # deskew tables: inverts the sbox's 'skew' .Lk_deskew = .-.Laes_consts .quad 0x07E4A34047A4E300 .quad 0x1DFEB95A5DBEF91A .quad 0x5F36B5DC83EA6900 .quad 0x2841C2ABF49D1E77 ## ## Decryption stuff ## Key schedule constants ## # decryption key schedule: x -> invskew x*9 .Lk_dks_1 = .-.Laes_consts .quad 0xB6116FC87ED9A700 .quad 0x4AED933482255BFC .quad 0x4576516227143300 .quad 0x8BB89FACE9DAFDCE # decryption key schedule: invskew x*9 -> invskew x*D .Lk_dks_2 = .-.Laes_consts .quad 0x27438FEBCCA86400 .quad 0x4622EE8AADC90561 .quad 0x815C13CE4F92DD00 .quad 0x73AEE13CBD602FF2 # decryption key schedule: invskew x*D -> invskew x*B .Lk_dks_3 = .-.Laes_consts .quad 0x03C4C50201C6C700 .quad 0xF83F3EF9FA3D3CFB .quad 0xEE1921D638CFF700 .quad 0xA5526A9D7384BC4B # decryption key schedule: invskew x*B -> invskew x*E + 0x63 .Lk_dks_4 = .-.Laes_consts .quad 0xE3C390B053732000 .quad 0xA080D3F310306343 .quad 0xA0CA214B036982E8 .quad 0x2F45AEC48CE60D67 ## ## Decryption stuff ## Round function constants ## # decryption input transform .Lk_dipt = .-.Laes_consts .quad 0x0F505B040B545F00 .quad 0x154A411E114E451A .quad 0x86E383E660056500 .quad 0x12771772F491F194 # decryption sbox output *9*u, *9*t .Lk_dsb9 = .-.Laes_consts .quad 0x851C03539A86D600 .quad 0xCAD51F504F994CC9 .quad 0xC03B1789ECD74900 .quad 0x725E2C9EB2FBA565 # decryption sbox output *D*u, *D*t .Lk_dsbd = .-.Laes_consts .quad 0x7D57CCDFE6B1A200 .quad 0xF56E9B13882A4439 .quad 0x3CE2FAF724C6CB00 .quad 0x2931180D15DEEFD3 # decryption sbox output *B*u, *B*t .Lk_dsbb = .-.Laes_consts .quad 0xD022649296B44200 .quad 0x602646F6B0F2D404 .quad 0xC19498A6CD596700 .quad 0xF3FF0C3E3255AA6B # decryption sbox output *E*u, *E*t .Lk_dsbe = .-.Laes_consts .quad 0x46F2929626D4D000 .quad 0x2242600464B4F6B0 .quad 0x0C55A6CDFFAAC100 .quad 0x9467F36B98593E32 # decryption sbox final output .Lk_dsbo = .-.Laes_consts .quad 0x1387EA537EF94000 .quad 0xC7AA6DB9D4943E2D .quad 0x12D7560F93441D00 .quad 0xCA4B8159D8C58E9C ELF(.size _aes_consts,.-_aes_consts) #endif #endif diff --git a/cipher/salsa20-amd64.S b/cipher/salsa20-amd64.S index 470c32aa..ae8f2715 100644 --- a/cipher/salsa20-amd64.S +++ b/cipher/salsa20-amd64.S @@ -1,931 +1,940 @@ /* salsa20-amd64.S - AMD64 implementation of Salsa20 * * Copyright (C) 2013 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. 
* * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ /* * Based on public domain implementation by D. J. Bernstein at * http://cr.yp.to/snuffle.html */ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_SALSA20) -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "asm-common-amd64.h" .text .align 8 .globl _gcry_salsa20_amd64_keysetup ELF(.type _gcry_salsa20_amd64_keysetup,@function;) _gcry_salsa20_amd64_keysetup: + CFI_STARTPROC(); movl 0(%rsi),%r8d movl 4(%rsi),%r9d movl 8(%rsi),%eax movl 12(%rsi),%r10d movl %r8d,20(%rdi) movl %r9d,40(%rdi) movl %eax,60(%rdi) movl %r10d,48(%rdi) cmp $256,%rdx jb .L_kbits128 .L_kbits256: movl 16(%rsi),%edx movl 20(%rsi),%ecx movl 24(%rsi),%r8d movl 28(%rsi),%esi movl %edx,28(%rdi) movl %ecx,16(%rdi) movl %r8d,36(%rdi) movl %esi,56(%rdi) mov $1634760805,%rsi mov $857760878,%rdx mov $2036477234,%rcx mov $1797285236,%r8 movl %esi,0(%rdi) movl %edx,4(%rdi) movl %ecx,8(%rdi) movl %r8d,12(%rdi) jmp .L_keysetupdone .L_kbits128: movl 0(%rsi),%edx movl 4(%rsi),%ecx movl 8(%rsi),%r8d movl 12(%rsi),%esi movl %edx,28(%rdi) movl %ecx,16(%rdi) movl %r8d,36(%rdi) movl %esi,56(%rdi) mov $1634760805,%rsi mov $824206446,%rdx mov $2036477238,%rcx mov $1797285236,%r8 movl %esi,0(%rdi) movl %edx,4(%rdi) movl %ecx,8(%rdi) movl %r8d,12(%rdi) .L_keysetupdone: ret + CFI_ENDPROC(); .align 8 .globl _gcry_salsa20_amd64_ivsetup ELF(.type _gcry_salsa20_amd64_ivsetup,@function;) _gcry_salsa20_amd64_ivsetup: + CFI_STARTPROC(); movl 0(%rsi),%r8d movl 4(%rsi),%esi mov $0,%r9 mov $0,%rax movl %r8d,24(%rdi) movl %esi,44(%rdi) movl %r9d,32(%rdi) movl %eax,52(%rdi) ret + CFI_ENDPROC(); .align 8 .globl _gcry_salsa20_amd64_encrypt_blocks ELF(.type _gcry_salsa20_amd64_encrypt_blocks,@function;) _gcry_salsa20_amd64_encrypt_blocks: /* * Modifications to original implementation: * - Number of rounds passing in register %r8 (for Salsa20/12). * - Length is input as number of blocks, so don't handle tail bytes * (this is done in salsa20.c). 
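
The immediates stored by _gcry_salsa20_amd64_keysetup are easier to recognize in hex: they are the usual Salsa20 "sigma"/"tau" constants packed as little-endian 32-bit words. A compile-time check in plain C, nothing libgcrypt-specific:

/* "expand 32-byte k" for 256-bit keys, "expand 16-byte k" for 128-bit keys. */
_Static_assert(1634760805u == 0x61707865u, "expa");
_Static_assert( 857760878u == 0x3320646eu, "nd 3");
_Static_assert(2036477234u == 0x79622d32u, "2-by");
_Static_assert(1797285236u == 0x6b206574u, "te k");
_Static_assert( 824206446u == 0x3120646eu, "nd 1");
_Static_assert(2036477238u == 0x79622d36u, "6-by");
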
*/ + CFI_STARTPROC(); push %rbx + CFI_PUSH(%rbx); shlq $6, %rcx /* blocks to bytes */ mov %r8, %rbx mov %rsp,%r11 - and $31,%r11 - add $384,%r11 - sub %r11,%rsp + CFI_DEF_CFA_REGISTER(%r11); + sub $384,%rsp + and $~31,%rsp mov %rdi,%r8 mov %rsi,%rsi mov %rdx,%rdi mov %rcx,%rdx cmp $0,%rdx jbe .L_done .L_start: cmp $256,%rdx jb .L_bytes_are_64_128_or_192 movdqa 0(%r8),%xmm0 pshufd $0x55,%xmm0,%xmm1 pshufd $0xaa,%xmm0,%xmm2 pshufd $0xff,%xmm0,%xmm3 pshufd $0x00,%xmm0,%xmm0 movdqa %xmm1,0(%rsp) movdqa %xmm2,16(%rsp) movdqa %xmm3,32(%rsp) movdqa %xmm0,48(%rsp) movdqa 16(%r8),%xmm0 pshufd $0xaa,%xmm0,%xmm1 pshufd $0xff,%xmm0,%xmm2 pshufd $0x00,%xmm0,%xmm3 pshufd $0x55,%xmm0,%xmm0 movdqa %xmm1,64(%rsp) movdqa %xmm2,80(%rsp) movdqa %xmm3,96(%rsp) movdqa %xmm0,112(%rsp) movdqa 32(%r8),%xmm0 pshufd $0xff,%xmm0,%xmm1 pshufd $0x55,%xmm0,%xmm2 pshufd $0xaa,%xmm0,%xmm0 movdqa %xmm1,128(%rsp) movdqa %xmm2,144(%rsp) movdqa %xmm0,160(%rsp) movdqa 48(%r8),%xmm0 pshufd $0x00,%xmm0,%xmm1 pshufd $0xaa,%xmm0,%xmm2 pshufd $0xff,%xmm0,%xmm0 movdqa %xmm1,176(%rsp) movdqa %xmm2,192(%rsp) movdqa %xmm0,208(%rsp) .L_bytesatleast256: movl 32(%r8),%ecx movl 52(%r8),%r9d movl %ecx,224(%rsp) movl %r9d,240(%rsp) add $1,%ecx adc $0,%r9d movl %ecx,4+224(%rsp) movl %r9d,4+240(%rsp) add $1,%ecx adc $0,%r9d movl %ecx,8+224(%rsp) movl %r9d,8+240(%rsp) add $1,%ecx adc $0,%r9d movl %ecx,12+224(%rsp) movl %r9d,12+240(%rsp) add $1,%ecx adc $0,%r9d movl %ecx,32(%r8) movl %r9d,52(%r8) movq %rdx,288(%rsp) mov %rbx,%rdx movdqa 0(%rsp),%xmm0 movdqa 16(%rsp),%xmm1 movdqa 32(%rsp),%xmm2 movdqa 192(%rsp),%xmm3 movdqa 208(%rsp),%xmm4 movdqa 64(%rsp),%xmm5 movdqa 80(%rsp),%xmm6 movdqa 112(%rsp),%xmm7 movdqa 128(%rsp),%xmm8 movdqa 144(%rsp),%xmm9 movdqa 160(%rsp),%xmm10 movdqa 240(%rsp),%xmm11 movdqa 48(%rsp),%xmm12 movdqa 96(%rsp),%xmm13 movdqa 176(%rsp),%xmm14 movdqa 224(%rsp),%xmm15 .L_mainloop1: movdqa %xmm1,256(%rsp) movdqa %xmm2,272(%rsp) movdqa %xmm13,%xmm1 paddd %xmm12,%xmm1 movdqa %xmm1,%xmm2 pslld $7,%xmm1 pxor %xmm1,%xmm14 psrld $25,%xmm2 pxor %xmm2,%xmm14 movdqa %xmm7,%xmm1 paddd %xmm0,%xmm1 movdqa %xmm1,%xmm2 pslld $7,%xmm1 pxor %xmm1,%xmm11 psrld $25,%xmm2 pxor %xmm2,%xmm11 movdqa %xmm12,%xmm1 paddd %xmm14,%xmm1 movdqa %xmm1,%xmm2 pslld $9,%xmm1 pxor %xmm1,%xmm15 psrld $23,%xmm2 pxor %xmm2,%xmm15 movdqa %xmm0,%xmm1 paddd %xmm11,%xmm1 movdqa %xmm1,%xmm2 pslld $9,%xmm1 pxor %xmm1,%xmm9 psrld $23,%xmm2 pxor %xmm2,%xmm9 movdqa %xmm14,%xmm1 paddd %xmm15,%xmm1 movdqa %xmm1,%xmm2 pslld $13,%xmm1 pxor %xmm1,%xmm13 psrld $19,%xmm2 pxor %xmm2,%xmm13 movdqa %xmm11,%xmm1 paddd %xmm9,%xmm1 movdqa %xmm1,%xmm2 pslld $13,%xmm1 pxor %xmm1,%xmm7 psrld $19,%xmm2 pxor %xmm2,%xmm7 movdqa %xmm15,%xmm1 paddd %xmm13,%xmm1 movdqa %xmm1,%xmm2 pslld $18,%xmm1 pxor %xmm1,%xmm12 psrld $14,%xmm2 pxor %xmm2,%xmm12 movdqa 256(%rsp),%xmm1 movdqa %xmm12,256(%rsp) movdqa %xmm9,%xmm2 paddd %xmm7,%xmm2 movdqa %xmm2,%xmm12 pslld $18,%xmm2 pxor %xmm2,%xmm0 psrld $14,%xmm12 pxor %xmm12,%xmm0 movdqa %xmm5,%xmm2 paddd %xmm1,%xmm2 movdqa %xmm2,%xmm12 pslld $7,%xmm2 pxor %xmm2,%xmm3 psrld $25,%xmm12 pxor %xmm12,%xmm3 movdqa 272(%rsp),%xmm2 movdqa %xmm0,272(%rsp) movdqa %xmm6,%xmm0 paddd %xmm2,%xmm0 movdqa %xmm0,%xmm12 pslld $7,%xmm0 pxor %xmm0,%xmm4 psrld $25,%xmm12 pxor %xmm12,%xmm4 movdqa %xmm1,%xmm0 paddd %xmm3,%xmm0 movdqa %xmm0,%xmm12 pslld $9,%xmm0 pxor %xmm0,%xmm10 psrld $23,%xmm12 pxor %xmm12,%xmm10 movdqa %xmm2,%xmm0 paddd %xmm4,%xmm0 movdqa %xmm0,%xmm12 pslld $9,%xmm0 pxor %xmm0,%xmm8 psrld $23,%xmm12 pxor %xmm12,%xmm8 movdqa %xmm3,%xmm0 paddd %xmm10,%xmm0 
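
For orientation while reading the vector code above: each paddd / pslld / psrld / pxor group in .L_mainloop1 is one add-rotate-xor step of the Salsa20 quarter round, carried out on several independent blocks at once (one block per 32-bit SSE lane). The scalar C sketch below shows the same step under that reading; the names rotl32 and salsa20_quarter_round are illustrative only and are not part of this patch or of libgcrypt's API.

    #include <stdint.h>

    /* 32-bit left rotation; corresponds to the pslld/psrld/pxor pairs above. */
    static inline uint32_t rotl32(uint32_t v, int n)
    {
      return (v << n) | (v >> (32 - n));
    }

    /* One Salsa20 quarter round on state words a, b, c, d.  The SIMD loop
     * performs these same four steps, but on several states in parallel;
     * the rotation counts 7, 9, 13 and 18 match the shift immediates. */
    static void salsa20_quarter_round(uint32_t *a, uint32_t *b,
                                      uint32_t *c, uint32_t *d)
    {
      *b ^= rotl32(*a + *d, 7);   /* paddd; pslld $7;  psrld $25; pxor */
      *c ^= rotl32(*b + *a, 9);   /* paddd; pslld $9;  psrld $23; pxor */
      *d ^= rotl32(*c + *b, 13);  /* paddd; pslld $13; psrld $19; pxor */
      *a ^= rotl32(*d + *c, 18);  /* paddd; pslld $18; psrld $14; pxor */
    }
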
movdqa %xmm0,%xmm12 pslld $13,%xmm0 pxor %xmm0,%xmm5 psrld $19,%xmm12 pxor %xmm12,%xmm5 movdqa %xmm4,%xmm0 paddd %xmm8,%xmm0 movdqa %xmm0,%xmm12 pslld $13,%xmm0 pxor %xmm0,%xmm6 psrld $19,%xmm12 pxor %xmm12,%xmm6 movdqa %xmm10,%xmm0 paddd %xmm5,%xmm0 movdqa %xmm0,%xmm12 pslld $18,%xmm0 pxor %xmm0,%xmm1 psrld $14,%xmm12 pxor %xmm12,%xmm1 movdqa 256(%rsp),%xmm0 movdqa %xmm1,256(%rsp) movdqa %xmm4,%xmm1 paddd %xmm0,%xmm1 movdqa %xmm1,%xmm12 pslld $7,%xmm1 pxor %xmm1,%xmm7 psrld $25,%xmm12 pxor %xmm12,%xmm7 movdqa %xmm8,%xmm1 paddd %xmm6,%xmm1 movdqa %xmm1,%xmm12 pslld $18,%xmm1 pxor %xmm1,%xmm2 psrld $14,%xmm12 pxor %xmm12,%xmm2 movdqa 272(%rsp),%xmm12 movdqa %xmm2,272(%rsp) movdqa %xmm14,%xmm1 paddd %xmm12,%xmm1 movdqa %xmm1,%xmm2 pslld $7,%xmm1 pxor %xmm1,%xmm5 psrld $25,%xmm2 pxor %xmm2,%xmm5 movdqa %xmm0,%xmm1 paddd %xmm7,%xmm1 movdqa %xmm1,%xmm2 pslld $9,%xmm1 pxor %xmm1,%xmm10 psrld $23,%xmm2 pxor %xmm2,%xmm10 movdqa %xmm12,%xmm1 paddd %xmm5,%xmm1 movdqa %xmm1,%xmm2 pslld $9,%xmm1 pxor %xmm1,%xmm8 psrld $23,%xmm2 pxor %xmm2,%xmm8 movdqa %xmm7,%xmm1 paddd %xmm10,%xmm1 movdqa %xmm1,%xmm2 pslld $13,%xmm1 pxor %xmm1,%xmm4 psrld $19,%xmm2 pxor %xmm2,%xmm4 movdqa %xmm5,%xmm1 paddd %xmm8,%xmm1 movdqa %xmm1,%xmm2 pslld $13,%xmm1 pxor %xmm1,%xmm14 psrld $19,%xmm2 pxor %xmm2,%xmm14 movdqa %xmm10,%xmm1 paddd %xmm4,%xmm1 movdqa %xmm1,%xmm2 pslld $18,%xmm1 pxor %xmm1,%xmm0 psrld $14,%xmm2 pxor %xmm2,%xmm0 movdqa 256(%rsp),%xmm1 movdqa %xmm0,256(%rsp) movdqa %xmm8,%xmm0 paddd %xmm14,%xmm0 movdqa %xmm0,%xmm2 pslld $18,%xmm0 pxor %xmm0,%xmm12 psrld $14,%xmm2 pxor %xmm2,%xmm12 movdqa %xmm11,%xmm0 paddd %xmm1,%xmm0 movdqa %xmm0,%xmm2 pslld $7,%xmm0 pxor %xmm0,%xmm6 psrld $25,%xmm2 pxor %xmm2,%xmm6 movdqa 272(%rsp),%xmm2 movdqa %xmm12,272(%rsp) movdqa %xmm3,%xmm0 paddd %xmm2,%xmm0 movdqa %xmm0,%xmm12 pslld $7,%xmm0 pxor %xmm0,%xmm13 psrld $25,%xmm12 pxor %xmm12,%xmm13 movdqa %xmm1,%xmm0 paddd %xmm6,%xmm0 movdqa %xmm0,%xmm12 pslld $9,%xmm0 pxor %xmm0,%xmm15 psrld $23,%xmm12 pxor %xmm12,%xmm15 movdqa %xmm2,%xmm0 paddd %xmm13,%xmm0 movdqa %xmm0,%xmm12 pslld $9,%xmm0 pxor %xmm0,%xmm9 psrld $23,%xmm12 pxor %xmm12,%xmm9 movdqa %xmm6,%xmm0 paddd %xmm15,%xmm0 movdqa %xmm0,%xmm12 pslld $13,%xmm0 pxor %xmm0,%xmm11 psrld $19,%xmm12 pxor %xmm12,%xmm11 movdqa %xmm13,%xmm0 paddd %xmm9,%xmm0 movdqa %xmm0,%xmm12 pslld $13,%xmm0 pxor %xmm0,%xmm3 psrld $19,%xmm12 pxor %xmm12,%xmm3 movdqa %xmm15,%xmm0 paddd %xmm11,%xmm0 movdqa %xmm0,%xmm12 pslld $18,%xmm0 pxor %xmm0,%xmm1 psrld $14,%xmm12 pxor %xmm12,%xmm1 movdqa %xmm9,%xmm0 paddd %xmm3,%xmm0 movdqa %xmm0,%xmm12 pslld $18,%xmm0 pxor %xmm0,%xmm2 psrld $14,%xmm12 pxor %xmm12,%xmm2 movdqa 256(%rsp),%xmm12 movdqa 272(%rsp),%xmm0 sub $2,%rdx ja .L_mainloop1 paddd 48(%rsp),%xmm12 paddd 112(%rsp),%xmm7 paddd 160(%rsp),%xmm10 paddd 208(%rsp),%xmm4 movd %xmm12,%rdx movd %xmm7,%rcx movd %xmm10,%r9 movd %xmm4,%rax pshufd $0x39,%xmm12,%xmm12 pshufd $0x39,%xmm7,%xmm7 pshufd $0x39,%xmm10,%xmm10 pshufd $0x39,%xmm4,%xmm4 xorl 0(%rsi),%edx xorl 4(%rsi),%ecx xorl 8(%rsi),%r9d xorl 12(%rsi),%eax movl %edx,0(%rdi) movl %ecx,4(%rdi) movl %r9d,8(%rdi) movl %eax,12(%rdi) movd %xmm12,%rdx movd %xmm7,%rcx movd %xmm10,%r9 movd %xmm4,%rax pshufd $0x39,%xmm12,%xmm12 pshufd $0x39,%xmm7,%xmm7 pshufd $0x39,%xmm10,%xmm10 pshufd $0x39,%xmm4,%xmm4 xorl 64(%rsi),%edx xorl 68(%rsi),%ecx xorl 72(%rsi),%r9d xorl 76(%rsi),%eax movl %edx,64(%rdi) movl %ecx,68(%rdi) movl %r9d,72(%rdi) movl %eax,76(%rdi) movd %xmm12,%rdx movd %xmm7,%rcx movd %xmm10,%r9 movd %xmm4,%rax pshufd $0x39,%xmm12,%xmm12 pshufd 
$0x39,%xmm7,%xmm7 pshufd $0x39,%xmm10,%xmm10 pshufd $0x39,%xmm4,%xmm4 xorl 128(%rsi),%edx xorl 132(%rsi),%ecx xorl 136(%rsi),%r9d xorl 140(%rsi),%eax movl %edx,128(%rdi) movl %ecx,132(%rdi) movl %r9d,136(%rdi) movl %eax,140(%rdi) movd %xmm12,%rdx movd %xmm7,%rcx movd %xmm10,%r9 movd %xmm4,%rax xorl 192(%rsi),%edx xorl 196(%rsi),%ecx xorl 200(%rsi),%r9d xorl 204(%rsi),%eax movl %edx,192(%rdi) movl %ecx,196(%rdi) movl %r9d,200(%rdi) movl %eax,204(%rdi) paddd 176(%rsp),%xmm14 paddd 0(%rsp),%xmm0 paddd 64(%rsp),%xmm5 paddd 128(%rsp),%xmm8 movd %xmm14,%rdx movd %xmm0,%rcx movd %xmm5,%r9 movd %xmm8,%rax pshufd $0x39,%xmm14,%xmm14 pshufd $0x39,%xmm0,%xmm0 pshufd $0x39,%xmm5,%xmm5 pshufd $0x39,%xmm8,%xmm8 xorl 16(%rsi),%edx xorl 20(%rsi),%ecx xorl 24(%rsi),%r9d xorl 28(%rsi),%eax movl %edx,16(%rdi) movl %ecx,20(%rdi) movl %r9d,24(%rdi) movl %eax,28(%rdi) movd %xmm14,%rdx movd %xmm0,%rcx movd %xmm5,%r9 movd %xmm8,%rax pshufd $0x39,%xmm14,%xmm14 pshufd $0x39,%xmm0,%xmm0 pshufd $0x39,%xmm5,%xmm5 pshufd $0x39,%xmm8,%xmm8 xorl 80(%rsi),%edx xorl 84(%rsi),%ecx xorl 88(%rsi),%r9d xorl 92(%rsi),%eax movl %edx,80(%rdi) movl %ecx,84(%rdi) movl %r9d,88(%rdi) movl %eax,92(%rdi) movd %xmm14,%rdx movd %xmm0,%rcx movd %xmm5,%r9 movd %xmm8,%rax pshufd $0x39,%xmm14,%xmm14 pshufd $0x39,%xmm0,%xmm0 pshufd $0x39,%xmm5,%xmm5 pshufd $0x39,%xmm8,%xmm8 xorl 144(%rsi),%edx xorl 148(%rsi),%ecx xorl 152(%rsi),%r9d xorl 156(%rsi),%eax movl %edx,144(%rdi) movl %ecx,148(%rdi) movl %r9d,152(%rdi) movl %eax,156(%rdi) movd %xmm14,%rdx movd %xmm0,%rcx movd %xmm5,%r9 movd %xmm8,%rax xorl 208(%rsi),%edx xorl 212(%rsi),%ecx xorl 216(%rsi),%r9d xorl 220(%rsi),%eax movl %edx,208(%rdi) movl %ecx,212(%rdi) movl %r9d,216(%rdi) movl %eax,220(%rdi) paddd 224(%rsp),%xmm15 paddd 240(%rsp),%xmm11 paddd 16(%rsp),%xmm1 paddd 80(%rsp),%xmm6 movd %xmm15,%rdx movd %xmm11,%rcx movd %xmm1,%r9 movd %xmm6,%rax pshufd $0x39,%xmm15,%xmm15 pshufd $0x39,%xmm11,%xmm11 pshufd $0x39,%xmm1,%xmm1 pshufd $0x39,%xmm6,%xmm6 xorl 32(%rsi),%edx xorl 36(%rsi),%ecx xorl 40(%rsi),%r9d xorl 44(%rsi),%eax movl %edx,32(%rdi) movl %ecx,36(%rdi) movl %r9d,40(%rdi) movl %eax,44(%rdi) movd %xmm15,%rdx movd %xmm11,%rcx movd %xmm1,%r9 movd %xmm6,%rax pshufd $0x39,%xmm15,%xmm15 pshufd $0x39,%xmm11,%xmm11 pshufd $0x39,%xmm1,%xmm1 pshufd $0x39,%xmm6,%xmm6 xorl 96(%rsi),%edx xorl 100(%rsi),%ecx xorl 104(%rsi),%r9d xorl 108(%rsi),%eax movl %edx,96(%rdi) movl %ecx,100(%rdi) movl %r9d,104(%rdi) movl %eax,108(%rdi) movd %xmm15,%rdx movd %xmm11,%rcx movd %xmm1,%r9 movd %xmm6,%rax pshufd $0x39,%xmm15,%xmm15 pshufd $0x39,%xmm11,%xmm11 pshufd $0x39,%xmm1,%xmm1 pshufd $0x39,%xmm6,%xmm6 xorl 160(%rsi),%edx xorl 164(%rsi),%ecx xorl 168(%rsi),%r9d xorl 172(%rsi),%eax movl %edx,160(%rdi) movl %ecx,164(%rdi) movl %r9d,168(%rdi) movl %eax,172(%rdi) movd %xmm15,%rdx movd %xmm11,%rcx movd %xmm1,%r9 movd %xmm6,%rax xorl 224(%rsi),%edx xorl 228(%rsi),%ecx xorl 232(%rsi),%r9d xorl 236(%rsi),%eax movl %edx,224(%rdi) movl %ecx,228(%rdi) movl %r9d,232(%rdi) movl %eax,236(%rdi) paddd 96(%rsp),%xmm13 paddd 144(%rsp),%xmm9 paddd 192(%rsp),%xmm3 paddd 32(%rsp),%xmm2 movd %xmm13,%rdx movd %xmm9,%rcx movd %xmm3,%r9 movd %xmm2,%rax pshufd $0x39,%xmm13,%xmm13 pshufd $0x39,%xmm9,%xmm9 pshufd $0x39,%xmm3,%xmm3 pshufd $0x39,%xmm2,%xmm2 xorl 48(%rsi),%edx xorl 52(%rsi),%ecx xorl 56(%rsi),%r9d xorl 60(%rsi),%eax movl %edx,48(%rdi) movl %ecx,52(%rdi) movl %r9d,56(%rdi) movl %eax,60(%rdi) movd %xmm13,%rdx movd %xmm9,%rcx movd %xmm3,%r9 movd %xmm2,%rax pshufd $0x39,%xmm13,%xmm13 pshufd $0x39,%xmm9,%xmm9 pshufd 
$0x39,%xmm3,%xmm3 pshufd $0x39,%xmm2,%xmm2 xorl 112(%rsi),%edx xorl 116(%rsi),%ecx xorl 120(%rsi),%r9d xorl 124(%rsi),%eax movl %edx,112(%rdi) movl %ecx,116(%rdi) movl %r9d,120(%rdi) movl %eax,124(%rdi) movd %xmm13,%rdx movd %xmm9,%rcx movd %xmm3,%r9 movd %xmm2,%rax pshufd $0x39,%xmm13,%xmm13 pshufd $0x39,%xmm9,%xmm9 pshufd $0x39,%xmm3,%xmm3 pshufd $0x39,%xmm2,%xmm2 xorl 176(%rsi),%edx xorl 180(%rsi),%ecx xorl 184(%rsi),%r9d xorl 188(%rsi),%eax movl %edx,176(%rdi) movl %ecx,180(%rdi) movl %r9d,184(%rdi) movl %eax,188(%rdi) movd %xmm13,%rdx movd %xmm9,%rcx movd %xmm3,%r9 movd %xmm2,%rax xorl 240(%rsi),%edx xorl 244(%rsi),%ecx xorl 248(%rsi),%r9d xorl 252(%rsi),%eax movl %edx,240(%rdi) movl %ecx,244(%rdi) movl %r9d,248(%rdi) movl %eax,252(%rdi) movq 288(%rsp),%rdx sub $256,%rdx add $256,%rsi add $256,%rdi cmp $256,%rdx jae .L_bytesatleast256 cmp $0,%rdx jbe .L_done .L_bytes_are_64_128_or_192: movq %rdx,288(%rsp) movdqa 0(%r8),%xmm0 movdqa 16(%r8),%xmm1 movdqa 32(%r8),%xmm2 movdqa 48(%r8),%xmm3 movdqa %xmm1,%xmm4 mov %rbx,%rdx .L_mainloop2: paddd %xmm0,%xmm4 movdqa %xmm0,%xmm5 movdqa %xmm4,%xmm6 pslld $7,%xmm4 psrld $25,%xmm6 pxor %xmm4,%xmm3 pxor %xmm6,%xmm3 paddd %xmm3,%xmm5 movdqa %xmm3,%xmm4 movdqa %xmm5,%xmm6 pslld $9,%xmm5 psrld $23,%xmm6 pxor %xmm5,%xmm2 pshufd $0x93,%xmm3,%xmm3 pxor %xmm6,%xmm2 paddd %xmm2,%xmm4 movdqa %xmm2,%xmm5 movdqa %xmm4,%xmm6 pslld $13,%xmm4 psrld $19,%xmm6 pxor %xmm4,%xmm1 pshufd $0x4e,%xmm2,%xmm2 pxor %xmm6,%xmm1 paddd %xmm1,%xmm5 movdqa %xmm3,%xmm4 movdqa %xmm5,%xmm6 pslld $18,%xmm5 psrld $14,%xmm6 pxor %xmm5,%xmm0 pshufd $0x39,%xmm1,%xmm1 pxor %xmm6,%xmm0 paddd %xmm0,%xmm4 movdqa %xmm0,%xmm5 movdqa %xmm4,%xmm6 pslld $7,%xmm4 psrld $25,%xmm6 pxor %xmm4,%xmm1 pxor %xmm6,%xmm1 paddd %xmm1,%xmm5 movdqa %xmm1,%xmm4 movdqa %xmm5,%xmm6 pslld $9,%xmm5 psrld $23,%xmm6 pxor %xmm5,%xmm2 pshufd $0x93,%xmm1,%xmm1 pxor %xmm6,%xmm2 paddd %xmm2,%xmm4 movdqa %xmm2,%xmm5 movdqa %xmm4,%xmm6 pslld $13,%xmm4 psrld $19,%xmm6 pxor %xmm4,%xmm3 pshufd $0x4e,%xmm2,%xmm2 pxor %xmm6,%xmm3 paddd %xmm3,%xmm5 movdqa %xmm1,%xmm4 movdqa %xmm5,%xmm6 pslld $18,%xmm5 psrld $14,%xmm6 pxor %xmm5,%xmm0 pshufd $0x39,%xmm3,%xmm3 pxor %xmm6,%xmm0 paddd %xmm0,%xmm4 movdqa %xmm0,%xmm5 movdqa %xmm4,%xmm6 pslld $7,%xmm4 psrld $25,%xmm6 pxor %xmm4,%xmm3 pxor %xmm6,%xmm3 paddd %xmm3,%xmm5 movdqa %xmm3,%xmm4 movdqa %xmm5,%xmm6 pslld $9,%xmm5 psrld $23,%xmm6 pxor %xmm5,%xmm2 pshufd $0x93,%xmm3,%xmm3 pxor %xmm6,%xmm2 paddd %xmm2,%xmm4 movdqa %xmm2,%xmm5 movdqa %xmm4,%xmm6 pslld $13,%xmm4 psrld $19,%xmm6 pxor %xmm4,%xmm1 pshufd $0x4e,%xmm2,%xmm2 pxor %xmm6,%xmm1 paddd %xmm1,%xmm5 movdqa %xmm3,%xmm4 movdqa %xmm5,%xmm6 pslld $18,%xmm5 psrld $14,%xmm6 pxor %xmm5,%xmm0 pshufd $0x39,%xmm1,%xmm1 pxor %xmm6,%xmm0 paddd %xmm0,%xmm4 movdqa %xmm0,%xmm5 movdqa %xmm4,%xmm6 pslld $7,%xmm4 psrld $25,%xmm6 pxor %xmm4,%xmm1 pxor %xmm6,%xmm1 paddd %xmm1,%xmm5 movdqa %xmm1,%xmm4 movdqa %xmm5,%xmm6 pslld $9,%xmm5 psrld $23,%xmm6 pxor %xmm5,%xmm2 pshufd $0x93,%xmm1,%xmm1 pxor %xmm6,%xmm2 paddd %xmm2,%xmm4 movdqa %xmm2,%xmm5 movdqa %xmm4,%xmm6 pslld $13,%xmm4 psrld $19,%xmm6 pxor %xmm4,%xmm3 pshufd $0x4e,%xmm2,%xmm2 pxor %xmm6,%xmm3 sub $4,%rdx paddd %xmm3,%xmm5 movdqa %xmm1,%xmm4 movdqa %xmm5,%xmm6 pslld $18,%xmm5 pxor %xmm7,%xmm7 psrld $14,%xmm6 pxor %xmm5,%xmm0 pshufd $0x39,%xmm3,%xmm3 pxor %xmm6,%xmm0 ja .L_mainloop2 paddd 0(%r8),%xmm0 paddd 16(%r8),%xmm1 paddd 32(%r8),%xmm2 paddd 48(%r8),%xmm3 movd %xmm0,%rdx movd %xmm1,%rcx movd %xmm2,%rax movd %xmm3,%r10 pshufd $0x39,%xmm0,%xmm0 pshufd $0x39,%xmm1,%xmm1 pshufd 
$0x39,%xmm2,%xmm2 pshufd $0x39,%xmm3,%xmm3 xorl 0(%rsi),%edx xorl 48(%rsi),%ecx xorl 32(%rsi),%eax xorl 16(%rsi),%r10d movl %edx,0(%rdi) movl %ecx,48(%rdi) movl %eax,32(%rdi) movl %r10d,16(%rdi) movd %xmm0,%rdx movd %xmm1,%rcx movd %xmm2,%rax movd %xmm3,%r10 pshufd $0x39,%xmm0,%xmm0 pshufd $0x39,%xmm1,%xmm1 pshufd $0x39,%xmm2,%xmm2 pshufd $0x39,%xmm3,%xmm3 xorl 20(%rsi),%edx xorl 4(%rsi),%ecx xorl 52(%rsi),%eax xorl 36(%rsi),%r10d movl %edx,20(%rdi) movl %ecx,4(%rdi) movl %eax,52(%rdi) movl %r10d,36(%rdi) movd %xmm0,%rdx movd %xmm1,%rcx movd %xmm2,%rax movd %xmm3,%r10 pshufd $0x39,%xmm0,%xmm0 pshufd $0x39,%xmm1,%xmm1 pshufd $0x39,%xmm2,%xmm2 pshufd $0x39,%xmm3,%xmm3 xorl 40(%rsi),%edx xorl 24(%rsi),%ecx xorl 8(%rsi),%eax xorl 56(%rsi),%r10d movl %edx,40(%rdi) movl %ecx,24(%rdi) movl %eax,8(%rdi) movl %r10d,56(%rdi) movd %xmm0,%rdx movd %xmm1,%rcx movd %xmm2,%rax movd %xmm3,%r10 xorl 60(%rsi),%edx xorl 44(%rsi),%ecx xorl 28(%rsi),%eax xorl 12(%rsi),%r10d movl %edx,60(%rdi) movl %ecx,44(%rdi) movl %eax,28(%rdi) movl %r10d,12(%rdi) movq 288(%rsp),%rdx movl 32(%r8),%ecx movl 52(%r8),%eax add $1,%ecx adc $0,%eax movl %ecx,32(%r8) movl %eax,52(%r8) cmp $64,%rdx ja .L_bytes_are_128_or_192 .L_done: - add %r11,%rsp + CFI_REMEMBER_STATE(); mov %r11,%rax + sub %rsp,%rax + mov %r11,%rsp + CFI_REGISTER(%r11, %rsp) + CFI_DEF_CFA_REGISTER(%rsp) pop %rbx + CFI_POP(%rbx) ret + CFI_RESTORE_STATE(); .L_bytes_are_128_or_192: sub $64,%rdx add $64,%rdi add $64,%rsi jmp .L_bytes_are_64_128_or_192 + CFI_ENDPROC(); ELF(.size _gcry_salsa20_amd64_encrypt_blocks,.-_gcry_salsa20_amd64_encrypt_blocks;) #endif /*defined(USE_SALSA20)*/ #endif /*__x86_64*/ diff --git a/cipher/serpent-avx2-amd64.S b/cipher/serpent-avx2-amd64.S index 8d60a159..9b17c2bd 100644 --- a/cipher/serpent-avx2-amd64.S +++ b/cipher/serpent-avx2-amd64.S @@ -1,1123 +1,1159 @@ /* serpent-avx2-amd64.S - AVX2 implementation of Serpent cipher * * Copyright (C) 2013-2015 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_SERPENT) && \ defined(ENABLE_AVX2_SUPPORT) -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif - -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif +#include "asm-common-amd64.h" /* struct serpent_context: */ #define ctx_keys 0 /* register macros */ #define CTX %rdi /* vector registers */ #define RA0 %ymm0 #define RA1 %ymm1 #define RA2 %ymm2 #define RA3 %ymm3 #define RA4 %ymm4 #define RB0 %ymm5 #define RB1 %ymm6 #define RB2 %ymm7 #define RB3 %ymm8 #define RB4 %ymm9 #define RNOT %ymm10 #define RTMP0 %ymm11 #define RTMP1 %ymm12 #define RTMP2 %ymm13 #define RTMP3 %ymm14 #define RTMP4 %ymm15 #define RNOTx %xmm10 #define RTMP0x %xmm11 #define RTMP1x %xmm12 #define RTMP2x %xmm13 #define RTMP3x %xmm14 #define RTMP4x %xmm15 /********************************************************************** helper macros **********************************************************************/ /* vector 32-bit rotation to left */ #define vec_rol(reg, nleft, tmp) \ vpslld $(nleft), reg, tmp; \ vpsrld $(32 - (nleft)), reg, reg; \ vpor tmp, reg, reg; /* vector 32-bit rotation to right */ #define vec_ror(reg, nright, tmp) \ vec_rol(reg, 32 - nright, tmp) /* 4x4 32-bit integer matrix transpose */ #define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \ vpunpckhdq x1, x0, t2; \ vpunpckldq x1, x0, x0; \ \ vpunpckldq x3, x2, t1; \ vpunpckhdq x3, x2, x2; \ \ vpunpckhqdq t1, x0, x1; \ vpunpcklqdq t1, x0, x0; \ \ vpunpckhqdq x2, t2, x3; \ vpunpcklqdq x2, t2, x2; /********************************************************************** 16-way serpent **********************************************************************/ /* * These are the S-Boxes of Serpent from following research paper. * * D. A. Osvik, “Speeding up Serpent,” in Third AES Candidate Conference, * (New York, New York, USA), p. 317–329, National Institute of Standards and * Technology, 2000. 
* * Paper is also available at: http://www.ii.uib.no/~osvik/pub/aes3.pdf * */ #define SBOX0(r0, r1, r2, r3, r4) \ vpxor r0, r3, r3; vmovdqa r1, r4; \ vpand r3, r1, r1; vpxor r2, r4, r4; \ vpxor r0, r1, r1; vpor r3, r0, r0; \ vpxor r4, r0, r0; vpxor r3, r4, r4; \ vpxor r2, r3, r3; vpor r1, r2, r2; \ vpxor r4, r2, r2; vpxor RNOT, r4, r4; \ vpor r1, r4, r4; vpxor r3, r1, r1; \ vpxor r4, r1, r1; vpor r0, r3, r3; \ vpxor r3, r1, r1; vpxor r3, r4, r4; #define SBOX0_INVERSE(r0, r1, r2, r3, r4) \ vpxor RNOT, r2, r2; vmovdqa r1, r4; \ vpor r0, r1, r1; vpxor RNOT, r4, r4; \ vpxor r2, r1, r1; vpor r4, r2, r2; \ vpxor r3, r1, r1; vpxor r4, r0, r0; \ vpxor r0, r2, r2; vpand r3, r0, r0; \ vpxor r0, r4, r4; vpor r1, r0, r0; \ vpxor r2, r0, r0; vpxor r4, r3, r3; \ vpxor r1, r2, r2; vpxor r0, r3, r3; \ vpxor r1, r3, r3; \ vpand r3, r2, r2; \ vpxor r2, r4, r4; #define SBOX1(r0, r1, r2, r3, r4) \ vpxor RNOT, r0, r0; vpxor RNOT, r2, r2; \ vmovdqa r0, r4; vpand r1, r0, r0; \ vpxor r0, r2, r2; vpor r3, r0, r0; \ vpxor r2, r3, r3; vpxor r0, r1, r1; \ vpxor r4, r0, r0; vpor r1, r4, r4; \ vpxor r3, r1, r1; vpor r0, r2, r2; \ vpand r4, r2, r2; vpxor r1, r0, r0; \ vpand r2, r1, r1; \ vpxor r0, r1, r1; vpand r2, r0, r0; \ vpxor r4, r0, r0; #define SBOX1_INVERSE(r0, r1, r2, r3, r4) \ vmovdqa r1, r4; vpxor r3, r1, r1; \ vpand r1, r3, r3; vpxor r2, r4, r4; \ vpxor r0, r3, r3; vpor r1, r0, r0; \ vpxor r3, r2, r2; vpxor r4, r0, r0; \ vpor r2, r0, r0; vpxor r3, r1, r1; \ vpxor r1, r0, r0; vpor r3, r1, r1; \ vpxor r0, r1, r1; vpxor RNOT, r4, r4; \ vpxor r1, r4, r4; vpor r0, r1, r1; \ vpxor r0, r1, r1; \ vpor r4, r1, r1; \ vpxor r1, r3, r3; #define SBOX2(r0, r1, r2, r3, r4) \ vmovdqa r0, r4; vpand r2, r0, r0; \ vpxor r3, r0, r0; vpxor r1, r2, r2; \ vpxor r0, r2, r2; vpor r4, r3, r3; \ vpxor r1, r3, r3; vpxor r2, r4, r4; \ vmovdqa r3, r1; vpor r4, r3, r3; \ vpxor r0, r3, r3; vpand r1, r0, r0; \ vpxor r0, r4, r4; vpxor r3, r1, r1; \ vpxor r4, r1, r1; vpxor RNOT, r4, r4; #define SBOX2_INVERSE(r0, r1, r2, r3, r4) \ vpxor r3, r2, r2; vpxor r0, r3, r3; \ vmovdqa r3, r4; vpand r2, r3, r3; \ vpxor r1, r3, r3; vpor r2, r1, r1; \ vpxor r4, r1, r1; vpand r3, r4, r4; \ vpxor r3, r2, r2; vpand r0, r4, r4; \ vpxor r2, r4, r4; vpand r1, r2, r2; \ vpor r0, r2, r2; vpxor RNOT, r3, r3; \ vpxor r3, r2, r2; vpxor r3, r0, r0; \ vpand r1, r0, r0; vpxor r4, r3, r3; \ vpxor r0, r3, r3; #define SBOX3(r0, r1, r2, r3, r4) \ vmovdqa r0, r4; vpor r3, r0, r0; \ vpxor r1, r3, r3; vpand r4, r1, r1; \ vpxor r2, r4, r4; vpxor r3, r2, r2; \ vpand r0, r3, r3; vpor r1, r4, r4; \ vpxor r4, r3, r3; vpxor r1, r0, r0; \ vpand r0, r4, r4; vpxor r3, r1, r1; \ vpxor r2, r4, r4; vpor r0, r1, r1; \ vpxor r2, r1, r1; vpxor r3, r0, r0; \ vmovdqa r1, r2; vpor r3, r1, r1; \ vpxor r0, r1, r1; #define SBOX3_INVERSE(r0, r1, r2, r3, r4) \ vmovdqa r2, r4; vpxor r1, r2, r2; \ vpxor r2, r0, r0; vpand r2, r4, r4; \ vpxor r0, r4, r4; vpand r1, r0, r0; \ vpxor r3, r1, r1; vpor r4, r3, r3; \ vpxor r3, r2, r2; vpxor r3, r0, r0; \ vpxor r4, r1, r1; vpand r2, r3, r3; \ vpxor r1, r3, r3; vpxor r0, r1, r1; \ vpor r2, r1, r1; vpxor r3, r0, r0; \ vpxor r4, r1, r1; \ vpxor r1, r0, r0; #define SBOX4(r0, r1, r2, r3, r4) \ vpxor r3, r1, r1; vpxor RNOT, r3, r3; \ vpxor r3, r2, r2; vpxor r0, r3, r3; \ vmovdqa r1, r4; vpand r3, r1, r1; \ vpxor r2, r1, r1; vpxor r3, r4, r4; \ vpxor r4, r0, r0; vpand r4, r2, r2; \ vpxor r0, r2, r2; vpand r1, r0, r0; \ vpxor r0, r3, r3; vpor r1, r4, r4; \ vpxor r0, r4, r4; vpor r3, r0, r0; \ vpxor r2, r0, r0; vpand r3, r2, r2; \ vpxor RNOT, r0, r0; vpxor r2, r4, r4; 
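
The SBOX macros in this file are the bitsliced Serpent S-boxes from the Osvik paper cited above: each one evaluates the 4-bit S-box as a fixed sequence of XOR/AND/OR/NOT operations on five registers, so every 32-bit lane carries a slice of an independent block and the RA*/RB* register groups together substitute the sixteen parallel blocks. As a reading aid, here is a direct scalar C transliteration of SBOX0 (AT&T AVX operand order is "op src2, src1, dst", i.e. dst = src1 op src2, and RNOT holds all-ones, so XOR with it is a bitwise NOT). The function name serpent_sbox0 is illustrative only; note also that the ROUND macros hand the results on in a permuted register order (the na0..na4 arguments).

    #include <stdint.h>

    /* Scalar transliteration of the SBOX0 macro above.  Each uint32_t
     * stands for one 32-bit lane of the corresponding ymm register;
     * "~" replaces the XOR with the all-ones register RNOT. */
    static void serpent_sbox0(uint32_t *r0, uint32_t *r1, uint32_t *r2,
                              uint32_t *r3, uint32_t *r4)
    {
      *r3 ^= *r0;  *r4  = *r1;
      *r1 &= *r3;  *r4 ^= *r2;
      *r1 ^= *r0;  *r0 |= *r3;
      *r0 ^= *r4;  *r4 ^= *r3;
      *r3 ^= *r2;  *r2 |= *r1;
      *r2 ^= *r4;  *r4  = ~*r4;
      *r4 |= *r1;  *r1 ^= *r3;
      *r1 ^= *r4;  *r3 |= *r0;
      *r1 ^= *r3;  *r4 ^= *r3;
    }
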
#define SBOX4_INVERSE(r0, r1, r2, r3, r4) \ vmovdqa r2, r4; vpand r3, r2, r2; \ vpxor r1, r2, r2; vpor r3, r1, r1; \ vpand r0, r1, r1; vpxor r2, r4, r4; \ vpxor r1, r4, r4; vpand r2, r1, r1; \ vpxor RNOT, r0, r0; vpxor r4, r3, r3; \ vpxor r3, r1, r1; vpand r0, r3, r3; \ vpxor r2, r3, r3; vpxor r1, r0, r0; \ vpand r0, r2, r2; vpxor r0, r3, r3; \ vpxor r4, r2, r2; \ vpor r3, r2, r2; vpxor r0, r3, r3; \ vpxor r1, r2, r2; #define SBOX5(r0, r1, r2, r3, r4) \ vpxor r1, r0, r0; vpxor r3, r1, r1; \ vpxor RNOT, r3, r3; vmovdqa r1, r4; \ vpand r0, r1, r1; vpxor r3, r2, r2; \ vpxor r2, r1, r1; vpor r4, r2, r2; \ vpxor r3, r4, r4; vpand r1, r3, r3; \ vpxor r0, r3, r3; vpxor r1, r4, r4; \ vpxor r2, r4, r4; vpxor r0, r2, r2; \ vpand r3, r0, r0; vpxor RNOT, r2, r2; \ vpxor r4, r0, r0; vpor r3, r4, r4; \ vpxor r4, r2, r2; #define SBOX5_INVERSE(r0, r1, r2, r3, r4) \ vpxor RNOT, r1, r1; vmovdqa r3, r4; \ vpxor r1, r2, r2; vpor r0, r3, r3; \ vpxor r2, r3, r3; vpor r1, r2, r2; \ vpand r0, r2, r2; vpxor r3, r4, r4; \ vpxor r4, r2, r2; vpor r0, r4, r4; \ vpxor r1, r4, r4; vpand r2, r1, r1; \ vpxor r3, r1, r1; vpxor r2, r4, r4; \ vpand r4, r3, r3; vpxor r1, r4, r4; \ vpxor r4, r3, r3; vpxor RNOT, r4, r4; \ vpxor r0, r3, r3; #define SBOX6(r0, r1, r2, r3, r4) \ vpxor RNOT, r2, r2; vmovdqa r3, r4; \ vpand r0, r3, r3; vpxor r4, r0, r0; \ vpxor r2, r3, r3; vpor r4, r2, r2; \ vpxor r3, r1, r1; vpxor r0, r2, r2; \ vpor r1, r0, r0; vpxor r1, r2, r2; \ vpxor r0, r4, r4; vpor r3, r0, r0; \ vpxor r2, r0, r0; vpxor r3, r4, r4; \ vpxor r0, r4, r4; vpxor RNOT, r3, r3; \ vpand r4, r2, r2; \ vpxor r3, r2, r2; #define SBOX6_INVERSE(r0, r1, r2, r3, r4) \ vpxor r2, r0, r0; vmovdqa r2, r4; \ vpand r0, r2, r2; vpxor r3, r4, r4; \ vpxor RNOT, r2, r2; vpxor r1, r3, r3; \ vpxor r3, r2, r2; vpor r0, r4, r4; \ vpxor r2, r0, r0; vpxor r4, r3, r3; \ vpxor r1, r4, r4; vpand r3, r1, r1; \ vpxor r0, r1, r1; vpxor r3, r0, r0; \ vpor r2, r0, r0; vpxor r1, r3, r3; \ vpxor r0, r4, r4; #define SBOX7(r0, r1, r2, r3, r4) \ vmovdqa r1, r4; vpor r2, r1, r1; \ vpxor r3, r1, r1; vpxor r2, r4, r4; \ vpxor r1, r2, r2; vpor r4, r3, r3; \ vpand r0, r3, r3; vpxor r2, r4, r4; \ vpxor r1, r3, r3; vpor r4, r1, r1; \ vpxor r0, r1, r1; vpor r4, r0, r0; \ vpxor r2, r0, r0; vpxor r4, r1, r1; \ vpxor r1, r2, r2; vpand r0, r1, r1; \ vpxor r4, r1, r1; vpxor RNOT, r2, r2; \ vpor r0, r2, r2; \ vpxor r2, r4, r4; #define SBOX7_INVERSE(r0, r1, r2, r3, r4) \ vmovdqa r2, r4; vpxor r0, r2, r2; \ vpand r3, r0, r0; vpor r3, r4, r4; \ vpxor RNOT, r2, r2; vpxor r1, r3, r3; \ vpor r0, r1, r1; vpxor r2, r0, r0; \ vpand r4, r2, r2; vpand r4, r3, r3; \ vpxor r2, r1, r1; vpxor r0, r2, r2; \ vpor r2, r0, r0; vpxor r1, r4, r4; \ vpxor r3, r0, r0; vpxor r4, r3, r3; \ vpor r0, r4, r4; vpxor r2, r3, r3; \ vpxor r2, r4, r4; /* Apply SBOX number WHICH to to the block. */ #define SBOX(which, r0, r1, r2, r3, r4) \ SBOX##which (r0, r1, r2, r3, r4) /* Apply inverse SBOX number WHICH to to the block. */ #define SBOX_INVERSE(which, r0, r1, r2, r3, r4) \ SBOX##which##_INVERSE (r0, r1, r2, r3, r4) /* XOR round key into block state in r0,r1,r2,r3. r4 used as temporary. */ #define BLOCK_XOR_KEY(r0, r1, r2, r3, r4, round) \ vpbroadcastd (ctx_keys + (round) * 16 + 0 * 4)(CTX), r4; \ vpxor r4, r0, r0; \ vpbroadcastd (ctx_keys + (round) * 16 + 1 * 4)(CTX), r4; \ vpxor r4, r1, r1; \ vpbroadcastd (ctx_keys + (round) * 16 + 2 * 4)(CTX), r4; \ vpxor r4, r2, r2; \ vpbroadcastd (ctx_keys + (round) * 16 + 3 * 4)(CTX), r4; \ vpxor r4, r3, r3; /* Apply the linear transformation to BLOCK. 
*/ #define LINEAR_TRANSFORMATION(r0, r1, r2, r3, r4) \ vec_rol(r0, 13, r4); \ vec_rol(r2, 3, r4); \ vpxor r0, r1, r1; \ vpxor r2, r1, r1; \ vpslld $3, r0, r4; \ vpxor r2, r3, r3; \ vpxor r4, r3, r3; \ vec_rol(r1, 1, r4); \ vec_rol(r3, 7, r4); \ vpxor r1, r0, r0; \ vpxor r3, r0, r0; \ vpslld $7, r1, r4; \ vpxor r3, r2, r2; \ vpxor r4, r2, r2; \ vec_rol(r0, 5, r4); \ vec_rol(r2, 22, r4); /* Apply the inverse linear transformation to BLOCK. */ #define LINEAR_TRANSFORMATION_INVERSE(r0, r1, r2, r3, r4) \ vec_ror(r2, 22, r4); \ vec_ror(r0, 5, r4); \ vpslld $7, r1, r4; \ vpxor r3, r2, r2; \ vpxor r4, r2, r2; \ vpxor r1, r0, r0; \ vpxor r3, r0, r0; \ vec_ror(r3, 7, r4); \ vec_ror(r1, 1, r4); \ vpslld $3, r0, r4; \ vpxor r2, r3, r3; \ vpxor r4, r3, r3; \ vpxor r0, r1, r1; \ vpxor r2, r1, r1; \ vec_ror(r2, 3, r4); \ vec_ror(r0, 13, r4); /* Apply a Serpent round to sixteen parallel blocks. This macro increments `round'. */ #define ROUND(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ SBOX (which, a0, a1, a2, a3, a4); \ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ SBOX (which, b0, b1, b2, b3, b4); \ LINEAR_TRANSFORMATION (na0, na1, na2, na3, na4); \ LINEAR_TRANSFORMATION (nb0, nb1, nb2, nb3, nb4); /* Apply the last Serpent round to sixteen parallel blocks. This macro increments `round'. */ #define ROUND_LAST(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ SBOX (which, a0, a1, a2, a3, a4); \ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ SBOX (which, b0, b1, b2, b3, b4); \ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, ((round) + 1)); \ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, ((round) + 1)); /* Apply an inverse Serpent round to sixteen parallel blocks. This macro increments `round'. */ #define ROUND_INVERSE(round, which, a0, a1, a2, a3, a4, \ na0, na1, na2, na3, na4, \ b0, b1, b2, b3, b4, \ nb0, nb1, nb2, nb3, nb4) \ LINEAR_TRANSFORMATION_INVERSE (a0, a1, a2, a3, a4); \ LINEAR_TRANSFORMATION_INVERSE (b0, b1, b2, b3, b4); \ SBOX_INVERSE (which, a0, a1, a2, a3, a4); \ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round); \ SBOX_INVERSE (which, b0, b1, b2, b3, b4); \ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round); /* Apply the first inverse Serpent round to sixteen parallel blocks. This macro increments `round'. 
*/ #define ROUND_FIRST_INVERSE(round, which, a0, a1, a2, a3, a4, \ na0, na1, na2, na3, na4, \ b0, b1, b2, b3, b4, \ nb0, nb1, nb2, nb3, nb4) \ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, ((round) + 1)); \ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, ((round) + 1)); \ SBOX_INVERSE (which, a0, a1, a2, a3, a4); \ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round); \ SBOX_INVERSE (which, b0, b1, b2, b3, b4); \ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round); .text .align 8 ELF(.type __serpent_enc_blk16,@function;) __serpent_enc_blk16: /* input: * %rdi: ctx, CTX * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel * plaintext blocks * output: * RA4, RA1, RA2, RA0, RB4, RB1, RB2, RB0: sixteen parallel * ciphertext blocks */ + CFI_STARTPROC(); vpcmpeqd RNOT, RNOT, RNOT; transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1); transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); ROUND (0, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3, RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3); ROUND (1, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3, RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3); ROUND (2, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2, RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2); ROUND (3, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0, RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0); ROUND (4, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3, RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3); ROUND (5, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3, RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3); ROUND (6, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4, RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4); ROUND (7, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3, RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3); ROUND (8, 0, RA4, RA1, RA2, RA0, RA3, RA1, RA3, RA2, RA4, RA0, RB4, RB1, RB2, RB0, RB3, RB1, RB3, RB2, RB4, RB0); ROUND (9, 1, RA1, RA3, RA2, RA4, RA0, RA2, RA1, RA4, RA3, RA0, RB1, RB3, RB2, RB4, RB0, RB2, RB1, RB4, RB3, RB0); ROUND (10, 2, RA2, RA1, RA4, RA3, RA0, RA4, RA3, RA1, RA0, RA2, RB2, RB1, RB4, RB3, RB0, RB4, RB3, RB1, RB0, RB2); ROUND (11, 3, RA4, RA3, RA1, RA0, RA2, RA3, RA1, RA0, RA2, RA4, RB4, RB3, RB1, RB0, RB2, RB3, RB1, RB0, RB2, RB4); ROUND (12, 4, RA3, RA1, RA0, RA2, RA4, RA1, RA4, RA3, RA2, RA0, RB3, RB1, RB0, RB2, RB4, RB1, RB4, RB3, RB2, RB0); ROUND (13, 5, RA1, RA4, RA3, RA2, RA0, RA4, RA2, RA1, RA3, RA0, RB1, RB4, RB3, RB2, RB0, RB4, RB2, RB1, RB3, RB0); ROUND (14, 6, RA4, RA2, RA1, RA3, RA0, RA4, RA2, RA0, RA1, RA3, RB4, RB2, RB1, RB3, RB0, RB4, RB2, RB0, RB1, RB3); ROUND (15, 7, RA4, RA2, RA0, RA1, RA3, RA3, RA1, RA2, RA4, RA0, RB4, RB2, RB0, RB1, RB3, RB3, RB1, RB2, RB4, RB0); ROUND (16, 0, RA3, RA1, RA2, RA4, RA0, RA1, RA0, RA2, RA3, RA4, RB3, RB1, RB2, RB4, RB0, RB1, RB0, RB2, RB3, RB4); ROUND (17, 1, RA1, RA0, RA2, RA3, RA4, RA2, RA1, RA3, RA0, RA4, RB1, RB0, RB2, RB3, RB4, RB2, RB1, RB3, RB0, RB4); ROUND (18, 2, RA2, RA1, RA3, RA0, RA4, RA3, RA0, RA1, RA4, RA2, RB2, RB1, RB3, RB0, RB4, RB3, RB0, RB1, RB4, RB2); ROUND (19, 3, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA4, RA2, RA3, RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB4, RB2, RB3); ROUND (20, 4, RA0, RA1, RA4, RA2, RA3, RA1, RA3, RA0, RA2, RA4, RB0, RB1, RB4, RB2, RB3, RB1, RB3, RB0, RB2, RB4); ROUND (21, 5, RA1, RA3, RA0, RA2, RA4, RA3, RA2, RA1, RA0, RA4, RB1, RB3, RB0, RB2, RB4, RB3, RB2, RB1, RB0, RB4); ROUND (22, 6, RA3, RA2, RA1, RA0, RA4, RA3, RA2, RA4, RA1, RA0, RB3, RB2, RB1, RB0, RB4, RB3, RB2, RB4, RB1, RB0); ROUND (23, 7, RA3, RA2, RA4, RA1, RA0, RA0, RA1, RA2, 
RA3, RA4, RB3, RB2, RB4, RB1, RB0, RB0, RB1, RB2, RB3, RB4); ROUND (24, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3, RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3); ROUND (25, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3, RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3); ROUND (26, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2, RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2); ROUND (27, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0, RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0); ROUND (28, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3, RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3); ROUND (29, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3, RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3); ROUND (30, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4, RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4); ROUND_LAST (31, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3, RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3); transpose_4x4(RA4, RA1, RA2, RA0, RA3, RTMP0, RTMP1); transpose_4x4(RB4, RB1, RB2, RB0, RB3, RTMP0, RTMP1); ret; + CFI_ENDPROC(); ELF(.size __serpent_enc_blk16,.-__serpent_enc_blk16;) .align 8 ELF(.type __serpent_dec_blk16,@function;) __serpent_dec_blk16: /* input: * %rdi: ctx, CTX * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel * ciphertext blocks * output: * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel * plaintext blocks */ + CFI_STARTPROC(); vpcmpeqd RNOT, RNOT, RNOT; transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1); transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); ROUND_FIRST_INVERSE (31, 7, RA0, RA1, RA2, RA3, RA4, RA3, RA0, RA1, RA4, RA2, RB0, RB1, RB2, RB3, RB4, RB3, RB0, RB1, RB4, RB2); ROUND_INVERSE (30, 6, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA2, RA4, RA3, RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB2, RB4, RB3); ROUND_INVERSE (29, 5, RA0, RA1, RA2, RA4, RA3, RA1, RA3, RA4, RA2, RA0, RB0, RB1, RB2, RB4, RB3, RB1, RB3, RB4, RB2, RB0); ROUND_INVERSE (28, 4, RA1, RA3, RA4, RA2, RA0, RA1, RA2, RA4, RA0, RA3, RB1, RB3, RB4, RB2, RB0, RB1, RB2, RB4, RB0, RB3); ROUND_INVERSE (27, 3, RA1, RA2, RA4, RA0, RA3, RA4, RA2, RA0, RA1, RA3, RB1, RB2, RB4, RB0, RB3, RB4, RB2, RB0, RB1, RB3); ROUND_INVERSE (26, 2, RA4, RA2, RA0, RA1, RA3, RA2, RA3, RA0, RA1, RA4, RB4, RB2, RB0, RB1, RB3, RB2, RB3, RB0, RB1, RB4); ROUND_INVERSE (25, 1, RA2, RA3, RA0, RA1, RA4, RA4, RA2, RA1, RA0, RA3, RB2, RB3, RB0, RB1, RB4, RB4, RB2, RB1, RB0, RB3); ROUND_INVERSE (24, 0, RA4, RA2, RA1, RA0, RA3, RA4, RA3, RA2, RA0, RA1, RB4, RB2, RB1, RB0, RB3, RB4, RB3, RB2, RB0, RB1); ROUND_INVERSE (23, 7, RA4, RA3, RA2, RA0, RA1, RA0, RA4, RA3, RA1, RA2, RB4, RB3, RB2, RB0, RB1, RB0, RB4, RB3, RB1, RB2); ROUND_INVERSE (22, 6, RA0, RA4, RA3, RA1, RA2, RA4, RA3, RA2, RA1, RA0, RB0, RB4, RB3, RB1, RB2, RB4, RB3, RB2, RB1, RB0); ROUND_INVERSE (21, 5, RA4, RA3, RA2, RA1, RA0, RA3, RA0, RA1, RA2, RA4, RB4, RB3, RB2, RB1, RB0, RB3, RB0, RB1, RB2, RB4); ROUND_INVERSE (20, 4, RA3, RA0, RA1, RA2, RA4, RA3, RA2, RA1, RA4, RA0, RB3, RB0, RB1, RB2, RB4, RB3, RB2, RB1, RB4, RB0); ROUND_INVERSE (19, 3, RA3, RA2, RA1, RA4, RA0, RA1, RA2, RA4, RA3, RA0, RB3, RB2, RB1, RB4, RB0, RB1, RB2, RB4, RB3, RB0); ROUND_INVERSE (18, 2, RA1, RA2, RA4, RA3, RA0, RA2, RA0, RA4, RA3, RA1, RB1, RB2, RB4, RB3, RB0, RB2, RB0, RB4, RB3, RB1); ROUND_INVERSE (17, 1, RA2, RA0, RA4, RA3, RA1, RA1, RA2, RA3, RA4, RA0, RB2, RB0, RB4, RB3, RB1, RB1, RB2, RB3, RB4, RB0); ROUND_INVERSE (16, 0, RA1, RA2, RA3, RA4, RA0, RA1, RA0, RA2, RA4, RA3, RB1, RB2, RB3, RB4, RB0, RB1, RB0, 
RB2, RB4, RB3); ROUND_INVERSE (15, 7, RA1, RA0, RA2, RA4, RA3, RA4, RA1, RA0, RA3, RA2, RB1, RB0, RB2, RB4, RB3, RB4, RB1, RB0, RB3, RB2); ROUND_INVERSE (14, 6, RA4, RA1, RA0, RA3, RA2, RA1, RA0, RA2, RA3, RA4, RB4, RB1, RB0, RB3, RB2, RB1, RB0, RB2, RB3, RB4); ROUND_INVERSE (13, 5, RA1, RA0, RA2, RA3, RA4, RA0, RA4, RA3, RA2, RA1, RB1, RB0, RB2, RB3, RB4, RB0, RB4, RB3, RB2, RB1); ROUND_INVERSE (12, 4, RA0, RA4, RA3, RA2, RA1, RA0, RA2, RA3, RA1, RA4, RB0, RB4, RB3, RB2, RB1, RB0, RB2, RB3, RB1, RB4); ROUND_INVERSE (11, 3, RA0, RA2, RA3, RA1, RA4, RA3, RA2, RA1, RA0, RA4, RB0, RB2, RB3, RB1, RB4, RB3, RB2, RB1, RB0, RB4); ROUND_INVERSE (10, 2, RA3, RA2, RA1, RA0, RA4, RA2, RA4, RA1, RA0, RA3, RB3, RB2, RB1, RB0, RB4, RB2, RB4, RB1, RB0, RB3); ROUND_INVERSE (9, 1, RA2, RA4, RA1, RA0, RA3, RA3, RA2, RA0, RA1, RA4, RB2, RB4, RB1, RB0, RB3, RB3, RB2, RB0, RB1, RB4); ROUND_INVERSE (8, 0, RA3, RA2, RA0, RA1, RA4, RA3, RA4, RA2, RA1, RA0, RB3, RB2, RB0, RB1, RB4, RB3, RB4, RB2, RB1, RB0); ROUND_INVERSE (7, 7, RA3, RA4, RA2, RA1, RA0, RA1, RA3, RA4, RA0, RA2, RB3, RB4, RB2, RB1, RB0, RB1, RB3, RB4, RB0, RB2); ROUND_INVERSE (6, 6, RA1, RA3, RA4, RA0, RA2, RA3, RA4, RA2, RA0, RA1, RB1, RB3, RB4, RB0, RB2, RB3, RB4, RB2, RB0, RB1); ROUND_INVERSE (5, 5, RA3, RA4, RA2, RA0, RA1, RA4, RA1, RA0, RA2, RA3, RB3, RB4, RB2, RB0, RB1, RB4, RB1, RB0, RB2, RB3); ROUND_INVERSE (4, 4, RA4, RA1, RA0, RA2, RA3, RA4, RA2, RA0, RA3, RA1, RB4, RB1, RB0, RB2, RB3, RB4, RB2, RB0, RB3, RB1); ROUND_INVERSE (3, 3, RA4, RA2, RA0, RA3, RA1, RA0, RA2, RA3, RA4, RA1, RB4, RB2, RB0, RB3, RB1, RB0, RB2, RB3, RB4, RB1); ROUND_INVERSE (2, 2, RA0, RA2, RA3, RA4, RA1, RA2, RA1, RA3, RA4, RA0, RB0, RB2, RB3, RB4, RB1, RB2, RB1, RB3, RB4, RB0); ROUND_INVERSE (1, 1, RA2, RA1, RA3, RA4, RA0, RA0, RA2, RA4, RA3, RA1, RB2, RB1, RB3, RB4, RB0, RB0, RB2, RB4, RB3, RB1); ROUND_INVERSE (0, 0, RA0, RA2, RA4, RA3, RA1, RA0, RA1, RA2, RA3, RA4, RB0, RB2, RB4, RB3, RB1, RB0, RB1, RB2, RB3, RB4); transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1); transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); ret; + CFI_ENDPROC(); ELF(.size __serpent_dec_blk16,.-__serpent_dec_blk16;) #define inc_le128(x, minus_one, tmp) \ vpcmpeqq minus_one, x, tmp; \ vpsubq minus_one, x, x; \ vpslldq $8, tmp, tmp; \ vpsubq tmp, x, x; .align 8 .globl _gcry_serpent_avx2_ctr_enc ELF(.type _gcry_serpent_avx2_ctr_enc,@function;) _gcry_serpent_avx2_ctr_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: iv (big endian, 128bit) */ + CFI_STARTPROC(); movq 8(%rcx), %rax; bswapq %rax; vzeroupper; - vbroadcasti128 .Lbswap128_mask RIP, RTMP3; + vbroadcasti128 .Lbswap128_mask rRIP, RTMP3; vpcmpeqd RNOT, RNOT, RNOT; vpsrldq $8, RNOT, RNOT; /* ab: -1:0 ; cd: -1:0 */ vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */ /* load IV and byteswap */ vmovdqu (%rcx), RTMP4x; vpshufb RTMP3x, RTMP4x, RTMP4x; vmovdqa RTMP4x, RTMP0x; inc_le128(RTMP4x, RNOTx, RTMP1x); vinserti128 $1, RTMP4x, RTMP0, RTMP0; vpshufb RTMP3, RTMP0, RA0; /* +1 ; +0 */ /* check need for handling 64-bit overflow and carry */ cmpq $(0xffffffffffffffff - 16), %rax; ja .Lhandle_ctr_carry; /* construct IVs */ vpsubq RTMP2, RTMP0, RTMP0; /* +3 ; +2 */ vpshufb RTMP3, RTMP0, RA1; vpsubq RTMP2, RTMP0, RTMP0; /* +5 ; +4 */ vpshufb RTMP3, RTMP0, RA2; vpsubq RTMP2, RTMP0, RTMP0; /* +7 ; +6 */ vpshufb RTMP3, RTMP0, RA3; vpsubq RTMP2, RTMP0, RTMP0; /* +9 ; +8 */ vpshufb RTMP3, RTMP0, RB0; vpsubq RTMP2, RTMP0, RTMP0; /* +11 ; +10 */ vpshufb RTMP3, RTMP0, RB1; vpsubq RTMP2, RTMP0, RTMP0; /* +13 ; +12 
*/ vpshufb RTMP3, RTMP0, RB2; vpsubq RTMP2, RTMP0, RTMP0; /* +15 ; +14 */ vpshufb RTMP3, RTMP0, RB3; vpsubq RTMP2, RTMP0, RTMP0; /* +16 */ vpshufb RTMP3x, RTMP0x, RTMP0x; jmp .Lctr_carry_done; .Lhandle_ctr_carry: /* construct IVs */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RA1; /* +3 ; +2 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RA2; /* +5 ; +4 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RA3; /* +7 ; +6 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RB0; /* +9 ; +8 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RB1; /* +11 ; +10 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RB2; /* +13 ; +12 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RB3; /* +15 ; +14 */ inc_le128(RTMP0, RNOT, RTMP1); vextracti128 $1, RTMP0, RTMP0x; vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */ .align 4 .Lctr_carry_done: /* store new IV */ vmovdqu RTMP0x, (%rcx); call __serpent_enc_blk16; vpxor (0 * 32)(%rdx), RA4, RA4; vpxor (1 * 32)(%rdx), RA1, RA1; vpxor (2 * 32)(%rdx), RA2, RA2; vpxor (3 * 32)(%rdx), RA0, RA0; vpxor (4 * 32)(%rdx), RB4, RB4; vpxor (5 * 32)(%rdx), RB1, RB1; vpxor (6 * 32)(%rdx), RB2, RB2; vpxor (7 * 32)(%rdx), RB0, RB0; vmovdqu RA4, (0 * 32)(%rsi); vmovdqu RA1, (1 * 32)(%rsi); vmovdqu RA2, (2 * 32)(%rsi); vmovdqu RA0, (3 * 32)(%rsi); vmovdqu RB4, (4 * 32)(%rsi); vmovdqu RB1, (5 * 32)(%rsi); vmovdqu RB2, (6 * 32)(%rsi); vmovdqu RB0, (7 * 32)(%rsi); vzeroall; - ret + ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ctr_enc,.-_gcry_serpent_avx2_ctr_enc;) .align 8 .globl _gcry_serpent_avx2_cbc_dec ELF(.type _gcry_serpent_avx2_cbc_dec,@function;) _gcry_serpent_avx2_cbc_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: iv */ + CFI_STARTPROC(); vzeroupper; vmovdqu (0 * 32)(%rdx), RA0; vmovdqu (1 * 32)(%rdx), RA1; vmovdqu (2 * 32)(%rdx), RA2; vmovdqu (3 * 32)(%rdx), RA3; vmovdqu (4 * 32)(%rdx), RB0; vmovdqu (5 * 32)(%rdx), RB1; vmovdqu (6 * 32)(%rdx), RB2; vmovdqu (7 * 32)(%rdx), RB3; call __serpent_dec_blk16; vmovdqu (%rcx), RNOTx; vinserti128 $1, (%rdx), RNOT, RNOT; vpxor RNOT, RA0, RA0; vpxor (0 * 32 + 16)(%rdx), RA1, RA1; vpxor (1 * 32 + 16)(%rdx), RA2, RA2; vpxor (2 * 32 + 16)(%rdx), RA3, RA3; vpxor (3 * 32 + 16)(%rdx), RB0, RB0; vpxor (4 * 32 + 16)(%rdx), RB1, RB1; vpxor (5 * 32 + 16)(%rdx), RB2, RB2; vpxor (6 * 32 + 16)(%rdx), RB3, RB3; vmovdqu (7 * 32 + 16)(%rdx), RNOTx; vmovdqu RNOTx, (%rcx); /* store new IV */ vmovdqu RA0, (0 * 32)(%rsi); vmovdqu RA1, (1 * 32)(%rsi); vmovdqu RA2, (2 * 32)(%rsi); vmovdqu RA3, (3 * 32)(%rsi); vmovdqu RB0, (4 * 32)(%rsi); vmovdqu RB1, (5 * 32)(%rsi); vmovdqu RB2, (6 * 32)(%rsi); vmovdqu RB3, (7 * 32)(%rsi); vzeroall; - ret + ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_cbc_dec,.-_gcry_serpent_avx2_cbc_dec;) .align 8 .globl _gcry_serpent_avx2_cfb_dec ELF(.type _gcry_serpent_avx2_cfb_dec,@function;) _gcry_serpent_avx2_cfb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: iv */ + CFI_STARTPROC(); vzeroupper; /* Load input */ vmovdqu (%rcx), RNOTx; vinserti128 $1, (%rdx), RNOT, RA0; vmovdqu (0 * 32 + 16)(%rdx), RA1; vmovdqu (1 * 32 + 16)(%rdx), RA2; vmovdqu (2 * 32 + 16)(%rdx), RA3; vmovdqu (3 * 32 + 16)(%rdx), RB0; vmovdqu (4 * 32 + 16)(%rdx), RB1; vmovdqu (5 * 32 + 16)(%rdx), RB2; vmovdqu (6 * 32 + 
16)(%rdx), RB3; /* Update IV */ vmovdqu (7 * 32 + 16)(%rdx), RNOTx; vmovdqu RNOTx, (%rcx); call __serpent_enc_blk16; vpxor (0 * 32)(%rdx), RA4, RA4; vpxor (1 * 32)(%rdx), RA1, RA1; vpxor (2 * 32)(%rdx), RA2, RA2; vpxor (3 * 32)(%rdx), RA0, RA0; vpxor (4 * 32)(%rdx), RB4, RB4; vpxor (5 * 32)(%rdx), RB1, RB1; vpxor (6 * 32)(%rdx), RB2, RB2; vpxor (7 * 32)(%rdx), RB0, RB0; vmovdqu RA4, (0 * 32)(%rsi); vmovdqu RA1, (1 * 32)(%rsi); vmovdqu RA2, (2 * 32)(%rsi); vmovdqu RA0, (3 * 32)(%rsi); vmovdqu RB4, (4 * 32)(%rsi); vmovdqu RB1, (5 * 32)(%rsi); vmovdqu RB2, (6 * 32)(%rsi); vmovdqu RB0, (7 * 32)(%rsi); vzeroall; - ret + ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_cfb_dec,.-_gcry_serpent_avx2_cfb_dec;) .align 8 .globl _gcry_serpent_avx2_ocb_enc ELF(.type _gcry_serpent_avx2_ocb_enc,@function;) _gcry_serpent_avx2_ocb_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[16]) */ + CFI_STARTPROC(); vzeroupper; subq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); + CFI_REL_OFFSET(%r10, 0 * 8); + CFI_REL_OFFSET(%r11, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rcx), RTMP0x; vmovdqu (%r8), RTMP1x; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Checksum_i = Checksum_{i-1} xor P_i */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ #define OCB_INPUT(n, l0reg, l1reg, yreg) \ vmovdqu (n * 32)(%rdx), yreg; \ vpxor (l0reg), RTMP0x, RNOTx; \ vpxor (l1reg), RNOTx, RTMP0x; \ vinserti128 $1, RTMP0x, RNOT, RNOT; \ vpxor yreg, RTMP1, RTMP1; \ vpxor yreg, RNOT, yreg; \ vmovdqu RNOT, (n * 32)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq (2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, %r11, RA0); OCB_INPUT(1, %r12, %r13, RA1); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(2, %r10, %r11, RA2); OCB_INPUT(3, %r12, %r13, RA3); movq (8 * 8)(%r9), %r10; movq (9 * 8)(%r9), %r11; movq (10 * 8)(%r9), %r12; movq (11 * 8)(%r9), %r13; OCB_INPUT(4, %r10, %r11, RB0); OCB_INPUT(5, %r12, %r13, RB1); movq (12 * 8)(%r9), %r10; movq (13 * 8)(%r9), %r11; movq (14 * 8)(%r9), %r12; movq (15 * 8)(%r9), %r13; OCB_INPUT(6, %r10, %r11, RB2); OCB_INPUT(7, %r12, %r13, RB3); #undef OCB_INPUT vextracti128 $1, RTMP1, RNOTx; vmovdqu RTMP0x, (%rcx); vpxor RNOTx, RTMP1x, RTMP1x; vmovdqu RTMP1x, (%r8); movq (0 * 8)(%rsp), %r10; movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); call __serpent_enc_blk16; addq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-4 * 8); vpxor (0 * 32)(%rsi), RA4, RA4; vpxor (1 * 32)(%rsi), RA1, RA1; vpxor (2 * 32)(%rsi), RA2, RA2; vpxor (3 * 32)(%rsi), RA0, RA0; vpxor (4 * 32)(%rsi), RB4, RB4; vpxor (5 * 32)(%rsi), RB1, RB1; vpxor (6 * 32)(%rsi), RB2, RB2; vpxor (7 * 32)(%rsi), RB0, RB0; vmovdqu RA4, (0 * 32)(%rsi); vmovdqu RA1, (1 * 32)(%rsi); vmovdqu RA2, (2 * 32)(%rsi); vmovdqu RA0, (3 * 32)(%rsi); vmovdqu RB4, (4 * 32)(%rsi); vmovdqu RB1, (5 * 32)(%rsi); vmovdqu RB2, (6 * 32)(%rsi); vmovdqu RB0, (7 * 32)(%rsi); vzeroall; ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ocb_enc,.-_gcry_serpent_avx2_ocb_enc;) .align 8 .globl _gcry_serpent_avx2_ocb_dec ELF(.type _gcry_serpent_avx2_ocb_dec,@function;) _gcry_serpent_avx2_ocb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src 
(16 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[16]) */ + CFI_STARTPROC(); vzeroupper; subq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); + CFI_REL_OFFSET(%r10, 0 * 8); + CFI_REL_OFFSET(%r11, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rcx), RTMP0x; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ #define OCB_INPUT(n, l0reg, l1reg, yreg) \ vmovdqu (n * 32)(%rdx), yreg; \ vpxor (l0reg), RTMP0x, RNOTx; \ vpxor (l1reg), RNOTx, RTMP0x; \ vinserti128 $1, RTMP0x, RNOT, RNOT; \ vpxor yreg, RNOT, yreg; \ vmovdqu RNOT, (n * 32)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq (2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, %r11, RA0); OCB_INPUT(1, %r12, %r13, RA1); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(2, %r10, %r11, RA2); OCB_INPUT(3, %r12, %r13, RA3); movq (8 * 8)(%r9), %r10; movq (9 * 8)(%r9), %r11; movq (10 * 8)(%r9), %r12; movq (11 * 8)(%r9), %r13; OCB_INPUT(4, %r10, %r11, RB0); OCB_INPUT(5, %r12, %r13, RB1); movq (12 * 8)(%r9), %r10; movq (13 * 8)(%r9), %r11; movq (14 * 8)(%r9), %r12; movq (15 * 8)(%r9), %r13; OCB_INPUT(6, %r10, %r11, RB2); OCB_INPUT(7, %r12, %r13, RB3); #undef OCB_INPUT vmovdqu RTMP0x, (%rcx); movq (0 * 8)(%rsp), %r10; movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); call __serpent_dec_blk16; addq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-4 * 8); vmovdqu (%r8), RTMP1x; vpxor (0 * 32)(%rsi), RA0, RA0; vpxor (1 * 32)(%rsi), RA1, RA1; vpxor (2 * 32)(%rsi), RA2, RA2; vpxor (3 * 32)(%rsi), RA3, RA3; vpxor (4 * 32)(%rsi), RB0, RB0; vpxor (5 * 32)(%rsi), RB1, RB1; vpxor (6 * 32)(%rsi), RB2, RB2; vpxor (7 * 32)(%rsi), RB3, RB3; /* Checksum_i = Checksum_{i-1} xor P_i */ vmovdqu RA0, (0 * 32)(%rsi); vpxor RA0, RTMP1, RTMP1; vmovdqu RA1, (1 * 32)(%rsi); vpxor RA1, RTMP1, RTMP1; vmovdqu RA2, (2 * 32)(%rsi); vpxor RA2, RTMP1, RTMP1; vmovdqu RA3, (3 * 32)(%rsi); vpxor RA3, RTMP1, RTMP1; vmovdqu RB0, (4 * 32)(%rsi); vpxor RB0, RTMP1, RTMP1; vmovdqu RB1, (5 * 32)(%rsi); vpxor RB1, RTMP1, RTMP1; vmovdqu RB2, (6 * 32)(%rsi); vpxor RB2, RTMP1, RTMP1; vmovdqu RB3, (7 * 32)(%rsi); vpxor RB3, RTMP1, RTMP1; vextracti128 $1, RTMP1, RNOTx; vpxor RNOTx, RTMP1x, RTMP1x; vmovdqu RTMP1x, (%r8); vzeroall; ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ocb_dec,.-_gcry_serpent_avx2_ocb_dec;) .align 8 .globl _gcry_serpent_avx2_ocb_auth ELF(.type _gcry_serpent_avx2_ocb_auth,@function;) _gcry_serpent_avx2_ocb_auth: /* input: * %rdi: ctx, CTX * %rsi: abuf (16 blocks) * %rdx: offset * %rcx: checksum * %r8 : L pointers (void *L[16]) */ + CFI_STARTPROC(); vzeroupper; subq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); + CFI_REL_OFFSET(%r10, 0 * 8); + CFI_REL_OFFSET(%r11, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rdx), RTMP0x; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ #define OCB_INPUT(n, l0reg, l1reg, yreg) \ vmovdqu (n * 32)(%rsi), yreg; \ vpxor (l0reg), RTMP0x, RNOTx; \ vpxor (l1reg), RNOTx, RTMP0x; \ vinserti128 $1, RTMP0x, RNOT, RNOT; \ vpxor yreg, RNOT, yreg; movq (0 * 8)(%r8), 
%r10; movq (1 * 8)(%r8), %r11; movq (2 * 8)(%r8), %r12; movq (3 * 8)(%r8), %r13; OCB_INPUT(0, %r10, %r11, RA0); OCB_INPUT(1, %r12, %r13, RA1); movq (4 * 8)(%r8), %r10; movq (5 * 8)(%r8), %r11; movq (6 * 8)(%r8), %r12; movq (7 * 8)(%r8), %r13; OCB_INPUT(2, %r10, %r11, RA2); OCB_INPUT(3, %r12, %r13, RA3); movq (8 * 8)(%r8), %r10; movq (9 * 8)(%r8), %r11; movq (10 * 8)(%r8), %r12; movq (11 * 8)(%r8), %r13; OCB_INPUT(4, %r10, %r11, RB0); OCB_INPUT(5, %r12, %r13, RB1); movq (12 * 8)(%r8), %r10; movq (13 * 8)(%r8), %r11; movq (14 * 8)(%r8), %r12; movq (15 * 8)(%r8), %r13; OCB_INPUT(6, %r10, %r11, RB2); OCB_INPUT(7, %r12, %r13, RB3); #undef OCB_INPUT vmovdqu RTMP0x, (%rdx); movq (0 * 8)(%rsp), %r10; movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); call __serpent_enc_blk16; addq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-4 * 8); vpxor RA4, RB4, RA4; vpxor RA1, RB1, RA1; vpxor RA2, RB2, RA2; vpxor RA0, RB0, RA0; vpxor RA4, RA1, RA1; vpxor RA2, RA0, RA0; vpxor RA1, RA0, RTMP1; vextracti128 $1, RTMP1, RNOTx; vpxor (%rcx), RTMP1x, RTMP1x; vpxor RNOTx, RTMP1x, RTMP1x; vmovdqu RTMP1x, (%rcx); vzeroall; ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ocb_auth,.-_gcry_serpent_avx2_ocb_auth;) .align 16 /* For CTR-mode IV byteswap */ .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 #endif /*defined(USE_SERPENT) && defined(ENABLE_AVX2_SUPPORT)*/ #endif /*__x86_64*/ diff --git a/cipher/serpent-sse2-amd64.S b/cipher/serpent-sse2-amd64.S index b149af24..39cba002 100644 --- a/cipher/serpent-sse2-amd64.S +++ b/cipher/serpent-sse2-amd64.S @@ -1,1175 +1,1211 @@ /* serpent-sse2-amd64.S - SSE2 implementation of Serpent cipher * * Copyright (C) 2013-2015 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_SERPENT) -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif - -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif +#include "asm-common-amd64.h" /* struct serpent_context: */ #define ctx_keys 0 /* register macros */ #define CTX %rdi /* vector registers */ #define RA0 %xmm0 #define RA1 %xmm1 #define RA2 %xmm2 #define RA3 %xmm3 #define RA4 %xmm4 #define RB0 %xmm5 #define RB1 %xmm6 #define RB2 %xmm7 #define RB3 %xmm8 #define RB4 %xmm9 #define RNOT %xmm10 #define RTMP0 %xmm11 #define RTMP1 %xmm12 #define RTMP2 %xmm13 /********************************************************************** helper macros **********************************************************************/ /* vector 32-bit rotation to left */ #define vec_rol(reg, nleft, tmp) \ movdqa reg, tmp; \ pslld $(nleft), tmp; \ psrld $(32 - (nleft)), reg; \ por tmp, reg; /* vector 32-bit rotation to right */ #define vec_ror(reg, nright, tmp) \ vec_rol(reg, 32 - nright, tmp) /* 4x4 32-bit integer matrix transpose */ #define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \ movdqa x0, t2; \ punpckhdq x1, t2; \ punpckldq x1, x0; \ \ movdqa x2, t1; \ punpckldq x3, t1; \ punpckhdq x3, x2; \ \ movdqa x0, x1; \ punpckhqdq t1, x1; \ punpcklqdq t1, x0; \ \ movdqa t2, x3; \ punpckhqdq x2, x3; \ punpcklqdq x2, t2; \ movdqa t2, x2; /* fill xmm register with 32-bit value from memory */ #define pbroadcastd(mem32, xreg) \ movd mem32, xreg; \ pshufd $0, xreg, xreg; /* xor with unaligned memory operand */ #define pxor_u(umem128, xreg, t) \ movdqu umem128, t; \ pxor t, xreg; /* 128-bit wide byte swap */ #define pbswap(xreg, t0) \ /* reorder 32-bit words, [a,b,c,d] => [d,c,b,a] */ \ pshufd $0x1b, xreg, xreg; \ /* reorder high&low 16-bit words, [d0,d1,c0,c1] => [d1,d0,c1,c0] */ \ pshuflw $0xb1, xreg, xreg; \ pshufhw $0xb1, xreg, xreg; \ /* reorder bytes in 16-bit words */ \ movdqa xreg, t0; \ psrlw $8, t0; \ psllw $8, xreg; \ por t0, xreg; /********************************************************************** 8-way serpent **********************************************************************/ /* * These are the S-Boxes of Serpent from following research paper. * * D. A. Osvik, “Speeding up Serpent,” in Third AES Candidate Conference, * (New York, New York, USA), p. 317–329, National Institute of Standards and * Technology, 2000. 
* * Paper is also available at: http://www.ii.uib.no/~osvik/pub/aes3.pdf * */ #define SBOX0(r0, r1, r2, r3, r4) \ pxor r0, r3; movdqa r1, r4; \ pand r3, r1; pxor r2, r4; \ pxor r0, r1; por r3, r0; \ pxor r4, r0; pxor r3, r4; \ pxor r2, r3; por r1, r2; \ pxor r4, r2; pxor RNOT, r4; \ por r1, r4; pxor r3, r1; \ pxor r4, r1; por r0, r3; \ pxor r3, r1; pxor r3, r4; #define SBOX0_INVERSE(r0, r1, r2, r3, r4) \ pxor RNOT, r2; movdqa r1, r4; \ por r0, r1; pxor RNOT, r4; \ pxor r2, r1; por r4, r2; \ pxor r3, r1; pxor r4, r0; \ pxor r0, r2; pand r3, r0; \ pxor r0, r4; por r1, r0; \ pxor r2, r0; pxor r4, r3; \ pxor r1, r2; pxor r0, r3; \ pxor r1, r3; \ pand r3, r2; \ pxor r2, r4; #define SBOX1(r0, r1, r2, r3, r4) \ pxor RNOT, r0; pxor RNOT, r2; \ movdqa r0, r4; pand r1, r0; \ pxor r0, r2; por r3, r0; \ pxor r2, r3; pxor r0, r1; \ pxor r4, r0; por r1, r4; \ pxor r3, r1; por r0, r2; \ pand r4, r2; pxor r1, r0; \ pand r2, r1; \ pxor r0, r1; pand r2, r0; \ pxor r4, r0; #define SBOX1_INVERSE(r0, r1, r2, r3, r4) \ movdqa r1, r4; pxor r3, r1; \ pand r1, r3; pxor r2, r4; \ pxor r0, r3; por r1, r0; \ pxor r3, r2; pxor r4, r0; \ por r2, r0; pxor r3, r1; \ pxor r1, r0; por r3, r1; \ pxor r0, r1; pxor RNOT, r4; \ pxor r1, r4; por r0, r1; \ pxor r0, r1; \ por r4, r1; \ pxor r1, r3; #define SBOX2(r0, r1, r2, r3, r4) \ movdqa r0, r4; pand r2, r0; \ pxor r3, r0; pxor r1, r2; \ pxor r0, r2; por r4, r3; \ pxor r1, r3; pxor r2, r4; \ movdqa r3, r1; por r4, r3; \ pxor r0, r3; pand r1, r0; \ pxor r0, r4; pxor r3, r1; \ pxor r4, r1; pxor RNOT, r4; #define SBOX2_INVERSE(r0, r1, r2, r3, r4) \ pxor r3, r2; pxor r0, r3; \ movdqa r3, r4; pand r2, r3; \ pxor r1, r3; por r2, r1; \ pxor r4, r1; pand r3, r4; \ pxor r3, r2; pand r0, r4; \ pxor r2, r4; pand r1, r2; \ por r0, r2; pxor RNOT, r3; \ pxor r3, r2; pxor r3, r0; \ pand r1, r0; pxor r4, r3; \ pxor r0, r3; #define SBOX3(r0, r1, r2, r3, r4) \ movdqa r0, r4; por r3, r0; \ pxor r1, r3; pand r4, r1; \ pxor r2, r4; pxor r3, r2; \ pand r0, r3; por r1, r4; \ pxor r4, r3; pxor r1, r0; \ pand r0, r4; pxor r3, r1; \ pxor r2, r4; por r0, r1; \ pxor r2, r1; pxor r3, r0; \ movdqa r1, r2; por r3, r1; \ pxor r0, r1; #define SBOX3_INVERSE(r0, r1, r2, r3, r4) \ movdqa r2, r4; pxor r1, r2; \ pxor r2, r0; pand r2, r4; \ pxor r0, r4; pand r1, r0; \ pxor r3, r1; por r4, r3; \ pxor r3, r2; pxor r3, r0; \ pxor r4, r1; pand r2, r3; \ pxor r1, r3; pxor r0, r1; \ por r2, r1; pxor r3, r0; \ pxor r4, r1; \ pxor r1, r0; #define SBOX4(r0, r1, r2, r3, r4) \ pxor r3, r1; pxor RNOT, r3; \ pxor r3, r2; pxor r0, r3; \ movdqa r1, r4; pand r3, r1; \ pxor r2, r1; pxor r3, r4; \ pxor r4, r0; pand r4, r2; \ pxor r0, r2; pand r1, r0; \ pxor r0, r3; por r1, r4; \ pxor r0, r4; por r3, r0; \ pxor r2, r0; pand r3, r2; \ pxor RNOT, r0; pxor r2, r4; #define SBOX4_INVERSE(r0, r1, r2, r3, r4) \ movdqa r2, r4; pand r3, r2; \ pxor r1, r2; por r3, r1; \ pand r0, r1; pxor r2, r4; \ pxor r1, r4; pand r2, r1; \ pxor RNOT, r0; pxor r4, r3; \ pxor r3, r1; pand r0, r3; \ pxor r2, r3; pxor r1, r0; \ pand r0, r2; pxor r0, r3; \ pxor r4, r2; \ por r3, r2; pxor r0, r3; \ pxor r1, r2; #define SBOX5(r0, r1, r2, r3, r4) \ pxor r1, r0; pxor r3, r1; \ pxor RNOT, r3; movdqa r1, r4; \ pand r0, r1; pxor r3, r2; \ pxor r2, r1; por r4, r2; \ pxor r3, r4; pand r1, r3; \ pxor r0, r3; pxor r1, r4; \ pxor r2, r4; pxor r0, r2; \ pand r3, r0; pxor RNOT, r2; \ pxor r4, r0; por r3, r4; \ pxor r4, r2; #define SBOX5_INVERSE(r0, r1, r2, r3, r4) \ pxor RNOT, r1; movdqa r3, r4; \ pxor r1, r2; por r0, r3; \ pxor r2, r3; por r1, r2; \ pand r0, r2; pxor r3, r4; 
\ pxor r4, r2; por r0, r4; \ pxor r1, r4; pand r2, r1; \ pxor r3, r1; pxor r2, r4; \ pand r4, r3; pxor r1, r4; \ pxor r4, r3; pxor RNOT, r4; \ pxor r0, r3; #define SBOX6(r0, r1, r2, r3, r4) \ pxor RNOT, r2; movdqa r3, r4; \ pand r0, r3; pxor r4, r0; \ pxor r2, r3; por r4, r2; \ pxor r3, r1; pxor r0, r2; \ por r1, r0; pxor r1, r2; \ pxor r0, r4; por r3, r0; \ pxor r2, r0; pxor r3, r4; \ pxor r0, r4; pxor RNOT, r3; \ pand r4, r2; \ pxor r3, r2; #define SBOX6_INVERSE(r0, r1, r2, r3, r4) \ pxor r2, r0; movdqa r2, r4; \ pand r0, r2; pxor r3, r4; \ pxor RNOT, r2; pxor r1, r3; \ pxor r3, r2; por r0, r4; \ pxor r2, r0; pxor r4, r3; \ pxor r1, r4; pand r3, r1; \ pxor r0, r1; pxor r3, r0; \ por r2, r0; pxor r1, r3; \ pxor r0, r4; #define SBOX7(r0, r1, r2, r3, r4) \ movdqa r1, r4; por r2, r1; \ pxor r3, r1; pxor r2, r4; \ pxor r1, r2; por r4, r3; \ pand r0, r3; pxor r2, r4; \ pxor r1, r3; por r4, r1; \ pxor r0, r1; por r4, r0; \ pxor r2, r0; pxor r4, r1; \ pxor r1, r2; pand r0, r1; \ pxor r4, r1; pxor RNOT, r2; \ por r0, r2; \ pxor r2, r4; #define SBOX7_INVERSE(r0, r1, r2, r3, r4) \ movdqa r2, r4; pxor r0, r2; \ pand r3, r0; por r3, r4; \ pxor RNOT, r2; pxor r1, r3; \ por r0, r1; pxor r2, r0; \ pand r4, r2; pand r4, r3; \ pxor r2, r1; pxor r0, r2; \ por r2, r0; pxor r1, r4; \ pxor r3, r0; pxor r4, r3; \ por r0, r4; pxor r2, r3; \ pxor r2, r4; /* Apply SBOX number WHICH to to the block. */ #define SBOX(which, r0, r1, r2, r3, r4) \ SBOX##which (r0, r1, r2, r3, r4) /* Apply inverse SBOX number WHICH to to the block. */ #define SBOX_INVERSE(which, r0, r1, r2, r3, r4) \ SBOX##which##_INVERSE (r0, r1, r2, r3, r4) /* XOR round key into block state in r0,r1,r2,r3. r4 used as temporary. */ #define BLOCK_XOR_KEY(r0, r1, r2, r3, r4, round) \ pbroadcastd ((ctx_keys + (round) * 16 + 0 * 4)(CTX), r4); \ pxor r4, r0; \ pbroadcastd ((ctx_keys + (round) * 16 + 1 * 4)(CTX), r4); \ pxor r4, r1; \ pbroadcastd ((ctx_keys + (round) * 16 + 2 * 4)(CTX), r4); \ pxor r4, r2; \ pbroadcastd ((ctx_keys + (round) * 16 + 3 * 4)(CTX), r4); \ pxor r4, r3; /* Apply the linear transformation to BLOCK. */ #define LINEAR_TRANSFORMATION(r0, r1, r2, r3, r4) \ vec_rol(r0, 13, r4); \ vec_rol(r2, 3, r4); \ pxor r0, r1; \ pxor r2, r1; \ movdqa r0, r4; \ pslld $3, r4; \ pxor r2, r3; \ pxor r4, r3; \ vec_rol(r1, 1, r4); \ vec_rol(r3, 7, r4); \ pxor r1, r0; \ pxor r3, r0; \ movdqa r1, r4; \ pslld $7, r4; \ pxor r3, r2; \ pxor r4, r2; \ vec_rol(r0, 5, r4); \ vec_rol(r2, 22, r4); /* Apply the inverse linear transformation to BLOCK. */ #define LINEAR_TRANSFORMATION_INVERSE(r0, r1, r2, r3, r4) \ vec_ror(r2, 22, r4); \ vec_ror(r0, 5, r4); \ movdqa r1, r4; \ pslld $7, r4; \ pxor r3, r2; \ pxor r4, r2; \ pxor r1, r0; \ pxor r3, r0; \ vec_ror(r3, 7, r4); \ vec_ror(r1, 1, r4); \ movdqa r0, r4; \ pslld $3, r4; \ pxor r2, r3; \ pxor r4, r3; \ pxor r0, r1; \ pxor r2, r1; \ vec_ror(r2, 3, r4); \ vec_ror(r0, 13, r4); /* Apply a Serpent round to eight parallel blocks. This macro increments `round'. */ #define ROUND(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ SBOX (which, a0, a1, a2, a3, a4); \ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ SBOX (which, b0, b1, b2, b3, b4); \ LINEAR_TRANSFORMATION (na0, na1, na2, na3, na4); \ LINEAR_TRANSFORMATION (nb0, nb1, nb2, nb3, nb4); /* Apply the last Serpent round to eight parallel blocks. This macro increments `round'. 
*/ #define ROUND_LAST(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ SBOX (which, a0, a1, a2, a3, a4); \ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ SBOX (which, b0, b1, b2, b3, b4); \ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, ((round) + 1)); \ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, ((round) + 1)); /* Apply an inverse Serpent round to eight parallel blocks. This macro increments `round'. */ #define ROUND_INVERSE(round, which, a0, a1, a2, a3, a4, \ na0, na1, na2, na3, na4, \ b0, b1, b2, b3, b4, \ nb0, nb1, nb2, nb3, nb4) \ LINEAR_TRANSFORMATION_INVERSE (a0, a1, a2, a3, a4); \ LINEAR_TRANSFORMATION_INVERSE (b0, b1, b2, b3, b4); \ SBOX_INVERSE (which, a0, a1, a2, a3, a4); \ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round); \ SBOX_INVERSE (which, b0, b1, b2, b3, b4); \ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round); /* Apply the first inverse Serpent round to eight parallel blocks. This macro increments `round'. */ #define ROUND_FIRST_INVERSE(round, which, a0, a1, a2, a3, a4, \ na0, na1, na2, na3, na4, \ b0, b1, b2, b3, b4, \ nb0, nb1, nb2, nb3, nb4) \ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, ((round) + 1)); \ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, ((round) + 1)); \ SBOX_INVERSE (which, a0, a1, a2, a3, a4); \ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round); \ SBOX_INVERSE (which, b0, b1, b2, b3, b4); \ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round); .text .align 8 ELF(.type __serpent_enc_blk8,@function;) __serpent_enc_blk8: /* input: * %rdi: ctx, CTX * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext * blocks * output: * RA4, RA1, RA2, RA0, RB4, RB1, RB2, RB0: eight parallel * ciphertext blocks */ + CFI_STARTPROC(); pcmpeqd RNOT, RNOT; transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1); transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); ROUND (0, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3, RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3); ROUND (1, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3, RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3); ROUND (2, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2, RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2); ROUND (3, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0, RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0); ROUND (4, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3, RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3); ROUND (5, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3, RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3); ROUND (6, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4, RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4); ROUND (7, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3, RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3); ROUND (8, 0, RA4, RA1, RA2, RA0, RA3, RA1, RA3, RA2, RA4, RA0, RB4, RB1, RB2, RB0, RB3, RB1, RB3, RB2, RB4, RB0); ROUND (9, 1, RA1, RA3, RA2, RA4, RA0, RA2, RA1, RA4, RA3, RA0, RB1, RB3, RB2, RB4, RB0, RB2, RB1, RB4, RB3, RB0); ROUND (10, 2, RA2, RA1, RA4, RA3, RA0, RA4, RA3, RA1, RA0, RA2, RB2, RB1, RB4, RB3, RB0, RB4, RB3, RB1, RB0, RB2); ROUND (11, 3, RA4, RA3, RA1, RA0, RA2, RA3, RA1, RA0, RA2, RA4, RB4, RB3, RB1, RB0, RB2, RB3, RB1, RB0, RB2, RB4); ROUND (12, 4, RA3, RA1, RA0, RA2, RA4, RA1, RA4, RA3, RA2, RA0, RB3, RB1, RB0, RB2, RB4, RB1, RB4, RB3, RB2, RB0); ROUND (13, 5, RA1, RA4, RA3, RA2, RA0, RA4, RA2, RA1, RA3, RA0, RB1, RB4, RB3, RB2, RB0, RB4, RB2, RB1, RB3, RB0); ROUND (14, 6, RA4, RA2, RA1, RA3, RA0, RA4, RA2, RA0, RA1, RA3, RB4, RB2, 
RB1, RB3, RB0, RB4, RB2, RB0, RB1, RB3); ROUND (15, 7, RA4, RA2, RA0, RA1, RA3, RA3, RA1, RA2, RA4, RA0, RB4, RB2, RB0, RB1, RB3, RB3, RB1, RB2, RB4, RB0); ROUND (16, 0, RA3, RA1, RA2, RA4, RA0, RA1, RA0, RA2, RA3, RA4, RB3, RB1, RB2, RB4, RB0, RB1, RB0, RB2, RB3, RB4); ROUND (17, 1, RA1, RA0, RA2, RA3, RA4, RA2, RA1, RA3, RA0, RA4, RB1, RB0, RB2, RB3, RB4, RB2, RB1, RB3, RB0, RB4); ROUND (18, 2, RA2, RA1, RA3, RA0, RA4, RA3, RA0, RA1, RA4, RA2, RB2, RB1, RB3, RB0, RB4, RB3, RB0, RB1, RB4, RB2); ROUND (19, 3, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA4, RA2, RA3, RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB4, RB2, RB3); ROUND (20, 4, RA0, RA1, RA4, RA2, RA3, RA1, RA3, RA0, RA2, RA4, RB0, RB1, RB4, RB2, RB3, RB1, RB3, RB0, RB2, RB4); ROUND (21, 5, RA1, RA3, RA0, RA2, RA4, RA3, RA2, RA1, RA0, RA4, RB1, RB3, RB0, RB2, RB4, RB3, RB2, RB1, RB0, RB4); ROUND (22, 6, RA3, RA2, RA1, RA0, RA4, RA3, RA2, RA4, RA1, RA0, RB3, RB2, RB1, RB0, RB4, RB3, RB2, RB4, RB1, RB0); ROUND (23, 7, RA3, RA2, RA4, RA1, RA0, RA0, RA1, RA2, RA3, RA4, RB3, RB2, RB4, RB1, RB0, RB0, RB1, RB2, RB3, RB4); ROUND (24, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3, RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3); ROUND (25, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3, RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3); ROUND (26, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2, RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2); ROUND (27, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0, RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0); ROUND (28, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3, RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3); ROUND (29, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3, RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3); ROUND (30, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4, RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4); ROUND_LAST (31, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3, RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3); transpose_4x4(RA4, RA1, RA2, RA0, RA3, RTMP0, RTMP1); transpose_4x4(RB4, RB1, RB2, RB0, RB3, RTMP0, RTMP1); ret; + CFI_ENDPROC(); ELF(.size __serpent_enc_blk8,.-__serpent_enc_blk8;) .align 8 ELF(.type __serpent_dec_blk8,@function;) __serpent_dec_blk8: /* input: * %rdi: ctx, CTX * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel * ciphertext blocks * output: * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext * blocks */ + CFI_STARTPROC(); pcmpeqd RNOT, RNOT; transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1); transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); ROUND_FIRST_INVERSE (31, 7, RA0, RA1, RA2, RA3, RA4, RA3, RA0, RA1, RA4, RA2, RB0, RB1, RB2, RB3, RB4, RB3, RB0, RB1, RB4, RB2); ROUND_INVERSE (30, 6, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA2, RA4, RA3, RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB2, RB4, RB3); ROUND_INVERSE (29, 5, RA0, RA1, RA2, RA4, RA3, RA1, RA3, RA4, RA2, RA0, RB0, RB1, RB2, RB4, RB3, RB1, RB3, RB4, RB2, RB0); ROUND_INVERSE (28, 4, RA1, RA3, RA4, RA2, RA0, RA1, RA2, RA4, RA0, RA3, RB1, RB3, RB4, RB2, RB0, RB1, RB2, RB4, RB0, RB3); ROUND_INVERSE (27, 3, RA1, RA2, RA4, RA0, RA3, RA4, RA2, RA0, RA1, RA3, RB1, RB2, RB4, RB0, RB3, RB4, RB2, RB0, RB1, RB3); ROUND_INVERSE (26, 2, RA4, RA2, RA0, RA1, RA3, RA2, RA3, RA0, RA1, RA4, RB4, RB2, RB0, RB1, RB3, RB2, RB3, RB0, RB1, RB4); ROUND_INVERSE (25, 1, RA2, RA3, RA0, RA1, RA4, RA4, RA2, RA1, RA0, RA3, RB2, RB3, RB0, RB1, RB4, RB4, RB2, RB1, RB0, RB3); ROUND_INVERSE (24, 0, RA4, RA2, RA1, RA0, RA3, RA4, RA3, RA2, RA0, RA1, RB4, RB2, RB1, RB0, 
RB3, RB4, RB3, RB2, RB0, RB1); ROUND_INVERSE (23, 7, RA4, RA3, RA2, RA0, RA1, RA0, RA4, RA3, RA1, RA2, RB4, RB3, RB2, RB0, RB1, RB0, RB4, RB3, RB1, RB2); ROUND_INVERSE (22, 6, RA0, RA4, RA3, RA1, RA2, RA4, RA3, RA2, RA1, RA0, RB0, RB4, RB3, RB1, RB2, RB4, RB3, RB2, RB1, RB0); ROUND_INVERSE (21, 5, RA4, RA3, RA2, RA1, RA0, RA3, RA0, RA1, RA2, RA4, RB4, RB3, RB2, RB1, RB0, RB3, RB0, RB1, RB2, RB4); ROUND_INVERSE (20, 4, RA3, RA0, RA1, RA2, RA4, RA3, RA2, RA1, RA4, RA0, RB3, RB0, RB1, RB2, RB4, RB3, RB2, RB1, RB4, RB0); ROUND_INVERSE (19, 3, RA3, RA2, RA1, RA4, RA0, RA1, RA2, RA4, RA3, RA0, RB3, RB2, RB1, RB4, RB0, RB1, RB2, RB4, RB3, RB0); ROUND_INVERSE (18, 2, RA1, RA2, RA4, RA3, RA0, RA2, RA0, RA4, RA3, RA1, RB1, RB2, RB4, RB3, RB0, RB2, RB0, RB4, RB3, RB1); ROUND_INVERSE (17, 1, RA2, RA0, RA4, RA3, RA1, RA1, RA2, RA3, RA4, RA0, RB2, RB0, RB4, RB3, RB1, RB1, RB2, RB3, RB4, RB0); ROUND_INVERSE (16, 0, RA1, RA2, RA3, RA4, RA0, RA1, RA0, RA2, RA4, RA3, RB1, RB2, RB3, RB4, RB0, RB1, RB0, RB2, RB4, RB3); ROUND_INVERSE (15, 7, RA1, RA0, RA2, RA4, RA3, RA4, RA1, RA0, RA3, RA2, RB1, RB0, RB2, RB4, RB3, RB4, RB1, RB0, RB3, RB2); ROUND_INVERSE (14, 6, RA4, RA1, RA0, RA3, RA2, RA1, RA0, RA2, RA3, RA4, RB4, RB1, RB0, RB3, RB2, RB1, RB0, RB2, RB3, RB4); ROUND_INVERSE (13, 5, RA1, RA0, RA2, RA3, RA4, RA0, RA4, RA3, RA2, RA1, RB1, RB0, RB2, RB3, RB4, RB0, RB4, RB3, RB2, RB1); ROUND_INVERSE (12, 4, RA0, RA4, RA3, RA2, RA1, RA0, RA2, RA3, RA1, RA4, RB0, RB4, RB3, RB2, RB1, RB0, RB2, RB3, RB1, RB4); ROUND_INVERSE (11, 3, RA0, RA2, RA3, RA1, RA4, RA3, RA2, RA1, RA0, RA4, RB0, RB2, RB3, RB1, RB4, RB3, RB2, RB1, RB0, RB4); ROUND_INVERSE (10, 2, RA3, RA2, RA1, RA0, RA4, RA2, RA4, RA1, RA0, RA3, RB3, RB2, RB1, RB0, RB4, RB2, RB4, RB1, RB0, RB3); ROUND_INVERSE (9, 1, RA2, RA4, RA1, RA0, RA3, RA3, RA2, RA0, RA1, RA4, RB2, RB4, RB1, RB0, RB3, RB3, RB2, RB0, RB1, RB4); ROUND_INVERSE (8, 0, RA3, RA2, RA0, RA1, RA4, RA3, RA4, RA2, RA1, RA0, RB3, RB2, RB0, RB1, RB4, RB3, RB4, RB2, RB1, RB0); ROUND_INVERSE (7, 7, RA3, RA4, RA2, RA1, RA0, RA1, RA3, RA4, RA0, RA2, RB3, RB4, RB2, RB1, RB0, RB1, RB3, RB4, RB0, RB2); ROUND_INVERSE (6, 6, RA1, RA3, RA4, RA0, RA2, RA3, RA4, RA2, RA0, RA1, RB1, RB3, RB4, RB0, RB2, RB3, RB4, RB2, RB0, RB1); ROUND_INVERSE (5, 5, RA3, RA4, RA2, RA0, RA1, RA4, RA1, RA0, RA2, RA3, RB3, RB4, RB2, RB0, RB1, RB4, RB1, RB0, RB2, RB3); ROUND_INVERSE (4, 4, RA4, RA1, RA0, RA2, RA3, RA4, RA2, RA0, RA3, RA1, RB4, RB1, RB0, RB2, RB3, RB4, RB2, RB0, RB3, RB1); ROUND_INVERSE (3, 3, RA4, RA2, RA0, RA3, RA1, RA0, RA2, RA3, RA4, RA1, RB4, RB2, RB0, RB3, RB1, RB0, RB2, RB3, RB4, RB1); ROUND_INVERSE (2, 2, RA0, RA2, RA3, RA4, RA1, RA2, RA1, RA3, RA4, RA0, RB0, RB2, RB3, RB4, RB1, RB2, RB1, RB3, RB4, RB0); ROUND_INVERSE (1, 1, RA2, RA1, RA3, RA4, RA0, RA0, RA2, RA4, RA3, RA1, RB2, RB1, RB3, RB4, RB0, RB0, RB2, RB4, RB3, RB1); ROUND_INVERSE (0, 0, RA0, RA2, RA4, RA3, RA1, RA0, RA1, RA2, RA3, RA4, RB0, RB2, RB4, RB3, RB1, RB0, RB1, RB2, RB3, RB4); transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1); transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); ret; + CFI_ENDPROC(); ELF(.size __serpent_dec_blk8,.-__serpent_dec_blk8;) .align 8 .globl _gcry_serpent_sse2_ctr_enc ELF(.type _gcry_serpent_sse2_ctr_enc,@function;) _gcry_serpent_sse2_ctr_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (8 blocks) * %rdx: src (8 blocks) * %rcx: iv (big endian, 128bit) */ + CFI_STARTPROC(); /* load IV and byteswap */ movdqu (%rcx), RA0; movdqa RA0, RTMP0; pbswap(RTMP0, RTMP1); /* be => le */ pcmpeqd RNOT, RNOT; psrldq $8, RNOT; /* low: -1, 
high: 0 */ movdqa RNOT, RTMP2; paddq RTMP2, RTMP2; /* low: -2, high: 0 */ /* construct IVs */ movdqa RTMP0, RTMP1; psubq RNOT, RTMP0; /* +1 */ movdqa RTMP0, RA1; psubq RTMP2, RTMP1; /* +2 */ movdqa RTMP1, RA2; psubq RTMP2, RTMP0; /* +3 */ movdqa RTMP0, RA3; psubq RTMP2, RTMP1; /* +4 */ movdqa RTMP1, RB0; psubq RTMP2, RTMP0; /* +5 */ movdqa RTMP0, RB1; psubq RTMP2, RTMP1; /* +6 */ movdqa RTMP1, RB2; psubq RTMP2, RTMP0; /* +7 */ movdqa RTMP0, RB3; psubq RTMP2, RTMP1; /* +8 */ /* check need for handling 64-bit overflow and carry */ cmpl $0xffffffff, 8(%rcx); jne .Lno_ctr_carry; movl 12(%rcx), %eax; bswapl %eax; cmpl $-8, %eax; jb .Lno_ctr_carry; pslldq $8, RNOT; /* low: 0, high: -1 */ je .Lcarry_RTMP0; cmpl $-6, %eax; jb .Lcarry_RB3; je .Lcarry_RB2; cmpl $-4, %eax; jb .Lcarry_RB1; je .Lcarry_RB0; cmpl $-2, %eax; jb .Lcarry_RA3; je .Lcarry_RA2; psubq RNOT, RA1; .Lcarry_RA2: psubq RNOT, RA2; .Lcarry_RA3: psubq RNOT, RA3; .Lcarry_RB0: psubq RNOT, RB0; .Lcarry_RB1: psubq RNOT, RB1; .Lcarry_RB2: psubq RNOT, RB2; .Lcarry_RB3: psubq RNOT, RB3; .Lcarry_RTMP0: psubq RNOT, RTMP1; .Lno_ctr_carry: /* le => be */ pbswap(RA1, RTMP0); pbswap(RA2, RTMP0); pbswap(RA3, RTMP0); pbswap(RB0, RTMP0); pbswap(RB1, RTMP0); pbswap(RB2, RTMP0); pbswap(RB3, RTMP0); pbswap(RTMP1, RTMP0); /* store new IV */ movdqu RTMP1, (%rcx); call __serpent_enc_blk8; pxor_u((0 * 16)(%rdx), RA4, RTMP0); pxor_u((1 * 16)(%rdx), RA1, RTMP0); pxor_u((2 * 16)(%rdx), RA2, RTMP0); pxor_u((3 * 16)(%rdx), RA0, RTMP0); pxor_u((4 * 16)(%rdx), RB4, RTMP0); pxor_u((5 * 16)(%rdx), RB1, RTMP0); pxor_u((6 * 16)(%rdx), RB2, RTMP0); pxor_u((7 * 16)(%rdx), RB0, RTMP0); movdqu RA4, (0 * 16)(%rsi); movdqu RA1, (1 * 16)(%rsi); movdqu RA2, (2 * 16)(%rsi); movdqu RA0, (3 * 16)(%rsi); movdqu RB4, (4 * 16)(%rsi); movdqu RB1, (5 * 16)(%rsi); movdqu RB2, (6 * 16)(%rsi); movdqu RB0, (7 * 16)(%rsi); /* clear the used registers */ pxor RA0, RA0; pxor RA1, RA1; pxor RA2, RA2; pxor RA3, RA3; pxor RA4, RA4; pxor RB0, RB0; pxor RB1, RB1; pxor RB2, RB2; pxor RB3, RB3; pxor RB4, RB4; pxor RTMP0, RTMP0; pxor RTMP1, RTMP1; pxor RTMP2, RTMP2; pxor RNOT, RNOT; - ret + ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_ctr_enc,.-_gcry_serpent_sse2_ctr_enc;) .align 8 .globl _gcry_serpent_sse2_cbc_dec ELF(.type _gcry_serpent_sse2_cbc_dec,@function;) _gcry_serpent_sse2_cbc_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (8 blocks) * %rdx: src (8 blocks) * %rcx: iv */ + CFI_STARTPROC(); movdqu (0 * 16)(%rdx), RA0; movdqu (1 * 16)(%rdx), RA1; movdqu (2 * 16)(%rdx), RA2; movdqu (3 * 16)(%rdx), RA3; movdqu (4 * 16)(%rdx), RB0; movdqu (5 * 16)(%rdx), RB1; movdqu (6 * 16)(%rdx), RB2; movdqu (7 * 16)(%rdx), RB3; call __serpent_dec_blk8; movdqu (7 * 16)(%rdx), RNOT; pxor_u((%rcx), RA0, RTMP0); pxor_u((0 * 16)(%rdx), RA1, RTMP0); pxor_u((1 * 16)(%rdx), RA2, RTMP0); pxor_u((2 * 16)(%rdx), RA3, RTMP0); pxor_u((3 * 16)(%rdx), RB0, RTMP0); pxor_u((4 * 16)(%rdx), RB1, RTMP0); pxor_u((5 * 16)(%rdx), RB2, RTMP0); pxor_u((6 * 16)(%rdx), RB3, RTMP0); movdqu RNOT, (%rcx); /* store new IV */ movdqu RA0, (0 * 16)(%rsi); movdqu RA1, (1 * 16)(%rsi); movdqu RA2, (2 * 16)(%rsi); movdqu RA3, (3 * 16)(%rsi); movdqu RB0, (4 * 16)(%rsi); movdqu RB1, (5 * 16)(%rsi); movdqu RB2, (6 * 16)(%rsi); movdqu RB3, (7 * 16)(%rsi); /* clear the used registers */ pxor RA0, RA0; pxor RA1, RA1; pxor RA2, RA2; pxor RA3, RA3; pxor RA4, RA4; pxor RB0, RB0; pxor RB1, RB1; pxor RB2, RB2; pxor RB3, RB3; pxor RB4, RB4; pxor RTMP0, RTMP0; pxor RTMP1, RTMP1; pxor RTMP2, RTMP2; pxor RNOT, RNOT; - ret + ret; + CFI_ENDPROC(); ELF(.size 
_gcry_serpent_sse2_cbc_dec,.-_gcry_serpent_sse2_cbc_dec;) .align 8 .globl _gcry_serpent_sse2_cfb_dec ELF(.type _gcry_serpent_sse2_cfb_dec,@function;) _gcry_serpent_sse2_cfb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (8 blocks) * %rdx: src (8 blocks) * %rcx: iv */ + CFI_STARTPROC(); /* Load input */ movdqu (%rcx), RA0; movdqu 0 * 16(%rdx), RA1; movdqu 1 * 16(%rdx), RA2; movdqu 2 * 16(%rdx), RA3; movdqu 3 * 16(%rdx), RB0; movdqu 4 * 16(%rdx), RB1; movdqu 5 * 16(%rdx), RB2; movdqu 6 * 16(%rdx), RB3; /* Update IV */ movdqu 7 * 16(%rdx), RNOT; movdqu RNOT, (%rcx); call __serpent_enc_blk8; pxor_u((0 * 16)(%rdx), RA4, RTMP0); pxor_u((1 * 16)(%rdx), RA1, RTMP0); pxor_u((2 * 16)(%rdx), RA2, RTMP0); pxor_u((3 * 16)(%rdx), RA0, RTMP0); pxor_u((4 * 16)(%rdx), RB4, RTMP0); pxor_u((5 * 16)(%rdx), RB1, RTMP0); pxor_u((6 * 16)(%rdx), RB2, RTMP0); pxor_u((7 * 16)(%rdx), RB0, RTMP0); movdqu RA4, (0 * 16)(%rsi); movdqu RA1, (1 * 16)(%rsi); movdqu RA2, (2 * 16)(%rsi); movdqu RA0, (3 * 16)(%rsi); movdqu RB4, (4 * 16)(%rsi); movdqu RB1, (5 * 16)(%rsi); movdqu RB2, (6 * 16)(%rsi); movdqu RB0, (7 * 16)(%rsi); /* clear the used registers */ pxor RA0, RA0; pxor RA1, RA1; pxor RA2, RA2; pxor RA3, RA3; pxor RA4, RA4; pxor RB0, RB0; pxor RB1, RB1; pxor RB2, RB2; pxor RB3, RB3; pxor RB4, RB4; pxor RTMP0, RTMP0; pxor RTMP1, RTMP1; pxor RTMP2, RTMP2; pxor RNOT, RNOT; - ret + ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_cfb_dec,.-_gcry_serpent_sse2_cfb_dec;) .align 8 .globl _gcry_serpent_sse2_ocb_enc ELF(.type _gcry_serpent_sse2_ocb_enc,@function;) _gcry_serpent_sse2_ocb_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (8 blocks) * %rdx: src (8 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[8]) */ + CFI_STARTPROC(); subq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); + CFI_REL_OFFSET(%r10, 0 * 8); + CFI_REL_OFFSET(%r11, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); movdqu (%rcx), RTMP0; movdqu (%r8), RTMP1; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Checksum_i = Checksum_{i-1} xor P_i */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ #define OCB_INPUT(n, lreg, xreg) \ movdqu (n * 16)(%rdx), xreg; \ movdqu (lreg), RNOT; \ pxor RNOT, RTMP0; \ pxor xreg, RTMP1; \ pxor RTMP0, xreg; \ movdqu RTMP0, (n * 16)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq (2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, RA0); OCB_INPUT(1, %r11, RA1); OCB_INPUT(2, %r12, RA2); OCB_INPUT(3, %r13, RA3); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(4, %r10, RB0); OCB_INPUT(5, %r11, RB1); OCB_INPUT(6, %r12, RB2); OCB_INPUT(7, %r13, RB3); #undef OCB_INPUT movdqu RTMP0, (%rcx); movdqu RTMP1, (%r8); movq (0 * 8)(%rsp), %r10; movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); call __serpent_enc_blk8; addq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-4 * 8); pxor_u((0 * 16)(%rsi), RA4, RTMP0); pxor_u((1 * 16)(%rsi), RA1, RTMP0); pxor_u((2 * 16)(%rsi), RA2, RTMP0); pxor_u((3 * 16)(%rsi), RA0, RTMP0); pxor_u((4 * 16)(%rsi), RB4, RTMP0); pxor_u((5 * 16)(%rsi), RB1, RTMP0); pxor_u((6 * 16)(%rsi), RB2, RTMP0); pxor_u((7 * 16)(%rsi), RB0, RTMP0); movdqu RA4, (0 * 16)(%rsi); movdqu RA1, (1 * 16)(%rsi); movdqu RA2, (2 * 16)(%rsi); movdqu RA0, (3 * 16)(%rsi); movdqu RB4, (4 * 16)(%rsi); movdqu RB1, (5 * 
16)(%rsi); movdqu RB2, (6 * 16)(%rsi); movdqu RB0, (7 * 16)(%rsi); /* clear the used registers */ pxor RA0, RA0; pxor RA1, RA1; pxor RA2, RA2; pxor RA3, RA3; pxor RA4, RA4; pxor RB0, RB0; pxor RB1, RB1; pxor RB2, RB2; pxor RB3, RB3; pxor RB4, RB4; pxor RTMP0, RTMP0; pxor RTMP1, RTMP1; pxor RTMP2, RTMP2; pxor RNOT, RNOT; ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_ocb_enc,.-_gcry_serpent_sse2_ocb_enc;) .align 8 .globl _gcry_serpent_sse2_ocb_dec ELF(.type _gcry_serpent_sse2_ocb_dec,@function;) _gcry_serpent_sse2_ocb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (8 blocks) * %rdx: src (8 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[8]) */ + CFI_STARTPROC(); subq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); + CFI_REL_OFFSET(%r10, 0 * 8); + CFI_REL_OFFSET(%r11, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); movdqu (%rcx), RTMP0; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ #define OCB_INPUT(n, lreg, xreg) \ movdqu (n * 16)(%rdx), xreg; \ movdqu (lreg), RNOT; \ pxor RNOT, RTMP0; \ pxor RTMP0, xreg; \ movdqu RTMP0, (n * 16)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq (2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, RA0); OCB_INPUT(1, %r11, RA1); OCB_INPUT(2, %r12, RA2); OCB_INPUT(3, %r13, RA3); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(4, %r10, RB0); OCB_INPUT(5, %r11, RB1); OCB_INPUT(6, %r12, RB2); OCB_INPUT(7, %r13, RB3); #undef OCB_INPUT movdqu RTMP0, (%rcx); movq (0 * 8)(%rsp), %r10; movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); call __serpent_dec_blk8; addq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-4 * 8); movdqu (%r8), RTMP0; pxor_u((0 * 16)(%rsi), RA0, RTMP1); pxor_u((1 * 16)(%rsi), RA1, RTMP1); pxor_u((2 * 16)(%rsi), RA2, RTMP1); pxor_u((3 * 16)(%rsi), RA3, RTMP1); pxor_u((4 * 16)(%rsi), RB0, RTMP1); pxor_u((5 * 16)(%rsi), RB1, RTMP1); pxor_u((6 * 16)(%rsi), RB2, RTMP1); pxor_u((7 * 16)(%rsi), RB3, RTMP1); /* Checksum_i = Checksum_{i-1} xor P_i */ movdqu RA0, (0 * 16)(%rsi); pxor RA0, RTMP0; movdqu RA1, (1 * 16)(%rsi); pxor RA1, RTMP0; movdqu RA2, (2 * 16)(%rsi); pxor RA2, RTMP0; movdqu RA3, (3 * 16)(%rsi); pxor RA3, RTMP0; movdqu RB0, (4 * 16)(%rsi); pxor RB0, RTMP0; movdqu RB1, (5 * 16)(%rsi); pxor RB1, RTMP0; movdqu RB2, (6 * 16)(%rsi); pxor RB2, RTMP0; movdqu RB3, (7 * 16)(%rsi); pxor RB3, RTMP0; movdqu RTMP0, (%r8); /* clear the used registers */ pxor RA0, RA0; pxor RA1, RA1; pxor RA2, RA2; pxor RA3, RA3; pxor RA4, RA4; pxor RB0, RB0; pxor RB1, RB1; pxor RB2, RB2; pxor RB3, RB3; pxor RB4, RB4; pxor RTMP0, RTMP0; pxor RTMP1, RTMP1; pxor RTMP2, RTMP2; pxor RNOT, RNOT; ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_ocb_dec,.-_gcry_serpent_sse2_ocb_dec;) .align 8 .globl _gcry_serpent_sse2_ocb_auth ELF(.type _gcry_serpent_sse2_ocb_auth,@function;) _gcry_serpent_sse2_ocb_auth: /* input: * %rdi: ctx, CTX * %rsi: abuf (8 blocks) * %rdx: offset * %rcx: checksum * %r8 : L pointers (void *L[8]) */ + CFI_STARTPROC(); subq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); + CFI_REL_OFFSET(%r10, 0 * 8); + CFI_REL_OFFSET(%r11, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + 
CFI_REL_OFFSET(%r13, 3 * 8); movdqu (%rdx), RTMP0; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ #define OCB_INPUT(n, lreg, xreg) \ movdqu (n * 16)(%rsi), xreg; \ movdqu (lreg), RNOT; \ pxor RNOT, RTMP0; \ pxor RTMP0, xreg; movq (0 * 8)(%r8), %r10; movq (1 * 8)(%r8), %r11; movq (2 * 8)(%r8), %r12; movq (3 * 8)(%r8), %r13; OCB_INPUT(0, %r10, RA0); OCB_INPUT(1, %r11, RA1); OCB_INPUT(2, %r12, RA2); OCB_INPUT(3, %r13, RA3); movq (4 * 8)(%r8), %r10; movq (5 * 8)(%r8), %r11; movq (6 * 8)(%r8), %r12; movq (7 * 8)(%r8), %r13; OCB_INPUT(4, %r10, RB0); OCB_INPUT(5, %r11, RB1); OCB_INPUT(6, %r12, RB2); OCB_INPUT(7, %r13, RB3); #undef OCB_INPUT movdqu RTMP0, (%rdx); movq (0 * 8)(%rsp), %r10; movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); call __serpent_enc_blk8; addq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-4 * 8); movdqu (%rcx), RTMP0; pxor RB4, RA4; pxor RB1, RA1; pxor RB2, RA2; pxor RB0, RA0; pxor RTMP0, RA2; pxor RA4, RA1; pxor RA2, RA0; pxor RA1, RA0; movdqu RA0, (%rcx); /* clear the used registers */ pxor RA0, RA0; pxor RA1, RA1; pxor RA2, RA2; pxor RA3, RA3; pxor RA4, RA4; pxor RB0, RB0; pxor RB1, RB1; pxor RB2, RB2; pxor RB3, RB3; pxor RB4, RB4; pxor RTMP0, RTMP0; pxor RTMP1, RTMP1; pxor RTMP2, RTMP2; pxor RNOT, RNOT; ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_ocb_auth,.-_gcry_serpent_sse2_ocb_auth;) #endif /*defined(USE_SERPENT)*/ #endif /*__x86_64*/ diff --git a/cipher/sha1-avx-amd64.S b/cipher/sha1-avx-amd64.S index 5d674c15..85876ad4 100644 --- a/cipher/sha1-avx-amd64.S +++ b/cipher/sha1-avx-amd64.S @@ -1,431 +1,429 @@ /* sha1-avx-amd64.S - Intel AVX accelerated SHA-1 transform function * Copyright (C) 2013 Jussi Kivilinna * * Based on sha1.c: * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ /* * Intel SSSE3 accelerated SHA-1 implementation based on white paper: * "Improving the Performance of the Secure Hash Algorithm (SHA-1)" * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1 */ #ifdef __x86_64__ #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA1) -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif - - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif +#include "asm-common-amd64.h" /* Context structure */ #define state_h0 0 #define state_h1 4 #define state_h2 8 #define state_h3 12 #define state_h4 16 /* Constants */ .text #define K1 0x5A827999 #define K2 0x6ED9EBA1 #define K3 0x8F1BBCDC #define K4 0xCA62C1D6 .align 16 .LK_XMM: .LK1: .long K1, K1, K1, K1 .LK2: .long K2, K2, K2, K2 .LK3: .long K3, K3, K3, K3 .LK4: .long K4, K4, K4, K4 .Lbswap_shufb_ctl: .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f /* Register macros */ #define RSTATE %r8 #define RDATA %r9 #define ROLDSTACK %r10 #define RNBLKS %r11 #define a %eax #define b %ebx #define c %ecx #define d %edx #define e %edi #define RT0 %esi #define RT1 %ebp #define Wtmp0 %xmm0 #define Wtmp1 %xmm1 #define W0 %xmm2 #define W1 %xmm3 #define W2 %xmm4 #define W3 %xmm5 #define W4 %xmm6 #define W5 %xmm7 #define W6 %xmm8 #define W7 %xmm9 #define BSWAP_REG %xmm10 /* Round function macros. */ #define WK(i) (((i) & 15) * 4)(%rsp) #define R_F1(a,b,c,d,e,i) \ movl c, RT0; \ addl WK(i), e; \ xorl d, RT0; \ movl a, RT1; \ andl b, RT0; \ shldl $30, b, b; \ xorl d, RT0; \ leal (RT0,e), e; \ shldl $5, RT1, RT1; \ addl RT1, e; #define R_F2(a,b,c,d,e,i) \ movl c, RT0; \ addl WK(i), e; \ xorl b, RT0; \ shldl $30, b, b; \ xorl d, RT0; \ movl a, RT1; \ leal (RT0,e), e; \ shldl $5, RT1, RT1; \ addl RT1, e; #define R_F3(a,b,c,d,e,i) \ movl c, RT0; \ movl b, RT1; \ xorl b, RT0; \ andl c, RT1; \ andl d, RT0; \ addl RT1, e; \ addl WK(i), e; \ shldl $30, b, b; \ movl a, RT1; \ leal (RT0,e), e; \ shldl $5, RT1, RT1; \ addl RT1, e; #define R_F4(a,b,c,d,e,i) R_F2(a,b,c,d,e,i) #define R(a,b,c,d,e,f,i) \ R_##f(a,b,c,d,e,i) /* Input expansion macros. */ #define W_PRECALC_00_15_0(i, W, tmp0) \ vmovdqu (4*(i))(RDATA), tmp0; #define W_PRECALC_00_15_1(i, W, tmp0) \ vpshufb BSWAP_REG, tmp0, W; #define W_PRECALC_00_15_2(i, W, tmp0) \ - vpaddd (.LK_XMM + ((i)/20)*16) RIP, W, tmp0; + vpaddd (.LK_XMM + ((i)/20)*16) rRIP, W, tmp0; #define W_PRECALC_00_15_3(i, W, tmp0) \ vmovdqa tmp0, WK(i&~3); #define W_PRECALC_16_31_0(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ vpalignr $8, W_m16, W_m12, W; \ vpsrldq $4, W_m04, tmp0; \ vpxor W_m08, W, W; #define W_PRECALC_16_31_1(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ vpxor W_m16, tmp0, tmp0; \ vpxor tmp0, W, W; \ vpslld $1, W, tmp0; \ vpslldq $12, W, tmp1; \ vpsrld $31, W, W; #define W_PRECALC_16_31_2(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ vpor W, tmp0, tmp0; \ vpsrld $30, tmp1, W; \ vpslld $2, tmp1, tmp1; #define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ vpxor W, tmp0, tmp0; \ vpxor tmp1, tmp0, W; \ - vpaddd (.LK_XMM + ((i)/20)*16) RIP, W, tmp0; \ + vpaddd (.LK_XMM + ((i)/20)*16) rRIP, W, tmp0; \ vmovdqa tmp0, WK((i)&~3); #define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ vpxor W_m28, W, W; \ vpalignr $8, W_m08, W_m04, tmp0; #define W_PRECALC_32_79_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ vpxor W_m16, W, W; \ vpxor tmp0, W, W; #define W_PRECALC_32_79_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ vpsrld $30, W, tmp0; \ vpslld $2, W, W; #define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ vpor W, tmp0, W; \ - vpaddd (.LK_XMM + ((i)/20)*16) RIP, W, tmp0; \ + vpaddd (.LK_XMM + ((i)/20)*16) rRIP, W, tmp0; \ vmovdqa tmp0, WK((i)&~3); /* * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA. 
* * unsigned int * _gcry_sha1_transform_amd64_avx (void *ctx, const unsigned char *data, * size_t nblks) */ .globl _gcry_sha1_transform_amd64_avx ELF(.type _gcry_sha1_transform_amd64_avx,@function) .align 16 _gcry_sha1_transform_amd64_avx: /* input: * %rdi: ctx, CTX * %rsi: data (64*nblks bytes) * %rdx: nblks */ + CFI_STARTPROC(); xorl %eax, %eax; cmpq $0, %rdx; jz .Lret; vzeroupper; movq %rdx, RNBLKS; movq %rdi, RSTATE; movq %rsi, RDATA; pushq %rbx; + CFI_PUSH(%rbx); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, ROLDSTACK; + CFI_DEF_CFA_REGISTER(ROLDSTACK); subq $(16*4), %rsp; andq $(~31), %rsp; /* Get the values of the chaining variables. */ movl state_h0(RSTATE), a; movl state_h1(RSTATE), b; movl state_h2(RSTATE), c; movl state_h3(RSTATE), d; movl state_h4(RSTATE), e; - vmovdqa .Lbswap_shufb_ctl RIP, BSWAP_REG; + vmovdqa .Lbswap_shufb_ctl rRIP, BSWAP_REG; /* Precalc 0-15. */ W_PRECALC_00_15_0(0, W0, Wtmp0); W_PRECALC_00_15_1(1, W0, Wtmp0); W_PRECALC_00_15_2(2, W0, Wtmp0); W_PRECALC_00_15_3(3, W0, Wtmp0); W_PRECALC_00_15_0(4, W7, Wtmp0); W_PRECALC_00_15_1(5, W7, Wtmp0); W_PRECALC_00_15_2(6, W7, Wtmp0); W_PRECALC_00_15_3(7, W7, Wtmp0); W_PRECALC_00_15_0(8, W6, Wtmp0); W_PRECALC_00_15_1(9, W6, Wtmp0); W_PRECALC_00_15_2(10, W6, Wtmp0); W_PRECALC_00_15_3(11, W6, Wtmp0); W_PRECALC_00_15_0(12, W5, Wtmp0); W_PRECALC_00_15_1(13, W5, Wtmp0); W_PRECALC_00_15_2(14, W5, Wtmp0); W_PRECALC_00_15_3(15, W5, Wtmp0); .align 8 .Loop: addq $64, RDATA; /* Transform 0-15 + Precalc 16-31. */ R( a, b, c, d, e, F1, 0 ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( e, a, b, c, d, F1, 1 ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( d, e, a, b, c, F1, 2 ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( c, d, e, a, b, F1, 3 ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( b, c, d, e, a, F1, 4 ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( a, b, c, d, e, F1, 5 ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( e, a, b, c, d, F1, 6 ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( d, e, a, b, c, F1, 7 ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( c, d, e, a, b, F1, 8 ); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( b, c, d, e, a, F1, 9 ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( a, b, c, d, e, F1, 10 ); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( e, a, b, c, d, F1, 11 ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( d, e, a, b, c, F1, 12 ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); R( c, d, e, a, b, F1, 13 ); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); R( b, c, d, e, a, F1, 14 ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); R( a, b, c, d, e, F1, 15 ); W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); /* Transform 16-63 + Precalc 32-79. 
*/ R( e, a, b, c, d, F1, 16 ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( d, e, a, b, c, F1, 17 ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( c, d, e, a, b, F1, 18 ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( b, c, d, e, a, F1, 19 ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( a, b, c, d, e, F2, 20 ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( e, a, b, c, d, F2, 21 ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( d, e, a, b, c, F2, 22 ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( c, d, e, a, b, F2, 23 ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( b, c, d, e, a, F2, 24 ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( a, b, c, d, e, F2, 25 ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( e, a, b, c, d, F2, 26 ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( d, e, a, b, c, F2, 27 ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( c, d, e, a, b, F2, 28 ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( b, c, d, e, a, F2, 29 ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( a, b, c, d, e, F2, 30 ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( e, a, b, c, d, F2, 31 ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( d, e, a, b, c, F2, 32 ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( c, d, e, a, b, F2, 33 ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( b, c, d, e, a, F2, 34 ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( a, b, c, d, e, F2, 35 ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( e, a, b, c, d, F2, 36 ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( d, e, a, b, c, F2, 37 ); W_PRECALC_32_79_1(53, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( c, d, e, a, b, F2, 38 ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( b, c, d, e, a, F2, 39 ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( a, b, c, d, e, F3, 40 ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( e, a, b, c, d, F3, 41 ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( d, e, a, b, c, F3, 42 ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( c, d, e, a, b, F3, 43 ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( b, c, d, e, a, F3, 44 ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( a, b, c, d, e, F3, 45 ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( e, a, b, c, d, F3, 46 ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( d, e, a, b, c, F3, 47 ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( c, d, e, a, b, F3, 48 ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( b, c, d, e, a, F3, 49 ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( a, b, c, d, e, F3, 50 ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( e, a, b, c, d, F3, 51 ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( d, e, a, b, c, F3, 52 ); W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( c, d, e, a, b, F3, 53 ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( b, c, d, e, a, F3, 54 ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( a, b, c, d, e, F3, 55 ); 
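For readers following the R(a,b,c,d,e,f,i) macro chain above: the R_F1..R_F4 helpers compute the four classic SHA-1 boolean functions plus the per-round additions, and the call sites rotate the roles of a..e instead of shuffling registers. The plain-C sketch below models what a single macro invocation does, assuming the WK(i) slot already holds W[i] + K as prepared by the precalc macros; the helper names are illustrative only and not part of libgcrypt.

/* Illustrative C model of one SHA-1 round as computed by the R_F* macros.
 * Hypothetical helper, not libgcrypt API. */
#include <stdint.h>

static inline uint32_t rol32(uint32_t x, unsigned n)
{
  return (x << n) | (x >> (32 - n));
}

/* f selects the round function: 1 for rounds 0-19, 2 for 20-39,
 * 3 for 40-59, 4 for 60-79 (R_F4 is the same XOR form as R_F2). */
static void sha1_round(uint32_t *a, uint32_t *b, uint32_t *c,
                       uint32_t *d, uint32_t *e, uint32_t wk, int f)
{
  uint32_t t;

  switch (f)
    {
    case 1:  t = *d ^ (*b & (*c ^ *d)); break;         /* CH, as in R_F1 */
    case 3:  t = (*b & *c) + (*d & (*b ^ *c)); break;  /* MAJ, as in R_F3; the two
                                                          terms share no bits, so
                                                          add equals or */
    default: t = *b ^ *c ^ *d; break;                  /* parity, R_F2/R_F4 */
    }

  *e += wk + t + rol32(*a, 5);  /* wk already contains W[i] + K (see WK(i)) */
  *b = rol32(*b, 30);
}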
W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( e, a, b, c, d, F3, 56 ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( d, e, a, b, c, F3, 57 ); W_PRECALC_32_79_1(73, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( c, d, e, a, b, F3, 58 ); W_PRECALC_32_79_2(74, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( b, c, d, e, a, F3, 59 ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( a, b, c, d, e, F4, 60 ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( e, a, b, c, d, F4, 61 ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( d, e, a, b, c, F4, 62 ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( c, d, e, a, b, F4, 63 ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); decq RNBLKS; jz .Lend; /* Transform 64-79 + Precalc 0-15 of next block. */ R( b, c, d, e, a, F4, 64 ); W_PRECALC_00_15_0(0, W0, Wtmp0); R( a, b, c, d, e, F4, 65 ); W_PRECALC_00_15_1(1, W0, Wtmp0); R( e, a, b, c, d, F4, 66 ); W_PRECALC_00_15_2(2, W0, Wtmp0); R( d, e, a, b, c, F4, 67 ); W_PRECALC_00_15_3(3, W0, Wtmp0); R( c, d, e, a, b, F4, 68 ); W_PRECALC_00_15_0(4, W7, Wtmp0); R( b, c, d, e, a, F4, 69 ); W_PRECALC_00_15_1(5, W7, Wtmp0); R( a, b, c, d, e, F4, 70 ); W_PRECALC_00_15_2(6, W7, Wtmp0); R( e, a, b, c, d, F4, 71 ); W_PRECALC_00_15_3(7, W7, Wtmp0); R( d, e, a, b, c, F4, 72 ); W_PRECALC_00_15_0(8, W6, Wtmp0); R( c, d, e, a, b, F4, 73 ); W_PRECALC_00_15_1(9, W6, Wtmp0); R( b, c, d, e, a, F4, 74 ); W_PRECALC_00_15_2(10, W6, Wtmp0); R( a, b, c, d, e, F4, 75 ); W_PRECALC_00_15_3(11, W6, Wtmp0); R( e, a, b, c, d, F4, 76 ); W_PRECALC_00_15_0(12, W5, Wtmp0); R( d, e, a, b, c, F4, 77 ); W_PRECALC_00_15_1(13, W5, Wtmp0); R( c, d, e, a, b, F4, 78 ); addl state_h0(RSTATE), a; W_PRECALC_00_15_2(14, W5, Wtmp0); R( b, c, d, e, a, F4, 79 ); W_PRECALC_00_15_3(15, W5, Wtmp0); /* Update the chaining variables. */ addl state_h3(RSTATE), d; addl state_h2(RSTATE), c; addl state_h1(RSTATE), b; addl state_h4(RSTATE), e; movl d, state_h3(RSTATE); movl c, state_h2(RSTATE); movl b, state_h1(RSTATE); movl a, state_h0(RSTATE); movl e, state_h4(RSTATE); jmp .Loop; .align 16 .Lend: vzeroall; /* Transform 64-79 + burn stack */ R( b, c, d, e, a, F4, 64 ); R( a, b, c, d, e, F4, 65 ); R( e, a, b, c, d, F4, 66 ); R( d, e, a, b, c, F4, 67 ); R( c, d, e, a, b, F4, 68 ); R( b, c, d, e, a, F4, 69 ); R( a, b, c, d, e, F4, 70 ); R( e, a, b, c, d, F4, 71 ); R( d, e, a, b, c, F4, 72 ); R( c, d, e, a, b, F4, 73 ); R( b, c, d, e, a, F4, 74 ); R( a, b, c, d, e, F4, 75 ); R( e, a, b, c, d, F4, 76 ); vmovdqa %xmm0, (0*16)(%rsp); R( d, e, a, b, c, F4, 77 ); vmovdqa %xmm0, (1*16)(%rsp); R( c, d, e, a, b, F4, 78 ); vmovdqa %xmm0, (2*16)(%rsp); addl state_h0(RSTATE), a; R( b, c, d, e, a, F4, 79 ); /* 16*4/16-1 = 3 */ vmovdqa %xmm0, (3*16)(%rsp); /* Update the chaining variables. 
*/ addl state_h3(RSTATE), d; addl state_h2(RSTATE), c; addl state_h1(RSTATE), b; addl state_h4(RSTATE), e; movl d, state_h3(RSTATE); movl c, state_h2(RSTATE); movl b, state_h1(RSTATE); movl a, state_h0(RSTATE); movl e, state_h4(RSTATE); movq ROLDSTACK, %rsp; + CFI_REGISTER(ROLDSTACK, %rsp); + CFI_DEF_CFA_REGISTER(%rsp); popq %rbp; + CFI_POP(%rbp); popq %rbx; + CFI_POP(%rbx); /* stack already burned */ xorl %eax, %eax; .Lret: ret; + CFI_ENDPROC(); ELF(.size _gcry_sha1_transform_amd64_avx, .-_gcry_sha1_transform_amd64_avx;) #endif #endif diff --git a/cipher/sha1-avx-bmi2-amd64.S b/cipher/sha1-avx-bmi2-amd64.S index fe8901ef..5dfcdca9 100644 --- a/cipher/sha1-avx-bmi2-amd64.S +++ b/cipher/sha1-avx-bmi2-amd64.S @@ -1,441 +1,441 @@ /* sha1-avx-bmi2-amd64.S - Intel AVX/BMI2 accelerated SHA-1 transform function * Copyright (C) 2013 Jussi Kivilinna * * Based on sha1.c: * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ /* * Intel SSSE3 accelerated SHA-1 implementation based on white paper: * "Improving the Performance of the Secure Hash Algorithm (SHA-1)" * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1 */ #ifdef __x86_64__ #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_GCC_INLINE_ASM_BMI2) && \ defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA1) -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif - - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "asm-common-amd64.h" /* Context structure */ #define state_h0 0 #define state_h1 4 #define state_h2 8 #define state_h3 12 #define state_h4 16 /* Constants */ .text .align 16 .Lbswap_shufb_ctl: .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f .LK1: .long 0x5A827999 .LK2: .long 0x6ED9EBA1 .LK3: .long 0x8F1BBCDC .LK4: .long 0xCA62C1D6 /* Register macros */ #define RSTATE %r8 #define RDATA %r9 #define ROLDSTACK %r10 #define RNBLKS %r11 #define a %esi #define b %edi #define c %ebp #define d %edx #define e %ecx #define ne %ebx #define RT0 %eax #define RT1 %r12d #define Wtmp0 %xmm0 #define Wtmp1 %xmm1 #define W0 %xmm2 #define W1 %xmm3 #define W2 %xmm4 #define W3 %xmm5 #define W4 %xmm6 #define W5 %xmm7 #define W6 %xmm8 #define W7 %xmm9 #define BSWAP_REG %xmm10 #define K1 %xmm11 #define K2 %xmm12 #define K3 %xmm13 #define K4 %xmm14 /* Round function macros. 
*/ #define WK(i) (((i) & 15) * 4)(%rsp) #define R_F1(a,b,c,d,e,i) \ movl c, RT0; \ andn d, b, RT1; \ addl WK(i), e; \ andl b, RT0; \ rorxl $2, b, b; \ addl RT1, e; \ addl ne, a; \ leal (RT0,e), ne; \ rorxl $27, a, e; #define R_F2(a,b,c,d,e,i) \ movl c, RT0; \ addl WK(i), e; \ xorl b, RT0; \ rorxl $2, b, b; \ xorl d, RT0; \ addl ne, a; \ leal (RT0,e), ne; \ rorxl $27, a, e; #define R_F3(a,b,c,d,e,i) \ movl c, RT0; \ movl b, RT1; \ addl WK(i), e; \ xorl b, RT0; \ andl c, RT1; \ andl d, RT0; \ addl RT1, e; \ rorxl $2, b, b; \ addl ne, a; \ leal (RT0,e), ne; \ rorxl $27, a, e; #define R_F4(a,b,c,d,e,i) R_F2(a,b,c,d,e,i) #define R(a,b,c,d,e,f,i) \ R_##f(a,b,c,d,e,i) /* Input expansion macros. */ #define W_PRECALC_00_15_0(i, W, tmp0) \ vmovdqu (4*(i))(RDATA), tmp0; #define W_PRECALC_00_15_1(i, W, tmp0) \ vpshufb BSWAP_REG, tmp0, W; #define W_PRECALC_00_15_2(i, W, tmp0, K) \ vpaddd K, W, tmp0; #define W_PRECALC_00_15_3(i, W, tmp0) \ vmovdqa tmp0, WK(i&~3); #define W_PRECALC_16_31_0(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ vpalignr $8, W_m16, W_m12, W; \ vpsrldq $4, W_m04, tmp0; \ vpxor W_m08, W, W; #define W_PRECALC_16_31_1(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ vpxor W_m16, tmp0, tmp0; \ vpxor tmp0, W, W; \ vpslld $1, W, tmp0; \ vpslldq $12, W, tmp1; \ vpsrld $31, W, W; #define W_PRECALC_16_31_2(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ vpor W, tmp0, tmp0; \ vpsrld $30, tmp1, W; \ vpslld $2, tmp1, tmp1; #define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1, K) \ vpxor W, tmp0, tmp0; \ vpxor tmp1, tmp0, W; \ vpaddd K, W, tmp0; \ vmovdqa tmp0, WK((i)&~3); #define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ vpxor W_m28, W, W; \ vpalignr $8, W_m08, W_m04, tmp0; #define W_PRECALC_32_79_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ vpxor W_m16, W, W; \ vpxor tmp0, W, W; #define W_PRECALC_32_79_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ vpsrld $30, W, tmp0; \ vpslld $2, W, W; #define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0, K) \ vpor W, tmp0, W; \ vpaddd K, W, tmp0; \ vmovdqa tmp0, WK((i)&~3); /* * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA. * * unsigned int * _gcry_sha1_transform_amd64_avx_bmi2 (void *ctx, const unsigned char *data, * size_t nblks) */ .globl _gcry_sha1_transform_amd64_avx_bmi2 ELF(.type _gcry_sha1_transform_amd64_avx_bmi2,@function) .align 16 _gcry_sha1_transform_amd64_avx_bmi2: /* input: * %rdi: ctx, CTX * %rsi: data (64*nblks bytes) * %rdx: nblks */ + CFI_STARTPROC(); xorl %eax, %eax; cmpq $0, %rdx; jz .Lret; vzeroupper; movq %rdx, RNBLKS; movq %rdi, RSTATE; movq %rsi, RDATA; pushq %rbx; + CFI_PUSH(%rbx); pushq %rbp; + CFI_PUSH(%rbp); pushq %r12; + CFI_PUSH(%r12); movq %rsp, ROLDSTACK; + CFI_DEF_CFA_REGISTER(ROLDSTACK); subq $(16*4), %rsp; andq $(~31), %rsp; /* Get the values of the chaining variables. */ movl state_h0(RSTATE), a; movl state_h1(RSTATE), b; movl state_h2(RSTATE), c; movl state_h3(RSTATE), d; movl state_h4(RSTATE), e; xorl ne, ne; - vmovdqa .Lbswap_shufb_ctl RIP, BSWAP_REG; - vpbroadcastd .LK1 RIP, K1; - vpbroadcastd .LK2 RIP, K2; - vpbroadcastd .LK3 RIP, K3; - vpbroadcastd .LK4 RIP, K4; + vmovdqa .Lbswap_shufb_ctl rRIP, BSWAP_REG; + vpbroadcastd .LK1 rRIP, K1; + vpbroadcastd .LK2 rRIP, K2; + vpbroadcastd .LK3 rRIP, K3; + vpbroadcastd .LK4 rRIP, K4; /* Precalc 0-15. 
*/ W_PRECALC_00_15_0(0, W0, Wtmp0); W_PRECALC_00_15_1(1, W0, Wtmp0); W_PRECALC_00_15_2(2, W0, Wtmp0, K1); W_PRECALC_00_15_3(3, W0, Wtmp0); W_PRECALC_00_15_0(4, W7, Wtmp0); W_PRECALC_00_15_1(5, W7, Wtmp0); W_PRECALC_00_15_2(6, W7, Wtmp0, K1); W_PRECALC_00_15_3(7, W7, Wtmp0); W_PRECALC_00_15_0(8, W6, Wtmp0); W_PRECALC_00_15_1(9, W6, Wtmp0); W_PRECALC_00_15_2(10, W6, Wtmp0, K1); W_PRECALC_00_15_3(11, W6, Wtmp0); W_PRECALC_00_15_0(12, W5, Wtmp0); W_PRECALC_00_15_1(13, W5, Wtmp0); W_PRECALC_00_15_2(14, W5, Wtmp0, K1); W_PRECALC_00_15_3(15, W5, Wtmp0); .align 8 .Loop: addq $64, RDATA; /* Transform 0-15 + Precalc 16-31. */ R( a, b, c, d, e, F1, 0 ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( e, a, b, c, d, F1, 1 ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( d, e, a, b, c, F1, 2 ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( c, d, e, a, b, F1, 3 ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1, K1); R( b, c, d, e, a, F1, 4 ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( a, b, c, d, e, F1, 5 ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( e, a, b, c, d, F1, 6 ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( d, e, a, b, c, F1, 7 ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1, K2); R( c, d, e, a, b, F1, 8 ); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( b, c, d, e, a, F1, 9 ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( a, b, c, d, e, F1, 10 ); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( e, a, b, c, d, F1, 11 ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1, K2); R( d, e, a, b, c, F1, 12 ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); R( c, d, e, a, b, F1, 13 ); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); R( b, c, d, e, a, F1, 14 ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); R( a, b, c, d, e, F1, 15 ); W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1, K2); /* Transform 16-63 + Precalc 32-79. 
*/ R( e, a, b, c, d, F1, 16 ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( d, e, a, b, c, F1, 17 ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( c, d, e, a, b, F1, 18 ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( b, c, d, e, a, F1, 19 ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0, K2); R( a, b, c, d, e, F2, 20 ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( e, a, b, c, d, F2, 21 ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( d, e, a, b, c, F2, 22 ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( c, d, e, a, b, F2, 23 ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0, K2); R( b, c, d, e, a, F2, 24 ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( a, b, c, d, e, F2, 25 ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( e, a, b, c, d, F2, 26 ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( d, e, a, b, c, F2, 27 ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0, K3); R( c, d, e, a, b, F2, 28 ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( b, c, d, e, a, F2, 29 ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( a, b, c, d, e, F2, 30 ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( e, a, b, c, d, F2, 31 ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0, K3); R( d, e, a, b, c, F2, 32 ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( c, d, e, a, b, F2, 33 ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( b, c, d, e, a, F2, 34 ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( a, b, c, d, e, F2, 35 ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0, K3); R( e, a, b, c, d, F2, 36 ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( d, e, a, b, c, F2, 37 ); W_PRECALC_32_79_1(53, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( c, d, e, a, b, F2, 38 ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( b, c, d, e, a, F2, 39 ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0, K3); R( a, b, c, d, e, F3, 40 ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( e, a, b, c, d, F3, 41 ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( d, e, a, b, c, F3, 42 ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( c, d, e, a, b, F3, 43 ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0, K3); R( b, c, d, e, a, F3, 44 ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( a, b, c, d, e, F3, 45 ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( e, a, b, c, d, F3, 46 ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( d, e, a, b, c, F3, 47 ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0, K4); R( c, d, e, a, b, F3, 48 ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( b, c, d, e, a, F3, 49 ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( a, b, c, d, e, F3, 50 ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( e, a, b, c, d, F3, 51 ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0, K4); R( d, e, a, b, c, F3, 52 ); W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( c, d, e, a, b, F3, 53 ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( b, c, d, e, a, F3, 54 ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( a, 
b, c, d, e, F3, 55 ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0, K4); R( e, a, b, c, d, F3, 56 ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( d, e, a, b, c, F3, 57 ); W_PRECALC_32_79_1(73, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( c, d, e, a, b, F3, 58 ); W_PRECALC_32_79_2(74, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( b, c, d, e, a, F3, 59 ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0, K4); R( a, b, c, d, e, F4, 60 ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( e, a, b, c, d, F4, 61 ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( d, e, a, b, c, F4, 62 ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( c, d, e, a, b, F4, 63 ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0, K4); decq RNBLKS; jz .Lend; /* Transform 64-79 + Precalc 0-15 of next block. */ R( b, c, d, e, a, F4, 64 ); W_PRECALC_00_15_0(0, W0, Wtmp0); R( a, b, c, d, e, F4, 65 ); W_PRECALC_00_15_1(1, W0, Wtmp0); R( e, a, b, c, d, F4, 66 ); W_PRECALC_00_15_2(2, W0, Wtmp0, K1); R( d, e, a, b, c, F4, 67 ); W_PRECALC_00_15_3(3, W0, Wtmp0); R( c, d, e, a, b, F4, 68 ); W_PRECALC_00_15_0(4, W7, Wtmp0); R( b, c, d, e, a, F4, 69 ); W_PRECALC_00_15_1(5, W7, Wtmp0); R( a, b, c, d, e, F4, 70 ); W_PRECALC_00_15_2(6, W7, Wtmp0, K1); R( e, a, b, c, d, F4, 71 ); W_PRECALC_00_15_3(7, W7, Wtmp0); R( d, e, a, b, c, F4, 72 ); W_PRECALC_00_15_0(8, W6, Wtmp0); R( c, d, e, a, b, F4, 73 ); W_PRECALC_00_15_1(9, W6, Wtmp0); R( b, c, d, e, a, F4, 74 ); W_PRECALC_00_15_2(10, W6, Wtmp0, K1); R( a, b, c, d, e, F4, 75 ); W_PRECALC_00_15_3(11, W6, Wtmp0); R( e, a, b, c, d, F4, 76 ); W_PRECALC_00_15_0(12, W5, Wtmp0); R( d, e, a, b, c, F4, 77 ); W_PRECALC_00_15_1(13, W5, Wtmp0); R( c, d, e, a, b, F4, 78 ); addl state_h0(RSTATE), a; W_PRECALC_00_15_2(14, W5, Wtmp0, K1); R( b, c, d, e, a, F4, 79 ); W_PRECALC_00_15_3(15, W5, Wtmp0); addl ne, a; xorl ne, ne; /* Update the chaining variables. */ addl state_h3(RSTATE), d; addl state_h2(RSTATE), c; addl state_h1(RSTATE), b; addl state_h4(RSTATE), e; movl d, state_h3(RSTATE); movl c, state_h2(RSTATE); movl b, state_h1(RSTATE); movl a, state_h0(RSTATE); movl e, state_h4(RSTATE); jmp .Loop; .align 16 .Lend: vzeroall; /* Transform 64-79 + burn stack */ R( b, c, d, e, a, F4, 64 ); R( a, b, c, d, e, F4, 65 ); R( e, a, b, c, d, F4, 66 ); R( d, e, a, b, c, F4, 67 ); R( c, d, e, a, b, F4, 68 ); R( b, c, d, e, a, F4, 69 ); R( a, b, c, d, e, F4, 70 ); R( e, a, b, c, d, F4, 71 ); R( d, e, a, b, c, F4, 72 ); R( c, d, e, a, b, F4, 73 ); R( b, c, d, e, a, F4, 74 ); R( a, b, c, d, e, F4, 75 ); R( e, a, b, c, d, F4, 76 ); vmovdqa %xmm0, (0*16)(%rsp); R( d, e, a, b, c, F4, 77 ); vmovdqa %xmm0, (1*16)(%rsp); R( c, d, e, a, b, F4, 78 ); vmovdqa %xmm0, (2*16)(%rsp); addl state_h0(RSTATE), a; R( b, c, d, e, a, F4, 79 ); addl ne, a; xorl ne, ne; /* 16*4/16-1 = 3 */ vmovdqa %xmm0, (3*16)(%rsp); /* Update the chaining variables. 
*/ addl state_h3(RSTATE), d; addl state_h2(RSTATE), c; addl state_h1(RSTATE), b; addl state_h4(RSTATE), e; movl d, state_h3(RSTATE); movl c, state_h2(RSTATE); movl b, state_h1(RSTATE); movl a, state_h0(RSTATE); movl e, state_h4(RSTATE); movq ROLDSTACK, %rsp; + CFI_REGISTER(ROLDSTACK, %rsp); + CFI_DEF_CFA_REGISTER(%rsp); popq %r12; + CFI_POP(%r12); popq %rbp; + CFI_POP(%rbp); popq %rbx; + CFI_POP(%rbx); /* stack already burned */ xorl %eax, %eax; .Lret: ret; + CFI_ENDPROC(); ELF(.size _gcry_sha1_transform_amd64_avx_bmi2, .-_gcry_sha1_transform_amd64_avx_bmi2;) #endif #endif diff --git a/cipher/sha1-avx2-bmi2-amd64.S b/cipher/sha1-avx2-bmi2-amd64.S index 2a2f21a5..93863230 100644 --- a/cipher/sha1-avx2-bmi2-amd64.S +++ b/cipher/sha1-avx2-bmi2-amd64.S @@ -1,573 +1,573 @@ /* sha1-avx2-bmi2-amd64.S - Intel AVX2/BMI2 accelerated SHA-1 transform function * Copyright (C) 2019 Jussi Kivilinna * * Based on sha1.c: * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ /* * Intel SSSE3 accelerated SHA-1 implementation based on white paper: * "Improving the Performance of the Secure Hash Algorithm (SHA-1)" * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1 */ #ifdef __x86_64__ #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_GCC_INLINE_ASM_BMI2) && defined(HAVE_GCC_INLINE_ASM_AVX) && \ defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(USE_SHA1) -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif - - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "asm-common-amd64.h" /* Context structure */ #define state_h0 0 #define state_h1 4 #define state_h2 8 #define state_h3 12 #define state_h4 16 /* Constants */ #define WK_STACK_WORDS (80 * 2) .text .align 16 .Lbswap_shufb_ctl: .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f .LK1: .long 0x5A827999 .LK2: .long 0x6ED9EBA1 .LK3: .long 0x8F1BBCDC .LK4: .long 0xCA62C1D6 /* Register macros */ #define RSTATE %r8 #define RDATA %r9 #define ROLDSTACK %r10 #define RNBLKS %r11 #define a %eax #define b %ebx #define c %ecx #define d %edx #define e %edi #define ne %r12d #define RT0 %esi #define RT1 %ebp #define Wtmp0 %ymm0 #define Wtmp1 %ymm1 #define Wtmp0x %xmm0 #define Wtmp1x %xmm1 #define W0 %ymm2 #define W1 %ymm3 #define W2 %ymm4 #define W3 %ymm5 #define W4 %ymm6 #define W5 %ymm7 #define W6 %ymm8 #define W7 %ymm9 #define BSWAP_REG %ymm10 #define K1 %ymm11 #define K2 %ymm12 #define K3 %ymm13 #define K4 %ymm14 /* Round function macros. 
*/ #define WK(i,block) ((block) * 16 + ((i) / 4) * 32 + ((i) % 4) * 4)(%rsp) #define PRE_WK(i) ((i) * 4 * 2)(%rsp) #define R_F1(a,b,c,d,e,i,block) \ movl c, RT0; \ andn d, b, RT1; \ addl WK(i,block), e; \ andl b, RT0; \ leal (a,ne), a; \ rorxl $2, b, b; \ addl RT1, e; \ rorxl $27, a, ne; \ addl RT0, e; #define R_F2(a,b,c,d,e,i,block) \ addl WK(i,block), e; \ movl c, RT0; \ xorl b, RT0; \ leal (a,ne), a; \ rorxl $2, b, b; \ xorl d, RT0; \ addl RT0, e; \ rorxl $27, a, ne; #define R_F3(a,b,c,d,e,i,block) \ movl c, RT0; \ addl WK(i,block), e; \ movl b, RT1; \ xorl b, RT0; \ leal (a,ne), a; \ rorxl $2, b, b; \ andl c, RT1; \ addl RT1, e; \ andl d, RT0; \ rorxl $27, a, ne; \ addl RT0, e; #define R_F4(a,b,c,d,e,i,block) R_F2(a,b,c,d,e,i,block) #define R(a,b,c,d,e,f,i,block) \ R_##f(a,b,c,d,e,i,block) /* Input expansion macros. */ #define W_PRECALC_00_15_0(i, W, tmp0) \ vmovdqu (4*(i))(RDATA), tmp0##x; \ vinserti128 $1, (4*(i) + 64)(RDATA), tmp0, tmp0; #define W_PRECALC_00_15_1(i, W, tmp0) \ vpshufb BSWAP_REG, tmp0, W; #define W_PRECALC_00_15_2(i, W, tmp0, K) \ vpaddd K, W, tmp0; #define W_PRECALC_00_15_3(i, W, tmp0) \ vmovdqa tmp0, PRE_WK((i)&~3); #define W_PRECALC_16_31_0(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ vpalignr $8, W_m16, W_m12, W; \ vpsrldq $4, W_m04, tmp0; \ vpxor W_m08, W, W; #define W_PRECALC_16_31_1(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ vpxor W_m16, tmp0, tmp0; \ vpxor tmp0, W, W; \ vpslld $1, W, tmp0; \ vpslldq $12, W, tmp1; \ vpsrld $31, W, W; #define W_PRECALC_16_31_2(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ vpor W, tmp0, tmp0; \ vpsrld $30, tmp1, W; \ vpslld $2, tmp1, tmp1; #define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1, K) \ vpxor W, tmp0, tmp0; \ vpxor tmp1, tmp0, W; \ vpaddd K, W, tmp0; \ vmovdqa tmp0, PRE_WK((i)&~3); #define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ vpxor W_m28, W, W; \ vpalignr $8, W_m08, W_m04, tmp0; #define W_PRECALC_32_79_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ vpxor W_m16, W, W; \ vpxor tmp0, W, W; #define W_PRECALC_32_79_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ vpsrld $30, W, tmp0; \ vpslld $2, W, W; #define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0, K) \ vpor W, tmp0, W; \ vpaddd K, W, tmp0; \ vmovdqa tmp0, PRE_WK((i)&~3); /* * Transform 2*nblks*64 bytes (2*nblks*16 32-bit words) at DATA. * * unsigned int * _gcry_sha1_transform_amd64_avx2_bmi2 (void *ctx, const unsigned char *data, * size_t nblks) */ .globl _gcry_sha1_transform_amd64_avx2_bmi2 ELF(.type _gcry_sha1_transform_amd64_avx2_bmi2,@function) .align 16 _gcry_sha1_transform_amd64_avx2_bmi2: /* input: * %rdi: ctx, CTX * %rsi: data (64*nblks bytes) * %rdx: nblks (multiple of 2, larger than 0) */ + CFI_STARTPROC(); vzeroupper; movq %rdx, RNBLKS; movq %rdi, RSTATE; movq %rsi, RDATA; pushq %rbx; + CFI_PUSH(%rbx); pushq %rbp; + CFI_PUSH(%rbp); pushq %r12; + CFI_PUSH(%r12); movq %rsp, ROLDSTACK; + CFI_DEF_CFA_REGISTER(ROLDSTACK); subq $(WK_STACK_WORDS*4), %rsp; andq $(~63), %rsp; /* Get the values of the chaining variables. 
*/ movl state_h0(RSTATE), a; movl state_h1(RSTATE), b; movl state_h2(RSTATE), c; movl state_h3(RSTATE), d; movl state_h4(RSTATE), e; xorl ne, ne; - vbroadcasti128 .Lbswap_shufb_ctl RIP, BSWAP_REG; - vpbroadcastd .LK1 RIP, K1; - vpbroadcastd .LK2 RIP, K2; - vpbroadcastd .LK3 RIP, K3; - vpbroadcastd .LK4 RIP, K4; + vbroadcasti128 .Lbswap_shufb_ctl rRIP, BSWAP_REG; + vpbroadcastd .LK1 rRIP, K1; + vpbroadcastd .LK2 rRIP, K2; + vpbroadcastd .LK3 rRIP, K3; + vpbroadcastd .LK4 rRIP, K4; /* Precalc 0-31 for block 1 & 2. */ W_PRECALC_00_15_0(0, W0, Wtmp0); W_PRECALC_00_15_1(1, W0, Wtmp0); W_PRECALC_00_15_2(2, W0, Wtmp0, K1); W_PRECALC_00_15_3(3, W0, Wtmp0); W_PRECALC_00_15_0(4, W7, Wtmp0); W_PRECALC_00_15_1(5, W7, Wtmp0); W_PRECALC_00_15_2(6, W7, Wtmp0, K1); W_PRECALC_00_15_3(7, W7, Wtmp0); W_PRECALC_00_15_0(8, W6, Wtmp0); W_PRECALC_00_15_1(9, W6, Wtmp0); W_PRECALC_00_15_2(10, W6, Wtmp0, K1); W_PRECALC_00_15_3(11, W6, Wtmp0); W_PRECALC_00_15_0(12, W5, Wtmp0); W_PRECALC_00_15_1(13, W5, Wtmp0); W_PRECALC_00_15_2(14, W5, Wtmp0, K1); W_PRECALC_00_15_3(15, W5, Wtmp0); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1, K1); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1, K2); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1, K2); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1, K2); .align 8 .Loop: addq $(2 * 64), RDATA; /* Transform 0-15 for block 1 + Precalc 32-47 for block 1 & 2. 
*/ R( a, b, c, d, e, F1, 0, 0 ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( e, a, b, c, d, F1, 1, 0 ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( d, e, a, b, c, F1, 2, 0 ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( c, d, e, a, b, F1, 3, 0 ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0, K2); R( b, c, d, e, a, F1, 4, 0 ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( a, b, c, d, e, F1, 5, 0 ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( e, a, b, c, d, F1, 6, 0 ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( d, e, a, b, c, F1, 7, 0 ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0, K2); R( c, d, e, a, b, F1, 8, 0 ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( b, c, d, e, a, F1, 9, 0 ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( a, b, c, d, e, F1, 10, 0 ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( e, a, b, c, d, F1, 11, 0 ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0, K3); R( d, e, a, b, c, F1, 12, 0 ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( c, d, e, a, b, F1, 13, 0 ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( b, c, d, e, a, F1, 14, 0 ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( a, b, c, d, e, F1, 15, 0 ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0, K3); /* Transform 16-47 for block 1 + Precalc 48-79 for block 1 & 2. */ R( e, a, b, c, d, F1, 16, 0 ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( d, e, a, b, c, F1, 17, 0 ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( c, d, e, a, b, F1, 18, 0 ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( b, c, d, e, a, F1, 19, 0 ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0, K3); R( a, b, c, d, e, F2, 20, 0 ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( e, a, b, c, d, F2, 21, 0 ); W_PRECALC_32_79_1(53, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( d, e, a, b, c, F2, 22, 0 ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( c, d, e, a, b, F2, 23, 0 ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0, K3); R( b, c, d, e, a, F2, 24, 0 ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( a, b, c, d, e, F2, 25, 0 ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( e, a, b, c, d, F2, 26, 0 ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( d, e, a, b, c, F2, 27, 0 ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0, K3); R( c, d, e, a, b, F2, 28, 0 ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( b, c, d, e, a, F2, 29, 0 ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( a, b, c, d, e, F2, 30, 0 ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( e, a, b, c, d, F2, 31, 0 ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0, K4); R( d, e, a, b, c, F2, 32, 0 ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( c, d, e, a, b, F2, 33, 0 ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( b, c, d, e, a, F2, 34, 0 ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( a, b, c, d, e, F2, 35, 0 ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0, K4); R( e, a, b, c, d, F2, 36, 0 ); W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( d, e, a, b, c, 
F2, 37, 0 ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( c, d, e, a, b, F2, 38, 0 ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( b, c, d, e, a, F2, 39, 0 ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0, K4); R( a, b, c, d, e, F3, 40, 0 ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( e, a, b, c, d, F3, 41, 0 ); W_PRECALC_32_79_1(73, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( d, e, a, b, c, F3, 42, 0 ); W_PRECALC_32_79_2(74, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( c, d, e, a, b, F3, 43, 0 ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0, K4); R( b, c, d, e, a, F3, 44, 0 ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( a, b, c, d, e, F3, 45, 0 ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( e, a, b, c, d, F3, 46, 0 ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( d, e, a, b, c, F3, 47, 0 ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0, K4); /* Transform 48-79 for block 1. */ R( c, d, e, a, b, F3, 48, 0 ); R( b, c, d, e, a, F3, 49, 0 ); R( a, b, c, d, e, F3, 50, 0 ); R( e, a, b, c, d, F3, 51, 0 ); R( d, e, a, b, c, F3, 52, 0 ); R( c, d, e, a, b, F3, 53, 0 ); R( b, c, d, e, a, F3, 54, 0 ); R( a, b, c, d, e, F3, 55, 0 ); R( e, a, b, c, d, F3, 56, 0 ); R( d, e, a, b, c, F3, 57, 0 ); R( c, d, e, a, b, F3, 58, 0 ); R( b, c, d, e, a, F3, 59, 0 ); R( a, b, c, d, e, F4, 60, 0 ); R( e, a, b, c, d, F4, 61, 0 ); R( d, e, a, b, c, F4, 62, 0 ); R( c, d, e, a, b, F4, 63, 0 ); R( b, c, d, e, a, F4, 64, 0 ); R( a, b, c, d, e, F4, 65, 0 ); R( e, a, b, c, d, F4, 66, 0 ); R( d, e, a, b, c, F4, 67, 0 ); R( c, d, e, a, b, F4, 68, 0 ); R( b, c, d, e, a, F4, 69, 0 ); R( a, b, c, d, e, F4, 70, 0 ); R( e, a, b, c, d, F4, 71, 0 ); R( d, e, a, b, c, F4, 72, 0 ); R( c, d, e, a, b, F4, 73, 0 ); R( b, c, d, e, a, F4, 74, 0 ); R( a, b, c, d, e, F4, 75, 0 ); R( e, a, b, c, d, F4, 76, 0 ); R( d, e, a, b, c, F4, 77, 0 ); R( c, d, e, a, b, F4, 78, 0 ); addl state_h0(RSTATE), a; R( b, c, d, e, a, F4, 79, 0 ); addl ne, a; xorl ne, ne; /* Update the chaining variables. */ addl state_h3(RSTATE), d; addl state_h2(RSTATE), c; addl state_h1(RSTATE), b; addl state_h4(RSTATE), e; movl d, state_h3(RSTATE); movl c, state_h2(RSTATE); movl b, state_h1(RSTATE); movl a, state_h0(RSTATE); movl e, state_h4(RSTATE); /* Transform 0-47 for block 2. 
*/ R( a, b, c, d, e, F1, 0, 1 ); R( e, a, b, c, d, F1, 1, 1 ); R( d, e, a, b, c, F1, 2, 1 ); R( c, d, e, a, b, F1, 3, 1 ); R( b, c, d, e, a, F1, 4, 1 ); R( a, b, c, d, e, F1, 5, 1 ); R( e, a, b, c, d, F1, 6, 1 ); R( d, e, a, b, c, F1, 7, 1 ); R( c, d, e, a, b, F1, 8, 1 ); R( b, c, d, e, a, F1, 9, 1 ); R( a, b, c, d, e, F1, 10, 1 ); R( e, a, b, c, d, F1, 11, 1 ); R( d, e, a, b, c, F1, 12, 1 ); R( c, d, e, a, b, F1, 13, 1 ); R( b, c, d, e, a, F1, 14, 1 ); R( a, b, c, d, e, F1, 15, 1 ); R( e, a, b, c, d, F1, 16, 1 ); R( d, e, a, b, c, F1, 17, 1 ); R( c, d, e, a, b, F1, 18, 1 ); R( b, c, d, e, a, F1, 19, 1 ); R( a, b, c, d, e, F2, 20, 1 ); R( e, a, b, c, d, F2, 21, 1 ); R( d, e, a, b, c, F2, 22, 1 ); R( c, d, e, a, b, F2, 23, 1 ); R( b, c, d, e, a, F2, 24, 1 ); R( a, b, c, d, e, F2, 25, 1 ); R( e, a, b, c, d, F2, 26, 1 ); R( d, e, a, b, c, F2, 27, 1 ); R( c, d, e, a, b, F2, 28, 1 ); R( b, c, d, e, a, F2, 29, 1 ); R( a, b, c, d, e, F2, 30, 1 ); R( e, a, b, c, d, F2, 31, 1 ); R( d, e, a, b, c, F2, 32, 1 ); R( c, d, e, a, b, F2, 33, 1 ); R( b, c, d, e, a, F2, 34, 1 ); R( a, b, c, d, e, F2, 35, 1 ); R( e, a, b, c, d, F2, 36, 1 ); R( d, e, a, b, c, F2, 37, 1 ); R( c, d, e, a, b, F2, 38, 1 ); R( b, c, d, e, a, F2, 39, 1 ); R( a, b, c, d, e, F3, 40, 1 ); R( e, a, b, c, d, F3, 41, 1 ); R( d, e, a, b, c, F3, 42, 1 ); R( c, d, e, a, b, F3, 43, 1 ); R( b, c, d, e, a, F3, 44, 1 ); R( a, b, c, d, e, F3, 45, 1 ); R( e, a, b, c, d, F3, 46, 1 ); R( d, e, a, b, c, F3, 47, 1 ); addq $-2, RNBLKS; jz .Lend; /* Transform 48-79 for block 2 + Precalc 0-31 for next two blocks. */ R( c, d, e, a, b, F3, 48, 1 ); W_PRECALC_00_15_0(0, W0, Wtmp0); R( b, c, d, e, a, F3, 49, 1 ); W_PRECALC_00_15_1(1, W0, Wtmp0); R( a, b, c, d, e, F3, 50, 1 ); W_PRECALC_00_15_2(2, W0, Wtmp0, K1); R( e, a, b, c, d, F3, 51, 1 ); W_PRECALC_00_15_3(3, W0, Wtmp0); R( d, e, a, b, c, F3, 52, 1 ); W_PRECALC_00_15_0(4, W7, Wtmp0); R( c, d, e, a, b, F3, 53, 1 ); W_PRECALC_00_15_1(5, W7, Wtmp0); R( b, c, d, e, a, F3, 54, 1 ); W_PRECALC_00_15_2(6, W7, Wtmp0, K1); R( a, b, c, d, e, F3, 55, 1 ); W_PRECALC_00_15_3(7, W7, Wtmp0); R( e, a, b, c, d, F3, 56, 1 ); W_PRECALC_00_15_0(8, W6, Wtmp0); R( d, e, a, b, c, F3, 57, 1 ); W_PRECALC_00_15_1(9, W6, Wtmp0); R( c, d, e, a, b, F3, 58, 1 ); W_PRECALC_00_15_2(10, W6, Wtmp0, K1); R( b, c, d, e, a, F3, 59, 1 ); W_PRECALC_00_15_3(11, W6, Wtmp0); R( a, b, c, d, e, F4, 60, 1 ); W_PRECALC_00_15_0(12, W5, Wtmp0); R( e, a, b, c, d, F4, 61, 1 ); W_PRECALC_00_15_1(13, W5, Wtmp0); R( d, e, a, b, c, F4, 62, 1 ); W_PRECALC_00_15_2(14, W5, Wtmp0, K1); R( c, d, e, a, b, F4, 63, 1 ); W_PRECALC_00_15_3(15, W5, Wtmp0); R( b, c, d, e, a, F4, 64, 1 ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( a, b, c, d, e, F4, 65, 1 ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( e, a, b, c, d, F4, 66, 1 ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( d, e, a, b, c, F4, 67, 1 ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1, K1); R( c, d, e, a, b, F4, 68, 1 ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( b, c, d, e, a, F4, 69, 1 ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( a, b, c, d, e, F4, 70, 1 ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( e, a, b, c, d, F4, 71, 1 ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1, K2); R( d, e, a, b, c, F4, 72, 1 ); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( c, d, e, a, b, F4, 73, 1 ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( b, c, d, e, a, F4, 74, 1 ); 
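Interleaved with the W_PRECALC calls, each R(...) invocation above performs one SHA-1 round through the R_F1/R_F2/R_F3 BMI2 macros: andn supplies the ~b & d term of Ch, rorx $27 keeps rol(a,5) one round ahead in `ne`, and rorx $2 rotates b by 30. As a reference, here is a minimal scalar C sketch of the round those macros implement (sha1_f and sha1_round are illustrative names, not libgcrypt functions):

#include <stdint.h>

static inline uint32_t rol32(uint32_t x, int n)
{
  return (x << n) | (x >> (32 - n));
}

/* f() for the four 20-round groups: Ch, Parity, Maj, Parity. */
static inline uint32_t sha1_f(int t, uint32_t b, uint32_t c, uint32_t d)
{
  if (t < 20) return d ^ (b & (c ^ d));        /* Ch; same value as (b & c) + (~b & d) */
  if (t < 40) return b ^ c ^ d;                /* Parity */
  if (t < 60) return (b & c) | (d & (b | c));  /* Maj */
  return b ^ c ^ d;                            /* Parity */
}

/* One round; wk is the precomputed W[t] + K value that the assembly
 * reads from its WK(i,block) stack slot. */
static void sha1_round(int t, uint32_t *a, uint32_t *b, uint32_t *c,
                       uint32_t *d, uint32_t *e, uint32_t wk)
{
  uint32_t tmp = rol32(*a, 5) + sha1_f(t, *b, *c, *d) + *e + wk;
  *e = *d;
  *d = *c;
  *c = rol32(*b, 30);
  *b = *a;
  *a = tmp;
}

The assembly avoids the explicit a..e rotation by renaming the register arguments in each successive R(...) line; the add-based Ch used by R_F1 is valid because (b & c) and (~b & d) can never have a set bit in the same position.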
W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( a, b, c, d, e, F4, 75, 1 ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1, K2); R( e, a, b, c, d, F4, 76, 1 ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); R( d, e, a, b, c, F4, 77, 1 ); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); R( c, d, e, a, b, F4, 78, 1 ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); addl state_h0(RSTATE), a; W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1, K2); R( b, c, d, e, a, F4, 79, 1 ); addl ne, a; xorl ne, ne; /* Update the chaining variables. */ addl state_h3(RSTATE), d; addl state_h2(RSTATE), c; addl state_h1(RSTATE), b; addl state_h4(RSTATE), e; movl d, state_h3(RSTATE); movl c, state_h2(RSTATE); movl b, state_h1(RSTATE); movl a, state_h0(RSTATE); movl e, state_h4(RSTATE); jmp .Loop; .align 16 .Lend: vzeroall; /* Transform 48-79 for block 2 + burn stack */ R( c, d, e, a, b, F3, 48, 1 ); R( b, c, d, e, a, F3, 49, 1 ); R( a, b, c, d, e, F3, 50, 1 ); R( e, a, b, c, d, F3, 51, 1 ); R( d, e, a, b, c, F3, 52, 1 ); R( c, d, e, a, b, F3, 53, 1 ); R( b, c, d, e, a, F3, 54, 1 ); R( a, b, c, d, e, F3, 55, 1 ); R( e, a, b, c, d, F3, 56, 1 ); R( d, e, a, b, c, F3, 57, 1 ); R( c, d, e, a, b, F3, 58, 1 ); R( b, c, d, e, a, F3, 59, 1 ); R( a, b, c, d, e, F4, 60, 1 ); vmovdqa %ymm0, (0*32)(%rsp); R( e, a, b, c, d, F4, 61, 1 ); vmovdqa %ymm0, (1*32)(%rsp); R( d, e, a, b, c, F4, 62, 1 ); vmovdqa %ymm0, (2*32)(%rsp); R( c, d, e, a, b, F4, 63, 1 ); vmovdqa %ymm0, (3*32)(%rsp); R( b, c, d, e, a, F4, 64, 1 ); vmovdqa %ymm0, (4*32)(%rsp); R( a, b, c, d, e, F4, 65, 1 ); vmovdqa %ymm0, (5*32)(%rsp); R( e, a, b, c, d, F4, 66, 1 ); vmovdqa %ymm0, (6*32)(%rsp); R( d, e, a, b, c, F4, 67, 1 ); vmovdqa %ymm0, (7*32)(%rsp); R( c, d, e, a, b, F4, 68, 1 ); vmovdqa %ymm0, (8*32)(%rsp); R( b, c, d, e, a, F4, 69, 1 ); vmovdqa %ymm0, (9*32)(%rsp); R( a, b, c, d, e, F4, 70, 1 ); vmovdqa %ymm0, (10*32)(%rsp); R( e, a, b, c, d, F4, 71, 1 ); vmovdqa %ymm0, (11*32)(%rsp); R( d, e, a, b, c, F4, 72, 1 ); vmovdqa %ymm0, (12*32)(%rsp); R( c, d, e, a, b, F4, 73, 1 ); vmovdqa %ymm0, (13*32)(%rsp); R( b, c, d, e, a, F4, 74, 1 ); vmovdqa %ymm0, (14*32)(%rsp); R( a, b, c, d, e, F4, 75, 1 ); vmovdqa %ymm0, (15*32)(%rsp); R( e, a, b, c, d, F4, 76, 1 ); vmovdqa %ymm0, (16*32)(%rsp); R( d, e, a, b, c, F4, 77, 1 ); vmovdqa %ymm0, (17*32)(%rsp); R( c, d, e, a, b, F4, 78, 1 ); vmovdqa %ymm0, (18*32)(%rsp); addl state_h0(RSTATE), a; R( b, c, d, e, a, F4, 79, 1 ); addl ne, a; xorl ne, ne; /* WK_STACK_WORDS*4/32-1 = 19 */ vmovdqa %ymm0, (19*32)(%rsp); /* Update the chaining variables. 
*/ addl state_h3(RSTATE), d; addl state_h2(RSTATE), c; addl state_h1(RSTATE), b; addl state_h4(RSTATE), e; movl d, state_h3(RSTATE); movl c, state_h2(RSTATE); movl b, state_h1(RSTATE); movl a, state_h0(RSTATE); movl e, state_h4(RSTATE); movq ROLDSTACK, %rsp; + CFI_REGISTER(ROLDSTACK, %rsp); + CFI_DEF_CFA_REGISTER(%rsp); popq %r12; + CFI_POP(%r12); popq %rbp; + CFI_POP(%rbp); popq %rbx; + CFI_POP(%rbx); /* stack already burned */ xorl %eax, %eax; ret; + CFI_ENDPROC(); ELF(.size _gcry_sha1_transform_amd64_avx2_bmi2, .-_gcry_sha1_transform_amd64_avx2_bmi2;) #endif #endif diff --git a/cipher/sha1-ssse3-amd64.S b/cipher/sha1-ssse3-amd64.S index fff14034..7e32b0f4 100644 --- a/cipher/sha1-ssse3-amd64.S +++ b/cipher/sha1-ssse3-amd64.S @@ -1,439 +1,437 @@ /* sha1-ssse3-amd64.S - Intel SSSE3 accelerated SHA-1 transform function * Copyright (C) 2013 Jussi Kivilinna * * Based on sha1.c: * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ /* * Intel SSSE3 accelerated SHA-1 implementation based on white paper: * "Improving the Performance of the Secure Hash Algorithm (SHA-1)" * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1 */ #ifdef __x86_64__ #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA1) -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif - - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "asm-common-amd64.h" /* Context structure */ #define state_h0 0 #define state_h1 4 #define state_h2 8 #define state_h3 12 #define state_h4 16 /* Constants */ .text #define K1 0x5A827999 #define K2 0x6ED9EBA1 #define K3 0x8F1BBCDC #define K4 0xCA62C1D6 .align 16 .LK_XMM: .LK1: .long K1, K1, K1, K1 .LK2: .long K2, K2, K2, K2 .LK3: .long K3, K3, K3, K3 .LK4: .long K4, K4, K4, K4 .Lbswap_shufb_ctl: .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f /* Register macros */ #define RSTATE %r8 #define RDATA %r9 #define ROLDSTACK %r10 #define RNBLKS %r11 #define a %eax #define b %ebx #define c %ecx #define d %edx #define e %edi #define RT0 %esi #define RT1 %ebp #define Wtmp0 %xmm0 #define Wtmp1 %xmm1 #define W0 %xmm2 #define W1 %xmm3 #define W2 %xmm4 #define W3 %xmm5 #define W4 %xmm6 #define W5 %xmm7 #define W6 %xmm8 #define W7 %xmm9 #define BSWAP_REG %xmm10 /* Round function macros. 
*/ #define WK(i) (((i) & 15) * 4)(%rsp) #define R_F1(a,b,c,d,e,i) \ movl c, RT0; \ addl WK(i), e; \ xorl d, RT0; \ movl a, RT1; \ andl b, RT0; \ roll $30, b; \ xorl d, RT0; \ leal (RT0,e), e; \ roll $5, RT1; \ addl RT1, e; #define R_F2(a,b,c,d,e,i) \ movl c, RT0; \ addl WK(i), e; \ xorl b, RT0; \ roll $30, b; \ xorl d, RT0; \ movl a, RT1; \ leal (RT0,e), e; \ roll $5, RT1; \ addl RT1, e; #define R_F3(a,b,c,d,e,i) \ movl c, RT0; \ movl b, RT1; \ xorl b, RT0; \ andl c, RT1; \ andl d, RT0; \ addl RT1, e; \ addl WK(i), e; \ roll $30, b; \ movl a, RT1; \ leal (RT0,e), e; \ roll $5, RT1; \ addl RT1, e; #define R_F4(a,b,c,d,e,i) R_F2(a,b,c,d,e,i) #define R(a,b,c,d,e,f,i) \ R_##f(a,b,c,d,e,i) /* Input expansion macros. */ #define W_PRECALC_00_15_0(i, W, tmp0) \ movdqu (4*(i))(RDATA), tmp0; #define W_PRECALC_00_15_1(i, W, tmp0) \ pshufb BSWAP_REG, tmp0; \ movdqa tmp0, W; #define W_PRECALC_00_15_2(i, W, tmp0) \ - paddd (.LK_XMM + ((i)/20)*16) RIP, tmp0; + paddd (.LK_XMM + ((i)/20)*16) rRIP, tmp0; #define W_PRECALC_00_15_3(i, W, tmp0) \ movdqa tmp0, WK(i&~3); #define W_PRECALC_16_31_0(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ movdqa W_m12, W; \ palignr $8, W_m16, W; \ movdqa W_m04, tmp0; \ psrldq $4, tmp0; \ pxor W_m08, W; #define W_PRECALC_16_31_1(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ pxor W_m16, tmp0; \ pxor tmp0, W; \ movdqa W, tmp1; \ movdqa W, tmp0; \ pslldq $12, tmp1; #define W_PRECALC_16_31_2(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ psrld $31, W; \ pslld $1, tmp0; \ por W, tmp0; \ movdqa tmp1, W; \ psrld $30, tmp1; \ pslld $2, W; #define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ pxor W, tmp0; \ pxor tmp1, tmp0; \ movdqa tmp0, W; \ - paddd (.LK_XMM + ((i)/20)*16) RIP, tmp0; \ + paddd (.LK_XMM + ((i)/20)*16) rRIP, tmp0; \ movdqa tmp0, WK((i)&~3); #define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ movdqa W_m04, tmp0; \ pxor W_m28, W; \ palignr $8, W_m08, tmp0; #define W_PRECALC_32_79_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ pxor W_m16, W; \ pxor tmp0, W; \ movdqa W, tmp0; #define W_PRECALC_32_79_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ psrld $30, W; \ pslld $2, tmp0; \ por W, tmp0; #define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ movdqa tmp0, W; \ - paddd (.LK_XMM + ((i)/20)*16) RIP, tmp0; \ + paddd (.LK_XMM + ((i)/20)*16) rRIP, tmp0; \ movdqa tmp0, WK((i)&~3); #define CLEAR_REG(reg) pxor reg, reg; /* * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA. * * unsigned int * _gcry_sha1_transform_amd64_ssse3 (void *ctx, const unsigned char *data, * size_t nblks) */ .globl _gcry_sha1_transform_amd64_ssse3 ELF(.type _gcry_sha1_transform_amd64_ssse3,@function) .align 16 _gcry_sha1_transform_amd64_ssse3: /* input: * %rdi: ctx, CTX * %rsi: data (64*nblks bytes) * %rdx: nblks */ + CFI_STARTPROC(); xorl %eax, %eax; cmpq $0, %rdx; jz .Lret; movq %rdx, RNBLKS; movq %rdi, RSTATE; movq %rsi, RDATA; pushq %rbx; + CFI_PUSH(%rbx); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, ROLDSTACK; + CFI_DEF_CFA_REGISTER(ROLDSTACK); subq $(16*4), %rsp; andq $(~31), %rsp; /* Get the values of the chaining variables. */ movl state_h0(RSTATE), a; movl state_h1(RSTATE), b; movl state_h2(RSTATE), c; movl state_h3(RSTATE), d; movl state_h4(RSTATE), e; - movdqa .Lbswap_shufb_ctl RIP, BSWAP_REG; + movdqa .Lbswap_shufb_ctl rRIP, BSWAP_REG; /* Precalc 0-15. 
*/ W_PRECALC_00_15_0(0, W0, Wtmp0); W_PRECALC_00_15_1(1, W0, Wtmp0); W_PRECALC_00_15_2(2, W0, Wtmp0); W_PRECALC_00_15_3(3, W0, Wtmp0); W_PRECALC_00_15_0(4, W7, Wtmp0); W_PRECALC_00_15_1(5, W7, Wtmp0); W_PRECALC_00_15_2(6, W7, Wtmp0); W_PRECALC_00_15_3(7, W7, Wtmp0); W_PRECALC_00_15_0(8, W6, Wtmp0); W_PRECALC_00_15_1(9, W6, Wtmp0); W_PRECALC_00_15_2(10, W6, Wtmp0); W_PRECALC_00_15_3(11, W6, Wtmp0); W_PRECALC_00_15_0(12, W5, Wtmp0); W_PRECALC_00_15_1(13, W5, Wtmp0); W_PRECALC_00_15_2(14, W5, Wtmp0); W_PRECALC_00_15_3(15, W5, Wtmp0); .align 8 .Loop: addq $64, RDATA; /* Transform 0-15 + Precalc 16-31. */ R( a, b, c, d, e, F1, 0 ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( e, a, b, c, d, F1, 1 ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( d, e, a, b, c, F1, 2 ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( c, d, e, a, b, F1, 3 ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( b, c, d, e, a, F1, 4 ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( a, b, c, d, e, F1, 5 ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( e, a, b, c, d, F1, 6 ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( d, e, a, b, c, F1, 7 ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( c, d, e, a, b, F1, 8 ); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( b, c, d, e, a, F1, 9 ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( a, b, c, d, e, F1, 10 ); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( e, a, b, c, d, F1, 11 ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( d, e, a, b, c, F1, 12 ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); R( c, d, e, a, b, F1, 13 ); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); R( b, c, d, e, a, F1, 14 ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); R( a, b, c, d, e, F1, 15 ); W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); /* Transform 16-63 + Precalc 32-79. 
*/ R( e, a, b, c, d, F1, 16 ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( d, e, a, b, c, F1, 17 ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( c, d, e, a, b, F1, 18 ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( b, c, d, e, a, F1, 19 ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( a, b, c, d, e, F2, 20 ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( e, a, b, c, d, F2, 21 ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( d, e, a, b, c, F2, 22 ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( c, d, e, a, b, F2, 23 ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( b, c, d, e, a, F2, 24 ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( a, b, c, d, e, F2, 25 ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( e, a, b, c, d, F2, 26 ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( d, e, a, b, c, F2, 27 ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( c, d, e, a, b, F2, 28 ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( b, c, d, e, a, F2, 29 ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( a, b, c, d, e, F2, 30 ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( e, a, b, c, d, F2, 31 ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( d, e, a, b, c, F2, 32 ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( c, d, e, a, b, F2, 33 ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( b, c, d, e, a, F2, 34 ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( a, b, c, d, e, F2, 35 ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( e, a, b, c, d, F2, 36 ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( d, e, a, b, c, F2, 37 ); W_PRECALC_32_79_1(53, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( c, d, e, a, b, F2, 38 ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( b, c, d, e, a, F2, 39 ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( a, b, c, d, e, F3, 40 ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( e, a, b, c, d, F3, 41 ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( d, e, a, b, c, F3, 42 ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( c, d, e, a, b, F3, 43 ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( b, c, d, e, a, F3, 44 ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( a, b, c, d, e, F3, 45 ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( e, a, b, c, d, F3, 46 ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( d, e, a, b, c, F3, 47 ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( c, d, e, a, b, F3, 48 ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( b, c, d, e, a, F3, 49 ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( a, b, c, d, e, F3, 50 ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( e, a, b, c, d, F3, 51 ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( d, e, a, b, c, F3, 52 ); W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( c, d, e, a, b, F3, 53 ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( b, c, d, e, a, F3, 54 ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( a, b, c, d, e, F3, 55 ); 
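The W_PRECALC_32_79_* steps interleaved with the rounds above expand the SHA-1 message schedule four words per XMM register. For t >= 32 they rely on the rewritten recurrence from the Intel white paper referenced in this file's header, whose nearest dependency is W[t-6], so four consecutive words can be produced in one vector pass. A scalar sketch of both forms (illustrative helper names, not libgcrypt code):

#include <stdint.h>

static inline uint32_t rol32(uint32_t x, int n)
{
  return (x << n) | (x >> (32 - n));
}

/* Standard FIPS 180-4 recurrence, valid for t >= 16. */
static uint32_t sha1_w(const uint32_t w[], int t)
{
  return rol32(w[t-3] ^ w[t-8] ^ w[t-14] ^ w[t-16], 1);
}

/* Equivalent form used here for t >= 32: no term closer than W[t-6],
 * which is what lets W_PRECALC_32_79_0..3 compute W[t..t+3] at once. */
static uint32_t sha1_w_unrolled(const uint32_t w[], int t)
{
  return rol32(w[t-6] ^ w[t-16] ^ w[t-28] ^ w[t-32], 2);
}

The _3 step of each group then adds the round constant for the current 20-round block (the (.LK_XMM + ((i)/20)*16) operand) and parks the four W[t]+K values in the 16-entry stack ring addressed by WK(i).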
W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( e, a, b, c, d, F3, 56 ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( d, e, a, b, c, F3, 57 ); W_PRECALC_32_79_1(73, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( c, d, e, a, b, F3, 58 ); W_PRECALC_32_79_2(74, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( b, c, d, e, a, F3, 59 ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( a, b, c, d, e, F4, 60 ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( e, a, b, c, d, F4, 61 ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( d, e, a, b, c, F4, 62 ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( c, d, e, a, b, F4, 63 ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); decq RNBLKS; jz .Lend; /* Transform 64-79 + Precalc 0-15 of next block. */ R( b, c, d, e, a, F4, 64 ); W_PRECALC_00_15_0(0, W0, Wtmp0); R( a, b, c, d, e, F4, 65 ); W_PRECALC_00_15_1(1, W0, Wtmp0); R( e, a, b, c, d, F4, 66 ); W_PRECALC_00_15_2(2, W0, Wtmp0); R( d, e, a, b, c, F4, 67 ); W_PRECALC_00_15_3(3, W0, Wtmp0); R( c, d, e, a, b, F4, 68 ); W_PRECALC_00_15_0(4, W7, Wtmp0); R( b, c, d, e, a, F4, 69 ); W_PRECALC_00_15_1(5, W7, Wtmp0); R( a, b, c, d, e, F4, 70 ); W_PRECALC_00_15_2(6, W7, Wtmp0); R( e, a, b, c, d, F4, 71 ); W_PRECALC_00_15_3(7, W7, Wtmp0); R( d, e, a, b, c, F4, 72 ); W_PRECALC_00_15_0(8, W6, Wtmp0); R( c, d, e, a, b, F4, 73 ); W_PRECALC_00_15_1(9, W6, Wtmp0); R( b, c, d, e, a, F4, 74 ); W_PRECALC_00_15_2(10, W6, Wtmp0); R( a, b, c, d, e, F4, 75 ); W_PRECALC_00_15_3(11, W6, Wtmp0); R( e, a, b, c, d, F4, 76 ); W_PRECALC_00_15_0(12, W5, Wtmp0); R( d, e, a, b, c, F4, 77 ); W_PRECALC_00_15_1(13, W5, Wtmp0); R( c, d, e, a, b, F4, 78 ); addl state_h0(RSTATE), a; W_PRECALC_00_15_2(14, W5, Wtmp0); R( b, c, d, e, a, F4, 79 ); W_PRECALC_00_15_3(15, W5, Wtmp0); /* Update the chaining variables. */ addl state_h3(RSTATE), d; addl state_h2(RSTATE), c; addl state_h1(RSTATE), b; addl state_h4(RSTATE), e; movl d, state_h3(RSTATE); movl c, state_h2(RSTATE); movl b, state_h1(RSTATE); movl a, state_h0(RSTATE); movl e, state_h4(RSTATE); jmp .Loop; .align 16 .Lend: /* Transform 64-79 + Clear XMM registers + Burn stack. */ R( b, c, d, e, a, F4, 64 ); CLEAR_REG(BSWAP_REG); R( a, b, c, d, e, F4, 65 ); CLEAR_REG(Wtmp0); R( e, a, b, c, d, F4, 66 ); CLEAR_REG(Wtmp1); R( d, e, a, b, c, F4, 67 ); CLEAR_REG(W0); R( c, d, e, a, b, F4, 68 ); CLEAR_REG(W1); R( b, c, d, e, a, F4, 69 ); CLEAR_REG(W2); R( a, b, c, d, e, F4, 70 ); CLEAR_REG(W3); R( e, a, b, c, d, F4, 71 ); CLEAR_REG(W4); R( d, e, a, b, c, F4, 72 ); CLEAR_REG(W5); R( c, d, e, a, b, F4, 73 ); CLEAR_REG(W6); R( b, c, d, e, a, F4, 74 ); CLEAR_REG(W7); R( a, b, c, d, e, F4, 75 ); R( e, a, b, c, d, F4, 76 ); movdqa Wtmp0, (0*16)(%rsp); R( d, e, a, b, c, F4, 77 ); movdqa Wtmp0, (1*16)(%rsp); R( c, d, e, a, b, F4, 78 ); movdqa Wtmp0, (2*16)(%rsp); addl state_h0(RSTATE), a; R( b, c, d, e, a, F4, 79 ); /* 16*4/16-1 = 3 */ vmovdqa Wtmp0, (3*16)(%rsp); /* Update the chaining variables. 
*/ addl state_h3(RSTATE), d; addl state_h2(RSTATE), c; addl state_h1(RSTATE), b; addl state_h4(RSTATE), e; movl d, state_h3(RSTATE); movl c, state_h2(RSTATE); movl b, state_h1(RSTATE); movl a, state_h0(RSTATE); movl e, state_h4(RSTATE); movq ROLDSTACK, %rsp; + CFI_REGISTER(ROLDSTACK, %rsp); + CFI_DEF_CFA_REGISTER(%rsp); popq %rbp; + CFI_POP(%rbp); popq %rbx; + CFI_POP(%rbx); /* stack already burned */ xorl %eax, %eax; .Lret: ret; + CFI_ENDPROC(); ELF(.size _gcry_sha1_transform_amd64_ssse3, .-_gcry_sha1_transform_amd64_ssse3;) #endif #endif diff --git a/cipher/sha256-avx-amd64.S b/cipher/sha256-avx-amd64.S index b8b01b15..77143ff0 100644 --- a/cipher/sha256-avx-amd64.S +++ b/cipher/sha256-avx-amd64.S @@ -1,528 +1,532 @@ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright (c) 2012, Intel Corporation ; ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are ; met: ; ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in the ; documentation and/or other materials provided with the ; distribution. ; ; * Neither the name of the Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived from ; this software without specific prior written permission. ; ; ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR ; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; ; This code is described in an Intel White-Paper: ; "Fast SHA-256 Implementations on Intel Architecture Processors" ; ; To find it, surf to http://www.intel.com/p/en_US/embedded ; and search for that title. ; The paper is expected to be released roughly at the end of April, 2012 ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; This code schedules 1 blocks at a time, with 4 lanes per block ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /* * Conversion to GAS assembly and integration to libgcrypt * by Jussi Kivilinna * * Note: Based on the SSSE3 implementation. */ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA256) -#ifdef __PIC__ -# define ADD_RIP +rip -#else -# define ADD_RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif +#include "asm-common-amd64.h" .intel_syntax noprefix #define VMOVDQ vmovdqu /* assume buffers not aligned */ .macro ROR p1 p2 /* shld is faster than ror on Intel Sandybridge */ shld \p1, \p1, (32 - \p2) .endm /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros*/ /* addm [mem], reg * Add reg to mem using reg-mem add and store */ .macro addm p1 p2 add \p2, \p1 mov \p1, \p2 .endm /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ /* COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask * Load xmm with mem and byte swap each dword */ .macro COPY_XMM_AND_BSWAP p1 p2 p3 VMOVDQ \p1, \p2 vpshufb \p1, \p1, \p3 .endm /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ X0 = xmm4 X1 = xmm5 X2 = xmm6 X3 = xmm7 XTMP0 = xmm0 XTMP1 = xmm1 XTMP2 = xmm2 XTMP3 = xmm3 XTMP4 = xmm8 XFER = xmm9 SHUF_00BA = xmm10 /* shuffle xBxA -> 00BA */ SHUF_DC00 = xmm11 /* shuffle xDxC -> DC00 */ BYTE_FLIP_MASK = xmm12 NUM_BLKS = rdx /* 3rd arg */ CTX = rsi /* 2nd arg */ INP = rdi /* 1st arg */ SRND = rdi /* clobbers INP */ c = ecx d = r8d e = edx TBL = rbp a = eax b = ebx f = r9d g = r10d h = r11d y0 = r13d y1 = r14d y2 = r15d #define _INP_END_SIZE 8 #define _INP_SIZE 8 #define _XFER_SIZE 8 #define _XMM_SAVE_SIZE 0 /* STACK_SIZE plus pushes must be an odd multiple of 8 */ #define _ALIGN_SIZE 8 #define _INP_END 0 #define _INP (_INP_END + _INP_END_SIZE) #define _XFER (_INP + _INP_SIZE) #define _XMM_SAVE (_XFER + _XFER_SIZE + _ALIGN_SIZE) #define STACK_SIZE (_XMM_SAVE + _XMM_SAVE_SIZE) /* rotate_Xs * Rotate values of symbols X0...X3 */ .macro rotate_Xs X_ = X0 X0 = X1 X1 = X2 X2 = X3 X3 = X_ .endm /* ROTATE_ARGS * Rotate values of symbols a...h */ .macro ROTATE_ARGS TMP_ = h h = g g = f f = e e = d d = c c = b b = a a = TMP_ .endm .macro FOUR_ROUNDS_AND_SCHED /* compute s0 four at a time and s1 two at a time * compute W[-16] + W[-7] 4 at a time */ mov y0, e /* y0 = e */ ROR y0, (25-11) /* y0 = e >> (25-11) */ mov y1, a /* y1 = a */ vpalignr XTMP0, X3, X2, 4 /* XTMP0 = W[-7] */ ROR y1, (22-13) /* y1 = a >> (22-13) */ xor y0, e /* y0 = e ^ (e >> (25-11)) */ mov y2, f /* y2 = f */ ROR y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ xor y1, a /* y1 = a ^ (a >> (22-13) */ xor y2, g /* y2 = f^g */ vpaddd XTMP0, XTMP0, X0 /* XTMP0 = W[-7] + W[-16] */ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ and y2, e /* y2 = (f^g)&e */ ROR y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ /* compute s0 */ vpalignr XTMP1, X1, X0, 4 /* XTMP1 = W[-15] */ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ ROR y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ xor y2, g /* y2 = CH = ((f^g)&e)^g */ ROR y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ add y2, y0 /* y2 = S1 + CH */ add y2, [rsp + _XFER + 0*4] /* y2 = k + w + S1 + CH */ mov y0, a /* y0 = a */ add h, y2 /* h = h + S1 + CH + k + w */ mov y2, a /* y2 = a */ vpslld XTMP2, XTMP1, (32-7) or y0, c /* y0 = a|c */ add d, h /* d = d + h + S1 + CH + k + w */ and y2, c /* y2 = a&c */ vpsrld XTMP3, XTMP1, 7 and y0, b /* y0 = (a|c)&b */ add h, y1 /* h = h + S1 + CH + k + w + S0 */ vpor XTMP3, XTMP3, XTMP2 /* XTMP1 = W[-15] ror 7 */ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ ROTATE_ARGS mov y0, e /* y0 = e */ mov y1, a /* y1 = a */ ROR y0, (25-11) /* y0 = e >> (25-11) */ xor y0, e /* y0 = e ^ (e >> (25-11)) */ mov y2, f /* y2 = f */ ROR y1, (22-13) /* y1 = a >> (22-13) */ vpslld XTMP2, XTMP1, (32-18) xor y1, a /* y1 = a ^ (a >> (22-13) */ ROR y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ xor y2, g /* y2 = f^g */ vpsrld XTMP4, XTMP1, 18 ROR y1, 
(13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ and y2, e /* y2 = (f^g)&e */ ROR y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ vpxor XTMP4, XTMP4, XTMP3 xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ xor y2, g /* y2 = CH = ((f^g)&e)^g */ vpsrld XTMP1, XTMP1, 3 /* XTMP4 = W[-15] >> 3 */ add y2, y0 /* y2 = S1 + CH */ add y2, [rsp + _XFER + 1*4] /* y2 = k + w + S1 + CH */ ROR y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ vpxor XTMP1, XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */ mov y0, a /* y0 = a */ add h, y2 /* h = h + S1 + CH + k + w */ mov y2, a /* y2 = a */ vpxor XTMP1, XTMP1, XTMP4 /* XTMP1 = s0 */ or y0, c /* y0 = a|c */ add d, h /* d = d + h + S1 + CH + k + w */ and y2, c /* y2 = a&c */ /* compute low s1 */ vpshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */ and y0, b /* y0 = (a|c)&b */ add h, y1 /* h = h + S1 + CH + k + w + S0 */ vpaddd XTMP0, XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ ROTATE_ARGS mov y0, e /* y0 = e */ mov y1, a /* y1 = a */ ROR y0, (25-11) /* y0 = e >> (25-11) */ xor y0, e /* y0 = e ^ (e >> (25-11)) */ ROR y1, (22-13) /* y1 = a >> (22-13) */ mov y2, f /* y2 = f */ xor y1, a /* y1 = a ^ (a >> (22-13) */ ROR y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ vpsrlq XTMP3, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */ xor y2, g /* y2 = f^g */ vpsrlq XTMP4, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ and y2, e /* y2 = (f^g)&e */ vpsrld XTMP2, XTMP2, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */ ROR y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ xor y2, g /* y2 = CH = ((f^g)&e)^g */ ROR y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ vpxor XTMP2, XTMP2, XTMP3 add y2, y0 /* y2 = S1 + CH */ ROR y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ add y2, [rsp + _XFER + 2*4] /* y2 = k + w + S1 + CH */ vpxor XTMP4, XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */ mov y0, a /* y0 = a */ add h, y2 /* h = h + S1 + CH + k + w */ mov y2, a /* y2 = a */ vpshufb XTMP4, XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */ or y0, c /* y0 = a|c */ add d, h /* d = d + h + S1 + CH + k + w */ and y2, c /* y2 = a&c */ vpaddd XTMP0, XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */ and y0, b /* y0 = (a|c)&b */ add h, y1 /* h = h + S1 + CH + k + w + S0 */ /* compute high s1 */ vpshufd XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ ROTATE_ARGS mov y0, e /* y0 = e */ ROR y0, (25-11) /* y0 = e >> (25-11) */ mov y1, a /* y1 = a */ ROR y1, (22-13) /* y1 = a >> (22-13) */ xor y0, e /* y0 = e ^ (e >> (25-11)) */ mov y2, f /* y2 = f */ ROR y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ vpsrlq XTMP3, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */ xor y1, a /* y1 = a ^ (a >> (22-13) */ xor y2, g /* y2 = f^g */ vpsrlq X0, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ and y2, e /* y2 = (f^g)&e */ ROR y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ vpsrld XTMP2, XTMP2, 10 /* X0 = W[-2] >> 10 {DDCC} */ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ ROR y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ xor y2, g /* y2 = CH = ((f^g)&e)^g */ vpxor XTMP2, XTMP2, XTMP3 ROR y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ add y2, y0 /* y2 = S1 + CH */ add y2, [rsp + _XFER + 3*4] /* 
y2 = k + w + S1 + CH */ vpxor X0, X0, XTMP2 /* X0 = s1 {xDxC} */ mov y0, a /* y0 = a */ add h, y2 /* h = h + S1 + CH + k + w */ mov y2, a /* y2 = a */ vpshufb X0, X0, SHUF_DC00 /* X0 = s1 {DC00} */ or y0, c /* y0 = a|c */ add d, h /* d = d + h + S1 + CH + k + w */ and y2, c /* y2 = a&c */ vpaddd X0, X0, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */ and y0, b /* y0 = (a|c)&b */ add h, y1 /* h = h + S1 + CH + k + w + S0 */ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ ROTATE_ARGS rotate_Xs .endm /* input is [rsp + _XFER + %1 * 4] */ .macro DO_ROUND i1 mov y0, e /* y0 = e */ ROR y0, (25-11) /* y0 = e >> (25-11) */ mov y1, a /* y1 = a */ xor y0, e /* y0 = e ^ (e >> (25-11)) */ ROR y1, (22-13) /* y1 = a >> (22-13) */ mov y2, f /* y2 = f */ xor y1, a /* y1 = a ^ (a >> (22-13) */ ROR y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ xor y2, g /* y2 = f^g */ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ ROR y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ and y2, e /* y2 = (f^g)&e */ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ ROR y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ xor y2, g /* y2 = CH = ((f^g)&e)^g */ add y2, y0 /* y2 = S1 + CH */ ROR y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ add y2, [rsp + _XFER + \i1 * 4] /* y2 = k + w + S1 + CH */ mov y0, a /* y0 = a */ add h, y2 /* h = h + S1 + CH + k + w */ mov y2, a /* y2 = a */ or y0, c /* y0 = a|c */ add d, h /* d = d + h + S1 + CH + k + w */ and y2, c /* y2 = a&c */ and y0, b /* y0 = (a|c)&b */ add h, y1 /* h = h + S1 + CH + k + w + S0 */ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ ROTATE_ARGS .endm /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; void sha256_avx(void *input_data, UINT32 digest[8], UINT64 num_blks) ;; arg 1 : pointer to input data ;; arg 2 : pointer to digest ;; arg 3 : Num blocks */ .text .globl _gcry_sha256_transform_amd64_avx ELF(.type _gcry_sha256_transform_amd64_avx,@function;) .align 16 _gcry_sha256_transform_amd64_avx: + CFI_STARTPROC() vzeroupper push rbx + CFI_PUSH(rbx) push rbp + CFI_PUSH(rbp) push r13 + CFI_PUSH(r13) push r14 + CFI_PUSH(r14) push r15 + CFI_PUSH(r15) sub rsp, STACK_SIZE + CFI_ADJUST_CFA_OFFSET(STACK_SIZE); shl NUM_BLKS, 6 /* convert to bytes */ jz .Ldone_hash add NUM_BLKS, INP /* pointer to end of data */ mov [rsp + _INP_END], NUM_BLKS /* load initial digest */ mov a,[4*0 + CTX] mov b,[4*1 + CTX] mov c,[4*2 + CTX] mov d,[4*3 + CTX] mov e,[4*4 + CTX] mov f,[4*5 + CTX] mov g,[4*6 + CTX] mov h,[4*7 + CTX] vmovdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP] vmovdqa SHUF_00BA, [.L_SHUF_00BA ADD_RIP] vmovdqa SHUF_DC00, [.L_SHUF_DC00 ADD_RIP] .Loop0: lea TBL, [.LK256 ADD_RIP] /* byte swap first 16 dwords */ COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK mov [rsp + _INP], INP /* schedule 48 input dwords, by doing 3 rounds of 16 each */ mov SRND, 3 .align 16 .Loop1: vpaddd XFER, X0, [TBL + 0*16] vmovdqa [rsp + _XFER], XFER FOUR_ROUNDS_AND_SCHED vpaddd XFER, X0, [TBL + 1*16] vmovdqa [rsp + _XFER], XFER FOUR_ROUNDS_AND_SCHED vpaddd XFER, X0, [TBL + 2*16] vmovdqa [rsp + _XFER], XFER FOUR_ROUNDS_AND_SCHED vpaddd XFER, X0, [TBL + 3*16] vmovdqa [rsp + _XFER], XFER add TBL, 4*16 FOUR_ROUNDS_AND_SCHED sub SRND, 1 
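Each FOUR_ROUNDS_AND_SCHED in .Loop1 consumes the four W[t]+K[t] values that the preceding vpaddd/vmovdqa pair parked in the _XFER slot, while its vpalignr/vpslld/vpsrld sequence computes four new schedule words with the SHA-256 small sigmas. A scalar C sketch of the schedule being vectorized, with the K addition folded in as the assembly does (hypothetical helper names, not libgcrypt's sha256.c):

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, int n)
{
  return (x >> n) | (x << (32 - n));
}

static inline uint32_t sigma0(uint32_t x)   /* small sigma_0 */
{
  return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3);
}

static inline uint32_t sigma1(uint32_t x)   /* small sigma_1 */
{
  return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10);
}

/* Extend w[0..15] to 64 words and pre-add the round constants, so each
 * round needs only one memory operand (the "[rsp + _XFER + i*4]" read). */
static void sha256_schedule(const uint32_t k[64], uint32_t w[64],
                            uint32_t wk[64])
{
  for (int t = 16; t < 64; t++)
    w[t] = sigma1(w[t-2]) + w[t-7] + sigma0(w[t-15]) + w[t-16];
  for (int t = 0; t < 64; t++)
    wk[t] = w[t] + k[t];
}

In the assembly the two loops are fused: each group of four rounds schedules the four words that will be consumed 16 rounds later, and .Loop2/DO_ROUND then runs the final 16 rounds with no scheduling left to do.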
jne .Loop1 mov SRND, 2 .Loop2: vpaddd X0, X0, [TBL + 0*16] vmovdqa [rsp + _XFER], X0 DO_ROUND 0 DO_ROUND 1 DO_ROUND 2 DO_ROUND 3 vpaddd X1, X1, [TBL + 1*16] vmovdqa [rsp + _XFER], X1 add TBL, 2*16 DO_ROUND 0 DO_ROUND 1 DO_ROUND 2 DO_ROUND 3 vmovdqa X0, X2 vmovdqa X1, X3 sub SRND, 1 jne .Loop2 addm [4*0 + CTX],a addm [4*1 + CTX],b addm [4*2 + CTX],c addm [4*3 + CTX],d addm [4*4 + CTX],e addm [4*5 + CTX],f addm [4*6 + CTX],g addm [4*7 + CTX],h mov INP, [rsp + _INP] add INP, 64 cmp INP, [rsp + _INP_END] jne .Loop0 .Ldone_hash: vzeroall vmovdqa [rsp + _XFER], XFER xor eax, eax add rsp, STACK_SIZE + CFI_ADJUST_CFA_OFFSET(-STACK_SIZE); pop r15 + CFI_POP(r15) pop r14 + CFI_POP(r14) pop r13 + CFI_POP(r13) pop rbp + CFI_POP(rbp) pop rbx + CFI_POP(rbx) ret + CFI_ENDPROC() .align 16 .LK256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .LPSHUFFLE_BYTE_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203 /* shuffle xBxA -> 00BA */ .L_SHUF_00BA: .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 /* shuffle xDxC -> DC00 */ .L_SHUF_DC00: .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF #endif #endif diff --git a/cipher/sha256-avx2-bmi2-amd64.S b/cipher/sha256-avx2-bmi2-amd64.S index 5fc402cd..52be1a07 100644 --- a/cipher/sha256-avx2-bmi2-amd64.S +++ b/cipher/sha256-avx2-bmi2-amd64.S @@ -1,568 +1,575 @@ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright (c) 2012, Intel Corporation ; ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are ; met: ; ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in the ; documentation and/or other materials provided with the ; distribution. ; ; * Neither the name of the Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived from ; this software without specific prior written permission. ; ; ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR ; PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL CORPORATION OR ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; ; This code is described in an Intel White-Paper: ; "Fast SHA-256 Implementations on Intel Architecture Processors" ; ; To find it, surf to http://www.intel.com/p/en_US/embedded ; and search for that title. ; The paper is expected to be released roughly at the end of April, 2012 ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; This code schedules 2 blocks at a time, with 4 lanes per block ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /* * Conversion to GAS assembly and integration to libgcrypt * by Jussi Kivilinna */ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \ defined(USE_SHA256) -#ifdef __PIC__ -# define ADD_RIP +rip -#else -# define ADD_RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "asm-common-amd64.h" .intel_syntax noprefix #define VMOVDQ vmovdqu /* ; assume buffers not aligned */ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros */ /* addm [mem], reg */ /* Add reg to mem using reg-mem add and store */ .macro addm p1 p2 add \p2, \p1 mov \p1, \p2 .endm /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ X0 = ymm4 X1 = ymm5 X2 = ymm6 X3 = ymm7 /* XMM versions of above */ XWORD0 = xmm4 XWORD1 = xmm5 XWORD2 = xmm6 XWORD3 = xmm7 XTMP0 = ymm0 XTMP1 = ymm1 XTMP2 = ymm2 XTMP3 = ymm3 XTMP4 = ymm8 XFER = ymm9 XTMP5 = ymm11 SHUF_00BA = ymm10 /* shuffle xBxA -> 00BA */ SHUF_DC00 = ymm12 /* shuffle xDxC -> DC00 */ BYTE_FLIP_MASK = ymm13 X_BYTE_FLIP_MASK = xmm13 /* XMM version of BYTE_FLIP_MASK */ NUM_BLKS = rdx /* 3rd arg */ CTX = rsi /* 2nd arg */ INP = rdi /* 1st arg */ c = ecx d = r8d e = edx /* clobbers NUM_BLKS */ y3 = edi /* clobbers INP */ TBL = rbp SRND = CTX /* SRND is same register as CTX */ a = eax b = ebx f = r9d g = r10d h = r11d old_h = r11d T1 = r12d y0 = r13d y1 = r14d y2 = r15d _XFER_SIZE = 2*64*4 /* 2 blocks, 64 rounds, 4 bytes/round */ _XMM_SAVE_SIZE = 0 _INP_END_SIZE = 8 _INP_SIZE = 8 _CTX_SIZE = 8 _RSP_SIZE = 8 _XFER = 0 _XMM_SAVE = _XFER + _XFER_SIZE _INP_END = _XMM_SAVE + _XMM_SAVE_SIZE _INP = _INP_END + _INP_END_SIZE _CTX = _INP + _INP_SIZE _RSP = _CTX + _CTX_SIZE STACK_SIZE = _RSP + _RSP_SIZE /* rotate_Xs */ /* Rotate values of symbols X0...X3 */ .macro rotate_Xs X_ = X0 X0 = X1 X1 = X2 X2 = X3 X3 = X_ .endm /* ROTATE_ARGS */ /* Rotate values of symbols a...h */ .macro ROTATE_ARGS old_h = h TMP_ = h h = g g = f f = e e = d d = c c = b b = a a = TMP_ .endm .macro ONE_ROUND_PART1 XFER /* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]); * d += h; * h += Sum0 (a) + Maj (a, b, c); * * Ch(x, y, z) => ((x & y) + (~x & z)) * Maj(x, y, z) => ((x & y) + (z & (x ^ y))) */ mov y3, e add h, [\XFER] and y3, f rorx 
y0, e, 25 rorx y1, e, 11 lea h, [h + y3] andn y3, e, g rorx T1, a, 13 xor y0, y1 lea h, [h + y3] .endm .macro ONE_ROUND_PART2 rorx y2, a, 22 rorx y1, e, 6 mov y3, a xor T1, y2 xor y0, y1 xor y3, b lea h, [h + y0] mov y0, a rorx y2, a, 2 add d, h and y3, c xor T1, y2 lea h, [h + y3] lea h, [h + T1] and y0, b lea h, [h + y0] .endm .macro ONE_ROUND XFER ONE_ROUND_PART1 \XFER ONE_ROUND_PART2 .endm .macro FOUR_ROUNDS_AND_SCHED XFER, XFEROUT /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ vpalignr XTMP0, X3, X2, 4 /* XTMP0 = W[-7] */ vpaddd XTMP0, XTMP0, X0 /* XTMP0 = W[-7] + W[-16]; y1 = (e >> 6); S1 */ vpalignr XTMP1, X1, X0, 4 /* XTMP1 = W[-15] */ vpsrld XTMP2, XTMP1, 7 vpslld XTMP3, XTMP1, (32-7) vpor XTMP3, XTMP3, XTMP2 /* XTMP3 = W[-15] ror 7 */ vpsrld XTMP2, XTMP1,18 ONE_ROUND 0*4+\XFER ROTATE_ARGS /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ vpsrld XTMP4, XTMP1, 3 /* XTMP4 = W[-15] >> 3 */ vpslld XTMP1, XTMP1, (32-18) vpxor XTMP3, XTMP3, XTMP1 vpxor XTMP3, XTMP3, XTMP2 /* XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 */ vpxor XTMP1, XTMP3, XTMP4 /* XTMP1 = s0 */ vpshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */ vpaddd XTMP0, XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */ vpsrld XTMP4, XTMP2, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */ ONE_ROUND 1*4+\XFER ROTATE_ARGS /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ vpsrlq XTMP3, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */ vpsrlq XTMP2, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */ vpxor XTMP2, XTMP2, XTMP3 vpxor XTMP4, XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */ vpshufb XTMP4, XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */ vpaddd XTMP0, XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */ vpshufd XTMP2, XTMP0, 0b1010000 /* XTMP2 = W[-2] {DDCC} */ ONE_ROUND 2*4+\XFER ROTATE_ARGS /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ vpsrld XTMP5, XTMP2, 10 /* XTMP5 = W[-2] >> 10 {DDCC} */ vpsrlq XTMP3, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */ vpsrlq XTMP2, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */ vpxor XTMP2, XTMP2, XTMP3 vpxor XTMP5, XTMP5, XTMP2 /* XTMP5 = s1 {xDxC} */ vpshufb XTMP5, XTMP5, SHUF_DC00 /* XTMP5 = s1 {DC00} */ vpaddd X0, XTMP5, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */ vpaddd XFER, X0, [TBL + \XFEROUT] ONE_ROUND_PART1 3*4+\XFER vmovdqa [rsp + _XFER + \XFEROUT], XFER ONE_ROUND_PART2 ROTATE_ARGS rotate_Xs .endm .macro DO_4ROUNDS XFER /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;; */ ONE_ROUND 0*4+\XFER ROTATE_ARGS /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;; */ ONE_ROUND 1*4+\XFER ROTATE_ARGS /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ ONE_ROUND 2*4+\XFER ROTATE_ARGS /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;; */ ONE_ROUND 3*4+\XFER ROTATE_ARGS .endm /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; void sha256_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks) ;; arg 1 : pointer to input data ;; arg 2 : pointer to digest ;; arg 3 : Num blocks */ .text .globl _gcry_sha256_transform_amd64_avx2 ELF(.type _gcry_sha256_transform_amd64_avx2,@function) .align 32 _gcry_sha256_transform_amd64_avx2: + CFI_STARTPROC() xor eax, eax cmp rdx, 0 je .Lnowork push rbx + CFI_PUSH(rbx) push rbp + CFI_PUSH(rbp) push r12 + CFI_PUSH(r12) push r13 + CFI_PUSH(r13) push r14 + CFI_PUSH(r14) push r15 + CFI_PUSH(r15) 
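The ONE_ROUND_PART1/PART2 macros defined above implement exactly the recipe quoted in their comment: h += Sum1(e) + Ch(e,f,g) + (K[t]+W[t]); d += h; h += Sum0(a) + Maj(a,b,c), with rorx providing the rotations without touching flags and andn providing the ~e & g term. A scalar C rendering of that round, written in the usual rotate-the-state form that ROTATE_ARGS emulates through register renaming (illustrative names, not libgcrypt's sha256.c):

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, int n)
{
  return (x >> n) | (x << (32 - n));
}

static inline uint32_t Sum0(uint32_t x) { return ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22); }
static inline uint32_t Sum1(uint32_t x) { return ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25); }

/* Addition-based Ch/Maj as in the macro comment; the two terms of each
 * expression are bit-disjoint, so '+' gives the same result as '|'. */
static inline uint32_t Ch(uint32_t x, uint32_t y, uint32_t z)
{
  return (x & y) + (~x & z);
}
static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z)
{
  return (x & y) + (z & (x ^ y));
}

/* One compression round; wk = K[t] + W[t] read from the _XFER area. */
static void sha256_round(uint32_t s[8] /* a..h */, uint32_t wk)
{
  uint32_t t1 = s[7] + Sum1(s[4]) + Ch(s[4], s[5], s[6]) + wk;
  uint32_t t2 = Sum0(s[0]) + Maj(s[0], s[1], s[2]);
  s[7] = s[6];  s[6] = s[5];  s[5] = s[4];
  s[4] = s[3] + t1;
  s[3] = s[2];  s[2] = s[1];  s[1] = s[0];
  s[0] = t1 + t2;
}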
vzeroupper vmovdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP] vmovdqa SHUF_00BA, [.L_SHUF_00BA ADD_RIP] vmovdqa SHUF_DC00, [.L_SHUF_DC00 ADD_RIP] mov rax, rsp + CFI_DEF_CFA_REGISTER(rax); sub rsp, STACK_SIZE and rsp, ~63 mov [rsp + _RSP], rax + CFI_CFA_ON_STACK(_RSP, 6 * 8) shl NUM_BLKS, 6 /* convert to bytes */ lea NUM_BLKS, [NUM_BLKS + INP - 64] /* pointer to last block */ mov [rsp + _INP_END], NUM_BLKS /* ; load initial digest */ mov a,[4*0 + CTX] mov b,[4*1 + CTX] mov c,[4*2 + CTX] mov d,[4*3 + CTX] mov e,[4*4 + CTX] mov f,[4*5 + CTX] mov g,[4*6 + CTX] mov h,[4*7 + CTX] mov [rsp + _CTX], CTX .Loop0: lea TBL, [.LK256 ADD_RIP] /* ; Load first 16 dwords from two blocks */ VMOVDQ XTMP0, [INP + 0*32] VMOVDQ XTMP1, [INP + 1*32] VMOVDQ XTMP2, [INP + 2*32] VMOVDQ XTMP3, [INP + 3*32] /* ; byte swap data */ vpshufb XTMP0, XTMP0, BYTE_FLIP_MASK vpshufb XTMP1, XTMP1, BYTE_FLIP_MASK vpshufb XTMP2, XTMP2, BYTE_FLIP_MASK vpshufb XTMP3, XTMP3, BYTE_FLIP_MASK /* ; transpose data into high/low halves */ vperm2i128 X0, XTMP0, XTMP2, 0x20 vperm2i128 X1, XTMP0, XTMP2, 0x31 vperm2i128 X2, XTMP1, XTMP3, 0x20 vperm2i128 X3, XTMP1, XTMP3, 0x31 .Last_block_enter: add INP, 64 mov [rsp + _INP], INP /* ; schedule 48 input dwords, by doing 3 rounds of 12 each */ xor SRND, SRND vpaddd XFER, X0, [TBL + 0*32] vmovdqa [rsp + _XFER + 0*32], XFER vpaddd XFER, X1, [TBL + 1*32] vmovdqa [rsp + _XFER + 1*32], XFER vpaddd XFER, X2, [TBL + 2*32] vmovdqa [rsp + _XFER + 2*32], XFER vpaddd XFER, X3, [TBL + 3*32] vmovdqa [rsp + _XFER + 3*32], XFER .align 16 .Loop1: FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 0*32, SRND + 4*32 FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 1*32, SRND + 5*32 FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 2*32, SRND + 6*32 FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 3*32, SRND + 7*32 add SRND, 4*32 cmp SRND, 3 * 4*32 jb .Loop1 /* ; Do last 16 rounds with no scheduling */ DO_4ROUNDS rsp + _XFER + (3*4*32 + 0*32) DO_4ROUNDS rsp + _XFER + (3*4*32 + 1*32) DO_4ROUNDS rsp + _XFER + (3*4*32 + 2*32) DO_4ROUNDS rsp + _XFER + (3*4*32 + 3*32) mov CTX, [rsp + _CTX] mov INP, [rsp + _INP] addm [4*0 + CTX],a addm [4*1 + CTX],b addm [4*2 + CTX],c addm [4*3 + CTX],d addm [4*4 + CTX],e addm [4*5 + CTX],f addm [4*6 + CTX],g addm [4*7 + CTX],h cmp INP, [rsp + _INP_END] ja .Ldone_hash /* ;;; Do second block using previously scheduled results */ xor SRND, SRND .align 16 .Loop3: DO_4ROUNDS rsp + _XFER + SRND + 0*32 + 16 DO_4ROUNDS rsp + _XFER + SRND + 1*32 + 16 add SRND, 2*32 cmp SRND, 4 * 4*32 jb .Loop3 mov CTX, [rsp + _CTX] mov INP, [rsp + _INP] add INP, 64 addm [4*0 + CTX],a addm [4*1 + CTX],b addm [4*2 + CTX],c addm [4*3 + CTX],d addm [4*4 + CTX],e addm [4*5 + CTX],f addm [4*6 + CTX],g addm [4*7 + CTX],h cmp INP, [rsp + _INP_END] jb .Loop0 ja .Ldone_hash .Ldo_last_block: /* ;;; do last block */ lea TBL, [.LK256 ADD_RIP] VMOVDQ XWORD0, [INP + 0*16] VMOVDQ XWORD1, [INP + 1*16] VMOVDQ XWORD2, [INP + 2*16] VMOVDQ XWORD3, [INP + 3*16] vpshufb XWORD0, XWORD0, X_BYTE_FLIP_MASK vpshufb XWORD1, XWORD1, X_BYTE_FLIP_MASK vpshufb XWORD2, XWORD2, X_BYTE_FLIP_MASK vpshufb XWORD3, XWORD3, X_BYTE_FLIP_MASK jmp .Last_block_enter .Lonly_one_block: /* ; load initial digest */ mov a,[4*0 + CTX] mov b,[4*1 + CTX] mov c,[4*2 + CTX] mov d,[4*3 + CTX] mov e,[4*4 + CTX] mov f,[4*5 + CTX] mov g,[4*6 + CTX] mov h,[4*7 + CTX] vmovdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP] vmovdqa SHUF_00BA, [.L_SHUF_00BA ADD_RIP] vmovdqa SHUF_DC00, [.L_SHUF_DC00 ADD_RIP] mov [rsp + _CTX], CTX jmp .Ldo_last_block .Ldone_hash: vzeroall /* burn stack */ 
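(The vmovdqa stores that follow overwrite the sixteen 32-byte _XFER slots with the ymm0 register that vzeroall just cleared, so no W[t]+K[t] material survives on the stack.) In portable C the intent is roughly the following sketch; burn_schedule is a hypothetical helper, not a libgcrypt API:

#include <stddef.h>
#include <stdint.h>

/* Overwrite the message-schedule scratch area through a volatile
 * pointer so the compiler cannot elide the stores (the assembly gets
 * this for free because its stores are explicit instructions). */
static void burn_schedule(void *xfer, size_t len)
{
  volatile uint8_t *p = (volatile uint8_t *)xfer;
  while (len--)
    *p++ = 0;
}

Because the transform wipes its own scratch space, it can return 0 in eax, consistent with the "/* stack already burned */" comments in the SHA-1 variants earlier in this patch.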
vmovdqa [rsp + _XFER + 0 * 32], ymm0 vmovdqa [rsp + _XFER + 1 * 32], ymm0 vmovdqa [rsp + _XFER + 2 * 32], ymm0 vmovdqa [rsp + _XFER + 3 * 32], ymm0 vmovdqa [rsp + _XFER + 4 * 32], ymm0 vmovdqa [rsp + _XFER + 5 * 32], ymm0 vmovdqa [rsp + _XFER + 6 * 32], ymm0 vmovdqa [rsp + _XFER + 7 * 32], ymm0 vmovdqa [rsp + _XFER + 8 * 32], ymm0 vmovdqa [rsp + _XFER + 9 * 32], ymm0 vmovdqa [rsp + _XFER + 10 * 32], ymm0 vmovdqa [rsp + _XFER + 11 * 32], ymm0 vmovdqa [rsp + _XFER + 12 * 32], ymm0 vmovdqa [rsp + _XFER + 13 * 32], ymm0 vmovdqa [rsp + _XFER + 14 * 32], ymm0 vmovdqa [rsp + _XFER + 15 * 32], ymm0 xor eax, eax mov rsp, [rsp + _RSP] + CFI_DEF_CFA_REGISTER(rsp) pop r15 + CFI_POP(r15) pop r14 + CFI_POP(r14) pop r13 + CFI_POP(r13) pop r12 + CFI_POP(r12) pop rbp + CFI_POP(rbp) pop rbx + CFI_POP(rbx) .Lnowork: ret + CFI_ENDPROC() .align 64 .LK256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .LPSHUFFLE_BYTE_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203 /* shuffle xBxA -> 00BA */ .L_SHUF_00BA: .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100 /* shuffle xDxC -> DC00 */ .L_SHUF_DC00: .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF #endif #endif diff --git a/cipher/sha256-ssse3-amd64.S b/cipher/sha256-ssse3-amd64.S index ca5c9fd1..0fb94c1b 100644 --- a/cipher/sha256-ssse3-amd64.S +++ b/cipher/sha256-ssse3-amd64.S @@ -1,549 +1,553 @@ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright (c) 2012, Intel Corporation ; ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are ; met: ; ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. 
; ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in the ; documentation and/or other materials provided with the ; distribution. ; ; * Neither the name of the Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived from ; this software without specific prior written permission. ; ; ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR ; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; ; This code is described in an Intel White-Paper: ; "Fast SHA-256 Implementations on Intel Architecture Processors" ; ; To find it, surf to http://www.intel.com/p/en_US/embedded ; and search for that title. ; The paper is expected to be released roughly at the end of April, 2012 ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; This code schedules 1 blocks at a time, with 4 lanes per block ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /* * Conversion to GAS assembly and integration to libgcrypt * by Jussi Kivilinna * * Note: original implementation was named as SHA256-SSE4. However, only SSSE3 * is required. */ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA256) -#ifdef __PIC__ -# define ADD_RIP +rip -#else -# define ADD_RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif +#include "asm-common-amd64.h" .intel_syntax noprefix #define MOVDQ movdqu /* assume buffers not aligned */ /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros*/ /* addm [mem], reg * Add reg to mem using reg-mem add and store */ .macro addm p1 p2 add \p2, \p1 mov \p1, \p2 .endm /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ /* COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask * Load xmm with mem and byte swap each dword */ .macro COPY_XMM_AND_BSWAP p1 p2 p3 MOVDQ \p1, \p2 pshufb \p1, \p3 .endm /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ X0 = xmm4 X1 = xmm5 X2 = xmm6 X3 = xmm7 XTMP0 = xmm0 XTMP1 = xmm1 XTMP2 = xmm2 XTMP3 = xmm3 XTMP4 = xmm8 XFER = xmm9 SHUF_00BA = xmm10 /* shuffle xBxA -> 00BA */ SHUF_DC00 = xmm11 /* shuffle xDxC -> DC00 */ BYTE_FLIP_MASK = xmm12 NUM_BLKS = rdx /* 3rd arg */ CTX = rsi /* 2nd arg */ INP = rdi /* 1st arg */ SRND = rdi /* clobbers INP */ c = ecx d = r8d e = edx TBL = rbp a = eax b = ebx f = r9d g = r10d h = r11d y0 = r13d y1 = r14d y2 = r15d #define _INP_END_SIZE 8 #define _INP_SIZE 8 #define _XFER_SIZE 8 #define _XMM_SAVE_SIZE 0 /* STACK_SIZE plus pushes must be an odd multiple of 8 */ #define _ALIGN_SIZE 8 #define _INP_END 0 #define _INP (_INP_END + _INP_END_SIZE) #define _XFER (_INP + _INP_SIZE) #define _XMM_SAVE (_XFER + _XFER_SIZE + _ALIGN_SIZE) #define STACK_SIZE (_XMM_SAVE + _XMM_SAVE_SIZE) /* rotate_Xs * Rotate values of symbols X0...X3 */ .macro rotate_Xs X_ = X0 X0 = X1 X1 = X2 X2 = X3 X3 = X_ .endm /* ROTATE_ARGS * Rotate values of symbols a...h */ .macro ROTATE_ARGS TMP_ = h h = g g = f f = e e = d d = c c = b b = a a = TMP_ .endm .macro FOUR_ROUNDS_AND_SCHED /* compute s0 four at a time and s1 two at a time * compute W[-16] + W[-7] 4 at a time */ movdqa XTMP0, X3 mov y0, e /* y0 = e */ ror y0, (25-11) /* y0 = e >> (25-11) */ mov y1, a /* y1 = a */ palignr XTMP0, X2, 4 /* XTMP0 = W[-7] */ ror y1, (22-13) /* y1 = a >> (22-13) */ xor y0, e /* y0 = e ^ (e >> (25-11)) */ mov y2, f /* y2 = f */ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ movdqa XTMP1, X1 xor y1, a /* y1 = a ^ (a >> (22-13) */ xor y2, g /* y2 = f^g */ paddd XTMP0, X0 /* XTMP0 = W[-7] + W[-16] */ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ and y2, e /* y2 = (f^g)&e */ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ /* compute s0 */ palignr XTMP1, X0, 4 /* XTMP1 = W[-15] */ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ xor y2, g /* y2 = CH = ((f^g)&e)^g */ movdqa XTMP2, XTMP1 /* XTMP2 = W[-15] */ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ add y2, y0 /* y2 = S1 + CH */ add y2, [rsp + _XFER + 0*4] /* y2 = k + w + S1 + CH */ movdqa XTMP3, XTMP1 /* XTMP3 = W[-15] */ mov y0, a /* y0 = a */ add h, y2 /* h = h + S1 + CH + k + w */ mov y2, a /* y2 = a */ pslld XTMP1, (32-7) or y0, c /* y0 = a|c */ add d, h /* d = d + h + S1 + CH + k + w */ and y2, c /* y2 = a&c */ psrld XTMP2, 7 and y0, b /* y0 = (a|c)&b */ add h, y1 /* h = h + S1 + CH + k + w + S0 */ por XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 */ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ ROTATE_ARGS movdqa XTMP2, XTMP3 /* XTMP2 = W[-15] */ mov y0, e /* y0 = e */ mov y1, a /* y1 = a */ movdqa XTMP4, XTMP3 /* XTMP4 = W[-15] */ ror y0, (25-11) /* y0 = e >> (25-11) */ xor y0, e /* y0 = e ^ (e >> (25-11)) */ mov y2, f /* y2 = f */ ror y1, (22-13) /* y1 = a >> (22-13) */ pslld XTMP3, (32-18) xor y1, a /* y1 = a ^ (a >> (22-13) */ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ xor y2, g /* y2 = f^g 
*/ psrld XTMP2, 18 ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ and y2, e /* y2 = (f^g)&e */ ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ pxor XTMP1, XTMP3 xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ xor y2, g /* y2 = CH = ((f^g)&e)^g */ psrld XTMP4, 3 /* XTMP4 = W[-15] >> 3 */ add y2, y0 /* y2 = S1 + CH */ add y2, [rsp + _XFER + 1*4] /* y2 = k + w + S1 + CH */ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ pxor XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */ mov y0, a /* y0 = a */ add h, y2 /* h = h + S1 + CH + k + w */ mov y2, a /* y2 = a */ pxor XTMP1, XTMP4 /* XTMP1 = s0 */ or y0, c /* y0 = a|c */ add d, h /* d = d + h + S1 + CH + k + w */ and y2, c /* y2 = a&c */ /* compute low s1 */ pshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */ and y0, b /* y0 = (a|c)&b */ add h, y1 /* h = h + S1 + CH + k + w + S0 */ paddd XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ ROTATE_ARGS movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {BBAA} */ mov y0, e /* y0 = e */ mov y1, a /* y1 = a */ ror y0, (25-11) /* y0 = e >> (25-11) */ movdqa XTMP4, XTMP2 /* XTMP4 = W[-2] {BBAA} */ xor y0, e /* y0 = e ^ (e >> (25-11)) */ ror y1, (22-13) /* y1 = a >> (22-13) */ mov y2, f /* y2 = f */ xor y1, a /* y1 = a ^ (a >> (22-13) */ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ psrlq XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */ xor y2, g /* y2 = f^g */ psrlq XTMP3, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ and y2, e /* y2 = (f^g)&e */ psrld XTMP4, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ xor y2, g /* y2 = CH = ((f^g)&e)^g */ ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ pxor XTMP2, XTMP3 add y2, y0 /* y2 = S1 + CH */ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ add y2, [rsp + _XFER + 2*4] /* y2 = k + w + S1 + CH */ pxor XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */ mov y0, a /* y0 = a */ add h, y2 /* h = h + S1 + CH + k + w */ mov y2, a /* y2 = a */ pshufb XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */ or y0, c /* y0 = a|c */ add d, h /* d = d + h + S1 + CH + k + w */ and y2, c /* y2 = a&c */ paddd XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */ and y0, b /* y0 = (a|c)&b */ add h, y1 /* h = h + S1 + CH + k + w + S0 */ /* compute high s1 */ pshufd XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ ROTATE_ARGS movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {DDCC} */ mov y0, e /* y0 = e */ ror y0, (25-11) /* y0 = e >> (25-11) */ mov y1, a /* y1 = a */ movdqa X0, XTMP2 /* X0 = W[-2] {DDCC} */ ror y1, (22-13) /* y1 = a >> (22-13) */ xor y0, e /* y0 = e ^ (e >> (25-11)) */ mov y2, f /* y2 = f */ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ psrlq XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */ xor y1, a /* y1 = a ^ (a >> (22-13) */ xor y2, g /* y2 = f^g */ psrlq XTMP3, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ and y2, e /* y2 = (f^g)&e */ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ psrld X0, 10 /* X0 = W[-2] >> 10 {DDCC} */ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ xor y2, g /* y2 = CH = ((f^g)&e)^g */ pxor XTMP2, XTMP3 ror y1, 2 /* y1 = S0 = (a>>2) ^ 
(a>>13) ^ (a>>22) */ add y2, y0 /* y2 = S1 + CH */ add y2, [rsp + _XFER + 3*4] /* y2 = k + w + S1 + CH */ pxor X0, XTMP2 /* X0 = s1 {xDxC} */ mov y0, a /* y0 = a */ add h, y2 /* h = h + S1 + CH + k + w */ mov y2, a /* y2 = a */ pshufb X0, SHUF_DC00 /* X0 = s1 {DC00} */ or y0, c /* y0 = a|c */ add d, h /* d = d + h + S1 + CH + k + w */ and y2, c /* y2 = a&c */ paddd X0, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */ and y0, b /* y0 = (a|c)&b */ add h, y1 /* h = h + S1 + CH + k + w + S0 */ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ ROTATE_ARGS rotate_Xs .endm /* input is [rsp + _XFER + %1 * 4] */ .macro DO_ROUND i1 mov y0, e /* y0 = e */ ror y0, (25-11) /* y0 = e >> (25-11) */ mov y1, a /* y1 = a */ xor y0, e /* y0 = e ^ (e >> (25-11)) */ ror y1, (22-13) /* y1 = a >> (22-13) */ mov y2, f /* y2 = f */ xor y1, a /* y1 = a ^ (a >> (22-13) */ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ xor y2, g /* y2 = f^g */ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ and y2, e /* y2 = (f^g)&e */ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ xor y2, g /* y2 = CH = ((f^g)&e)^g */ add y2, y0 /* y2 = S1 + CH */ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ add y2, [rsp + _XFER + \i1 * 4] /* y2 = k + w + S1 + CH */ mov y0, a /* y0 = a */ add h, y2 /* h = h + S1 + CH + k + w */ mov y2, a /* y2 = a */ or y0, c /* y0 = a|c */ add d, h /* d = d + h + S1 + CH + k + w */ and y2, c /* y2 = a&c */ and y0, b /* y0 = (a|c)&b */ add h, y1 /* h = h + S1 + CH + k + w + S0 */ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ ROTATE_ARGS .endm /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks) ;; arg 1 : pointer to input data ;; arg 2 : pointer to digest ;; arg 3 : Num blocks */ .text .globl _gcry_sha256_transform_amd64_ssse3 ELF(.type _gcry_sha256_transform_amd64_ssse3,@function;) .align 16 _gcry_sha256_transform_amd64_ssse3: + CFI_STARTPROC() push rbx + CFI_PUSH(rbx) push rbp + CFI_PUSH(rbp) push r13 + CFI_PUSH(r13) push r14 + CFI_PUSH(r14) push r15 + CFI_PUSH(r15) sub rsp, STACK_SIZE + CFI_ADJUST_CFA_OFFSET(STACK_SIZE); shl NUM_BLKS, 6 /* convert to bytes */ jz .Ldone_hash add NUM_BLKS, INP /* pointer to end of data */ mov [rsp + _INP_END], NUM_BLKS /* load initial digest */ mov a,[4*0 + CTX] mov b,[4*1 + CTX] mov c,[4*2 + CTX] mov d,[4*3 + CTX] mov e,[4*4 + CTX] mov f,[4*5 + CTX] mov g,[4*6 + CTX] mov h,[4*7 + CTX] movdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP] movdqa SHUF_00BA, [.L_SHUF_00BA ADD_RIP] movdqa SHUF_DC00, [.L_SHUF_DC00 ADD_RIP] .Loop0: lea TBL, [.LK256 ADD_RIP] /* byte swap first 16 dwords */ COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK mov [rsp + _INP], INP /* schedule 48 input dwords, by doing 3 rounds of 16 each */ mov SRND, 3 .align 16 .Loop1: movdqa XFER, [TBL + 0*16] paddd XFER, X0 movdqa [rsp + _XFER], XFER FOUR_ROUNDS_AND_SCHED movdqa XFER, [TBL + 1*16] paddd XFER, X0 movdqa [rsp + _XFER], XFER FOUR_ROUNDS_AND_SCHED movdqa XFER, [TBL + 2*16] paddd XFER, X0 movdqa [rsp + _XFER], XFER FOUR_ROUNDS_AND_SCHED movdqa XFER, 
[TBL + 3*16] paddd XFER, X0 movdqa [rsp + _XFER], XFER add TBL, 4*16 FOUR_ROUNDS_AND_SCHED sub SRND, 1 jne .Loop1 mov SRND, 2 .Loop2: paddd X0, [TBL + 0*16] movdqa [rsp + _XFER], X0 DO_ROUND 0 DO_ROUND 1 DO_ROUND 2 DO_ROUND 3 paddd X1, [TBL + 1*16] movdqa [rsp + _XFER], X1 add TBL, 2*16 DO_ROUND 0 DO_ROUND 1 DO_ROUND 2 DO_ROUND 3 movdqa X0, X2 movdqa X1, X3 sub SRND, 1 jne .Loop2 addm [4*0 + CTX],a addm [4*1 + CTX],b addm [4*2 + CTX],c addm [4*3 + CTX],d addm [4*4 + CTX],e addm [4*5 + CTX],f addm [4*6 + CTX],g addm [4*7 + CTX],h mov INP, [rsp + _INP] add INP, 64 cmp INP, [rsp + _INP_END] jne .Loop0 pxor xmm0, xmm0 pxor xmm1, xmm1 pxor xmm2, xmm2 pxor xmm3, xmm3 pxor xmm4, xmm4 pxor xmm5, xmm5 pxor xmm6, xmm6 pxor xmm7, xmm7 pxor xmm8, xmm8 pxor xmm9, xmm9 pxor xmm10, xmm10 pxor xmm11, xmm11 pxor xmm12, xmm12 .Ldone_hash: pxor XFER, XFER movdqa [rsp + _XFER], XFER xor eax, eax add rsp, STACK_SIZE + CFI_ADJUST_CFA_OFFSET(-STACK_SIZE); pop r15 + CFI_POP(r15) pop r14 + CFI_POP(r14) pop r13 + CFI_POP(r13) pop rbp + CFI_POP(rbp) pop rbx + CFI_POP(rbx) ret + CFI_ENDPROC() .align 16 .LK256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .LPSHUFFLE_BYTE_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203 /* shuffle xBxA -> 00BA */ .L_SHUF_00BA: .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 /* shuffle xDxC -> DC00 */ .L_SHUF_DC00: .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF #endif #endif diff --git a/cipher/sha512-avx-amd64.S b/cipher/sha512-avx-amd64.S index 534351e4..991fd639 100644 --- a/cipher/sha512-avx-amd64.S +++ b/cipher/sha512-avx-amd64.S @@ -1,427 +1,431 @@ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright (c) 2012, Intel Corporation ; ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are ; met: ; ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in the ; documentation and/or other materials provided with the ; distribution. ; ; * Neither the name of the Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived from ; this software without specific prior written permission. ; ; ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR ; PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL CORPORATION OR ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /* * Conversion to GAS assembly and integration to libgcrypt * by Jussi Kivilinna */ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA512) -#ifdef __PIC__ -# define ADD_RIP +rip -#else -# define ADD_RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "asm-common-amd64.h" .intel_syntax noprefix .text /* Virtual Registers */ msg = rdi /* ARG1 */ digest = rsi /* ARG2 */ msglen = rdx /* ARG3 */ T1 = rcx T2 = r8 a_64 = r9 b_64 = r10 c_64 = r11 d_64 = r12 e_64 = r13 f_64 = r14 g_64 = r15 h_64 = rbx tmp0 = rax /* ; Local variables (stack frame) ; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP */ frame_W = 0 /* Message Schedule */ frame_W_size = (80 * 8) frame_WK = ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */ frame_WK_size = (2 * 8) frame_GPRSAVE = ((frame_WK) + (frame_WK_size)) frame_GPRSAVE_size = (5 * 8) frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size)) /* Useful QWORD "arrays" for simpler memory references */ #define MSG(i) msg + 8*(i) /* Input message (arg1) */ #define DIGEST(i) digest + 8*(i) /* Output Digest (arg2) */ #define K_t(i) .LK512 + 8*(i) ADD_RIP /* SHA Constants (static mem) */ #define W_t(i) rsp + frame_W + 8*(i) /* Message Schedule (stack frame) */ #define WK_2(i) rsp + frame_WK + 8*((i) % 2) /* W[t]+K[t] (stack frame) */ /* MSG, DIGEST, K_t, W_t are arrays */ /* WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even */ .macro RotateState /* Rotate symbles a..h right */ __TMP = h_64 h_64 = g_64 g_64 = f_64 f_64 = e_64 e_64 = d_64 d_64 = c_64 c_64 = b_64 b_64 = a_64 a_64 = __TMP .endm .macro RORQ p1 p2 /* shld is faster than ror on Intel Sandybridge */ shld \p1, \p1, (64 - \p2) .endm .macro SHA512_Round t /* Compute Round %%t */ mov T1, f_64 /* T1 = f */ mov tmp0, e_64 /* tmp = e */ xor T1, g_64 /* T1 = f ^ g */ RORQ tmp0, 23 /* 41 ; tmp = e ror 23 */ and T1, e_64 /* T1 = (f ^ g) & e */ xor tmp0, e_64 /* tmp = (e ror 23) ^ e */ xor T1, g_64 /* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */ add T1, [WK_2(\t)] /* W[t] + K[t] from message scheduler */ RORQ tmp0, 4 /* 18 ; tmp = ((e ror 23) ^ e) ror 4 */ xor tmp0, e_64 /* tmp = (((e ror 23) ^ e) ror 4) ^ e */ mov T2, a_64 /* T2 = a */ add T1, h_64 /* T1 = CH(e,f,g) + W[t] + K[t] + h */ RORQ tmp0, 14 /* 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */ add T1, tmp0 /* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */ mov tmp0, a_64 /* tmp = a */ xor T2, c_64 /* T2 = a ^ c */ and tmp0, c_64 /* tmp = a & c */ and T2, b_64 /* T2 = (a ^ c) & b */ xor T2, tmp0 /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */ mov tmp0, a_64 /* tmp = a */ RORQ tmp0, 5 /* 39 ; tmp = a ror 5 */ xor tmp0, a_64 /* tmp = (a ror 5) ^ a */ add d_64, T1 /* 
e(next_state) = d + T1 */ RORQ tmp0, 6 /* 34 ; tmp = ((a ror 5) ^ a) ror 6 */ xor tmp0, a_64 /* tmp = (((a ror 5) ^ a) ror 6) ^ a */ lea h_64, [T1 + T2] /* a(next_state) = T1 + Maj(a,b,c) */ RORQ tmp0, 28 /* 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */ add h_64, tmp0 /* a(next_state) = T1 + Maj(a,b,c) S0(a) */ RotateState .endm .macro SHA512_2Sched_2Round_avx t /* ; Compute rounds %%t-2 and %%t-1 ; Compute message schedule QWORDS %%t and %%t+1 ; Two rounds are computed based on the values for K[t-2]+W[t-2] and ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message ; scheduler. ; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. ; They are then added to their respective SHA512 constants at ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] ; For brievity, the comments following vectored instructions only refer to ; the first of a pair of QWORDS. ; Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]} ; The computation of the message schedule and the rounds are tightly ; stitched to take advantage of instruction-level parallelism. ; For clarity, integer instructions (for the rounds calculation) are indented ; by one tab. Vectored instructions (for the message scheduler) are indented ; by two tabs. */ vmovdqa xmm4, [W_t(\t-2)] /* XMM4 = W[t-2] */ vmovdqu xmm5, [W_t(\t-15)] /* XMM5 = W[t-15] */ mov T1, f_64 vpsrlq xmm0, xmm4, 61 /* XMM0 = W[t-2]>>61 */ mov tmp0, e_64 vpsrlq xmm6, xmm5, 1 /* XMM6 = W[t-15]>>1 */ xor T1, g_64 RORQ tmp0, 23 /* 41 */ vpsrlq xmm1, xmm4, 19 /* XMM1 = W[t-2]>>19 */ and T1, e_64 xor tmp0, e_64 vpxor xmm0, xmm0, xmm1 /* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 */ xor T1, g_64 add T1, [WK_2(\t)]; vpsrlq xmm7, xmm5, 8 /* XMM7 = W[t-15]>>8 */ RORQ tmp0, 4 /* 18 */ vpsrlq xmm2, xmm4, 6 /* XMM2 = W[t-2]>>6 */ xor tmp0, e_64 mov T2, a_64 add T1, h_64 vpxor xmm6, xmm6, xmm7 /* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 */ RORQ tmp0, 14 /* 14 */ add T1, tmp0 vpsrlq xmm8, xmm5, 7 /* XMM8 = W[t-15]>>7 */ mov tmp0, a_64 xor T2, c_64 vpsllq xmm3, xmm4, (64-61) /* XMM3 = W[t-2]<<3 */ and tmp0, c_64 and T2, b_64 vpxor xmm2, xmm2, xmm3 /* XMM2 = W[t-2]>>6 ^ W[t-2]<<3 */ xor T2, tmp0 mov tmp0, a_64 vpsllq xmm9, xmm5, (64-1) /* XMM9 = W[t-15]<<63 */ RORQ tmp0, 5 /* 39 */ vpxor xmm8, xmm8, xmm9 /* XMM8 = W[t-15]>>7 ^ W[t-15]<<63 */ xor tmp0, a_64 add d_64, T1 RORQ tmp0, 6 /* 34 */ xor tmp0, a_64 vpxor xmm6, xmm6, xmm8 /* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ W[t-15]>>7 ^ W[t-15]<<63 */ lea h_64, [T1 + T2] RORQ tmp0, 28 /* 28 */ vpsllq xmm4, xmm4, (64-19) /* XMM4 = W[t-2]<<25 */ add h_64, tmp0 RotateState vpxor xmm0, xmm0, xmm4 /* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ W[t-2]<<25 */ mov T1, f_64 vpxor xmm0, xmm0, xmm2 /* XMM0 = s1(W[t-2]) */ mov tmp0, e_64 xor T1, g_64 vpaddq xmm0, xmm0, [W_t(\t-16)] /* XMM0 = s1(W[t-2]) + W[t-16] */ vmovdqu xmm1, [W_t(\t- 7)] /* XMM1 = W[t-7] */ RORQ tmp0, 23 /* 41 */ and T1, e_64 xor tmp0, e_64 xor T1, g_64 vpsllq xmm5, xmm5, (64-8) /* XMM5 = W[t-15]<<56 */ add T1, [WK_2(\t+1)] vpxor xmm6, xmm6, xmm5 /* XMM6 = s0(W[t-15]) */ RORQ tmp0, 4 /* 18 */ vpaddq xmm0, xmm0, xmm6 /* XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15]) */ xor tmp0, e_64 vpaddq xmm0, xmm0, xmm1 /* XMM0 = W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */ mov T2, a_64 add T1, h_64 RORQ tmp0, 14 /* 14 */ add T1, tmp0 vmovdqa [W_t(\t)], xmm0 /* Store W[t] */ vpaddq xmm0, xmm0, [K_t(t)] /* Compute W[t]+K[t] */ vmovdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] for next rounds */ mov tmp0, a_64 xor T2, c_64 and tmp0, c_64 and T2, b_64 xor T2, tmp0 mov tmp0, a_64 RORQ tmp0, 5 /* 39 */ xor 
tmp0, a_64 add d_64, T1 RORQ tmp0, 6 /* 34 */ xor tmp0, a_64 lea h_64, [T1 + T2] RORQ tmp0, 28 /* 28 */ add h_64, tmp0 RotateState .endm /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; void sha512_avx(const void* M, void* D, uint64_t L); ; Purpose: Updates the SHA512 digest stored at D with the message stored in M. ; The size of the message pointed to by M must be an integer multiple of SHA512 ; message blocks. ; L is the message length in SHA512 blocks */ .globl _gcry_sha512_transform_amd64_avx ELF(.type _gcry_sha512_transform_amd64_avx,@function;) .align 16 _gcry_sha512_transform_amd64_avx: + CFI_STARTPROC() xor eax, eax cmp msglen, 0 je .Lnowork vzeroupper /* Allocate Stack Space */ sub rsp, frame_size + CFI_ADJUST_CFA_OFFSET(frame_size); /* Save GPRs */ mov [rsp + frame_GPRSAVE + 8 * 0], rbx mov [rsp + frame_GPRSAVE + 8 * 1], r12 mov [rsp + frame_GPRSAVE + 8 * 2], r13 mov [rsp + frame_GPRSAVE + 8 * 3], r14 mov [rsp + frame_GPRSAVE + 8 * 4], r15 + CFI_REL_OFFSET(rbx, frame_GPRSAVE + 8 * 0); + CFI_REL_OFFSET(r12, frame_GPRSAVE + 8 * 1); + CFI_REL_OFFSET(r13, frame_GPRSAVE + 8 * 2); + CFI_REL_OFFSET(r14, frame_GPRSAVE + 8 * 3); + CFI_REL_OFFSET(r15, frame_GPRSAVE + 8 * 4); .Lupdateblock: /* Load state variables */ mov a_64, [DIGEST(0)] mov b_64, [DIGEST(1)] mov c_64, [DIGEST(2)] mov d_64, [DIGEST(3)] mov e_64, [DIGEST(4)] mov f_64, [DIGEST(5)] mov g_64, [DIGEST(6)] mov h_64, [DIGEST(7)] t = 0 .rept 80/2 + 1 /* (80 rounds) / (2 rounds/iteration) + (1 iteration) */ /* +1 iteration because the scheduler leads hashing by 1 iteration */ .if t < 2 /* BSWAP 2 QWORDS */ vmovdqa xmm1, [.LXMM_QWORD_BSWAP ADD_RIP] vmovdqu xmm0, [MSG(t)] vpshufb xmm0, xmm0, xmm1 /* BSWAP */ vmovdqa [W_t(t)], xmm0 /* Store Scheduled Pair */ vpaddq xmm0, xmm0, [K_t(t)] /* Compute W[t]+K[t] */ vmovdqa [WK_2(t)], xmm0 /* Store into WK for rounds */ .elseif t < 16 /* BSWAP 2 QWORDS, Compute 2 Rounds */ vmovdqu xmm0, [MSG(t)] vpshufb xmm0, xmm0, xmm1 /* BSWAP */ SHA512_Round (t - 2) /* Round t-2 */ vmovdqa [W_t(t)], xmm0 /* Store Scheduled Pair */ vpaddq xmm0, xmm0, [K_t(t)] /* Compute W[t]+K[t] */ SHA512_Round (t - 1) /* Round t-1 */ vmovdqa [WK_2(t)], xmm0 /* W[t]+K[t] into WK */ .elseif t < 79 /* Schedule 2 QWORDS; Compute 2 Rounds */ SHA512_2Sched_2Round_avx t .else /* Compute 2 Rounds */ SHA512_Round (t - 2) SHA512_Round (t - 1) .endif t = ((t)+2) .endr /* Update digest */ add [DIGEST(0)], a_64 add [DIGEST(1)], b_64 add [DIGEST(2)], c_64 add [DIGEST(3)], d_64 add [DIGEST(4)], e_64 add [DIGEST(5)], f_64 add [DIGEST(6)], g_64 add [DIGEST(7)], h_64 /* Advance to next message block */ add msg, 16*8 dec msglen jnz .Lupdateblock /* Restore GPRs */ mov rbx, [rsp + frame_GPRSAVE + 8 * 0] mov r12, [rsp + frame_GPRSAVE + 8 * 1] mov r13, [rsp + frame_GPRSAVE + 8 * 2] mov r14, [rsp + frame_GPRSAVE + 8 * 3] mov r15, [rsp + frame_GPRSAVE + 8 * 4] + CFI_RESTORE(rbx) + CFI_RESTORE(r12) + CFI_RESTORE(r13) + CFI_RESTORE(r14) + CFI_RESTORE(r15) vzeroall /* Burn stack */ t = 0 .rept frame_W_size / 32 vmovups [rsp + frame_W + (t) * 32], ymm0 t = ((t)+1) .endr vmovdqu [rsp + frame_WK], xmm0 xor eax, eax /* Restore Stack Pointer */ add rsp, frame_size + CFI_ADJUST_CFA_OFFSET(-frame_size); .Lnowork: ret + CFI_ENDPROC() /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Binary Data */ .align 16 /* Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. 
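   For each destination byte i, (v)pshufb copies source byte mask[i], so this
   constant reverses the eight bytes within each qword; per 64-bit lane it has
   the same effect as a scalar byte swap, e.g. (illustrative C only, with "u64"
   standing for any unsigned 64-bit type):
       w = __builtin_bswap64(((const u64 *)msg)[t]);
   i.e. the big-endian message words are converted to host order before they
   enter the message schedule.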
*/ .LXMM_QWORD_BSWAP: .octa 0x08090a0b0c0d0e0f0001020304050607 /* K[t] used in SHA512 hashing */ .LK512: .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 #endif #endif diff --git a/cipher/sha512-avx2-bmi2-amd64.S b/cipher/sha512-avx2-bmi2-amd64.S index 32cfceb0..3b28ab6c 100644 --- a/cipher/sha512-avx2-bmi2-amd64.S +++ b/cipher/sha512-avx2-bmi2-amd64.S @@ -1,560 +1,568 @@ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright (c) 2012, Intel Corporation ; ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are ; met: ; ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in the ; documentation and/or other materials provided with the ; distribution. ; ; * Neither the name of the Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived from ; this software without specific prior written permission. ; ; ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR ; PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL CORPORATION OR ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; This code schedules 1 blocks at a time, with 4 lanes per block ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /* * Conversion to GAS assembly and integration to libgcrypt * by Jussi Kivilinna */ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \ defined(USE_SHA512) -#ifdef __PIC__ -# define ADD_RIP +rip -#else -# define ADD_RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "asm-common-amd64.h" .intel_syntax noprefix .text /* Virtual Registers */ Y_0 = ymm4 Y_1 = ymm5 Y_2 = ymm6 Y_3 = ymm7 YTMP0 = ymm0 YTMP1 = ymm1 YTMP2 = ymm2 YTMP3 = ymm3 YTMP4 = ymm8 XFER = YTMP0 BYTE_FLIP_MASK = ymm9 MASK_YMM_LO = ymm10 MASK_YMM_LOx = xmm10 INP = rdi /* 1st arg */ CTX = rsi /* 2nd arg */ NUM_BLKS = rdx /* 3rd arg */ c = rcx d = r8 e = rdx y3 = rdi TBL = rbp a = rax b = rbx f = r9 g = r10 h = r11 old_h = rax T1 = r12 y0 = r13 y1 = r14 y2 = r15 y4 = r12 /* Local variables (stack frame) */ #define frame_XFER 0 #define frame_XFER_size (4*4*8) #define frame_SRND (frame_XFER + frame_XFER_size) #define frame_SRND_size (1*8) #define frame_INP (frame_SRND + frame_SRND_size) #define frame_INP_size (1*8) #define frame_NBLKS (frame_INP + frame_INP_size) #define frame_NBLKS_size (1*8) #define frame_RSPSAVE (frame_NBLKS + frame_NBLKS_size) #define frame_RSPSAVE_size (1*8) #define frame_GPRSAVE (frame_RSPSAVE + frame_RSPSAVE_size) #define frame_GPRSAVE_size (6*8) #define frame_size (frame_GPRSAVE + frame_GPRSAVE_size) #define VMOVDQ vmovdqu /*; assume buffers not aligned */ /* addm [mem], reg */ /* Add reg to mem using reg-mem add and store */ .macro addm p1 p2 add \p2, \p1 mov \p1, \p2 .endm /* COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask */ /* Load ymm with mem and byte swap each dword */ .macro COPY_YMM_AND_BSWAP p1 p2 p3 VMOVDQ \p1, \p2 vpshufb \p1, \p1, \p3 .endm /* rotate_Ys */ /* Rotate values of symbols Y0...Y3 */ .macro rotate_Ys __Y_ = Y_0 Y_0 = Y_1 Y_1 = Y_2 Y_2 = Y_3 Y_3 = __Y_ .endm /* RotateState */ .macro RotateState /* Rotate symbles a..h right */ old_h = h __TMP_ = h h = g g = f f = e e = d d = c c = b b = a a = __TMP_ .endm /* %macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL */ /* YDST = {YSRC1, YSRC2} >> RVAL*8 */ .macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL vperm2f128 \YDST, \YSRC1, \YSRC2, 0x3 /* YDST = {YS1_LO, YS2_HI} */ vpalignr \YDST, \YDST, \YSRC2, \RVAL /* YDST = {YDS1, YS2} >> RVAL*8 */ .endm .macro ONE_ROUND_PART1 XFER /* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]); * d += h; * h += Sum0 (a) + Maj (a, b, c); * * Ch(x, y, z) => ((x & y) + (~x & z)) * Maj(x, y, z) => ((x & y) + (z & (x ^ y))) */ mov y3, e add h, [\XFER] and y3, f rorx y0, e, 41 rorx y1, e, 18 lea h, [h + y3] andn y3, e, g 
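/* The andn above supplies the ~e & g half of Ch(e,f,g); together,
 * ONE_ROUND_PART1 and ONE_ROUND_PART2 perform exactly the update documented
 * in the comment at the top of this macro.  For reference, a scalar C sketch
 * of one such round (illustration only, not part of this patch; the helper
 * names u64, ror64 and sha512_round are made up):
 *
 *   typedef unsigned long long u64;
 *
 *   static u64 ror64(u64 x, unsigned n) { return (x >> n) | (x << (64 - n)); }
 *
 *   // s[0..7] = a..h; wk = K[t] + W[t], as prestored in the frame_XFER slot
 *   static void sha512_round(u64 s[8], u64 wk)
 *   {
 *     u64 a = s[0], b = s[1], c = s[2], d = s[3];
 *     u64 e = s[4], f = s[5], g = s[6], h = s[7];
 *     u64 sum1 = ror64(e, 14) ^ ror64(e, 18) ^ ror64(e, 41);
 *     u64 ch   = (e & f) + (~e & g);       // the two halves never overlap, so + == ^
 *     u64 sum0 = ror64(a, 28) ^ ror64(a, 34) ^ ror64(a, 39);
 *     u64 maj  = (a & b) + (c & (a ^ b));  // same disjointness trick as in the comment above
 *     h += sum1 + ch + wk;
 *     d += h;
 *     h += sum0 + maj;
 *     // rotate the state for the next round: (a..h) <- (h, a, b, c, d, e, f, g)
 *     s[0] = h; s[1] = a; s[2] = b; s[3] = c;
 *     s[4] = d; s[5] = e; s[6] = f; s[7] = g;
 *   }
 */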
rorx T1, a, 34 xor y0, y1 lea h, [h + y3] .endm .macro ONE_ROUND_PART2 rorx y2, a, 39 rorx y1, e, 14 mov y3, a xor T1, y2 xor y0, y1 xor y3, b lea h, [h + y0] mov y0, a rorx y2, a, 28 add d, h and y3, c xor T1, y2 lea h, [h + y3] lea h, [h + T1] and y0, b lea h, [h + y0] .endm .macro ONE_ROUND XFER ONE_ROUND_PART1 \XFER ONE_ROUND_PART2 .endm .macro FOUR_ROUNDS_AND_SCHED X /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /* Extract w[t-7] */ MY_VPALIGNR YTMP0, Y_3, Y_2, 8 /* YTMP0 = W[-7] */ /* Calculate w[t-16] + w[t-7] */ vpaddq YTMP0, YTMP0, Y_0 /* YTMP0 = W[-7] + W[-16] */ /* Extract w[t-15] */ MY_VPALIGNR YTMP1, Y_1, Y_0, 8 /* YTMP1 = W[-15] */ /* Calculate sigma0 */ /* Calculate w[t-15] ror 1 */ vpsrlq YTMP2, YTMP1, 1 vpsllq YTMP3, YTMP1, (64-1) vpor YTMP3, YTMP3, YTMP2 /* YTMP3 = W[-15] ror 1 */ /* Calculate w[t-15] shr 7 */ vpsrlq YTMP4, YTMP1, 7 /* YTMP4 = W[-15] >> 7 */ ONE_ROUND rsp+frame_XFER+0*8+\X*32 RotateState /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /*;;;;;;;;;;;;;;;;;;;;;;;;; */ /* Calculate w[t-15] ror 8 */ vpsrlq YTMP2, YTMP1, 8 vpsllq YTMP1, YTMP1, (64-8) vpor YTMP1, YTMP1, YTMP2 /* YTMP1 = W[-15] ror 8 */ /* XOR the three components */ vpxor YTMP3, YTMP3, YTMP4 /* YTMP3 = W[-15] ror 1 ^ W[-15] >> 7 */ vpxor YTMP1, YTMP3, YTMP1 /* YTMP1 = s0 */ /* Add three components, w[t-16], w[t-7] and sigma0 */ vpaddq YTMP0, YTMP0, YTMP1 /* YTMP0 = W[-16] + W[-7] + s0 */ /* Move to appropriate lanes for calculating w[16] and w[17] */ vperm2f128 Y_0, YTMP0, YTMP0, 0x0 /* Y_0 = W[-16] + W[-7] + s0 {BABA} */ /* Move to appropriate lanes for calculating w[18] and w[19] */ vpand YTMP0, YTMP0, MASK_YMM_LO /* YTMP0 = W[-16] + W[-7] + s0 {DC00} */ /* Calculate w[16] and w[17] in both 128 bit lanes */ /* Calculate sigma1 for w[16] and w[17] on both 128 bit lanes */ vperm2f128 YTMP2, Y_3, Y_3, 0x11 /* YTMP2 = W[-2] {BABA} */ vpsrlq YTMP4, YTMP2, 6 /* YTMP4 = W[-2] >> 6 {BABA} */ ONE_ROUND rsp+frame_XFER+1*8+\X*32 RotateState /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /*;;;;;;;;;;;;;;;;;;;;;;;;; */ vpsrlq YTMP3, YTMP2, 19 /* YTMP3 = W[-2] >> 19 {BABA} */ vpsllq YTMP1, YTMP2, (64-19) /* YTMP1 = W[-2] << 19 {BABA} */ vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 19 {BABA} */ vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} */ vpsrlq YTMP3, YTMP2, 61 /* YTMP3 = W[-2] >> 61 {BABA} */ vpsllq YTMP1, YTMP2, (64-61) /* YTMP1 = W[-2] << 61 {BABA} */ vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 61 {BABA} */ vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} */ /* Add sigma1 to the other compunents to get w[16] and w[17] */ vpaddq Y_0, Y_0, YTMP4 /* Y_0 = {W[1], W[0], W[1], W[0]} */ /* Calculate sigma1 for w[18] and w[19] for upper 128 bit lane */ vpsrlq YTMP4, Y_0, 6 /* YTMP4 = W[-2] >> 6 {DC--} */ ONE_ROUND rsp+frame_XFER+2*8+\X*32 RotateState /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /*;;;;;;;;;;;;;;;;;;;;;;;;; */ vpsrlq YTMP3, Y_0, 19 /* YTMP3 = W[-2] >> 19 {DC--} */ vpsllq YTMP1, Y_0, (64-19) /* YTMP1 = W[-2] << 19 {DC--} */ vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 19 {DC--} */ vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} */ vpsrlq YTMP3, Y_0, 61 /* YTMP3 = W[-2] >> 61 {DC--} */ vpsllq YTMP1, Y_0, (64-61) /* YTMP1 = W[-2] << 61 {DC--} */ vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 61 {DC--} */ vpxor YTMP4, YTMP4, 
YTMP3 /* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {DC--} */ /* Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] to newly calculated sigma1 to get w[18] and w[19] */ vpaddq YTMP2, YTMP0, YTMP4 /* YTMP2 = {W[3], W[2], --, --} */ /* Form w[19, w[18], w17], w[16] */ vpblendd Y_0, Y_0, YTMP2, 0xF0 /* Y_0 = {W[3], W[2], W[1], W[0]} */ ONE_ROUND_PART1 rsp+frame_XFER+3*8+\X*32 vpaddq XFER, Y_0, [TBL + (4+\X)*32] vmovdqa [rsp + frame_XFER + \X*32], XFER ONE_ROUND_PART2 RotateState rotate_Ys .endm .macro DO_4ROUNDS X /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ ONE_ROUND rsp+frame_XFER+0*8+\X*32 RotateState /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ ONE_ROUND rsp+frame_XFER+1*8+\X*32 RotateState /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ ONE_ROUND rsp+frame_XFER+2*8+\X*32 RotateState /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ ONE_ROUND rsp+frame_XFER+3*8+\X*32 RotateState .endm /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; void sha512_rorx(const void* M, void* D, uint64_t L); ; Purpose: Updates the SHA512 digest stored at D with the message stored in M. ; The size of the message pointed to by M must be an integer multiple of SHA512 ; message blocks. ; L is the message length in SHA512 blocks */ .globl _gcry_sha512_transform_amd64_avx2 ELF(.type _gcry_sha512_transform_amd64_avx2,@function;) .align 16 _gcry_sha512_transform_amd64_avx2: + CFI_STARTPROC() xor eax, eax cmp rdx, 0 je .Lnowork vzeroupper /* Allocate Stack Space */ mov rax, rsp + CFI_DEF_CFA_REGISTER(rax); sub rsp, frame_size and rsp, ~(0x40 - 1) mov [rsp + frame_RSPSAVE], rax + CFI_CFA_ON_STACK(frame_RSPSAVE, 0) /* Save GPRs */ mov [rsp + frame_GPRSAVE + 8 * 0], rbp mov [rsp + frame_GPRSAVE + 8 * 1], rbx mov [rsp + frame_GPRSAVE + 8 * 2], r12 mov [rsp + frame_GPRSAVE + 8 * 3], r13 mov [rsp + frame_GPRSAVE + 8 * 4], r14 mov [rsp + frame_GPRSAVE + 8 * 5], r15 + CFI_REG_ON_STACK(rbp, frame_GPRSAVE + 8 * 0) + CFI_REG_ON_STACK(rbx, frame_GPRSAVE + 8 * 1) + CFI_REG_ON_STACK(r12, frame_GPRSAVE + 8 * 2) + CFI_REG_ON_STACK(r13, frame_GPRSAVE + 8 * 3) + CFI_REG_ON_STACK(r14, frame_GPRSAVE + 8 * 4) + CFI_REG_ON_STACK(r15, frame_GPRSAVE + 8 * 5) mov [rsp + frame_NBLKS], NUM_BLKS /*; load initial digest */ mov a,[8*0 + CTX] mov b,[8*1 + CTX] mov c,[8*2 + CTX] mov d,[8*3 + CTX] mov e,[8*4 + CTX] mov f,[8*5 + CTX] mov g,[8*6 + CTX] mov h,[8*7 + CTX] vmovdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP] vmovdqa MASK_YMM_LO, [.LMASK_YMM_LO ADD_RIP] lea TBL,[.LK512 ADD_RIP] /*; byte swap first 16 dwords */ COPY_YMM_AND_BSWAP Y_0, [INP + 0*32], BYTE_FLIP_MASK COPY_YMM_AND_BSWAP Y_1, [INP + 1*32], BYTE_FLIP_MASK COPY_YMM_AND_BSWAP Y_2, [INP + 2*32], BYTE_FLIP_MASK COPY_YMM_AND_BSWAP Y_3, [INP + 3*32], BYTE_FLIP_MASK add INP, 128 mov [rsp + frame_INP], INP vpaddq XFER, Y_0, [TBL + 0*32] vmovdqa [rsp + frame_XFER + 0*32], XFER vpaddq XFER, Y_1, [TBL + 1*32] vmovdqa [rsp + frame_XFER + 1*32], XFER vpaddq XFER, Y_2, [TBL + 2*32] vmovdqa [rsp + frame_XFER + 2*32], XFER vpaddq XFER, Y_3, [TBL + 3*32] vmovdqa [rsp + frame_XFER + 3*32], XFER /*; schedule 64 input dwords, by doing 12 rounds of 4 each */ movq [rsp + frame_SRND],4 .align 16 .Loop0: FOUR_ROUNDS_AND_SCHED 0 FOUR_ROUNDS_AND_SCHED 1 FOUR_ROUNDS_AND_SCHED 2 FOUR_ROUNDS_AND_SCHED 3 add 
TBL, 4*32 subq [rsp + frame_SRND], 1 jne .Loop0 subq [rsp + frame_NBLKS], 1 je .Ldone_hash mov INP, [rsp + frame_INP] lea TBL,[.LK512 ADD_RIP] /* load next block and byte swap */ COPY_YMM_AND_BSWAP Y_0, [INP + 0*32], BYTE_FLIP_MASK COPY_YMM_AND_BSWAP Y_1, [INP + 1*32], BYTE_FLIP_MASK COPY_YMM_AND_BSWAP Y_2, [INP + 2*32], BYTE_FLIP_MASK COPY_YMM_AND_BSWAP Y_3, [INP + 3*32], BYTE_FLIP_MASK add INP, 128 mov [rsp + frame_INP], INP DO_4ROUNDS 0 vpaddq XFER, Y_0, [TBL + 0*32] vmovdqa [rsp + frame_XFER + 0*32], XFER DO_4ROUNDS 1 vpaddq XFER, Y_1, [TBL + 1*32] vmovdqa [rsp + frame_XFER + 1*32], XFER DO_4ROUNDS 2 vpaddq XFER, Y_2, [TBL + 2*32] vmovdqa [rsp + frame_XFER + 2*32], XFER DO_4ROUNDS 3 vpaddq XFER, Y_3, [TBL + 3*32] vmovdqa [rsp + frame_XFER + 3*32], XFER addm [8*0 + CTX],a addm [8*1 + CTX],b addm [8*2 + CTX],c addm [8*3 + CTX],d addm [8*4 + CTX],e addm [8*5 + CTX],f addm [8*6 + CTX],g addm [8*7 + CTX],h /*; schedule 64 input dwords, by doing 12 rounds of 4 each */ movq [rsp + frame_SRND],4 jmp .Loop0 .Ldone_hash: vzeroall DO_4ROUNDS 0 vmovdqa [rsp + frame_XFER + 0*32], ymm0 /* burn stack */ DO_4ROUNDS 1 vmovdqa [rsp + frame_XFER + 1*32], ymm0 /* burn stack */ DO_4ROUNDS 2 vmovdqa [rsp + frame_XFER + 2*32], ymm0 /* burn stack */ DO_4ROUNDS 3 vmovdqa [rsp + frame_XFER + 3*32], ymm0 /* burn stack */ addm [8*0 + CTX],a xor eax, eax /* burn stack */ addm [8*1 + CTX],b addm [8*2 + CTX],c addm [8*3 + CTX],d addm [8*4 + CTX],e addm [8*5 + CTX],f addm [8*6 + CTX],g addm [8*7 + CTX],h /* Restore GPRs */ mov rbp, [rsp + frame_GPRSAVE + 8 * 0] mov rbx, [rsp + frame_GPRSAVE + 8 * 1] mov r12, [rsp + frame_GPRSAVE + 8 * 2] mov r13, [rsp + frame_GPRSAVE + 8 * 3] mov r14, [rsp + frame_GPRSAVE + 8 * 4] mov r15, [rsp + frame_GPRSAVE + 8 * 5] + CFI_RESTORE(rbp) + CFI_RESTORE(rbx) + CFI_RESTORE(r12) + CFI_RESTORE(r13) + CFI_RESTORE(r14) + CFI_RESTORE(r15) /* Restore Stack Pointer */ mov rsp, [rsp + frame_RSPSAVE] + CFI_DEF_CFA_REGISTER(rsp) + .Lnowork: ret + CFI_ENDPROC() /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /*;; Binary Data */ .align 64 /* K[t] used in SHA512 hashing */ .LK512: .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 
0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .align 32 /* Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. */ .LPSHUFFLE_BYTE_FLIP_MASK: .octa 0x08090a0b0c0d0e0f0001020304050607 .octa 0x18191a1b1c1d1e1f1011121314151617 .LMASK_YMM_LO: .octa 0x00000000000000000000000000000000 .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF #endif #endif diff --git a/cipher/sha512-ssse3-amd64.S b/cipher/sha512-ssse3-amd64.S index 8e950e0e..39bfe362 100644 --- a/cipher/sha512-ssse3-amd64.S +++ b/cipher/sha512-ssse3-amd64.S @@ -1,432 +1,436 @@ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright (c) 2012, Intel Corporation ; ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are ; met: ; ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in the ; documentation and/or other materials provided with the ; distribution. ; ; * Neither the name of the Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived from ; this software without specific prior written permission. ; ; ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR ; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /* * Conversion to GAS assembly and integration to libgcrypt * by Jussi Kivilinna * * Note: original implementation was named as SHA512-SSE4. However, only SSSE3 * is required. */ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA512) -#ifdef __PIC__ -# define ADD_RIP +rip -#else -# define ADD_RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif +#include "asm-common-amd64.h" .intel_syntax noprefix .text /* Virtual Registers */ msg = rdi /* ARG1 */ digest = rsi /* ARG2 */ msglen = rdx /* ARG3 */ T1 = rcx T2 = r8 a_64 = r9 b_64 = r10 c_64 = r11 d_64 = r12 e_64 = r13 f_64 = r14 g_64 = r15 h_64 = rbx tmp0 = rax /* ; Local variables (stack frame) ; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP */ frame_W = 0 /* Message Schedule */ frame_W_size = (80 * 8) frame_WK = ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */ frame_WK_size = (2 * 8) frame_GPRSAVE = ((frame_WK) + (frame_WK_size)) frame_GPRSAVE_size = (5 * 8) frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size)) /* Useful QWORD "arrays" for simpler memory references */ #define MSG(i) msg + 8*(i) /* Input message (arg1) */ #define DIGEST(i) digest + 8*(i) /* Output Digest (arg2) */ #define K_t(i) .LK512 + 8*(i) ADD_RIP /* SHA Constants (static mem) */ #define W_t(i) rsp + frame_W + 8*(i) /* Message Schedule (stack frame) */ #define WK_2(i) rsp + frame_WK + 8*((i) % 2) /* W[t]+K[t] (stack frame) */ /* MSG, DIGEST, K_t, W_t are arrays */ /* WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even */ .macro RotateState /* Rotate symbles a..h right */ __TMP = h_64 h_64 = g_64 g_64 = f_64 f_64 = e_64 e_64 = d_64 d_64 = c_64 c_64 = b_64 b_64 = a_64 a_64 = __TMP .endm .macro SHA512_Round t /* Compute Round %%t */ mov T1, f_64 /* T1 = f */ mov tmp0, e_64 /* tmp = e */ xor T1, g_64 /* T1 = f ^ g */ ror tmp0, 23 /* 41 ; tmp = e ror 23 */ and T1, e_64 /* T1 = (f ^ g) & e */ xor tmp0, e_64 /* tmp = (e ror 23) ^ e */ xor T1, g_64 /* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */ add T1, [WK_2(\t)] /* W[t] + K[t] from message scheduler */ ror tmp0, 4 /* 18 ; tmp = ((e ror 23) ^ e) ror 4 */ xor tmp0, e_64 /* tmp = (((e ror 23) ^ e) ror 4) ^ e */ mov T2, a_64 /* T2 = a */ add T1, h_64 /* T1 = CH(e,f,g) + W[t] + K[t] + h */ ror tmp0, 14 /* 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */ add T1, tmp0 /* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */ mov tmp0, a_64 /* tmp = a */ xor T2, c_64 /* T2 = a ^ c */ and tmp0, c_64 /* tmp = a & c */ and T2, b_64 /* T2 = (a ^ c) & b */ xor T2, tmp0 /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */ mov tmp0, a_64 /* tmp = a */ ror tmp0, 5 /* 39 ; tmp = a ror 5 */ xor tmp0, a_64 /* tmp = (a ror 5) ^ a */ add d_64, T1 /* e(next_state) = d + T1 */ ror tmp0, 6 /* 34 ; tmp = ((a ror 5) ^ a) ror 6 */ xor tmp0, a_64 /* tmp = (((a ror 5) ^ a) ror 6) ^ a */ lea h_64, [T1 + T2] /* a(next_state) = T1 + Maj(a,b,c) */ ror tmp0, 28 /* 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */ add h_64, tmp0 /* a(next_state) = T1 + Maj(a,b,c) S0(a) */ RotateState .endm .macro SHA512_2Sched_2Round_sse t /* ; Compute rounds %%t-2 and %%t-1 ; Compute message schedule QWORDS %%t and %%t+1 ; Two rounds are computed based on the values for K[t-2]+W[t-2] and ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message ; scheduler. ; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. ; They are then added to their respective SHA512 constants at ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] ; For brievity, the comments following vectored instructions only refer to ; the first of a pair of QWORDS. ; Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]} ; The computation of the message schedule and the rounds are tightly ; stitched to take advantage of instruction-level parallelism. ; For clarity, integer instructions (for the rounds calculation) are indented ; by one tab. 
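; For reference, the recurrence computed by the vectored half is
;     W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
; with s0(x) = (x ror 1) ^ (x ror 8) ^ (x >> 7) and
;      s1(x) = (x ror 19) ^ (x ror 61) ^ (x >> 6).
; SSE2 has no packed 64-bit rotate, so each rotate is synthesized from a
; left/right shift pair, and the shifted terms are accumulated by sharing
; intermediate shift-and-xor results rather than being computed independently.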
Vectored instructions (for the message scheduler) are indented ; by two tabs. */ mov T1, f_64 movdqa xmm2, [W_t(\t-2)] /* XMM2 = W[t-2] */ xor T1, g_64 and T1, e_64 movdqa xmm0, xmm2 /* XMM0 = W[t-2] */ xor T1, g_64 add T1, [WK_2(\t)] movdqu xmm5, [W_t(\t-15)] /* XMM5 = W[t-15] */ mov tmp0, e_64 ror tmp0, 23 /* 41 */ movdqa xmm3, xmm5 /* XMM3 = W[t-15] */ xor tmp0, e_64 ror tmp0, 4 /* 18 */ psrlq xmm0, 61 - 19 /* XMM0 = W[t-2] >> 42 */ xor tmp0, e_64 ror tmp0, 14 /* 14 */ psrlq xmm3, (8 - 7) /* XMM3 = W[t-15] >> 1 */ add T1, tmp0 add T1, h_64 pxor xmm0, xmm2 /* XMM0 = (W[t-2] >> 42) ^ W[t-2] */ mov T2, a_64 xor T2, c_64 pxor xmm3, xmm5 /* XMM3 = (W[t-15] >> 1) ^ W[t-15] */ and T2, b_64 mov tmp0, a_64 psrlq xmm0, 19 - 6 /* XMM0 = ((W[t-2]>>42)^W[t-2])>>13 */ and tmp0, c_64 xor T2, tmp0 psrlq xmm3, (7 - 1) /* XMM3 = ((W[t-15]>>1)^W[t-15])>>6 */ mov tmp0, a_64 ror tmp0, 5 /* 39 */ pxor xmm0, xmm2 /* XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2] */ xor tmp0, a_64 ror tmp0, 6 /* 34 */ pxor xmm3, xmm5 /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15] */ xor tmp0, a_64 ror tmp0, 28 /* 28 */ psrlq xmm0, 6 /* XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6 */ add T2, tmp0 add d_64, T1 psrlq xmm3, 1 /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1 */ lea h_64, [T1 + T2] RotateState movdqa xmm1, xmm2 /* XMM1 = W[t-2] */ mov T1, f_64 xor T1, g_64 movdqa xmm4, xmm5 /* XMM4 = W[t-15] */ and T1, e_64 xor T1, g_64 psllq xmm1, (64 - 19) - (64 - 61) /* XMM1 = W[t-2] << 42 */ add T1, [WK_2(\t+1)] mov tmp0, e_64 psllq xmm4, (64 - 1) - (64 - 8) /* XMM4 = W[t-15] << 7 */ ror tmp0, 23 /* 41 */ xor tmp0, e_64 pxor xmm1, xmm2 /* XMM1 = (W[t-2] << 42)^W[t-2] */ ror tmp0, 4 /* 18 */ xor tmp0, e_64 pxor xmm4, xmm5 /* XMM4 = (W[t-15]<<7)^W[t-15] */ ror tmp0, 14 /* 14 */ add T1, tmp0 psllq xmm1, (64 - 61) /* XMM1 = ((W[t-2] << 42)^W[t-2])<<3 */ add T1, h_64 mov T2, a_64 psllq xmm4, (64 - 8) /* XMM4 = ((W[t-15]<<7)^W[t-15])<<56 */ xor T2, c_64 and T2, b_64 pxor xmm0, xmm1 /* XMM0 = s1(W[t-2]) */ mov tmp0, a_64 and tmp0, c_64 movdqu xmm1, [W_t(\t- 7)] /* XMM1 = W[t-7] */ xor T2, tmp0 pxor xmm3, xmm4 /* XMM3 = s0(W[t-15]) */ mov tmp0, a_64 paddq xmm0, xmm3 /* XMM0 = s1(W[t-2]) + s0(W[t-15]) */ ror tmp0, 5 /* 39 */ paddq xmm0, [W_t(\t-16)] /* XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16] */ xor tmp0, a_64 paddq xmm0, xmm1 /* XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */ ror tmp0, 6 /* 34 */ movdqa [W_t(\t)], xmm0 /* Store scheduled qwords */ xor tmp0, a_64 paddq xmm0, [K_t(t)] /* Compute W[t]+K[t] */ ror tmp0, 28 /* 28 */ movdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] for next rounds */ add T2, tmp0 add d_64, T1 lea h_64, [T1 + T2] RotateState .endm /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; void sha512_sse4(const void* M, void* D, uint64_t L); ; Purpose: Updates the SHA512 digest stored at D with the message stored in M. ; The size of the message pointed to by M must be an integer multiple of SHA512 ; message blocks. ; L is the message length in SHA512 blocks. 
*/ .globl _gcry_sha512_transform_amd64_ssse3 ELF(.type _gcry_sha512_transform_amd64_ssse3,@function;) .align 16 _gcry_sha512_transform_amd64_ssse3: + CFI_STARTPROC() xor eax, eax cmp msglen, 0 je .Lnowork /* Allocate Stack Space */ sub rsp, frame_size + CFI_ADJUST_CFA_OFFSET(frame_size); /* Save GPRs */ mov [rsp + frame_GPRSAVE + 8 * 0], rbx mov [rsp + frame_GPRSAVE + 8 * 1], r12 mov [rsp + frame_GPRSAVE + 8 * 2], r13 mov [rsp + frame_GPRSAVE + 8 * 3], r14 mov [rsp + frame_GPRSAVE + 8 * 4], r15 + CFI_REL_OFFSET(rbx, frame_GPRSAVE + 8 * 0); + CFI_REL_OFFSET(r12, frame_GPRSAVE + 8 * 1); + CFI_REL_OFFSET(r13, frame_GPRSAVE + 8 * 2); + CFI_REL_OFFSET(r14, frame_GPRSAVE + 8 * 3); + CFI_REL_OFFSET(r15, frame_GPRSAVE + 8 * 4); .Lupdateblock: /* Load state variables */ mov a_64, [DIGEST(0)] mov b_64, [DIGEST(1)] mov c_64, [DIGEST(2)] mov d_64, [DIGEST(3)] mov e_64, [DIGEST(4)] mov f_64, [DIGEST(5)] mov g_64, [DIGEST(6)] mov h_64, [DIGEST(7)] t = 0 .rept 80/2 + 1 /* (80 rounds) / (2 rounds/iteration) + (1 iteration) */ /* +1 iteration because the scheduler leads hashing by 1 iteration */ .if t < 2 /* BSWAP 2 QWORDS */ movdqa xmm1, [.LXMM_QWORD_BSWAP ADD_RIP] movdqu xmm0, [MSG(t)] pshufb xmm0, xmm1 /* BSWAP */ movdqa [W_t(t)], xmm0 /* Store Scheduled Pair */ paddq xmm0, [K_t(t)] /* Compute W[t]+K[t] */ movdqa [WK_2(t)], xmm0 /* Store into WK for rounds */ .elseif t < 16 /* BSWAP 2 QWORDS; Compute 2 Rounds */ movdqu xmm0, [MSG(t)] pshufb xmm0, xmm1 /* BSWAP */ SHA512_Round (t - 2) /* Round t-2 */ movdqa [W_t(t)], xmm0 /* Store Scheduled Pair */ paddq xmm0, [K_t(t)] /* Compute W[t]+K[t] */ SHA512_Round (t - 1) /* Round t-1 */ movdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] into WK */ .elseif t < 79 /* Schedule 2 QWORDS; Compute 2 Rounds */ SHA512_2Sched_2Round_sse t .else /* Compute 2 Rounds */ SHA512_Round (t - 2) SHA512_Round (t - 1) .endif t = (t)+2 .endr /* Update digest */ add [DIGEST(0)], a_64 add [DIGEST(1)], b_64 add [DIGEST(2)], c_64 add [DIGEST(3)], d_64 add [DIGEST(4)], e_64 add [DIGEST(5)], f_64 add [DIGEST(6)], g_64 add [DIGEST(7)], h_64 /* Advance to next message block */ add msg, 16*8 dec msglen jnz .Lupdateblock /* Restore GPRs */ mov rbx, [rsp + frame_GPRSAVE + 8 * 0] mov r12, [rsp + frame_GPRSAVE + 8 * 1] mov r13, [rsp + frame_GPRSAVE + 8 * 2] mov r14, [rsp + frame_GPRSAVE + 8 * 3] mov r15, [rsp + frame_GPRSAVE + 8 * 4] + CFI_RESTORE(rbx) + CFI_RESTORE(r12) + CFI_RESTORE(r13) + CFI_RESTORE(r14) + CFI_RESTORE(r15) pxor xmm0, xmm0 pxor xmm1, xmm1 pxor xmm2, xmm2 pxor xmm3, xmm3 pxor xmm4, xmm4 pxor xmm5, xmm5 /* Burn stack */ t = 0 .rept frame_W_size / 16 movdqu [rsp + frame_W + (t) * 16], xmm0 t = ((t)+1) .endr movdqu [rsp + frame_WK], xmm0 xor eax, eax /* Restore Stack Pointer */ add rsp, frame_size + CFI_ADJUST_CFA_OFFSET(-frame_size); .Lnowork: ret + CFI_ENDPROC() /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Binary Data */ .align 16 /* Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. 
*/ .LXMM_QWORD_BSWAP: .octa 0x08090a0b0c0d0e0f0001020304050607 /* K[t] used in SHA512 hashing */ .LK512: .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 #endif #endif diff --git a/cipher/twofish-amd64.S b/cipher/twofish-amd64.S index 134d6401..3cb73431 100644 --- a/cipher/twofish-amd64.S +++ b/cipher/twofish-amd64.S @@ -1,1066 +1,1184 @@ /* twofish-amd64.S - AMD64 assembly implementation of Twofish cipher * * Copyright (C) 2013-2015 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
*/ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_TWOFISH) #include "asm-common-amd64.h" .text /* structure of TWOFISH_context: */ #define s0 0 #define s1 ((s0) + 4 * 256) #define s2 ((s1) + 4 * 256) #define s3 ((s2) + 4 * 256) #define w ((s3) + 4 * 256) #define k ((w) + 4 * 8) /* register macros */ #define CTX %rdi #define RA %rax #define RB %rbx #define RC %rcx #define RD %rdx #define RAd %eax #define RBd %ebx #define RCd %ecx #define RDd %edx #define RAbl %al #define RBbl %bl #define RCbl %cl #define RDbl %dl #define RAbh %ah #define RBbh %bh #define RCbh %ch #define RDbh %dh #define RX %r8 #define RY %r9 #define RXd %r8d #define RYd %r9d #define RT0 %rsi #define RT1 %rbp #define RT2 %r10 #define RT3 %r11 #define RT0d %esi #define RT1d %ebp #define RT2d %r10d #define RT3d %r11d /*********************************************************************** * AMD64 assembly implementation of the Twofish cipher ***********************************************************************/ #define enc_g1_2(a, b, x, y) \ movzbl b ## bl, RT3d; \ movzbl b ## bh, RT1d; \ movzbl a ## bl, RT2d; \ movzbl a ## bh, RT0d; \ rorl $16, b ## d; \ rorl $16, a ## d; \ movl s1(CTX, RT3, 4), RYd; \ movzbl b ## bl, RT3d; \ movl s0(CTX, RT2, 4), RXd; \ movzbl a ## bl, RT2d; \ xorl s2(CTX, RT1, 4), RYd; \ movzbl b ## bh, RT1d; \ xorl s1(CTX, RT0, 4), RXd; \ movzbl a ## bh, RT0d; \ rorl $16, b ## d; \ rorl $16, a ## d; \ xorl s3(CTX, RT3, 4), RYd; \ xorl s2(CTX, RT2, 4), RXd; \ xorl s0(CTX, RT1, 4), RYd; \ xorl s3(CTX, RT0, 4), RXd; #define dec_g1_2(a, b, x, y) \ movzbl a ## bl, RT2d; \ movzbl a ## bh, RT0d; \ movzbl b ## bl, RT3d; \ movzbl b ## bh, RT1d; \ rorl $16, a ## d; \ rorl $16, b ## d; \ movl s0(CTX, RT2, 4), RXd; \ movzbl a ## bl, RT2d; \ movl s1(CTX, RT3, 4), RYd; \ movzbl b ## bl, RT3d; \ xorl s1(CTX, RT0, 4), RXd; \ movzbl a ## bh, RT0d; \ xorl s2(CTX, RT1, 4), RYd; \ movzbl b ## bh, RT1d; \ rorl $16, a ## d; \ rorl $16, b ## d; \ xorl s2(CTX, RT2, 4), RXd; \ xorl s3(CTX, RT3, 4), RYd; \ xorl s3(CTX, RT0, 4), RXd; \ xorl s0(CTX, RT1, 4), RYd; #define encrypt_round(ra, rb, rc, rd, n) \ enc_g1_2(##ra, ##rb, RX, RY); \ \ leal (RXd, RYd, 2), RT0d; \ addl RYd, RXd; \ addl (k + 8 * (n) + 4)(CTX), RT0d; \ roll $1, rd ## d; \ addl (k + 8 * (n))(CTX), RXd; \ xorl RT0d, rd ## d; \ xorl RXd, rc ## d; \ rorl $1, rc ## d; #define decrypt_round(ra, rb, rc, rd, n) \ dec_g1_2(##ra, ##rb, RX, RY); \ \ leal (RXd, RYd, 2), RT0d; \ addl RYd, RXd; \ addl (k + 8 * (n) + 4)(CTX), RT0d; \ roll $1, rc ## d; \ addl (k + 8 * (n))(CTX), RXd; \ xorl RXd, rc ## d; \ xorl RT0d, rd ## d; \ rorl $1, rd ## d; #define encrypt_cycle(a, b, c, d, nc) \ encrypt_round(##a, ##b, ##c, ##d, (nc) * 2); \ encrypt_round(##c, ##d, ##a, ##b, (nc) * 2 + 1); #define decrypt_cycle(a, b, c, d, nc) \ decrypt_round(##c, ##d, ##a, ##b, (nc) * 2 + 1); \ decrypt_round(##a, ##b, ##c, ##d, (nc) * 2); #define inpack(in, n, x, m) \ movl (4 * (n))(in), x; \ xorl (w + 4 * (m))(CTX), x; #define outunpack(out, n, x, m) \ xorl (w + 4 * (m))(CTX), x; \ movl x, (4 * (n))(out); .align 8 .globl _gcry_twofish_amd64_encrypt_block ELF(.type _gcry_twofish_amd64_encrypt_block,@function;) _gcry_twofish_amd64_encrypt_block: /* input: * %rdi: context, CTX * %rsi: dst * %rdx: src */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 subq $(3 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(3 * 8); movq %rsi, (0 * 8)(%rsp); movq %rbp, (1 * 8)(%rsp); movq %rbx, (2 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 1 * 8); + 
CFI_REL_OFFSET(%rbx, 2 * 8); movq %rdx, RX; inpack(RX, 0, RAd, 0); inpack(RX, 1, RBd, 1); inpack(RX, 2, RCd, 2); inpack(RX, 3, RDd, 3); encrypt_cycle(RA, RB, RC, RD, 0); encrypt_cycle(RA, RB, RC, RD, 1); encrypt_cycle(RA, RB, RC, RD, 2); encrypt_cycle(RA, RB, RC, RD, 3); encrypt_cycle(RA, RB, RC, RD, 4); encrypt_cycle(RA, RB, RC, RD, 5); encrypt_cycle(RA, RB, RC, RD, 6); encrypt_cycle(RA, RB, RC, RD, 7); movq (0 * 8)(%rsp), RX; /*dst*/ outunpack(RX, 0, RCd, 4); outunpack(RX, 1, RDd, 5); outunpack(RX, 2, RAd, 6); outunpack(RX, 3, RBd, 7); movq (2 * 8)(%rsp), %rbx; movq (1 * 8)(%rsp), %rbp; + CFI_RESTORE(%rbx); + CFI_RESTORE(%rbp); addq $(3 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-3 * 8); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block;) .align 8 .globl _gcry_twofish_amd64_decrypt_block ELF(.type _gcry_twofish_amd64_decrypt_block,@function;) _gcry_twofish_amd64_decrypt_block: /* input: * %rdi: context, CTX * %rsi: dst * %rdx: src */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 subq $(3 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(3 * 8); movq %rsi, (0 * 8)(%rsp); movq %rbp, (1 * 8)(%rsp); movq %rbx, (2 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 1 * 8); + CFI_REL_OFFSET(%rbx, 2 * 8); movq %rdx, RX; inpack(RX, 0, RCd, 4); inpack(RX, 1, RDd, 5); inpack(RX, 2, RAd, 6); inpack(RX, 3, RBd, 7); decrypt_cycle(RA, RB, RC, RD, 7); decrypt_cycle(RA, RB, RC, RD, 6); decrypt_cycle(RA, RB, RC, RD, 5); decrypt_cycle(RA, RB, RC, RD, 4); decrypt_cycle(RA, RB, RC, RD, 3); decrypt_cycle(RA, RB, RC, RD, 2); decrypt_cycle(RA, RB, RC, RD, 1); decrypt_cycle(RA, RB, RC, RD, 0); movq (0 * 8)(%rsp), RX; /*dst*/ outunpack(RX, 0, RAd, 0); outunpack(RX, 1, RBd, 1); outunpack(RX, 2, RCd, 2); outunpack(RX, 3, RDd, 3); movq (2 * 8)(%rsp), %rbx; movq (1 * 8)(%rsp), %rbp; + CFI_RESTORE(%rbx); + CFI_RESTORE(%rbp); addq $(3 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-3 * 8); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block;) #undef CTX #undef RA #undef RB #undef RC #undef RD #undef RAd #undef RBd #undef RCd #undef RDd #undef RAbl #undef RBbl #undef RCbl #undef RDbl #undef RAbh #undef RBbh #undef RCbh #undef RDbh #undef RX #undef RY #undef RXd #undef RYd #undef RT0 #undef RT1 #undef RT2 #undef RT3 #undef RT0d #undef RT1d #undef RT2d #undef RT3d /*********************************************************************** * AMD64 assembly implementation of the Twofish cipher, 3-way parallel ***********************************************************************/ #define CTX %rdi #define RIO %rdx #define RAB0 %rax #define RAB1 %rbx #define RAB2 %rcx #define RAB0d %eax #define RAB1d %ebx #define RAB2d %ecx #define RAB0bh %ah #define RAB1bh %bh #define RAB2bh %ch #define RAB0bl %al #define RAB1bl %bl #define RAB2bl %cl #define RCD0 %r8 #define RCD1 %r9 #define RCD2 %r10 #define RCD0d %r8d #define RCD1d %r9d #define RCD2d %r10d #define RX0 %rbp #define RX1 %r11 #define RX2 %r12 #define RX0d %ebp #define RX1d %r11d #define RX2d %r12d #define RY0 %r13 #define RY1 %r14 #define RY2 %r15 #define RY0d %r13d #define RY1d %r14d #define RY2d %r15d #define RT0 %rdx #define RT1 %rsi #define RT0d %edx #define RT1d %esi #define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) \ movzbl ab ## bl, tmp2 ## d; \ movzbl ab ## bh, tmp1 ## d; \ rorq $(rot), ab; \ op1##l T0(CTX, tmp2, 4), dst ## d; \ op2##l T1(CTX, tmp1, 4), dst ## d; /* * Combined G1 & G2 function. Reordered with help of rotates to have moves * at beginning. 
*/ #define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \ /* G1,1 && G2,1 */ \ do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 0, ab ## 0, x ## 0); \ do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 0, ab ## 0, y ## 0); \ \ do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 1, ab ## 1, x ## 1); \ do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 1, ab ## 1, y ## 1); \ \ do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 2, ab ## 2, x ## 2); \ do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 2, ab ## 2, y ## 2); \ \ /* G1,2 && G2,2 */ \ do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \ do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \ movq ab ## 0, RT0; \ movq cd ## 0, ab ## 0; \ movq RT0, cd ## 0; \ \ do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \ do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \ movq ab ## 1, RT0; \ movq cd ## 1, ab ## 1; \ movq RT0, cd ## 1; \ \ do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \ do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \ movq ab ## 2, RT0; \ movq cd ## 2, ab ## 2; \ movq RT0, cd ## 2; #define enc_round_end(ab, x, y, n) \ addl y ## d, x ## d; \ addl x ## d, y ## d; \ addl k+4*(2*(n))(CTX), x ## d; \ xorl ab ## d, x ## d; \ addl k+4*(2*(n)+1)(CTX), y ## d; \ shrq $32, ab; \ roll $1, ab ## d; \ xorl y ## d, ab ## d; \ shlq $32, ab; \ rorl $1, x ## d; \ orq x, ab; #define dec_round_end(ba, x, y, n) \ addl y ## d, x ## d; \ addl x ## d, y ## d; \ addl k+4*(2*(n))(CTX), x ## d; \ addl k+4*(2*(n)+1)(CTX), y ## d; \ xorl ba ## d, y ## d; \ shrq $32, ba; \ roll $1, ba ## d; \ xorl x ## d, ba ## d; \ shlq $32, ba; \ rorl $1, y ## d; \ orq y, ba; #define encrypt_round3(ab, cd, n) \ g1g2_3(ab, cd, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \ \ enc_round_end(ab ## 0, RX0, RY0, n); \ enc_round_end(ab ## 1, RX1, RY1, n); \ enc_round_end(ab ## 2, RX2, RY2, n); #define decrypt_round3(ba, dc, n) \ g1g2_3(ba, dc, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \ \ dec_round_end(ba ## 0, RX0, RY0, n); \ dec_round_end(ba ## 1, RX1, RY1, n); \ dec_round_end(ba ## 2, RX2, RY2, n); #define encrypt_cycle3(ab, cd, n) \ encrypt_round3(ab, cd, n*2); \ encrypt_round3(ab, cd, (n*2)+1); #define decrypt_cycle3(ba, dc, n) \ decrypt_round3(ba, dc, (n*2)+1); \ decrypt_round3(ba, dc, (n*2)); #define inpack3(xy, m) \ xorq w+4*m(CTX), xy ## 0; \ xorq w+4*m(CTX), xy ## 1; \ xorq w+4*m(CTX), xy ## 2; #define outunpack3(xy, m) \ xorq w+4*m(CTX), xy ## 0; \ xorq w+4*m(CTX), xy ## 1; \ xorq w+4*m(CTX), xy ## 2; #define inpack_enc3() \ inpack3(RAB, 0); \ inpack3(RCD, 2); #define outunpack_enc3() \ outunpack3(RAB, 6); \ outunpack3(RCD, 4); #define inpack_dec3() \ inpack3(RAB, 4); \ rorq $32, RAB0; \ rorq $32, RAB1; \ rorq $32, RAB2; \ inpack3(RCD, 6); \ rorq $32, RCD0; \ rorq $32, RCD1; \ rorq $32, RCD2; #define outunpack_dec3() \ rorq $32, RCD0; \ rorq $32, RCD1; \ rorq $32, RCD2; \ outunpack3(RCD, 0); \ rorq $32, RAB0; \ rorq $32, RAB1; \ rorq $32, RAB2; \ outunpack3(RAB, 2); .align 8 ELF(.type __twofish_enc_blk3,@function;) __twofish_enc_blk3: /* input: * %rdi: ctx, CTX * RAB0,RCD0,RAB1,RCD1,RAB2,RCD2: three plaintext blocks * output: * RCD0,RAB0,RCD1,RAB1,RCD2,RAB2: three ciphertext blocks */ + CFI_STARTPROC(); + inpack_enc3(); encrypt_cycle3(RAB, RCD, 0); encrypt_cycle3(RAB, RCD, 1); encrypt_cycle3(RAB, RCD, 2); encrypt_cycle3(RAB, RCD, 3); encrypt_cycle3(RAB, RCD, 4); encrypt_cycle3(RAB, RCD, 5); encrypt_cycle3(RAB, RCD, 6); encrypt_cycle3(RAB, RCD, 7); outunpack_enc3(); ret; + CFI_ENDPROC(); ELF(.size 
__twofish_enc_blk3,.-__twofish_enc_blk3;) .align 8 ELF(.type __twofish_dec_blk3,@function;) __twofish_dec_blk3: /* input: * %rdi: ctx, CTX * RAB0,RCD0,RAB1,RCD1,RAB2,RCD2: three ciphertext blocks * output: * RCD0,RAB0,RCD1,RAB1,RCD2,RAB2: three plaintext blocks */ + CFI_STARTPROC(); + inpack_dec3(); decrypt_cycle3(RAB, RCD, 7); decrypt_cycle3(RAB, RCD, 6); decrypt_cycle3(RAB, RCD, 5); decrypt_cycle3(RAB, RCD, 4); decrypt_cycle3(RAB, RCD, 3); decrypt_cycle3(RAB, RCD, 2); decrypt_cycle3(RAB, RCD, 1); decrypt_cycle3(RAB, RCD, 0); outunpack_dec3(); ret; + CFI_ENDPROC(); ELF(.size __twofish_dec_blk3,.-__twofish_dec_blk3;) .align 8 .globl _gcry_twofish_amd64_ctr_enc ELF(.type _gcry_twofish_amd64_ctr_enc,@function;) _gcry_twofish_amd64_ctr_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (3 blocks) * %rdx: src (3 blocks) * %rcx: iv (big endian, 128bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 subq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(8 * 8); movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); movq %r14, (4 * 8)(%rsp); movq %r15, (5 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 0 * 8); + CFI_REL_OFFSET(%rbx, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); + CFI_REL_OFFSET(%r14, 4 * 8); + CFI_REL_OFFSET(%r15, 5 * 8); movq %rsi, (6 * 8)(%rsp); movq %rdx, (7 * 8)(%rsp); movq %rcx, RX0; /* load IV and byteswap */ movq 8(RX0), RT0; movq 0(RX0), RT1; movq RT0, RCD0; movq RT1, RAB0; bswapq RT0; bswapq RT1; /* construct IVs */ movq RT0, RCD1; movq RT1, RAB1; movq RT0, RCD2; movq RT1, RAB2; addq $1, RCD1; adcq $0, RAB1; bswapq RCD1; bswapq RAB1; addq $2, RCD2; adcq $0, RAB2; bswapq RCD2; bswapq RAB2; addq $3, RT0; adcq $0, RT1; bswapq RT0; bswapq RT1; /* store new IV */ movq RT0, 8(RX0); movq RT1, 0(RX0); call __twofish_enc_blk3; movq (7 * 8)(%rsp), RX0; /*src*/ movq (6 * 8)(%rsp), RX1; /*dst*/ /* XOR key-stream with plaintext */ xorq (0 * 8)(RX0), RCD0; xorq (1 * 8)(RX0), RAB0; xorq (2 * 8)(RX0), RCD1; xorq (3 * 8)(RX0), RAB1; xorq (4 * 8)(RX0), RCD2; xorq (5 * 8)(RX0), RAB2; movq RCD0, (0 * 8)(RX1); movq RAB0, (1 * 8)(RX1); movq RCD1, (2 * 8)(RX1); movq RAB1, (3 * 8)(RX1); movq RCD2, (4 * 8)(RX1); movq RAB2, (5 * 8)(RX1); movq (0 * 8)(%rsp), %rbp; movq (1 * 8)(%rsp), %rbx; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; movq (4 * 8)(%rsp), %r14; movq (5 * 8)(%rsp), %r15; + CFI_RESTORE(%rbp); + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); addq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-8 * 8); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_ctr_enc,.-_gcry_twofish_amd64_ctr_enc;) .align 8 .globl _gcry_twofish_amd64_cbc_dec ELF(.type _gcry_twofish_amd64_cbc_dec,@function;) _gcry_twofish_amd64_cbc_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (3 blocks) * %rdx: src (3 blocks) * %rcx: iv (128bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 subq $(9 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(9 * 8); movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); movq %r14, (4 * 8)(%rsp); movq %r15, (5 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 0 * 8); + CFI_REL_OFFSET(%rbx, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); + CFI_REL_OFFSET(%r14, 4 * 8); + CFI_REL_OFFSET(%r15, 5 * 8); movq %rsi, (6 * 8)(%rsp); movq %rdx, (7 * 8)(%rsp); movq %rcx, (8 * 8)(%rsp); movq %rdx, RX0; /* load input */ movq (0 * 8)(RX0), RAB0; movq (1 * 8)(RX0), RCD0; movq (2 * 8)(RX0), RAB1; movq (3 * 8)(RX0), RCD1; movq (4 * 8)(RX0), RAB2; 
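
The 3-way CTR path above treats the 128-bit big-endian counter as two qwords: the addq/adcq pairs increment the byte-swapped value so a carry in the low half propagates into the high half, and IV+3 is written back as the new counter before the cipher call. Below is a minimal C sketch of that counter construction, assuming a GCC-style __builtin_bswap64 and using purely illustrative names; it is not libgcrypt's actual C code.

#include <stdint.h>
#include <string.h>

static uint64_t bswap64(uint64_t x) { return __builtin_bswap64(x); }

/* Produce blk[0..2] = IV, IV+1, IV+2 (big endian) and advance iv by 3. */
static void ctr_make_3_blocks(uint8_t blk[3][16], uint8_t iv[16])
{
  uint64_t hi, lo;
  int i;

  memcpy(&hi, iv + 0, 8);
  memcpy(&lo, iv + 8, 8);
  hi = bswap64(hi);                  /* host-endian counter halves */
  lo = bswap64(lo);

  for (i = 0; i < 3; i++)
    {
      uint64_t bhi = bswap64(hi), blo = bswap64(lo);
      memcpy(blk[i] + 0, &bhi, 8);   /* block i = big-endian IV+i */
      memcpy(blk[i] + 8, &blo, 8);
      if (++lo == 0)
        hi++;                        /* addq/adcq: carry into high qword */
    }

  hi = bswap64(hi);                  /* store IV+3 as the new counter */
  lo = bswap64(lo);
  memcpy(iv + 0, &hi, 8);
  memcpy(iv + 8, &lo, 8);
}

Block 0 reuses the IV bytes unchanged, which is why the assembly copies RT0/RT1 into RCD0/RAB0 before applying bswapq.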
movq (5 * 8)(RX0), RCD2; call __twofish_dec_blk3; movq (8 * 8)(%rsp), RT0; /*iv*/ movq (7 * 8)(%rsp), RX0; /*src*/ movq (6 * 8)(%rsp), RX1; /*dst*/ movq (4 * 8)(RX0), RY0; movq (5 * 8)(RX0), RY1; xorq (0 * 8)(RT0), RCD0; xorq (1 * 8)(RT0), RAB0; xorq (0 * 8)(RX0), RCD1; xorq (1 * 8)(RX0), RAB1; xorq (2 * 8)(RX0), RCD2; xorq (3 * 8)(RX0), RAB2; movq RY0, (0 * 8)(RT0); movq RY1, (1 * 8)(RT0); movq RCD0, (0 * 8)(RX1); movq RAB0, (1 * 8)(RX1); movq RCD1, (2 * 8)(RX1); movq RAB1, (3 * 8)(RX1); movq RCD2, (4 * 8)(RX1); movq RAB2, (5 * 8)(RX1); movq (0 * 8)(%rsp), %rbp; movq (1 * 8)(%rsp), %rbx; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; movq (4 * 8)(%rsp), %r14; movq (5 * 8)(%rsp), %r15; + CFI_RESTORE(%rbp); + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); addq $(9 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-9 * 8); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_cbc_dec,.-_gcry_twofish_amd64_cbc_dec;) .align 8 .globl _gcry_twofish_amd64_cfb_dec ELF(.type _gcry_twofish_amd64_cfb_dec,@function;) _gcry_twofish_amd64_cfb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (3 blocks) * %rdx: src (3 blocks) * %rcx: iv (128bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 subq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(8 * 8); movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); movq %r14, (4 * 8)(%rsp); movq %r15, (5 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 0 * 8); + CFI_REL_OFFSET(%rbx, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); + CFI_REL_OFFSET(%r14, 4 * 8); + CFI_REL_OFFSET(%r15, 5 * 8); movq %rsi, (6 * 8)(%rsp); movq %rdx, (7 * 8)(%rsp); movq %rdx, RX0; movq %rcx, RX1; /* load input */ movq (0 * 8)(RX1), RAB0; movq (1 * 8)(RX1), RCD0; movq (0 * 8)(RX0), RAB1; movq (1 * 8)(RX0), RCD1; movq (2 * 8)(RX0), RAB2; movq (3 * 8)(RX0), RCD2; /* Update IV */ movq (4 * 8)(RX0), RY0; movq (5 * 8)(RX0), RY1; movq RY0, (0 * 8)(RX1); movq RY1, (1 * 8)(RX1); call __twofish_enc_blk3; movq (7 * 8)(%rsp), RX0; /*src*/ movq (6 * 8)(%rsp), RX1; /*dst*/ xorq (0 * 8)(RX0), RCD0; xorq (1 * 8)(RX0), RAB0; xorq (2 * 8)(RX0), RCD1; xorq (3 * 8)(RX0), RAB1; xorq (4 * 8)(RX0), RCD2; xorq (5 * 8)(RX0), RAB2; movq RCD0, (0 * 8)(RX1); movq RAB0, (1 * 8)(RX1); movq RCD1, (2 * 8)(RX1); movq RAB1, (3 * 8)(RX1); movq RCD2, (4 * 8)(RX1); movq RAB2, (5 * 8)(RX1); movq (0 * 8)(%rsp), %rbp; movq (1 * 8)(%rsp), %rbx; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; movq (4 * 8)(%rsp), %r14; movq (5 * 8)(%rsp), %r15; + CFI_RESTORE(%rbp); + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); addq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-8 * 8); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_cfb_dec,.-_gcry_twofish_amd64_cfb_dec;) .align 8 .globl _gcry_twofish_amd64_ocb_enc ELF(.type _gcry_twofish_amd64_ocb_enc,@function;) _gcry_twofish_amd64_ocb_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (3 blocks) * %rdx: src (3 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[3]) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_6 subq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(8 * 8); movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); movq %r14, (4 * 8)(%rsp); movq %r15, (5 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 0 * 8); + CFI_REL_OFFSET(%rbx, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); + CFI_REL_OFFSET(%r14, 4 * 8); + CFI_REL_OFFSET(%r15, 5 * 8); movq 
%rsi, (6 * 8)(%rsp); movq %rdx, RX0; movq %rcx, RX1; movq %r8, RX2; movq %r9, RY0; movq %rsi, RY1; /* Load offset */ movq (0 * 8)(RX1), RT0; movq (1 * 8)(RX1), RT1; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ movq (RY0), RY2; xorq (0 * 8)(RY2), RT0; xorq (1 * 8)(RY2), RT1; movq (0 * 8)(RX0), RAB0; movq (1 * 8)(RX0), RCD0; /* Store Offset_i */ movq RT0, (0 * 8)(RY1); movq RT1, (1 * 8)(RY1); /* Checksum_i = Checksum_{i-1} xor P_i */ xor RAB0, (0 * 8)(RX2); xor RCD0, (1 * 8)(RX2); /* PX_i = P_i xor Offset_i */ xorq RT0, RAB0; xorq RT1, RCD0; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ movq 8(RY0), RY2; xorq (0 * 8)(RY2), RT0; xorq (1 * 8)(RY2), RT1; movq (2 * 8)(RX0), RAB1; movq (3 * 8)(RX0), RCD1; /* Store Offset_i */ movq RT0, (2 * 8)(RY1); movq RT1, (3 * 8)(RY1); /* Checksum_i = Checksum_{i-1} xor P_i */ xor RAB1, (0 * 8)(RX2); xor RCD1, (1 * 8)(RX2); /* PX_i = P_i xor Offset_i */ xorq RT0, RAB1; xorq RT1, RCD1; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ movq 16(RY0), RY2; xorq (0 * 8)(RY2), RT0; xorq (1 * 8)(RY2), RT1; movq (4 * 8)(RX0), RAB2; movq (5 * 8)(RX0), RCD2; /* Store Offset_i */ movq RT0, (4 * 8)(RY1); movq RT1, (5 * 8)(RY1); /* Checksum_i = Checksum_{i-1} xor P_i */ xor RAB2, (0 * 8)(RX2); xor RCD2, (1 * 8)(RX2); /* PX_i = P_i xor Offset_i */ xorq RT0, RAB2; xorq RT1, RCD2; /* Store offset */ movq RT0, (0 * 8)(RX1); movq RT1, (1 * 8)(RX1); /* CX_i = ENCIPHER(K, PX_i) */ call __twofish_enc_blk3; movq (6 * 8)(%rsp), RX1; /*dst*/ /* C_i = CX_i xor Offset_i */ xorq RCD0, (0 * 8)(RX1); xorq RAB0, (1 * 8)(RX1); xorq RCD1, (2 * 8)(RX1); xorq RAB1, (3 * 8)(RX1); xorq RCD2, (4 * 8)(RX1); xorq RAB2, (5 * 8)(RX1); movq (0 * 8)(%rsp), %rbp; movq (1 * 8)(%rsp), %rbx; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; movq (4 * 8)(%rsp), %r14; movq (5 * 8)(%rsp), %r15; + CFI_RESTORE(%rbp); + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); addq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-8 * 8); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_ocb_enc,.-_gcry_twofish_amd64_ocb_enc;) .align 8 .globl _gcry_twofish_amd64_ocb_dec ELF(.type _gcry_twofish_amd64_ocb_dec,@function;) _gcry_twofish_amd64_ocb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (3 blocks) * %rdx: src (3 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[3]) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_6 subq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(8 * 8); movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); movq %r14, (4 * 8)(%rsp); movq %r15, (5 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 0 * 8); + CFI_REL_OFFSET(%rbx, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); + CFI_REL_OFFSET(%r14, 4 * 8); + CFI_REL_OFFSET(%r15, 5 * 8); movq %rsi, (6 * 8)(%rsp); movq %r8, (7 * 8)(%rsp); movq %rdx, RX0; movq %rcx, RX1; movq %r9, RY0; movq %rsi, RY1; /* Load offset */ movq (0 * 8)(RX1), RT0; movq (1 * 8)(RX1), RT1; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ movq (RY0), RY2; xorq (0 * 8)(RY2), RT0; xorq (1 * 8)(RY2), RT1; movq (0 * 8)(RX0), RAB0; movq (1 * 8)(RX0), RCD0; /* Store Offset_i */ movq RT0, (0 * 8)(RY1); movq RT1, (1 * 8)(RY1); /* CX_i = C_i xor Offset_i */ xorq RT0, RAB0; xorq RT1, RCD0; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ movq 8(RY0), RY2; xorq (0 * 8)(RY2), RT0; xorq (1 * 8)(RY2), RT1; movq (2 * 8)(RX0), RAB1; movq (3 * 8)(RX0), RCD1; /* Store Offset_i */ movq RT0, (2 * 8)(RY1); movq RT1, (3 * 8)(RY1); /* PX_i = P_i xor Offset_i */ xorq RT0, RAB1; xorq RT1, RCD1; 
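
The per-block bookkeeping in these OCB routines follows the relations spelled out in the comments: Offset_i = Offset_{i-1} xor L_{ntz(i)}, Checksum_i = Checksum_{i-1} xor P_i, and C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i), with the caller supplying the three L_{ntz(i)} pointers. The following compact C sketch models the encryption direction shown above; the helper names (xor_block, encrypt_block) are invented for illustration and are not libgcrypt's real API. The decryption path mirrors it, except the checksum is taken over the recovered plaintext after the cipher call.

#include <stddef.h>
#include <stdint.h>

typedef uint8_t block_t[16];

static void xor_block(uint8_t *r, const uint8_t *a, const uint8_t *b)
{
  int i;
  for (i = 0; i < 16; i++)
    r[i] = a[i] ^ b[i];
}

/* Encrypt n blocks; L[i] is the caller-selected L_{ntz(block index)}. */
static void ocb_enc_blocks(void *ctx,
                           void (*encrypt_block)(void *, uint8_t *,
                                                 const uint8_t *),
                           uint8_t *dst, const uint8_t *src, size_t n,
                           uint8_t offset[16], uint8_t checksum[16],
                           const uint8_t *const L[])
{
  block_t tmp;
  size_t i;

  for (i = 0; i < n; i++, src += 16, dst += 16)
    {
      xor_block(offset, offset, L[i]);    /* Offset_i = Offset_{i-1} ^ L_{ntz(i)} */
      xor_block(checksum, checksum, src); /* Checksum_i = Checksum_{i-1} ^ P_i */
      xor_block(tmp, src, offset);        /* PX_i = P_i ^ Offset_i */
      encrypt_block(ctx, tmp, tmp);       /* CX_i = ENCIPHER(K, PX_i) */
      xor_block(dst, tmp, offset);        /* C_i = CX_i ^ Offset_i */
    }
}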
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ movq 16(RY0), RY2; xorq (0 * 8)(RY2), RT0; xorq (1 * 8)(RY2), RT1; movq (4 * 8)(RX0), RAB2; movq (5 * 8)(RX0), RCD2; /* Store Offset_i */ movq RT0, (4 * 8)(RY1); movq RT1, (5 * 8)(RY1); /* PX_i = P_i xor Offset_i */ xorq RT0, RAB2; xorq RT1, RCD2; /* Store offset */ movq RT0, (0 * 8)(RX1); movq RT1, (1 * 8)(RX1); /* PX_i = DECIPHER(K, CX_i) */ call __twofish_dec_blk3; movq (7 * 8)(%rsp), RX2; /*checksum*/ movq (6 * 8)(%rsp), RX1; /*dst*/ /* Load checksum */ movq (0 * 8)(RX2), RT0; movq (1 * 8)(RX2), RT1; /* P_i = PX_i xor Offset_i */ xorq RCD0, (0 * 8)(RX1); xorq RAB0, (1 * 8)(RX1); xorq RCD1, (2 * 8)(RX1); xorq RAB1, (3 * 8)(RX1); xorq RCD2, (4 * 8)(RX1); xorq RAB2, (5 * 8)(RX1); /* Checksum_i = Checksum_{i-1} xor P_i */ xorq (0 * 8)(RX1), RT0; xorq (1 * 8)(RX1), RT1; xorq (2 * 8)(RX1), RT0; xorq (3 * 8)(RX1), RT1; xorq (4 * 8)(RX1), RT0; xorq (5 * 8)(RX1), RT1; /* Store checksum */ movq RT0, (0 * 8)(RX2); movq RT1, (1 * 8)(RX2); movq (0 * 8)(%rsp), %rbp; movq (1 * 8)(%rsp), %rbx; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; movq (4 * 8)(%rsp), %r14; movq (5 * 8)(%rsp), %r15; + CFI_RESTORE(%rbp); + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); addq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-8 * 8); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_ocb_dec,.-_gcry_twofish_amd64_ocb_dec;) .align 8 .globl _gcry_twofish_amd64_ocb_auth ELF(.type _gcry_twofish_amd64_ocb_auth,@function;) _gcry_twofish_amd64_ocb_auth: /* input: * %rdi: ctx, CTX * %rsi: abuf (3 blocks) * %rdx: offset * %rcx: checksum * %r8 : L pointers (void *L[3]) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_5 subq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(8 * 8); movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); movq %r14, (4 * 8)(%rsp); movq %r15, (5 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 0 * 8); + CFI_REL_OFFSET(%rbx, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); + CFI_REL_OFFSET(%r14, 4 * 8); + CFI_REL_OFFSET(%r15, 5 * 8); movq %rcx, (6 * 8)(%rsp); movq %rsi, RX0; movq %rdx, RX1; movq %r8, RY0; /* Load offset */ movq (0 * 8)(RX1), RT0; movq (1 * 8)(RX1), RT1; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ movq (RY0), RY2; xorq (0 * 8)(RY2), RT0; xorq (1 * 8)(RY2), RT1; movq (0 * 8)(RX0), RAB0; movq (1 * 8)(RX0), RCD0; /* PX_i = P_i xor Offset_i */ xorq RT0, RAB0; xorq RT1, RCD0; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ movq 8(RY0), RY2; xorq (0 * 8)(RY2), RT0; xorq (1 * 8)(RY2), RT1; movq (2 * 8)(RX0), RAB1; movq (3 * 8)(RX0), RCD1; /* PX_i = P_i xor Offset_i */ xorq RT0, RAB1; xorq RT1, RCD1; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ movq 16(RY0), RY2; xorq (0 * 8)(RY2), RT0; xorq (1 * 8)(RY2), RT1; movq (4 * 8)(RX0), RAB2; movq (5 * 8)(RX0), RCD2; /* PX_i = P_i xor Offset_i */ xorq RT0, RAB2; xorq RT1, RCD2; /* Store offset */ movq RT0, (0 * 8)(RX1); movq RT1, (1 * 8)(RX1); /* C_i = ENCIPHER(K, PX_i) */ call __twofish_enc_blk3; movq (6 * 8)(%rsp), RX1; /*checksum*/ /* Checksum_i = C_i xor Checksum_i */ xorq RCD0, RCD1; xorq RAB0, RAB1; xorq RCD1, RCD2; xorq RAB1, RAB2; xorq RCD2, (0 * 8)(RX1); xorq RAB2, (1 * 8)(RX1); movq (0 * 8)(%rsp), %rbp; movq (1 * 8)(%rsp), %rbx; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; movq (4 * 8)(%rsp), %r14; movq (5 * 8)(%rsp), %r15; + CFI_RESTORE(%rbp); + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); addq $(8 * 8), %rsp; + 
CFI_ADJUST_CFA_OFFSET(-8 * 8); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_ocb_auth,.-_gcry_twofish_amd64_ocb_auth;) #endif /*USE_TWOFISH*/ #endif /*__x86_64*/ diff --git a/cipher/twofish-avx2-amd64.S b/cipher/twofish-avx2-amd64.S index db6e2182..74cad355 100644 --- a/cipher/twofish-avx2-amd64.S +++ b/cipher/twofish-avx2-amd64.S @@ -1,1012 +1,1048 @@ /* twofish-avx2-amd64.S - AMD64/AVX2 assembly implementation of Twofish cipher * * Copyright (C) 2013-2017 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_TWOFISH) && \ defined(ENABLE_AVX2_SUPPORT) -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif - -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif +#include "asm-common-amd64.h" .text /* structure of TWOFISH_context: */ #define s0 0 #define s1 ((s0) + 4 * 256) #define s2 ((s1) + 4 * 256) #define s3 ((s2) + 4 * 256) #define w ((s3) + 4 * 256) #define k ((w) + 4 * 8) /* register macros */ #define CTX %rdi #define RROUND %rbp #define RROUNDd %ebp #define RS0 CTX #define RS1 %r8 #define RS2 %r9 #define RS3 %r10 #define RK %r11 #define RW %rax #define RA0 %ymm8 #define RB0 %ymm9 #define RC0 %ymm10 #define RD0 %ymm11 #define RA1 %ymm12 #define RB1 %ymm13 #define RC1 %ymm14 #define RD1 %ymm15 /* temp regs */ #define RX0 %ymm0 #define RY0 %ymm1 #define RX1 %ymm2 #define RY1 %ymm3 #define RT0 %ymm4 #define RIDX %ymm5 #define RX0x %xmm0 #define RY0x %xmm1 #define RX1x %xmm2 #define RY1x %xmm3 #define RT0x %xmm4 #define RIDXx %xmm5 #define RTMP0 RX0 #define RTMP0x RX0x #define RTMP1 RX1 #define RTMP1x RX1x #define RTMP2 RY0 #define RTMP2x RY0x #define RTMP3 RY1 #define RTMP3x RY1x #define RTMP4 RIDX #define RTMP4x RIDXx /* vpgatherdd mask and '-1' */ #define RNOT %ymm6 #define RNOTx %xmm6 /* byte mask, (-1 >> 24) */ #define RBYTE %ymm7 /********************************************************************** 16-way AVX2 twofish **********************************************************************/ #define init_round_constants() \ vpcmpeqd RNOT, RNOT, RNOT; \ leaq k(CTX), RK; \ leaq w(CTX), RW; \ vpsrld $24, RNOT, RBYTE; \ leaq s1(CTX), RS1; \ leaq s2(CTX), RS2; \ leaq s3(CTX), RS3; \ #define g16(ab, rs0, rs1, rs2, rs3, xy) \ vpand RBYTE, ab ## 0, RIDX; \ vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 0; \ vpcmpeqd RNOT, RNOT, RNOT; \ \ vpand RBYTE, ab ## 1, RIDX; \ vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 1; \ vpcmpeqd RNOT, RNOT, RNOT; \ \ vpsrld $8, ab ## 0, RIDX; \ vpand RBYTE, RIDX, RIDX; \ vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \ vpcmpeqd RNOT, RNOT, RNOT; \ vpxor RT0, xy ## 0, xy ## 0; \ \ vpsrld $8, ab ## 1, RIDX; \ vpand RBYTE, RIDX, RIDX; \ vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \ vpcmpeqd RNOT, RNOT, RNOT; \ vpxor RT0, xy ## 1, xy ## 1; \ \ 
vpsrld $16, ab ## 0, RIDX; \ vpand RBYTE, RIDX, RIDX; \ vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \ vpcmpeqd RNOT, RNOT, RNOT; \ vpxor RT0, xy ## 0, xy ## 0; \ \ vpsrld $16, ab ## 1, RIDX; \ vpand RBYTE, RIDX, RIDX; \ vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \ vpcmpeqd RNOT, RNOT, RNOT; \ vpxor RT0, xy ## 1, xy ## 1; \ \ vpsrld $24, ab ## 0, RIDX; \ vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \ vpcmpeqd RNOT, RNOT, RNOT; \ vpxor RT0, xy ## 0, xy ## 0; \ \ vpsrld $24, ab ## 1, RIDX; \ vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \ vpcmpeqd RNOT, RNOT, RNOT; \ vpxor RT0, xy ## 1, xy ## 1; #define g1_16(a, x) \ g16(a, RS0, RS1, RS2, RS3, x); #define g2_16(b, y) \ g16(b, RS1, RS2, RS3, RS0, y); #define encrypt_round_end16(a, b, c, d, nk, r) \ vpaddd RY0, RX0, RX0; \ vpaddd RX0, RY0, RY0; \ vpbroadcastd ((nk)+((r)*8))(RK), RT0; \ vpaddd RT0, RX0, RX0; \ vpbroadcastd 4+((nk)+((r)*8))(RK), RT0; \ vpaddd RT0, RY0, RY0; \ \ vpxor RY0, d ## 0, d ## 0; \ \ vpxor RX0, c ## 0, c ## 0; \ vpsrld $1, c ## 0, RT0; \ vpslld $31, c ## 0, c ## 0; \ vpor RT0, c ## 0, c ## 0; \ \ vpaddd RY1, RX1, RX1; \ vpaddd RX1, RY1, RY1; \ vpbroadcastd ((nk)+((r)*8))(RK), RT0; \ vpaddd RT0, RX1, RX1; \ vpbroadcastd 4+((nk)+((r)*8))(RK), RT0; \ vpaddd RT0, RY1, RY1; \ \ vpxor RY1, d ## 1, d ## 1; \ \ vpxor RX1, c ## 1, c ## 1; \ vpsrld $1, c ## 1, RT0; \ vpslld $31, c ## 1, c ## 1; \ vpor RT0, c ## 1, c ## 1; \ #define encrypt_round16(a, b, c, d, nk, r) \ g2_16(b, RY); \ \ vpslld $1, b ## 0, RT0; \ vpsrld $31, b ## 0, b ## 0; \ vpor RT0, b ## 0, b ## 0; \ \ vpslld $1, b ## 1, RT0; \ vpsrld $31, b ## 1, b ## 1; \ vpor RT0, b ## 1, b ## 1; \ \ g1_16(a, RX); \ \ encrypt_round_end16(a, b, c, d, nk, r); #define encrypt_round_first16(a, b, c, d, nk, r) \ vpslld $1, d ## 0, RT0; \ vpsrld $31, d ## 0, d ## 0; \ vpor RT0, d ## 0, d ## 0; \ \ vpslld $1, d ## 1, RT0; \ vpsrld $31, d ## 1, d ## 1; \ vpor RT0, d ## 1, d ## 1; \ \ encrypt_round16(a, b, c, d, nk, r); #define encrypt_round_last16(a, b, c, d, nk, r) \ g2_16(b, RY); \ \ g1_16(a, RX); \ \ encrypt_round_end16(a, b, c, d, nk, r); #define decrypt_round_end16(a, b, c, d, nk, r) \ vpaddd RY0, RX0, RX0; \ vpaddd RX0, RY0, RY0; \ vpbroadcastd ((nk)+((r)*8))(RK), RT0; \ vpaddd RT0, RX0, RX0; \ vpbroadcastd 4+((nk)+((r)*8))(RK), RT0; \ vpaddd RT0, RY0, RY0; \ \ vpxor RX0, c ## 0, c ## 0; \ \ vpxor RY0, d ## 0, d ## 0; \ vpsrld $1, d ## 0, RT0; \ vpslld $31, d ## 0, d ## 0; \ vpor RT0, d ## 0, d ## 0; \ \ vpaddd RY1, RX1, RX1; \ vpaddd RX1, RY1, RY1; \ vpbroadcastd ((nk)+((r)*8))(RK), RT0; \ vpaddd RT0, RX1, RX1; \ vpbroadcastd 4+((nk)+((r)*8))(RK), RT0; \ vpaddd RT0, RY1, RY1; \ \ vpxor RX1, c ## 1, c ## 1; \ \ vpxor RY1, d ## 1, d ## 1; \ vpsrld $1, d ## 1, RT0; \ vpslld $31, d ## 1, d ## 1; \ vpor RT0, d ## 1, d ## 1; #define decrypt_round16(a, b, c, d, nk, r) \ g1_16(a, RX); \ \ vpslld $1, a ## 0, RT0; \ vpsrld $31, a ## 0, a ## 0; \ vpor RT0, a ## 0, a ## 0; \ \ vpslld $1, a ## 1, RT0; \ vpsrld $31, a ## 1, a ## 1; \ vpor RT0, a ## 1, a ## 1; \ \ g2_16(b, RY); \ \ decrypt_round_end16(a, b, c, d, nk, r); #define decrypt_round_first16(a, b, c, d, nk, r) \ vpslld $1, c ## 0, RT0; \ vpsrld $31, c ## 0, c ## 0; \ vpor RT0, c ## 0, c ## 0; \ \ vpslld $1, c ## 1, RT0; \ vpsrld $31, c ## 1, c ## 1; \ vpor RT0, c ## 1, c ## 1; \ \ decrypt_round16(a, b, c, d, nk, r) #define decrypt_round_last16(a, b, c, d, nk, r) \ g1_16(a, RX); \ \ g2_16(b, RY); \ \ decrypt_round_end16(a, b, c, d, nk, r); #define encrypt_cycle16(r) \ encrypt_round16(RA, RB, RC, RD, 0, r); \ encrypt_round16(RC, RD, RA, RB, 8, r); #define 
encrypt_cycle_first16(r) \ encrypt_round_first16(RA, RB, RC, RD, 0, r); \ encrypt_round16(RC, RD, RA, RB, 8, r); #define encrypt_cycle_last16(r) \ encrypt_round16(RA, RB, RC, RD, 0, r); \ encrypt_round_last16(RC, RD, RA, RB, 8, r); #define decrypt_cycle16(r) \ decrypt_round16(RC, RD, RA, RB, 8, r); \ decrypt_round16(RA, RB, RC, RD, 0, r); #define decrypt_cycle_first16(r) \ decrypt_round_first16(RC, RD, RA, RB, 8, r); \ decrypt_round16(RA, RB, RC, RD, 0, r); #define decrypt_cycle_last16(r) \ decrypt_round16(RC, RD, RA, RB, 8, r); \ decrypt_round_last16(RA, RB, RC, RD, 0, r); #define transpose_4x4(x0,x1,x2,x3,t1,t2) \ vpunpckhdq x1, x0, t2; \ vpunpckldq x1, x0, x0; \ \ vpunpckldq x3, x2, t1; \ vpunpckhdq x3, x2, x2; \ \ vpunpckhqdq t1, x0, x1; \ vpunpcklqdq t1, x0, x0; \ \ vpunpckhqdq x2, t2, x3; \ vpunpcklqdq x2, t2, x2; #define read_blocks8(offs,a,b,c,d) \ vmovdqu 16*offs(RIO), a; \ vmovdqu 16*offs+32(RIO), b; \ vmovdqu 16*offs+64(RIO), c; \ vmovdqu 16*offs+96(RIO), d; \ \ transpose_4x4(a, b, c, d, RX0, RY0); #define write_blocks8(offs,a,b,c,d) \ transpose_4x4(a, b, c, d, RX0, RY0); \ \ vmovdqu a, 16*offs(RIO); \ vmovdqu b, 16*offs+32(RIO); \ vmovdqu c, 16*offs+64(RIO); \ vmovdqu d, 16*offs+96(RIO); #define inpack_enc8(a,b,c,d) \ vpbroadcastd 4*0(RW), RT0; \ vpxor RT0, a, a; \ \ vpbroadcastd 4*1(RW), RT0; \ vpxor RT0, b, b; \ \ vpbroadcastd 4*2(RW), RT0; \ vpxor RT0, c, c; \ \ vpbroadcastd 4*3(RW), RT0; \ vpxor RT0, d, d; #define outunpack_enc8(a,b,c,d) \ vpbroadcastd 4*4(RW), RX0; \ vpbroadcastd 4*5(RW), RY0; \ vpxor RX0, c, RX0; \ vpxor RY0, d, RY0; \ \ vpbroadcastd 4*6(RW), RT0; \ vpxor RT0, a, c; \ vpbroadcastd 4*7(RW), RT0; \ vpxor RT0, b, d; \ \ vmovdqa RX0, a; \ vmovdqa RY0, b; #define inpack_dec8(a,b,c,d) \ vpbroadcastd 4*4(RW), RX0; \ vpbroadcastd 4*5(RW), RY0; \ vpxor RX0, a, RX0; \ vpxor RY0, b, RY0; \ \ vpbroadcastd 4*6(RW), RT0; \ vpxor RT0, c, a; \ vpbroadcastd 4*7(RW), RT0; \ vpxor RT0, d, b; \ \ vmovdqa RX0, c; \ vmovdqa RY0, d; #define outunpack_dec8(a,b,c,d) \ vpbroadcastd 4*0(RW), RT0; \ vpxor RT0, a, a; \ \ vpbroadcastd 4*1(RW), RT0; \ vpxor RT0, b, b; \ \ vpbroadcastd 4*2(RW), RT0; \ vpxor RT0, c, c; \ \ vpbroadcastd 4*3(RW), RT0; \ vpxor RT0, d, d; #define transpose4x4_16(a,b,c,d) \ transpose_4x4(a ## 0, b ## 0, c ## 0, d ## 0, RX0, RY0); \ transpose_4x4(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0); #define inpack_enc16(a,b,c,d) \ inpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \ inpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1); #define outunpack_enc16(a,b,c,d) \ outunpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \ outunpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1); #define inpack_dec16(a,b,c,d) \ inpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \ inpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1); #define outunpack_dec16(a,b,c,d) \ outunpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \ outunpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1); .align 8 ELF(.type __twofish_enc_blk16,@function;) __twofish_enc_blk16: /* input: * %rdi: ctx, CTX * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: sixteen parallel * plaintext blocks * output: * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: sixteen parallel * ciphertext blocks */ + CFI_STARTPROC(); init_round_constants(); transpose4x4_16(RA, RB, RC, RD); inpack_enc16(RA, RB, RC, RD); encrypt_cycle_first16(0); encrypt_cycle16(2); encrypt_cycle16(4); encrypt_cycle16(6); encrypt_cycle16(8); encrypt_cycle16(10); encrypt_cycle16(12); encrypt_cycle_last16(14); outunpack_enc16(RA, RB, RC, RD); transpose4x4_16(RA, RB, RC, RD); ret; + CFI_ENDPROC(); ELF(.size 
__twofish_enc_blk16,.-__twofish_enc_blk16;) .align 8 ELF(.type __twofish_dec_blk16,@function;) __twofish_dec_blk16: /* input: * %rdi: ctx, CTX * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: sixteen parallel * plaintext blocks * output: * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: sixteen parallel * ciphertext blocks */ + CFI_STARTPROC(); init_round_constants(); transpose4x4_16(RA, RB, RC, RD); inpack_dec16(RA, RB, RC, RD); decrypt_cycle_first16(14); decrypt_cycle16(12); decrypt_cycle16(10); decrypt_cycle16(8); decrypt_cycle16(6); decrypt_cycle16(4); decrypt_cycle16(2); decrypt_cycle_last16(0); outunpack_dec16(RA, RB, RC, RD); transpose4x4_16(RA, RB, RC, RD); ret; + CFI_ENDPROC(); ELF(.size __twofish_dec_blk16,.-__twofish_dec_blk16;) #define inc_le128(x, minus_one, tmp) \ vpcmpeqq minus_one, x, tmp; \ vpsubq minus_one, x, x; \ vpslldq $8, tmp, tmp; \ vpsubq tmp, x, x; .align 8 .globl _gcry_twofish_avx2_ctr_enc ELF(.type _gcry_twofish_avx2_ctr_enc,@function;) _gcry_twofish_avx2_ctr_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: iv (big endian, 128bit) */ + CFI_STARTPROC(); movq 8(%rcx), %rax; bswapq %rax; vzeroupper; - vbroadcasti128 .Lbswap128_mask RIP, RTMP3; + vbroadcasti128 .Lbswap128_mask rRIP, RTMP3; vpcmpeqd RNOT, RNOT, RNOT; vpsrldq $8, RNOT, RNOT; /* ab: -1:0 ; cd: -1:0 */ vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */ /* load IV and byteswap */ vmovdqu (%rcx), RTMP4x; vpshufb RTMP3x, RTMP4x, RTMP4x; vmovdqa RTMP4x, RTMP0x; inc_le128(RTMP4x, RNOTx, RTMP1x); vinserti128 $1, RTMP4x, RTMP0, RTMP0; vpshufb RTMP3, RTMP0, RA0; /* +1 ; +0 */ /* check need for handling 64-bit overflow and carry */ cmpq $(0xffffffffffffffff - 16), %rax; ja .Lhandle_ctr_carry; /* construct IVs */ vpsubq RTMP2, RTMP0, RTMP0; /* +3 ; +2 */ vpshufb RTMP3, RTMP0, RB0; vpsubq RTMP2, RTMP0, RTMP0; /* +5 ; +4 */ vpshufb RTMP3, RTMP0, RC0; vpsubq RTMP2, RTMP0, RTMP0; /* +7 ; +6 */ vpshufb RTMP3, RTMP0, RD0; vpsubq RTMP2, RTMP0, RTMP0; /* +9 ; +8 */ vpshufb RTMP3, RTMP0, RA1; vpsubq RTMP2, RTMP0, RTMP0; /* +11 ; +10 */ vpshufb RTMP3, RTMP0, RB1; vpsubq RTMP2, RTMP0, RTMP0; /* +13 ; +12 */ vpshufb RTMP3, RTMP0, RC1; vpsubq RTMP2, RTMP0, RTMP0; /* +15 ; +14 */ vpshufb RTMP3, RTMP0, RD1; vpsubq RTMP2, RTMP0, RTMP0; /* +16 */ vpshufb RTMP3x, RTMP0x, RTMP0x; jmp .Lctr_carry_done; .Lhandle_ctr_carry: /* construct IVs */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RB0; /* +3 ; +2 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RC0; /* +5 ; +4 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RD0; /* +7 ; +6 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RA1; /* +9 ; +8 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RB1; /* +11 ; +10 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RC1; /* +13 ; +12 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RD1; /* +15 ; +14 */ inc_le128(RTMP0, RNOT, RTMP1); vextracti128 $1, RTMP0, RTMP0x; vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */ .align 4 .Lctr_carry_done: /* store new IV */ vmovdqu RTMP0x, (%rcx); call __twofish_enc_blk16; vpxor (0 * 32)(%rdx), RA0, RA0; vpxor (1 * 32)(%rdx), RB0, RB0; vpxor (2 * 32)(%rdx), RC0, RC0; vpxor (3 * 32)(%rdx), RD0, RD0; vpxor (4 * 32)(%rdx), RA1, RA1; vpxor (5 * 32)(%rdx), RB1, RB1; vpxor (6 * 32)(%rdx), RC1, RC1; vpxor (7 * 32)(%rdx), RD1, RD1; 
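
The 16-way counter setup above takes a fast path when the low 64 bits of the byte-swapped IV cannot wrap while producing 16 consecutive values (the cmpq against 0xffffffffffffffff - 16), so the vector code only has to add to the low qword; otherwise it falls back to full 128-bit increments via inc_le128. Here is a rough C model of that decision, again assuming a GCC-style __builtin_bswap64 and illustrative names rather than actual libgcrypt code.

#include <stdint.h>
#include <string.h>

static void ctr16_make(uint8_t blocks[16][16], uint8_t iv[16])
{
  uint64_t hi, lo;
  int i, carry_possible;

  memcpy(&hi, iv + 0, 8);
  memcpy(&lo, iv + 8, 8);
  hi = __builtin_bswap64(hi);
  lo = __builtin_bswap64(lo);

  /* cmpq $(0xffffffffffffffff - 16), %rax; ja .Lhandle_ctr_carry */
  carry_possible = (lo > UINT64_MAX - 16);

  for (i = 0; i < 16; i++)
    {
      uint64_t bhi = __builtin_bswap64(hi), blo = __builtin_bswap64(lo);
      memcpy(blocks[i] + 0, &bhi, 8);
      memcpy(blocks[i] + 8, &blo, 8);
      if (!carry_possible)
        lo++;                       /* fast path: low qword only */
      else if (++lo == 0)
        hi++;                       /* slow path: 128-bit increment */
    }

  hi = __builtin_bswap64(hi);
  lo = __builtin_bswap64(lo);
  memcpy(iv + 0, &hi, 8);           /* store counter advanced by 16 */
  memcpy(iv + 8, &lo, 8);
}

In the common no-carry case this keeps the IV construction to roughly one vector subtraction plus one byte shuffle per pair of counter blocks.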
vmovdqu RA0, (0 * 32)(%rsi); vmovdqu RB0, (1 * 32)(%rsi); vmovdqu RC0, (2 * 32)(%rsi); vmovdqu RD0, (3 * 32)(%rsi); vmovdqu RA1, (4 * 32)(%rsi); vmovdqu RB1, (5 * 32)(%rsi); vmovdqu RC1, (6 * 32)(%rsi); vmovdqu RD1, (7 * 32)(%rsi); vzeroall; - ret + ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_ctr_enc,.-_gcry_twofish_avx2_ctr_enc;) .align 8 .globl _gcry_twofish_avx2_cbc_dec ELF(.type _gcry_twofish_avx2_cbc_dec,@function;) _gcry_twofish_avx2_cbc_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: iv */ + CFI_STARTPROC(); vzeroupper; vmovdqu (0 * 32)(%rdx), RA0; vmovdqu (1 * 32)(%rdx), RB0; vmovdqu (2 * 32)(%rdx), RC0; vmovdqu (3 * 32)(%rdx), RD0; vmovdqu (4 * 32)(%rdx), RA1; vmovdqu (5 * 32)(%rdx), RB1; vmovdqu (6 * 32)(%rdx), RC1; vmovdqu (7 * 32)(%rdx), RD1; call __twofish_dec_blk16; vmovdqu (%rcx), RNOTx; vinserti128 $1, (%rdx), RNOT, RNOT; vpxor RNOT, RA0, RA0; vpxor (0 * 32 + 16)(%rdx), RB0, RB0; vpxor (1 * 32 + 16)(%rdx), RC0, RC0; vpxor (2 * 32 + 16)(%rdx), RD0, RD0; vpxor (3 * 32 + 16)(%rdx), RA1, RA1; vpxor (4 * 32 + 16)(%rdx), RB1, RB1; vpxor (5 * 32 + 16)(%rdx), RC1, RC1; vpxor (6 * 32 + 16)(%rdx), RD1, RD1; vmovdqu (7 * 32 + 16)(%rdx), RNOTx; vmovdqu RNOTx, (%rcx); /* store new IV */ vmovdqu RA0, (0 * 32)(%rsi); vmovdqu RB0, (1 * 32)(%rsi); vmovdqu RC0, (2 * 32)(%rsi); vmovdqu RD0, (3 * 32)(%rsi); vmovdqu RA1, (4 * 32)(%rsi); vmovdqu RB1, (5 * 32)(%rsi); vmovdqu RC1, (6 * 32)(%rsi); vmovdqu RD1, (7 * 32)(%rsi); vzeroall; - ret + ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_cbc_dec,.-_gcry_twofish_avx2_cbc_dec;) .align 8 .globl _gcry_twofish_avx2_cfb_dec ELF(.type _gcry_twofish_avx2_cfb_dec,@function;) _gcry_twofish_avx2_cfb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: iv */ + CFI_STARTPROC(); vzeroupper; /* Load input */ vmovdqu (%rcx), RNOTx; vinserti128 $1, (%rdx), RNOT, RA0; vmovdqu (0 * 32 + 16)(%rdx), RB0; vmovdqu (1 * 32 + 16)(%rdx), RC0; vmovdqu (2 * 32 + 16)(%rdx), RD0; vmovdqu (3 * 32 + 16)(%rdx), RA1; vmovdqu (4 * 32 + 16)(%rdx), RB1; vmovdqu (5 * 32 + 16)(%rdx), RC1; vmovdqu (6 * 32 + 16)(%rdx), RD1; /* Update IV */ vmovdqu (7 * 32 + 16)(%rdx), RNOTx; vmovdqu RNOTx, (%rcx); call __twofish_enc_blk16; vpxor (0 * 32)(%rdx), RA0, RA0; vpxor (1 * 32)(%rdx), RB0, RB0; vpxor (2 * 32)(%rdx), RC0, RC0; vpxor (3 * 32)(%rdx), RD0, RD0; vpxor (4 * 32)(%rdx), RA1, RA1; vpxor (5 * 32)(%rdx), RB1, RB1; vpxor (6 * 32)(%rdx), RC1, RC1; vpxor (7 * 32)(%rdx), RD1, RD1; vmovdqu RA0, (0 * 32)(%rsi); vmovdqu RB0, (1 * 32)(%rsi); vmovdqu RC0, (2 * 32)(%rsi); vmovdqu RD0, (3 * 32)(%rsi); vmovdqu RA1, (4 * 32)(%rsi); vmovdqu RB1, (5 * 32)(%rsi); vmovdqu RC1, (6 * 32)(%rsi); vmovdqu RD1, (7 * 32)(%rsi); vzeroall; - ret + ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_cfb_dec,.-_gcry_twofish_avx2_cfb_dec;) .align 8 .globl _gcry_twofish_avx2_ocb_enc ELF(.type _gcry_twofish_avx2_ocb_enc,@function;) _gcry_twofish_avx2_ocb_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[16]) */ + CFI_STARTPROC(); vzeroupper; subq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); + CFI_REL_OFFSET(%r10, 0 * 8); + CFI_REL_OFFSET(%r11, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rcx), RTMP0x; vmovdqu (%r8), RTMP1x; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Checksum_i = Checksum_{i-1} xor P_i */ 
/* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ #define OCB_INPUT(n, l0reg, l1reg, yreg) \ vmovdqu (n * 32)(%rdx), yreg; \ vpxor (l0reg), RTMP0x, RNOTx; \ vpxor (l1reg), RNOTx, RTMP0x; \ vinserti128 $1, RTMP0x, RNOT, RNOT; \ vpxor yreg, RTMP1, RTMP1; \ vpxor yreg, RNOT, yreg; \ vmovdqu RNOT, (n * 32)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq (2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, %r11, RA0); OCB_INPUT(1, %r12, %r13, RB0); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(2, %r10, %r11, RC0); OCB_INPUT(3, %r12, %r13, RD0); movq (8 * 8)(%r9), %r10; movq (9 * 8)(%r9), %r11; movq (10 * 8)(%r9), %r12; movq (11 * 8)(%r9), %r13; OCB_INPUT(4, %r10, %r11, RA1); OCB_INPUT(5, %r12, %r13, RB1); movq (12 * 8)(%r9), %r10; movq (13 * 8)(%r9), %r11; movq (14 * 8)(%r9), %r12; movq (15 * 8)(%r9), %r13; OCB_INPUT(6, %r10, %r11, RC1); OCB_INPUT(7, %r12, %r13, RD1); #undef OCB_INPUT vextracti128 $1, RTMP1, RNOTx; vmovdqu RTMP0x, (%rcx); vpxor RNOTx, RTMP1x, RTMP1x; vmovdqu RTMP1x, (%r8); movq (0 * 8)(%rsp), %r10; movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); call __twofish_enc_blk16; addq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-4 * 8); vpxor (0 * 32)(%rsi), RA0, RA0; vpxor (1 * 32)(%rsi), RB0, RB0; vpxor (2 * 32)(%rsi), RC0, RC0; vpxor (3 * 32)(%rsi), RD0, RD0; vpxor (4 * 32)(%rsi), RA1, RA1; vpxor (5 * 32)(%rsi), RB1, RB1; vpxor (6 * 32)(%rsi), RC1, RC1; vpxor (7 * 32)(%rsi), RD1, RD1; vmovdqu RA0, (0 * 32)(%rsi); vmovdqu RB0, (1 * 32)(%rsi); vmovdqu RC0, (2 * 32)(%rsi); vmovdqu RD0, (3 * 32)(%rsi); vmovdqu RA1, (4 * 32)(%rsi); vmovdqu RB1, (5 * 32)(%rsi); vmovdqu RC1, (6 * 32)(%rsi); vmovdqu RD1, (7 * 32)(%rsi); vzeroall; ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_ocb_enc,.-_gcry_twofish_avx2_ocb_enc;) .align 8 .globl _gcry_twofish_avx2_ocb_dec ELF(.type _gcry_twofish_avx2_ocb_dec,@function;) _gcry_twofish_avx2_ocb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[16]) */ + CFI_STARTPROC(); vzeroupper; subq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); + CFI_REL_OFFSET(%r10, 0 * 8); + CFI_REL_OFFSET(%r11, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rcx), RTMP0x; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ #define OCB_INPUT(n, l0reg, l1reg, yreg) \ vmovdqu (n * 32)(%rdx), yreg; \ vpxor (l0reg), RTMP0x, RNOTx; \ vpxor (l1reg), RNOTx, RTMP0x; \ vinserti128 $1, RTMP0x, RNOT, RNOT; \ vpxor yreg, RNOT, yreg; \ vmovdqu RNOT, (n * 32)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq (2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, %r11, RA0); OCB_INPUT(1, %r12, %r13, RB0); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(2, %r10, %r11, RC0); OCB_INPUT(3, %r12, %r13, RD0); movq (8 * 8)(%r9), %r10; movq (9 * 8)(%r9), %r11; movq (10 * 8)(%r9), %r12; movq (11 * 8)(%r9), %r13; OCB_INPUT(4, %r10, %r11, RA1); OCB_INPUT(5, %r12, %r13, RB1); movq (12 * 8)(%r9), %r10; movq (13 * 8)(%r9), %r11; movq (14 * 8)(%r9), %r12; movq (15 * 8)(%r9), %r13; OCB_INPUT(6, %r10, %r11, RC1); OCB_INPUT(7, %r12, %r13, RD1); #undef OCB_INPUT 
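
The OCB_INPUT macro above packs two consecutive 16-byte blocks into one 256-bit register: the running offset is advanced twice (once per block), the resulting offset pair is parked in the destination buffer so the post-cipher pass can unmask there, and both halves are masked with a single vpxor. The sketch below is an illustrative C rendering of that pairing, with invented names and plain byte arrays standing in for the xmm/ymm lanes; it is not libgcrypt's C code.

#include <stdint.h>
#include <string.h>

static void xor16(uint8_t *r, const uint8_t *a, const uint8_t *b)
{
  int i;
  for (i = 0; i < 16; i++)
    r[i] = a[i] ^ b[i];
}

/* Mask one 32-byte pair of blocks.  src points at 2*16 input bytes,
 * l0/l1 are L_{ntz()} for the even and odd block of the pair,
 * saved_offsets models the offset pair the assembly stores to dst. */
static void ocb_input_pair(uint8_t masked[32], uint8_t saved_offsets[32],
                           const uint8_t src[32], uint8_t offset[16],
                           const uint8_t *l0, const uint8_t *l1)
{
  /* Offset_{2n} = Offset_{2n-1} ^ L_{ntz(2n)}            (low lane)  */
  xor16(saved_offsets + 0, offset, l0);
  /* Offset_{2n+1} = Offset_{2n} ^ L_{ntz(2n+1)}; this becomes the
   * new running offset                                    (high lane) */
  xor16(offset, saved_offsets + 0, l1);
  memcpy(saved_offsets + 16, offset, 16);

  /* Masked cipher input for both halves of the 256-bit register. */
  xor16(masked + 0,  src + 0,  saved_offsets + 0);
  xor16(masked + 16, src + 16, saved_offsets + 16);
}

Chaining the offsets through the two lanes this way halves the number of offset updates and XORs issued per block compared with handling each 128-bit block separately.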
vmovdqu RTMP0x, (%rcx); mov %r8, %rcx movq (0 * 8)(%rsp), %r10; movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); call __twofish_dec_blk16; vmovdqu (%rcx), RTMP1x; vpxor (0 * 32)(%rsi), RA0, RA0; vpxor (1 * 32)(%rsi), RB0, RB0; vpxor (2 * 32)(%rsi), RC0, RC0; vpxor (3 * 32)(%rsi), RD0, RD0; vpxor (4 * 32)(%rsi), RA1, RA1; vpxor (5 * 32)(%rsi), RB1, RB1; vpxor (6 * 32)(%rsi), RC1, RC1; vpxor (7 * 32)(%rsi), RD1, RD1; addq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-4 * 8); /* Checksum_i = Checksum_{i-1} xor P_i */ vmovdqu RA0, (0 * 32)(%rsi); vpxor RA0, RTMP1, RTMP1; vmovdqu RB0, (1 * 32)(%rsi); vpxor RB0, RTMP1, RTMP1; vmovdqu RC0, (2 * 32)(%rsi); vpxor RC0, RTMP1, RTMP1; vmovdqu RD0, (3 * 32)(%rsi); vpxor RD0, RTMP1, RTMP1; vmovdqu RA1, (4 * 32)(%rsi); vpxor RA1, RTMP1, RTMP1; vmovdqu RB1, (5 * 32)(%rsi); vpxor RB1, RTMP1, RTMP1; vmovdqu RC1, (6 * 32)(%rsi); vpxor RC1, RTMP1, RTMP1; vmovdqu RD1, (7 * 32)(%rsi); vpxor RD1, RTMP1, RTMP1; vextracti128 $1, RTMP1, RNOTx; vpxor RNOTx, RTMP1x, RTMP1x; vmovdqu RTMP1x, (%rcx); vzeroall; ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_ocb_dec,.-_gcry_twofish_avx2_ocb_dec;) .align 8 .globl _gcry_twofish_avx2_ocb_auth ELF(.type _gcry_twofish_avx2_ocb_auth,@function;) _gcry_twofish_avx2_ocb_auth: /* input: * %rdi: ctx, CTX * %rsi: abuf (16 blocks) * %rdx: offset * %rcx: checksum * %r8 : L pointers (void *L[16]) */ + CFI_STARTPROC(); vzeroupper; subq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); + CFI_REL_OFFSET(%r10, 0 * 8); + CFI_REL_OFFSET(%r11, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rdx), RTMP0x; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ #define OCB_INPUT(n, l0reg, l1reg, yreg) \ vmovdqu (n * 32)(%rsi), yreg; \ vpxor (l0reg), RTMP0x, RNOTx; \ vpxor (l1reg), RNOTx, RTMP0x; \ vinserti128 $1, RTMP0x, RNOT, RNOT; \ vpxor yreg, RNOT, yreg; movq (0 * 8)(%r8), %r10; movq (1 * 8)(%r8), %r11; movq (2 * 8)(%r8), %r12; movq (3 * 8)(%r8), %r13; OCB_INPUT(0, %r10, %r11, RA0); OCB_INPUT(1, %r12, %r13, RB0); movq (4 * 8)(%r8), %r10; movq (5 * 8)(%r8), %r11; movq (6 * 8)(%r8), %r12; movq (7 * 8)(%r8), %r13; OCB_INPUT(2, %r10, %r11, RC0); OCB_INPUT(3, %r12, %r13, RD0); movq (8 * 8)(%r8), %r10; movq (9 * 8)(%r8), %r11; movq (10 * 8)(%r8), %r12; movq (11 * 8)(%r8), %r13; OCB_INPUT(4, %r10, %r11, RA1); OCB_INPUT(5, %r12, %r13, RB1); movq (12 * 8)(%r8), %r10; movq (13 * 8)(%r8), %r11; movq (14 * 8)(%r8), %r12; movq (15 * 8)(%r8), %r13; OCB_INPUT(6, %r10, %r11, RC1); OCB_INPUT(7, %r12, %r13, RD1); #undef OCB_INPUT vmovdqu RTMP0x, (%rdx); movq (0 * 8)(%rsp), %r10; movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); call __twofish_enc_blk16; vpxor RA0, RB0, RA0; vpxor RC0, RD0, RC0; vpxor RA1, RB1, RA1; vpxor RC1, RD1, RC1; vpxor RA0, RC0, RA0; vpxor RA1, RC1, RA1; addq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-4 * 8); vpxor RA1, RA0, RTMP1; vextracti128 $1, RTMP1, RNOTx; vpxor (%rcx), RTMP1x, RTMP1x; vpxor RNOTx, RTMP1x, RTMP1x; vmovdqu RTMP1x, (%rcx); vzeroall; ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_ocb_auth,.-_gcry_twofish_avx2_ocb_auth;) .align 16 /* For CTR-mode IV byteswap */ _gcry_twofish_bswap128_mask: .Lbswap128_mask: .byte 15, 
14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ELF(.size _gcry_twofish_bswap128_mask,.-_gcry_twofish_bswap128_mask;) #endif /*defined(USE_TWOFISH) && defined(ENABLE_AVX2_SUPPORT)*/ #endif /*__x86_64*/ diff --git a/cipher/whirlpool-sse2-amd64.S b/cipher/whirlpool-sse2-amd64.S index e98b831c..5631dc56 100644 --- a/cipher/whirlpool-sse2-amd64.S +++ b/cipher/whirlpool-sse2-amd64.S @@ -1,342 +1,348 @@ /* whirlpool-sse2-amd64.S - AMD64 assembly implementation of Whirlpool * * Copyright (C) 2014 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_WHIRLPOOL) -#ifdef __PIC__ -# define RIP %rip -#else -# define RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "asm-common-amd64.h" .text /* look-up table offsets on RTAB */ #define RC (0) #define C0 (RC + (8 * 10)) #define C1 (C0 + (8 * 256)) #define C2 (C1 + (8 * 256)) #define C3 (C2 + (8 * 256)) #define C4 (C3 + (8 * 256)) #define C5 (C4 + (8 * 256)) #define C6 (C5 + (8 * 256)) #define C7 (C6 + (8 * 256)) /* stack variables */ #define STACK_DATAP (0) #define STACK_STATEP (STACK_DATAP + 8) #define STACK_ROUNDS (STACK_STATEP + 8) #define STACK_NBLKS (STACK_ROUNDS + 8) #define STACK_RBP (STACK_NBLKS + 8) #define STACK_RBX (STACK_RBP + 8) #define STACK_R12 (STACK_RBX + 8) #define STACK_R13 (STACK_R12 + 8) #define STACK_R14 (STACK_R13 + 8) #define STACK_R15 (STACK_R14 + 8) #define STACK_MAX (STACK_R15 + 8) /* register macros */ #define RTAB %rbp #define RI1 %rax #define RI2 %rbx #define RI3 %rcx #define RI4 %rdx #define RI1d %eax #define RI2d %ebx #define RI3d %ecx #define RI4d %edx #define RI1bl %al #define RI2bl %bl #define RI3bl %cl #define RI4bl %dl #define RI1bh %ah #define RI2bh %bh #define RI3bh %ch #define RI4bh %dh #define RB0 %r8 #define RB1 %r9 #define RB2 %r10 #define RB3 %r11 #define RB4 %r12 #define RB5 %r13 #define RB6 %r14 #define RB7 %r15 #define RT0 %rsi #define RT1 %rdi #define RT0d %esi #define RT1d %edi #define XKEY0 %xmm0 #define XKEY1 %xmm1 #define XKEY2 %xmm2 #define XKEY3 %xmm3 #define XKEY4 %xmm4 #define XKEY5 %xmm5 #define XKEY6 %xmm6 #define XKEY7 %xmm7 #define XSTATE0 %xmm8 #define XSTATE1 %xmm9 #define XSTATE2 %xmm10 #define XSTATE3 %xmm11 #define XSTATE4 %xmm12 #define XSTATE5 %xmm13 #define XSTATE6 %xmm14 #define XSTATE7 %xmm15 /*********************************************************************** * AMD64 assembly implementation of Whirlpool. 
* - Using table-lookups * - Store state in XMM registers ***********************************************************************/ #define __do_whirl(op, ri, \ b0, b1, b2, b3, b4, b5, b6, b7, \ load_ri, load_arg) \ movzbl ri ## bl, RT0d; \ movzbl ri ## bh, RT1d; \ shrq $16, ri; \ op ## q C7(RTAB,RT0,8), b7; \ op ## q C6(RTAB,RT1,8), b6; \ movzbl ri ## bl, RT0d; \ movzbl ri ## bh, RT1d; \ shrq $16, ri; \ op ## q C5(RTAB,RT0,8), b5; \ op ## q C4(RTAB,RT1,8), b4; \ movzbl ri ## bl, RT0d; \ movzbl ri ## bh, RT1d; \ shrl $16, ri ## d; \ op ## q C3(RTAB,RT0,8), b3; \ op ## q C2(RTAB,RT1,8), b2; \ movzbl ri ## bl, RT0d; \ movzbl ri ## bh, RT1d; \ load_ri( load_arg, ri); \ op ## q C1(RTAB,RT0,8), b1; \ op ## q C0(RTAB,RT1,8), b0; #define do_whirl(op, ri, rb_add, load_ri, load_arg) \ __do_whirl(op, ##ri, rb_add, load_ri, load_arg) #define dummy(...) /*_*/ #define do_movq(src, dst) movq src, dst; #define RB_ADD0 RB0, RB1, RB2, RB3, RB4, RB5, RB6, RB7 #define RB_ADD1 RB1, RB2, RB3, RB4, RB5, RB6, RB7, RB0 #define RB_ADD2 RB2, RB3, RB4, RB5, RB6, RB7, RB0, RB1 #define RB_ADD3 RB3, RB4, RB5, RB6, RB7, RB0, RB1, RB2 #define RB_ADD4 RB4, RB5, RB6, RB7, RB0, RB1, RB2, RB3 #define RB_ADD5 RB5, RB6, RB7, RB0, RB1, RB2, RB3, RB4 #define RB_ADD6 RB6, RB7, RB0, RB1, RB2, RB3, RB4, RB5 #define RB_ADD7 RB7, RB0, RB1, RB2, RB3, RB4, RB5, RB6 .align 8 .globl _gcry_whirlpool_transform_amd64 ELF(.type _gcry_whirlpool_transform_amd64,@function;) _gcry_whirlpool_transform_amd64: /* input: * %rdi: state * %rsi: inblk * %rdx: nblks * %rcx: look-up tables */ + CFI_STARTPROC(); cmp $0, %rdx; je .Lskip; subq $STACK_MAX, %rsp; + CFI_ADJUST_CFA_OFFSET(STACK_MAX); movq %rbp, STACK_RBP(%rsp); movq %rbx, STACK_RBX(%rsp); movq %r12, STACK_R12(%rsp); movq %r13, STACK_R13(%rsp); movq %r14, STACK_R14(%rsp); movq %r15, STACK_R15(%rsp); + CFI_REL_OFFSET(%rbp, STACK_RBP); + CFI_REL_OFFSET(%rbx, STACK_RBX); + CFI_REL_OFFSET(%r12, STACK_R12); + CFI_REL_OFFSET(%r13, STACK_R13); + CFI_REL_OFFSET(%r14, STACK_R14); + CFI_REL_OFFSET(%r15, STACK_R15); movq %rdx, STACK_NBLKS(%rsp); movq %rdi, STACK_STATEP(%rsp); movq %rsi, STACK_DATAP(%rsp); movq %rcx, RTAB; jmp .Lfirst_block; .align 8 .Lblock_loop: movq STACK_DATAP(%rsp), %rsi; movq RI1, %rdi; .Lfirst_block: /* load data_block */ movq 0*8(%rsi), RB0; movq 1*8(%rsi), RB1; bswapq RB0; movq 2*8(%rsi), RB2; bswapq RB1; movq 3*8(%rsi), RB3; bswapq RB2; movq 4*8(%rsi), RB4; bswapq RB3; movq 5*8(%rsi), RB5; bswapq RB4; movq RB0, XSTATE0; movq 6*8(%rsi), RB6; bswapq RB5; movq RB1, XSTATE1; movq 7*8(%rsi), RB7; bswapq RB6; movq RB2, XSTATE2; bswapq RB7; movq RB3, XSTATE3; movq RB4, XSTATE4; movq RB5, XSTATE5; movq RB6, XSTATE6; movq RB7, XSTATE7; /* load key */ movq 0*8(%rdi), XKEY0; movq 1*8(%rdi), XKEY1; movq 2*8(%rdi), XKEY2; movq 3*8(%rdi), XKEY3; movq 4*8(%rdi), XKEY4; movq 5*8(%rdi), XKEY5; movq 6*8(%rdi), XKEY6; movq 7*8(%rdi), XKEY7; movq XKEY0, RI1; movq XKEY1, RI2; movq XKEY2, RI3; movq XKEY3, RI4; /* prepare and store state */ pxor XKEY0, XSTATE0; pxor XKEY1, XSTATE1; pxor XKEY2, XSTATE2; pxor XKEY3, XSTATE3; pxor XKEY4, XSTATE4; pxor XKEY5, XSTATE5; pxor XKEY6, XSTATE6; pxor XKEY7, XSTATE7; movq XSTATE0, 0*8(%rdi); movq XSTATE1, 1*8(%rdi); movq XSTATE2, 2*8(%rdi); movq XSTATE3, 3*8(%rdi); movq XSTATE4, 4*8(%rdi); movq XSTATE5, 5*8(%rdi); movq XSTATE6, 6*8(%rdi); movq XSTATE7, 7*8(%rdi); addq $64, STACK_DATAP(%rsp); movl $(0), STACK_ROUNDS(%rsp); .align 8 .Lround_loop: do_whirl(mov, RI1 /*XKEY0*/, RB_ADD0, do_movq, XKEY4); do_whirl(xor, RI2 /*XKEY1*/, RB_ADD1, do_movq, XKEY5); do_whirl(xor, 
RI3 /*XKEY2*/, RB_ADD2, do_movq, XKEY6); do_whirl(xor, RI4 /*XKEY3*/, RB_ADD3, do_movq, XKEY7); do_whirl(xor, RI1 /*XKEY0*/, RB_ADD4, do_movq, XSTATE0); do_whirl(xor, RI2 /*XKEY1*/, RB_ADD5, do_movq, XSTATE1); do_whirl(xor, RI3 /*XKEY2*/, RB_ADD6, do_movq, XSTATE2); do_whirl(xor, RI4 /*XKEY3*/, RB_ADD7, do_movq, XSTATE3); movl STACK_ROUNDS(%rsp), RT0d; movq RB1, XKEY1; addl $1, STACK_ROUNDS(%rsp); movq RB2, XKEY2; movq RB3, XKEY3; xorq RC(RTAB,RT0,8), RB0; /* Add round constant */ movq RB4, XKEY4; movq RB5, XKEY5; movq RB0, XKEY0; movq RB6, XKEY6; movq RB7, XKEY7; do_whirl(xor, RI1 /*XSTATE0*/, RB_ADD0, do_movq, XSTATE4); do_whirl(xor, RI2 /*XSTATE1*/, RB_ADD1, do_movq, XSTATE5); do_whirl(xor, RI3 /*XSTATE2*/, RB_ADD2, do_movq, XSTATE6); do_whirl(xor, RI4 /*XSTATE3*/, RB_ADD3, do_movq, XSTATE7); cmpl $10, STACK_ROUNDS(%rsp); je .Lis_last_round; do_whirl(xor, RI1 /*XSTATE4*/, RB_ADD4, do_movq, XKEY0); do_whirl(xor, RI2 /*XSTATE5*/, RB_ADD5, do_movq, XKEY1); do_whirl(xor, RI3 /*XSTATE6*/, RB_ADD6, do_movq, XKEY2); do_whirl(xor, RI4 /*XSTATE7*/, RB_ADD7, do_movq, XKEY3); movq RB0, XSTATE0; movq RB1, XSTATE1; movq RB2, XSTATE2; movq RB3, XSTATE3; movq RB4, XSTATE4; movq RB5, XSTATE5; movq RB6, XSTATE6; movq RB7, XSTATE7; jmp .Lround_loop; .align 8 .Lis_last_round: do_whirl(xor, RI1 /*XSTATE4*/, RB_ADD4, dummy, _); movq STACK_STATEP(%rsp), RI1; do_whirl(xor, RI2 /*XSTATE5*/, RB_ADD5, dummy, _); do_whirl(xor, RI3 /*XSTATE6*/, RB_ADD6, dummy, _); do_whirl(xor, RI4 /*XSTATE7*/, RB_ADD7, dummy, _); /* store state */ xorq RB0, 0*8(RI1); xorq RB1, 1*8(RI1); xorq RB2, 2*8(RI1); xorq RB3, 3*8(RI1); xorq RB4, 4*8(RI1); xorq RB5, 5*8(RI1); xorq RB6, 6*8(RI1); xorq RB7, 7*8(RI1); subq $1, STACK_NBLKS(%rsp); jnz .Lblock_loop; movq STACK_RBP(%rsp), %rbp; movq STACK_RBX(%rsp), %rbx; movq STACK_R12(%rsp), %r12; movq STACK_R13(%rsp), %r13; movq STACK_R14(%rsp), %r14; movq STACK_R15(%rsp), %r15; + CFI_RESTORE(%rbp); + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); addq $STACK_MAX, %rsp; + CFI_ADJUST_CFA_OFFSET(-STACK_MAX); .Lskip: movl $(STACK_MAX + 8), %eax; ret; + CFI_ENDPROC(); ELF(.size _gcry_whirlpool_transform_amd64,.-_gcry_whirlpool_transform_amd64;) #endif #endif diff --git a/configure.ac b/configure.ac index b54b212b..1aafc320 100644 --- a/configure.ac +++ b/configure.ac @@ -1,2786 +1,2813 @@ # Configure.ac script for Libgcrypt # Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2006, # 2007, 2008, 2009, 2011 Free Software Foundation, Inc. # Copyright (C) 2012-2017 g10 Code GmbH # # This file is part of Libgcrypt. # # Libgcrypt is free software; you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation; either version 2.1 of # the License, or (at your option) any later version. # # Libgcrypt is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this program; if not, see . # (Process this file with autoconf to produce a configure script.) AC_REVISION($Revision$) AC_PREREQ(2.60) min_automake_version="1.14" # To build a release you need to create a tag with the version number # (git tag -s libgcrypt-n.m.k) and run "./autogen.sh --force". 
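The whirlpool prologue/epilogue above illustrate the annotation pattern these CFI additions follow throughout the patch: every spill of a callee-saved register is paired with a CFI_REL_OFFSET note, every reload with a CFI_RESTORE, and every %rsp adjustment with a matching CFI_ADJUST_CFA_OFFSET, so the DWARF unwinder can reconstruct the caller's frame from any instruction inside the function. Below is a minimal compile-only sketch of the same idea, written as file-scope inline asm with the raw .cfi_* directives; it is illustrative only and not the exact expansion of the macros used in the patch.

    /* cfi_demo.c -- sketch of the spill/reload annotation pattern.
     * Build with e.g. "gcc -c cfi_demo.c" on an x86-64 ELF toolchain. */
    __asm__ (
        ".text\n\t"
        "cfi_demo:\n\t"
        ".cfi_startproc\n\t"
        "subq $16, %rsp\n\t"             /* reserve two spill slots          */
        ".cfi_adjust_cfa_offset 16\n\t"  /* CFA is now 16 bytes above %rsp   */
        "movq %rbx, 0(%rsp)\n\t"
        ".cfi_rel_offset %rbx, 0\n\t"    /* caller's %rbx saved at 0(%rsp)   */
        "movq %rbp, 8(%rsp)\n\t"
        ".cfi_rel_offset %rbp, 8\n\t"    /* caller's %rbp saved at 8(%rsp)   */
        /* ... function body would go here ... */
        "movq 0(%rsp), %rbx\n\t"
        ".cfi_restore %rbx\n\t"          /* %rbx holds the caller value again */
        "movq 8(%rsp), %rbp\n\t"
        ".cfi_restore %rbp\n\t"
        "addq $16, %rsp\n\t"
        ".cfi_adjust_cfa_offset -16\n\t" /* CFA is back at %rsp              */
        "ret\n\t"
        ".cfi_endproc\n\t"
    );

The new HAVE_GCC_ASM_CFI_DIRECTIVES configure check added further down is what detects whether the assembler accepts these directives at all, presumably so the annotation macros can degrade to no-ops where it does not.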
Please # bump the version number immediately after the release and do another # commit and push so that the git magic is able to work. See below # for the LT versions. m4_define([mym4_package],[libgcrypt]) m4_define([mym4_major], [1]) m4_define([mym4_minor], [9]) m4_define([mym4_micro], [0]) # Below is m4 magic to extract and compute the git revision number, # the decimalized short revision number, a beta version string and a # flag indicating a development version (mym4_isbeta). Note that the # m4 processing is done by autoconf and not during the configure run. m4_define([mym4_verslist], m4_split(m4_esyscmd([./autogen.sh --find-version] \ mym4_package mym4_major mym4_minor mym4_micro),[:])) m4_define([mym4_isbeta], m4_argn(2, mym4_verslist)) m4_define([mym4_version], m4_argn(4, mym4_verslist)) m4_define([mym4_revision], m4_argn(7, mym4_verslist)) m4_define([mym4_revision_dec], m4_argn(8, mym4_verslist)) m4_esyscmd([echo ]mym4_version[>VERSION]) AC_INIT([mym4_package],[mym4_version], [https://bugs.gnupg.org]) # LT Version numbers, remember to change them just *before* a release. # (Code changed: REVISION++) # (Interfaces added/removed/changed: CURRENT++, REVISION=0) # (Interfaces added: AGE++) # (Interfaces removed: AGE=0) # # (Interfaces removed: CURRENT++, AGE=0, REVISION=0) # (Interfaces added: CURRENT++, AGE++, REVISION=0) # (No interfaces changed: REVISION++) LIBGCRYPT_LT_CURRENT=23 LIBGCRYPT_LT_AGE=3 LIBGCRYPT_LT_REVISION=0 ################################################ AC_SUBST(LIBGCRYPT_LT_CURRENT) AC_SUBST(LIBGCRYPT_LT_AGE) AC_SUBST(LIBGCRYPT_LT_REVISION) # If the API is changed in an incompatible way: increment the next counter. # # 1.6: ABI and API change but the change is to most users irrelevant # and thus the API version number has not been incremented. LIBGCRYPT_CONFIG_API_VERSION=1 # If you change the required gpg-error version, please remove # unnecessary error code defines in src/gcrypt-int.h. NEED_GPG_ERROR_VERSION=1.25 AC_CONFIG_AUX_DIR([build-aux]) AC_CONFIG_SRCDIR([src/libgcrypt.vers]) AM_INIT_AUTOMAKE([serial-tests dist-bzip2]) AC_CONFIG_HEADER(config.h) AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_LIBOBJ_DIR([compat]) AC_CANONICAL_HOST AM_MAINTAINER_MODE AM_SILENT_RULES AC_ARG_VAR(SYSROOT,[locate config scripts also below that directory]) AH_TOP([ #ifndef _GCRYPT_CONFIG_H_INCLUDED #define _GCRYPT_CONFIG_H_INCLUDED /* Enable gpg-error's strerror macro for W32CE. */ #define GPG_ERR_ENABLE_ERRNO_MACROS 1 ]) AH_BOTTOM([ #define _GCRYPT_IN_LIBGCRYPT 1 /* If the configure check for endianness has been disabled, get it from OS macros. This is intended for making fat binary builds on OS X. */ #ifdef DISABLED_ENDIAN_CHECK # if defined(__BIG_ENDIAN__) # define WORDS_BIGENDIAN 1 # elif defined(__LITTLE_ENDIAN__) # undef WORDS_BIGENDIAN # else # error "No endianness found" # endif #endif /*DISABLED_ENDIAN_CHECK*/ /* We basically use the original Camellia source. Make sure the symbols properly prefixed. */ #define CAMELLIA_EXT_SYM_PREFIX _gcry_ #endif /*_GCRYPT_CONFIG_H_INCLUDED*/ ]) AH_VERBATIM([_REENTRANT], [/* To allow the use of Libgcrypt in multithreaded programs we have to use special features from the library. */ #ifndef _REENTRANT # define _REENTRANT 1 #endif ]) ###################### ## Basic checks. ### (we need some results later on (e.g. 
$GCC) ###################### AC_PROG_MAKE_SET missing_dir=`cd $ac_aux_dir && pwd` AM_MISSING_PROG(ACLOCAL, aclocal, $missing_dir) AM_MISSING_PROG(AUTOCONF, autoconf, $missing_dir) AM_MISSING_PROG(AUTOMAKE, automake, $missing_dir) AM_MISSING_PROG(AUTOHEADER, autoheader, $missing_dir) # AM_MISSING_PROG(MAKEINFO, makeinfo, $missing_dir) AC_PROG_CC AC_PROG_CPP AM_PROG_CC_C_O AM_PROG_AS AC_ISC_POSIX AC_PROG_INSTALL AC_PROG_AWK AC_GNU_SOURCE # Taken from mpfr-4.0.1, then modified for LDADD_FOR_TESTS_KLUDGE dnl Under Linux, make sure that the old dtags are used if LD_LIBRARY_PATH dnl is defined. The issue is that with the new dtags, LD_LIBRARY_PATH has dnl the precedence over the run path, so that if a compatible MPFR library dnl is installed in some directory from $LD_LIBRARY_PATH, then the tested dnl MPFR library will be this library instead of the MPFR library from the dnl build tree. Other OS with the same issue might be added later. dnl dnl References: dnl https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=859732 dnl http://lists.gnu.org/archive/html/libtool/2017-05/msg00000.html dnl dnl We need to check whether --disable-new-dtags is supported as alternate dnl linkers may be used (e.g., with tcc: CC=tcc LD=tcc). dnl case $host in *-*-linux*) if test -n "$LD_LIBRARY_PATH"; then saved_LDFLAGS="$LDFLAGS" LDADD_FOR_TESTS_KLUDGE="-Wl,--disable-new-dtags" LDFLAGS="$LDFLAGS $LDADD_FOR_TESTS_KLUDGE" AC_MSG_CHECKING(whether --disable-new-dtags is supported by the linker) AC_LINK_IFELSE([AC_LANG_SOURCE([[ int main (void) { return 0; } ]])], [AC_MSG_RESULT(yes (use it since LD_LIBRARY_PATH is set))], [AC_MSG_RESULT(no) LDADD_FOR_TESTS_KLUDGE="" ]) LDFLAGS="$saved_LDFLAGS" fi ;; esac AC_SUBST([LDADD_FOR_TESTS_KLUDGE]) VERSION_NUMBER=m4_esyscmd(printf "0x%02x%02x%02x" mym4_major \ mym4_minor mym4_micro) AC_SUBST(VERSION_NUMBER) # We need to compile and run a program on the build machine. A # comment in libgpg-error says that the AC_PROG_CC_FOR_BUILD macro in # the AC archive is broken for autoconf 2.57. Given that there is no # newer version of that macro, we assume that it is also broken for # autoconf 2.61 and thus we use a simple but usually sufficient # approach. AC_MSG_CHECKING(for cc for build) if test "$cross_compiling" = "yes"; then CC_FOR_BUILD="${CC_FOR_BUILD-cc}" else CC_FOR_BUILD="${CC_FOR_BUILD-$CC}" fi AC_MSG_RESULT($CC_FOR_BUILD) AC_ARG_VAR(CC_FOR_BUILD,[build system C compiler]) LT_PREREQ([2.2.6]) LT_INIT([win32-dll disable-static]) LT_LANG([Windows Resource]) ########################## ## General definitions. ## ########################## # Used by libgcrypt-config LIBGCRYPT_CONFIG_LIBS="-lgcrypt" LIBGCRYPT_CONFIG_CFLAGS="" LIBGCRYPT_CONFIG_HOST="$host" # Definitions for symmetric ciphers. available_ciphers="arcfour blowfish cast5 des aes twofish serpent rfc2268 seed" available_ciphers="$available_ciphers camellia idea salsa20 gost28147 chacha20" enabled_ciphers="" # Definitions for public-key ciphers. available_pubkey_ciphers="dsa elgamal rsa ecc" enabled_pubkey_ciphers="" # Definitions for message digests. available_digests="crc gostr3411-94 md2 md4 md5 rmd160 sha1 sha256 sha512" available_digests="$available_digests sha3 tiger whirlpool stribog blake2" available_digests="$available_digests sm3" enabled_digests="" # Definitions for kdfs (optional ones) available_kdfs="s2k pkdf2 scrypt" enabled_kdfs="" # Definitions for random modules. available_random_modules="linux egd unix" auto_random_modules="$available_random_modules" # Supported thread backends. 
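Every name kept on the availability lists above ends up, after the --enable-* processing below, as a USE_<ALGO> macro in config.h; those macros, together with the HAVE_COMPATIBLE_GCC_*_PLATFORM_AS and ENABLE_*_SUPPORT results, are exactly what gates the assembly files patched earlier (compare the USE_TWOFISH/ENABLE_AVX2_SUPPORT and USE_WHIRLPOOL guards). A hedged C-level sketch of how a source file typically keys off them -- illustrative only, not libgcrypt's actual module registration:

    #include <config.h>

    /* Names of the symmetric ciphers this build was configured with.
     * Purely illustrative; the real library registers cipher specs,
     * not strings. */
    const char *const compiled_in_ciphers[] =
      {
    #ifdef USE_ARCFOUR
        "arcfour",
    #endif
    #ifdef USE_TWOFISH
        "twofish",
    #endif
    #if defined(USE_TWOFISH) && defined(ENABLE_AVX2_SUPPORT)
        "twofish (AVX2 path built in)",
    #endif
        NULL
      };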
LIBGCRYPT_THREAD_MODULES="" # Other definitions. have_w32_system=no have_w32ce_system=no have_pthread=no # Setup some stuff depending on host. case "${host}" in *-*-mingw32*) ac_cv_have_dev_random=no have_w32_system=yes case "${host}" in *-mingw32ce*) have_w32ce_system=yes available_random_modules="w32ce" ;; *) available_random_modules="w32" ;; esac AC_DEFINE(USE_ONLY_8DOT3,1, [set this to limit filenames to the 8.3 format]) AC_DEFINE(HAVE_DRIVE_LETTERS,1, [defined if we must run on a stupid file system]) AC_DEFINE(HAVE_DOSISH_SYSTEM,1, [defined if we run on some of the PCDOS like systems (DOS, Windoze. OS/2) with special properties like no file modes]) ;; i?86-emx-os2 | i?86-*-os2*emx) # OS/2 with the EMX environment ac_cv_have_dev_random=no AC_DEFINE(HAVE_DRIVE_LETTERS) AC_DEFINE(HAVE_DOSISH_SYSTEM) ;; i?86-*-msdosdjgpp*) # DOS with the DJGPP environment ac_cv_have_dev_random=no AC_DEFINE(HAVE_DRIVE_LETTERS) AC_DEFINE(HAVE_DOSISH_SYSTEM) ;; *-*-hpux*) if test -z "$GCC" ; then CFLAGS="$CFLAGS -Ae -D_HPUX_SOURCE" fi ;; *-dec-osf4*) if test -z "$GCC" ; then # Suppress all warnings # to get rid of the unsigned/signed char mismatch warnings. CFLAGS="$CFLAGS -w" fi ;; m68k-atari-mint) ;; *-apple-darwin*) AC_DEFINE(_DARWIN_C_SOURCE, 900000L, Expose all libc features (__DARWIN_C_FULL).) ;; *) ;; esac if test "$have_w32_system" = yes; then AC_DEFINE(HAVE_W32_SYSTEM,1, [Defined if we run on a W32 API based system]) if test "$have_w32ce_system" = yes; then AC_DEFINE(HAVE_W32CE_SYSTEM,1,[Defined if we run on WindowsCE]) fi fi AM_CONDITIONAL(HAVE_W32_SYSTEM, test "$have_w32_system" = yes) AM_CONDITIONAL(HAVE_W32CE_SYSTEM, test "$have_w32ce_system" = yes) # A printable OS Name is sometimes useful. case "${host}" in *-*-mingw32ce*) PRINTABLE_OS_NAME="W32CE" ;; *-*-mingw32*) PRINTABLE_OS_NAME="W32" ;; i?86-emx-os2 | i?86-*-os2*emx ) PRINTABLE_OS_NAME="OS/2" ;; i?86-*-msdosdjgpp*) PRINTABLE_OS_NAME="MSDOS/DJGPP" ;; *-linux*) PRINTABLE_OS_NAME="GNU/Linux" ;; *) PRINTABLE_OS_NAME=`uname -s || echo "Unknown"` ;; esac NAME_OF_DEV_RANDOM="/dev/random" NAME_OF_DEV_URANDOM="/dev/urandom" AC_ARG_ENABLE(endian-check, AC_HELP_STRING([--disable-endian-check], [disable the endian check and trust the OS provided macros]), endiancheck=$enableval,endiancheck=yes) if test x"$endiancheck" = xyes ; then AC_C_BIGENDIAN else AC_DEFINE(DISABLED_ENDIAN_CHECK,1,[configure did not test for endianness]) fi AC_CHECK_SIZEOF(unsigned short, 2) AC_CHECK_SIZEOF(unsigned int, 4) AC_CHECK_SIZEOF(unsigned long, 4) AC_CHECK_SIZEOF(unsigned long long, 0) AC_CHECK_SIZEOF(void *, 0) AC_TYPE_UINTPTR_T if test "$ac_cv_sizeof_unsigned_short" = "0" \ || test "$ac_cv_sizeof_unsigned_int" = "0" \ || test "$ac_cv_sizeof_unsigned_long" = "0"; then AC_MSG_WARN([Hmmm, something is wrong with the sizes - using defaults]); fi # Ensure that we have UINT64_C before we bother to check for uint64_t AC_CACHE_CHECK([for UINT64_C],[gnupg_cv_uint64_c_works], AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include ]], [[uint64_t foo=UINT64_C(42);]])], gnupg_cv_uint64_c_works=yes,gnupg_cv_uint64_c_works=no)) if test "$gnupg_cv_uint64_c_works" = "yes" ; then AC_CHECK_SIZEOF(uint64_t) fi # Do we have any 64-bit data types? if test "$ac_cv_sizeof_unsigned_int" != "8" \ && test "$ac_cv_sizeof_unsigned_long" != "8" \ && test "$ac_cv_sizeof_unsigned_long_long" != "8" \ && test "$ac_cv_sizeof_uint64_t" != "8"; then AC_MSG_ERROR([[ *** *** No 64-bit integer type available. *** It is not possible to build Libgcrypt on this platform. 
***]]) fi # If not specified otherwise, all available algorithms will be # included. default_ciphers="$available_ciphers" default_pubkey_ciphers="$available_pubkey_ciphers" default_digests="$available_digests" default_kdfs="$available_kdfs" # Blacklist MD2 by default default_digests=`echo $default_digests | sed -e 's/md2//g'` # Substitutions to set generated files in a Emacs buffer to read-only. AC_SUBST(emacs_local_vars_begin, ['Local Variables:']) AC_SUBST(emacs_local_vars_read_only, ['buffer-read-only: t']) AC_SUBST(emacs_local_vars_end, ['End:']) ############################ ## Command line switches. ## ############################ # Implementation of the --enable-ciphers switch. AC_ARG_ENABLE(ciphers, AC_HELP_STRING([--enable-ciphers=ciphers], [select the symmetric ciphers to include]), [enabled_ciphers=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`], [enabled_ciphers=""]) if test "x$enabled_ciphers" = "x" \ -o "$enabled_ciphers" = "yes" \ -o "$enabled_ciphers" = "no"; then enabled_ciphers=$default_ciphers fi AC_MSG_CHECKING([which symmetric ciphers to include]) for cipher in $enabled_ciphers; do LIST_MEMBER($cipher, $available_ciphers) if test "$found" = "0"; then AC_MSG_ERROR([unsupported cipher "$cipher" specified]) fi done AC_MSG_RESULT([$enabled_ciphers]) # Implementation of the --enable-pubkey-ciphers switch. AC_ARG_ENABLE(pubkey-ciphers, AC_HELP_STRING([--enable-pubkey-ciphers=ciphers], [select the public-key ciphers to include]), [enabled_pubkey_ciphers=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`], [enabled_pubkey_ciphers=""]) if test "x$enabled_pubkey_ciphers" = "x" \ -o "$enabled_pubkey_ciphers" = "yes" \ -o "$enabled_pubkey_ciphers" = "no"; then enabled_pubkey_ciphers=$default_pubkey_ciphers fi AC_MSG_CHECKING([which public-key ciphers to include]) for cipher in $enabled_pubkey_ciphers; do LIST_MEMBER($cipher, $available_pubkey_ciphers) if test "$found" = "0"; then AC_MSG_ERROR([unsupported public-key cipher specified]) fi done AC_MSG_RESULT([$enabled_pubkey_ciphers]) # Implementation of the --enable-digests switch. AC_ARG_ENABLE(digests, AC_HELP_STRING([--enable-digests=digests], [select the message digests to include]), [enabled_digests=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`], [enabled_digests=""]) if test "x$enabled_digests" = "x" \ -o "$enabled_digests" = "yes" \ -o "$enabled_digests" = "no"; then enabled_digests=$default_digests fi AC_MSG_CHECKING([which message digests to include]) for digest in $enabled_digests; do LIST_MEMBER($digest, $available_digests) if test "$found" = "0"; then AC_MSG_ERROR([unsupported message digest specified]) fi done AC_MSG_RESULT([$enabled_digests]) # Implementation of the --enable-kdfs switch. AC_ARG_ENABLE(kdfs, AC_HELP_STRING([--enable-kfds=kdfs], [select the KDFs to include]), [enabled_kdfs=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`], [enabled_kdfs=""]) if test "x$enabled_kdfs" = "x" \ -o "$enabled_kdfs" = "yes" \ -o "$enabled_kdfs" = "no"; then enabled_kdfs=$default_kdfs fi AC_MSG_CHECKING([which key derivation functions to include]) for kdf in $enabled_kdfs; do LIST_MEMBER($kdf, $available_kdfs) if test "$found" = "0"; then AC_MSG_ERROR([unsupported key derivation function specified]) fi done AC_MSG_RESULT([$enabled_kdfs]) # Implementation of the --enable-random switch. 
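The --enable-ciphers, --enable-pubkey-ciphers, --enable-digests and --enable-kdfs switches above all follow the same recipe: split the user's comma/colon separated list, lower-case it, and reject anything that is not on the corresponding available_* list via LIST_MEMBER. For readers more at home in C than in shell/m4, a rough equivalent of that validation step (a hypothetical helper, not part of the build system):

    #include <ctype.h>
    #include <stdio.h>
    #include <string.h>

    /* Return 0 if every ','/':' separated token in REQUEST appears in
     * AVAILABLE (a NULL-terminated list), mimicking the checks above. */
    static int
    check_requested (char *request, const char *const *available)
    {
      char *tok;

      for (tok = strtok (request, ",: "); tok; tok = strtok (NULL, ",: "))
        {
          const char *const *p;
          char *s;

          for (s = tok; *s; s++)
            *s = tolower ((unsigned char)*s);
          for (p = available; *p && strcmp (*p, tok); p++)
            ;
          if (!*p)
            {
              fprintf (stderr, "unsupported algorithm \"%s\" specified\n", tok);
              return -1;
            }
        }
      return 0;
    }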
AC_ARG_ENABLE(random, AC_HELP_STRING([--enable-random=name], [select which random number generator to use]), [random=`echo $enableval | tr '[A-Z]' '[a-z]'`], []) if test "x$random" = "x" -o "$random" = "yes" -o "$random" = "no"; then random=default fi AC_MSG_CHECKING([which random module to use]) if test "$random" != "default" -a "$random" != "auto"; then LIST_MEMBER($random, $available_random_modules) if test "$found" = "0"; then AC_MSG_ERROR([unsupported random module specified]) fi fi AC_MSG_RESULT($random) # Implementation of the --disable-dev-random switch. AC_MSG_CHECKING([whether use of /dev/random is requested]) AC_ARG_ENABLE(dev-random, [ --disable-dev-random disable the use of dev random], try_dev_random=$enableval, try_dev_random=yes) AC_MSG_RESULT($try_dev_random) # Implementation of the --with-egd-socket switch. AC_ARG_WITH(egd-socket, [ --with-egd-socket=NAME Use NAME for the EGD socket)], egd_socket_name="$withval", egd_socket_name="" ) AC_DEFINE_UNQUOTED(EGD_SOCKET_NAME, "$egd_socket_name", [Define if you don't want the default EGD socket name. For details see cipher/rndegd.c]) # Implementation of the --enable-random-daemon AC_MSG_CHECKING([whether the experimental random daemon is requested]) AC_ARG_ENABLE([random-daemon], AC_HELP_STRING([--enable-random-daemon], [Build and support the experimental gcryptrnd]), [use_random_daemon=$enableval], [use_random_daemon=no]) AC_MSG_RESULT($use_random_daemon) if test x$use_random_daemon = xyes ; then AC_DEFINE(USE_RANDOM_DAEMON,1, [Define to support the experimental random daemon]) fi AM_CONDITIONAL(USE_RANDOM_DAEMON, test x$use_random_daemon = xyes) # Implementation of --disable-asm. AC_MSG_CHECKING([whether MPI assembler modules are requested]) AC_ARG_ENABLE([asm], AC_HELP_STRING([--disable-asm], [Disable MPI assembler modules]), [try_asm_modules=$enableval], [try_asm_modules=yes]) AC_MSG_RESULT($try_asm_modules) # Implementation of the --enable-m-guard switch. AC_MSG_CHECKING([whether memory guard is requested]) AC_ARG_ENABLE(m-guard, AC_HELP_STRING([--enable-m-guard], [Enable memory guard facility]), [use_m_guard=$enableval], [use_m_guard=no]) AC_MSG_RESULT($use_m_guard) if test "$use_m_guard" = yes ; then AC_DEFINE(M_GUARD,1,[Define to use the (obsolete) malloc guarding feature]) fi # Implementation of the --enable-large-data-tests switch. AC_MSG_CHECKING([whether to run large data tests]) AC_ARG_ENABLE(large-data-tests, AC_HELP_STRING([--enable-large-data-tests], [Enable the real long ruinning large data tests]), large_data_tests=$enableval,large_data_tests=no) AC_MSG_RESULT($large_data_tests) AC_SUBST(RUN_LARGE_DATA_TESTS, $large_data_tests) # Implementation of the --with-capabilities switch. # Check whether we want to use Linux capabilities AC_MSG_CHECKING([whether use of capabilities is requested]) AC_ARG_WITH(capabilities, AC_HELP_STRING([--with-capabilities], [Use linux capabilities [default=no]]), [use_capabilities="$withval"],[use_capabilities=no]) AC_MSG_RESULT($use_capabilities) # Implementation of the --enable-hmac-binary-check. AC_MSG_CHECKING([whether a HMAC binary check is requested]) AC_ARG_ENABLE(hmac-binary-check, AC_HELP_STRING([--enable-hmac-binary-check], [Enable library integrity check]), [use_hmac_binary_check=$enableval], [use_hmac_binary_check=no]) AC_MSG_RESULT($use_hmac_binary_check) if test "$use_hmac_binary_check" = yes ; then AC_DEFINE(ENABLE_HMAC_BINARY_CHECK,1, [Define to support an HMAC based integrity check]) fi # Implementation of the --disable-jent-support switch. 
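Among the switches above, --with-egd-socket is different in kind from the boolean ones: it injects a string into config.h via AC_DEFINE_UNQUOTED, with an empty string meaning "use the built-in default" (see cipher/rndegd.c). A hedged sketch of how C code typically consumes such a configure-time string; the fallback path and the helper name here are purely hypothetical:

    #include <config.h>

    /* Pick the EGD socket path: the configure-supplied name if one was
     * given, otherwise a hypothetical built-in default. */
    const char *
    egd_socket_path (void)
    {
      static const char configured[] = EGD_SOCKET_NAME;

      return *configured ? configured : "/var/run/egd-pool";
    }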
AC_MSG_CHECKING([whether jitter entropy support is requested]) AC_ARG_ENABLE(jent-support, AC_HELP_STRING([--disable-jent-support], [Disable support for the Jitter entropy collector]), jentsupport=$enableval,jentsupport=yes) AC_MSG_RESULT($jentsupport) # Implementation of the --disable-padlock-support switch. AC_MSG_CHECKING([whether padlock support is requested]) AC_ARG_ENABLE(padlock-support, AC_HELP_STRING([--disable-padlock-support], [Disable support for the PadLock Engine of VIA processors]), padlocksupport=$enableval,padlocksupport=yes) AC_MSG_RESULT($padlocksupport) # Implementation of the --disable-aesni-support switch. AC_MSG_CHECKING([whether AESNI support is requested]) AC_ARG_ENABLE(aesni-support, AC_HELP_STRING([--disable-aesni-support], [Disable support for the Intel AES-NI instructions]), aesnisupport=$enableval,aesnisupport=yes) AC_MSG_RESULT($aesnisupport) # Implementation of the --disable-shaext-support switch. AC_MSG_CHECKING([whether SHAEXT support is requested]) AC_ARG_ENABLE(shaext-support, AC_HELP_STRING([--disable-shaext-support], [Disable support for the Intel SHAEXT instructions]), shaextsupport=$enableval,shaextsupport=yes) AC_MSG_RESULT($shaextsupport) # Implementation of the --disable-pclmul-support switch. AC_MSG_CHECKING([whether PCLMUL support is requested]) AC_ARG_ENABLE(pclmul-support, AC_HELP_STRING([--disable-pclmul-support], [Disable support for the Intel PCLMUL instructions]), pclmulsupport=$enableval,pclmulsupport=yes) AC_MSG_RESULT($pclmulsupport) # Implementation of the --disable-sse41-support switch. AC_MSG_CHECKING([whether SSE4.1 support is requested]) AC_ARG_ENABLE(sse41-support, AC_HELP_STRING([--disable-sse41-support], [Disable support for the Intel SSE4.1 instructions]), sse41support=$enableval,sse41support=yes) AC_MSG_RESULT($sse41support) # Implementation of the --disable-drng-support switch. AC_MSG_CHECKING([whether DRNG support is requested]) AC_ARG_ENABLE(drng-support, AC_HELP_STRING([--disable-drng-support], [Disable support for the Intel DRNG (RDRAND instruction)]), drngsupport=$enableval,drngsupport=yes) AC_MSG_RESULT($drngsupport) # Implementation of the --disable-avx-support switch. AC_MSG_CHECKING([whether AVX support is requested]) AC_ARG_ENABLE(avx-support, AC_HELP_STRING([--disable-avx-support], [Disable support for the Intel AVX instructions]), avxsupport=$enableval,avxsupport=yes) AC_MSG_RESULT($avxsupport) # Implementation of the --disable-avx2-support switch. AC_MSG_CHECKING([whether AVX2 support is requested]) AC_ARG_ENABLE(avx2-support, AC_HELP_STRING([--disable-avx2-support], [Disable support for the Intel AVX2 instructions]), avx2support=$enableval,avx2support=yes) AC_MSG_RESULT($avx2support) # Implementation of the --disable-neon-support switch. AC_MSG_CHECKING([whether NEON support is requested]) AC_ARG_ENABLE(neon-support, AC_HELP_STRING([--disable-neon-support], [Disable support for the ARM NEON instructions]), neonsupport=$enableval,neonsupport=yes) AC_MSG_RESULT($neonsupport) # Implementation of the --disable-arm-crypto-support switch. AC_MSG_CHECKING([whether ARMv8 Crypto Extension support is requested]) AC_ARG_ENABLE(arm-crypto-support, AC_HELP_STRING([--disable-arm-crypto-support], [Disable support for the ARMv8 Crypto Extension instructions]), armcryptosupport=$enableval,armcryptosupport=yes) AC_MSG_RESULT($armcryptosupport) # Implementation of the --disable-O-flag-munging switch. 
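The hardware switches above (AES-NI, SHAEXT, PCLMUL, SSE4.1, AVX, AVX2, NEON, ...) only control whether the accelerated code is compiled in at all; at run time the library additionally checks the CPU before calling into, say, the AVX2 Twofish OCB routines patched earlier. A simplified sketch of that two-level gating follows; the C prototype for the assembly entry point is an assumption derived from the register-usage comments in the .S file, and the dispatcher itself is hypothetical:

    #include <config.h>

    #if defined(USE_TWOFISH) && defined(ENABLE_AVX2_SUPPORT)
    /* Assumed prototype: %rdi=ctx, %rsi=abuf (16 blocks), %rdx=offset,
     * %rcx=checksum, %r8=L pointer table, per the comments in the asm. */
    extern void _gcry_twofish_avx2_ocb_auth (void *ctx, const void *abuf,
                                             void *offset, void *checksum,
                                             const void *Ls[16]);
    #endif

    /* Hypothetical dispatcher: use the AVX2 path only when it was built
     * in *and* the running CPU advertises AVX2 (hwf_avx2 != 0). */
    static int
    ocb_auth_16blocks (void *ctx, const void *abuf, void *offset,
                       void *checksum, const void *Ls[16],
                       unsigned int hwf_avx2)
    {
    #if defined(USE_TWOFISH) && defined(ENABLE_AVX2_SUPPORT)
      if (hwf_avx2)
        {
          _gcry_twofish_avx2_ocb_auth (ctx, abuf, offset, checksum, Ls);
          return 1;
        }
    #endif
      (void)ctx; (void)abuf; (void)offset; (void)checksum;
      (void)Ls; (void)hwf_avx2;
      return 0;   /* caller falls back to the generic implementation */
    }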
AC_MSG_CHECKING([whether a -O flag munging is requested]) AC_ARG_ENABLE([O-flag-munging], AC_HELP_STRING([--disable-O-flag-munging], [Disable modification of the cc -O flag]), [enable_o_flag_munging=$enableval], [enable_o_flag_munging=yes]) AC_MSG_RESULT($enable_o_flag_munging) AM_CONDITIONAL(ENABLE_O_FLAG_MUNGING, test "$enable_o_flag_munging" = "yes") # Implementation of the --disable-amd64-as-feature-detection switch. AC_MSG_CHECKING([whether to enable AMD64 as(1) feature detection]) AC_ARG_ENABLE(amd64-as-feature-detection, AC_HELP_STRING([--disable-amd64-as-feature-detection], [Disable the auto-detection of AMD64 as(1) features]), amd64_as_feature_detection=$enableval, amd64_as_feature_detection=yes) AC_MSG_RESULT($amd64_as_feature_detection) AC_DEFINE_UNQUOTED(PRINTABLE_OS_NAME, "$PRINTABLE_OS_NAME", [A human readable text with the name of the OS]) # For some systems we know that we have ld_version scripts. # Use it then as default. have_ld_version_script=no case "${host}" in *-*-linux*) have_ld_version_script=yes ;; *-*-gnu*) have_ld_version_script=yes ;; esac AC_ARG_ENABLE([ld-version-script], AC_HELP_STRING([--enable-ld-version-script], [enable/disable use of linker version script. (default is system dependent)]), [have_ld_version_script=$enableval], [ : ] ) AM_CONDITIONAL(HAVE_LD_VERSION_SCRIPT, test "$have_ld_version_script" = "yes") AC_DEFINE_UNQUOTED(NAME_OF_DEV_RANDOM, "$NAME_OF_DEV_RANDOM", [defined to the name of the strong random device]) AC_DEFINE_UNQUOTED(NAME_OF_DEV_URANDOM, "$NAME_OF_DEV_URANDOM", [defined to the name of the weaker random device]) ############################### #### Checks for libraries. #### ############################### # # gpg-error is required. # AM_PATH_GPG_ERROR("$NEED_GPG_ERROR_VERSION") if test "x$GPG_ERROR_LIBS" = "x"; then AC_MSG_ERROR([libgpg-error is needed. See ftp://ftp.gnupg.org/gcrypt/libgpg-error/ .]) fi AC_DEFINE(GPG_ERR_SOURCE_DEFAULT, GPG_ERR_SOURCE_GCRYPT, [The default error source for libgcrypt.]) # # Check whether the GNU Pth library is available. We require this # to build the optional gcryptrnd program. # AC_ARG_WITH(pth-prefix, AC_HELP_STRING([--with-pth-prefix=PFX], [prefix where GNU Pth is installed (optional)]), pth_config_prefix="$withval", pth_config_prefix="") if test x$pth_config_prefix != x ; then PTH_CONFIG="$pth_config_prefix/bin/pth-config" fi if test "$use_random_daemon" = "yes"; then AC_PATH_PROG(PTH_CONFIG, pth-config, no) if test "$PTH_CONFIG" = "no"; then AC_MSG_WARN([[ *** *** To build the Libgcrypt's random number daemon *** we need the support of the GNU Portable Threads Library. *** Download it from ftp://ftp.gnu.org/gnu/pth/ *** On a Debian GNU/Linux system you might want to try *** apt-get install libpth-dev ***]]) else GNUPG_PTH_VERSION_CHECK([1.3.7]) if test $have_pth = yes; then PTH_CFLAGS=`$PTH_CONFIG --cflags` PTH_LIBS=`$PTH_CONFIG --ldflags` PTH_LIBS="$PTH_LIBS `$PTH_CONFIG --libs --all`" AC_DEFINE(USE_GNU_PTH, 1, [Defined if the GNU Portable Thread Library should be used]) AC_DEFINE(HAVE_PTH, 1, [Defined if the GNU Pth is available]) fi fi fi AC_SUBST(PTH_CFLAGS) AC_SUBST(PTH_LIBS) # # Check whether pthreads is available # if test "$have_w32_system" != yes; then AC_CHECK_LIB(pthread,pthread_create,have_pthread=yes) if test "$have_pthread" = yes; then AC_DEFINE(HAVE_PTHREAD, 1 ,[Define if we have pthread.]) fi fi # Solaris needs -lsocket and -lnsl. Unisys system includes # gethostbyname in libsocket but needs libnsl for socket. 
AC_SEARCH_LIBS(setsockopt, [socket], , [AC_SEARCH_LIBS(setsockopt, [socket], , , [-lnsl])]) AC_SEARCH_LIBS(setsockopt, [nsl]) ################################## #### Checks for header files. #### ################################## AC_HEADER_STDC AC_CHECK_HEADERS(unistd.h sys/select.h sys/msg.h sys/auxv.h) INSERT_SYS_SELECT_H= if test x"$ac_cv_header_sys_select_h" = xyes; then INSERT_SYS_SELECT_H=" include " fi AC_SUBST(INSERT_SYS_SELECT_H) ########################################## #### Checks for typedefs, structures, #### #### and compiler characteristics. #### ########################################## AC_C_CONST AC_C_INLINE AC_TYPE_SIZE_T AC_TYPE_SIGNAL AC_DECL_SYS_SIGLIST AC_TYPE_PID_T GNUPG_CHECK_TYPEDEF(byte, HAVE_BYTE_TYPEDEF) GNUPG_CHECK_TYPEDEF(ushort, HAVE_USHORT_TYPEDEF) GNUPG_CHECK_TYPEDEF(ulong, HAVE_ULONG_TYPEDEF) GNUPG_CHECK_TYPEDEF(u16, HAVE_U16_TYPEDEF) GNUPG_CHECK_TYPEDEF(u32, HAVE_U32_TYPEDEF) gl_TYPE_SOCKLEN_T case "${host}" in *-*-mingw32*) # socklen_t may or may not be defined depending on what headers # are included. To be safe we use int as this is the actual type. FALLBACK_SOCKLEN_T="typedef int gcry_socklen_t;" ;; *) if test ".$gl_cv_socklen_t_equiv" = "."; then FALLBACK_SOCKLEN_T="typedef socklen_t gcry_socklen_t;" else FALLBACK_SOCKLEN_T="typedef ${gl_cv_socklen_t_equiv} gcry_socklen_t;" fi esac AC_SUBST(FALLBACK_SOCKLEN_T) # # Check for __builtin_bswap32 intrinsic. # AC_CACHE_CHECK(for __builtin_bswap32, [gcry_cv_have_builtin_bswap32], [gcry_cv_have_builtin_bswap32=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [int x = 0; int y = __builtin_bswap32(x); return y;])], [gcry_cv_have_builtin_bswap32=yes])]) if test "$gcry_cv_have_builtin_bswap32" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_BSWAP32,1, [Defined if compiler has '__builtin_bswap32' intrinsic]) fi # # Check for __builtin_bswap64 intrinsic. # AC_CACHE_CHECK(for __builtin_bswap64, [gcry_cv_have_builtin_bswap64], [gcry_cv_have_builtin_bswap64=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [long long x = 0; long long y = __builtin_bswap64(x); return y;])], [gcry_cv_have_builtin_bswap64=yes])]) if test "$gcry_cv_have_builtin_bswap64" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_BSWAP64,1, [Defined if compiler has '__builtin_bswap64' intrinsic]) fi # # Check for __builtin_ctz intrinsic. # AC_CACHE_CHECK(for __builtin_ctz, [gcry_cv_have_builtin_ctz], [gcry_cv_have_builtin_ctz=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [unsigned int x = 0; int y = __builtin_ctz(x); return y;])], [gcry_cv_have_builtin_ctz=yes])]) if test "$gcry_cv_have_builtin_ctz" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_CTZ, 1, [Defined if compiler has '__builtin_ctz' intrinsic]) fi # # Check for __sync_synchronize intrinsic. # AC_CACHE_CHECK(for __sync_synchronize, [gcry_cv_have_sync_synchronize], [gcry_cv_have_sync_synchronize=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [__sync_synchronize(); return 0;])], [gcry_cv_have_sync_synchronize=yes])]) if test "$gcry_cv_have_sync_synchronize" = "yes" ; then AC_DEFINE(HAVE_SYNC_SYNCHRONIZE, 1, [Defined if compiler has '__sync_synchronize' intrinsic]) fi # # Check for VLA support (variable length arrays). # AC_CACHE_CHECK(whether the variable length arrays are supported, [gcry_cv_have_vla], [gcry_cv_have_vla=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void f1(char *, int); char foo(int i) { char b[(i < 0 ? 0 : i) + 1]; f1(b, sizeof b); return b[0];}]])], [gcry_cv_have_vla=yes])]) if test "$gcry_cv_have_vla" = "yes" ; then AC_DEFINE(HAVE_VLA,1, [Defined if variable length arrays are supported]) fi # # Check for ELF visibility support. 
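The __builtin_bswap32/__builtin_bswap64 and __builtin_ctz probes just above feed HAVE_BUILTIN_* defines so that hot byte-order code can use the compiler builtins (typically a single instruction) when present and fall back to plain shifts otherwise. A generic sketch of that pattern keyed on HAVE_BUILTIN_BSWAP32; the fallback shown is the textbook version, not necessarily libgcrypt's exact helper:

    #include <config.h>

    typedef unsigned int u32;   /* assumed 32-bit, as checked earlier */

    static inline u32
    swap32 (u32 x)
    {
    #ifdef HAVE_BUILTIN_BSWAP32
      return __builtin_bswap32 (x);
    #else
      return (x << 24) | ((x & 0xff00U) << 8)
             | ((x >> 8) & 0xff00U) | (x >> 24);
    #endif
    }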
# AC_CACHE_CHECK(whether the visibility attribute is supported, gcry_cv_visibility_attribute, [gcry_cv_visibility_attribute=no AC_LANG_CONFTEST([AC_LANG_SOURCE( [[int foo __attribute__ ((visibility ("hidden"))) = 1; int bar __attribute__ ((visibility ("protected"))) = 1; ]])]) if ${CC-cc} -Werror -S conftest.c -o conftest.s \ 1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD ; then if grep '\.hidden.*foo' conftest.s >/dev/null 2>&1 ; then if grep '\.protected.*bar' conftest.s >/dev/null 2>&1; then gcry_cv_visibility_attribute=yes fi fi fi ]) if test "$gcry_cv_visibility_attribute" = "yes"; then AC_CACHE_CHECK(for broken visibility attribute, gcry_cv_broken_visibility_attribute, [gcry_cv_broken_visibility_attribute=yes AC_LANG_CONFTEST([AC_LANG_SOURCE( [[int foo (int x); int bar (int x) __asm__ ("foo") __attribute__ ((visibility ("hidden"))); int bar (int x) { return x; } ]])]) if ${CC-cc} -Werror -S conftest.c -o conftest.s \ 1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD ; then if grep '\.hidden@<:@ _@:>@foo' conftest.s >/dev/null 2>&1; then gcry_cv_broken_visibility_attribute=no fi fi ]) fi if test "$gcry_cv_visibility_attribute" = "yes"; then AC_CACHE_CHECK(for broken alias attribute, gcry_cv_broken_alias_attribute, [gcry_cv_broken_alias_attribute=yes AC_LANG_CONFTEST([AC_LANG_SOURCE( [[extern int foo (int x) __asm ("xyzzy"); int bar (int x) { return x; } extern __typeof (bar) foo __attribute ((weak, alias ("bar"))); extern int dfoo; extern __typeof (dfoo) dfoo __asm ("abccb"); int dfoo = 1; ]])]) if ${CC-cc} -Werror -S conftest.c -o conftest.s \ 1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD ; then if grep 'xyzzy' conftest.s >/dev/null 2>&1 && \ grep 'abccb' conftest.s >/dev/null 2>&1; then gcry_cv_broken_alias_attribute=no fi fi ]) fi if test "$gcry_cv_visibility_attribute" = "yes"; then AC_CACHE_CHECK(if gcc supports -fvisibility=hidden, gcry_cv_gcc_has_f_visibility, [gcry_cv_gcc_has_f_visibility=no _gcc_cflags_save=$CFLAGS CFLAGS="-fvisibility=hidden" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])], gcry_cv_gcc_has_f_visibility=yes) CFLAGS=$_gcc_cflags_save; ]) fi if test "$gcry_cv_visibility_attribute" = "yes" \ && test "$gcry_cv_broken_visibility_attribute" != "yes" \ && test "$gcry_cv_broken_alias_attribute" != "yes" \ && test "$gcry_cv_gcc_has_f_visibility" = "yes" then AC_DEFINE(GCRY_USE_VISIBILITY, 1, [Define to use the GNU C visibility attribute.]) CFLAGS="$CFLAGS -fvisibility=hidden" fi # Following attribute tests depend on warnings to cause compile to fail, # so set -Werror temporarily. 
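When all of the visibility probes above pass, the library is built with -fvisibility=hidden and GCRY_USE_VISIBILITY is defined, so only symbols explicitly re-marked as default visibility become part of the ELF export surface. A minimal sketch of that convention; the MY_EXPORT macro name is made up for illustration, and the real library uses dedicated wrapper sources rather than a one-liner like this:

    #include <config.h>

    #if defined(GCRY_USE_VISIBILITY) && defined(__GNUC__)
    # define MY_EXPORT __attribute__ ((visibility ("default")))
    #else
    # define MY_EXPORT
    #endif

    /* With -fvisibility=hidden in CFLAGS, this stays internal... */
    int internal_helper (int x) { return x + 1; }

    /* ...while an explicitly marked symbol remains visible to users. */
    MY_EXPORT int exported_entry_point (int x) { return internal_helper (x); }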
_gcc_cflags_save=$CFLAGS CFLAGS="$CFLAGS -Werror" # # Check whether the compiler supports the GCC style aligned attribute # AC_CACHE_CHECK([whether the GCC style aligned attribute is supported], [gcry_cv_gcc_attribute_aligned], [gcry_cv_gcc_attribute_aligned=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[struct { int a; } foo __attribute__ ((aligned (16)));]])], [gcry_cv_gcc_attribute_aligned=yes])]) if test "$gcry_cv_gcc_attribute_aligned" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_ALIGNED,1, [Defined if a GCC style "__attribute__ ((aligned (n))" is supported]) fi # # Check whether the compiler supports the GCC style packed attribute # AC_CACHE_CHECK([whether the GCC style packed attribute is supported], [gcry_cv_gcc_attribute_packed], [gcry_cv_gcc_attribute_packed=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[struct foolong_s { long b; } __attribute__ ((packed)); struct foo_s { char a; struct foolong_s b; } __attribute__ ((packed)); enum bar { FOO = 1 / (sizeof(struct foo_s) == (sizeof(char) + sizeof(long))), };]])], [gcry_cv_gcc_attribute_packed=yes])]) if test "$gcry_cv_gcc_attribute_packed" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_PACKED,1, [Defined if a GCC style "__attribute__ ((packed))" is supported]) fi # # Check whether the compiler supports the GCC style may_alias attribute # AC_CACHE_CHECK([whether the GCC style may_alias attribute is supported], [gcry_cv_gcc_attribute_may_alias], [gcry_cv_gcc_attribute_may_alias=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[typedef struct foo_s { int a; } __attribute__ ((may_alias)) foo_t;]])], [gcry_cv_gcc_attribute_may_alias=yes])]) if test "$gcry_cv_gcc_attribute_may_alias" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_MAY_ALIAS,1, [Defined if a GCC style "__attribute__ ((may_alias))" is supported]) fi # Restore flags. CFLAGS=$_gcc_cflags_save; # # Check whether the compiler supports 'asm' or '__asm__' keyword for # assembler blocks. # AC_CACHE_CHECK([whether 'asm' assembler keyword is supported], [gcry_cv_have_asm], [gcry_cv_have_asm=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { asm("":::"memory"); }]])], [gcry_cv_have_asm=yes])]) AC_CACHE_CHECK([whether '__asm__' assembler keyword is supported], [gcry_cv_have___asm__], [gcry_cv_have___asm__=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { __asm__("":::"memory"); }]])], [gcry_cv_have___asm__=yes])]) if test "$gcry_cv_have_asm" = "no" ; then if test "$gcry_cv_have___asm__" = "yes" ; then AC_DEFINE(asm,__asm__, [Define to supported assembler block keyword, if plain 'asm' was not supported]) fi fi # # Check whether the compiler supports inline assembly memory barrier. # if test "$gcry_cv_have_asm" = "no" ; then if test "$gcry_cv_have___asm__" = "yes" ; then AC_CACHE_CHECK([whether inline assembly memory barrier is supported], [gcry_cv_have_asm_volatile_memory], [gcry_cv_have_asm_volatile_memory=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { __asm__ volatile("":::"memory"); }]])], [gcry_cv_have_asm_volatile_memory=yes])]) fi else AC_CACHE_CHECK([whether inline assembly memory barrier is supported], [gcry_cv_have_asm_volatile_memory], [gcry_cv_have_asm_volatile_memory=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { asm volatile("":::"memory"); }]])], [gcry_cv_have_asm_volatile_memory=yes])]) fi if test "$gcry_cv_have_asm_volatile_memory" = "yes" ; then AC_DEFINE(HAVE_GCC_ASM_VOLATILE_MEMORY,1, [Define if inline asm memory barrier is supported]) fi # # Check whether GCC assembler supports features needed for our ARM # implementations. 
This needs to be done before setting up the # assembler stuff. # AC_CACHE_CHECK([whether GCC assembler is compatible for ARM assembly implementations], [gcry_cv_gcc_arm_platform_as_ok], [gcry_cv_gcc_arm_platform_as_ok=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[__asm__( /* Test if assembler supports UAL syntax. */ ".syntax unified\n\t" ".arm\n\t" /* our assembly code is in ARM mode */ /* Following causes error if assembler ignored '.syntax unified'. */ "asmfunc:\n\t" "add %r0, %r0, %r4, ror #12;\n\t" /* Test if '.type' and '.size' are supported. */ ".size asmfunc,.-asmfunc;\n\t" ".type asmfunc,%function;\n\t" );]])], [gcry_cv_gcc_arm_platform_as_ok=yes])]) if test "$gcry_cv_gcc_arm_platform_as_ok" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS,1, [Defined if underlying assembler is compatible with ARM assembly implementations]) fi # # Check whether GCC assembler supports features needed for our ARMv8/Aarch64 # implementations. This needs to be done before setting up the # assembler stuff. # AC_CACHE_CHECK([whether GCC assembler is compatible for ARMv8/Aarch64 assembly implementations], [gcry_cv_gcc_aarch64_platform_as_ok], [gcry_cv_gcc_aarch64_platform_as_ok=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[__asm__( "asmfunc:\n\t" "eor x0, x0, x30, ror #12;\n\t" "add x0, x0, x30, asr #12;\n\t" "eor v0.16b, v0.16b, v31.16b;\n\t" );]])], [gcry_cv_gcc_aarch64_platform_as_ok=yes])]) if test "$gcry_cv_gcc_aarch64_platform_as_ok" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS,1, [Defined if underlying assembler is compatible with ARMv8/Aarch64 assembly implementations]) fi +# +# Check whether GCC assembler supports for CFI directives. +# +AC_CACHE_CHECK([whether GCC assembler supports for CFI directives], + [gcry_cv_gcc_asm_cfi_directives], + [gcry_cv_gcc_asm_cfi_directives=no + AC_LINK_IFELSE([AC_LANG_PROGRAM( + [[__asm__( + ".text\n\t" + "ac_test:\n\t" + ".cfi_startproc\n\t" + ".cfi_remember_state\n\t" + ".cfi_adjust_cfa_offset 8\n\t" + ".cfi_rel_offset 0, 8\n\t" + ".cfi_def_cfa_register 1\n\t" + ".cfi_register 2, 3\n\t" + ".cfi_restore 2\n\t" + ".cfi_escape 0x0f, 0x02, 0x11, 0x00\n\t" + ".cfi_restore_state\n\t" + ".long 0\n\t" + ".cfi_endproc\n\t" + );]])], + [gcry_cv_gcc_asm_cfi_directives=yes])]) +if test "$gcry_cv_gcc_asm_cfi_directives" = "yes" ; then + AC_DEFINE(HAVE_GCC_ASM_CFI_DIRECTIVES,1, + [Defined if underlying assembler supports for CFI directives]) +fi + # # Check whether underscores in symbols are required. This needs to be # done before setting up the assembler stuff. # GNUPG_SYS_SYMBOL_UNDERSCORE() ################################# #### #### #### Setup assembler stuff. #### #### Define mpi_cpu_arch. #### #### #### ################################# AC_ARG_ENABLE(mpi-path, AC_HELP_STRING([--enable-mpi-path=EXTRA_PATH], [prepend EXTRA_PATH to list of CPU specific optimizations]), mpi_extra_path="$enableval",mpi_extra_path="") AC_MSG_CHECKING(architecture and mpi assembler functions) if test -f $srcdir/mpi/config.links ; then . 
$srcdir/mpi/config.links AC_CONFIG_LINKS("$mpi_ln_list") ac_cv_mpi_sflags="$mpi_sflags" AC_MSG_RESULT($mpi_cpu_arch) else AC_MSG_RESULT(failed) AC_MSG_ERROR([mpi/config.links missing!]) fi MPI_SFLAGS="$ac_cv_mpi_sflags" AC_SUBST(MPI_SFLAGS) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_ADD1, test "$mpi_mod_asm_mpih_add1" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_SUB1, test "$mpi_mod_asm_mpih_sub1" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_MUL1, test "$mpi_mod_asm_mpih_mul1" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_MUL2, test "$mpi_mod_asm_mpih_mul2" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_MUL3, test "$mpi_mod_asm_mpih_mul3" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_LSHIFT, test "$mpi_mod_asm_mpih_lshift" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_RSHIFT, test "$mpi_mod_asm_mpih_rshift" = yes) AM_CONDITIONAL(MPI_MOD_ASM_UDIV, test "$mpi_mod_asm_udiv" = yes) AM_CONDITIONAL(MPI_MOD_ASM_UDIV_QRNND, test "$mpi_mod_asm_udiv_qrnnd" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_ADD1, test "$mpi_mod_c_mpih_add1" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_SUB1, test "$mpi_mod_c_mpih_sub1" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_MUL1, test "$mpi_mod_c_mpih_mul1" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_MUL2, test "$mpi_mod_c_mpih_mul2" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_MUL3, test "$mpi_mod_c_mpih_mul3" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_LSHIFT, test "$mpi_mod_c_mpih_lshift" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_RSHIFT, test "$mpi_mod_c_mpih_rshift" = yes) AM_CONDITIONAL(MPI_MOD_C_UDIV, test "$mpi_mod_c_udiv" = yes) AM_CONDITIONAL(MPI_MOD_C_UDIV_QRNND, test "$mpi_mod_c_udiv_qrnnd" = yes) # Reset non applicable feature flags. if test "$mpi_cpu_arch" != "x86" ; then aesnisupport="n/a" shaextsupport="n/a" pclmulsupport="n/a" sse41support="n/a" avxsupport="n/a" avx2support="n/a" padlocksupport="n/a" jentsupport="n/a" drngsupport="n/a" fi if test "$mpi_cpu_arch" != "arm" ; then if test "$mpi_cpu_arch" != "aarch64" ; then neonsupport="n/a" armcryptosupport="n/a" fi fi ############################################# #### #### #### Platform specific compiler checks. #### #### #### ############################################# # Following tests depend on warnings to cause compile to fail, so set -Werror # temporarily. _gcc_cflags_save=$CFLAGS CFLAGS="$CFLAGS -Werror" # # Check whether compiler supports 'ms_abi' function attribute. # AC_CACHE_CHECK([whether compiler supports 'ms_abi' function attribute], [gcry_cv_gcc_attribute_ms_abi], [gcry_cv_gcc_attribute_ms_abi=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[int __attribute__ ((ms_abi)) proto(int);]])], [gcry_cv_gcc_attribute_ms_abi=yes])]) if test "$gcry_cv_gcc_attribute_ms_abi" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_MS_ABI,1, [Defined if compiler supports "__attribute__ ((ms_abi))" function attribute]) fi # # Check whether compiler supports 'sysv_abi' function attribute. # AC_CACHE_CHECK([whether compiler supports 'sysv_abi' function attribute], [gcry_cv_gcc_attribute_sysv_abi], [gcry_cv_gcc_attribute_sysv_abi=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[int __attribute__ ((sysv_abi)) proto(int);]])], [gcry_cv_gcc_attribute_sysv_abi=yes])]) if test "$gcry_cv_gcc_attribute_sysv_abi" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_SYSV_ABI,1, [Defined if compiler supports "__attribute__ ((sysv_abi))" function attribute]) fi # # Check whether default calling convention is 'ms_abi'. 
# if test "$gcry_cv_gcc_attribute_ms_abi" = "yes" ; then AC_CACHE_CHECK([whether default calling convention is 'ms_abi'], [gcry_cv_gcc_default_abi_is_ms_abi], [gcry_cv_gcc_default_abi_is_ms_abi=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void *test(void) { void *(*def_func)(void) = test; void *__attribute__((ms_abi))(*msabi_func)(void); /* warning on SysV abi targets, passes on Windows based targets */ msabi_func = def_func; return msabi_func; }]])], [gcry_cv_gcc_default_abi_is_ms_abi=yes])]) if test "$gcry_cv_gcc_default_abi_is_ms_abi" = "yes" ; then AC_DEFINE(HAVE_GCC_DEFAULT_ABI_IS_MS_ABI,1, [Defined if default calling convention is 'ms_abi']) fi fi # # Check whether default calling convention is 'sysv_abi'. # if test "$gcry_cv_gcc_attribute_sysv_abi" = "yes" ; then AC_CACHE_CHECK([whether default calling convention is 'sysv_abi'], [gcry_cv_gcc_default_abi_is_sysv_abi], [gcry_cv_gcc_default_abi_is_sysv_abi=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void *test(void) { void *(*def_func)(void) = test; void *__attribute__((sysv_abi))(*sysvabi_func)(void); /* warning on MS ABI targets, passes on SysV ABI targets */ sysvabi_func = def_func; return sysvabi_func; }]])], [gcry_cv_gcc_default_abi_is_sysv_abi=yes])]) if test "$gcry_cv_gcc_default_abi_is_sysv_abi" = "yes" ; then AC_DEFINE(HAVE_GCC_DEFAULT_ABI_IS_SYSV_ABI,1, [Defined if default calling convention is 'sysv_abi']) fi fi # Restore flags. CFLAGS=$_gcc_cflags_save; # # Check whether GCC inline assembler supports SSSE3 instructions # This is required for the AES-NI instructions. # AC_CACHE_CHECK([whether GCC inline assembler supports SSSE3 instructions], [gcry_cv_gcc_inline_asm_ssse3], [if test "$mpi_cpu_arch" != "x86" ; then gcry_cv_gcc_inline_asm_ssse3="n/a" else gcry_cv_gcc_inline_asm_ssse3=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[static unsigned char be_mask[16] __attribute__ ((aligned (16))) = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; void a(void) { __asm__("pshufb %[mask], %%xmm2\n\t"::[mask]"m"(*be_mask):); }]])], [gcry_cv_gcc_inline_asm_ssse3=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_ssse3" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_SSSE3,1, [Defined if inline assembler supports SSSE3 instructions]) fi # # Check whether GCC inline assembler supports PCLMUL instructions. # AC_CACHE_CHECK([whether GCC inline assembler supports PCLMUL instructions], [gcry_cv_gcc_inline_asm_pclmul], [if test "$mpi_cpu_arch" != "x86" ; then gcry_cv_gcc_inline_asm_pclmul="n/a" else gcry_cv_gcc_inline_asm_pclmul=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { __asm__("pclmulqdq \$0, %%xmm1, %%xmm3\n\t":::"cc"); }]])], [gcry_cv_gcc_inline_asm_pclmul=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_pclmul" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_PCLMUL,1, [Defined if inline assembler supports PCLMUL instructions]) fi # # Check whether GCC inline assembler supports SHA Extensions instructions. 
# AC_CACHE_CHECK([whether GCC inline assembler supports SHA Extensions instructions], [gcry_cv_gcc_inline_asm_shaext], [if test "$mpi_cpu_arch" != "x86" ; then gcry_cv_gcc_inline_asm_shaext="n/a" else gcry_cv_gcc_inline_asm_shaext=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { __asm__("sha1rnds4 \$0, %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha1nexte %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha1msg1 %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha1msg2 %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha256rnds2 %%xmm0, %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha256msg1 %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha256msg2 %%xmm1, %%xmm3\n\t":::"cc"); }]])], [gcry_cv_gcc_inline_asm_shaext=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_shaext" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_SHAEXT,1, [Defined if inline assembler supports SHA Extensions instructions]) fi # # Check whether GCC inline assembler supports SSE4.1 instructions. # AC_CACHE_CHECK([whether GCC inline assembler supports SSE4.1 instructions], [gcry_cv_gcc_inline_asm_sse41], [if test "$mpi_cpu_arch" != "x86" ; then gcry_cv_gcc_inline_asm_sse41="n/a" else gcry_cv_gcc_inline_asm_sse41=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { int i; __asm__("pextrd \$2, %%xmm0, %[out]\n\t" : [out] "=m" (i)); }]])], [gcry_cv_gcc_inline_asm_sse41=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_sse41" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_SSE41,1, [Defined if inline assembler supports SSE4.1 instructions]) fi # # Check whether GCC inline assembler supports AVX instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AVX instructions], [gcry_cv_gcc_inline_asm_avx], [if test "$mpi_cpu_arch" != "x86" ; then gcry_cv_gcc_inline_asm_avx="n/a" else gcry_cv_gcc_inline_asm_avx=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { __asm__("xgetbv; vaesdeclast (%[mem]),%%xmm0,%%xmm7\n\t"::[mem]"r"(0):); }]])], [gcry_cv_gcc_inline_asm_avx=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_avx" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AVX,1, [Defined if inline assembler supports AVX instructions]) fi # # Check whether GCC inline assembler supports AVX2 instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AVX2 instructions], [gcry_cv_gcc_inline_asm_avx2], [if test "$mpi_cpu_arch" != "x86" ; then gcry_cv_gcc_inline_asm_avx2="n/a" else gcry_cv_gcc_inline_asm_avx2=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { __asm__("xgetbv; vpbroadcastb %%xmm7,%%ymm1\n\t":::"cc"); }]])], [gcry_cv_gcc_inline_asm_avx2=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_avx2" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AVX2,1, [Defined if inline assembler supports AVX2 instructions]) fi # # Check whether GCC inline assembler supports BMI2 instructions # AC_CACHE_CHECK([whether GCC inline assembler supports BMI2 instructions], [gcry_cv_gcc_inline_asm_bmi2], [if test "$mpi_cpu_arch" != "x86" ; then gcry_cv_gcc_inline_asm_bmi2="n/a" else gcry_cv_gcc_inline_asm_bmi2=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[unsigned int a(unsigned int x, unsigned int y) { unsigned int tmp1, tmp2; asm ("rorxl %2, %1, %0" : "=r" (tmp1) : "rm0" (x), "J" (32 - ((23) & 31))); asm ("andnl %2, %1, %0" : "=r" (tmp2) : "r0" (x), "rm" (y)); return tmp1 + tmp2; }]])], [gcry_cv_gcc_inline_asm_bmi2=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_bmi2" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_BMI2,1, [Defined if inline assembler supports BMI2 instructions]) fi # # Check whether GCC assembler needs "-Wa,--divide" to correctly handle # constant division # if test $amd64_as_feature_detection = yes; 
then AC_CACHE_CHECK([whether GCC assembler handles division correctly], [gcry_cv_gcc_as_const_division_ok], [gcry_cv_gcc_as_const_division_ok=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[__asm__("xorl \$(123456789/12345678), %ebp;\n\t");]])], [gcry_cv_gcc_as_const_division_ok=yes])]) if test "$gcry_cv_gcc_as_const_division_ok" = "no" ; then # # Add '-Wa,--divide' to CPPFLAGS and try check again. # _gcc_cppflags_save="$CPPFLAGS" CPPFLAGS="$CPPFLAGS -Wa,--divide" AC_CACHE_CHECK([whether GCC assembler handles division correctly with "-Wa,--divide"], [gcry_cv_gcc_as_const_division_with_wadivide_ok], [gcry_cv_gcc_as_const_division_with_wadivide_ok=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[__asm__("xorl \$(123456789/12345678), %ebp;\n\t");]])], [gcry_cv_gcc_as_const_division_with_wadivide_ok=yes])]) if test "$gcry_cv_gcc_as_const_division_with_wadivide_ok" = "no" ; then # '-Wa,--divide' did not work, restore old flags. CPPFLAGS="$_gcc_cppflags_save" fi fi fi # # Check whether GCC assembler supports features needed for our amd64 # implementations # if test $amd64_as_feature_detection = yes; then AC_CACHE_CHECK([whether GCC assembler is compatible for amd64 assembly implementations], [gcry_cv_gcc_amd64_platform_as_ok], [if test "$mpi_cpu_arch" != "x86" ; then gcry_cv_gcc_amd64_platform_as_ok="n/a" else gcry_cv_gcc_amd64_platform_as_ok=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[__asm__( /* Test if '.type' and '.size' are supported. */ /* These work only on ELF targets. */ "asmfunc:\n\t" ".size asmfunc,.-asmfunc;\n\t" ".type asmfunc,@function;\n\t" /* Test if assembler allows use of '/' for constant division * (Solaris/x86 issue). If previous constant division check * and "-Wa,--divide" workaround failed, this causes assembly * to be disable on this machine. */ "xorl \$(123456789/12345678), %ebp;\n\t" );]])], [gcry_cv_gcc_amd64_platform_as_ok=yes]) fi]) if test "$gcry_cv_gcc_amd64_platform_as_ok" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS,1, [Defined if underlying assembler is compatible with amd64 assembly implementations]) fi if test "$gcry_cv_gcc_amd64_platform_as_ok" = "no" && test "$gcry_cv_gcc_attribute_sysv_abi" = "yes" && test "$gcry_cv_gcc_default_abi_is_ms_abi" = "yes"; then AC_CACHE_CHECK([whether GCC assembler is compatible for WIN64 assembly implementations], [gcry_cv_gcc_win64_platform_as_ok], [gcry_cv_gcc_win64_platform_as_ok=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[__asm__( ".globl asmfunc\n\t" "asmfunc:\n\t" "xorq \$(1234), %rbp;\n\t" );]])], [gcry_cv_gcc_win64_platform_as_ok=yes])]) if test "$gcry_cv_gcc_win64_platform_as_ok" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS,1, [Defined if underlying assembler is compatible with WIN64 assembly implementations]) fi fi fi # # Check whether GCC assembler supports features needed for assembly # implementations that use Intel syntax # AC_CACHE_CHECK([whether GCC assembler is compatible for Intel syntax assembly implementations], [gcry_cv_gcc_platform_as_ok_for_intel_syntax], [if test "$mpi_cpu_arch" != "x86" ; then gcry_cv_gcc_platform_as_ok_for_intel_syntax="n/a" else gcry_cv_gcc_platform_as_ok_for_intel_syntax=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[__asm__( ".intel_syntax noprefix\n\t" "pxor xmm1, xmm7;\n\t" /* Intel syntax implementation also use GAS macros, so check * for them here. 
*/ "VAL_A = xmm4\n\t" "VAL_B = xmm2\n\t" ".macro SET_VAL_A p1\n\t" " VAL_A = \\\\p1 \n\t" ".endm\n\t" ".macro SET_VAL_B p1\n\t" " VAL_B = \\\\p1 \n\t" ".endm\n\t" "vmovdqa VAL_A, VAL_B;\n\t" "SET_VAL_A eax\n\t" "SET_VAL_B ebp\n\t" "add VAL_A, VAL_B;\n\t" "add VAL_B, 0b10101;\n\t" );]])], [gcry_cv_gcc_platform_as_ok_for_intel_syntax=yes]) fi]) if test "$gcry_cv_gcc_platform_as_ok_for_intel_syntax" = "yes" ; then AC_DEFINE(HAVE_INTEL_SYNTAX_PLATFORM_AS,1, [Defined if underlying assembler is compatible with Intel syntax assembly implementations]) fi - # # Check whether compiler is configured for ARMv6 or newer architecture # AC_CACHE_CHECK([whether compiler is configured for ARMv6 or newer architecture], [gcry_cv_cc_arm_arch_is_v6], [if test "$mpi_cpu_arch" != "arm" ; then gcry_cv_cc_arm_arch_is_v6="n/a" else gcry_cv_cc_arm_arch_is_v6=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[ #if defined(__arm__) && \ ((defined(__ARM_ARCH) && __ARM_ARCH >= 6) \ || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \ || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) \ || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \ || defined(__ARM_ARCH_7EM__)) /* empty */ #else /* fail compile if not ARMv6. */ not_armv6 not_armv6 = (not_armv6)not_armv6; #endif ]])], [gcry_cv_cc_arm_arch_is_v6=yes]) fi]) if test "$gcry_cv_cc_arm_arch_is_v6" = "yes" ; then AC_DEFINE(HAVE_ARM_ARCH_V6,1, [Defined if ARM architecture is v6 or newer]) fi # # Check whether GCC inline assembler supports NEON instructions # AC_CACHE_CHECK([whether GCC inline assembler supports NEON instructions], [gcry_cv_gcc_inline_asm_neon], [if test "$mpi_cpu_arch" != "arm" ; then gcry_cv_gcc_inline_asm_neon="n/a" else gcry_cv_gcc_inline_asm_neon=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[__asm__( ".syntax unified\n\t" ".arm\n\t" ".fpu neon\n\t" "vld1.64 {%q0-%q1}, [%r0]!;\n\t" "vrev64.8 %q0, %q3;\n\t" "vadd.u64 %q0, %q1;\n\t" "vadd.s64 %d3, %d2, %d3;\n\t" ); ]])], [gcry_cv_gcc_inline_asm_neon=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_neon" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_NEON,1, [Defined if inline assembler supports NEON instructions]) fi # # Check whether GCC inline assembler supports AArch32 Crypto Extension instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AArch32 Crypto Extension instructions], [gcry_cv_gcc_inline_asm_aarch32_crypto], [if test "$mpi_cpu_arch" != "arm" ; then gcry_cv_gcc_inline_asm_aarch32_crypto="n/a" else gcry_cv_gcc_inline_asm_aarch32_crypto=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[__asm__( ".syntax unified\n\t" ".arch armv8-a\n\t" ".arm\n\t" ".fpu crypto-neon-fp-armv8\n\t" "sha1h.32 q0, q0;\n\t" "sha1c.32 q0, q0, q0;\n\t" "sha1p.32 q0, q0, q0;\n\t" "sha1su0.32 q0, q0, q0;\n\t" "sha1su1.32 q0, q0;\n\t" "sha256h.32 q0, q0, q0;\n\t" "sha256h2.32 q0, q0, q0;\n\t" "sha1p.32 q0, q0, q0;\n\t" "sha256su0.32 q0, q0;\n\t" "sha256su1.32 q0, q0, q15;\n\t" "aese.8 q0, q0;\n\t" "aesd.8 q0, q0;\n\t" "aesmc.8 q0, q0;\n\t" "aesimc.8 q0, q0;\n\t" "vmull.p64 q0, d0, d0;\n\t" ); ]])], [gcry_cv_gcc_inline_asm_aarch32_crypto=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_aarch32_crypto" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO,1, [Defined if inline assembler supports AArch32 Crypto Extension instructions]) fi # # Check whether GCC inline assembler supports AArch64 NEON instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 NEON instructions], 
[gcry_cv_gcc_inline_asm_aarch64_neon], [if test "$mpi_cpu_arch" != "aarch64" ; then gcry_cv_gcc_inline_asm_aarch64_neon="n/a" else gcry_cv_gcc_inline_asm_aarch64_neon=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[__asm__( ".cpu generic+simd\n\t" "mov w0, \#42;\n\t" "dup v0.8b, w0;\n\t" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b},[x0],\#32;\n\t" ); ]])], [gcry_cv_gcc_inline_asm_aarch64_neon=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_aarch64_neon" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH64_NEON,1, [Defined if inline assembler supports AArch64 NEON instructions]) fi # # Check whether GCC inline assembler supports AArch64 Crypto Extension instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 Crypto Extension instructions], [gcry_cv_gcc_inline_asm_aarch64_crypto], [if test "$mpi_cpu_arch" != "aarch64" ; then gcry_cv_gcc_inline_asm_aarch64_crypto="n/a" else gcry_cv_gcc_inline_asm_aarch64_crypto=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[__asm__( ".cpu generic+simd+crypto\n\t" "mov w0, \#42;\n\t" "dup v0.8b, w0;\n\t" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b},[x0],\#32;\n\t" "sha1h s0, s0;\n\t" "sha1c q0, s0, v0.4s;\n\t" "sha1p q0, s0, v0.4s;\n\t" "sha1su0 v0.4s, v0.4s, v0.4s;\n\t" "sha1su1 v0.4s, v0.4s;\n\t" "sha256h q0, q0, v0.4s;\n\t" "sha256h2 q0, q0, v0.4s;\n\t" "sha1p q0, s0, v0.4s;\n\t" "sha256su0 v0.4s, v0.4s;\n\t" "sha256su1 v0.4s, v0.4s, v31.4s;\n\t" "aese v0.16b, v0.16b;\n\t" "aesd v0.16b, v0.16b;\n\t" "aesmc v0.16b, v0.16b;\n\t" "aesimc v0.16b, v0.16b;\n\t" "pmull v0.1q, v0.1d, v31.1d;\n\t" "pmull2 v0.1q, v0.2d, v31.2d;\n\t" ); ]])], [gcry_cv_gcc_inline_asm_aarch64_crypto=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_aarch64_crypto" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO,1, [Defined if inline assembler supports AArch64 Crypto Extension instructions]) fi ####################################### #### Checks for library functions. #### ####################################### AC_FUNC_VPRINTF # We have replacements for these in src/missing-string.c AC_CHECK_FUNCS(stpcpy strcasecmp) # We have replacements for these in src/g10lib.h AC_CHECK_FUNCS(strtoul memmove stricmp atexit raise) # Other checks AC_CHECK_FUNCS(strerror rand mmap getpagesize sysconf waitpid wait4) AC_CHECK_FUNCS(gettimeofday getrusage gethrtime clock_gettime syslog) AC_CHECK_FUNCS(syscall fcntl ftruncate flockfile getauxval elf_aux_info) AC_CHECK_FUNCS(explicit_bzero getentropy) GNUPG_CHECK_MLOCK # # Replacement functions. # AC_REPLACE_FUNCS([getpid clock]) # # Check whether it is necessary to link against libdl. # DL_LIBS="" if test "$use_hmac_binary_check" = yes ; then _gcry_save_libs="$LIBS" LIBS="" AC_SEARCH_LIBS(dlopen, c dl,,,) DL_LIBS=$LIBS LIBS="$_gcry_save_libs" LIBGCRYPT_CONFIG_LIBS="${LIBGCRYPT_CONFIG_LIBS} ${DL_LIBS}" fi AC_SUBST(DL_LIBS) # # Check whether we can use Linux capabilities as requested. # if test "$use_capabilities" = "yes" ; then use_capabilities=no AC_CHECK_HEADERS(sys/capability.h) if test "$ac_cv_header_sys_capability_h" = "yes" ; then AC_CHECK_LIB(cap, cap_init, ac_need_libcap=1) if test "$ac_cv_lib_cap_cap_init" = "yes"; then AC_DEFINE(USE_CAPABILITIES,1, [define if capabilities should be used]) LIBS="$LIBS -lcap" use_capabilities=yes fi fi if test "$use_capabilities" = "no" ; then AC_MSG_WARN([[ *** *** The use of capabilities on this system is not possible. 
*** You need a recent Linux kernel and some patches: *** fcaps-2.2.9-990610.patch (kernel patch for 2.2.9) *** fcap-module-990613.tar.gz (kernel module) *** libcap-1.92.tar.gz (user mode library and utilities) *** And you have to configure the kernel with CONFIG_VFS_CAP_PLUGIN *** set (filesystems menu). Be warned: This code is *really* ALPHA. ***]]) fi fi # Check whether a random device is available. if test "$try_dev_random" = yes ; then AC_CACHE_CHECK(for random device, ac_cv_have_dev_random, [if test -r "$NAME_OF_DEV_RANDOM" && test -r "$NAME_OF_DEV_URANDOM" ; then ac_cv_have_dev_random=yes; else ac_cv_have_dev_random=no; fi]) if test "$ac_cv_have_dev_random" = yes; then AC_DEFINE(HAVE_DEV_RANDOM,1, [defined if the system supports a random device] ) fi else AC_MSG_CHECKING(for random device) ac_cv_have_dev_random=no AC_MSG_RESULT(has been disabled) fi # Figure out the random modules for this configuration. if test "$random" = "default"; then # Select default value. if test "$ac_cv_have_dev_random" = yes; then # Try Linuxish random device. random_modules="linux" else case "${host}" in *-*-mingw32ce*) # WindowsCE random device. random_modules="w32ce" ;; *-*-mingw32*|*-*-cygwin*) # Windows random device. random_modules="w32" ;; *) # Build everything, allow to select at runtime. random_modules="$auto_random_modules" ;; esac fi else if test "$random" = "auto"; then # Build everything, allow to select at runtime. random_modules="$auto_random_modules" else random_modules="$random" fi fi # # Other defines # if test mym4_isgit = "yes"; then AC_DEFINE(IS_DEVELOPMENT_VERSION,1, [Defined if this is not a regular release]) fi AM_CONDITIONAL(CROSS_COMPILING, test x$cross_compiling = xyes) # This is handy for debugging so the compiler doesn't rearrange # things and eliminate variables. AC_ARG_ENABLE(optimization, AC_HELP_STRING([--disable-optimization], [disable compiler optimization]), [if test $enableval = no ; then CFLAGS=`echo $CFLAGS | sed 's/-O[[0-9]]//'` fi]) AC_MSG_NOTICE([checking for cc features]) # CFLAGS mangling when using gcc. if test "$GCC" = yes; then AC_MSG_CHECKING([if gcc supports -fno-delete-null-pointer-checks]) _gcc_cflags_save=$CFLAGS CFLAGS="-fno-delete-null-pointer-checks" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])],_gcc_wopt=yes,_gcc_wopt=no) AC_MSG_RESULT($_gcc_wopt) CFLAGS=$_gcc_cflags_save; if test x"$_gcc_wopt" = xyes ; then CFLAGS="$CFLAGS -fno-delete-null-pointer-checks" fi CFLAGS="$CFLAGS -Wall" if test "$USE_MAINTAINER_MODE" = "yes"; then CFLAGS="$CFLAGS -Wcast-align -Wshadow -Wstrict-prototypes" CFLAGS="$CFLAGS -Wformat -Wno-format-y2k -Wformat-security" # If -Wno-missing-field-initializers is supported we can enable a # a bunch of really useful warnings. 
AC_MSG_CHECKING([if gcc supports -Wno-missing-field-initializers]) _gcc_cflags_save=$CFLAGS CFLAGS="-Wno-missing-field-initializers" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])],_gcc_wopt=yes,_gcc_wopt=no) AC_MSG_RESULT($_gcc_wopt) CFLAGS=$_gcc_cflags_save; if test x"$_gcc_wopt" = xyes ; then CFLAGS="$CFLAGS -W -Wextra -Wbad-function-cast" CFLAGS="$CFLAGS -Wwrite-strings" CFLAGS="$CFLAGS -Wdeclaration-after-statement" CFLAGS="$CFLAGS -Wno-missing-field-initializers" CFLAGS="$CFLAGS -Wno-sign-compare" fi AC_MSG_CHECKING([if gcc supports -Wpointer-arith]) _gcc_cflags_save=$CFLAGS CFLAGS="-Wpointer-arith" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])],_gcc_wopt=yes,_gcc_wopt=no) AC_MSG_RESULT($_gcc_wopt) CFLAGS=$_gcc_cflags_save; if test x"$_gcc_wopt" = xyes ; then CFLAGS="$CFLAGS -Wpointer-arith" fi fi fi # Check whether as(1) supports a noexecstack feature. This test # includes an override option. CL_AS_NOEXECSTACK AC_SUBST(LIBGCRYPT_CONFIG_API_VERSION) AC_SUBST(LIBGCRYPT_CONFIG_LIBS) AC_SUBST(LIBGCRYPT_CONFIG_CFLAGS) AC_SUBST(LIBGCRYPT_CONFIG_HOST) AC_SUBST(LIBGCRYPT_THREAD_MODULES) AC_CONFIG_COMMANDS([gcrypt-conf],[[ chmod +x src/libgcrypt-config ]],[[ prefix=$prefix exec_prefix=$exec_prefix libdir=$libdir datadir=$datadir DATADIRNAME=$DATADIRNAME ]]) ##################### #### Conclusion. #### ##################### # Check that requested features can actually be used and define # ENABLE_foo_SUPPORT macros. if test x"$aesnisupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_ssse3" != "yes" ; then aesnisupport="no (unsupported by compiler)" fi fi if test x"$shaextsupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_shaext" != "yes" ; then shaextsupport="no (unsupported by compiler)" fi fi if test x"$pclmulsupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_pclmul" != "yes" ; then pclmulsupport="no (unsupported by compiler)" fi fi if test x"$sse41support" = xyes ; then if test "$gcry_cv_gcc_inline_asm_sse41" != "yes" ; then sse41support="no (unsupported by compiler)" fi fi if test x"$avxsupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_avx" != "yes" ; then avxsupport="no (unsupported by compiler)" fi fi if test x"$avx2support" = xyes ; then if test "$gcry_cv_gcc_inline_asm_avx2" != "yes" ; then avx2support="no (unsupported by compiler)" fi fi if test x"$neonsupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_neon" != "yes" ; then if test "$gcry_cv_gcc_inline_asm_aarch64_neon" != "yes" ; then neonsupport="no (unsupported by compiler)" fi fi fi if test x"$armcryptosupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_aarch32_crypto" != "yes" ; then if test "$gcry_cv_gcc_inline_asm_aarch64_crypto" != "yes" ; then armcryptosupport="no (unsupported by compiler)" fi fi fi if test x"$aesnisupport" = xyes ; then AC_DEFINE(ENABLE_AESNI_SUPPORT, 1, [Enable support for Intel AES-NI instructions.]) fi if test x"$shaextsupport" = xyes ; then AC_DEFINE(ENABLE_SHAEXT_SUPPORT, 1, [Enable support for Intel SHAEXT instructions.]) fi if test x"$pclmulsupport" = xyes ; then AC_DEFINE(ENABLE_PCLMUL_SUPPORT, 1, [Enable support for Intel PCLMUL instructions.]) fi if test x"$sse41support" = xyes ; then AC_DEFINE(ENABLE_SSE41_SUPPORT, 1, [Enable support for Intel SSE4.1 instructions.]) fi if test x"$avxsupport" = xyes ; then AC_DEFINE(ENABLE_AVX_SUPPORT,1, [Enable support for Intel AVX instructions.]) fi if test x"$avx2support" = xyes ; then AC_DEFINE(ENABLE_AVX2_SUPPORT,1, [Enable support for Intel AVX2 instructions.]) fi if test x"$neonsupport" = xyes ; then AC_DEFINE(ENABLE_NEON_SUPPORT,1, [Enable
support for ARM NEON instructions.]) fi if test x"$armcryptosupport" = xyes ; then AC_DEFINE(ENABLE_ARM_CRYPTO_SUPPORT,1, [Enable support for ARMv8 Crypto Extension instructions.]) fi if test x"$jentsupport" = xyes ; then AC_DEFINE(ENABLE_JENT_SUPPORT, 1, [Enable support for the jitter entropy collector.]) fi if test x"$padlocksupport" = xyes ; then AC_DEFINE(ENABLE_PADLOCK_SUPPORT, 1, [Enable support for the PadLock engine.]) fi if test x"$drngsupport" = xyes ; then AC_DEFINE(ENABLE_DRNG_SUPPORT, 1, [Enable support for Intel DRNG (RDRAND instruction).]) fi # Define conditional sources and config.h symbols depending on the # selected ciphers, pubkey-ciphers, digests, kdfs, and random modules. LIST_MEMBER(arcfour, $enabled_ciphers) if test "$found" = "1"; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS arcfour.lo" AC_DEFINE(USE_ARCFOUR, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS arcfour-amd64.lo" ;; esac fi LIST_MEMBER(blowfish, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS blowfish.lo" AC_DEFINE(USE_BLOWFISH, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS blowfish-amd64.lo" ;; arm*-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS blowfish-arm.lo" ;; esac fi LIST_MEMBER(cast5, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS cast5.lo" AC_DEFINE(USE_CAST5, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS cast5-amd64.lo" ;; arm*-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS cast5-arm.lo" ;; esac fi LIST_MEMBER(des, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS des.lo" AC_DEFINE(USE_DES, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS des-amd64.lo" ;; esac fi LIST_MEMBER(aes, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael.lo" AC_DEFINE(USE_AES, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-amd64.lo" # Build with the SSSE3 implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ssse3-amd64.lo" GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ssse3-amd64-asm.lo" ;; arm*-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-arm.lo" # Build with the ARMv8/AArch32 CE implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-armv8-ce.lo" GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-armv8-aarch32-ce.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-aarch64.lo" # Build with the ARMv8/AArch64 CE implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-armv8-ce.lo" GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-armv8-aarch64-ce.lo" ;; esac case "$mpi_cpu_arch" in x86) # Build with the AES-NI implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-aesni.lo" # Build with the Padlock implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-padlock.lo" ;; esac fi LIST_MEMBER(twofish, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish.lo" AC_DEFINE(USE_TWOFISH, 1, [Defined if this module should be 
included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-amd64.lo" if test x"$avx2support" = xyes ; then # Build with the AVX2 implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-avx2-amd64.lo" fi ;; arm*-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-arm.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-aarch64.lo" ;; esac fi LIST_MEMBER(serpent, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS serpent.lo" AC_DEFINE(USE_SERPENT, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the SSE2 implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS serpent-sse2-amd64.lo" ;; esac if test x"$avx2support" = xyes ; then # Build with the AVX2 implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS serpent-avx2-amd64.lo" fi if test x"$neonsupport" = xyes ; then # Build with the NEON implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS serpent-armv7-neon.lo" fi fi LIST_MEMBER(rfc2268, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS rfc2268.lo" AC_DEFINE(USE_RFC2268, 1, [Defined if this module should be included]) fi LIST_MEMBER(seed, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS seed.lo" AC_DEFINE(USE_SEED, 1, [Defined if this module should be included]) fi LIST_MEMBER(camellia, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia.lo camellia-glue.lo" AC_DEFINE(USE_CAMELLIA, 1, [Defined if this module should be included]) case "${host}" in arm*-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia-arm.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia-aarch64.lo" ;; esac if test x"$avxsupport" = xyes ; then if test x"$aesnisupport" = xyes ; then # Build with the AES-NI/AVX implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia-aesni-avx-amd64.lo" fi fi if test x"$avx2support" = xyes ; then if test x"$aesnisupport" = xyes ; then # Build with the AES-NI/AVX2 implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia-aesni-avx2-amd64.lo" fi fi fi LIST_MEMBER(idea, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS idea.lo" AC_DEFINE(USE_IDEA, 1, [Defined if this module should be included]) fi LIST_MEMBER(salsa20, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS salsa20.lo" AC_DEFINE(USE_SALSA20, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS salsa20-amd64.lo" ;; esac if test x"$neonsupport" = xyes ; then # Build with the NEON implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS salsa20-armv7-neon.lo" fi fi LIST_MEMBER(gost28147, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS gost28147.lo" AC_DEFINE(USE_GOST28147, 1, [Defined if this module should be included]) fi LIST_MEMBER(chacha20, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20.lo" AC_DEFINE(USE_CHACHA20, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-amd64-ssse3.lo" GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-amd64-avx2.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-aarch64.lo" 
;; esac if test x"$neonsupport" = xyes ; then # Build with the NEON implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-armv7-neon.lo" fi fi LIST_MEMBER(dsa, $enabled_pubkey_ciphers) if test "$found" = "1" ; then GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS dsa.lo" AC_DEFINE(USE_DSA, 1, [Defined if this module should be included]) fi LIST_MEMBER(rsa, $enabled_pubkey_ciphers) if test "$found" = "1" ; then GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS rsa.lo" AC_DEFINE(USE_RSA, 1, [Defined if this module should be included]) fi LIST_MEMBER(elgamal, $enabled_pubkey_ciphers) if test "$found" = "1" ; then GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS elgamal.lo" AC_DEFINE(USE_ELGAMAL, 1, [Defined if this module should be included]) fi LIST_MEMBER(ecc, $enabled_pubkey_ciphers) if test "$found" = "1" ; then GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS \ ecc.lo ecc-curves.lo ecc-misc.lo \ ecc-ecdsa.lo ecc-eddsa.lo ecc-gost.lo" AC_DEFINE(USE_ECC, 1, [Defined if this module should be included]) fi LIST_MEMBER(crc, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc.lo" AC_DEFINE(USE_CRC, 1, [Defined if this module should be included]) case "${host}" in i?86-*-* | x86_64-*-*) # Build with the assembly implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc-intel-pclmul.lo" ;; esac fi LIST_MEMBER(gostr3411-94, $enabled_digests) if test "$found" = "1" ; then # GOST R 34.11-94 internally uses GOST 28147-89 LIST_MEMBER(gost28147, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS gostr3411-94.lo" AC_DEFINE(USE_GOST_R_3411_94, 1, [Defined if this module should be included]) fi fi LIST_MEMBER(stribog, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS stribog.lo" AC_DEFINE(USE_GOST_R_3411_12, 1, [Defined if this module should be included]) fi LIST_MEMBER(md2, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS md2.lo" AC_DEFINE(USE_MD2, 1, [Defined if this module should be included]) fi LIST_MEMBER(md4, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS md4.lo" AC_DEFINE(USE_MD4, 1, [Defined if this module should be included]) fi LIST_MEMBER(md5, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS md5.lo" AC_DEFINE(USE_MD5, 1, [Defined if this module should be included]) fi LIST_MEMBER(rmd160, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS rmd160.lo" AC_DEFINE(USE_RMD160, 1, [Defined if this module should be included]) fi LIST_MEMBER(sha256, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256.lo" AC_DEFINE(USE_SHA256, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-ssse3-amd64.lo" GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-avx-amd64.lo" GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-avx2-bmi2-amd64.lo" ;; arm*-*-*) # Build with the assembly implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-armv8-aarch32-ce.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-armv8-aarch64-ce.lo" ;; esac case "$mpi_cpu_arch" in x86) # Build with the SHAEXT implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-intel-shaext.lo" ;; esac fi LIST_MEMBER(sha512, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512.lo" AC_DEFINE(USE_SHA512, 1, [Defined if this module should be included]) case "${host}" in 
x86_64-*-*) # Build with the assembly implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-ssse3-amd64.lo" GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-avx-amd64.lo" GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-avx2-bmi2-amd64.lo" ;; arm*-*-*) # Build with the assembly implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-arm.lo" ;; esac if test x"$neonsupport" = xyes ; then # Build with the NEON implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-armv7-neon.lo" fi fi LIST_MEMBER(sha3, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS keccak.lo" AC_DEFINE(USE_SHA3, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation : ;; esac if test x"$neonsupport" = xyes ; then # Build with the NEON implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS keccak-armv7-neon.lo" fi fi LIST_MEMBER(tiger, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS tiger.lo" AC_DEFINE(USE_TIGER, 1, [Defined if this module should be included]) fi LIST_MEMBER(whirlpool, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS whirlpool.lo" AC_DEFINE(USE_WHIRLPOOL, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS whirlpool-sse2-amd64.lo" ;; esac fi LIST_MEMBER(blake2, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS blake2.lo" AC_DEFINE(USE_BLAKE2, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS blake2b-amd64-avx2.lo" GCRYPT_DIGESTS="$GCRYPT_DIGESTS blake2s-amd64-avx.lo" ;; esac fi # SHA-1 needs to be included always for example because it is used by # random-csprng.c. 
GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1.lo" AC_DEFINE(USE_SHA1, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-ssse3-amd64.lo" GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-avx-amd64.lo" GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-avx-bmi2-amd64.lo" GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-avx2-bmi2-amd64.lo" ;; arm*-*-*) # Build with the assembly implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-armv7-neon.lo" GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-armv8-aarch32-ce.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-armv8-aarch64-ce.lo" ;; esac case "$mpi_cpu_arch" in x86) # Build with the SHAEXT implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-intel-shaext.lo" ;; esac LIST_MEMBER(sm3, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS sm3.lo" AC_DEFINE(USE_SM3, 1, [Defined if this module should be included]) fi LIST_MEMBER(scrypt, $enabled_kdfs) if test "$found" = "1" ; then GCRYPT_KDFS="$GCRYPT_KDFS scrypt.lo" AC_DEFINE(USE_SCRYPT, 1, [Defined if this module should be included]) fi LIST_MEMBER(linux, $random_modules) if test "$found" = "1" ; then GCRYPT_RANDOM="$GCRYPT_RANDOM rndlinux.lo" AC_DEFINE(USE_RNDLINUX, 1, [Defined if the /dev/random RNG should be used.]) fi LIST_MEMBER(unix, $random_modules) if test "$found" = "1" ; then GCRYPT_RANDOM="$GCRYPT_RANDOM rndunix.lo" AC_DEFINE(USE_RNDUNIX, 1, [Defined if the default Unix RNG should be used.]) fi LIST_MEMBER(egd, $random_modules) if test "$found" = "1" ; then GCRYPT_RANDOM="$GCRYPT_RANDOM rndegd.lo" AC_DEFINE(USE_RNDEGD, 1, [Defined if the EGD based RNG should be used.]) fi LIST_MEMBER(w32, $random_modules) if test "$found" = "1" ; then GCRYPT_RANDOM="$GCRYPT_RANDOM rndw32.lo" AC_DEFINE(USE_RNDW32, 1, [Defined if the Windows specific RNG should be used.]) fi LIST_MEMBER(w32ce, $random_modules) if test "$found" = "1" ; then GCRYPT_RANDOM="$GCRYPT_RANDOM rndw32ce.lo" AC_DEFINE(USE_RNDW32CE, 1, [Defined if the WindowsCE specific RNG should be used.]) fi AC_SUBST([GCRYPT_CIPHERS]) AC_SUBST([GCRYPT_PUBKEY_CIPHERS]) AC_SUBST([GCRYPT_DIGESTS]) AC_SUBST([GCRYPT_KDFS]) AC_SUBST([GCRYPT_RANDOM]) AC_SUBST(LIBGCRYPT_CIPHERS, $enabled_ciphers) AC_SUBST(LIBGCRYPT_PUBKEY_CIPHERS, $enabled_pubkey_ciphers) AC_SUBST(LIBGCRYPT_DIGESTS, $enabled_digests) # For printing the configuration we need a colon separated list of # algorithm names. tmp=`echo "$enabled_ciphers" | tr ' ' : ` AC_DEFINE_UNQUOTED(LIBGCRYPT_CIPHERS, "$tmp", [List of available cipher algorithms]) tmp=`echo "$enabled_pubkey_ciphers" | tr ' ' : ` AC_DEFINE_UNQUOTED(LIBGCRYPT_PUBKEY_CIPHERS, "$tmp", [List of available public key cipher algorithms]) tmp=`echo "$enabled_digests" | tr ' ' : ` AC_DEFINE_UNQUOTED(LIBGCRYPT_DIGESTS, "$tmp", [List of available digest algorithms]) tmp=`echo "$enabled_kdfs" | tr ' ' : ` AC_DEFINE_UNQUOTED(LIBGCRYPT_KDFS, "$tmp", [List of available KDF algorithms]) # # Define conditional sources depending on the used hardware platform. # Note that all possible modules must also be listed in # src/Makefile.am (EXTRA_libgcrypt_la_SOURCES). 
# GCRYPT_HWF_MODULES= case "$mpi_cpu_arch" in x86) AC_DEFINE(HAVE_CPU_ARCH_X86, 1, [Defined for the x86 platforms]) GCRYPT_HWF_MODULES="libgcrypt_la-hwf-x86.lo" ;; alpha) AC_DEFINE(HAVE_CPU_ARCH_ALPHA, 1, [Defined for Alpha platforms]) ;; sparc) AC_DEFINE(HAVE_CPU_ARCH_SPARC, 1, [Defined for SPARC platforms]) ;; mips) AC_DEFINE(HAVE_CPU_ARCH_MIPS, 1, [Defined for MIPS platforms]) ;; m68k) AC_DEFINE(HAVE_CPU_ARCH_M68K, 1, [Defined for M68k platforms]) ;; ppc) AC_DEFINE(HAVE_CPU_ARCH_PPC, 1, [Defined for PPC platforms]) ;; arm) AC_DEFINE(HAVE_CPU_ARCH_ARM, 1, [Defined for ARM platforms]) GCRYPT_HWF_MODULES="libgcrypt_la-hwf-arm.lo" ;; aarch64) AC_DEFINE(HAVE_CPU_ARCH_ARM, 1, [Defined for ARM AArch64 platforms]) GCRYPT_HWF_MODULES="libgcrypt_la-hwf-arm.lo" ;; esac AC_SUBST([GCRYPT_HWF_MODULES]) # # Option to disable building of doc file # build_doc=yes AC_ARG_ENABLE([doc], AC_HELP_STRING([--disable-doc], [do not build the documentation]), build_doc=$enableval, build_doc=yes) AM_CONDITIONAL([BUILD_DOC], [test "x$build_doc" != xno]) # # Provide information about the build. # BUILD_REVISION="mym4_revision" AC_SUBST(BUILD_REVISION) AC_DEFINE_UNQUOTED(BUILD_REVISION, "$BUILD_REVISION", [GIT commit id revision used to build this package]) changequote(,)dnl BUILD_VERSION=`echo "$PACKAGE_VERSION" | sed 's/\([0-9.]*\).*/\1./'` changequote([,])dnl BUILD_VERSION="${BUILD_VERSION}mym4_revision_dec" BUILD_FILEVERSION=`echo "${BUILD_VERSION}" | tr . ,` AC_SUBST(BUILD_VERSION) AC_SUBST(BUILD_FILEVERSION) AC_ARG_ENABLE([build-timestamp], AC_HELP_STRING([--enable-build-timestamp], [set an explicit build timestamp for reproducibility. (default is the current time in ISO-8601 format)]), [if test "$enableval" = "yes"; then BUILD_TIMESTAMP=`date -u +%Y-%m-%dT%H:%M+0000 2>/dev/null || date` else BUILD_TIMESTAMP="$enableval" fi], [BUILD_TIMESTAMP=""]) AC_SUBST(BUILD_TIMESTAMP) AC_DEFINE_UNQUOTED(BUILD_TIMESTAMP, "$BUILD_TIMESTAMP", [The time this package was configured for a build]) # And create the files. 
AC_CONFIG_FILES([ Makefile m4/Makefile compat/Makefile mpi/Makefile cipher/Makefile random/Makefile doc/Makefile src/Makefile src/gcrypt.h src/libgcrypt-config src/libgcrypt.pc src/versioninfo.rc tests/Makefile ]) AC_CONFIG_FILES([tests/hashtest-256g], [chmod +x tests/hashtest-256g]) AC_CONFIG_FILES([tests/basic-disable-all-hwf], [chmod +x tests/basic-disable-all-hwf]) AC_OUTPUT detection_module="${GCRYPT_HWF_MODULES%.lo}" test -n "$detection_module" || detection_module="none" # Give some feedback GCRY_MSG_SHOW([],[]) GCRY_MSG_SHOW([Libgcrypt],[v${VERSION} has been configured as follows:]) GCRY_MSG_SHOW([],[]) GCRY_MSG_SHOW([Platform: ],[$PRINTABLE_OS_NAME ($host)]) GCRY_MSG_SHOW([Hardware detection module:],[$detection_module]) GCRY_MSG_WRAP([Enabled cipher algorithms:],[$enabled_ciphers]) GCRY_MSG_WRAP([Enabled digest algorithms:],[$enabled_digests]) GCRY_MSG_WRAP([Enabled kdf algorithms: ],[$enabled_kdfs]) GCRY_MSG_WRAP([Enabled pubkey algorithms:],[$enabled_pubkey_ciphers]) GCRY_MSG_SHOW([Random number generator: ],[$random]) GCRY_MSG_SHOW([Try using jitter entropy: ],[$jentsupport]) GCRY_MSG_SHOW([Using linux capabilities: ],[$use_capabilities]) GCRY_MSG_SHOW([Try using Padlock crypto: ],[$padlocksupport]) GCRY_MSG_SHOW([Try using AES-NI crypto: ],[$aesnisupport]) GCRY_MSG_SHOW([Try using Intel SHAEXT: ],[$shaextsupport]) GCRY_MSG_SHOW([Try using Intel PCLMUL: ],[$pclmulsupport]) GCRY_MSG_SHOW([Try using Intel SSE4.1: ],[$sse41support]) GCRY_MSG_SHOW([Try using DRNG (RDRAND): ],[$drngsupport]) GCRY_MSG_SHOW([Try using Intel AVX: ],[$avxsupport]) GCRY_MSG_SHOW([Try using Intel AVX2: ],[$avx2support]) GCRY_MSG_SHOW([Try using ARM NEON: ],[$neonsupport]) GCRY_MSG_SHOW([Try using ARMv8 crypto: ],[$armcryptosupport]) GCRY_MSG_SHOW([],[]) if test "x${gpg_config_script_warn}" != x; then cat < + +#ifdef HAVE_GCC_ASM_CFI_DIRECTIVES +# define CFI_STARTPROC() .cfi_startproc +# define CFI_ENDPROC() .cfi_endproc +# define CFI_ADJUST_CFA_OFFSET(off) .cfi_adjust_cfa_offset off +# define CFI_REL_OFFSET(reg,off) .cfi_rel_offset reg, off +# define CFI_RESTORE(reg) .cfi_restore reg + +# define CFI_PUSH(reg) \ + CFI_ADJUST_CFA_OFFSET(8); CFI_REL_OFFSET(reg, 0) +# define CFI_POP(reg) \ + CFI_ADJUST_CFA_OFFSET(-8); CFI_RESTORE(reg) +#else +# define CFI_STARTPROC() +# define CFI_ENDPROC() +# define CFI_ADJUST_CFA_OFFSET(off) +# define CFI_REL_OFFSET(reg,off) +# define CFI_RESTORE(reg) + +# define CFI_PUSH(reg) +# define CFI_POP(reg) +#endif + #ifdef USE_MS_ABI /* Store registers and move four first input arguments from MS ABI to * SYSV ABI. */ #define FUNC_ENTRY() \ + CFI_STARTPROC(); \ pushq %rsi; \ + CFI_PUSH(%rsi); \ pushq %rdi; \ + CFI_PUSH(%rdi); \ movq %rdx, %rsi; \ movq %rcx, %rdi; \ movq %r8, %rdx; \ movq %r9, %rcx; /* Restore registers. */ #define FUNC_EXIT() \ popq %rdi; \ - popq %rsi; + CFI_POP(%rdi); \ + popq %rsi; \ + CFI_POP(%rsi); \ + ret; \ + CFI_ENDPROC(); #else - #define FUNC_ENTRY() /**/ - #define FUNC_EXIT() /**/ + #define FUNC_ENTRY() \ + CFI_STARTPROC(); + + #define FUNC_EXIT() \ + ret; \ + CFI_ENDPROC(); #endif diff --git a/mpi/amd64/mpih-add1.S b/mpi/amd64/mpih-add1.S index 6a902621..157e5f1e 100644 --- a/mpi/amd64/mpih-add1.S +++ b/mpi/amd64/mpih-add1.S @@ -1,65 +1,64 @@ /* AMD64 (x86_64) add_n -- Add two limb vectors of the same length > 0 and store * sum in a third limb vector. * * Copyright (C) 1992, 1994, 1995, 1998, * 2001, 2002, 2006 Free Software Foundation, Inc. * * This file is part of Libgcrypt. 
* * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA * * Note: This code is heavily based on the GNU MP Library. * Actually it's the same code with only minor changes in the * way the data is stored; this is to support the abstraction * of an optional secure memory allocation which may be used * to avoid revealing of sensitive data due to paging etc. */ #include "sysdep.h" #include "asm-syntax.h" /******************* * mpi_limb_t * _gcry_mpih_add_n( mpi_ptr_t res_ptr, rdi * mpi_ptr_t s1_ptr, rsi * mpi_ptr_t s2_ptr, rdx * mpi_size_t size) rcx */ .text .globl C_SYMBOL_NAME(_gcry_mpih_add_n) C_SYMBOL_NAME(_gcry_mpih_add_n:) FUNC_ENTRY() leaq (%rsi,%rcx,8), %rsi leaq (%rdi,%rcx,8), %rdi leaq (%rdx,%rcx,8), %rdx negq %rcx xorl %eax, %eax /* clear cy */ ALIGN(4) /* minimal alignment for claimed speed */ .Loop: movq (%rsi,%rcx,8), %rax movq (%rdx,%rcx,8), %r10 adcq %r10, %rax movq %rax, (%rdi,%rcx,8) incq %rcx jne .Loop movq %rcx, %rax /* zero %rax */ adcq %rax, %rax FUNC_EXIT() ret - \ No newline at end of file
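
For reference, here is a minimal sketch of how an amd64 MPI routine is written against the reworked FUNC_ENTRY()/FUNC_EXIT() macros shown above. The routine name _gcry_mpih_example and its one-instruction body are hypothetical and only illustrate the calling pattern; the point is that no explicit trailing "ret" is written by hand, because FUNC_EXIT() now emits the return and CFI_ENDPROC() itself (plus the %rsi/%rdi restores in the USE_MS_ABI build), which is exactly what the change to mpih-add1.S relies on.

/* Hypothetical skeleton; assumes "sysdep.h" and "asm-syntax.h" provide
 * C_SYMBOL_NAME, FUNC_ENTRY and FUNC_EXIT as defined in the patch above. */
#include "sysdep.h"
#include "asm-syntax.h"

	.text
	.globl C_SYMBOL_NAME(_gcry_mpih_example)
C_SYMBOL_NAME(_gcry_mpih_example:)
	FUNC_ENTRY()		/* emits CFI_STARTPROC(); under USE_MS_ABI it also
				 * saves %rsi/%rdi and moves the four MS-ABI
				 * arguments into the SysV argument registers */
	xorl	%eax, %eax	/* placeholder body: return 0 */
	FUNC_EXIT()		/* emits ret and CFI_ENDPROC(); no explicit ret here */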