diff --git a/cipher/sha256-avx-amd64.S b/cipher/sha256-avx-amd64.S
index 77143ff0..ec945f84 100644
--- a/cipher/sha256-avx-amd64.S
+++ b/cipher/sha256-avx-amd64.S
@@ -1,532 +1,506 @@
 /*
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; Copyright (c) 2012, Intel Corporation
 ;
 ; All rights reserved.
 ;
 ; Redistribution and use in source and binary forms, with or without
 ; modification, are permitted provided that the following conditions are
 ; met:
 ;
 ; * Redistributions of source code must retain the above copyright
 ;   notice, this list of conditions and the following disclaimer.
 ;
 ; * Redistributions in binary form must reproduce the above copyright
 ;   notice, this list of conditions and the following disclaimer in the
 ;   documentation and/or other materials provided with the
 ;   distribution.
 ;
 ; * Neither the name of the Intel Corporation nor the names of its
 ;   contributors may be used to endorse or promote products derived from
 ;   this software without specific prior written permission.
 ;
 ;
 ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
 ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 ; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
 ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;
 ; This code is described in an Intel White-Paper:
 ; "Fast SHA-256 Implementations on Intel Architecture Processors"
 ;
 ; To find it, surf to http://www.intel.com/p/en_US/embedded
 ; and search for that title.
 ; The paper is expected to be released roughly at the end of April, 2012
 ;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; This code schedules 1 blocks at a time, with 4 lanes per block
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 */
 /*
  * Conversion to GAS assembly and integration to libgcrypt
  *  by Jussi Kivilinna
  *
  * Note: Based on the SSSE3 implementation.
*/ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA256) #include "asm-common-amd64.h" .intel_syntax noprefix #define VMOVDQ vmovdqu /* assume buffers not aligned */ -.macro ROR p1 p2 - /* shld is faster than ror on Intel Sandybridge */ - shld \p1, \p1, (32 - \p2) -.endm +#define ROR(p1, p2) \ + /* shld is faster than ror on Intel Sandybridge */ \ + shld p1, p1, (32 - p2); /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros*/ /* addm [mem], reg * Add reg to mem using reg-mem add and store */ -.macro addm p1 p2 - add \p2, \p1 - mov \p1, \p2 -.endm +#define addm(p1, p2) \ + add p2, p1; \ + mov p1, p2; /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ /* COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask * Load xmm with mem and byte swap each dword */ -.macro COPY_XMM_AND_BSWAP p1 p2 p3 - VMOVDQ \p1, \p2 - vpshufb \p1, \p1, \p3 -.endm +#define COPY_XMM_AND_BSWAP(p1, p2, p3) \ + VMOVDQ p1, p2; \ + vpshufb p1, p1, p3; /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ -X0 = xmm4 -X1 = xmm5 -X2 = xmm6 -X3 = xmm7 +#define X0 xmm4 +#define X1 xmm5 +#define X2 xmm6 +#define X3 xmm7 -XTMP0 = xmm0 -XTMP1 = xmm1 -XTMP2 = xmm2 -XTMP3 = xmm3 -XTMP4 = xmm8 -XFER = xmm9 +#define XTMP0 xmm0 +#define XTMP1 xmm1 +#define XTMP2 xmm2 +#define XTMP3 xmm3 +#define XTMP4 xmm8 +#define XFER xmm9 -SHUF_00BA = xmm10 /* shuffle xBxA -> 00BA */ -SHUF_DC00 = xmm11 /* shuffle xDxC -> DC00 */ -BYTE_FLIP_MASK = xmm12 +#define SHUF_00BA xmm10 /* shuffle xBxA -> 00BA */ +#define SHUF_DC00 xmm11 /* shuffle xDxC -> DC00 */ +#define BYTE_FLIP_MASK xmm12 -NUM_BLKS = rdx /* 3rd arg */ -CTX = rsi /* 2nd arg */ -INP = rdi /* 1st arg */ +#define NUM_BLKS rdx /* 3rd arg */ +#define CTX rsi /* 2nd arg */ +#define INP rdi /* 1st arg */ -SRND = rdi /* clobbers INP */ -c = ecx -d = r8d -e = edx +#define SRND rdi /* clobbers INP */ +#define c ecx +#define d r8d +#define e edx -TBL = rbp -a = eax -b = ebx +#define TBL rbp +#define a eax +#define b ebx -f = r9d -g = r10d -h = r11d +#define f r9d +#define g r10d +#define h r11d -y0 = r13d -y1 = r14d -y2 = r15d +#define y0 r13d +#define y1 r14d +#define y2 r15d #define _INP_END_SIZE 8 #define _INP_SIZE 8 #define _XFER_SIZE 8 #define _XMM_SAVE_SIZE 0 /* STACK_SIZE plus pushes must be an odd multiple of 8 */ #define _ALIGN_SIZE 8 #define _INP_END 0 #define _INP (_INP_END + _INP_END_SIZE) #define _XFER (_INP + _INP_SIZE) #define _XMM_SAVE (_XFER + _XFER_SIZE + _ALIGN_SIZE) #define STACK_SIZE (_XMM_SAVE + _XMM_SAVE_SIZE) -/* rotate_Xs - * Rotate values of symbols X0...X3 */ -.macro rotate_Xs -X_ = X0 -X0 = X1 -X1 = X2 -X2 = X3 -X3 = X_ -.endm - -/* ROTATE_ARGS - * Rotate values of symbols a...h */ -.macro ROTATE_ARGS -TMP_ = h -h = g -g = f -f = e -e = d -d = c -c = b -b = a -a = TMP_ -.endm - -.macro FOUR_ROUNDS_AND_SCHED - /* compute s0 four at a time and s1 two at a time - * compute W[-16] + W[-7] 4 at a time */ - mov y0, e /* y0 = e */ - ROR y0, (25-11) /* y0 = e >> (25-11) */ - mov y1, a /* y1 = a */ - vpalignr XTMP0, X3, X2, 4 /* XTMP0 = W[-7] */ - ROR y1, (22-13) /* y1 = a >> (22-13) */ - xor y0, e /* y0 = e ^ (e >> (25-11)) */ - mov y2, f /* y2 = f */ - ROR y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ - xor y1, a /* y1 = a ^ (a >> (22-13) */ - xor y2, g /* y2 = f^g */ - vpaddd XTMP0, XTMP0, X0 /* XTMP0 = W[-7] + W[-16] */ - xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ - and y2, e /* y2 = (f^g)&e */ - ROR y1, (13-2) /* y1 = (a 
>> (13-2)) ^ (a >> (22-2)) */ - /* compute s0 */ - vpalignr XTMP1, X1, X0, 4 /* XTMP1 = W[-15] */ - xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ - ROR y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ - xor y2, g /* y2 = CH = ((f^g)&e)^g */ - ROR y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ - add y2, y0 /* y2 = S1 + CH */ - add y2, [rsp + _XFER + 0*4] /* y2 = k + w + S1 + CH */ - mov y0, a /* y0 = a */ - add h, y2 /* h = h + S1 + CH + k + w */ - mov y2, a /* y2 = a */ - vpslld XTMP2, XTMP1, (32-7) - or y0, c /* y0 = a|c */ - add d, h /* d = d + h + S1 + CH + k + w */ - and y2, c /* y2 = a&c */ - vpsrld XTMP3, XTMP1, 7 - and y0, b /* y0 = (a|c)&b */ - add h, y1 /* h = h + S1 + CH + k + w + S0 */ - vpor XTMP3, XTMP3, XTMP2 /* XTMP1 = W[-15] ror 7 */ - or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ + +#define FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \ + /* compute s0 four at a time and s1 two at a time */; \ + /* compute W[-16] + W[-7] 4 at a time */; \ + mov y0, e /* y0 = e */; \ + ROR( y0, (25-11)) /* y0 = e >> (25-11) */; \ + mov y1, a /* y1 = a */; \ + vpalignr XTMP0, X3, X2, 4 /* XTMP0 = W[-7] */; \ + ROR( y1, (22-13)) /* y1 = a >> (22-13) */; \ + xor y0, e /* y0 = e ^ (e >> (25-11)) */; \ + mov y2, f /* y2 = f */; \ + ROR( y0, (11-6)) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \ + xor y1, a /* y1 = a ^ (a >> (22-13) */; \ + xor y2, g /* y2 = f^g */; \ + vpaddd XTMP0, XTMP0, X0 /* XTMP0 = W[-7] + W[-16] */; \ + xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \ + and y2, e /* y2 = (f^g)&e */; \ + ROR( y1, (13-2)) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \ + /* compute s0 */; \ + vpalignr XTMP1, X1, X0, 4 /* XTMP1 = W[-15] */; \ + xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \ + ROR( y0, 6) /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \ + xor y2, g /* y2 = CH = ((f^g)&e)^g */; \ + ROR( y1, 2) /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \ + add y2, y0 /* y2 = S1 + CH */; \ + add y2, [rsp + _XFER + 0*4] /* y2 = k + w + S1 + CH */; \ + mov y0, a /* y0 = a */; \ + add h, y2 /* h = h + S1 + CH + k + w */; \ + mov y2, a /* y2 = a */; \ + vpslld XTMP2, XTMP1, (32-7); \ + or y0, c /* y0 = a|c */; \ + add d, h /* d = d + h + S1 + CH + k + w */; \ + and y2, c /* y2 = a&c */; \ + vpsrld XTMP3, XTMP1, 7; \ + and y0, b /* y0 = (a|c)&b */; \ + add h, y1 /* h = h + S1 + CH + k + w + S0 */; \ + vpor XTMP3, XTMP3, XTMP2 /* XTMP1 = W[-15] ror 7 */; \ + or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ -ROTATE_ARGS - mov y0, e /* y0 = e */ - mov y1, a /* y1 = a */ - ROR y0, (25-11) /* y0 = e >> (25-11) */ - xor y0, e /* y0 = e ^ (e >> (25-11)) */ - mov y2, f /* y2 = f */ - ROR y1, (22-13) /* y1 = a >> (22-13) */ - vpslld XTMP2, XTMP1, (32-18) - xor y1, a /* y1 = a ^ (a >> (22-13) */ - ROR y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ - xor y2, g /* y2 = f^g */ - vpsrld XTMP4, XTMP1, 18 - ROR y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ - xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ - and y2, e /* y2 = (f^g)&e */ - ROR y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ - vpxor XTMP4, XTMP4, XTMP3 - xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ - xor y2, g /* y2 = CH = ((f^g)&e)^g */ - vpsrld XTMP1, XTMP1, 3 /* XTMP4 = W[-15] >> 3 */ - add y2, y0 /* y2 = S1 + CH */ - add y2, [rsp + _XFER + 1*4] /* y2 = k + w + S1 + CH */ - ROR y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ - vpxor XTMP1, XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */ - mov y0, a /* y0 = a */ - add h, y2 
/* h = h + S1 + CH + k + w */ - mov y2, a /* y2 = a */ - vpxor XTMP1, XTMP1, XTMP4 /* XTMP1 = s0 */ - or y0, c /* y0 = a|c */ - add d, h /* d = d + h + S1 + CH + k + w */ - and y2, c /* y2 = a&c */ - /* compute low s1 */ - vpshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */ - and y0, b /* y0 = (a|c)&b */ - add h, y1 /* h = h + S1 + CH + k + w + S0 */ - vpaddd XTMP0, XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */ - or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ +#define FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \ + mov y0, e /* y0 = e */; \ + mov y1, a /* y1 = a */; \ + ROR( y0, (25-11)) /* y0 = e >> (25-11) */; \ + xor y0, e /* y0 = e ^ (e >> (25-11)) */; \ + mov y2, f /* y2 = f */; \ + ROR( y1, (22-13)) /* y1 = a >> (22-13) */; \ + vpslld XTMP2, XTMP1, (32-18); \ + xor y1, a /* y1 = a ^ (a >> (22-13) */; \ + ROR( y0, (11-6)) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \ + xor y2, g /* y2 = f^g */; \ + vpsrld XTMP4, XTMP1, 18; \ + ROR( y1, (13-2)) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \ + xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \ + and y2, e /* y2 = (f^g)&e */; \ + ROR( y0, 6) /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \ + vpxor XTMP4, XTMP4, XTMP3; \ + xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \ + xor y2, g /* y2 = CH = ((f^g)&e)^g */; \ + vpsrld XTMP1, XTMP1, 3 /* XTMP4 = W[-15] >> 3 */; \ + add y2, y0 /* y2 = S1 + CH */; \ + add y2, [rsp + _XFER + 1*4] /* y2 = k + w + S1 + CH */; \ + ROR( y1, 2) /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \ + vpxor XTMP1, XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */; \ + mov y0, a /* y0 = a */; \ + add h, y2 /* h = h + S1 + CH + k + w */; \ + mov y2, a /* y2 = a */; \ + vpxor XTMP1, XTMP1, XTMP4 /* XTMP1 = s0 */; \ + or y0, c /* y0 = a|c */; \ + add d, h /* d = d + h + S1 + CH + k + w */; \ + and y2, c /* y2 = a&c */; \ + /* compute low s1 */; \ + vpshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */; \ + and y0, b /* y0 = (a|c)&b */; \ + add h, y1 /* h = h + S1 + CH + k + w + S0 */; \ + vpaddd XTMP0, XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */; \ + or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ -ROTATE_ARGS - mov y0, e /* y0 = e */ - mov y1, a /* y1 = a */ - ROR y0, (25-11) /* y0 = e >> (25-11) */ - xor y0, e /* y0 = e ^ (e >> (25-11)) */ - ROR y1, (22-13) /* y1 = a >> (22-13) */ - mov y2, f /* y2 = f */ - xor y1, a /* y1 = a ^ (a >> (22-13) */ - ROR y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ - vpsrlq XTMP3, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */ - xor y2, g /* y2 = f^g */ - vpsrlq XTMP4, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */ - xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ - and y2, e /* y2 = (f^g)&e */ - vpsrld XTMP2, XTMP2, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */ - ROR y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ - xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ - xor y2, g /* y2 = CH = ((f^g)&e)^g */ - ROR y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ - vpxor XTMP2, XTMP2, XTMP3 - add y2, y0 /* y2 = S1 + CH */ - ROR y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ - add y2, [rsp + _XFER + 2*4] /* y2 = k + w + S1 + CH */ - vpxor XTMP4, XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */ - mov y0, a /* y0 = a */ - add h, y2 /* h = h + S1 + CH + k + w */ - mov y2, a /* y2 = a */ - vpshufb XTMP4, XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */ - or y0, c /* y0 = a|c */ - add d, h /* d = d + h + S1 + CH + k + w */ - and y2, c /* y2 = a&c */ - vpaddd XTMP0, XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */ - 
and y0, b /* y0 = (a|c)&b */ - add h, y1 /* h = h + S1 + CH + k + w + S0 */ - /* compute high s1 */ - vpshufd XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */ - or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ +#define FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \ + mov y0, e /* y0 = e */; \ + mov y1, a /* y1 = a */; \ + ROR( y0, (25-11)) /* y0 = e >> (25-11) */; \ + xor y0, e /* y0 = e ^ (e >> (25-11)) */; \ + ROR( y1, (22-13)) /* y1 = a >> (22-13) */; \ + mov y2, f /* y2 = f */; \ + xor y1, a /* y1 = a ^ (a >> (22-13) */; \ + ROR( y0, (11-6)) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \ + vpsrlq XTMP3, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */; \ + xor y2, g /* y2 = f^g */; \ + vpsrlq XTMP4, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */; \ + xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \ + and y2, e /* y2 = (f^g)&e */; \ + vpsrld XTMP2, XTMP2, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */; \ + ROR( y1, (13-2)) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \ + xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \ + xor y2, g /* y2 = CH = ((f^g)&e)^g */; \ + ROR( y0, 6) /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \ + vpxor XTMP2, XTMP2, XTMP3; \ + add y2, y0 /* y2 = S1 + CH */; \ + ROR( y1, 2) /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \ + add y2, [rsp + _XFER + 2*4] /* y2 = k + w + S1 + CH */; \ + vpxor XTMP4, XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */; \ + mov y0, a /* y0 = a */; \ + add h, y2 /* h = h + S1 + CH + k + w */; \ + mov y2, a /* y2 = a */; \ + vpshufb XTMP4, XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */; \ + or y0, c /* y0 = a|c */; \ + add d, h /* d = d + h + S1 + CH + k + w */; \ + and y2, c /* y2 = a&c */; \ + vpaddd XTMP0, XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */; \ + and y0, b /* y0 = (a|c)&b */; \ + add h, y1 /* h = h + S1 + CH + k + w + S0 */; \ + /* compute high s1 */; \ + vpshufd XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */; \ + or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ -ROTATE_ARGS - mov y0, e /* y0 = e */ - ROR y0, (25-11) /* y0 = e >> (25-11) */ - mov y1, a /* y1 = a */ - ROR y1, (22-13) /* y1 = a >> (22-13) */ - xor y0, e /* y0 = e ^ (e >> (25-11)) */ - mov y2, f /* y2 = f */ - ROR y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ - vpsrlq XTMP3, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */ - xor y1, a /* y1 = a ^ (a >> (22-13) */ - xor y2, g /* y2 = f^g */ - vpsrlq X0, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */ - xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ - and y2, e /* y2 = (f^g)&e */ - ROR y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ - vpsrld XTMP2, XTMP2, 10 /* X0 = W[-2] >> 10 {DDCC} */ - xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ - ROR y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ - xor y2, g /* y2 = CH = ((f^g)&e)^g */ - vpxor XTMP2, XTMP2, XTMP3 - ROR y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ - add y2, y0 /* y2 = S1 + CH */ - add y2, [rsp + _XFER + 3*4] /* y2 = k + w + S1 + CH */ - vpxor X0, X0, XTMP2 /* X0 = s1 {xDxC} */ - mov y0, a /* y0 = a */ - add h, y2 /* h = h + S1 + CH + k + w */ - mov y2, a /* y2 = a */ - vpshufb X0, X0, SHUF_DC00 /* X0 = s1 {DC00} */ - or y0, c /* y0 = a|c */ - add d, h /* d = d + h + S1 + CH + k + w */ - and y2, c /* y2 = a&c */ - vpaddd X0, X0, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */ - and y0, b /* y0 = (a|c)&b */ - add h, y1 /* h = h + S1 + CH + k + w + S0 */ - or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ +#define FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \ + mov y0, e /* y0 = e */; 
\ + ROR( y0, (25-11)) /* y0 = e >> (25-11) */; \ + mov y1, a /* y1 = a */; \ + ROR( y1, (22-13)) /* y1 = a >> (22-13) */; \ + xor y0, e /* y0 = e ^ (e >> (25-11)) */; \ + mov y2, f /* y2 = f */; \ + ROR( y0, (11-6)) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \ + vpsrlq XTMP3, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */; \ + xor y1, a /* y1 = a ^ (a >> (22-13) */; \ + xor y2, g /* y2 = f^g */; \ + vpsrlq X0, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */; \ + xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \ + and y2, e /* y2 = (f^g)&e */; \ + ROR( y1, (13-2)) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \ + vpsrld XTMP2, XTMP2, 10 /* X0 = W[-2] >> 10 {DDCC} */; \ + xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \ + ROR( y0, 6) /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \ + xor y2, g /* y2 = CH = ((f^g)&e)^g */; \ + vpxor XTMP2, XTMP2, XTMP3; \ + ROR( y1, 2) /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \ + add y2, y0 /* y2 = S1 + CH */; \ + add y2, [rsp + _XFER + 3*4] /* y2 = k + w + S1 + CH */; \ + vpxor X0, X0, XTMP2 /* X0 = s1 {xDxC} */; \ + mov y0, a /* y0 = a */; \ + add h, y2 /* h = h + S1 + CH + k + w */; \ + mov y2, a /* y2 = a */; \ + vpshufb X0, X0, SHUF_DC00 /* X0 = s1 {DC00} */; \ + or y0, c /* y0 = a|c */; \ + add d, h /* d = d + h + S1 + CH + k + w */; \ + and y2, c /* y2 = a&c */; \ + vpaddd X0, X0, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */; \ + and y0, b /* y0 = (a|c)&b */; \ + add h, y1 /* h = h + S1 + CH + k + w + S0 */; \ + or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ -ROTATE_ARGS -rotate_Xs -.endm +#define FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \ + FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h); \ + FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, h, a, b, c, d, e, f, g); \ + FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, g, h, a, b, c, d, e, f); \ + FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, f, g, h, a, b, c, d, e); /* input is [rsp + _XFER + %1 * 4] */ -.macro DO_ROUND i1 - mov y0, e /* y0 = e */ - ROR y0, (25-11) /* y0 = e >> (25-11) */ - mov y1, a /* y1 = a */ - xor y0, e /* y0 = e ^ (e >> (25-11)) */ - ROR y1, (22-13) /* y1 = a >> (22-13) */ - mov y2, f /* y2 = f */ - xor y1, a /* y1 = a ^ (a >> (22-13) */ - ROR y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ - xor y2, g /* y2 = f^g */ - xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ - ROR y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ - and y2, e /* y2 = (f^g)&e */ - xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ - ROR y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ - xor y2, g /* y2 = CH = ((f^g)&e)^g */ - add y2, y0 /* y2 = S1 + CH */ - ROR y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ - add y2, [rsp + _XFER + \i1 * 4] /* y2 = k + w + S1 + CH */ - mov y0, a /* y0 = a */ - add h, y2 /* h = h + S1 + CH + k + w */ - mov y2, a /* y2 = a */ - or y0, c /* y0 = a|c */ - add d, h /* d = d + h + S1 + CH + k + w */ - and y2, c /* y2 = a&c */ - and y0, b /* y0 = (a|c)&b */ - add h, y1 /* h = h + S1 + CH + k + w + S0 */ - or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ +#define DO_ROUND(i1, a, b, c, d, e, f, g, h) \ + mov y0, e /* y0 = e */; \ + ROR( y0, (25-11)) /* y0 = e >> (25-11) */; \ + mov y1, a /* y1 = a */; \ + xor y0, e /* y0 = e ^ (e >> (25-11)) */; \ + ROR( y1, (22-13)) /* y1 = a >> (22-13) */; \ + mov y2, f /* y2 = f */; \ + xor y1, a /* y1 = a ^ (a >> (22-13) */; \ + ROR( y0, (11-6)) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \ + xor y2, g /* y2 = f^g */; \ + xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> 
(25-6)) */; \ + ROR( y1, (13-2)) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \ + and y2, e /* y2 = (f^g)&e */; \ + xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \ + ROR( y0, 6) /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \ + xor y2, g /* y2 = CH = ((f^g)&e)^g */; \ + add y2, y0 /* y2 = S1 + CH */; \ + ROR( y1, 2) /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \ + add y2, [rsp + _XFER + i1 * 4] /* y2 = k + w + S1 + CH */; \ + mov y0, a /* y0 = a */; \ + add h, y2 /* h = h + S1 + CH + k + w */; \ + mov y2, a /* y2 = a */; \ + or y0, c /* y0 = a|c */; \ + add d, h /* d = d + h + S1 + CH + k + w */; \ + and y2, c /* y2 = a&c */; \ + and y0, b /* y0 = (a|c)&b */; \ + add h, y1 /* h = h + S1 + CH + k + w + S0 */; \ + or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ - ROTATE_ARGS -.endm /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; void sha256_avx(void *input_data, UINT32 digest[8], UINT64 num_blks) ;; arg 1 : pointer to input data ;; arg 2 : pointer to digest ;; arg 3 : Num blocks */ .text .globl _gcry_sha256_transform_amd64_avx ELF(.type _gcry_sha256_transform_amd64_avx,@function;) .align 16 _gcry_sha256_transform_amd64_avx: CFI_STARTPROC() vzeroupper push rbx CFI_PUSH(rbx) push rbp CFI_PUSH(rbp) push r13 CFI_PUSH(r13) push r14 CFI_PUSH(r14) push r15 CFI_PUSH(r15) sub rsp, STACK_SIZE CFI_ADJUST_CFA_OFFSET(STACK_SIZE); shl NUM_BLKS, 6 /* convert to bytes */ jz .Ldone_hash add NUM_BLKS, INP /* pointer to end of data */ mov [rsp + _INP_END], NUM_BLKS /* load initial digest */ mov a,[4*0 + CTX] mov b,[4*1 + CTX] mov c,[4*2 + CTX] mov d,[4*3 + CTX] mov e,[4*4 + CTX] mov f,[4*5 + CTX] mov g,[4*6 + CTX] mov h,[4*7 + CTX] vmovdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP] vmovdqa SHUF_00BA, [.L_SHUF_00BA ADD_RIP] vmovdqa SHUF_DC00, [.L_SHUF_DC00 ADD_RIP] .Loop0: lea TBL, [.LK256 ADD_RIP] /* byte swap first 16 dwords */ - COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP(X0, [INP + 0*16], BYTE_FLIP_MASK) + COPY_XMM_AND_BSWAP(X1, [INP + 1*16], BYTE_FLIP_MASK) + COPY_XMM_AND_BSWAP(X2, [INP + 2*16], BYTE_FLIP_MASK) + COPY_XMM_AND_BSWAP(X3, [INP + 3*16], BYTE_FLIP_MASK) mov [rsp + _INP], INP /* schedule 48 input dwords, by doing 3 rounds of 16 each */ mov SRND, 3 .align 16 .Loop1: vpaddd XFER, X0, [TBL + 0*16] vmovdqa [rsp + _XFER], XFER - FOUR_ROUNDS_AND_SCHED + FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h) - vpaddd XFER, X0, [TBL + 1*16] + vpaddd XFER, X1, [TBL + 1*16] vmovdqa [rsp + _XFER], XFER - FOUR_ROUNDS_AND_SCHED + FOUR_ROUNDS_AND_SCHED(X1, X2, X3, X0, e, f, g, h, a, b, c, d) - vpaddd XFER, X0, [TBL + 2*16] + vpaddd XFER, X2, [TBL + 2*16] vmovdqa [rsp + _XFER], XFER - FOUR_ROUNDS_AND_SCHED + FOUR_ROUNDS_AND_SCHED(X2, X3, X0, X1, a, b, c, d, e, f, g, h) - vpaddd XFER, X0, [TBL + 3*16] + vpaddd XFER, X3, [TBL + 3*16] vmovdqa [rsp + _XFER], XFER add TBL, 4*16 - FOUR_ROUNDS_AND_SCHED + FOUR_ROUNDS_AND_SCHED(X3, X0, X1, X2, e, f, g, h, a, b, c, d) sub SRND, 1 jne .Loop1 mov SRND, 2 .Loop2: vpaddd X0, X0, [TBL + 0*16] vmovdqa [rsp + _XFER], X0 - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 + DO_ROUND(0, a, b, c, d, e, f, g, h) + DO_ROUND(1, h, a, b, c, d, e, f, g) + DO_ROUND(2, g, h, a, b, c, d, e, f) + DO_ROUND(3, f, g, h, a, b, c, 
d, e) vpaddd X1, X1, [TBL + 1*16] vmovdqa [rsp + _XFER], X1 add TBL, 2*16 - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 + DO_ROUND(0, e, f, g, h, a, b, c, d) + DO_ROUND(1, d, e, f, g, h, a, b, c) + DO_ROUND(2, c, d, e, f, g, h, a, b) + DO_ROUND(3, b, c, d, e, f, g, h, a) vmovdqa X0, X2 vmovdqa X1, X3 sub SRND, 1 jne .Loop2 - addm [4*0 + CTX],a - addm [4*1 + CTX],b - addm [4*2 + CTX],c - addm [4*3 + CTX],d - addm [4*4 + CTX],e - addm [4*5 + CTX],f - addm [4*6 + CTX],g - addm [4*7 + CTX],h + addm([4*0 + CTX],a) + addm([4*1 + CTX],b) + addm([4*2 + CTX],c) + addm([4*3 + CTX],d) + addm([4*4 + CTX],e) + addm([4*5 + CTX],f) + addm([4*6 + CTX],g) + addm([4*7 + CTX],h) mov INP, [rsp + _INP] add INP, 64 cmp INP, [rsp + _INP_END] jne .Loop0 .Ldone_hash: vzeroall vmovdqa [rsp + _XFER], XFER xor eax, eax add rsp, STACK_SIZE CFI_ADJUST_CFA_OFFSET(-STACK_SIZE); pop r15 CFI_POP(r15) pop r14 CFI_POP(r14) pop r13 CFI_POP(r13) pop rbp CFI_POP(rbp) pop rbx CFI_POP(rbx) ret CFI_ENDPROC() .align 16 .LK256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .LPSHUFFLE_BYTE_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203 /* shuffle xBxA -> 00BA */ .L_SHUF_00BA: .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 /* shuffle xDxC -> DC00 */ .L_SHUF_DC00: .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF #endif #endif diff --git a/cipher/sha256-avx2-bmi2-amd64.S b/cipher/sha256-avx2-bmi2-amd64.S index 52be1a07..faefba17 100644 --- a/cipher/sha256-avx2-bmi2-amd64.S +++ b/cipher/sha256-avx2-bmi2-amd64.S @@ -1,575 +1,520 @@ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright (c) 2012, Intel Corporation ; ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are ; met: ; ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in the ; documentation and/or other materials provided with the ; distribution. ; ; * Neither the name of the Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived from ; this software without specific prior written permission. ; ; ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR ; PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL CORPORATION OR ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; ; This code is described in an Intel White-Paper: ; "Fast SHA-256 Implementations on Intel Architecture Processors" ; ; To find it, surf to http://www.intel.com/p/en_US/embedded ; and search for that title. ; The paper is expected to be released roughly at the end of April, 2012 ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; This code schedules 2 blocks at a time, with 4 lanes per block ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /* * Conversion to GAS assembly and integration to libgcrypt * by Jussi Kivilinna */ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \ defined(USE_SHA256) #include "asm-common-amd64.h" .intel_syntax noprefix #define VMOVDQ vmovdqu /* ; assume buffers not aligned */ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros */ /* addm [mem], reg */ /* Add reg to mem using reg-mem add and store */ -.macro addm p1 p2 - add \p2, \p1 - mov \p1, \p2 -.endm +#define addm(p1, p2) \ + add p2, p1; \ + mov p1, p2; /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ -X0 = ymm4 -X1 = ymm5 -X2 = ymm6 -X3 = ymm7 +#define X0 ymm4 +#define X1 ymm5 +#define X2 ymm6 +#define X3 ymm7 /* XMM versions of above */ -XWORD0 = xmm4 -XWORD1 = xmm5 -XWORD2 = xmm6 -XWORD3 = xmm7 - -XTMP0 = ymm0 -XTMP1 = ymm1 -XTMP2 = ymm2 -XTMP3 = ymm3 -XTMP4 = ymm8 -XFER = ymm9 -XTMP5 = ymm11 - -SHUF_00BA = ymm10 /* shuffle xBxA -> 00BA */ -SHUF_DC00 = ymm12 /* shuffle xDxC -> DC00 */ -BYTE_FLIP_MASK = ymm13 - -X_BYTE_FLIP_MASK = xmm13 /* XMM version of BYTE_FLIP_MASK */ - -NUM_BLKS = rdx /* 3rd arg */ -CTX = rsi /* 2nd arg */ -INP = rdi /* 1st arg */ -c = ecx -d = r8d -e = edx /* clobbers NUM_BLKS */ -y3 = edi /* clobbers INP */ - -TBL = rbp -SRND = CTX /* SRND is same register as CTX */ - -a = eax -b = ebx -f = r9d -g = r10d -h = r11d -old_h = r11d - -T1 = r12d -y0 = r13d -y1 = r14d -y2 = r15d - - -_XFER_SIZE = 2*64*4 /* 2 blocks, 64 rounds, 4 bytes/round */ -_XMM_SAVE_SIZE = 0 -_INP_END_SIZE = 8 -_INP_SIZE = 8 -_CTX_SIZE = 8 -_RSP_SIZE = 8 - -_XFER = 0 -_XMM_SAVE = _XFER + _XFER_SIZE -_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE -_INP = _INP_END + _INP_END_SIZE -_CTX = _INP + _INP_SIZE -_RSP = _CTX + _CTX_SIZE -STACK_SIZE = _RSP + _RSP_SIZE - -/* rotate_Xs */ -/* Rotate values of symbols X0...X3 */ -.macro rotate_Xs -X_ = X0 -X0 = X1 -X1 = X2 -X2 = X3 -X3 = X_ -.endm - -/* ROTATE_ARGS */ -/* Rotate values of symbols a...h */ -.macro ROTATE_ARGS -old_h = h -TMP_ = h -h = g -g = f -f = e -e = d -d = c -c = b -b = a -a = TMP_ -.endm - -.macro ONE_ROUND_PART1 XFER - /* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]); - * d += h; - * h += Sum0 (a) + Maj (a, b, c); - * - * Ch(x, y, z) => ((x & y) + (~x & z)) - * Maj(x, y, z) => ((x & y) + (z & (x 
^ y))) - */ - - mov y3, e - add h, [\XFER] - and y3, f - rorx y0, e, 25 - rorx y1, e, 11 +#define XWORD0 xmm4 +#define XWORD1 xmm5 +#define XWORD2 xmm6 +#define XWORD3 xmm7 + +#define XTMP0 ymm0 +#define XTMP1 ymm1 +#define XTMP2 ymm2 +#define XTMP3 ymm3 +#define XTMP4 ymm8 +#define XFER ymm9 +#define XTMP5 ymm11 + +#define SHUF_00BA ymm10 /* shuffle xBxA -> 00BA */ +#define SHUF_DC00 ymm12 /* shuffle xDxC -> DC00 */ +#define BYTE_FLIP_MASK ymm13 + +#define X_BYTE_FLIP_MASK xmm13 /* XMM version of BYTE_FLIP_MASK */ + +#define NUM_BLKS rdx /* 3rd arg */ +#define CTX rsi /* 2nd arg */ +#define INP rdi /* 1st arg */ +#define c ecx +#define d r8d +#define e edx /* clobbers NUM_BLKS */ +#define y3 edi /* clobbers INP */ + +#define TBL rbp +#define SRND CTX /* SRND is same register as CTX */ + +#define a eax +#define b ebx +#define f r9d +#define g r10d +#define h r11d +#define old_h r11d + +#define T1 r12d +#define y0 r13d +#define y1 r14d +#define y2 r15d + + +#define _XFER_SIZE 2*64*4 /* 2 blocks, 64 rounds, 4 bytes/round */ +#define _XMM_SAVE_SIZE 0 +#define _INP_END_SIZE 8 +#define _INP_SIZE 8 +#define _CTX_SIZE 8 +#define _RSP_SIZE 8 + +#define _XFER 0 +#define _XMM_SAVE _XFER + _XFER_SIZE +#define _INP_END _XMM_SAVE + _XMM_SAVE_SIZE +#define _INP _INP_END + _INP_END_SIZE +#define _CTX _INP + _INP_SIZE +#define _RSP _CTX + _CTX_SIZE +#define STACK_SIZE _RSP + _RSP_SIZE + +#define ONE_ROUND_PART1(XFERIN, a, b, c, d, e, f, g, h) \ + /* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]); */ \ + /* d += h; */ \ + /* h += Sum0 (a) + Maj (a, b, c); */ \ + \ + /* Ch(x, y, z) => ((x & y) + (~x & z)) */ \ + /* Maj(x, y, z) => ((x & y) + (z & (x ^ y))) */ \ + \ + mov y3, e; \ + add h, [XFERIN]; \ + and y3, f; \ + rorx y0, e, 25; \ + rorx y1, e, 11; \ + lea h, [h + y3]; \ + andn y3, e, g; \ + rorx T1, a, 13; \ + xor y0, y1; \ lea h, [h + y3] - andn y3, e, g - rorx T1, a, 13 - xor y0, y1 - lea h, [h + y3] -.endm -.macro ONE_ROUND_PART2 - rorx y2, a, 22 - rorx y1, e, 6 - mov y3, a - xor T1, y2 - xor y0, y1 - xor y3, b - lea h, [h + y0] - mov y0, a - rorx y2, a, 2 - add d, h - and y3, c - xor T1, y2 - lea h, [h + y3] - lea h, [h + T1] - and y0, b - lea h, [h + y0] -.endm - -.macro ONE_ROUND XFER - ONE_ROUND_PART1 \XFER - ONE_ROUND_PART2 -.endm - -.macro FOUR_ROUNDS_AND_SCHED XFER, XFEROUT -/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - - vpalignr XTMP0, X3, X2, 4 /* XTMP0 = W[-7] */ - vpaddd XTMP0, XTMP0, X0 /* XTMP0 = W[-7] + W[-16]; y1 = (e >> 6); S1 */ - vpalignr XTMP1, X1, X0, 4 /* XTMP1 = W[-15] */ - vpsrld XTMP2, XTMP1, 7 - vpslld XTMP3, XTMP1, (32-7) - vpor XTMP3, XTMP3, XTMP2 /* XTMP3 = W[-15] ror 7 */ - vpsrld XTMP2, XTMP1,18 - - ONE_ROUND 0*4+\XFER - ROTATE_ARGS - -/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - - vpsrld XTMP4, XTMP1, 3 /* XTMP4 = W[-15] >> 3 */ - vpslld XTMP1, XTMP1, (32-18) - vpxor XTMP3, XTMP3, XTMP1 - vpxor XTMP3, XTMP3, XTMP2 /* XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 */ - vpxor XTMP1, XTMP3, XTMP4 /* XTMP1 = s0 */ - vpshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */ - vpaddd XTMP0, XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */ - vpsrld XTMP4, XTMP2, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */ - - ONE_ROUND 1*4+\XFER - ROTATE_ARGS - -/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - vpsrlq XTMP3, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */ - vpsrlq XTMP2, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */ - vpxor XTMP2, XTMP2, XTMP3 - vpxor XTMP4, XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */ - 
vpshufb XTMP4, XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */ - vpaddd XTMP0, XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */ - vpshufd XTMP2, XTMP0, 0b1010000 /* XTMP2 = W[-2] {DDCC} */ - - ONE_ROUND 2*4+\XFER - ROTATE_ARGS - -/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - - vpsrld XTMP5, XTMP2, 10 /* XTMP5 = W[-2] >> 10 {DDCC} */ - vpsrlq XTMP3, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */ - vpsrlq XTMP2, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */ - vpxor XTMP2, XTMP2, XTMP3 - vpxor XTMP5, XTMP5, XTMP2 /* XTMP5 = s1 {xDxC} */ - vpshufb XTMP5, XTMP5, SHUF_DC00 /* XTMP5 = s1 {DC00} */ - vpaddd X0, XTMP5, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */ - vpaddd XFER, X0, [TBL + \XFEROUT] - - ONE_ROUND_PART1 3*4+\XFER - vmovdqa [rsp + _XFER + \XFEROUT], XFER - ONE_ROUND_PART2 - ROTATE_ARGS - rotate_Xs -.endm - -.macro DO_4ROUNDS XFER -/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - - ONE_ROUND 0*4+\XFER - ROTATE_ARGS - -/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - - ONE_ROUND 1*4+\XFER - ROTATE_ARGS - -/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - - ONE_ROUND 2*4+\XFER - ROTATE_ARGS - -/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;; */ +#define ONE_ROUND_PART2(a, b, c, d, e, f, g, h) \ + rorx y2, a, 22; \ + rorx y1, e, 6; \ + mov y3, a; \ + xor T1, y2; \ + xor y0, y1; \ + xor y3, b; \ + lea h, [h + y0]; \ + mov y0, a; \ + rorx y2, a, 2; \ + add d, h; \ + and y3, c; \ + xor T1, y2; \ + lea h, [h + y3]; \ + lea h, [h + T1]; \ + and y0, b; \ + lea h, [h + y0] - ONE_ROUND 3*4+\XFER - ROTATE_ARGS -.endm +#define ONE_ROUND(XFER, a, b, c, d, e, f, g, h) \ + ONE_ROUND_PART1(XFER, a, b, c, d, e, f, g, h); \ + ONE_ROUND_PART2(a, b, c, d, e, f, g, h) + +#define FOUR_ROUNDS_AND_SCHED(XFERIN, XFEROUT, X0, X1, X2, X3, a, b, c, d, e, f, g, h) \ + /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \ + vpalignr XTMP0, X3, X2, 4 /* XTMP0 = W[-7] */; \ + vpaddd XTMP0, XTMP0, X0 /* XTMP0 = W[-7] + W[-16]; y1 = (e >> 6); S1 */; \ + vpalignr XTMP1, X1, X0, 4 /* XTMP1 = W[-15] */; \ + vpsrld XTMP2, XTMP1, 7; \ + vpslld XTMP3, XTMP1, (32-7); \ + vpor XTMP3, XTMP3, XTMP2 /* XTMP3 = W[-15] ror 7 */; \ + vpsrld XTMP2, XTMP1,18; \ + \ + ONE_ROUND(0*4+XFERIN, a, b, c, d, e, f, g, h); \ + \ + /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \ + vpsrld XTMP4, XTMP1, 3 /* XTMP4 = W[-15] >> 3 */; \ + vpslld XTMP1, XTMP1, (32-18); \ + vpxor XTMP3, XTMP3, XTMP1; \ + vpxor XTMP3, XTMP3, XTMP2 /* XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 */; \ + vpxor XTMP1, XTMP3, XTMP4 /* XTMP1 = s0 */; \ + vpshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */; \ + vpaddd XTMP0, XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */; \ + vpsrld XTMP4, XTMP2, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */; \ + \ + ONE_ROUND(1*4+XFERIN, h, a, b, c, d, e, f, g); \ + \ + /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \ + vpsrlq XTMP3, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */; \ + vpsrlq XTMP2, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */; \ + vpxor XTMP2, XTMP2, XTMP3; \ + vpxor XTMP4, XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */; \ + vpshufb XTMP4, XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */; \ + vpaddd XTMP0, XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */; \ + vpshufd XTMP2, XTMP0, 0b1010000 /* XTMP2 = W[-2] {DDCC} */; \ + \ + ONE_ROUND(2*4+XFERIN, g, h, a, b, c, d, e, f); \ + \ + /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \ + vpsrld XTMP5, XTMP2, 10 /* XTMP5 = W[-2] >> 10 {DDCC} */; \ + vpsrlq XTMP3, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */; \ + vpsrlq XTMP2, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */; \ + vpxor XTMP2, XTMP2, XTMP3; \ + vpxor XTMP5, XTMP5, XTMP2 /* XTMP5 = s1 {xDxC} */; \ + vpshufb XTMP5, XTMP5, SHUF_DC00 /* XTMP5 = s1 {DC00} */; \ + vpaddd X0, XTMP5, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */; \ + vpaddd XFER, X0, [TBL + XFEROUT]; \ + \ + ONE_ROUND_PART1(3*4+XFERIN, f, g, h, a, b, c, d, e); \ + vmovdqa [rsp + _XFER + XFEROUT], XFER; \ + ONE_ROUND_PART2(f, g, h, a, b, c, d, e); + +#define DO_4ROUNDS(XFERIN, a, b, c, d, e, f, g, h) \ + ONE_ROUND(0*4+XFERIN, a, b, c, d, e, f, g, h); \ + ONE_ROUND(1*4+XFERIN, h, a, b, c, d, e, f, g); \ + ONE_ROUND(2*4+XFERIN, g, h, a, b, c, d, e, f); \ + ONE_ROUND(3*4+XFERIN, f, g, h, a, b, c, d, e) /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; void sha256_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks) ;; arg 1 : pointer to input data ;; arg 2 : pointer to digest ;; arg 3 : Num blocks */ .text .globl _gcry_sha256_transform_amd64_avx2 ELF(.type _gcry_sha256_transform_amd64_avx2,@function) .align 32 _gcry_sha256_transform_amd64_avx2: CFI_STARTPROC() xor eax, eax cmp rdx, 0 je .Lnowork push rbx CFI_PUSH(rbx) push rbp CFI_PUSH(rbp) push r12 CFI_PUSH(r12) push r13 CFI_PUSH(r13) push r14 CFI_PUSH(r14) push r15 CFI_PUSH(r15) vzeroupper vmovdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP] vmovdqa SHUF_00BA, [.L_SHUF_00BA ADD_RIP] vmovdqa SHUF_DC00, [.L_SHUF_DC00 ADD_RIP] mov rax, rsp CFI_DEF_CFA_REGISTER(rax); sub rsp, STACK_SIZE and rsp, ~63 mov [rsp + _RSP], rax CFI_CFA_ON_STACK(_RSP, 6 * 8) shl NUM_BLKS, 6 /* convert to bytes */ lea NUM_BLKS, [NUM_BLKS + INP - 64] /* pointer to last block */ mov [rsp + _INP_END], NUM_BLKS /* ; load initial digest */ mov a,[4*0 + CTX] mov b,[4*1 + CTX] mov c,[4*2 + CTX] mov d,[4*3 + CTX] mov e,[4*4 + CTX] mov f,[4*5 + CTX] mov g,[4*6 + CTX] mov h,[4*7 + CTX] mov [rsp + _CTX], CTX .Loop0: lea TBL, [.LK256 ADD_RIP] /* ; Load first 16 dwords from two blocks */ VMOVDQ XTMP0, [INP + 0*32] VMOVDQ XTMP1, [INP + 1*32] VMOVDQ XTMP2, [INP + 2*32] VMOVDQ XTMP3, [INP + 3*32] /* ; byte swap data */ vpshufb XTMP0, XTMP0, BYTE_FLIP_MASK vpshufb XTMP1, XTMP1, BYTE_FLIP_MASK vpshufb XTMP2, XTMP2, BYTE_FLIP_MASK vpshufb XTMP3, XTMP3, BYTE_FLIP_MASK /* ; transpose data into high/low halves */ vperm2i128 X0, XTMP0, XTMP2, 0x20 vperm2i128 X1, XTMP0, XTMP2, 0x31 vperm2i128 X2, XTMP1, XTMP3, 0x20 vperm2i128 X3, XTMP1, XTMP3, 0x31 .Last_block_enter: add INP, 64 mov [rsp + _INP], INP /* ; schedule 48 input dwords, by doing 3 rounds of 12 each */ xor SRND, SRND vpaddd XFER, X0, [TBL + 0*32] vmovdqa [rsp + _XFER + 0*32], XFER vpaddd XFER, X1, [TBL + 1*32] vmovdqa [rsp + _XFER + 1*32], XFER vpaddd XFER, X2, [TBL + 2*32] vmovdqa [rsp + _XFER + 2*32], XFER vpaddd XFER, X3, [TBL + 3*32] vmovdqa [rsp + _XFER + 3*32], XFER .align 16 .Loop1: - FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 0*32, SRND + 4*32 - FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 1*32, SRND + 5*32 - FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 2*32, SRND + 6*32 - FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 3*32, SRND + 7*32 + FOUR_ROUNDS_AND_SCHED(rsp + _XFER + SRND + 0*32, SRND + 4*32, X0, X1, X2, X3, a, b, c, d, e, f, g, h) + FOUR_ROUNDS_AND_SCHED(rsp + _XFER + SRND + 1*32, SRND + 5*32, X1, X2, X3, X0, e, f, g, h, a, b, c, d) + 
FOUR_ROUNDS_AND_SCHED(rsp + _XFER + SRND + 2*32, SRND + 6*32, X2, X3, X0, X1, a, b, c, d, e, f, g, h) + FOUR_ROUNDS_AND_SCHED(rsp + _XFER + SRND + 3*32, SRND + 7*32, X3, X0, X1, X2, e, f, g, h, a, b, c, d) add SRND, 4*32 cmp SRND, 3 * 4*32 jb .Loop1 /* ; Do last 16 rounds with no scheduling */ - DO_4ROUNDS rsp + _XFER + (3*4*32 + 0*32) - DO_4ROUNDS rsp + _XFER + (3*4*32 + 1*32) - DO_4ROUNDS rsp + _XFER + (3*4*32 + 2*32) - DO_4ROUNDS rsp + _XFER + (3*4*32 + 3*32) + DO_4ROUNDS(rsp + _XFER + (3*4*32 + 0*32), a, b, c, d, e, f, g, h) + DO_4ROUNDS(rsp + _XFER + (3*4*32 + 1*32), e, f, g, h, a, b, c, d) + DO_4ROUNDS(rsp + _XFER + (3*4*32 + 2*32), a, b, c, d, e, f, g, h) + DO_4ROUNDS(rsp + _XFER + (3*4*32 + 3*32), e, f, g, h, a, b, c, d) mov CTX, [rsp + _CTX] mov INP, [rsp + _INP] - addm [4*0 + CTX],a - addm [4*1 + CTX],b - addm [4*2 + CTX],c - addm [4*3 + CTX],d - addm [4*4 + CTX],e - addm [4*5 + CTX],f - addm [4*6 + CTX],g - addm [4*7 + CTX],h + addm([4*0 + CTX],a) + addm([4*1 + CTX],b) + addm([4*2 + CTX],c) + addm([4*3 + CTX],d) + addm([4*4 + CTX],e) + addm([4*5 + CTX],f) + addm([4*6 + CTX],g) + addm([4*7 + CTX],h) cmp INP, [rsp + _INP_END] ja .Ldone_hash /* ;;; Do second block using previously scheduled results */ xor SRND, SRND .align 16 .Loop3: - DO_4ROUNDS rsp + _XFER + SRND + 0*32 + 16 - DO_4ROUNDS rsp + _XFER + SRND + 1*32 + 16 + DO_4ROUNDS(rsp + _XFER + SRND + 0*32 + 16, a, b, c, d, e, f, g, h) + DO_4ROUNDS(rsp + _XFER + SRND + 1*32 + 16, e, f, g, h, a, b, c, d) add SRND, 2*32 cmp SRND, 4 * 4*32 jb .Loop3 mov CTX, [rsp + _CTX] mov INP, [rsp + _INP] add INP, 64 - addm [4*0 + CTX],a - addm [4*1 + CTX],b - addm [4*2 + CTX],c - addm [4*3 + CTX],d - addm [4*4 + CTX],e - addm [4*5 + CTX],f - addm [4*6 + CTX],g - addm [4*7 + CTX],h + addm([4*0 + CTX],a) + addm([4*1 + CTX],b) + addm([4*2 + CTX],c) + addm([4*3 + CTX],d) + addm([4*4 + CTX],e) + addm([4*5 + CTX],f) + addm([4*6 + CTX],g) + addm([4*7 + CTX],h) cmp INP, [rsp + _INP_END] jb .Loop0 ja .Ldone_hash .Ldo_last_block: /* ;;; do last block */ lea TBL, [.LK256 ADD_RIP] VMOVDQ XWORD0, [INP + 0*16] VMOVDQ XWORD1, [INP + 1*16] VMOVDQ XWORD2, [INP + 2*16] VMOVDQ XWORD3, [INP + 3*16] vpshufb XWORD0, XWORD0, X_BYTE_FLIP_MASK vpshufb XWORD1, XWORD1, X_BYTE_FLIP_MASK vpshufb XWORD2, XWORD2, X_BYTE_FLIP_MASK vpshufb XWORD3, XWORD3, X_BYTE_FLIP_MASK jmp .Last_block_enter .Lonly_one_block: /* ; load initial digest */ mov a,[4*0 + CTX] mov b,[4*1 + CTX] mov c,[4*2 + CTX] mov d,[4*3 + CTX] mov e,[4*4 + CTX] mov f,[4*5 + CTX] mov g,[4*6 + CTX] mov h,[4*7 + CTX] vmovdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP] vmovdqa SHUF_00BA, [.L_SHUF_00BA ADD_RIP] vmovdqa SHUF_DC00, [.L_SHUF_DC00 ADD_RIP] mov [rsp + _CTX], CTX jmp .Ldo_last_block .Ldone_hash: vzeroall /* burn stack */ vmovdqa [rsp + _XFER + 0 * 32], ymm0 vmovdqa [rsp + _XFER + 1 * 32], ymm0 vmovdqa [rsp + _XFER + 2 * 32], ymm0 vmovdqa [rsp + _XFER + 3 * 32], ymm0 vmovdqa [rsp + _XFER + 4 * 32], ymm0 vmovdqa [rsp + _XFER + 5 * 32], ymm0 vmovdqa [rsp + _XFER + 6 * 32], ymm0 vmovdqa [rsp + _XFER + 7 * 32], ymm0 vmovdqa [rsp + _XFER + 8 * 32], ymm0 vmovdqa [rsp + _XFER + 9 * 32], ymm0 vmovdqa [rsp + _XFER + 10 * 32], ymm0 vmovdqa [rsp + _XFER + 11 * 32], ymm0 vmovdqa [rsp + _XFER + 12 * 32], ymm0 vmovdqa [rsp + _XFER + 13 * 32], ymm0 vmovdqa [rsp + _XFER + 14 * 32], ymm0 vmovdqa [rsp + _XFER + 15 * 32], ymm0 xor eax, eax mov rsp, [rsp + _RSP] CFI_DEF_CFA_REGISTER(rsp) pop r15 CFI_POP(r15) pop r14 CFI_POP(r14) pop r13 CFI_POP(r13) pop r12 CFI_POP(r12) pop rbp CFI_POP(rbp) pop rbx CFI_POP(rbx) 
.Lnowork: ret CFI_ENDPROC() .align 64 .LK256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .LPSHUFFLE_BYTE_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203 /* shuffle xBxA -> 00BA */ .L_SHUF_00BA: .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100 /* shuffle xDxC -> DC00 */ .L_SHUF_DC00: .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF #endif #endif diff --git a/cipher/sha256-ssse3-amd64.S b/cipher/sha256-ssse3-amd64.S index 0fb94c1b..098b0eb6 100644 --- a/cipher/sha256-ssse3-amd64.S +++ b/cipher/sha256-ssse3-amd64.S @@ -1,553 +1,528 @@ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright (c) 2012, Intel Corporation ; ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are ; met: ; ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in the ; documentation and/or other materials provided with the ; distribution. ; ; * Neither the name of the Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived from ; this software without specific prior written permission. ; ; ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR ; PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL CORPORATION OR ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; ; This code is described in an Intel White-Paper: ; "Fast SHA-256 Implementations on Intel Architecture Processors" ; ; To find it, surf to http://www.intel.com/p/en_US/embedded ; and search for that title. ; The paper is expected to be released roughly at the end of April, 2012 ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; This code schedules 1 blocks at a time, with 4 lanes per block ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /* * Conversion to GAS assembly and integration to libgcrypt * by Jussi Kivilinna * * Note: original implementation was named as SHA256-SSE4. However, only SSSE3 * is required. */ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA256) #include "asm-common-amd64.h" .intel_syntax noprefix #define MOVDQ movdqu /* assume buffers not aligned */ /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros*/ /* addm [mem], reg * Add reg to mem using reg-mem add and store */ -.macro addm p1 p2 - add \p2, \p1 - mov \p1, \p2 -.endm +#define addm(p1, p2) \ + add p2, p1; \ + mov p1, p2; /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ /* COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask * Load xmm with mem and byte swap each dword */ -.macro COPY_XMM_AND_BSWAP p1 p2 p3 - MOVDQ \p1, \p2 - pshufb \p1, \p3 -.endm +#define COPY_XMM_AND_BSWAP(p1, p2, p3) \ + MOVDQ p1, p2; \ + pshufb p1, p3; /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ -X0 = xmm4 -X1 = xmm5 -X2 = xmm6 -X3 = xmm7 +#define X0 xmm4 +#define X1 xmm5 +#define X2 xmm6 +#define X3 xmm7 -XTMP0 = xmm0 -XTMP1 = xmm1 -XTMP2 = xmm2 -XTMP3 = xmm3 -XTMP4 = xmm8 -XFER = xmm9 +#define XTMP0 xmm0 +#define XTMP1 xmm1 +#define XTMP2 xmm2 +#define XTMP3 xmm3 +#define XTMP4 xmm8 +#define XFER xmm9 -SHUF_00BA = xmm10 /* shuffle xBxA -> 00BA */ -SHUF_DC00 = xmm11 /* shuffle xDxC -> DC00 */ -BYTE_FLIP_MASK = xmm12 +#define SHUF_00BA xmm10 /* shuffle xBxA -> 00BA */ +#define SHUF_DC00 xmm11 /* shuffle xDxC -> DC00 */ +#define BYTE_FLIP_MASK xmm12 -NUM_BLKS = rdx /* 3rd arg */ -CTX = rsi /* 2nd arg */ -INP = rdi /* 1st arg */ +#define NUM_BLKS rdx /* 3rd arg */ +#define CTX rsi /* 2nd arg */ +#define INP rdi /* 1st arg */ -SRND = rdi /* clobbers INP */ -c = ecx -d = r8d -e = edx +#define SRND rdi /* clobbers INP */ +#define c ecx +#define d r8d +#define e edx -TBL = rbp -a = eax -b = ebx +#define TBL rbp +#define a eax +#define b ebx -f = r9d -g = r10d -h = r11d +#define f r9d +#define g r10d +#define h r11d -y0 = r13d -y1 = r14d -y2 = r15d +#define y0 r13d +#define y1 r14d +#define y2 r15d #define _INP_END_SIZE 8 #define _INP_SIZE 8 #define _XFER_SIZE 8 #define _XMM_SAVE_SIZE 0 /* STACK_SIZE plus pushes must be an odd multiple of 8 */ #define _ALIGN_SIZE 8 #define _INP_END 0 #define _INP (_INP_END + _INP_END_SIZE) #define _XFER (_INP + _INP_SIZE) 
#define _XMM_SAVE (_XFER + _XFER_SIZE + _ALIGN_SIZE) #define STACK_SIZE (_XMM_SAVE + _XMM_SAVE_SIZE) -/* rotate_Xs - * Rotate values of symbols X0...X3 */ -.macro rotate_Xs -X_ = X0 -X0 = X1 -X1 = X2 -X2 = X3 -X3 = X_ -.endm - -/* ROTATE_ARGS - * Rotate values of symbols a...h */ -.macro ROTATE_ARGS -TMP_ = h -h = g -g = f -f = e -e = d -d = c -c = b -b = a -a = TMP_ -.endm - -.macro FOUR_ROUNDS_AND_SCHED - /* compute s0 four at a time and s1 two at a time - * compute W[-16] + W[-7] 4 at a time */ - movdqa XTMP0, X3 - mov y0, e /* y0 = e */ - ror y0, (25-11) /* y0 = e >> (25-11) */ - mov y1, a /* y1 = a */ - palignr XTMP0, X2, 4 /* XTMP0 = W[-7] */ - ror y1, (22-13) /* y1 = a >> (22-13) */ - xor y0, e /* y0 = e ^ (e >> (25-11)) */ - mov y2, f /* y2 = f */ - ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ - movdqa XTMP1, X1 - xor y1, a /* y1 = a ^ (a >> (22-13) */ - xor y2, g /* y2 = f^g */ - paddd XTMP0, X0 /* XTMP0 = W[-7] + W[-16] */ - xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ - and y2, e /* y2 = (f^g)&e */ - ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ - /* compute s0 */ - palignr XTMP1, X0, 4 /* XTMP1 = W[-15] */ - xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ - ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ - xor y2, g /* y2 = CH = ((f^g)&e)^g */ - movdqa XTMP2, XTMP1 /* XTMP2 = W[-15] */ - ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ - add y2, y0 /* y2 = S1 + CH */ - add y2, [rsp + _XFER + 0*4] /* y2 = k + w + S1 + CH */ - movdqa XTMP3, XTMP1 /* XTMP3 = W[-15] */ - mov y0, a /* y0 = a */ - add h, y2 /* h = h + S1 + CH + k + w */ - mov y2, a /* y2 = a */ - pslld XTMP1, (32-7) - or y0, c /* y0 = a|c */ - add d, h /* d = d + h + S1 + CH + k + w */ - and y2, c /* y2 = a&c */ - psrld XTMP2, 7 - and y0, b /* y0 = (a|c)&b */ - add h, y1 /* h = h + S1 + CH + k + w + S0 */ - por XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 */ - or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ + +#define FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \ + /* compute s0 four at a time and s1 two at a time */; \ + /* compute W[-16] + W[-7] 4 at a time */; \ + movdqa XTMP0, X3; \ + mov y0, e /* y0 = e */; \ + ror y0, (25-11) /* y0 = e >> (25-11) */; \ + mov y1, a /* y1 = a */; \ + palignr XTMP0, X2, 4 /* XTMP0 = W[-7] */; \ + ror y1, (22-13) /* y1 = a >> (22-13) */; \ + xor y0, e /* y0 = e ^ (e >> (25-11)) */; \ + mov y2, f /* y2 = f */; \ + ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \ + movdqa XTMP1, X1; \ + xor y1, a /* y1 = a ^ (a >> (22-13) */; \ + xor y2, g /* y2 = f^g */; \ + paddd XTMP0, X0 /* XTMP0 = W[-7] + W[-16] */; \ + xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \ + and y2, e /* y2 = (f^g)&e */; \ + ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \ + /* compute s0 */; \ + palignr XTMP1, X0, 4 /* XTMP1 = W[-15] */; \ + xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \ + ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \ + xor y2, g /* y2 = CH = ((f^g)&e)^g */; \ + movdqa XTMP2, XTMP1 /* XTMP2 = W[-15] */; \ + ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \ + add y2, y0 /* y2 = S1 + CH */; \ + add y2, [rsp + _XFER + 0*4] /* y2 = k + w + S1 + CH */; \ + movdqa XTMP3, XTMP1 /* XTMP3 = W[-15] */; \ + mov y0, a /* y0 = a */; \ + add h, y2 /* h = h + S1 + CH + k + w */; \ + mov y2, a /* y2 = a */; \ + pslld XTMP1, (32-7); \ + or y0, c /* y0 = a|c */; \ + add d, h /* d = d + h + S1 + CH + k + w */; \ + and y2, c /* y2 = a&c */; \ + psrld XTMP2, 7; \ + and y0, b /* y0 = (a|c)&b */; \ + add h, y1 /* h 
= h + S1 + CH + k + w + S0 */; \ + por XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 */; \ + or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ -ROTATE_ARGS - movdqa XTMP2, XTMP3 /* XTMP2 = W[-15] */ - mov y0, e /* y0 = e */ - mov y1, a /* y1 = a */ - movdqa XTMP4, XTMP3 /* XTMP4 = W[-15] */ - ror y0, (25-11) /* y0 = e >> (25-11) */ - xor y0, e /* y0 = e ^ (e >> (25-11)) */ - mov y2, f /* y2 = f */ - ror y1, (22-13) /* y1 = a >> (22-13) */ - pslld XTMP3, (32-18) - xor y1, a /* y1 = a ^ (a >> (22-13) */ - ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ - xor y2, g /* y2 = f^g */ - psrld XTMP2, 18 - ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ - xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ - and y2, e /* y2 = (f^g)&e */ - ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ - pxor XTMP1, XTMP3 - xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ - xor y2, g /* y2 = CH = ((f^g)&e)^g */ - psrld XTMP4, 3 /* XTMP4 = W[-15] >> 3 */ - add y2, y0 /* y2 = S1 + CH */ - add y2, [rsp + _XFER + 1*4] /* y2 = k + w + S1 + CH */ - ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ - pxor XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */ - mov y0, a /* y0 = a */ - add h, y2 /* h = h + S1 + CH + k + w */ - mov y2, a /* y2 = a */ - pxor XTMP1, XTMP4 /* XTMP1 = s0 */ - or y0, c /* y0 = a|c */ - add d, h /* d = d + h + S1 + CH + k + w */ - and y2, c /* y2 = a&c */ - /* compute low s1 */ - pshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */ - and y0, b /* y0 = (a|c)&b */ - add h, y1 /* h = h + S1 + CH + k + w + S0 */ - paddd XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */ - or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ +#define FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \ + movdqa XTMP2, XTMP3 /* XTMP2 = W[-15] */; \ + mov y0, e /* y0 = e */; \ + mov y1, a /* y1 = a */; \ + movdqa XTMP4, XTMP3 /* XTMP4 = W[-15] */; \ + ror y0, (25-11) /* y0 = e >> (25-11) */; \ + xor y0, e /* y0 = e ^ (e >> (25-11)) */; \ + mov y2, f /* y2 = f */; \ + ror y1, (22-13) /* y1 = a >> (22-13) */; \ + pslld XTMP3, (32-18); \ + xor y1, a /* y1 = a ^ (a >> (22-13) */; \ + ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \ + xor y2, g /* y2 = f^g */; \ + psrld XTMP2, 18; \ + ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \ + xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \ + and y2, e /* y2 = (f^g)&e */; \ + ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \ + pxor XTMP1, XTMP3; \ + xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \ + xor y2, g /* y2 = CH = ((f^g)&e)^g */; \ + psrld XTMP4, 3 /* XTMP4 = W[-15] >> 3 */; \ + add y2, y0 /* y2 = S1 + CH */; \ + add y2, [rsp + _XFER + 1*4] /* y2 = k + w + S1 + CH */; \ + ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \ + pxor XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */; \ + mov y0, a /* y0 = a */; \ + add h, y2 /* h = h + S1 + CH + k + w */; \ + mov y2, a /* y2 = a */; \ + pxor XTMP1, XTMP4 /* XTMP1 = s0 */; \ + or y0, c /* y0 = a|c */; \ + add d, h /* d = d + h + S1 + CH + k + w */; \ + and y2, c /* y2 = a&c */; \ + /* compute low s1 */; \ + pshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */; \ + and y0, b /* y0 = (a|c)&b */; \ + add h, y1 /* h = h + S1 + CH + k + w + S0 */; \ + paddd XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */; \ + or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ -ROTATE_ARGS - movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {BBAA} */ - mov y0, e /* y0 = e */ - mov 
y1, a /* y1 = a */ - ror y0, (25-11) /* y0 = e >> (25-11) */ - movdqa XTMP4, XTMP2 /* XTMP4 = W[-2] {BBAA} */ - xor y0, e /* y0 = e ^ (e >> (25-11)) */ - ror y1, (22-13) /* y1 = a >> (22-13) */ - mov y2, f /* y2 = f */ - xor y1, a /* y1 = a ^ (a >> (22-13) */ - ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ - psrlq XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */ - xor y2, g /* y2 = f^g */ - psrlq XTMP3, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */ - xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ - and y2, e /* y2 = (f^g)&e */ - psrld XTMP4, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */ - ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ - xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ - xor y2, g /* y2 = CH = ((f^g)&e)^g */ - ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ - pxor XTMP2, XTMP3 - add y2, y0 /* y2 = S1 + CH */ - ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ - add y2, [rsp + _XFER + 2*4] /* y2 = k + w + S1 + CH */ - pxor XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */ - mov y0, a /* y0 = a */ - add h, y2 /* h = h + S1 + CH + k + w */ - mov y2, a /* y2 = a */ - pshufb XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */ - or y0, c /* y0 = a|c */ - add d, h /* d = d + h + S1 + CH + k + w */ - and y2, c /* y2 = a&c */ - paddd XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */ - and y0, b /* y0 = (a|c)&b */ - add h, y1 /* h = h + S1 + CH + k + w + S0 */ - /* compute high s1 */ - pshufd XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */ - or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ +#define FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \ + movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {BBAA} */; \ + mov y0, e /* y0 = e */; \ + mov y1, a /* y1 = a */; \ + ror y0, (25-11) /* y0 = e >> (25-11) */; \ + movdqa XTMP4, XTMP2 /* XTMP4 = W[-2] {BBAA} */; \ + xor y0, e /* y0 = e ^ (e >> (25-11)) */; \ + ror y1, (22-13) /* y1 = a >> (22-13) */; \ + mov y2, f /* y2 = f */; \ + xor y1, a /* y1 = a ^ (a >> (22-13) */; \ + ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \ + psrlq XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */; \ + xor y2, g /* y2 = f^g */; \ + psrlq XTMP3, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */; \ + xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \ + and y2, e /* y2 = (f^g)&e */; \ + psrld XTMP4, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */; \ + ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \ + xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \ + xor y2, g /* y2 = CH = ((f^g)&e)^g */; \ + ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \ + pxor XTMP2, XTMP3; \ + add y2, y0 /* y2 = S1 + CH */; \ + ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \ + add y2, [rsp + _XFER + 2*4] /* y2 = k + w + S1 + CH */; \ + pxor XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */; \ + mov y0, a /* y0 = a */; \ + add h, y2 /* h = h + S1 + CH + k + w */; \ + mov y2, a /* y2 = a */; \ + pshufb XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */; \ + or y0, c /* y0 = a|c */; \ + add d, h /* d = d + h + S1 + CH + k + w */; \ + and y2, c /* y2 = a&c */; \ + paddd XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */; \ + and y0, b /* y0 = (a|c)&b */; \ + add h, y1 /* h = h + S1 + CH + k + w + S0 */; \ + /* compute high s1 */; \ + pshufd XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */; \ + or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ -ROTATE_ARGS - movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {DDCC} */ - mov y0, e /* y0 = e */ - ror y0, (25-11) /* y0 = e >> (25-11) */ - mov y1, a /* y1 = a */ - movdqa X0, XTMP2 /* X0 = W[-2] {DDCC} */ - ror y1, 
(22-13) /* y1 = a >> (22-13) */ - xor y0, e /* y0 = e ^ (e >> (25-11)) */ - mov y2, f /* y2 = f */ - ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ - psrlq XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */ - xor y1, a /* y1 = a ^ (a >> (22-13) */ - xor y2, g /* y2 = f^g */ - psrlq XTMP3, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */ - xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ - and y2, e /* y2 = (f^g)&e */ - ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ - psrld X0, 10 /* X0 = W[-2] >> 10 {DDCC} */ - xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ - ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ - xor y2, g /* y2 = CH = ((f^g)&e)^g */ - pxor XTMP2, XTMP3 - ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ - add y2, y0 /* y2 = S1 + CH */ - add y2, [rsp + _XFER + 3*4] /* y2 = k + w + S1 + CH */ - pxor X0, XTMP2 /* X0 = s1 {xDxC} */ - mov y0, a /* y0 = a */ - add h, y2 /* h = h + S1 + CH + k + w */ - mov y2, a /* y2 = a */ - pshufb X0, SHUF_DC00 /* X0 = s1 {DC00} */ - or y0, c /* y0 = a|c */ - add d, h /* d = d + h + S1 + CH + k + w */ - and y2, c /* y2 = a&c */ - paddd X0, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */ - and y0, b /* y0 = (a|c)&b */ - add h, y1 /* h = h + S1 + CH + k + w + S0 */ - or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ +#define FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \ + movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {DDCC} */; \ + mov y0, e /* y0 = e */; \ + ror y0, (25-11) /* y0 = e >> (25-11) */; \ + mov y1, a /* y1 = a */; \ + movdqa X0, XTMP2 /* X0 = W[-2] {DDCC} */; \ + ror y1, (22-13) /* y1 = a >> (22-13) */; \ + xor y0, e /* y0 = e ^ (e >> (25-11)) */; \ + mov y2, f /* y2 = f */; \ + ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \ + psrlq XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */; \ + xor y1, a /* y1 = a ^ (a >> (22-13) */; \ + xor y2, g /* y2 = f^g */; \ + psrlq XTMP3, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */; \ + xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \ + and y2, e /* y2 = (f^g)&e */; \ + ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \ + psrld X0, 10 /* X0 = W[-2] >> 10 {DDCC} */; \ + xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \ + ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \ + xor y2, g /* y2 = CH = ((f^g)&e)^g */; \ + pxor XTMP2, XTMP3; \ + ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \ + add y2, y0 /* y2 = S1 + CH */; \ + add y2, [rsp + _XFER + 3*4] /* y2 = k + w + S1 + CH */; \ + pxor X0, XTMP2 /* X0 = s1 {xDxC} */; \ + mov y0, a /* y0 = a */; \ + add h, y2 /* h = h + S1 + CH + k + w */; \ + mov y2, a /* y2 = a */; \ + pshufb X0, SHUF_DC00 /* X0 = s1 {DC00} */; \ + or y0, c /* y0 = a|c */; \ + add d, h /* d = d + h + S1 + CH + k + w */; \ + and y2, c /* y2 = a&c */; \ + paddd X0, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */; \ + and y0, b /* y0 = (a|c)&b */; \ + add h, y1 /* h = h + S1 + CH + k + w + S0 */; \ + or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ -ROTATE_ARGS -rotate_Xs -.endm +#define FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \ + FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h); \ + FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, h, a, b, c, d, e, f, g); \ + FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, g, h, a, b, c, d, e, f); \ + FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, f, g, h, a, b, c, d, e); /* input is [rsp + _XFER + %1 * 4] */ -.macro DO_ROUND i1 - mov y0, e /* y0 = e */ - ror y0, (25-11) /* y0 = e >> (25-11) */ - mov y1, a /* y1 = a */ - xor y0, e /* y0 = e ^ 
(e >> (25-11)) */ - ror y1, (22-13) /* y1 = a >> (22-13) */ - mov y2, f /* y2 = f */ - xor y1, a /* y1 = a ^ (a >> (22-13) */ - ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ - xor y2, g /* y2 = f^g */ - xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ - ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ - and y2, e /* y2 = (f^g)&e */ - xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ - ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ - xor y2, g /* y2 = CH = ((f^g)&e)^g */ - add y2, y0 /* y2 = S1 + CH */ - ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ - add y2, [rsp + _XFER + \i1 * 4] /* y2 = k + w + S1 + CH */ - mov y0, a /* y0 = a */ - add h, y2 /* h = h + S1 + CH + k + w */ - mov y2, a /* y2 = a */ - or y0, c /* y0 = a|c */ - add d, h /* d = d + h + S1 + CH + k + w */ - and y2, c /* y2 = a&c */ - and y0, b /* y0 = (a|c)&b */ - add h, y1 /* h = h + S1 + CH + k + w + S0 */ - or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ +#define DO_ROUND(i1, a, b, c, d, e, f, g, h) \ + mov y0, e /* y0 = e */; \ + ror y0, (25-11) /* y0 = e >> (25-11) */; \ + mov y1, a /* y1 = a */; \ + xor y0, e /* y0 = e ^ (e >> (25-11)) */; \ + ror y1, (22-13) /* y1 = a >> (22-13) */; \ + mov y2, f /* y2 = f */; \ + xor y1, a /* y1 = a ^ (a >> (22-13) */; \ + ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \ + xor y2, g /* y2 = f^g */; \ + xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \ + ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \ + and y2, e /* y2 = (f^g)&e */; \ + xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \ + ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \ + xor y2, g /* y2 = CH = ((f^g)&e)^g */; \ + add y2, y0 /* y2 = S1 + CH */; \ + ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \ + add y2, [rsp + _XFER + i1 * 4] /* y2 = k + w + S1 + CH */; \ + mov y0, a /* y0 = a */; \ + add h, y2 /* h = h + S1 + CH + k + w */; \ + mov y2, a /* y2 = a */; \ + or y0, c /* y0 = a|c */; \ + add d, h /* d = d + h + S1 + CH + k + w */; \ + and y2, c /* y2 = a&c */; \ + and y0, b /* y0 = (a|c)&b */; \ + add h, y1 /* h = h + S1 + CH + k + w + S0 */; \ + or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ - ROTATE_ARGS -.endm /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks) ;; arg 1 : pointer to input data ;; arg 2 : pointer to digest ;; arg 3 : Num blocks */ .text .globl _gcry_sha256_transform_amd64_ssse3 ELF(.type _gcry_sha256_transform_amd64_ssse3,@function;) .align 16 _gcry_sha256_transform_amd64_ssse3: CFI_STARTPROC() push rbx CFI_PUSH(rbx) push rbp CFI_PUSH(rbp) push r13 CFI_PUSH(r13) push r14 CFI_PUSH(r14) push r15 CFI_PUSH(r15) sub rsp, STACK_SIZE CFI_ADJUST_CFA_OFFSET(STACK_SIZE); shl NUM_BLKS, 6 /* convert to bytes */ jz .Ldone_hash add NUM_BLKS, INP /* pointer to end of data */ mov [rsp + _INP_END], NUM_BLKS /* load initial digest */ mov a,[4*0 + CTX] mov b,[4*1 + CTX] mov c,[4*2 + CTX] mov d,[4*3 + CTX] mov e,[4*4 + CTX] mov f,[4*5 + CTX] mov g,[4*6 + CTX] mov h,[4*7 + CTX] movdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP] movdqa SHUF_00BA, [.L_SHUF_00BA ADD_RIP] movdqa SHUF_DC00, [.L_SHUF_DC00 ADD_RIP] .Loop0: lea TBL, [.LK256 ADD_RIP] /* byte swap first 16 dwords */ - COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X2, [INP + 
2*16], BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP(X0, [INP + 0*16], BYTE_FLIP_MASK) + COPY_XMM_AND_BSWAP(X1, [INP + 1*16], BYTE_FLIP_MASK) + COPY_XMM_AND_BSWAP(X2, [INP + 2*16], BYTE_FLIP_MASK) + COPY_XMM_AND_BSWAP(X3, [INP + 3*16], BYTE_FLIP_MASK) mov [rsp + _INP], INP /* schedule 48 input dwords, by doing 3 rounds of 16 each */ mov SRND, 3 .align 16 .Loop1: movdqa XFER, [TBL + 0*16] paddd XFER, X0 movdqa [rsp + _XFER], XFER - FOUR_ROUNDS_AND_SCHED + FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h) movdqa XFER, [TBL + 1*16] - paddd XFER, X0 + paddd XFER, X1 movdqa [rsp + _XFER], XFER - FOUR_ROUNDS_AND_SCHED + FOUR_ROUNDS_AND_SCHED(X1, X2, X3, X0, e, f, g, h, a, b, c, d) movdqa XFER, [TBL + 2*16] - paddd XFER, X0 + paddd XFER, X2 movdqa [rsp + _XFER], XFER - FOUR_ROUNDS_AND_SCHED + FOUR_ROUNDS_AND_SCHED(X2, X3, X0, X1, a, b, c, d, e, f, g, h) movdqa XFER, [TBL + 3*16] - paddd XFER, X0 + paddd XFER, X3 movdqa [rsp + _XFER], XFER add TBL, 4*16 - FOUR_ROUNDS_AND_SCHED + FOUR_ROUNDS_AND_SCHED(X3, X0, X1, X2, e, f, g, h, a, b, c, d) sub SRND, 1 jne .Loop1 mov SRND, 2 .Loop2: paddd X0, [TBL + 0*16] movdqa [rsp + _XFER], X0 - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 + DO_ROUND(0, a, b, c, d, e, f, g, h) + DO_ROUND(1, h, a, b, c, d, e, f, g) + DO_ROUND(2, g, h, a, b, c, d, e, f) + DO_ROUND(3, f, g, h, a, b, c, d, e) paddd X1, [TBL + 1*16] movdqa [rsp + _XFER], X1 add TBL, 2*16 - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 + DO_ROUND(0, e, f, g, h, a, b, c, d) + DO_ROUND(1, d, e, f, g, h, a, b, c) + DO_ROUND(2, c, d, e, f, g, h, a, b) + DO_ROUND(3, b, c, d, e, f, g, h, a) movdqa X0, X2 movdqa X1, X3 sub SRND, 1 jne .Loop2 - addm [4*0 + CTX],a - addm [4*1 + CTX],b - addm [4*2 + CTX],c - addm [4*3 + CTX],d - addm [4*4 + CTX],e - addm [4*5 + CTX],f - addm [4*6 + CTX],g - addm [4*7 + CTX],h + addm([4*0 + CTX],a) + addm([4*1 + CTX],b) + addm([4*2 + CTX],c) + addm([4*3 + CTX],d) + addm([4*4 + CTX],e) + addm([4*5 + CTX],f) + addm([4*6 + CTX],g) + addm([4*7 + CTX],h) mov INP, [rsp + _INP] add INP, 64 cmp INP, [rsp + _INP_END] jne .Loop0 pxor xmm0, xmm0 pxor xmm1, xmm1 pxor xmm2, xmm2 pxor xmm3, xmm3 pxor xmm4, xmm4 pxor xmm5, xmm5 pxor xmm6, xmm6 pxor xmm7, xmm7 pxor xmm8, xmm8 pxor xmm9, xmm9 pxor xmm10, xmm10 pxor xmm11, xmm11 pxor xmm12, xmm12 .Ldone_hash: pxor XFER, XFER movdqa [rsp + _XFER], XFER xor eax, eax add rsp, STACK_SIZE CFI_ADJUST_CFA_OFFSET(-STACK_SIZE); pop r15 CFI_POP(r15) pop r14 CFI_POP(r14) pop r13 CFI_POP(r13) pop rbp CFI_POP(rbp) pop rbx CFI_POP(rbx) ret CFI_ENDPROC() .align 16 .LK256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .LPSHUFFLE_BYTE_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203 /* shuffle xBxA -> 00BA */ .L_SHUF_00BA: .octa 
0xFFFFFFFFFFFFFFFF0b0a090803020100 /* shuffle xDxC -> DC00 */ .L_SHUF_DC00: .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF #endif #endif diff --git a/cipher/sha512-avx-amd64.S b/cipher/sha512-avx-amd64.S index 991fd639..75f7b070 100644 --- a/cipher/sha512-avx-amd64.S +++ b/cipher/sha512-avx-amd64.S @@ -1,431 +1,461 @@ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright (c) 2012, Intel Corporation ; ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are ; met: ; ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in the ; documentation and/or other materials provided with the ; distribution. ; ; * Neither the name of the Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived from ; this software without specific prior written permission. ; ; ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR ; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
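For reference, the scalar sequences in DO_ROUND and the FOUR_ROUNDS_AND_SCHED_* macros above compute the standard SHA-256 round and message-schedule terms; only the instruction ordering is unusual, because rounds and schedule are interleaved. A minimal C sketch of the same math, assuming a rotr32 helper and with k_plus_w standing for the precomputed K[t]+W[t] word read from the _XFER slot (reference code, not the macros themselves):

#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, unsigned r)
{
	return (x >> r) | (x << (32 - r));
}

/* Schedule terms; the vector code above builds these from rotations by 7 and
 * 18 plus a shift by 3, and rotations by 17 and 19 plus a shift by 10. */
static inline uint32_t sigma0(uint32_t w) { return rotr32(w, 7) ^ rotr32(w, 18) ^ (w >> 3); }
static inline uint32_t sigma1(uint32_t w) { return rotr32(w, 17) ^ rotr32(w, 19) ^ (w >> 10); }

/* One compression round, s[0..7] = a..h.  The staged "ror (25-11)",
 * "ror (11-6)", "ror 6" steps in the macros compose to the plain
 * rotations used here. */
static void sha256_round(uint32_t s[8], uint32_t k_plus_w)
{
	uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
	uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

	uint32_t S1  = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
	uint32_t ch  = (e & f) ^ (~e & g);            /* ((f^g)&e)^g in the macros */
	uint32_t S0  = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
	uint32_t maj = (a & b) ^ (a & c) ^ (b & c);   /* ((a|c)&b)|(a&c) in the macros */

	uint32_t t1 = h + S1 + ch + k_plus_w;
	uint32_t t2 = S0 + maj;

	/* Rotate the working variables; the macros do this by renaming their
	 * register arguments (a..h -> h, a..g) instead of moving data. */
	s[7] = g;  s[6] = f;  s[5] = e;  s[4] = d + t1;
	s[3] = c;  s[2] = b;  s[1] = a;  s[0] = t1 + t2;
}

Rotating the argument list rather than the data is what lets the converted macros drop the old ROTATE_ARGS and rotate_Xs helpers.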
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /* * Conversion to GAS assembly and integration to libgcrypt * by Jussi Kivilinna */ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA512) #include "asm-common-amd64.h" .intel_syntax noprefix .text /* Virtual Registers */ -msg = rdi /* ARG1 */ -digest = rsi /* ARG2 */ -msglen = rdx /* ARG3 */ -T1 = rcx -T2 = r8 -a_64 = r9 -b_64 = r10 -c_64 = r11 -d_64 = r12 -e_64 = r13 -f_64 = r14 -g_64 = r15 -h_64 = rbx -tmp0 = rax +#define msg rdi /* ARG1 */ +#define digest rsi /* ARG2 */ +#define msglen rdx /* ARG3 */ +#define T1 rcx +#define T2 r8 +#define a_64 r9 +#define b_64 r10 +#define c_64 r11 +#define d_64 r12 +#define e_64 r13 +#define f_64 r14 +#define g_64 r15 +#define h_64 rbx +#define tmp0 rax /* ; Local variables (stack frame) ; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP */ -frame_W = 0 /* Message Schedule */ -frame_W_size = (80 * 8) -frame_WK = ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */ -frame_WK_size = (2 * 8) -frame_GPRSAVE = ((frame_WK) + (frame_WK_size)) -frame_GPRSAVE_size = (5 * 8) -frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size)) +#define frame_W 0 /* Message Schedule */ +#define frame_W_size (80 * 8) +#define frame_WK ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */ +#define frame_WK_size (2 * 8) +#define frame_GPRSAVE ((frame_WK) + (frame_WK_size)) +#define frame_GPRSAVE_size (5 * 8) +#define frame_size ((frame_GPRSAVE) + (frame_GPRSAVE_size)) /* Useful QWORD "arrays" for simpler memory references */ #define MSG(i) msg + 8*(i) /* Input message (arg1) */ #define DIGEST(i) digest + 8*(i) /* Output Digest (arg2) */ #define K_t(i) .LK512 + 8*(i) ADD_RIP /* SHA Constants (static mem) */ #define W_t(i) rsp + frame_W + 8*(i) /* Message Schedule (stack frame) */ #define WK_2(i) rsp + frame_WK + 8*((i) % 2) /* W[t]+K[t] (stack frame) */ /* MSG, DIGEST, K_t, W_t are arrays */ /* WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even */ -.macro RotateState - /* Rotate symbles a..h right */ - __TMP = h_64 - h_64 = g_64 - g_64 = f_64 - f_64 = e_64 - e_64 = d_64 - d_64 = c_64 - c_64 = b_64 - b_64 = a_64 - a_64 = __TMP -.endm - -.macro RORQ p1 p2 - /* shld is faster than ror on Intel Sandybridge */ - shld \p1, \p1, (64 - \p2) -.endm - -.macro SHA512_Round t - /* Compute Round %%t */ - mov T1, f_64 /* T1 = f */ - mov tmp0, e_64 /* tmp = e */ - xor T1, g_64 /* T1 = f ^ g */ - RORQ tmp0, 23 /* 41 ; tmp = e ror 23 */ - and T1, e_64 /* T1 = (f ^ g) & e */ - xor tmp0, e_64 /* tmp = (e ror 23) ^ e */ - xor T1, g_64 /* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */ - add T1, [WK_2(\t)] /* W[t] + K[t] from message scheduler */ - RORQ tmp0, 4 /* 18 ; tmp = ((e ror 23) ^ e) ror 4 */ - xor tmp0, e_64 /* tmp = (((e ror 23) ^ e) ror 4) ^ e */ - mov T2, a_64 /* T2 = a */ - add T1, h_64 /* T1 = CH(e,f,g) + W[t] + K[t] + h */ - RORQ tmp0, 14 /* 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */ - add T1, tmp0 /* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */ - mov tmp0, a_64 /* tmp = a */ - xor T2, c_64 /* T2 = a ^ c */ - and tmp0, c_64 /* tmp = a & c */ - and T2, b_64 /* T2 = (a ^ c) & b */ - xor T2, tmp0 /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */ - mov tmp0, a_64 /* tmp = a */ - RORQ tmp0, 5 /* 39 ; tmp = a ror 5 */ - xor tmp0, a_64 /* tmp = (a ror 5) ^ a */ - add d_64, T1 /* 
e(next_state) = d + T1 */ - RORQ tmp0, 6 /* 34 ; tmp = ((a ror 5) ^ a) ror 6 */ - xor tmp0, a_64 /* tmp = (((a ror 5) ^ a) ror 6) ^ a */ - lea h_64, [T1 + T2] /* a(next_state) = T1 + Maj(a,b,c) */ - RORQ tmp0, 28 /* 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */ - add h_64, tmp0 /* a(next_state) = T1 + Maj(a,b,c) S0(a) */ - RotateState -.endm - -.macro SHA512_2Sched_2Round_avx t -/* ; Compute rounds %%t-2 and %%t-1 - ; Compute message schedule QWORDS %%t and %%t+1 - - ; Two rounds are computed based on the values for K[t-2]+W[t-2] and - ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message - ; scheduler. - ; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. - ; They are then added to their respective SHA512 constants at - ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] - ; For brievity, the comments following vectored instructions only refer to - ; the first of a pair of QWORDS. - ; Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]} - ; The computation of the message schedule and the rounds are tightly - ; stitched to take advantage of instruction-level parallelism. - ; For clarity, integer instructions (for the rounds calculation) are indented - ; by one tab. Vectored instructions (for the message scheduler) are indented - ; by two tabs. */ - - vmovdqa xmm4, [W_t(\t-2)] /* XMM4 = W[t-2] */ - vmovdqu xmm5, [W_t(\t-15)] /* XMM5 = W[t-15] */ - mov T1, f_64 - vpsrlq xmm0, xmm4, 61 /* XMM0 = W[t-2]>>61 */ - mov tmp0, e_64 - vpsrlq xmm6, xmm5, 1 /* XMM6 = W[t-15]>>1 */ - xor T1, g_64 - RORQ tmp0, 23 /* 41 */ - vpsrlq xmm1, xmm4, 19 /* XMM1 = W[t-2]>>19 */ - and T1, e_64 - xor tmp0, e_64 - vpxor xmm0, xmm0, xmm1 /* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 */ - xor T1, g_64 - add T1, [WK_2(\t)]; - vpsrlq xmm7, xmm5, 8 /* XMM7 = W[t-15]>>8 */ - RORQ tmp0, 4 /* 18 */ - vpsrlq xmm2, xmm4, 6 /* XMM2 = W[t-2]>>6 */ - xor tmp0, e_64 - mov T2, a_64 - add T1, h_64 - vpxor xmm6, xmm6, xmm7 /* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 */ - RORQ tmp0, 14 /* 14 */ - add T1, tmp0 - vpsrlq xmm8, xmm5, 7 /* XMM8 = W[t-15]>>7 */ - mov tmp0, a_64 - xor T2, c_64 - vpsllq xmm3, xmm4, (64-61) /* XMM3 = W[t-2]<<3 */ - and tmp0, c_64 - and T2, b_64 - vpxor xmm2, xmm2, xmm3 /* XMM2 = W[t-2]>>6 ^ W[t-2]<<3 */ - xor T2, tmp0 - mov tmp0, a_64 - vpsllq xmm9, xmm5, (64-1) /* XMM9 = W[t-15]<<63 */ - RORQ tmp0, 5 /* 39 */ - vpxor xmm8, xmm8, xmm9 /* XMM8 = W[t-15]>>7 ^ W[t-15]<<63 */ - xor tmp0, a_64 - add d_64, T1 - RORQ tmp0, 6 /* 34 */ - xor tmp0, a_64 - vpxor xmm6, xmm6, xmm8 /* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ W[t-15]>>7 ^ W[t-15]<<63 */ - lea h_64, [T1 + T2] - RORQ tmp0, 28 /* 28 */ - vpsllq xmm4, xmm4, (64-19) /* XMM4 = W[t-2]<<25 */ - add h_64, tmp0 - RotateState - vpxor xmm0, xmm0, xmm4 /* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ W[t-2]<<25 */ - mov T1, f_64 - vpxor xmm0, xmm0, xmm2 /* XMM0 = s1(W[t-2]) */ - mov tmp0, e_64 - xor T1, g_64 - vpaddq xmm0, xmm0, [W_t(\t-16)] /* XMM0 = s1(W[t-2]) + W[t-16] */ - vmovdqu xmm1, [W_t(\t- 7)] /* XMM1 = W[t-7] */ - RORQ tmp0, 23 /* 41 */ - and T1, e_64 - xor tmp0, e_64 - xor T1, g_64 - vpsllq xmm5, xmm5, (64-8) /* XMM5 = W[t-15]<<56 */ - add T1, [WK_2(\t+1)] - vpxor xmm6, xmm6, xmm5 /* XMM6 = s0(W[t-15]) */ - RORQ tmp0, 4 /* 18 */ - vpaddq xmm0, xmm0, xmm6 /* XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15]) */ - xor tmp0, e_64 - vpaddq xmm0, xmm0, xmm1 /* XMM0 = W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */ - mov T2, a_64 - add T1, h_64 - RORQ tmp0, 14 /* 14 */ - add T1, tmp0 - vmovdqa [W_t(\t)], xmm0 /* Store W[t] */ - vpaddq xmm0, xmm0, [K_t(t)] /* 
Compute W[t]+K[t] */ - vmovdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] for next rounds */ - mov tmp0, a_64 - xor T2, c_64 - and tmp0, c_64 - and T2, b_64 - xor T2, tmp0 - mov tmp0, a_64 - RORQ tmp0, 5 /* 39 */ - xor tmp0, a_64 - add d_64, T1 - RORQ tmp0, 6 /* 34 */ - xor tmp0, a_64 - lea h_64, [T1 + T2] - RORQ tmp0, 28 /* 28 */ - add h_64, tmp0 - RotateState -.endm +#define RORQ(p1, p2) \ + /* shld is faster than ror on Intel Sandybridge */ \ + shld p1, p1, (64 - p2) + +#define SHA512_Round(t, a, b, c, d, e, f, g, h) \ + /* Compute Round %%t */; \ + mov T1, f /* T1 = f */; \ + mov tmp0, e /* tmp = e */; \ + xor T1, g /* T1 = f ^ g */; \ + RORQ( tmp0, 23) /* 41 ; tmp = e ror 23 */; \ + and T1, e /* T1 = (f ^ g) & e */; \ + xor tmp0, e /* tmp = (e ror 23) ^ e */; \ + xor T1, g /* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */; \ + add T1, [WK_2(t)] /* W[t] + K[t] from message scheduler */; \ + RORQ( tmp0, 4) /* 18 ; tmp = ((e ror 23) ^ e) ror 4 */; \ + xor tmp0, e /* tmp = (((e ror 23) ^ e) ror 4) ^ e */; \ + mov T2, a /* T2 = a */; \ + add T1, h /* T1 = CH(e,f,g) + W[t] + K[t] + h */; \ + RORQ( tmp0, 14) /* 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */; \ + add T1, tmp0 /* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */; \ + mov tmp0, a /* tmp = a */; \ + xor T2, c /* T2 = a ^ c */; \ + and tmp0, c /* tmp = a & c */; \ + and T2, b /* T2 = (a ^ c) & b */; \ + xor T2, tmp0 /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */; \ + mov tmp0, a /* tmp = a */; \ + RORQ( tmp0, 5) /* 39 ; tmp = a ror 5 */; \ + xor tmp0, a /* tmp = (a ror 5) ^ a */; \ + add d, T1 /* e(next_state) = d + T1 */; \ + RORQ( tmp0, 6) /* 34 ; tmp = ((a ror 5) ^ a) ror 6 */; \ + xor tmp0, a /* tmp = (((a ror 5) ^ a) ror 6) ^ a */; \ + lea h, [T1 + T2] /* a(next_state) = T1 + Maj(a,b,c) */; \ + RORQ( tmp0, 28) /* 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */; \ + add h, tmp0 /* a(next_state) = T1 + Maj(a,b,c) S0(a) */ + +#define SHA512_2Sched_2Round_avx_PART1(t, a, b, c, d, e, f, g, h) \ + /* \ + ; Compute rounds %%t-2 and %%t-1 \ + ; Compute message schedule QWORDS %%t and %%t+1 \ + ; \ + ; Two rounds are computed based on the values for K[t-2]+W[t-2] and \ + ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message \ + ; scheduler. \ + ; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. \ + ; They are then added to their respective SHA512 constants at \ + ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] \ + ; For brievity, the comments following vectored instructions only refer to \ + ; the first of a pair of QWORDS. \ + ; Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]} \ + ; The computation of the message schedule and the rounds are tightly \ + ; stitched to take advantage of instruction-level parallelism. \ + ; For clarity, integer instructions (for the rounds calculation) are indented \ + ; by one tab. Vectored instructions (for the message scheduler) are indented \ + ; by two tabs. 
\ + */ \ + \ + vmovdqa xmm4, [W_t(t-2)] /* XMM4 = W[t-2] */; \ + vmovdqu xmm5, [W_t(t-15)] /* XMM5 = W[t-15] */; \ + mov T1, f; \ + vpsrlq xmm0, xmm4, 61 /* XMM0 = W[t-2]>>61 */; \ + mov tmp0, e; \ + vpsrlq xmm6, xmm5, 1 /* XMM6 = W[t-15]>>1 */; \ + xor T1, g; \ + RORQ( tmp0, 23) /* 41 */; \ + vpsrlq xmm1, xmm4, 19 /* XMM1 = W[t-2]>>19 */; \ + and T1, e; \ + xor tmp0, e; \ + vpxor xmm0, xmm0, xmm1 /* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 */; \ + xor T1, g; \ + add T1, [WK_2(t)]; \ + vpsrlq xmm7, xmm5, 8 /* XMM7 = W[t-15]>>8 */; \ + RORQ( tmp0, 4) /* 18 */; \ + vpsrlq xmm2, xmm4, 6 /* XMM2 = W[t-2]>>6 */; \ + xor tmp0, e; \ + mov T2, a; \ + add T1, h; \ + vpxor xmm6, xmm6, xmm7 /* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 */; \ + RORQ( tmp0, 14) /* 14 */; \ + add T1, tmp0; \ + vpsrlq xmm8, xmm5, 7 /* XMM8 = W[t-15]>>7 */; \ + mov tmp0, a; \ + xor T2, c; \ + vpsllq xmm3, xmm4, (64-61) /* XMM3 = W[t-2]<<3 */; \ + and tmp0, c; \ + and T2, b; \ + vpxor xmm2, xmm2, xmm3 /* XMM2 = W[t-2]>>6 ^ W[t-2]<<3 */; \ + xor T2, tmp0; \ + mov tmp0, a; \ + vpsllq xmm9, xmm5, (64-1) /* XMM9 = W[t-15]<<63 */; \ + RORQ( tmp0, 5) /* 39 */; \ + vpxor xmm8, xmm8, xmm9 /* XMM8 = W[t-15]>>7 ^ W[t-15]<<63 */; \ + xor tmp0, a; \ + add d, T1; \ + RORQ( tmp0, 6) /* 34 */; \ + xor tmp0, a; \ + vpxor xmm6, xmm6, xmm8 /* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ W[t-15]>>7 ^ W[t-15]<<63 */; \ + lea h, [T1 + T2]; \ + RORQ( tmp0, 28) /* 28 */; \ + vpsllq xmm4, xmm4, (64-19) /* XMM4 = W[t-2]<<25 */; \ + add h, tmp0 + +#define SHA512_2Sched_2Round_avx_PART2(t, a, b, c, d, e, f, g, h) \ + vpxor xmm0, xmm0, xmm4 /* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ W[t-2]<<25 */; \ + mov T1, f; \ + vpxor xmm0, xmm0, xmm2 /* XMM0 = s1(W[t-2]) */; \ + mov tmp0, e; \ + xor T1, g; \ + vpaddq xmm0, xmm0, [W_t(t-16)] /* XMM0 = s1(W[t-2]) + W[t-16] */; \ + vmovdqu xmm1, [W_t(t- 7)] /* XMM1 = W[t-7] */; \ + RORQ( tmp0, 23) /* 41 */; \ + and T1, e; \ + xor tmp0, e; \ + xor T1, g; \ + vpsllq xmm5, xmm5, (64-8) /* XMM5 = W[t-15]<<56 */; \ + add T1, [WK_2(t+1)]; \ + vpxor xmm6, xmm6, xmm5 /* XMM6 = s0(W[t-15]) */; \ + RORQ( tmp0, 4) /* 18 */; \ + vpaddq xmm0, xmm0, xmm6 /* XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15]) */; \ + xor tmp0, e; \ + vpaddq xmm0, xmm0, xmm1 /* XMM0 = W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */; \ + mov T2, a; \ + add T1, h; \ + RORQ( tmp0, 14) /* 14 */; \ + add T1, tmp0; \ + vmovdqa [W_t(t)], xmm0 /* Store W[t] */; \ + vpaddq xmm0, xmm0, [K_t(t)] /* Compute W[t]+K[t] */; \ + vmovdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] for next rounds */; \ + mov tmp0, a; \ + xor T2, c; \ + and tmp0, c; \ + and T2, b; \ + xor T2, tmp0; \ + mov tmp0, a; \ + RORQ( tmp0, 5) /* 39 */; \ + xor tmp0, a; \ + add d, T1; \ + RORQ( tmp0, 6) /* 34 */; \ + xor tmp0, a; \ + lea h, [T1 + T2]; \ + RORQ( tmp0, 28) /* 28 */; \ + add h, tmp0 + +#define SHA512_2Sched_2Round_avx(t, a, b, c, d, e, f, g, h) \ + SHA512_2Sched_2Round_avx_PART1(t, a, b, c, d, e, f, g, h); \ + SHA512_2Sched_2Round_avx_PART2(t, h, a, b, c, d, e, f, g) /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; void sha512_avx(const void* M, void* D, uint64_t L); ; Purpose: Updates the SHA512 digest stored at D with the message stored in M. ; The size of the message pointed to by M must be an integer multiple of SHA512 ; message blocks. 
; L is the message length in SHA512 blocks */ .globl _gcry_sha512_transform_amd64_avx ELF(.type _gcry_sha512_transform_amd64_avx,@function;) .align 16 _gcry_sha512_transform_amd64_avx: CFI_STARTPROC() xor eax, eax cmp msglen, 0 je .Lnowork vzeroupper /* Allocate Stack Space */ sub rsp, frame_size CFI_ADJUST_CFA_OFFSET(frame_size); /* Save GPRs */ mov [rsp + frame_GPRSAVE + 8 * 0], rbx mov [rsp + frame_GPRSAVE + 8 * 1], r12 mov [rsp + frame_GPRSAVE + 8 * 2], r13 mov [rsp + frame_GPRSAVE + 8 * 3], r14 mov [rsp + frame_GPRSAVE + 8 * 4], r15 CFI_REL_OFFSET(rbx, frame_GPRSAVE + 8 * 0); CFI_REL_OFFSET(r12, frame_GPRSAVE + 8 * 1); CFI_REL_OFFSET(r13, frame_GPRSAVE + 8 * 2); CFI_REL_OFFSET(r14, frame_GPRSAVE + 8 * 3); CFI_REL_OFFSET(r15, frame_GPRSAVE + 8 * 4); .Lupdateblock: /* Load state variables */ mov a_64, [DIGEST(0)] mov b_64, [DIGEST(1)] mov c_64, [DIGEST(2)] mov d_64, [DIGEST(3)] mov e_64, [DIGEST(4)] mov f_64, [DIGEST(5)] mov g_64, [DIGEST(6)] mov h_64, [DIGEST(7)] - t = 0 - .rept 80/2 + 1 - /* (80 rounds) / (2 rounds/iteration) + (1 iteration) */ - /* +1 iteration because the scheduler leads hashing by 1 iteration */ - .if t < 2 - /* BSWAP 2 QWORDS */ - vmovdqa xmm1, [.LXMM_QWORD_BSWAP ADD_RIP] - vmovdqu xmm0, [MSG(t)] - vpshufb xmm0, xmm0, xmm1 /* BSWAP */ - vmovdqa [W_t(t)], xmm0 /* Store Scheduled Pair */ - vpaddq xmm0, xmm0, [K_t(t)] /* Compute W[t]+K[t] */ - vmovdqa [WK_2(t)], xmm0 /* Store into WK for rounds */ - .elseif t < 16 - /* BSWAP 2 QWORDS, Compute 2 Rounds */ - vmovdqu xmm0, [MSG(t)] - vpshufb xmm0, xmm0, xmm1 /* BSWAP */ - SHA512_Round (t - 2) /* Round t-2 */ - vmovdqa [W_t(t)], xmm0 /* Store Scheduled Pair */ - vpaddq xmm0, xmm0, [K_t(t)] /* Compute W[t]+K[t] */ - SHA512_Round (t - 1) /* Round t-1 */ - vmovdqa [WK_2(t)], xmm0 /* W[t]+K[t] into WK */ - .elseif t < 79 - /* Schedule 2 QWORDS; Compute 2 Rounds */ - SHA512_2Sched_2Round_avx t - .else - /* Compute 2 Rounds */ - SHA512_Round (t - 2) - SHA512_Round (t - 1) - .endif - t = ((t)+2) - .endr + /* BSWAP 2 QWORDS */ + vmovdqa xmm1, [.LXMM_QWORD_BSWAP ADD_RIP] + vmovdqu xmm0, [MSG(0)] + vpshufb xmm0, xmm0, xmm1 /* BSWAP */ + vmovdqa [W_t(0)], xmm0 /* Store Scheduled Pair */ + vpaddq xmm0, xmm0, [K_t(0)] /* Compute W[t]+K[t] */ + vmovdqa [WK_2(0)], xmm0 /* Store into WK for rounds */ + + #define T_2_14(t, a, b, c, d, e, f, g, h) \ + /* BSWAP 2 QWORDS, Compute 2 Rounds */; \ + vmovdqu xmm0, [MSG(t)]; \ + vpshufb xmm0, xmm0, xmm1 /* BSWAP */; \ + SHA512_Round(((t) - 2), a##_64, b##_64, c##_64, d##_64, \ + e##_64, f##_64, g##_64, h##_64); \ + vmovdqa [W_t(t)], xmm0 /* Store Scheduled Pair */; \ + vpaddq xmm0, xmm0, [K_t(t)] /* Compute W[t]+K[t] */; \ + SHA512_Round(((t) - 1), h##_64, a##_64, b##_64, c##_64, \ + d##_64, e##_64, f##_64, g##_64); \ + vmovdqa [WK_2(t)], xmm0 /* W[t]+K[t] into WK */ + + #define T_16_78(t, a, b, c, d, e, f, g, h) \ + SHA512_2Sched_2Round_avx((t), a##_64, b##_64, c##_64, d##_64, \ + e##_64, f##_64, g##_64, h##_64) + + #define T_80(t, a, b, c, d, e, f, g, h) \ + /* Compute 2 Rounds */; \ + SHA512_Round((t - 2), a##_64, b##_64, c##_64, d##_64, \ + e##_64, f##_64, g##_64, h##_64); \ + SHA512_Round((t - 1), h##_64, a##_64, b##_64, c##_64, \ + d##_64, e##_64, f##_64, g##_64) + + T_2_14(2, a, b, c, d, e, f, g, h) + T_2_14(4, g, h, a, b, c, d, e, f) + T_2_14(6, e, f, g, h, a, b, c, d) + T_2_14(8, c, d, e, f, g, h, a, b) + T_2_14(10, a, b, c, d, e, f, g, h) + T_2_14(12, g, h, a, b, c, d, e, f) + T_2_14(14, e, f, g, h, a, b, c, d) + T_16_78(16, c, d, e, f, g, h, a, b) + T_16_78(18, a, b, c, d, e, f, g, h) 
+ T_16_78(20, g, h, a, b, c, d, e, f) + T_16_78(22, e, f, g, h, a, b, c, d) + T_16_78(24, c, d, e, f, g, h, a, b) + T_16_78(26, a, b, c, d, e, f, g, h) + T_16_78(28, g, h, a, b, c, d, e, f) + T_16_78(30, e, f, g, h, a, b, c, d) + T_16_78(32, c, d, e, f, g, h, a, b) + T_16_78(34, a, b, c, d, e, f, g, h) + T_16_78(36, g, h, a, b, c, d, e, f) + T_16_78(38, e, f, g, h, a, b, c, d) + T_16_78(40, c, d, e, f, g, h, a, b) + T_16_78(42, a, b, c, d, e, f, g, h) + T_16_78(44, g, h, a, b, c, d, e, f) + T_16_78(46, e, f, g, h, a, b, c, d) + T_16_78(48, c, d, e, f, g, h, a, b) + T_16_78(50, a, b, c, d, e, f, g, h) + T_16_78(52, g, h, a, b, c, d, e, f) + T_16_78(54, e, f, g, h, a, b, c, d) + T_16_78(56, c, d, e, f, g, h, a, b) + T_16_78(58, a, b, c, d, e, f, g, h) + T_16_78(60, g, h, a, b, c, d, e, f) + T_16_78(62, e, f, g, h, a, b, c, d) + T_16_78(64, c, d, e, f, g, h, a, b) + T_16_78(66, a, b, c, d, e, f, g, h) + T_16_78(68, g, h, a, b, c, d, e, f) + T_16_78(70, e, f, g, h, a, b, c, d) + T_16_78(72, c, d, e, f, g, h, a, b) + T_16_78(74, a, b, c, d, e, f, g, h) + T_16_78(76, g, h, a, b, c, d, e, f) + T_16_78(78, e, f, g, h, a, b, c, d) + T_80(80, c, d, e, f, g, h, a, b) /* Update digest */ add [DIGEST(0)], a_64 add [DIGEST(1)], b_64 add [DIGEST(2)], c_64 add [DIGEST(3)], d_64 add [DIGEST(4)], e_64 add [DIGEST(5)], f_64 add [DIGEST(6)], g_64 add [DIGEST(7)], h_64 /* Advance to next message block */ add msg, 16*8 dec msglen jnz .Lupdateblock /* Restore GPRs */ mov rbx, [rsp + frame_GPRSAVE + 8 * 0] mov r12, [rsp + frame_GPRSAVE + 8 * 1] mov r13, [rsp + frame_GPRSAVE + 8 * 2] mov r14, [rsp + frame_GPRSAVE + 8 * 3] mov r15, [rsp + frame_GPRSAVE + 8 * 4] CFI_RESTORE(rbx) CFI_RESTORE(r12) CFI_RESTORE(r13) CFI_RESTORE(r14) CFI_RESTORE(r15) vzeroall /* Burn stack */ - t = 0 - .rept frame_W_size / 32 - vmovups [rsp + frame_W + (t) * 32], ymm0 - t = ((t)+1) - .endr + mov eax, 0 +.Lerase_stack: + vmovdqu [rsp + rax], ymm0 + add eax, 32 + cmp eax, frame_W_size + jne .Lerase_stack vmovdqu [rsp + frame_WK], xmm0 xor eax, eax /* Restore Stack Pointer */ add rsp, frame_size CFI_ADJUST_CFA_OFFSET(-frame_size); .Lnowork: ret CFI_ENDPROC() /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Binary Data */ .align 16 /* Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. 
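The unrolled T_2_14(2..14) / T_16_78(16..78) / T_80(80) sequence above replaces the former ".rept 80/2 + 1" loop; the message scheduler still runs one pair of rounds ahead of the round computation. A small C sketch of that control shape, using hypothetical stand-in functions for the three macro bodies:

#include <stdio.h>

/* Stand-ins for the macro bodies, named here purely for illustration. */
static void bswap_pair(int t)  { printf("load+bswap W[%d..%d], store W+K\n", t, t + 1); }
static void sched_pair(int t)  { printf("schedule W[%d..%d], store W+K\n", t, t + 1); }
static void rounds_pair(int t) { printf("rounds %d and %d\n", t, t + 1); }

int main(void)
{
	/* (80 rounds) / (2 per step) + 1 extra step, because W[t] and W[t+1]
	 * are scheduled one iteration before rounds t and t+1 consume them. */
	for (int t = 0; t <= 80; t += 2) {
		if (t < 2) {
			bswap_pair(t);                       /* first pair: no rounds yet */
		} else if (t < 16) {
			bswap_pair(t);  rounds_pair(t - 2);  /* T_2_14 */
		} else if (t < 80) {
			sched_pair(t);  rounds_pair(t - 2);  /* T_16_78 */
		} else {
			rounds_pair(t - 2);                  /* T_80: drain the last pair */
		}
	}
	return 0;
}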
*/ .LXMM_QWORD_BSWAP: .octa 0x08090a0b0c0d0e0f0001020304050607 /* K[t] used in SHA512 hashing */ .LK512: .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 #endif #endif diff --git a/cipher/sha512-avx2-bmi2-amd64.S b/cipher/sha512-avx2-bmi2-amd64.S index 3b28ab6c..7f119e6c 100644 --- a/cipher/sha512-avx2-bmi2-amd64.S +++ b/cipher/sha512-avx2-bmi2-amd64.S @@ -1,568 +1,502 @@ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright (c) 2012, Intel Corporation ; ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are ; met: ; ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in the ; documentation and/or other materials provided with the ; distribution. ; ; * Neither the name of the Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived from ; this software without specific prior written permission. ; ; ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR ; PURPOSE ARE DISCLAIMED. 
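For reference, SHA512_Round and the two SHA512_2Sched_2Round_avx parts above implement the standard SHA-512 round and schedule. A minimal C sketch of the same math, assuming a rotr64 helper and with wk standing for the precomputed W[t]+K[t] qword read from the WK_2 slot; the staged RORQ amounts 23, 4, 14 compose to the rotations by 41, 18 and 14 of Sigma1, and 5, 6, 28 to the 39, 34 and 28 of Sigma0:

#include <stdint.h>

static inline uint64_t rotr64(uint64_t x, unsigned r)
{
	return (x >> r) | (x << (64 - r));
}

/* Schedule terms; the vector code above builds these from the same
 * rotation/shift amounts (1, 8, >>7 and 19, 61, >>6). */
static inline uint64_t sigma0_512(uint64_t w) { return rotr64(w, 1) ^ rotr64(w, 8) ^ (w >> 7); }
static inline uint64_t sigma1_512(uint64_t w) { return rotr64(w, 19) ^ rotr64(w, 61) ^ (w >> 6); }

/* One compression round, s[0..7] = a..h, wk = W[t] + K[t]. */
static void sha512_round(uint64_t s[8], uint64_t wk)
{
	uint64_t a = s[0], b = s[1], c = s[2], d = s[3];
	uint64_t e = s[4], f = s[5], g = s[6], h = s[7];

	uint64_t S1  = rotr64(e, 14) ^ rotr64(e, 18) ^ rotr64(e, 41);
	uint64_t ch  = (e & f) ^ (~e & g);            /* ((f^g)&e)^g in the macros */
	uint64_t S0  = rotr64(a, 28) ^ rotr64(a, 34) ^ rotr64(a, 39);
	uint64_t maj = (a & b) ^ (a & c) ^ (b & c);   /* ((a^c)&b)^(a&c) in the macros */

	uint64_t t1 = h + S1 + ch + wk;
	uint64_t t2 = S0 + maj;

	s[7] = g;  s[6] = f;  s[5] = e;  s[4] = d + t1;   /* add d_64, T1 */
	s[3] = c;  s[2] = b;  s[1] = a;  s[0] = t1 + t2;  /* lea h_64, [T1+T2]; add S0(a) */
}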
IN NO EVENT SHALL INTEL CORPORATION OR ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; This code schedules 1 blocks at a time, with 4 lanes per block ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /* * Conversion to GAS assembly and integration to libgcrypt * by Jussi Kivilinna */ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \ defined(USE_SHA512) #include "asm-common-amd64.h" .intel_syntax noprefix .text /* Virtual Registers */ -Y_0 = ymm4 -Y_1 = ymm5 -Y_2 = ymm6 -Y_3 = ymm7 - -YTMP0 = ymm0 -YTMP1 = ymm1 -YTMP2 = ymm2 -YTMP3 = ymm3 -YTMP4 = ymm8 -XFER = YTMP0 - -BYTE_FLIP_MASK = ymm9 -MASK_YMM_LO = ymm10 -MASK_YMM_LOx = xmm10 - -INP = rdi /* 1st arg */ -CTX = rsi /* 2nd arg */ -NUM_BLKS = rdx /* 3rd arg */ -c = rcx -d = r8 -e = rdx -y3 = rdi - -TBL = rbp - -a = rax -b = rbx - -f = r9 -g = r10 -h = r11 -old_h = rax - -T1 = r12 -y0 = r13 -y1 = r14 -y2 = r15 - -y4 = r12 +#define Y_0 ymm4 +#define Y_1 ymm5 +#define Y_2 ymm6 +#define Y_3 ymm7 + +#define YTMP0 ymm0 +#define YTMP1 ymm1 +#define YTMP2 ymm2 +#define YTMP3 ymm3 +#define YTMP4 ymm8 +#define XFER YTMP0 + +#define BYTE_FLIP_MASK ymm9 +#define MASK_YMM_LO ymm10 +#define MASK_YMM_LOx xmm10 + +#define INP rdi /* 1st arg */ +#define CTX rsi /* 2nd arg */ +#define NUM_BLKS rdx /* 3rd arg */ +#define c rcx +#define d r8 +#define e rdx +#define y3 rdi + +#define TBL rbp + +#define a rax +#define b rbx + +#define f r9 +#define g r10 +#define h r11 + +#define T1 r12 +#define y0 r13 +#define y1 r14 +#define y2 r15 + +#define y4 r12 /* Local variables (stack frame) */ #define frame_XFER 0 #define frame_XFER_size (4*4*8) #define frame_SRND (frame_XFER + frame_XFER_size) #define frame_SRND_size (1*8) #define frame_INP (frame_SRND + frame_SRND_size) #define frame_INP_size (1*8) #define frame_NBLKS (frame_INP + frame_INP_size) #define frame_NBLKS_size (1*8) #define frame_RSPSAVE (frame_NBLKS + frame_NBLKS_size) #define frame_RSPSAVE_size (1*8) #define frame_GPRSAVE (frame_RSPSAVE + frame_RSPSAVE_size) #define frame_GPRSAVE_size (6*8) #define frame_size (frame_GPRSAVE + frame_GPRSAVE_size) #define VMOVDQ vmovdqu /*; assume buffers not aligned */ /* addm [mem], reg */ /* Add reg to mem using reg-mem add and store */ -.macro addm p1 p2 - add \p2, \p1 - mov \p1, \p2 -.endm +#define addm(p1, p2) \ + add p2, p1; \ + mov p1, p2; /* COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask */ /* Load ymm with mem and byte swap each dword */ -.macro COPY_YMM_AND_BSWAP p1 p2 p3 - VMOVDQ \p1, \p2 - vpshufb \p1, \p1, \p3 -.endm -/* rotate_Ys */ -/* Rotate values of symbols Y0...Y3 */ -.macro rotate_Ys - __Y_ = Y_0 - Y_0 = Y_1 - Y_1 = Y_2 - Y_2 = Y_3 - Y_3 = __Y_ -.endm - -/* RotateState */ -.macro RotateState - /* Rotate symbles a..h right */ - old_h = h - __TMP_ = h - h = g - g = f - f = e - e = d - d = c - c = b - b = a - a = __TMP_ 
-.endm +#define COPY_YMM_AND_BSWAP(p1, p2, p3) \ + VMOVDQ p1, p2; \ + vpshufb p1, p1, p3 /* %macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL */ /* YDST = {YSRC1, YSRC2} >> RVAL*8 */ -.macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL - vperm2f128 \YDST, \YSRC1, \YSRC2, 0x3 /* YDST = {YS1_LO, YS2_HI} */ - vpalignr \YDST, \YDST, \YSRC2, \RVAL /* YDST = {YDS1, YS2} >> RVAL*8 */ -.endm - -.macro ONE_ROUND_PART1 XFER - /* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]); - * d += h; - * h += Sum0 (a) + Maj (a, b, c); - * - * Ch(x, y, z) => ((x & y) + (~x & z)) - * Maj(x, y, z) => ((x & y) + (z & (x ^ y))) - */ - - mov y3, e - add h, [\XFER] - and y3, f - rorx y0, e, 41 - rorx y1, e, 18 +#define MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL) \ + vperm2i128 YDST, YSRC1, YSRC2, 0x3 /* YDST = {YS1_LO, YS2_HI} */; \ + vpalignr YDST, YDST, YSRC2, RVAL /* YDST = {YDS1, YS2} >> RVAL*8 */ + +#define ONE_ROUND_PART1(XFERIN, a, b, c, d, e, f, g, h) \ + /* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]); \ + * d += h; \ + * h += Sum0 (a) + Maj (a, b, c); \ + * \ + * Ch(x, y, z) => ((x & y) + (~x & z)) \ + * Maj(x, y, z) => ((x & y) + (z & (x ^ y))) \ + */ \ + \ + mov y3, e; \ + add h, [XFERIN]; \ + and y3, f; \ + rorx y0, e, 41; \ + rorx y1, e, 18; \ + lea h, [h + y3]; \ + andn y3, e, g; \ + rorx T1, a, 34; \ + xor y0, y1; \ lea h, [h + y3] - andn y3, e, g - rorx T1, a, 34 - xor y0, y1 - lea h, [h + y3] -.endm -.macro ONE_ROUND_PART2 - rorx y2, a, 39 - rorx y1, e, 14 - mov y3, a - xor T1, y2 - xor y0, y1 - xor y3, b - lea h, [h + y0] - mov y0, a - rorx y2, a, 28 - add d, h - and y3, c - xor T1, y2 - lea h, [h + y3] - lea h, [h + T1] - and y0, b - lea h, [h + y0] -.endm - -.macro ONE_ROUND XFER - ONE_ROUND_PART1 \XFER - ONE_ROUND_PART2 -.endm - -.macro FOUR_ROUNDS_AND_SCHED X -/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - - /* Extract w[t-7] */ - MY_VPALIGNR YTMP0, Y_3, Y_2, 8 /* YTMP0 = W[-7] */ - /* Calculate w[t-16] + w[t-7] */ - vpaddq YTMP0, YTMP0, Y_0 /* YTMP0 = W[-7] + W[-16] */ - /* Extract w[t-15] */ - MY_VPALIGNR YTMP1, Y_1, Y_0, 8 /* YTMP1 = W[-15] */ - - /* Calculate sigma0 */ - - /* Calculate w[t-15] ror 1 */ - vpsrlq YTMP2, YTMP1, 1 - vpsllq YTMP3, YTMP1, (64-1) - vpor YTMP3, YTMP3, YTMP2 /* YTMP3 = W[-15] ror 1 */ - /* Calculate w[t-15] shr 7 */ - vpsrlq YTMP4, YTMP1, 7 /* YTMP4 = W[-15] >> 7 */ - - ONE_ROUND rsp+frame_XFER+0*8+\X*32 - RotateState - -/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - -/*;;;;;;;;;;;;;;;;;;;;;;;;; */ - - /* Calculate w[t-15] ror 8 */ - vpsrlq YTMP2, YTMP1, 8 - vpsllq YTMP1, YTMP1, (64-8) - vpor YTMP1, YTMP1, YTMP2 /* YTMP1 = W[-15] ror 8 */ - /* XOR the three components */ - vpxor YTMP3, YTMP3, YTMP4 /* YTMP3 = W[-15] ror 1 ^ W[-15] >> 7 */ - vpxor YTMP1, YTMP3, YTMP1 /* YTMP1 = s0 */ - - - /* Add three components, w[t-16], w[t-7] and sigma0 */ - vpaddq YTMP0, YTMP0, YTMP1 /* YTMP0 = W[-16] + W[-7] + s0 */ - /* Move to appropriate lanes for calculating w[16] and w[17] */ - vperm2f128 Y_0, YTMP0, YTMP0, 0x0 /* Y_0 = W[-16] + W[-7] + s0 {BABA} */ - /* Move to appropriate lanes for calculating w[18] and w[19] */ - vpand YTMP0, YTMP0, MASK_YMM_LO /* YTMP0 = W[-16] + W[-7] + s0 {DC00} */ - - /* Calculate w[16] and w[17] in both 128 bit lanes */ - - /* Calculate sigma1 for w[16] and w[17] on both 128 bit lanes */ - vperm2f128 YTMP2, Y_3, Y_3, 0x11 /* YTMP2 = W[-2] {BABA} */ - vpsrlq YTMP4, YTMP2, 6 /* YTMP4 = W[-2] >> 6 {BABA} */ - - ONE_ROUND rsp+frame_XFER+1*8+\X*32 - RotateState - 
-/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - -/*;;;;;;;;;;;;;;;;;;;;;;;;; */ +#define ONE_ROUND_PART2(a, b, c, d, e, f, g, h) \ + rorx y2, a, 39; \ + rorx y1, e, 14; \ + mov y3, a; \ + xor T1, y2; \ + xor y0, y1; \ + xor y3, b; \ + lea h, [h + y0]; \ + mov y0, a; \ + rorx y2, a, 28; \ + add d, h; \ + and y3, c; \ + xor T1, y2; \ + lea h, [h + y3]; \ + lea h, [h + T1]; \ + and y0, b; \ + lea h, [h + y0] - vpsrlq YTMP3, YTMP2, 19 /* YTMP3 = W[-2] >> 19 {BABA} */ - vpsllq YTMP1, YTMP2, (64-19) /* YTMP1 = W[-2] << 19 {BABA} */ - vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 19 {BABA} */ - vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} */ - vpsrlq YTMP3, YTMP2, 61 /* YTMP3 = W[-2] >> 61 {BABA} */ - vpsllq YTMP1, YTMP2, (64-61) /* YTMP1 = W[-2] << 61 {BABA} */ - vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 61 {BABA} */ - vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} */ - - /* Add sigma1 to the other compunents to get w[16] and w[17] */ - vpaddq Y_0, Y_0, YTMP4 /* Y_0 = {W[1], W[0], W[1], W[0]} */ - - /* Calculate sigma1 for w[18] and w[19] for upper 128 bit lane */ - vpsrlq YTMP4, Y_0, 6 /* YTMP4 = W[-2] >> 6 {DC--} */ - - ONE_ROUND rsp+frame_XFER+2*8+\X*32 - RotateState - -/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - -/*;;;;;;;;;;;;;;;;;;;;;;;;; */ - - vpsrlq YTMP3, Y_0, 19 /* YTMP3 = W[-2] >> 19 {DC--} */ - vpsllq YTMP1, Y_0, (64-19) /* YTMP1 = W[-2] << 19 {DC--} */ - vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 19 {DC--} */ - vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} */ - vpsrlq YTMP3, Y_0, 61 /* YTMP3 = W[-2] >> 61 {DC--} */ - vpsllq YTMP1, Y_0, (64-61) /* YTMP1 = W[-2] << 61 {DC--} */ - vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 61 {DC--} */ - vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {DC--} */ - - /* Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] to newly calculated sigma1 to get w[18] and w[19] */ - vpaddq YTMP2, YTMP0, YTMP4 /* YTMP2 = {W[3], W[2], --, --} */ - - /* Form w[19, w[18], w17], w[16] */ - vpblendd Y_0, Y_0, YTMP2, 0xF0 /* Y_0 = {W[3], W[2], W[1], W[0]} */ - - ONE_ROUND_PART1 rsp+frame_XFER+3*8+\X*32 - vpaddq XFER, Y_0, [TBL + (4+\X)*32] - vmovdqa [rsp + frame_XFER + \X*32], XFER - ONE_ROUND_PART2 - RotateState - rotate_Ys -.endm - -.macro DO_4ROUNDS X - -/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - - ONE_ROUND rsp+frame_XFER+0*8+\X*32 - RotateState - -/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - - ONE_ROUND rsp+frame_XFER+1*8+\X*32 - RotateState - -/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - - ONE_ROUND rsp+frame_XFER+2*8+\X*32 - RotateState - -/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - - ONE_ROUND rsp+frame_XFER+3*8+\X*32 - RotateState - -.endm +#define ONE_ROUND(XFERIN, a, b, c, d, e, f, g, h) \ + ONE_ROUND_PART1(XFERIN, a, b, c, d, e, f, g, h); \ + ONE_ROUND_PART2(a, b, c, d, e, f, g, h) + +#define FOUR_ROUNDS_AND_SCHED(X, Y_0, Y_1, Y_2, Y_3, a, b, c, d, e, f, g, h) \ + /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \ + /* Extract w[t-7] */; \ + MY_VPALIGNR( YTMP0, Y_3, Y_2, 8) /* YTMP0 = W[-7] */; \ + /* Calculate w[t-16] + w[t-7] */; \ + vpaddq YTMP0, YTMP0, Y_0 /* YTMP0 = 
W[-7] + W[-16] */; \ + /* Extract w[t-15] */; \ + MY_VPALIGNR( YTMP1, Y_1, Y_0, 8) /* YTMP1 = W[-15] */; \ + \ + /* Calculate sigma0 */; \ + \ + /* Calculate w[t-15] ror 1 */; \ + vpsrlq YTMP2, YTMP1, 1; \ + vpsllq YTMP3, YTMP1, (64-1); \ + vpor YTMP3, YTMP3, YTMP2 /* YTMP3 = W[-15] ror 1 */; \ + /* Calculate w[t-15] shr 7 */; \ + vpsrlq YTMP4, YTMP1, 7 /* YTMP4 = W[-15] >> 7 */; \ + \ + ONE_ROUND(rsp+frame_XFER+0*8+X*32, a, b, c, d, e, f, g, h); \ + \ + /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \ + /* Calculate w[t-15] ror 8 */; \ + vpsrlq YTMP2, YTMP1, 8; \ + vpsllq YTMP1, YTMP1, (64-8); \ + vpor YTMP1, YTMP1, YTMP2 /* YTMP1 = W[-15] ror 8 */; \ + /* XOR the three components */; \ + vpxor YTMP3, YTMP3, YTMP4 /* YTMP3 = W[-15] ror 1 ^ W[-15] >> 7 */; \ + vpxor YTMP1, YTMP3, YTMP1 /* YTMP1 = s0 */; \ + \ + /* Add three components, w[t-16], w[t-7] and sigma0 */; \ + vpaddq YTMP0, YTMP0, YTMP1 /* YTMP0 = W[-16] + W[-7] + s0 */; \ + /* Move to appropriate lanes for calculating w[16] and w[17] */; \ + vperm2i128 Y_0, YTMP0, YTMP0, 0x0 /* Y_0 = W[-16] + W[-7] + s0 {BABA} */; \ + /* Move to appropriate lanes for calculating w[18] and w[19] */; \ + vpand YTMP0, YTMP0, MASK_YMM_LO /* YTMP0 = W[-16] + W[-7] + s0 {DC00} */; \ + \ + /* Calculate w[16] and w[17] in both 128 bit lanes */; \ + \ + /* Calculate sigma1 for w[16] and w[17] on both 128 bit lanes */; \ + vperm2i128 YTMP2, Y_3, Y_3, 0x11 /* YTMP2 = W[-2] {BABA} */; \ + vpsrlq YTMP4, YTMP2, 6 /* YTMP4 = W[-2] >> 6 {BABA} */; \ + \ + ONE_ROUND(rsp+frame_XFER+1*8+X*32, h, a, b, c, d, e, f, g); \ + \ + /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \ + vpsrlq YTMP3, YTMP2, 19 /* YTMP3 = W[-2] >> 19 {BABA} */; \ + vpsllq YTMP1, YTMP2, (64-19) /* YTMP1 = W[-2] << 19 {BABA} */; \ + vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 19 {BABA} */; \ + vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} */; \ + vpsrlq YTMP3, YTMP2, 61 /* YTMP3 = W[-2] >> 61 {BABA} */; \ + vpsllq YTMP1, YTMP2, (64-61) /* YTMP1 = W[-2] << 61 {BABA} */; \ + vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 61 {BABA} */; \ + vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} */; \ + \ + /* Add sigma1 to the other compunents to get w[16] and w[17] */; \ + vpaddq Y_0, Y_0, YTMP4 /* Y_0 = {W[1], W[0], W[1], W[0]} */; \ + \ + /* Calculate sigma1 for w[18] and w[19] for upper 128 bit lane */; \ + vpsrlq YTMP4, Y_0, 6 /* YTMP4 = W[-2] >> 6 {DC--} */; \ + \ + ONE_ROUND(rsp+frame_XFER+2*8+X*32, g, h, a, b, c, d, e, f); \ + \ + /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \ + vpsrlq YTMP3, Y_0, 19 /* YTMP3 = W[-2] >> 19 {DC--} */; \ + vpsllq YTMP1, Y_0, (64-19) /* YTMP1 = W[-2] << 19 {DC--} */; \ + vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 19 {DC--} */; \ + vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} */; \ + vpsrlq YTMP3, Y_0, 61 /* YTMP3 = W[-2] >> 61 {DC--} */; \ + vpsllq YTMP1, Y_0, (64-61) /* YTMP1 = W[-2] << 61 {DC--} */; \ + vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 61 {DC--} */; \ + vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {DC--} */; \ + \ + /* Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] to newly calculated sigma1 to get w[18] and w[19] */; \ + vpaddq YTMP2, YTMP0, YTMP4 /* YTMP2 = {W[3], W[2], --, --} */; \ + \ + /* Form w[19, w[18], w17], w[16] */; \ + vpblendd Y_0, Y_0, YTMP2, 0xF0 /* 
Y_0 = {W[3], W[2], W[1], W[0]} */; \ + \ + ONE_ROUND_PART1(rsp+frame_XFER+3*8+X*32, f, g, h, a, b, c, d, e); \ + vpaddq XFER, Y_0, [TBL + (4+X)*32]; \ + vmovdqa [rsp + frame_XFER + X*32], XFER; \ + ONE_ROUND_PART2(f, g, h, a, b, c, d, e) + +#define DO_4ROUNDS(X, a, b, c, d, e, f, g, h) \ + ONE_ROUND(rsp+frame_XFER+0*8+X*32, a, b, c, d, e, f, g, h); \ + ONE_ROUND(rsp+frame_XFER+1*8+X*32, h, a, b, c, d, e, f, g); \ + ONE_ROUND(rsp+frame_XFER+2*8+X*32, g, h, a, b, c, d, e, f); \ + ONE_ROUND(rsp+frame_XFER+3*8+X*32, f, g, h, a, b, c, d, e) /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; void sha512_rorx(const void* M, void* D, uint64_t L); ; Purpose: Updates the SHA512 digest stored at D with the message stored in M. ; The size of the message pointed to by M must be an integer multiple of SHA512 ; message blocks. ; L is the message length in SHA512 blocks */ .globl _gcry_sha512_transform_amd64_avx2 ELF(.type _gcry_sha512_transform_amd64_avx2,@function;) .align 16 _gcry_sha512_transform_amd64_avx2: CFI_STARTPROC() xor eax, eax cmp rdx, 0 je .Lnowork vzeroupper /* Allocate Stack Space */ mov rax, rsp CFI_DEF_CFA_REGISTER(rax); sub rsp, frame_size and rsp, ~(0x40 - 1) mov [rsp + frame_RSPSAVE], rax CFI_CFA_ON_STACK(frame_RSPSAVE, 0) /* Save GPRs */ mov [rsp + frame_GPRSAVE + 8 * 0], rbp mov [rsp + frame_GPRSAVE + 8 * 1], rbx mov [rsp + frame_GPRSAVE + 8 * 2], r12 mov [rsp + frame_GPRSAVE + 8 * 3], r13 mov [rsp + frame_GPRSAVE + 8 * 4], r14 mov [rsp + frame_GPRSAVE + 8 * 5], r15 CFI_REG_ON_STACK(rbp, frame_GPRSAVE + 8 * 0) CFI_REG_ON_STACK(rbx, frame_GPRSAVE + 8 * 1) CFI_REG_ON_STACK(r12, frame_GPRSAVE + 8 * 2) CFI_REG_ON_STACK(r13, frame_GPRSAVE + 8 * 3) CFI_REG_ON_STACK(r14, frame_GPRSAVE + 8 * 4) CFI_REG_ON_STACK(r15, frame_GPRSAVE + 8 * 5) mov [rsp + frame_NBLKS], NUM_BLKS /*; load initial digest */ mov a,[8*0 + CTX] mov b,[8*1 + CTX] mov c,[8*2 + CTX] mov d,[8*3 + CTX] mov e,[8*4 + CTX] mov f,[8*5 + CTX] mov g,[8*6 + CTX] mov h,[8*7 + CTX] vmovdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP] vmovdqa MASK_YMM_LO, [.LMASK_YMM_LO ADD_RIP] lea TBL,[.LK512 ADD_RIP] /*; byte swap first 16 dwords */ - COPY_YMM_AND_BSWAP Y_0, [INP + 0*32], BYTE_FLIP_MASK - COPY_YMM_AND_BSWAP Y_1, [INP + 1*32], BYTE_FLIP_MASK - COPY_YMM_AND_BSWAP Y_2, [INP + 2*32], BYTE_FLIP_MASK - COPY_YMM_AND_BSWAP Y_3, [INP + 3*32], BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP(Y_0, [INP + 0*32], BYTE_FLIP_MASK) + COPY_YMM_AND_BSWAP(Y_1, [INP + 1*32], BYTE_FLIP_MASK) + COPY_YMM_AND_BSWAP(Y_2, [INP + 2*32], BYTE_FLIP_MASK) + COPY_YMM_AND_BSWAP(Y_3, [INP + 3*32], BYTE_FLIP_MASK) add INP, 128 mov [rsp + frame_INP], INP vpaddq XFER, Y_0, [TBL + 0*32] vmovdqa [rsp + frame_XFER + 0*32], XFER vpaddq XFER, Y_1, [TBL + 1*32] vmovdqa [rsp + frame_XFER + 1*32], XFER vpaddq XFER, Y_2, [TBL + 2*32] vmovdqa [rsp + frame_XFER + 2*32], XFER vpaddq XFER, Y_3, [TBL + 3*32] vmovdqa [rsp + frame_XFER + 3*32], XFER /*; schedule 64 input dwords, by doing 12 rounds of 4 each */ - movq [rsp + frame_SRND],4 + mov qword ptr [rsp + frame_SRND], 4 .align 16 .Loop0: - FOUR_ROUNDS_AND_SCHED 0 - FOUR_ROUNDS_AND_SCHED 1 - FOUR_ROUNDS_AND_SCHED 2 - FOUR_ROUNDS_AND_SCHED 3 + FOUR_ROUNDS_AND_SCHED(0, Y_0, Y_1, Y_2, Y_3, a, b, c, d, e, f, g, h) + FOUR_ROUNDS_AND_SCHED(1, Y_1, Y_2, Y_3, Y_0, e, f, g, h, a, b, c, d) + FOUR_ROUNDS_AND_SCHED(2, Y_2, Y_3, Y_0, Y_1, a, b, c, d, e, f, g, h) + FOUR_ROUNDS_AND_SCHED(3, Y_3, Y_0, Y_1, Y_2, e, f, g, h, a, b, c, d) add TBL, 4*32 - 
subq [rsp + frame_SRND], 1 + sub qword ptr [rsp + frame_SRND], 1 jne .Loop0 - subq [rsp + frame_NBLKS], 1 + sub qword ptr [rsp + frame_NBLKS], 1 je .Ldone_hash mov INP, [rsp + frame_INP] lea TBL,[.LK512 ADD_RIP] /* load next block and byte swap */ - COPY_YMM_AND_BSWAP Y_0, [INP + 0*32], BYTE_FLIP_MASK - COPY_YMM_AND_BSWAP Y_1, [INP + 1*32], BYTE_FLIP_MASK - COPY_YMM_AND_BSWAP Y_2, [INP + 2*32], BYTE_FLIP_MASK - COPY_YMM_AND_BSWAP Y_3, [INP + 3*32], BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP(Y_0, [INP + 0*32], BYTE_FLIP_MASK) + COPY_YMM_AND_BSWAP(Y_1, [INP + 1*32], BYTE_FLIP_MASK) + COPY_YMM_AND_BSWAP(Y_2, [INP + 2*32], BYTE_FLIP_MASK) + COPY_YMM_AND_BSWAP(Y_3, [INP + 3*32], BYTE_FLIP_MASK) add INP, 128 mov [rsp + frame_INP], INP - DO_4ROUNDS 0 + DO_4ROUNDS(0, a, b, c, d, e, f, g, h) vpaddq XFER, Y_0, [TBL + 0*32] vmovdqa [rsp + frame_XFER + 0*32], XFER - DO_4ROUNDS 1 + DO_4ROUNDS(1, e, f, g, h, a, b, c, d) vpaddq XFER, Y_1, [TBL + 1*32] vmovdqa [rsp + frame_XFER + 1*32], XFER - DO_4ROUNDS 2 + DO_4ROUNDS(2, a, b, c, d, e, f, g, h) vpaddq XFER, Y_2, [TBL + 2*32] vmovdqa [rsp + frame_XFER + 2*32], XFER - DO_4ROUNDS 3 + DO_4ROUNDS(3, e, f, g, h, a, b, c, d) vpaddq XFER, Y_3, [TBL + 3*32] vmovdqa [rsp + frame_XFER + 3*32], XFER - addm [8*0 + CTX],a - addm [8*1 + CTX],b - addm [8*2 + CTX],c - addm [8*3 + CTX],d - addm [8*4 + CTX],e - addm [8*5 + CTX],f - addm [8*6 + CTX],g - addm [8*7 + CTX],h + addm([8*0 + CTX],a) + addm([8*1 + CTX],b) + addm([8*2 + CTX],c) + addm([8*3 + CTX],d) + addm([8*4 + CTX],e) + addm([8*5 + CTX],f) + addm([8*6 + CTX],g) + addm([8*7 + CTX],h) /*; schedule 64 input dwords, by doing 12 rounds of 4 each */ - movq [rsp + frame_SRND],4 + mov qword ptr [rsp + frame_SRND],4 jmp .Loop0 .Ldone_hash: vzeroall - DO_4ROUNDS 0 + DO_4ROUNDS(0, a, b, c, d, e, f, g, h) vmovdqa [rsp + frame_XFER + 0*32], ymm0 /* burn stack */ - DO_4ROUNDS 1 + DO_4ROUNDS(1, e, f, g, h, a, b, c, d) vmovdqa [rsp + frame_XFER + 1*32], ymm0 /* burn stack */ - DO_4ROUNDS 2 + DO_4ROUNDS(2, a, b, c, d, e, f, g, h) vmovdqa [rsp + frame_XFER + 2*32], ymm0 /* burn stack */ - DO_4ROUNDS 3 + DO_4ROUNDS(3, e, f, g, h, a, b, c, d) vmovdqa [rsp + frame_XFER + 3*32], ymm0 /* burn stack */ - addm [8*0 + CTX],a + addm([8*0 + CTX],a) xor eax, eax /* burn stack */ - addm [8*1 + CTX],b - addm [8*2 + CTX],c - addm [8*3 + CTX],d - addm [8*4 + CTX],e - addm [8*5 + CTX],f - addm [8*6 + CTX],g - addm [8*7 + CTX],h + addm([8*1 + CTX],b) + addm([8*2 + CTX],c) + addm([8*3 + CTX],d) + addm([8*4 + CTX],e) + addm([8*5 + CTX],f) + addm([8*6 + CTX],g) + addm([8*7 + CTX],h) /* Restore GPRs */ mov rbp, [rsp + frame_GPRSAVE + 8 * 0] mov rbx, [rsp + frame_GPRSAVE + 8 * 1] mov r12, [rsp + frame_GPRSAVE + 8 * 2] mov r13, [rsp + frame_GPRSAVE + 8 * 3] mov r14, [rsp + frame_GPRSAVE + 8 * 4] mov r15, [rsp + frame_GPRSAVE + 8 * 5] CFI_RESTORE(rbp) CFI_RESTORE(rbx) CFI_RESTORE(r12) CFI_RESTORE(r13) CFI_RESTORE(r14) CFI_RESTORE(r15) /* Restore Stack Pointer */ mov rsp, [rsp + frame_RSPSAVE] CFI_DEF_CFA_REGISTER(rsp) .Lnowork: ret CFI_ENDPROC() /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /*;; Binary Data */ .align 64 /* K[t] used in SHA512 hashing */ .LK512: .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 
0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .align 32 /* Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. */ .LPSHUFFLE_BYTE_FLIP_MASK: .octa 0x08090a0b0c0d0e0f0001020304050607 .octa 0x18191a1b1c1d1e1f1011121314151617 .LMASK_YMM_LO: .octa 0x00000000000000000000000000000000 .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF #endif #endif diff --git a/cipher/sha512-ssse3-amd64.S b/cipher/sha512-ssse3-amd64.S index 39bfe362..6a1328a6 100644 --- a/cipher/sha512-ssse3-amd64.S +++ b/cipher/sha512-ssse3-amd64.S @@ -1,436 +1,467 @@ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright (c) 2012, Intel Corporation ; ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are ; met: ; ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in the ; documentation and/or other materials provided with the ; distribution. ; ; * Neither the name of the Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived from ; this software without specific prior written permission. ; ; ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR ; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
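/* Editor's note: both file conversions in this patch drop GAS-level state rotation
   (the RotateState-style macros removed below rename a..h after every round) in favour
   of preprocessor macros whose argument lists are simply permuted at each call site, as
   in the FOUR_ROUNDS_AND_SCHED and DO_4ROUNDS invocations above. A minimal C sketch of
   that idea follows; the round body is a placeholder, not the real SHA-512 round. */

#include <stdint.h>
#include <stdio.h>

/* One round written as a macro that receives the working set explicitly; the
 * caller "rotates" the state by shifting the argument list instead of renaming
 * registers between rounds. Placeholder body, for illustration only. */
#define ROUND(k, a, b, c, d, e, f, g, h)  \
    do { uint64_t t = (h) + (k);          \
         (d) += t;                        \
         (h) = t + ((a) ^ (b) ^ (c)); } while (0)

int main(void)
{
    uint64_t a = 1, b = 2, c = 3, d = 4, e = 5, f = 6, g = 7, h = 8;

    /* Four consecutive rounds: each call shifts the variable list by one
     * position, which is exactly what the deleted symbol-rotation macros
     * used to do by reassigning symbol names. Constants are K512[0..3]. */
    ROUND(0x428a2f98d728ae22ULL, a, b, c, d, e, f, g, h);
    ROUND(0x7137449123ef65cdULL, h, a, b, c, d, e, f, g);
    ROUND(0xb5c0fbcfec4d3b2fULL, g, h, a, b, c, d, e, f);
    ROUND(0xe9b5dba58189dbbcULL, f, g, h, a, b, c, d, e);

    printf("%llu\n", (unsigned long long)(a ^ h));
    return 0;
}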
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /* * Conversion to GAS assembly and integration to libgcrypt * by Jussi Kivilinna * * Note: original implementation was named as SHA512-SSE4. However, only SSSE3 * is required. */ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA512) #include "asm-common-amd64.h" .intel_syntax noprefix .text /* Virtual Registers */ -msg = rdi /* ARG1 */ -digest = rsi /* ARG2 */ -msglen = rdx /* ARG3 */ -T1 = rcx -T2 = r8 -a_64 = r9 -b_64 = r10 -c_64 = r11 -d_64 = r12 -e_64 = r13 -f_64 = r14 -g_64 = r15 -h_64 = rbx -tmp0 = rax +#define msg rdi /* ARG1 */ +#define digest rsi /* ARG2 */ +#define msglen rdx /* ARG3 */ +#define T1 rcx +#define T2 r8 +#define a_64 r9 +#define b_64 r10 +#define c_64 r11 +#define d_64 r12 +#define e_64 r13 +#define f_64 r14 +#define g_64 r15 +#define h_64 rbx +#define tmp0 rax /* ; Local variables (stack frame) ; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP */ -frame_W = 0 /* Message Schedule */ -frame_W_size = (80 * 8) -frame_WK = ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */ -frame_WK_size = (2 * 8) -frame_GPRSAVE = ((frame_WK) + (frame_WK_size)) -frame_GPRSAVE_size = (5 * 8) -frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size)) +#define frame_W 0 /* Message Schedule */ +#define frame_W_size (80 * 8) +#define frame_WK ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */ +#define frame_WK_size (2 * 8) +#define frame_GPRSAVE ((frame_WK) + (frame_WK_size)) +#define frame_GPRSAVE_size (5 * 8) +#define frame_size ((frame_GPRSAVE) + (frame_GPRSAVE_size)) /* Useful QWORD "arrays" for simpler memory references */ #define MSG(i) msg + 8*(i) /* Input message (arg1) */ #define DIGEST(i) digest + 8*(i) /* Output Digest (arg2) */ #define K_t(i) .LK512 + 8*(i) ADD_RIP /* SHA Constants (static mem) */ #define W_t(i) rsp + frame_W + 8*(i) /* Message Schedule (stack frame) */ #define WK_2(i) rsp + frame_WK + 8*((i) % 2) /* W[t]+K[t] (stack frame) */ /* MSG, DIGEST, K_t, W_t are arrays */ /* WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even */ -.macro RotateState - /* Rotate symbles a..h right */ - __TMP = h_64 - h_64 = g_64 - g_64 = f_64 - f_64 = e_64 - e_64 = d_64 - d_64 = c_64 - c_64 = b_64 - b_64 = a_64 - a_64 = __TMP -.endm - -.macro SHA512_Round t - /* Compute Round %%t */ - mov T1, f_64 /* T1 = f */ - mov tmp0, e_64 /* tmp = e */ - xor T1, g_64 /* T1 = f ^ g */ - ror tmp0, 23 /* 41 ; tmp = e ror 23 */ - and T1, e_64 /* T1 = (f ^ g) & e */ - xor tmp0, e_64 /* tmp = (e ror 23) ^ e */ - xor T1, g_64 /* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */ - add T1, [WK_2(\t)] /* W[t] + K[t] from message scheduler */ - ror tmp0, 4 /* 18 ; tmp = ((e ror 23) ^ e) ror 4 */ - xor tmp0, e_64 /* tmp = (((e ror 23) ^ e) ror 4) ^ e */ - mov T2, a_64 /* T2 = a */ - add T1, h_64 /* T1 = CH(e,f,g) + W[t] + K[t] + h */ - ror tmp0, 14 /* 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */ - add T1, tmp0 /* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */ - mov tmp0, a_64 /* tmp = a */ - xor T2, c_64 /* T2 = a ^ c */ - and tmp0, c_64 /* tmp = a & c */ - and T2, b_64 /* T2 = (a ^ c) & b */ - xor T2, tmp0 /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */ - mov tmp0, a_64 /* tmp = a */ - ror tmp0, 5 /* 39 ; tmp = a ror 5 */ - xor tmp0, a_64 /* tmp = (a ror 5) ^ a */ - add d_64, T1 /* e(next_state) = d + 
T1 */ - ror tmp0, 6 /* 34 ; tmp = ((a ror 5) ^ a) ror 6 */ - xor tmp0, a_64 /* tmp = (((a ror 5) ^ a) ror 6) ^ a */ - lea h_64, [T1 + T2] /* a(next_state) = T1 + Maj(a,b,c) */ - ror tmp0, 28 /* 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */ - add h_64, tmp0 /* a(next_state) = T1 + Maj(a,b,c) S0(a) */ - RotateState -.endm - -.macro SHA512_2Sched_2Round_sse t -/* ; Compute rounds %%t-2 and %%t-1 - ; Compute message schedule QWORDS %%t and %%t+1 - - ; Two rounds are computed based on the values for K[t-2]+W[t-2] and - ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message - ; scheduler. - ; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. - ; They are then added to their respective SHA512 constants at - ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] - ; For brievity, the comments following vectored instructions only refer to - ; the first of a pair of QWORDS. - ; Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]} - ; The computation of the message schedule and the rounds are tightly - ; stitched to take advantage of instruction-level parallelism. - ; For clarity, integer instructions (for the rounds calculation) are indented - ; by one tab. Vectored instructions (for the message scheduler) are indented - ; by two tabs. */ - - mov T1, f_64 - movdqa xmm2, [W_t(\t-2)] /* XMM2 = W[t-2] */ - xor T1, g_64 - and T1, e_64 - movdqa xmm0, xmm2 /* XMM0 = W[t-2] */ - xor T1, g_64 - add T1, [WK_2(\t)] - movdqu xmm5, [W_t(\t-15)] /* XMM5 = W[t-15] */ - mov tmp0, e_64 - ror tmp0, 23 /* 41 */ - movdqa xmm3, xmm5 /* XMM3 = W[t-15] */ - xor tmp0, e_64 - ror tmp0, 4 /* 18 */ - psrlq xmm0, 61 - 19 /* XMM0 = W[t-2] >> 42 */ - xor tmp0, e_64 - ror tmp0, 14 /* 14 */ - psrlq xmm3, (8 - 7) /* XMM3 = W[t-15] >> 1 */ - add T1, tmp0 - add T1, h_64 - pxor xmm0, xmm2 /* XMM0 = (W[t-2] >> 42) ^ W[t-2] */ - mov T2, a_64 - xor T2, c_64 - pxor xmm3, xmm5 /* XMM3 = (W[t-15] >> 1) ^ W[t-15] */ - and T2, b_64 - mov tmp0, a_64 - psrlq xmm0, 19 - 6 /* XMM0 = ((W[t-2]>>42)^W[t-2])>>13 */ - and tmp0, c_64 - xor T2, tmp0 - psrlq xmm3, (7 - 1) /* XMM3 = ((W[t-15]>>1)^W[t-15])>>6 */ - mov tmp0, a_64 - ror tmp0, 5 /* 39 */ - pxor xmm0, xmm2 /* XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2] */ - xor tmp0, a_64 - ror tmp0, 6 /* 34 */ - pxor xmm3, xmm5 /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15] */ - xor tmp0, a_64 - ror tmp0, 28 /* 28 */ - psrlq xmm0, 6 /* XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6 */ - add T2, tmp0 - add d_64, T1 - psrlq xmm3, 1 /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1 */ - lea h_64, [T1 + T2] - RotateState - movdqa xmm1, xmm2 /* XMM1 = W[t-2] */ - mov T1, f_64 - xor T1, g_64 - movdqa xmm4, xmm5 /* XMM4 = W[t-15] */ - and T1, e_64 - xor T1, g_64 - psllq xmm1, (64 - 19) - (64 - 61) /* XMM1 = W[t-2] << 42 */ - add T1, [WK_2(\t+1)] - mov tmp0, e_64 - psllq xmm4, (64 - 1) - (64 - 8) /* XMM4 = W[t-15] << 7 */ - ror tmp0, 23 /* 41 */ - xor tmp0, e_64 - pxor xmm1, xmm2 /* XMM1 = (W[t-2] << 42)^W[t-2] */ - ror tmp0, 4 /* 18 */ - xor tmp0, e_64 - pxor xmm4, xmm5 /* XMM4 = (W[t-15]<<7)^W[t-15] */ - ror tmp0, 14 /* 14 */ - add T1, tmp0 - psllq xmm1, (64 - 61) /* XMM1 = ((W[t-2] << 42)^W[t-2])<<3 */ - add T1, h_64 - mov T2, a_64 - psllq xmm4, (64 - 8) /* XMM4 = ((W[t-15]<<7)^W[t-15])<<56 */ - xor T2, c_64 - and T2, b_64 - pxor xmm0, xmm1 /* XMM0 = s1(W[t-2]) */ - mov tmp0, a_64 - and tmp0, c_64 - movdqu xmm1, [W_t(\t- 7)] /* XMM1 = W[t-7] */ - xor T2, tmp0 - pxor xmm3, xmm4 /* XMM3 = s0(W[t-15]) */ - mov tmp0, a_64 - paddq xmm0, xmm3 /* XMM0 = s1(W[t-2]) + s0(W[t-15]) */ 
- ror tmp0, 5 /* 39 */ - paddq xmm0, [W_t(\t-16)] /* XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16] */ - xor tmp0, a_64 - paddq xmm0, xmm1 /* XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */ - ror tmp0, 6 /* 34 */ - movdqa [W_t(\t)], xmm0 /* Store scheduled qwords */ - xor tmp0, a_64 - paddq xmm0, [K_t(t)] /* Compute W[t]+K[t] */ - ror tmp0, 28 /* 28 */ - movdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] for next rounds */ - add T2, tmp0 - add d_64, T1 - lea h_64, [T1 + T2] - RotateState -.endm +#define SHA512_Round(t, a, b, c, d, e, f, g, h) \ + /* Compute Round %%t */; \ + mov T1, f /* T1 = f */; \ + mov tmp0, e /* tmp = e */; \ + xor T1, g /* T1 = f ^ g */; \ + ror tmp0, 23 /* 41 ; tmp = e ror 23 */; \ + and T1, e /* T1 = (f ^ g) & e */; \ + xor tmp0, e /* tmp = (e ror 23) ^ e */; \ + xor T1, g /* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */; \ + add T1, [WK_2(t)] /* W[t] + K[t] from message scheduler */; \ + ror tmp0, 4 /* 18 ; tmp = ((e ror 23) ^ e) ror 4 */; \ + xor tmp0, e /* tmp = (((e ror 23) ^ e) ror 4) ^ e */; \ + mov T2, a /* T2 = a */; \ + add T1, h /* T1 = CH(e,f,g) + W[t] + K[t] + h */; \ + ror tmp0, 14 /* 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */; \ + add T1, tmp0 /* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */; \ + mov tmp0, a /* tmp = a */; \ + xor T2, c /* T2 = a ^ c */; \ + and tmp0, c /* tmp = a & c */; \ + and T2, b /* T2 = (a ^ c) & b */; \ + xor T2, tmp0 /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */; \ + mov tmp0, a /* tmp = a */; \ + ror tmp0, 5 /* 39 ; tmp = a ror 5 */; \ + xor tmp0, a /* tmp = (a ror 5) ^ a */; \ + add d, T1 /* e(next_state) = d + T1 */; \ + ror tmp0, 6 /* 34 ; tmp = ((a ror 5) ^ a) ror 6 */; \ + xor tmp0, a /* tmp = (((a ror 5) ^ a) ror 6) ^ a */; \ + lea h, [T1 + T2] /* a(next_state) = T1 + Maj(a,b,c) */; \ + ror tmp0, 28 /* 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */; \ + add h, tmp0 /* a(next_state) = T1 + Maj(a,b,c) S0(a) */ + +#define SHA512_2Sched_2Round_sse_PART1(t, a, b, c, d, e, f, g, h) \ + /* \ + ; Compute rounds %%t-2 and %%t-1 \ + ; Compute message schedule QWORDS %%t and %%t+1 \ + ; \ + ; Two rounds are computed based on the values for K[t-2]+W[t-2] and \ + ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message \ + ; scheduler. \ + ; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. \ + ; They are then added to their respective SHA512 constants at \ + ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] \ + ; For brievity, the comments following vectored instructions only refer to \ + ; the first of a pair of QWORDS. \ + ; Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]} \ + ; The computation of the message schedule and the rounds are tightly \ + ; stitched to take advantage of instruction-level parallelism. \ + ; For clarity, integer instructions (for the rounds calculation) are indented \ + ; by one tab. Vectored instructions (for the message scheduler) are indented \ + ; by two tabs. 
\ + */ \ + \ + mov T1, f; \ + movdqa xmm2, [W_t(t-2)] /* XMM2 = W[t-2] */; \ + xor T1, g; \ + and T1, e; \ + movdqa xmm0, xmm2 /* XMM0 = W[t-2] */; \ + xor T1, g; \ + add T1, [WK_2(t)]; \ + movdqu xmm5, [W_t(t-15)] /* XMM5 = W[t-15] */; \ + mov tmp0, e; \ + ror tmp0, 23 /* 41 */; \ + movdqa xmm3, xmm5 /* XMM3 = W[t-15] */; \ + xor tmp0, e; \ + ror tmp0, 4 /* 18 */; \ + psrlq xmm0, 61 - 19 /* XMM0 = W[t-2] >> 42 */; \ + xor tmp0, e; \ + ror tmp0, 14 /* 14 */; \ + psrlq xmm3, (8 - 7) /* XMM3 = W[t-15] >> 1 */; \ + add T1, tmp0; \ + add T1, h; \ + pxor xmm0, xmm2 /* XMM0 = (W[t-2] >> 42) ^ W[t-2] */; \ + mov T2, a; \ + xor T2, c; \ + pxor xmm3, xmm5 /* XMM3 = (W[t-15] >> 1) ^ W[t-15] */; \ + and T2, b; \ + mov tmp0, a; \ + psrlq xmm0, 19 - 6 /* XMM0 = ((W[t-2]>>42)^W[t-2])>>13 */; \ + and tmp0, c; \ + xor T2, tmp0; \ + psrlq xmm3, (7 - 1) /* XMM3 = ((W[t-15]>>1)^W[t-15])>>6 */; \ + mov tmp0, a; \ + ror tmp0, 5 /* 39 */; \ + pxor xmm0, xmm2 /* XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2] */; \ + xor tmp0, a; \ + ror tmp0, 6 /* 34 */; \ + pxor xmm3, xmm5 /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15] */; \ + xor tmp0, a; \ + ror tmp0, 28 /* 28 */; \ + psrlq xmm0, 6 /* XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6 */; \ + add T2, tmp0; \ + add d, T1; \ + psrlq xmm3, 1 /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1 */; \ + lea h, [T1 + T2] + +#define SHA512_2Sched_2Round_sse_PART2(t, a, b, c, d, e, f, g, h) \ + movdqa xmm1, xmm2 /* XMM1 = W[t-2] */; \ + mov T1, f; \ + xor T1, g; \ + movdqa xmm4, xmm5 /* XMM4 = W[t-15] */; \ + and T1, e; \ + xor T1, g; \ + psllq xmm1, (64 - 19) - (64 - 61) /* XMM1 = W[t-2] << 42 */; \ + add T1, [WK_2(t+1)]; \ + mov tmp0, e; \ + psllq xmm4, (64 - 1) - (64 - 8) /* XMM4 = W[t-15] << 7 */; \ + ror tmp0, 23 /* 41 */; \ + xor tmp0, e; \ + pxor xmm1, xmm2 /* XMM1 = (W[t-2] << 42)^W[t-2] */; \ + ror tmp0, 4 /* 18 */; \ + xor tmp0, e; \ + pxor xmm4, xmm5 /* XMM4 = (W[t-15]<<7)^W[t-15] */; \ + ror tmp0, 14 /* 14 */; \ + add T1, tmp0; \ + psllq xmm1, (64 - 61) /* XMM1 = ((W[t-2] << 42)^W[t-2])<<3 */; \ + add T1, h; \ + mov T2, a; \ + psllq xmm4, (64 - 8) /* XMM4 = ((W[t-15]<<7)^W[t-15])<<56 */; \ + xor T2, c; \ + and T2, b; \ + pxor xmm0, xmm1 /* XMM0 = s1(W[t-2]) */; \ + mov tmp0, a; \ + and tmp0, c; \ + movdqu xmm1, [W_t(t- 7)] /* XMM1 = W[t-7] */; \ + xor T2, tmp0; \ + pxor xmm3, xmm4 /* XMM3 = s0(W[t-15]) */; \ + mov tmp0, a; \ + paddq xmm0, xmm3 /* XMM0 = s1(W[t-2]) + s0(W[t-15]) */; \ + ror tmp0, 5 /* 39 */; \ + paddq xmm0, [W_t(t-16)] /* XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16] */; \ + xor tmp0, a; \ + paddq xmm0, xmm1 /* XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */; \ + ror tmp0, 6 /* 34 */; \ + movdqa [W_t(t)], xmm0 /* Store scheduled qwords */; \ + xor tmp0, a; \ + paddq xmm0, [K_t(t)] /* Compute W[t]+K[t] */; \ + ror tmp0, 28 /* 28 */; \ + movdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] for next rounds */; \ + add T2, tmp0; \ + add d, T1; \ + lea h, [T1 + T2] + +#define SHA512_2Sched_2Round_sse(t, a, b, c, d, e, f, g, h) \ + SHA512_2Sched_2Round_sse_PART1(t, a, b, c, d, e, f, g, h); \ + SHA512_2Sched_2Round_sse_PART2(t, h, a, b, c, d, e, f, g) /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; void sha512_sse4(const void* M, void* D, uint64_t L); ; Purpose: Updates the SHA512 digest stored at D with the message stored in M. ; The size of the message pointed to by M must be an integer multiple of SHA512 ; message blocks. ; L is the message length in SHA512 blocks. 
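/* Editor's note: the rewritten SHA512_Round builds S1(e) and S0(a) out of chained
   ror/xor steps (ror 23, 4, 14 and ror 5, 6, 28); the inline comments give the
   effective rotate counts 41/18/14 and 39/34/28. A small standalone C check of that
   identity, independent of the assembly and using arbitrary test values: */

#include <stdint.h>
#include <stdio.h>

static uint64_t rotr64(uint64_t x, unsigned n)
{
    return (x >> n) | (x << (64 - n));
}

int main(void)
{
    uint64_t e = 0x510e527fade682d1ULL;   /* arbitrary test values */
    uint64_t a = 0x6a09e667f3bcc908ULL;

    /* Chained form used by the macro: ror 23, xor e, ror 4, xor e, ror 14. */
    uint64_t t = rotr64(e, 23) ^ e;
    t = rotr64(t, 4) ^ e;
    t = rotr64(t, 14);
    /* Canonical Sigma1(e) = rotr14 ^ rotr18 ^ rotr41 (FIPS 180-4). */
    uint64_t s1 = rotr64(e, 14) ^ rotr64(e, 18) ^ rotr64(e, 41);

    /* Chained form for Sigma0(a): ror 5, xor a, ror 6, xor a, ror 28. */
    uint64_t u = rotr64(a, 5) ^ a;
    u = rotr64(u, 6) ^ a;
    u = rotr64(u, 28);
    uint64_t s0 = rotr64(a, 28) ^ rotr64(a, 34) ^ rotr64(a, 39);

    printf("Sigma1 ok: %d, Sigma0 ok: %d\n", t == s1, u == s0);
    return 0;
}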
*/ .globl _gcry_sha512_transform_amd64_ssse3 ELF(.type _gcry_sha512_transform_amd64_ssse3,@function;) .align 16 _gcry_sha512_transform_amd64_ssse3: CFI_STARTPROC() xor eax, eax cmp msglen, 0 je .Lnowork /* Allocate Stack Space */ sub rsp, frame_size CFI_ADJUST_CFA_OFFSET(frame_size); /* Save GPRs */ mov [rsp + frame_GPRSAVE + 8 * 0], rbx mov [rsp + frame_GPRSAVE + 8 * 1], r12 mov [rsp + frame_GPRSAVE + 8 * 2], r13 mov [rsp + frame_GPRSAVE + 8 * 3], r14 mov [rsp + frame_GPRSAVE + 8 * 4], r15 CFI_REL_OFFSET(rbx, frame_GPRSAVE + 8 * 0); CFI_REL_OFFSET(r12, frame_GPRSAVE + 8 * 1); CFI_REL_OFFSET(r13, frame_GPRSAVE + 8 * 2); CFI_REL_OFFSET(r14, frame_GPRSAVE + 8 * 3); CFI_REL_OFFSET(r15, frame_GPRSAVE + 8 * 4); .Lupdateblock: /* Load state variables */ mov a_64, [DIGEST(0)] mov b_64, [DIGEST(1)] mov c_64, [DIGEST(2)] mov d_64, [DIGEST(3)] mov e_64, [DIGEST(4)] mov f_64, [DIGEST(5)] mov g_64, [DIGEST(6)] mov h_64, [DIGEST(7)] - t = 0 - .rept 80/2 + 1 - /* (80 rounds) / (2 rounds/iteration) + (1 iteration) */ - /* +1 iteration because the scheduler leads hashing by 1 iteration */ - .if t < 2 - /* BSWAP 2 QWORDS */ - movdqa xmm1, [.LXMM_QWORD_BSWAP ADD_RIP] - movdqu xmm0, [MSG(t)] - pshufb xmm0, xmm1 /* BSWAP */ - movdqa [W_t(t)], xmm0 /* Store Scheduled Pair */ - paddq xmm0, [K_t(t)] /* Compute W[t]+K[t] */ - movdqa [WK_2(t)], xmm0 /* Store into WK for rounds */ - .elseif t < 16 - /* BSWAP 2 QWORDS; Compute 2 Rounds */ - movdqu xmm0, [MSG(t)] - pshufb xmm0, xmm1 /* BSWAP */ - SHA512_Round (t - 2) /* Round t-2 */ - movdqa [W_t(t)], xmm0 /* Store Scheduled Pair */ - paddq xmm0, [K_t(t)] /* Compute W[t]+K[t] */ - SHA512_Round (t - 1) /* Round t-1 */ - movdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] into WK */ - .elseif t < 79 - /* Schedule 2 QWORDS; Compute 2 Rounds */ - SHA512_2Sched_2Round_sse t - .else - /* Compute 2 Rounds */ - SHA512_Round (t - 2) - SHA512_Round (t - 1) - .endif - t = (t)+2 - .endr + /* BSWAP 2 QWORDS */ + movdqa xmm1, [.LXMM_QWORD_BSWAP ADD_RIP] + movdqu xmm0, [MSG(0)] + pshufb xmm0, xmm1 /* BSWAP */ + movdqa [W_t(0)], xmm0 /* Store Scheduled Pair */ + paddq xmm0, [K_t(0)] /* Compute W[t]+K[t] */ + movdqa [WK_2(0)], xmm0 /* Store into WK for rounds */ + + #define T_2_14(t, a, b, c, d, e, f, g, h) \ + /* BSWAP 2 QWORDS; Compute 2 Rounds */; \ + movdqu xmm0, [MSG(t)]; \ + pshufb xmm0, xmm1 /* BSWAP */; \ + SHA512_Round(((t) - 2), a##_64, b##_64, c##_64, d##_64, \ + e##_64, f##_64, g##_64, h##_64); \ + movdqa [W_t(t)], xmm0 /* Store Scheduled Pair */; \ + paddq xmm0, [K_t(t)] /* Compute W[t]+K[t] */; \ + SHA512_Round(((t) - 1), h##_64, a##_64, b##_64, c##_64, \ + d##_64, e##_64, f##_64, g##_64); \ + movdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] into WK */ + + #define T_16_78(t, a, b, c, d, e, f, g, h) \ + SHA512_2Sched_2Round_sse((t), a##_64, b##_64, c##_64, d##_64, \ + e##_64, f##_64, g##_64, h##_64) + + #define T_80(t, a, b, c, d, e, f, g, h) \ + /* Compute 2 Rounds */; \ + SHA512_Round((t - 2), a##_64, b##_64, c##_64, d##_64, \ + e##_64, f##_64, g##_64, h##_64); \ + SHA512_Round((t - 1), h##_64, a##_64, b##_64, c##_64, \ + d##_64, e##_64, f##_64, g##_64) + + T_2_14(2, a, b, c, d, e, f, g, h) + T_2_14(4, g, h, a, b, c, d, e, f) + T_2_14(6, e, f, g, h, a, b, c, d) + T_2_14(8, c, d, e, f, g, h, a, b) + T_2_14(10, a, b, c, d, e, f, g, h) + T_2_14(12, g, h, a, b, c, d, e, f) + T_2_14(14, e, f, g, h, a, b, c, d) + T_16_78(16, c, d, e, f, g, h, a, b) + T_16_78(18, a, b, c, d, e, f, g, h) + T_16_78(20, g, h, a, b, c, d, e, f) + T_16_78(22, e, f, g, h, a, b, c, d) + T_16_78(24, c, d, e, f, g, h, 
a, b) + T_16_78(26, a, b, c, d, e, f, g, h) + T_16_78(28, g, h, a, b, c, d, e, f) + T_16_78(30, e, f, g, h, a, b, c, d) + T_16_78(32, c, d, e, f, g, h, a, b) + T_16_78(34, a, b, c, d, e, f, g, h) + T_16_78(36, g, h, a, b, c, d, e, f) + T_16_78(38, e, f, g, h, a, b, c, d) + T_16_78(40, c, d, e, f, g, h, a, b) + T_16_78(42, a, b, c, d, e, f, g, h) + T_16_78(44, g, h, a, b, c, d, e, f) + T_16_78(46, e, f, g, h, a, b, c, d) + T_16_78(48, c, d, e, f, g, h, a, b) + T_16_78(50, a, b, c, d, e, f, g, h) + T_16_78(52, g, h, a, b, c, d, e, f) + T_16_78(54, e, f, g, h, a, b, c, d) + T_16_78(56, c, d, e, f, g, h, a, b) + T_16_78(58, a, b, c, d, e, f, g, h) + T_16_78(60, g, h, a, b, c, d, e, f) + T_16_78(62, e, f, g, h, a, b, c, d) + T_16_78(64, c, d, e, f, g, h, a, b) + T_16_78(66, a, b, c, d, e, f, g, h) + T_16_78(68, g, h, a, b, c, d, e, f) + T_16_78(70, e, f, g, h, a, b, c, d) + T_16_78(72, c, d, e, f, g, h, a, b) + T_16_78(74, a, b, c, d, e, f, g, h) + T_16_78(76, g, h, a, b, c, d, e, f) + T_16_78(78, e, f, g, h, a, b, c, d) + T_80(80, c, d, e, f, g, h, a, b) /* Update digest */ add [DIGEST(0)], a_64 add [DIGEST(1)], b_64 add [DIGEST(2)], c_64 add [DIGEST(3)], d_64 add [DIGEST(4)], e_64 add [DIGEST(5)], f_64 add [DIGEST(6)], g_64 add [DIGEST(7)], h_64 /* Advance to next message block */ add msg, 16*8 dec msglen jnz .Lupdateblock /* Restore GPRs */ mov rbx, [rsp + frame_GPRSAVE + 8 * 0] mov r12, [rsp + frame_GPRSAVE + 8 * 1] mov r13, [rsp + frame_GPRSAVE + 8 * 2] mov r14, [rsp + frame_GPRSAVE + 8 * 3] mov r15, [rsp + frame_GPRSAVE + 8 * 4] CFI_RESTORE(rbx) CFI_RESTORE(r12) CFI_RESTORE(r13) CFI_RESTORE(r14) CFI_RESTORE(r15) pxor xmm0, xmm0 pxor xmm1, xmm1 pxor xmm2, xmm2 pxor xmm3, xmm3 pxor xmm4, xmm4 pxor xmm5, xmm5 /* Burn stack */ - t = 0 - .rept frame_W_size / 16 - movdqu [rsp + frame_W + (t) * 16], xmm0 - t = ((t)+1) - .endr + mov eax, 0 +.Lerase_stack: + movdqu [rsp + rax], xmm0 + add eax, 16 + cmp eax, frame_W_size + jne .Lerase_stack movdqu [rsp + frame_WK], xmm0 xor eax, eax /* Restore Stack Pointer */ add rsp, frame_size CFI_ADJUST_CFA_OFFSET(-frame_size); .Lnowork: ret CFI_ENDPROC() /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Binary Data */ .align 16 /* Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. 
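/* Editor's note: the paired SSE scheduling above, and the unrolled T_16_78 calls,
   compute the standard SHA-512 message expansion two qwords at a time, as the comments
   note: s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]. For reference, the same recurrence
   written out as scalar C; the assembly interleaves this with the rounds and adds K[t]
   into the two-entry WK buffer as it goes, rather than running a separate pass. */

#include <stdint.h>
#include <stdio.h>

static uint64_t rotr64(uint64_t x, unsigned n)
{
    return (x >> n) | (x << (64 - n));
}

/* sigma0 and sigma1 as defined by FIPS 180-4 for SHA-512; the vector code
 * assembles the same values from psrlq/psllq/pxor steps. */
static uint64_t sigma0(uint64_t x) { return rotr64(x, 1) ^ rotr64(x, 8) ^ (x >> 7); }
static uint64_t sigma1(uint64_t x) { return rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6); }

/* Expand the 16 message qwords W[0..15] into the full 80-entry schedule. */
static void expand_schedule(uint64_t W[80])
{
    for (int t = 16; t < 80; t++)
        W[t] = sigma1(W[t - 2]) + W[t - 7] + sigma0(W[t - 15]) + W[t - 16];
}

int main(void)
{
    uint64_t W[80];
    for (int t = 0; t < 16; t++)
        W[t] = 0x0123456789abcdefULL * (t + 1);   /* dummy message block */
    expand_schedule(W);
    printf("%016llx\n", (unsigned long long)W[79]);
    return 0;
}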
*/ .LXMM_QWORD_BSWAP: .octa 0x08090a0b0c0d0e0f0001020304050607 /* K[t] used in SHA512 hashing */ .LK512: .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 #endif #endif diff --git a/configure.ac b/configure.ac index f7339a3e..e4a10b78 100644 --- a/configure.ac +++ b/configure.ac @@ -1,3266 +1,3256 @@ # Configure.ac script for Libgcrypt # Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2006, # 2007, 2008, 2009, 2011 Free Software Foundation, Inc. # Copyright (C) 2012-2021 g10 Code GmbH # # This file is part of Libgcrypt. # # Libgcrypt is free software; you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation; either version 2.1 of # the License, or (at your option) any later version. # # Libgcrypt is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this program; if not, see . # (Process this file with autoconf to produce a configure script.) AC_REVISION($Revision$) AC_PREREQ([2.60]) min_automake_version="1.14" # To build a release you need to create a tag with the version number # (git tag -s libgcrypt-n.m.k) and run "./autogen.sh --force". Please # bump the version number immediately after the release and do another # commit and push so that the git magic is able to work. See below # for the LT versions. 
m4_define([mym4_package],[libgcrypt]) m4_define([mym4_major], [1]) m4_define([mym4_minor], [9]) m4_define([mym4_micro], [1]) # Below is m4 magic to extract and compute the git revision number, # the decimalized short revision number, a beta version string and a # flag indicating a development version (mym4_isbeta). Note that the # m4 processing is done by autoconf and not during the configure run. m4_define([mym4_verslist], m4_split(m4_esyscmd([./autogen.sh --find-version] \ mym4_package mym4_major mym4_minor mym4_micro),[:])) m4_define([mym4_isbeta], m4_argn(2, mym4_verslist)) m4_define([mym4_version], m4_argn(4, mym4_verslist)) m4_define([mym4_revision], m4_argn(7, mym4_verslist)) m4_define([mym4_revision_dec], m4_argn(8, mym4_verslist)) m4_esyscmd([echo ]mym4_version[>VERSION]) AC_INIT([mym4_package],[mym4_version],[https://bugs.gnupg.org]) # LT Version numbers, remember to change them just *before* a release. # (Code changed: REVISION++) # (Interfaces added/removed/changed: CURRENT++, REVISION=0) # (Interfaces added: AGE++) # (Interfaces removed: AGE=0) # # (Interfaces removed: CURRENT++, AGE=0, REVISION=0) # (Interfaces added: CURRENT++, AGE++, REVISION=0) # (No interfaces changed: REVISION++) LIBGCRYPT_LT_CURRENT=23 LIBGCRYPT_LT_AGE=3 LIBGCRYPT_LT_REVISION=0 ################################################ AC_SUBST(LIBGCRYPT_LT_CURRENT) AC_SUBST(LIBGCRYPT_LT_AGE) AC_SUBST(LIBGCRYPT_LT_REVISION) # If the API is changed in an incompatible way: increment the next counter. # # 1.6: ABI and API change but the change is to most users irrelevant # and thus the API version number has not been incremented. LIBGCRYPT_CONFIG_API_VERSION=1 # If you change the required gpg-error version, please remove # unnecessary error code defines in src/gcrypt-int.h. NEED_GPG_ERROR_VERSION=1.27 AC_CONFIG_AUX_DIR([build-aux]) AC_CONFIG_SRCDIR([src/libgcrypt.vers]) AM_INIT_AUTOMAKE([serial-tests dist-bzip2]) AC_CONFIG_HEADER(config.h) AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_LIBOBJ_DIR([compat]) AC_CANONICAL_HOST AM_MAINTAINER_MODE AM_SILENT_RULES AC_ARG_VAR(SYSROOT,[locate config scripts also below that directory]) AH_TOP([ #ifndef _GCRYPT_CONFIG_H_INCLUDED #define _GCRYPT_CONFIG_H_INCLUDED /* Enable gpg-error's strerror macro for W32CE. */ #define GPG_ERR_ENABLE_ERRNO_MACROS 1 ]) AH_BOTTOM([ #define _GCRYPT_IN_LIBGCRYPT 1 /* Add .note.gnu.property section for Intel CET in assembler sources when CET is enabled. */ #if defined(__ASSEMBLER__) && defined(__CET__) # include #endif /* If the configure check for endianness has been disabled, get it from OS macros. This is intended for making fat binary builds on OS X. */ #ifdef DISABLED_ENDIAN_CHECK # if defined(__BIG_ENDIAN__) # define WORDS_BIGENDIAN 1 # elif defined(__LITTLE_ENDIAN__) # undef WORDS_BIGENDIAN # else # error "No endianness found" # endif #endif /*DISABLED_ENDIAN_CHECK*/ /* We basically use the original Camellia source. Make sure the symbols properly prefixed. */ #define CAMELLIA_EXT_SYM_PREFIX _gcry_ #endif /*_GCRYPT_CONFIG_H_INCLUDED*/ ]) AH_VERBATIM([_REENTRANT], [/* To allow the use of Libgcrypt in multithreaded programs we have to use special features from the library. */ #ifndef _REENTRANT # define _REENTRANT 1 #endif ]) ###################### ## Basic checks. ### (we need some results later on (e.g. 
$GCC) ###################### AC_PROG_MAKE_SET missing_dir=`cd $ac_aux_dir && pwd` AM_MISSING_PROG(ACLOCAL, aclocal, $missing_dir) AM_MISSING_PROG(AUTOCONF, autoconf, $missing_dir) AM_MISSING_PROG(AUTOMAKE, automake, $missing_dir) AM_MISSING_PROG(AUTOHEADER, autoheader, $missing_dir) # AM_MISSING_PROG(MAKEINFO, makeinfo, $missing_dir) AC_PROG_CC AC_PROG_CPP AM_PROG_CC_C_O AM_PROG_AS AC_SEARCH_LIBS([strerror],[cposix]) AC_PROG_INSTALL AC_PROG_AWK AC_USE_SYSTEM_EXTENSIONS # Taken from mpfr-4.0.1, then modified for LDADD_FOR_TESTS_KLUDGE dnl Under Linux, make sure that the old dtags are used if LD_LIBRARY_PATH dnl is defined. The issue is that with the new dtags, LD_LIBRARY_PATH has dnl the precedence over the run path, so that if a compatible MPFR library dnl is installed in some directory from $LD_LIBRARY_PATH, then the tested dnl MPFR library will be this library instead of the MPFR library from the dnl build tree. Other OS with the same issue might be added later. dnl dnl References: dnl https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=859732 dnl http://lists.gnu.org/archive/html/libtool/2017-05/msg00000.html dnl dnl We need to check whether --disable-new-dtags is supported as alternate dnl linkers may be used (e.g., with tcc: CC=tcc LD=tcc). dnl case $host in *-*-linux*) if test -n "$LD_LIBRARY_PATH"; then saved_LDFLAGS="$LDFLAGS" LDADD_FOR_TESTS_KLUDGE="-Wl,--disable-new-dtags" LDFLAGS="$LDFLAGS $LDADD_FOR_TESTS_KLUDGE" AC_MSG_CHECKING(whether --disable-new-dtags is supported by the linker) AC_LINK_IFELSE([AC_LANG_SOURCE([[ int main (void) { return 0; } ]])], [AC_MSG_RESULT(yes (use it since LD_LIBRARY_PATH is set))], [AC_MSG_RESULT(no) LDADD_FOR_TESTS_KLUDGE="" ]) LDFLAGS="$saved_LDFLAGS" fi ;; esac AC_SUBST([LDADD_FOR_TESTS_KLUDGE]) VERSION_NUMBER=m4_esyscmd(printf "0x%02x%02x%02x" mym4_major \ mym4_minor mym4_micro) AC_SUBST(VERSION_NUMBER) # We need to compile and run a program on the build machine. AX_CC_FOR_BUILD LT_PREREQ([2.2.6]) LT_INIT([win32-dll disable-static]) LT_LANG([Windows Resource]) ########################## ## General definitions. ## ########################## # Used by libgcrypt-config LIBGCRYPT_CONFIG_LIBS="-lgcrypt" LIBGCRYPT_CONFIG_CFLAGS="" LIBGCRYPT_CONFIG_HOST="$host" # Definitions for symmetric ciphers. available_ciphers="arcfour blowfish cast5 des aes twofish serpent rfc2268 seed" available_ciphers="$available_ciphers camellia idea salsa20 gost28147 chacha20" available_ciphers="$available_ciphers sm4" enabled_ciphers="" # Definitions for public-key ciphers. available_pubkey_ciphers="dsa elgamal rsa ecc" enabled_pubkey_ciphers="" # Definitions for message digests. available_digests="crc gostr3411-94 md2 md4 md5 rmd160 sha1 sha256 sha512" available_digests="$available_digests sha3 tiger whirlpool stribog blake2" available_digests="$available_digests sm3" enabled_digests="" # Definitions for kdfs (optional ones) available_kdfs="s2k pkdf2 scrypt" enabled_kdfs="" # Definitions for random modules. available_random_modules="linux egd unix" auto_random_modules="$available_random_modules" # Supported thread backends. LIBGCRYPT_THREAD_MODULES="" # Other definitions. have_w32_system=no have_w32ce_system=no have_pthread=no # Setup some stuff depending on host. 
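# Editor's note: VERSION_NUMBER above is produced with printf "0x%02x%02x%02x", i.e.
# major, minor and micro each packed into one byte of a hex constant. A throwaway C
# illustration of that encoding; the macro name here is invented for the example.
#
#   #include <stdio.h>
#
#   /* Pack a three-part version into the 0xMMmmuu form produced by configure. */
#   #define PACK_VERSION(maj, min, mic) (((maj) << 16) | ((min) << 8) | (mic))
#
#   int main(void)
#   {
#       /* For 1.9.1 this prints 0x010901, matching printf "0x%02x%02x%02x" 1 9 1. */
#       printf("0x%06x\n", PACK_VERSION(1, 9, 1));
#       return 0;
#   }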
case "${host}" in *-*-mingw32*) ac_cv_have_dev_random=no have_w32_system=yes case "${host}" in *-mingw32ce*) have_w32ce_system=yes available_random_modules="w32ce" ;; *) available_random_modules="w32" ;; esac AC_DEFINE(USE_ONLY_8DOT3,1, [set this to limit filenames to the 8.3 format]) AC_DEFINE(HAVE_DRIVE_LETTERS,1, [defined if we must run on a stupid file system]) AC_DEFINE(HAVE_DOSISH_SYSTEM,1, [defined if we run on some of the PCDOS like systems (DOS, Windoze. OS/2) with special properties like no file modes]) ;; i?86-emx-os2 | i?86-*-os2*emx) # OS/2 with the EMX environment ac_cv_have_dev_random=no AC_DEFINE(HAVE_DRIVE_LETTERS) AC_DEFINE(HAVE_DOSISH_SYSTEM) ;; i?86-*-msdosdjgpp*) # DOS with the DJGPP environment ac_cv_have_dev_random=no AC_DEFINE(HAVE_DRIVE_LETTERS) AC_DEFINE(HAVE_DOSISH_SYSTEM) ;; *-*-hpux*) if test -z "$GCC" ; then CFLAGS="$CFLAGS -Ae -D_HPUX_SOURCE" fi ;; *-dec-osf4*) if test -z "$GCC" ; then # Suppress all warnings # to get rid of the unsigned/signed char mismatch warnings. CFLAGS="$CFLAGS -w" fi ;; m68k-atari-mint) ;; *-apple-darwin*) AC_DEFINE(_DARWIN_C_SOURCE, 900000L, Expose all libc features (__DARWIN_C_FULL).) AC_DEFINE(USE_POSIX_SPAWN_FOR_TESTS, 1, [defined if we use posix_spawn in test program]) ;; *) ;; esac if test "$have_w32_system" = yes; then AC_DEFINE(HAVE_W32_SYSTEM,1, [Defined if we run on a W32 API based system]) if test "$have_w32ce_system" = yes; then AC_DEFINE(HAVE_W32CE_SYSTEM,1,[Defined if we run on WindowsCE]) fi fi AM_CONDITIONAL(HAVE_W32_SYSTEM, test "$have_w32_system" = yes) AM_CONDITIONAL(HAVE_W32CE_SYSTEM, test "$have_w32ce_system" = yes) # A printable OS Name is sometimes useful. case "${host}" in *-*-mingw32ce*) PRINTABLE_OS_NAME="W32CE" ;; *-*-mingw32*) PRINTABLE_OS_NAME="W32" ;; i?86-emx-os2 | i?86-*-os2*emx ) PRINTABLE_OS_NAME="OS/2" ;; i?86-*-msdosdjgpp*) PRINTABLE_OS_NAME="MSDOS/DJGPP" ;; *-linux*) PRINTABLE_OS_NAME="GNU/Linux" ;; *) PRINTABLE_OS_NAME=`uname -s || echo "Unknown"` ;; esac NAME_OF_DEV_RANDOM="/dev/random" NAME_OF_DEV_URANDOM="/dev/urandom" AC_ARG_ENABLE(endian-check, AS_HELP_STRING([--disable-endian-check], [disable the endian check and trust the OS provided macros]), endiancheck=$enableval,endiancheck=yes) if test x"$endiancheck" = xyes ; then AC_C_BIGENDIAN else AC_DEFINE(DISABLED_ENDIAN_CHECK,1,[configure did not test for endianness]) fi AC_CHECK_SIZEOF(unsigned short, 2) AC_CHECK_SIZEOF(unsigned int, 4) AC_CHECK_SIZEOF(unsigned long, 4) AC_CHECK_SIZEOF(unsigned long long, 0) AC_CHECK_SIZEOF(void *, 0) AC_TYPE_UINTPTR_T if test "$ac_cv_sizeof_unsigned_short" = "0" \ || test "$ac_cv_sizeof_unsigned_int" = "0" \ || test "$ac_cv_sizeof_unsigned_long" = "0"; then AC_MSG_WARN([Hmmm, something is wrong with the sizes - using defaults]); fi # Ensure that we have UINT64_C before we bother to check for uint64_t AC_CACHE_CHECK([for UINT64_C],[gnupg_cv_uint64_c_works], AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include ]], [[uint64_t foo=UINT64_C(42);]])], gnupg_cv_uint64_c_works=yes,gnupg_cv_uint64_c_works=no)) if test "$gnupg_cv_uint64_c_works" = "yes" ; then AC_CHECK_SIZEOF(uint64_t) fi # Do we have any 64-bit data types? if test "$ac_cv_sizeof_unsigned_int" != "8" \ && test "$ac_cv_sizeof_unsigned_long" != "8" \ && test "$ac_cv_sizeof_unsigned_long_long" != "8" \ && test "$ac_cv_sizeof_uint64_t" != "8"; then AC_MSG_ERROR([[ *** *** No 64-bit integer type available. *** It is not possible to build Libgcrypt on this platform. ***]]) fi # If not specified otherwise, all available algorithms will be # included. 
default_ciphers="$available_ciphers" default_pubkey_ciphers="$available_pubkey_ciphers" default_digests="$available_digests" default_kdfs="$available_kdfs" # Blacklist MD2 by default default_digests=`echo $default_digests | sed -e 's/md2//g'` # Substitutions to set generated files in a Emacs buffer to read-only. AC_SUBST(emacs_local_vars_begin, ['Local Variables:']) AC_SUBST(emacs_local_vars_read_only, ['buffer-read-only: t']) AC_SUBST(emacs_local_vars_end, ['End:']) ############################ ## Command line switches. ## ############################ # Implementation of the --enable-ciphers switch. AC_ARG_ENABLE(ciphers, AS_HELP_STRING([--enable-ciphers=ciphers], [select the symmetric ciphers to include]), [enabled_ciphers=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`], [enabled_ciphers=""]) if test "x$enabled_ciphers" = "x" \ -o "$enabled_ciphers" = "yes" \ -o "$enabled_ciphers" = "no"; then enabled_ciphers=$default_ciphers fi AC_MSG_CHECKING([which symmetric ciphers to include]) for cipher in $enabled_ciphers; do LIST_MEMBER($cipher, $available_ciphers) if test "$found" = "0"; then AC_MSG_ERROR([unsupported cipher "$cipher" specified]) fi done AC_MSG_RESULT([$enabled_ciphers]) # Implementation of the --enable-pubkey-ciphers switch. AC_ARG_ENABLE(pubkey-ciphers, AS_HELP_STRING([--enable-pubkey-ciphers=ciphers], [select the public-key ciphers to include]), [enabled_pubkey_ciphers=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`], [enabled_pubkey_ciphers=""]) if test "x$enabled_pubkey_ciphers" = "x" \ -o "$enabled_pubkey_ciphers" = "yes" \ -o "$enabled_pubkey_ciphers" = "no"; then enabled_pubkey_ciphers=$default_pubkey_ciphers fi AC_MSG_CHECKING([which public-key ciphers to include]) for cipher in $enabled_pubkey_ciphers; do LIST_MEMBER($cipher, $available_pubkey_ciphers) if test "$found" = "0"; then AC_MSG_ERROR([unsupported public-key cipher specified]) fi done AC_MSG_RESULT([$enabled_pubkey_ciphers]) # Implementation of the --enable-digests switch. AC_ARG_ENABLE(digests, AS_HELP_STRING([--enable-digests=digests], [select the message digests to include]), [enabled_digests=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`], [enabled_digests=""]) if test "x$enabled_digests" = "x" \ -o "$enabled_digests" = "yes" \ -o "$enabled_digests" = "no"; then enabled_digests=$default_digests fi AC_MSG_CHECKING([which message digests to include]) for digest in $enabled_digests; do LIST_MEMBER($digest, $available_digests) if test "$found" = "0"; then AC_MSG_ERROR([unsupported message digest specified]) fi done AC_MSG_RESULT([$enabled_digests]) # Implementation of the --enable-kdfs switch. AC_ARG_ENABLE(kdfs, AS_HELP_STRING([--enable-kfds=kdfs], [select the KDFs to include]), [enabled_kdfs=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`], [enabled_kdfs=""]) if test "x$enabled_kdfs" = "x" \ -o "$enabled_kdfs" = "yes" \ -o "$enabled_kdfs" = "no"; then enabled_kdfs=$default_kdfs fi AC_MSG_CHECKING([which key derivation functions to include]) for kdf in $enabled_kdfs; do LIST_MEMBER($kdf, $available_kdfs) if test "$found" = "0"; then AC_MSG_ERROR([unsupported key derivation function specified]) fi done AC_MSG_RESULT([$enabled_kdfs]) # Implementation of the --enable-random switch. 
AC_ARG_ENABLE(random, AS_HELP_STRING([--enable-random=name], [select which random number generator to use]), [random=`echo $enableval | tr '[A-Z]' '[a-z]'`], []) if test "x$random" = "x" -o "$random" = "yes" -o "$random" = "no"; then random=default fi AC_MSG_CHECKING([which random module to use]) if test "$random" != "default" -a "$random" != "auto"; then LIST_MEMBER($random, $available_random_modules) if test "$found" = "0"; then AC_MSG_ERROR([unsupported random module specified]) fi fi AC_MSG_RESULT($random) # Implementation of the --disable-dev-random switch. AC_MSG_CHECKING([whether use of /dev/random is requested]) AC_ARG_ENABLE(dev-random, [ --disable-dev-random disable the use of dev random], try_dev_random=$enableval, try_dev_random=yes) AC_MSG_RESULT($try_dev_random) # Implementation of the --with-egd-socket switch. AC_ARG_WITH(egd-socket, [ --with-egd-socket=NAME Use NAME for the EGD socket)], egd_socket_name="$withval", egd_socket_name="" ) AC_DEFINE_UNQUOTED(EGD_SOCKET_NAME, "$egd_socket_name", [Define if you don't want the default EGD socket name. For details see cipher/rndegd.c]) # Implementation of the --enable-random-daemon AC_MSG_CHECKING([whether the experimental random daemon is requested]) AC_ARG_ENABLE([random-daemon], AS_HELP_STRING([--enable-random-daemon], [Build and support the experimental gcryptrnd]), [use_random_daemon=$enableval], [use_random_daemon=no]) AC_MSG_RESULT($use_random_daemon) if test x$use_random_daemon = xyes ; then AC_DEFINE(USE_RANDOM_DAEMON,1, [Define to support the experimental random daemon]) fi AM_CONDITIONAL(USE_RANDOM_DAEMON, test x$use_random_daemon = xyes) # Implementation of --disable-asm. AC_MSG_CHECKING([whether MPI and cipher assembler modules are requested]) AC_ARG_ENABLE([asm], AS_HELP_STRING([--disable-asm], [Disable MPI and cipher assembler modules]), [try_asm_modules=$enableval], [try_asm_modules=yes]) AC_MSG_RESULT($try_asm_modules) # Implementation of the --enable-m-guard switch. AC_MSG_CHECKING([whether memory guard is requested]) AC_ARG_ENABLE(m-guard, AS_HELP_STRING([--enable-m-guard], [Enable memory guard facility]), [use_m_guard=$enableval], [use_m_guard=no]) AC_MSG_RESULT($use_m_guard) if test "$use_m_guard" = yes ; then AC_DEFINE(M_GUARD,1,[Define to use the (obsolete) malloc guarding feature]) fi # Implementation of the --enable-large-data-tests switch. AC_MSG_CHECKING([whether to run large data tests]) AC_ARG_ENABLE(large-data-tests, AS_HELP_STRING([--enable-large-data-tests], [Enable the real long ruinning large data tests]), large_data_tests=$enableval,large_data_tests=no) AC_MSG_RESULT($large_data_tests) AC_SUBST(RUN_LARGE_DATA_TESTS, $large_data_tests) # Implementation of --enable-force-soft-hwfeatures AC_MSG_CHECKING([whether 'soft' HW feature bits are forced on]) AC_ARG_ENABLE([force-soft-hwfeatures], AS_HELP_STRING([--enable-force-soft-hwfeatures], [Enable forcing 'soft' HW feature bits on]), [force_soft_hwfeatures=$enableval], [force_soft_hwfeatures=no]) AC_MSG_RESULT($force_soft_hwfeatures) # Implementation of the --with-capabilities switch. # Check whether we want to use Linux capabilities AC_MSG_CHECKING([whether use of capabilities is requested]) AC_ARG_WITH(capabilities, AS_HELP_STRING([--with-capabilities], [Use linux capabilities [default=no]]), [use_capabilities="$withval"],[use_capabilities=no]) AC_MSG_RESULT($use_capabilities) # Implementation of the --enable-hmac-binary-check. 
AC_MSG_CHECKING([whether a HMAC binary check is requested]) AC_ARG_ENABLE(hmac-binary-check, AS_HELP_STRING([--enable-hmac-binary-check], [Enable library integrity check]), [use_hmac_binary_check=$enableval], [use_hmac_binary_check=no]) AC_MSG_RESULT($use_hmac_binary_check) if test "$use_hmac_binary_check" = yes ; then AC_DEFINE(ENABLE_HMAC_BINARY_CHECK,1, [Define to support an HMAC based integrity check]) fi # Implementation of the --disable-jent-support switch. AC_MSG_CHECKING([whether jitter entropy support is requested]) AC_ARG_ENABLE(jent-support, AS_HELP_STRING([--disable-jent-support], [Disable support for the Jitter entropy collector]), jentsupport=$enableval,jentsupport=yes) AC_MSG_RESULT($jentsupport) # Implementation of the --disable-padlock-support switch. AC_MSG_CHECKING([whether padlock support is requested]) AC_ARG_ENABLE(padlock-support, AS_HELP_STRING([--disable-padlock-support], [Disable support for the PadLock Engine of VIA processors]), padlocksupport=$enableval,padlocksupport=yes) AC_MSG_RESULT($padlocksupport) # Implementation of the --disable-aesni-support switch. AC_MSG_CHECKING([whether AESNI support is requested]) AC_ARG_ENABLE(aesni-support, AS_HELP_STRING([--disable-aesni-support], [Disable support for the Intel AES-NI instructions]), aesnisupport=$enableval,aesnisupport=yes) AC_MSG_RESULT($aesnisupport) # Implementation of the --disable-shaext-support switch. AC_MSG_CHECKING([whether SHAEXT support is requested]) AC_ARG_ENABLE(shaext-support, AS_HELP_STRING([--disable-shaext-support], [Disable support for the Intel SHAEXT instructions]), shaextsupport=$enableval,shaextsupport=yes) AC_MSG_RESULT($shaextsupport) # Implementation of the --disable-pclmul-support switch. AC_MSG_CHECKING([whether PCLMUL support is requested]) AC_ARG_ENABLE(pclmul-support, AS_HELP_STRING([--disable-pclmul-support], [Disable support for the Intel PCLMUL instructions]), pclmulsupport=$enableval,pclmulsupport=yes) AC_MSG_RESULT($pclmulsupport) # Implementation of the --disable-sse41-support switch. AC_MSG_CHECKING([whether SSE4.1 support is requested]) AC_ARG_ENABLE(sse41-support, AS_HELP_STRING([--disable-sse41-support], [Disable support for the Intel SSE4.1 instructions]), sse41support=$enableval,sse41support=yes) AC_MSG_RESULT($sse41support) # Implementation of the --disable-drng-support switch. AC_MSG_CHECKING([whether DRNG support is requested]) AC_ARG_ENABLE(drng-support, AS_HELP_STRING([--disable-drng-support], [Disable support for the Intel DRNG (RDRAND instruction)]), drngsupport=$enableval,drngsupport=yes) AC_MSG_RESULT($drngsupport) # Implementation of the --disable-avx-support switch. AC_MSG_CHECKING([whether AVX support is requested]) AC_ARG_ENABLE(avx-support, AS_HELP_STRING([--disable-avx-support], [Disable support for the Intel AVX instructions]), avxsupport=$enableval,avxsupport=yes) AC_MSG_RESULT($avxsupport) # Implementation of the --disable-avx2-support switch. AC_MSG_CHECKING([whether AVX2 support is requested]) AC_ARG_ENABLE(avx2-support, AS_HELP_STRING([--disable-avx2-support], [Disable support for the Intel AVX2 instructions]), avx2support=$enableval,avx2support=yes) AC_MSG_RESULT($avx2support) # Implementation of the --disable-neon-support switch. AC_MSG_CHECKING([whether NEON support is requested]) AC_ARG_ENABLE(neon-support, AS_HELP_STRING([--disable-neon-support], [Disable support for the ARM NEON instructions]), neonsupport=$enableval,neonsupport=yes) AC_MSG_RESULT($neonsupport) # Implementation of the --disable-arm-crypto-support switch. 
AC_MSG_CHECKING([whether ARMv8 Crypto Extension support is requested]) AC_ARG_ENABLE(arm-crypto-support, AS_HELP_STRING([--disable-arm-crypto-support], [Disable support for the ARMv8 Crypto Extension instructions]), armcryptosupport=$enableval,armcryptosupport=yes) AC_MSG_RESULT($armcryptosupport) # Implementation of the --disable-ppc-crypto-support switch. AC_MSG_CHECKING([whether PPC crypto support is requested]) AC_ARG_ENABLE(ppc-crypto-support, AS_HELP_STRING([--disable-ppc-crypto-support], [Disable support for the PPC crypto instructions introduced in POWER 8 (PowerISA 2.07)]), ppccryptosupport=$enableval,ppccryptosupport=yes) AC_MSG_RESULT($ppccryptosupport) # Implementation of the --disable-O-flag-munging switch. AC_MSG_CHECKING([whether a -O flag munging is requested]) AC_ARG_ENABLE([O-flag-munging], AS_HELP_STRING([--disable-O-flag-munging], [Disable modification of the cc -O flag]), [enable_o_flag_munging=$enableval], [enable_o_flag_munging=yes]) AC_MSG_RESULT($enable_o_flag_munging) AM_CONDITIONAL(ENABLE_O_FLAG_MUNGING, test "$enable_o_flag_munging" = "yes") # Implementation of the --disable-instrumentation-munging switch. AC_MSG_CHECKING([whether a instrumentation (-fprofile, -fsanitize) munging is requested]) AC_ARG_ENABLE([instrumentation-munging], AS_HELP_STRING([--disable-instrumentation-munging], [Disable modification of the cc instrumentation options]), [enable_instrumentation_munging=$enableval], [enable_instrumentation_munging=yes]) AC_MSG_RESULT($enable_instrumentation_munging) AM_CONDITIONAL(ENABLE_INSTRUMENTATION_MUNGING, test "$enable_instrumentation_munging" = "yes") # Implementation of the --disable-amd64-as-feature-detection switch. AC_MSG_CHECKING([whether to enable AMD64 as(1) feature detection]) AC_ARG_ENABLE(amd64-as-feature-detection, AS_HELP_STRING([--disable-amd64-as-feature-detection], [Disable the auto-detection of AMD64 as(1) features]), amd64_as_feature_detection=$enableval, amd64_as_feature_detection=yes) AC_MSG_RESULT($amd64_as_feature_detection) AC_DEFINE_UNQUOTED(PRINTABLE_OS_NAME, "$PRINTABLE_OS_NAME", [A human readable text with the name of the OS]) # For some systems we know that we have ld_version scripts. # Use it then as default. have_ld_version_script=no case "${host}" in *-*-linux*) have_ld_version_script=yes ;; *-*-gnu*) have_ld_version_script=yes ;; esac AC_ARG_ENABLE([ld-version-script], AS_HELP_STRING([--enable-ld-version-script], [enable/disable use of linker version script. (default is system dependent)]), [have_ld_version_script=$enableval], [ : ] ) AM_CONDITIONAL(HAVE_LD_VERSION_SCRIPT, test "$have_ld_version_script" = "yes") AC_DEFINE_UNQUOTED(NAME_OF_DEV_RANDOM, "$NAME_OF_DEV_RANDOM", [defined to the name of the strong random device]) AC_DEFINE_UNQUOTED(NAME_OF_DEV_URANDOM, "$NAME_OF_DEV_URANDOM", [defined to the name of the weaker random device]) ############################### #### Checks for libraries. #### ############################### # # gpg-error is required. # AM_PATH_GPG_ERROR("$NEED_GPG_ERROR_VERSION") if test "x$GPG_ERROR_LIBS" = "x"; then AC_MSG_ERROR([libgpg-error is needed. See ftp://ftp.gnupg.org/gcrypt/libgpg-error/ .]) fi AC_DEFINE(GPG_ERR_SOURCE_DEFAULT, GPG_ERR_SOURCE_GCRYPT, [The default error source for libgcrypt.]) # # Check whether the GNU Pth library is available. We require this # to build the optional gcryptrnd program. 
# AC_ARG_WITH(pth-prefix, AS_HELP_STRING([--with-pth-prefix=PFX], [prefix where GNU Pth is installed (optional)]), pth_config_prefix="$withval", pth_config_prefix="") if test x$pth_config_prefix != x ; then PTH_CONFIG="$pth_config_prefix/bin/pth-config" fi if test "$use_random_daemon" = "yes"; then AC_PATH_PROG(PTH_CONFIG, pth-config, no) if test "$PTH_CONFIG" = "no"; then AC_MSG_WARN([[ *** *** To build the Libgcrypt's random number daemon *** we need the support of the GNU Portable Threads Library. *** Download it from ftp://ftp.gnu.org/gnu/pth/ *** On a Debian GNU/Linux system you might want to try *** apt-get install libpth-dev ***]]) else GNUPG_PTH_VERSION_CHECK([1.3.7]) if test $have_pth = yes; then PTH_CFLAGS=`$PTH_CONFIG --cflags` PTH_LIBS=`$PTH_CONFIG --ldflags` PTH_LIBS="$PTH_LIBS `$PTH_CONFIG --libs --all`" AC_DEFINE(USE_GNU_PTH, 1, [Defined if the GNU Portable Thread Library should be used]) AC_DEFINE(HAVE_PTH, 1, [Defined if the GNU Pth is available]) fi fi fi AC_SUBST(PTH_CFLAGS) AC_SUBST(PTH_LIBS) # # Check whether pthreads is available # if test "$have_w32_system" != yes; then AC_CHECK_LIB(pthread,pthread_create,have_pthread=yes) if test "$have_pthread" = yes; then AC_DEFINE(HAVE_PTHREAD, 1 ,[Define if we have pthread.]) fi fi # Solaris needs -lsocket and -lnsl. Unisys system includes # gethostbyname in libsocket but needs libnsl for socket. AC_SEARCH_LIBS(setsockopt, [socket], , [AC_SEARCH_LIBS(setsockopt, [socket], , , [-lnsl])]) AC_SEARCH_LIBS(setsockopt, [nsl]) ################################## #### Checks for header files. #### ################################## AC_HEADER_STDC AC_CHECK_HEADERS(unistd.h sys/select.h sys/msg.h sys/auxv.h) INSERT_SYS_SELECT_H= if test x"$ac_cv_header_sys_select_h" = xyes; then INSERT_SYS_SELECT_H=" include " fi AC_SUBST(INSERT_SYS_SELECT_H) ########################################## #### Checks for typedefs, structures, #### #### and compiler characteristics. #### ########################################## AC_C_CONST AC_C_INLINE AC_TYPE_SIZE_T AC_TYPE_PID_T AC_CHECK_TYPES([byte, ushort, u16, u32, u64]) gl_TYPE_SOCKLEN_T case "${host}" in *-*-mingw32*) # socklen_t may or may not be defined depending on what headers # are included. To be safe we use int as this is the actual type. FALLBACK_SOCKLEN_T="typedef int gcry_socklen_t;" ;; *) if test ".$gl_cv_socklen_t_equiv" = "."; then FALLBACK_SOCKLEN_T="typedef socklen_t gcry_socklen_t;" else FALLBACK_SOCKLEN_T="typedef ${gl_cv_socklen_t_equiv} gcry_socklen_t;" fi esac AC_SUBST(FALLBACK_SOCKLEN_T) # # Check for __builtin_bswap32 intrinsic. # AC_CACHE_CHECK(for __builtin_bswap32, [gcry_cv_have_builtin_bswap32], [gcry_cv_have_builtin_bswap32=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [int x = 0; int y = __builtin_bswap32(x); return y;])], [gcry_cv_have_builtin_bswap32=yes])]) if test "$gcry_cv_have_builtin_bswap32" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_BSWAP32,1, [Defined if compiler has '__builtin_bswap32' intrinsic]) fi # # Check for __builtin_bswap64 intrinsic. # AC_CACHE_CHECK(for __builtin_bswap64, [gcry_cv_have_builtin_bswap64], [gcry_cv_have_builtin_bswap64=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [long long x = 0; long long y = __builtin_bswap64(x); return y;])], [gcry_cv_have_builtin_bswap64=yes])]) if test "$gcry_cv_have_builtin_bswap64" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_BSWAP64,1, [Defined if compiler has '__builtin_bswap64' intrinsic]) fi # # Check for __builtin_ctz intrinsic. 
# AC_CACHE_CHECK(for __builtin_ctz, [gcry_cv_have_builtin_ctz], [gcry_cv_have_builtin_ctz=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [unsigned int x = 0; int y = __builtin_ctz(x); return y;])], [gcry_cv_have_builtin_ctz=yes])]) if test "$gcry_cv_have_builtin_ctz" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_CTZ, 1, [Defined if compiler has '__builtin_ctz' intrinsic]) fi # # Check for __builtin_ctzl intrinsic. # AC_CACHE_CHECK(for __builtin_ctzl, [gcry_cv_have_builtin_ctzl], [gcry_cv_have_builtin_ctzl=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [unsigned long x = 0; long y = __builtin_ctzl(x); return y;])], [gcry_cv_have_builtin_ctzl=yes])]) if test "$gcry_cv_have_builtin_ctzl" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_CTZL, 1, [Defined if compiler has '__builtin_ctzl' intrinsic]) fi # # Check for __builtin_clz intrinsic. # AC_CACHE_CHECK(for __builtin_clz, [gcry_cv_have_builtin_clz], [gcry_cv_have_builtin_clz=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [unsigned int x = 0; int y = __builtin_clz(x); return y;])], [gcry_cv_have_builtin_clz=yes])]) if test "$gcry_cv_have_builtin_clz" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_CLZ, 1, [Defined if compiler has '__builtin_clz' intrinsic]) fi # # Check for __builtin_clzl intrinsic. # AC_CACHE_CHECK(for __builtin_clzl, [gcry_cv_have_builtin_clzl], [gcry_cv_have_builtin_clzl=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [unsigned long x = 0; long y = __builtin_clzl(x); return y;])], [gcry_cv_have_builtin_clzl=yes])]) if test "$gcry_cv_have_builtin_clzl" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_CLZL, 1, [Defined if compiler has '__builtin_clzl' intrinsic]) fi # # Check for __sync_synchronize intrinsic. # AC_CACHE_CHECK(for __sync_synchronize, [gcry_cv_have_sync_synchronize], [gcry_cv_have_sync_synchronize=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [__sync_synchronize(); return 0;])], [gcry_cv_have_sync_synchronize=yes])]) if test "$gcry_cv_have_sync_synchronize" = "yes" ; then AC_DEFINE(HAVE_SYNC_SYNCHRONIZE, 1, [Defined if compiler has '__sync_synchronize' intrinsic]) fi # # Check for VLA support (variable length arrays). # AC_CACHE_CHECK(whether the variable length arrays are supported, [gcry_cv_have_vla], [gcry_cv_have_vla=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void f1(char *, int); char foo(int i) { char b[(i < 0 ? 0 : i) + 1]; f1(b, sizeof b); return b[0];}]])], [gcry_cv_have_vla=yes])]) if test "$gcry_cv_have_vla" = "yes" ; then AC_DEFINE(HAVE_VLA,1, [Defined if variable length arrays are supported]) fi # # Check for ELF visibility support. 
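# Editor's note: the intrinsic checks above only record availability
# (HAVE_BUILTIN_BSWAP32/64, HAVE_BUILTIN_CTZ, ...); consuming code can then prefer the
# builtin and keep a portable fallback. A generic sketch of that pattern in C, not
# Libgcrypt's actual helper:
#
#   #include <stdint.h>
#
#   /* Byte-swap a 64-bit value, using the compiler intrinsic when the configure
#    * check defined HAVE_BUILTIN_BSWAP64, otherwise a shift/mask fallback.
#    * Hypothetical helper, for illustration only. */
#   static inline uint64_t swap64(uint64_t x)
#   {
#   #ifdef HAVE_BUILTIN_BSWAP64
#       return __builtin_bswap64(x);
#   #else
#       return ((x & 0x00000000000000ffULL) << 56) |
#              ((x & 0x000000000000ff00ULL) << 40) |
#              ((x & 0x0000000000ff0000ULL) << 24) |
#              ((x & 0x00000000ff000000ULL) <<  8) |
#              ((x & 0x000000ff00000000ULL) >>  8) |
#              ((x & 0x0000ff0000000000ULL) >> 24) |
#              ((x & 0x00ff000000000000ULL) >> 40) |
#              ((x & 0xff00000000000000ULL) >> 56);
#   #endif
#   }
#
#   int main(void)
#   {
#       return swap64(0x0102030405060708ULL) == 0x0807060504030201ULL ? 0 : 1;
#   }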
# AC_CACHE_CHECK(whether the visibility attribute is supported, gcry_cv_visibility_attribute, [gcry_cv_visibility_attribute=no AC_LANG_CONFTEST([AC_LANG_SOURCE( [[int foo __attribute__ ((visibility ("hidden"))) = 1; int bar __attribute__ ((visibility ("protected"))) = 1; ]])]) if ${CC-cc} -Werror -S conftest.c -o conftest.s \ 1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD ; then if grep '\.hidden.*foo' conftest.s >/dev/null 2>&1 ; then if grep '\.protected.*bar' conftest.s >/dev/null 2>&1; then gcry_cv_visibility_attribute=yes fi fi fi ]) if test "$gcry_cv_visibility_attribute" = "yes"; then AC_CACHE_CHECK(for broken visibility attribute, gcry_cv_broken_visibility_attribute, [gcry_cv_broken_visibility_attribute=yes AC_LANG_CONFTEST([AC_LANG_SOURCE( [[int foo (int x); int bar (int x) __asm__ ("foo") __attribute__ ((visibility ("hidden"))); int bar (int x) { return x; } ]])]) if ${CC-cc} -Werror -S conftest.c -o conftest.s \ 1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD ; then if grep '\.hidden@<:@ _@:>@foo' conftest.s >/dev/null 2>&1; then gcry_cv_broken_visibility_attribute=no fi fi ]) fi if test "$gcry_cv_visibility_attribute" = "yes"; then AC_CACHE_CHECK(for broken alias attribute, gcry_cv_broken_alias_attribute, [gcry_cv_broken_alias_attribute=yes AC_LANG_CONFTEST([AC_LANG_SOURCE( [[extern int foo (int x) __asm ("xyzzy"); int bar (int x) { return x; } extern __typeof (bar) foo __attribute ((weak, alias ("bar"))); extern int dfoo; extern __typeof (dfoo) dfoo __asm ("abccb"); int dfoo = 1; ]])]) if ${CC-cc} -Werror -S conftest.c -o conftest.s \ 1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD ; then if grep 'xyzzy' conftest.s >/dev/null 2>&1 && \ grep 'abccb' conftest.s >/dev/null 2>&1; then gcry_cv_broken_alias_attribute=no fi fi ]) fi if test "$gcry_cv_visibility_attribute" = "yes"; then AC_CACHE_CHECK(if gcc supports -fvisibility=hidden, gcry_cv_gcc_has_f_visibility, [gcry_cv_gcc_has_f_visibility=no _gcc_cflags_save=$CFLAGS CFLAGS="-fvisibility=hidden" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])], gcry_cv_gcc_has_f_visibility=yes) CFLAGS=$_gcc_cflags_save; ]) fi if test "$gcry_cv_visibility_attribute" = "yes" \ && test "$gcry_cv_broken_visibility_attribute" != "yes" \ && test "$gcry_cv_broken_alias_attribute" != "yes" \ && test "$gcry_cv_gcc_has_f_visibility" = "yes" then AC_DEFINE(GCRY_USE_VISIBILITY, 1, [Define to use the GNU C visibility attribute.]) CFLAGS="$CFLAGS -fvisibility=hidden" fi # Following attribute tests depend on warnings to cause compile to fail, # so set -Werror temporarily. 
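# Editor's note: the visibility checks above gate GCRY_USE_VISIBILITY and the addition
# of -fvisibility=hidden to CFLAGS; with that flag, symbols default to hidden and only
# those explicitly marked with default visibility are exported from the shared object.
# A generic C illustration with made-up names, not the library's real markup:
#
#   /* Built with -fvisibility=hidden, as configure arranges above. */
#   int internal_helper (int x)            /* hidden: not exported */
#   {
#       return x * 2;
#   }
#
#   __attribute__ ((visibility ("default")))
#   int exported_entry (int x)             /* still visible to library users */
#   {
#       return internal_helper (x) + 1;
#   }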
_gcc_cflags_save=$CFLAGS CFLAGS="$CFLAGS -Werror" # # Check whether the compiler supports the GCC style aligned attribute # AC_CACHE_CHECK([whether the GCC style aligned attribute is supported], [gcry_cv_gcc_attribute_aligned], [gcry_cv_gcc_attribute_aligned=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[struct { int a; } foo __attribute__ ((aligned (16)));]])], [gcry_cv_gcc_attribute_aligned=yes])]) if test "$gcry_cv_gcc_attribute_aligned" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_ALIGNED,1, [Defined if a GCC style "__attribute__ ((aligned (n))" is supported]) fi # # Check whether the compiler supports the GCC style packed attribute # AC_CACHE_CHECK([whether the GCC style packed attribute is supported], [gcry_cv_gcc_attribute_packed], [gcry_cv_gcc_attribute_packed=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[struct foolong_s { long b; } __attribute__ ((packed)); struct foo_s { char a; struct foolong_s b; } __attribute__ ((packed)); enum bar { FOO = 1 / (sizeof(struct foo_s) == (sizeof(char) + sizeof(long))), };]])], [gcry_cv_gcc_attribute_packed=yes])]) if test "$gcry_cv_gcc_attribute_packed" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_PACKED,1, [Defined if a GCC style "__attribute__ ((packed))" is supported]) fi # # Check whether the compiler supports the GCC style may_alias attribute # AC_CACHE_CHECK([whether the GCC style may_alias attribute is supported], [gcry_cv_gcc_attribute_may_alias], [gcry_cv_gcc_attribute_may_alias=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[typedef struct foo_s { int a; } __attribute__ ((may_alias)) foo_t;]])], [gcry_cv_gcc_attribute_may_alias=yes])]) if test "$gcry_cv_gcc_attribute_may_alias" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_MAY_ALIAS,1, [Defined if a GCC style "__attribute__ ((may_alias))" is supported]) fi # Restore flags. CFLAGS=$_gcc_cflags_save; # # Check whether the compiler supports 'asm' or '__asm__' keyword for # assembler blocks. # AC_CACHE_CHECK([whether 'asm' assembler keyword is supported], [gcry_cv_have_asm], [gcry_cv_have_asm=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { asm("":::"memory"); }]])], [gcry_cv_have_asm=yes])]) AC_CACHE_CHECK([whether '__asm__' assembler keyword is supported], [gcry_cv_have___asm__], [gcry_cv_have___asm__=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { __asm__("":::"memory"); }]])], [gcry_cv_have___asm__=yes])]) if test "$gcry_cv_have_asm" = "no" ; then if test "$gcry_cv_have___asm__" = "yes" ; then AC_DEFINE(asm,__asm__, [Define to supported assembler block keyword, if plain 'asm' was not supported]) fi fi # # Check whether the compiler supports inline assembly memory barrier. 
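#
# Note on the attribute checks above: HAVE_GCC_ATTRIBUTE_ALIGNED,
# HAVE_GCC_ATTRIBUTE_PACKED and HAVE_GCC_ATTRIBUTE_MAY_ALIAS let code
# request stricter alignment for SIMD state, unpadded structures and
# alias-safe access types only where the compiler understands them.
# Illustrative sketch; the type and macro names are made up for this note:
#
#   #ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
#   # define MY_ALIGNED_16 __attribute__ ((aligned (16)))
#   #else
#   # define MY_ALIGNED_16
#   #endif
#
#   typedef struct { unsigned char key[32] MY_ALIGNED_16; } my_ctx_t;
#
#   #ifdef HAVE_GCC_ATTRIBUTE_MAY_ALIAS
#   /* Loads/stores through this type do not violate strict aliasing.  */
#   typedef unsigned int __attribute__ ((may_alias)) my_u32_alias_t;
#   #endif
#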
# if test "$gcry_cv_have_asm" = "no" ; then if test "$gcry_cv_have___asm__" = "yes" ; then AC_CACHE_CHECK([whether inline assembly memory barrier is supported], [gcry_cv_have_asm_volatile_memory], [gcry_cv_have_asm_volatile_memory=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(int x) { __asm__ volatile("":::"memory"); __asm__ volatile("":"+r"(x)::"memory"); }]])], [gcry_cv_have_asm_volatile_memory=yes])]) fi else AC_CACHE_CHECK([whether inline assembly memory barrier is supported], [gcry_cv_have_asm_volatile_memory], [gcry_cv_have_asm_volatile_memory=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(int x) { asm volatile("":::"memory"); asm volatile("":"+r"(x)::"memory"); }]])], [gcry_cv_have_asm_volatile_memory=yes])]) fi if test "$gcry_cv_have_asm_volatile_memory" = "yes" ; then AC_DEFINE(HAVE_GCC_ASM_VOLATILE_MEMORY,1, [Define if inline asm memory barrier is supported]) fi # # Check whether GCC assembler supports features needed for our ARM # implementations. This needs to be done before setting up the # assembler stuff. # AC_CACHE_CHECK([whether GCC assembler is compatible for ARM assembly implementations], [gcry_cv_gcc_arm_platform_as_ok], [if test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_arm_platform_as_ok="n/a" else gcry_cv_gcc_arm_platform_as_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( /* Test if assembler supports UAL syntax. */ ".syntax unified\n\t" ".arm\n\t" /* our assembly code is in ARM mode */ ".text\n\t" /* Following causes error if assembler ignored '.syntax unified'. */ "asmfunc:\n\t" "add %r0, %r0, %r4, ror #12;\n\t" /* Test if '.type' and '.size' are supported. */ ".size asmfunc,.-asmfunc;\n\t" ".type asmfunc,%function;\n\t" );]], [ asmfunc(); ] )], [gcry_cv_gcc_arm_platform_as_ok=yes]) fi]) if test "$gcry_cv_gcc_arm_platform_as_ok" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS,1, [Defined if underlying assembler is compatible with ARM assembly implementations]) fi # # Check whether GCC assembler supports features needed for our ARMv8/Aarch64 # implementations. This needs to be done before setting up the # assembler stuff. # AC_CACHE_CHECK([whether GCC assembler is compatible for ARMv8/Aarch64 assembly implementations], [gcry_cv_gcc_aarch64_platform_as_ok], [if test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_aarch64_platform_as_ok="n/a" else gcry_cv_gcc_aarch64_platform_as_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".text\n\t" "asmfunc:\n\t" "eor x0, x0, x30, ror #12;\n\t" "add x0, x0, x30, asr #12;\n\t" "eor v0.16b, v0.16b, v31.16b;\n\t" );]], [ asmfunc(); ] )], [gcry_cv_gcc_aarch64_platform_as_ok=yes]) fi]) if test "$gcry_cv_gcc_aarch64_platform_as_ok" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS,1, [Defined if underlying assembler is compatible with ARMv8/Aarch64 assembly implementations]) fi # # Check whether GCC assembler supports for CFI directives. 
# AC_CACHE_CHECK([whether GCC assembler supports for CFI directives], [gcry_cv_gcc_asm_cfi_directives], [gcry_cv_gcc_asm_cfi_directives=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".text\n\t" "ac_test:\n\t" ".cfi_startproc\n\t" ".cfi_remember_state\n\t" ".cfi_adjust_cfa_offset 8\n\t" ".cfi_rel_offset 0, 8\n\t" ".cfi_def_cfa_register 1\n\t" ".cfi_register 2, 3\n\t" ".cfi_restore 2\n\t" ".cfi_escape 0x0f, 0x02, 0x11, 0x00\n\t" ".cfi_restore_state\n\t" ".long 0\n\t" ".cfi_endproc\n\t" );]])], [gcry_cv_gcc_asm_cfi_directives=yes])]) if test "$gcry_cv_gcc_asm_cfi_directives" = "yes" ; then AC_DEFINE(HAVE_GCC_ASM_CFI_DIRECTIVES,1, [Defined if underlying assembler supports for CFI directives]) fi # # Check whether GCC assembler supports for ELF directives. # AC_CACHE_CHECK([whether GCC assembler supports for ELF directives], [gcry_cv_gcc_asm_elf_directives], [gcry_cv_gcc_asm_elf_directives=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( /* Test if ELF directives '.type' and '.size' are supported. */ ".text\n\t" "asmfunc:\n\t" ".size asmfunc,.-asmfunc;\n\t" ".type asmfunc,STT_FUNC;\n\t" );]])], [gcry_cv_gcc_asm_elf_directives=yes])]) if test "$gcry_cv_gcc_asm_elf_directives" = "yes" ; then AC_DEFINE(HAVE_GCC_ASM_ELF_DIRECTIVES,1, [Defined if underlying assembler supports for ELF directives]) fi # # Check whether underscores in symbols are required. This needs to be # done before setting up the assembler stuff. # GNUPG_SYS_SYMBOL_UNDERSCORE() ################################# #### #### #### Setup assembler stuff. #### #### Define mpi_cpu_arch. #### #### #### ################################# AC_ARG_ENABLE(mpi-path, AS_HELP_STRING([--enable-mpi-path=EXTRA_PATH], [prepend EXTRA_PATH to list of CPU specific optimizations]), mpi_extra_path="$enableval",mpi_extra_path="") AC_MSG_CHECKING(architecture and mpi assembler functions) if test -f $srcdir/mpi/config.links ; then . $srcdir/mpi/config.links AC_CONFIG_LINKS("$mpi_ln_list") ac_cv_mpi_sflags="$mpi_sflags" AC_MSG_RESULT($mpi_cpu_arch) else AC_MSG_RESULT(failed) AC_MSG_ERROR([mpi/config.links missing!]) fi MPI_SFLAGS="$ac_cv_mpi_sflags" AC_SUBST(MPI_SFLAGS) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_ADD1, test "$mpi_mod_asm_mpih_add1" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_SUB1, test "$mpi_mod_asm_mpih_sub1" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_MUL1, test "$mpi_mod_asm_mpih_mul1" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_MUL2, test "$mpi_mod_asm_mpih_mul2" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_MUL3, test "$mpi_mod_asm_mpih_mul3" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_LSHIFT, test "$mpi_mod_asm_mpih_lshift" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_RSHIFT, test "$mpi_mod_asm_mpih_rshift" = yes) AM_CONDITIONAL(MPI_MOD_ASM_UDIV, test "$mpi_mod_asm_udiv" = yes) AM_CONDITIONAL(MPI_MOD_ASM_UDIV_QRNND, test "$mpi_mod_asm_udiv_qrnnd" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_ADD1, test "$mpi_mod_c_mpih_add1" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_SUB1, test "$mpi_mod_c_mpih_sub1" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_MUL1, test "$mpi_mod_c_mpih_mul1" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_MUL2, test "$mpi_mod_c_mpih_mul2" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_MUL3, test "$mpi_mod_c_mpih_mul3" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_LSHIFT, test "$mpi_mod_c_mpih_lshift" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_RSHIFT, test "$mpi_mod_c_mpih_rshift" = yes) AM_CONDITIONAL(MPI_MOD_C_UDIV, test "$mpi_mod_c_udiv" = yes) AM_CONDITIONAL(MPI_MOD_C_UDIV_QRNND, test "$mpi_mod_c_udiv_qrnnd" = yes) # Reset non applicable feature flags. 
if test "$mpi_cpu_arch" != "x86" ; then aesnisupport="n/a" shaextsupport="n/a" pclmulsupport="n/a" sse41support="n/a" avxsupport="n/a" avx2support="n/a" padlocksupport="n/a" drngsupport="n/a" fi if test "$mpi_cpu_arch" != "arm" ; then if test "$mpi_cpu_arch" != "aarch64" ; then neonsupport="n/a" armcryptosupport="n/a" fi fi if test "$mpi_cpu_arch" != "ppc"; then ppccryptosupport="n/a" fi ############################################# #### #### #### Platform specific compiler checks. #### #### #### ############################################# # Following tests depend on warnings to cause compile to fail, so set -Werror # temporarily. _gcc_cflags_save=$CFLAGS CFLAGS="$CFLAGS -Werror" # # Check whether compiler supports 'ms_abi' function attribute. # AC_CACHE_CHECK([whether compiler supports 'ms_abi' function attribute], [gcry_cv_gcc_attribute_ms_abi], [gcry_cv_gcc_attribute_ms_abi=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[int __attribute__ ((ms_abi)) proto(int);]])], [gcry_cv_gcc_attribute_ms_abi=yes])]) if test "$gcry_cv_gcc_attribute_ms_abi" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_MS_ABI,1, [Defined if compiler supports "__attribute__ ((ms_abi))" function attribute]) fi # # Check whether compiler supports 'sysv_abi' function attribute. # AC_CACHE_CHECK([whether compiler supports 'sysv_abi' function attribute], [gcry_cv_gcc_attribute_sysv_abi], [gcry_cv_gcc_attribute_sysv_abi=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[int __attribute__ ((sysv_abi)) proto(int);]])], [gcry_cv_gcc_attribute_sysv_abi=yes])]) if test "$gcry_cv_gcc_attribute_sysv_abi" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_SYSV_ABI,1, [Defined if compiler supports "__attribute__ ((sysv_abi))" function attribute]) fi # # Check whether default calling convention is 'ms_abi'. # if test "$gcry_cv_gcc_attribute_ms_abi" = "yes" ; then AC_CACHE_CHECK([whether default calling convention is 'ms_abi'], [gcry_cv_gcc_default_abi_is_ms_abi], [gcry_cv_gcc_default_abi_is_ms_abi=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void *test(void) { void *(*def_func)(void) = test; void *__attribute__((ms_abi))(*msabi_func)(void); /* warning on SysV abi targets, passes on Windows based targets */ msabi_func = def_func; return msabi_func; }]])], [gcry_cv_gcc_default_abi_is_ms_abi=yes])]) if test "$gcry_cv_gcc_default_abi_is_ms_abi" = "yes" ; then AC_DEFINE(HAVE_GCC_DEFAULT_ABI_IS_MS_ABI,1, [Defined if default calling convention is 'ms_abi']) fi fi # # Check whether default calling convention is 'sysv_abi'. # if test "$gcry_cv_gcc_attribute_sysv_abi" = "yes" ; then AC_CACHE_CHECK([whether default calling convention is 'sysv_abi'], [gcry_cv_gcc_default_abi_is_sysv_abi], [gcry_cv_gcc_default_abi_is_sysv_abi=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void *test(void) { void *(*def_func)(void) = test; void *__attribute__((sysv_abi))(*sysvabi_func)(void); /* warning on MS ABI targets, passes on SysV ABI targets */ sysvabi_func = def_func; return sysvabi_func; }]])], [gcry_cv_gcc_default_abi_is_sysv_abi=yes])]) if test "$gcry_cv_gcc_default_abi_is_sysv_abi" = "yes" ; then AC_DEFINE(HAVE_GCC_DEFAULT_ABI_IS_SYSV_ABI,1, [Defined if default calling convention is 'sysv_abi']) fi fi # Restore flags. CFLAGS=$_gcc_cflags_save; # # Check whether GCC inline assembler supports SSSE3 instructions # This is required for the AES-NI instructions. 
# AC_CACHE_CHECK([whether GCC inline assembler supports SSSE3 instructions], [gcry_cv_gcc_inline_asm_ssse3], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_ssse3="n/a" else gcry_cv_gcc_inline_asm_ssse3=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[static unsigned char be_mask[16] __attribute__ ((aligned (16))) = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; void a(void) { __asm__("pshufb %[mask], %%xmm2\n\t"::[mask]"m"(*be_mask):); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_ssse3=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_ssse3" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_SSSE3,1, [Defined if inline assembler supports SSSE3 instructions]) fi # # Check whether GCC inline assembler supports PCLMUL instructions. # AC_CACHE_CHECK([whether GCC inline assembler supports PCLMUL instructions], [gcry_cv_gcc_inline_asm_pclmul], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_pclmul="n/a" else gcry_cv_gcc_inline_asm_pclmul=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { __asm__("pclmulqdq \$0, %%xmm1, %%xmm3\n\t":::"cc"); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_pclmul=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_pclmul" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_PCLMUL,1, [Defined if inline assembler supports PCLMUL instructions]) fi # # Check whether GCC inline assembler supports SHA Extensions instructions. # AC_CACHE_CHECK([whether GCC inline assembler supports SHA Extensions instructions], [gcry_cv_gcc_inline_asm_shaext], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_shaext="n/a" else gcry_cv_gcc_inline_asm_shaext=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { __asm__("sha1rnds4 \$0, %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha1nexte %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha1msg1 %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha1msg2 %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha256rnds2 %%xmm0, %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha256msg1 %%xmm1, %%xmm3\n\t":::"cc"); __asm__("sha256msg2 %%xmm1, %%xmm3\n\t":::"cc"); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_shaext=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_shaext" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_SHAEXT,1, [Defined if inline assembler supports SHA Extensions instructions]) fi # # Check whether GCC inline assembler supports SSE4.1 instructions. 
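#
# Note on the inline-assembler capability checks above: the
# HAVE_GCC_INLINE_ASM_* macros record what the build compiler/assembler
# can emit; source files typically combine them with the
# ENABLE_*_SUPPORT options decided later in this script before compiling
# an accelerated code path.  Sketch only; USE_EXAMPLE_SHAEXT is an
# invented symbol:
#
#   #if defined(ENABLE_SHAEXT_SUPPORT) && defined(HAVE_GCC_INLINE_ASM_SHAEXT)
#   # define USE_EXAMPLE_SHAEXT 1
#   #endif
#
#   #ifdef USE_EXAMPLE_SHAEXT
#   /* SHA extension implementation would be compiled here.  */
#   #endif
#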
# AC_CACHE_CHECK([whether GCC inline assembler supports SSE4.1 instructions], [gcry_cv_gcc_inline_asm_sse41], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_sse41="n/a" else gcry_cv_gcc_inline_asm_sse41=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { int i; __asm__("pextrd \$2, %%xmm0, %[out]\n\t" : [out] "=m" (i)); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_sse41=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_sse41" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_SSE41,1, [Defined if inline assembler supports SSE4.1 instructions]) fi # # Check whether GCC inline assembler supports AVX instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AVX instructions], [gcry_cv_gcc_inline_asm_avx], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_avx="n/a" else gcry_cv_gcc_inline_asm_avx=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { __asm__("xgetbv; vaesdeclast (%[mem]),%%xmm0,%%xmm7\n\t"::[mem]"r"(0):); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_avx=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_avx" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AVX,1, [Defined if inline assembler supports AVX instructions]) fi # # Check whether GCC inline assembler supports AVX2 instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AVX2 instructions], [gcry_cv_gcc_inline_asm_avx2], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_avx2="n/a" else gcry_cv_gcc_inline_asm_avx2=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void a(void) { __asm__("xgetbv; vpbroadcastb %%xmm7,%%ymm1\n\t":::"cc"); }]], [ a(); ] )], [gcry_cv_gcc_inline_asm_avx2=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_avx2" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AVX2,1, [Defined if inline assembler supports AVX2 instructions]) fi # # Check whether GCC inline assembler supports BMI2 instructions # AC_CACHE_CHECK([whether GCC inline assembler supports BMI2 instructions], [gcry_cv_gcc_inline_asm_bmi2], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_bmi2="n/a" else gcry_cv_gcc_inline_asm_bmi2=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[unsigned int a(unsigned int x, unsigned int y) { unsigned int tmp1, tmp2; asm ("rorxl %2, %1, %0" : "=r" (tmp1) : "rm0" (x), "J" (32 - ((23) & 31))); asm ("andnl %2, %1, %0" : "=r" (tmp2) : "r0" (x), "rm" (y)); return tmp1 + tmp2; }]], [ a(1, 2); ] )], [gcry_cv_gcc_inline_asm_bmi2=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_bmi2" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_BMI2,1, [Defined if inline assembler supports BMI2 instructions]) fi # # Check whether GCC assembler needs "-Wa,--divide" to correctly handle # constant division # if test $amd64_as_feature_detection = yes; then AC_CACHE_CHECK([whether GCC assembler handles division correctly], [gcry_cv_gcc_as_const_division_ok], [gcry_cv_gcc_as_const_division_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__(".text\n\tfn:\n\t xorl \$(123456789/12345678), %ebp;\n\t");]], [fn();])], [gcry_cv_gcc_as_const_division_ok=yes])]) if test "$gcry_cv_gcc_as_const_division_ok" = "no" ; then # # Add '-Wa,--divide' to CPPFLAGS and try check again. 
# _gcc_cppflags_save="$CPPFLAGS" CPPFLAGS="$CPPFLAGS -Wa,--divide" AC_CACHE_CHECK([whether GCC assembler handles division correctly with "-Wa,--divide"], [gcry_cv_gcc_as_const_division_with_wadivide_ok], [gcry_cv_gcc_as_const_division_with_wadivide_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__(".text\n\tfn:\n\t xorl \$(123456789/12345678), %ebp;\n\t");]], [fn();])], [gcry_cv_gcc_as_const_division_with_wadivide_ok=yes])]) if test "$gcry_cv_gcc_as_const_division_with_wadivide_ok" = "no" ; then # '-Wa,--divide' did not work, restore old flags. CPPFLAGS="$_gcc_cppflags_save" fi fi fi # # Check whether GCC assembler supports features needed for our amd64 # implementations # if test $amd64_as_feature_detection = yes; then AC_CACHE_CHECK([whether GCC assembler is compatible for amd64 assembly implementations], [gcry_cv_gcc_amd64_platform_as_ok], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_amd64_platform_as_ok="n/a" else gcry_cv_gcc_amd64_platform_as_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( /* Test if '.type' and '.size' are supported. */ /* These work only on ELF targets. */ ".text\n\t" "asmfunc:\n\t" ".size asmfunc,.-asmfunc;\n\t" ".type asmfunc,@function;\n\t" /* Test if assembler allows use of '/' for constant division * (Solaris/x86 issue). If previous constant division check * and "-Wa,--divide" workaround failed, this causes assembly * to be disable on this machine. */ "xorl \$(123456789/12345678), %ebp;\n\t" );]], [ asmfunc(); ])], [gcry_cv_gcc_amd64_platform_as_ok=yes]) fi]) if test "$gcry_cv_gcc_amd64_platform_as_ok" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS,1, [Defined if underlying assembler is compatible with amd64 assembly implementations]) fi if test "$gcry_cv_gcc_amd64_platform_as_ok" = "no" && test "$gcry_cv_gcc_attribute_sysv_abi" = "yes" && test "$gcry_cv_gcc_default_abi_is_ms_abi" = "yes"; then AC_CACHE_CHECK([whether GCC assembler is compatible for WIN64 assembly implementations], [gcry_cv_gcc_win64_platform_as_ok], [gcry_cv_gcc_win64_platform_as_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".text\n\t" ".globl asmfunc\n\t" "asmfunc:\n\t" "xorq \$(1234), %rbp;\n\t" );]], [ asmfunc(); ])], [gcry_cv_gcc_win64_platform_as_ok=yes])]) if test "$gcry_cv_gcc_win64_platform_as_ok" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS,1, [Defined if underlying assembler is compatible with WIN64 assembly implementations]) fi fi fi # # Check whether GCC assembler supports features needed for assembly # implementations that use Intel syntax # AC_CACHE_CHECK([whether GCC assembler is compatible for Intel syntax assembly implementations], [gcry_cv_gcc_platform_as_ok_for_intel_syntax], [if test "$mpi_cpu_arch" != "x86" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_platform_as_ok_for_intel_syntax="n/a" else gcry_cv_gcc_platform_as_ok_for_intel_syntax=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".intel_syntax noprefix\n\t" ".text\n\t" "actest:\n\t" "pxor xmm1, xmm7;\n\t" - /* Intel syntax implementation also use GAS macros, so check - * for them here. 
*/ - "VAL_A = xmm4\n\t" - "VAL_B = xmm2\n\t" - ".macro SET_VAL_A p1\n\t" - " VAL_A = \\\\p1 \n\t" - ".endm\n\t" - ".macro SET_VAL_B p1\n\t" - " VAL_B = \\\\p1 \n\t" - ".endm\n\t" - "vmovdqa VAL_A, VAL_B;\n\t" - "SET_VAL_A eax\n\t" - "SET_VAL_B ebp\n\t" - "add VAL_A, VAL_B;\n\t" - "add VAL_B, 0b10101;\n\t" + "vperm2i128 ymm2, ymm3, ymm0, 1;\n\t" + "add eax, ebp;\n\t" + "rorx eax, ebp, 1;\n\t" + "sub eax, [esp + 4];\n\t" + "add dword ptr [esp + eax], 0b10101;\n\t" ".att_syntax prefix\n\t" );]], [ actest(); ])], [gcry_cv_gcc_platform_as_ok_for_intel_syntax=yes]) fi]) if test "$gcry_cv_gcc_platform_as_ok_for_intel_syntax" = "yes" ; then AC_DEFINE(HAVE_INTEL_SYNTAX_PLATFORM_AS,1, [Defined if underlying assembler is compatible with Intel syntax assembly implementations]) fi # # Check whether compiler is configured for ARMv6 or newer architecture # AC_CACHE_CHECK([whether compiler is configured for ARMv6 or newer architecture], [gcry_cv_cc_arm_arch_is_v6], [if test "$mpi_cpu_arch" != "arm" || test "$try_asm_modules" != "yes" ; then gcry_cv_cc_arm_arch_is_v6="n/a" else gcry_cv_cc_arm_arch_is_v6=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[ #if defined(__arm__) && \ ((defined(__ARM_ARCH) && __ARM_ARCH >= 6) \ || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \ || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) \ || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \ || defined(__ARM_ARCH_7EM__)) /* empty */ #else /* fail compile if not ARMv6. */ not_armv6 not_armv6 = (not_armv6)not_armv6; #endif ]])], [gcry_cv_cc_arm_arch_is_v6=yes]) fi]) if test "$gcry_cv_cc_arm_arch_is_v6" = "yes" ; then AC_DEFINE(HAVE_ARM_ARCH_V6,1, [Defined if ARM architecture is v6 or newer]) fi # # Check whether GCC inline assembler supports NEON instructions # AC_CACHE_CHECK([whether GCC inline assembler supports NEON instructions], [gcry_cv_gcc_inline_asm_neon], [if test "$mpi_cpu_arch" != "arm" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_neon="n/a" else gcry_cv_gcc_inline_asm_neon=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".syntax unified\n\t" ".arm\n\t" ".fpu neon\n\t" ".text\n\t" "testfn:\n\t" "vld1.64 {%q0-%q1}, [%r0]!;\n\t" "vrev64.8 %q0, %q3;\n\t" "vadd.u64 %q0, %q1;\n\t" "vadd.s64 %d3, %d2, %d3;\n\t" ); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_neon=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_neon" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_NEON,1, [Defined if inline assembler supports NEON instructions]) fi # # Check whether GCC inline assembler supports AArch32 Crypto Extension instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AArch32 Crypto Extension instructions], [gcry_cv_gcc_inline_asm_aarch32_crypto], [if test "$mpi_cpu_arch" != "arm" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_aarch32_crypto="n/a" else gcry_cv_gcc_inline_asm_aarch32_crypto=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".syntax unified\n\t" ".arch armv8-a\n\t" ".arm\n\t" ".fpu crypto-neon-fp-armv8\n\t" ".text\n\t" "testfn:\n\t" "sha1h.32 q0, q0;\n\t" "sha1c.32 q0, q0, q0;\n\t" "sha1p.32 q0, q0, q0;\n\t" "sha1su0.32 q0, q0, q0;\n\t" "sha1su1.32 q0, q0;\n\t" "sha256h.32 q0, q0, q0;\n\t" "sha256h2.32 q0, q0, q0;\n\t" "sha1p.32 q0, q0, q0;\n\t" "sha256su0.32 q0, q0;\n\t" "sha256su1.32 q0, q0, q15;\n\t" "aese.8 q0, q0;\n\t" "aesd.8 q0, q0;\n\t" "aesmc.8 q0, q0;\n\t" "aesimc.8 q0, q0;\n\t" "vmull.p64 q0, d0, d0;\n\t" ); ]], [ testfn(); ])], 
[gcry_cv_gcc_inline_asm_aarch32_crypto=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_aarch32_crypto" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO,1, [Defined if inline assembler supports AArch32 Crypto Extension instructions]) fi # # Check whether GCC inline assembler supports AArch64 NEON instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 NEON instructions], [gcry_cv_gcc_inline_asm_aarch64_neon], [if test "$mpi_cpu_arch" != "aarch64" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_aarch64_neon="n/a" else gcry_cv_gcc_inline_asm_aarch64_neon=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".cpu generic+simd\n\t" ".text\n\t" "testfn:\n\t" "mov w0, \#42;\n\t" "dup v0.8b, w0;\n\t" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b},[x0],\#32;\n\t" ); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_aarch64_neon=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_aarch64_neon" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH64_NEON,1, [Defined if inline assembler supports AArch64 NEON instructions]) fi # # Check whether GCC inline assembler supports AArch64 Crypto Extension instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 Crypto Extension instructions], [gcry_cv_gcc_inline_asm_aarch64_crypto], [if test "$mpi_cpu_arch" != "aarch64" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_aarch64_crypto="n/a" else gcry_cv_gcc_inline_asm_aarch64_crypto=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__( ".cpu generic+simd+crypto\n\t" ".text\n\t" "testfn:\n\t" "mov w0, \#42;\n\t" "dup v0.8b, w0;\n\t" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b},[x0],\#32;\n\t" "sha1h s0, s0;\n\t" "sha1c q0, s0, v0.4s;\n\t" "sha1p q0, s0, v0.4s;\n\t" "sha1su0 v0.4s, v0.4s, v0.4s;\n\t" "sha1su1 v0.4s, v0.4s;\n\t" "sha256h q0, q0, v0.4s;\n\t" "sha256h2 q0, q0, v0.4s;\n\t" "sha1p q0, s0, v0.4s;\n\t" "sha256su0 v0.4s, v0.4s;\n\t" "sha256su1 v0.4s, v0.4s, v31.4s;\n\t" "aese v0.16b, v0.16b;\n\t" "aesd v0.16b, v0.16b;\n\t" "aesmc v0.16b, v0.16b;\n\t" "aesimc v0.16b, v0.16b;\n\t" "pmull v0.1q, v0.1d, v31.1d;\n\t" "pmull2 v0.1q, v0.2d, v31.2d;\n\t" ); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_aarch64_crypto=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_aarch64_crypto" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO,1, [Defined if inline assembler supports AArch64 Crypto Extension instructions]) fi # # Check whether PowerPC AltiVec/VSX intrinsics # AC_CACHE_CHECK([whether compiler supports PowerPC AltiVec/VSX intrinsics], [gcry_cv_cc_ppc_altivec], [if test "$mpi_cpu_arch" != "ppc" || test "$try_asm_modules" != "yes" ; then gcry_cv_cc_ppc_altivec="n/a" else gcry_cv_cc_ppc_altivec=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[#include typedef vector unsigned char block; typedef vector unsigned int vecu32; block fn(block in) { block t = vec_perm (in, in, vec_vsx_ld (0, (unsigned char*)0)); vecu32 y = vec_vsx_ld (0, (unsigned int*)0); return vec_cipher_be (t, in) ^ (block)y; } ]])], [gcry_cv_cc_ppc_altivec=yes]) fi]) if test "$gcry_cv_cc_ppc_altivec" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_CC_PPC_ALTIVEC,1, [Defined if underlying compiler supports PowerPC AltiVec/VSX/crypto intrinsics]) fi _gcc_cflags_save=$CFLAGS CFLAGS="$CFLAGS -maltivec -mvsx -mcrypto" if test "$gcry_cv_cc_ppc_altivec" = "no" && test "$mpi_cpu_arch" = "ppc" && test "$try_asm_modules" == "yes" ; then AC_CACHE_CHECK([whether compiler supports PowerPC AltiVec/VSX/crypto intrinsics with extra GCC flags], [gcry_cv_cc_ppc_altivec_cflags], [gcry_cv_cc_ppc_altivec_cflags=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( 
[[#include typedef vector unsigned char block; typedef vector unsigned int vecu32; block fn(block in) { block t = vec_perm (in, in, vec_vsx_ld (0, (unsigned char*)0)); vecu32 y = vec_vsx_ld (0, (unsigned int*)0); return vec_cipher_be (t, in) ^ (block)y; }]])], [gcry_cv_cc_ppc_altivec_cflags=yes])]) if test "$gcry_cv_cc_ppc_altivec_cflags" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_CC_PPC_ALTIVEC,1, [Defined if underlying compiler supports PowerPC AltiVec/VSX/crypto intrinsics]) AC_DEFINE(HAVE_COMPATIBLE_CC_PPC_ALTIVEC_WITH_CFLAGS,1, [Defined if underlying compiler supports PowerPC AltiVec/VSX/crypto intrinsics with extra GCC flags]) fi fi AM_CONDITIONAL(ENABLE_PPC_VCRYPTO_EXTRA_CFLAGS, test "$gcry_cv_cc_ppc_altivec_cflags" = "yes") # Restore flags. CFLAGS=$_gcc_cflags_save; # # Check whether GCC inline assembler supports PowerPC AltiVec/VSX/crypto instructions # AC_CACHE_CHECK([whether GCC inline assembler supports PowerPC AltiVec/VSX/crypto instructions], [gcry_cv_gcc_inline_asm_ppc_altivec], [if test "$mpi_cpu_arch" != "ppc" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_ppc_altivec="n/a" else gcry_cv_gcc_inline_asm_ppc_altivec=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__(".globl testfn;\n" ".text\n\t" "testfn:\n" "stvx %v31,%r12,%r0;\n" "lvx %v20,%r12,%r0;\n" "vcipher %v0, %v1, %v22;\n" "lxvw4x %vs32, %r0, %r1;\n" "vadduwm %v0, %v1, %v22;\n" "vshasigmaw %v0, %v1, 0, 15;\n" "vshasigmad %v0, %v1, 0, 15;\n" "vpmsumd %v11, %v11, %v11;\n" ); ]], [ testfn(); ] )], [gcry_cv_gcc_inline_asm_ppc_altivec=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_ppc_altivec" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC,1, [Defined if inline assembler supports PowerPC AltiVec/VSX/crypto instructions]) fi # # Check whether GCC inline assembler supports PowerISA 3.00 instructions # AC_CACHE_CHECK([whether GCC inline assembler supports PowerISA 3.00 instructions], [gcry_cv_gcc_inline_asm_ppc_arch_3_00], [if test "$mpi_cpu_arch" != "ppc" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_ppc_arch_3_00="n/a" else gcry_cv_gcc_inline_asm_ppc_arch_3_00=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[__asm__(".text\n\t" ".globl testfn;\n" "testfn:\n" "stxvb16x %r1,%v12,%v30;\n" ); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_ppc_arch_3_00=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_ppc_arch_3_00" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_PPC_ARCH_3_00,1, [Defined if inline assembler supports PowerISA 3.00 instructions]) fi # # Check whether GCC inline assembler supports zSeries instructions # AC_CACHE_CHECK([whether GCC inline assembler supports zSeries instructions], [gcry_cv_gcc_inline_asm_s390x], [if test "$mpi_cpu_arch" != "s390x" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_s390x="n/a" else gcry_cv_gcc_inline_asm_s390x=no AC_LINK_IFELSE([AC_LANG_PROGRAM( [[typedef unsigned int u128_t __attribute__ ((mode (TI))); unsigned int testfunc(unsigned int x, void *y, unsigned int z) { unsigned long fac[8]; register unsigned long reg0 asm("0") = 0; register unsigned long reg1 asm("1") = x; u128_t r1 = ((u128_t)(unsigned long)y << 64) | (unsigned long)z; u128_t r2 = 0; u128_t r3 = 0; asm volatile (".insn rre,0xb92e << 16, %[r1], %[r2]\n\t" : [r1] "+a" (r1), [r2] "+a" (r2) : "r" (reg0), "r" (reg1) : "cc", "memory"); asm volatile (".insn rrf,0xb929 << 16, %[r1], %[r2], %[r3], 0\n\t" : [r1] "+a" (r1), [r2] "+a" (r2), [r3] "+a" (r3) : "r" (reg0), "r" (reg1) : "cc", "memory"); reg0 = 8 - 1; asm ("stfle %1\n\t" : "+d" (reg0), "=Q" (fac[0]) : : "cc", "memory"); asm volatile 
("mvc 0(16, %0), 0(%1)\n\t" : : "a" (y), "a" (fac) : "memory"); asm volatile ("xc 0(16, %0), 0(%0)\n\t" : : "a" (fac) : "memory"); asm volatile ("risbgn %%r11, %%r11, 0, 129, 0\n\t" : : : "memory", "r11"); asm volatile ("algrk %%r14, %%r14, %%r14\n\t" : : : "memory", "r14"); return (unsigned int)r1 ^ reg0; } ]] , [ testfunc(0, 0, 0); ])], [gcry_cv_gcc_inline_asm_s390x=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_s390x" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_S390X,1, [Defined if inline assembler supports zSeries instructions]) fi # # Check whether GCC inline assembler supports zSeries vector instructions # AC_CACHE_CHECK([whether GCC inline assembler supports zSeries vector instructions], [gcry_cv_gcc_inline_asm_s390x_vx], [if test "$mpi_cpu_arch" != "s390x" || test "$try_asm_modules" != "yes" ; then gcry_cv_gcc_inline_asm_s390x_vx="n/a" else gcry_cv_gcc_inline_asm_s390x_vx=no if test "$gcry_cv_gcc_inline_asm_s390x" = "yes" ; then AC_LINK_IFELSE([AC_LANG_PROGRAM( [[void testfunc(void) { asm volatile (".machine \"z13+vx\"\n\t" "vx %%v0, %%v1, %%v31\n\t" "verllf %%v11, %%v11, (16)(0)\n\t" : : : "memory"); } ]], [ testfunc(); ])], [gcry_cv_gcc_inline_asm_s390x_vx=yes]) fi fi]) if test "$gcry_cv_gcc_inline_asm_s390x_vx" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_S390X_VX,1, [Defined if inline assembler supports zSeries vector instructions]) fi ####################################### #### Checks for library functions. #### ####################################### AC_FUNC_VPRINTF # We have replacements for these in src/missing-string.c AC_CHECK_FUNCS(stpcpy strcasecmp) # We have replacements for these in src/g10lib.h AC_CHECK_FUNCS(strtoul memmove stricmp atexit raise) # Other checks AC_CHECK_FUNCS(strerror rand mmap getpagesize sysconf waitpid wait4) AC_CHECK_FUNCS(gettimeofday getrusage gethrtime clock_gettime syslog) AC_CHECK_FUNCS(syscall fcntl ftruncate flockfile getauxval elf_aux_info) AC_CHECK_FUNCS(explicit_bzero explicit_memset getentropy) GNUPG_CHECK_MLOCK # # Replacement functions. # AC_REPLACE_FUNCS([getpid clock]) # # Check whether it is necessary to link against libdl. # DL_LIBS="" if test "$use_hmac_binary_check" = yes ; then _gcry_save_libs="$LIBS" LIBS="" AC_SEARCH_LIBS(dlopen, c dl,,,) DL_LIBS=$LIBS LIBS="$_gcry_save_libs" fi AC_SUBST(DL_LIBS) # # Check whether we can use Linux capabilities as requested. # if test "$use_capabilities" = "yes" ; then use_capabilities=no AC_CHECK_HEADERS(sys/capability.h) if test "$ac_cv_header_sys_capability_h" = "yes" ; then AC_CHECK_LIB(cap, cap_init, ac_need_libcap=1) if test "$ac_cv_lib_cap_cap_init" = "yes"; then AC_DEFINE(USE_CAPABILITIES,1, [define if capabilities should be used]) LIBS="$LIBS -lcap" use_capabilities=yes fi fi if test "$use_capabilities" = "no" ; then AC_MSG_WARN([[ *** *** The use of capabilities on this system is not possible. *** You need a recent Linux kernel and some patches: *** fcaps-2.2.9-990610.patch (kernel patch for 2.2.9) *** fcap-module-990613.tar.gz (kernel module) *** libcap-1.92.tar.gz (user mode library and utilities) *** And you have to configure the kernel with CONFIG_VFS_CAP_PLUGIN *** set (filesystems menu). Be warned: This code is *really* ALPHA. ***]]) fi fi # Check whether a random device is available. 
if test "$try_dev_random" = yes ; then AC_CACHE_CHECK(for random device, ac_cv_have_dev_random, [if test -r "$NAME_OF_DEV_RANDOM" && test -r "$NAME_OF_DEV_URANDOM" ; then ac_cv_have_dev_random=yes; else ac_cv_have_dev_random=no; fi]) if test "$ac_cv_have_dev_random" = yes; then AC_DEFINE(HAVE_DEV_RANDOM,1, [defined if the system supports a random device] ) fi else AC_MSG_CHECKING(for random device) ac_cv_have_dev_random=no AC_MSG_RESULT(has been disabled) fi # Figure out the random modules for this configuration. if test "$random" = "default"; then # Select default value. if test "$ac_cv_have_dev_random" = yes; then # Try Linuxish random device. random_modules="linux" else case "${host}" in *-*-mingw32ce*) # WindowsCE random device. random_modules="w32ce" ;; *-*-mingw32*|*-*-cygwin*) # Windows random device. random_modules="w32" ;; *) # Build everything, allow to select at runtime. random_modules="$auto_random_modules" ;; esac fi else if test "$random" = "auto"; then # Build everything, allow to select at runtime. random_modules="$auto_random_modules" else random_modules="$random" fi fi # # Other defines # if test mym4_isgit = "yes"; then AC_DEFINE(IS_DEVELOPMENT_VERSION,1, [Defined if this is not a regular release]) fi AM_CONDITIONAL(CROSS_COMPILING, test x$cross_compiling = xyes) # This is handy for debugging so the compiler doesn't rearrange # things and eliminate variables. AC_ARG_ENABLE(optimization, AS_HELP_STRING([--disable-optimization], [disable compiler optimization]), [if test $enableval = no ; then CFLAGS=`echo $CFLAGS | sed 's/-O[[0-9]]//'` fi]) AC_MSG_NOTICE([checking for cc features]) # CFLAGS mangling when using gcc. if test "$GCC" = yes; then AC_MSG_CHECKING([if gcc supports -fno-delete-null-pointer-checks]) _gcc_cflags_save=$CFLAGS CFLAGS="-fno-delete-null-pointer-checks" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])],_gcc_wopt=yes,_gcc_wopt=no) AC_MSG_RESULT($_gcc_wopt) CFLAGS=$_gcc_cflags_save; if test x"$_gcc_wopt" = xyes ; then CFLAGS="$CFLAGS -fno-delete-null-pointer-checks" fi CFLAGS="$CFLAGS -Wall" if test "$USE_MAINTAINER_MODE" = "yes"; then CFLAGS="$CFLAGS -Wcast-align -Wshadow -Wstrict-prototypes" CFLAGS="$CFLAGS -Wformat -Wno-format-y2k -Wformat-security" # If -Wno-missing-field-initializers is supported we can enable a # a bunch of really useful warnings. AC_MSG_CHECKING([if gcc supports -Wno-missing-field-initializers]) _gcc_cflags_save=$CFLAGS CFLAGS="-Wno-missing-field-initializers" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])],_gcc_wopt=yes,_gcc_wopt=no) AC_MSG_RESULT($_gcc_wopt) CFLAGS=$_gcc_cflags_save; if test x"$_gcc_wopt" = xyes ; then CFLAGS="$CFLAGS -W -Wextra -Wbad-function-cast" CFLAGS="$CFLAGS -Wwrite-strings" CFLAGS="$CFLAGS -Wdeclaration-after-statement" CFLAGS="$CFLAGS -Wno-missing-field-initializers" CFLAGS="$CFLAGS -Wno-sign-compare" fi AC_MSG_CHECKING([if gcc supports -Wpointer-arith]) _gcc_cflags_save=$CFLAGS CFLAGS="-Wpointer-arith" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])],_gcc_wopt=yes,_gcc_wopt=no) AC_MSG_RESULT($_gcc_wopt) CFLAGS=$_gcc_cflags_save; if test x"$_gcc_wopt" = xyes ; then CFLAGS="$CFLAGS -Wpointer-arith" fi fi fi # Check whether as(1) supports a noeexecstack feature. This test # includes an override option. 
CL_AS_NOEXECSTACK AC_SUBST(LIBGCRYPT_CONFIG_API_VERSION) AC_SUBST(LIBGCRYPT_CONFIG_LIBS) AC_SUBST(LIBGCRYPT_CONFIG_CFLAGS) AC_SUBST(LIBGCRYPT_CONFIG_HOST) AC_SUBST(LIBGCRYPT_THREAD_MODULES) AC_CONFIG_COMMANDS([gcrypt-conf],[[ chmod +x src/libgcrypt-config ]],[[ prefix=$prefix exec_prefix=$exec_prefix libdir=$libdir datadir=$datadir DATADIRNAME=$DATADIRNAME ]]) ##################### #### Conclusion. #### ##################### # Check that requested features can actually be used and define # ENABLE_foo_SUPPORT macros. if test x"$aesnisupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_ssse3" != "yes" ; then aesnisupport="no (unsupported by compiler)" fi fi if test x"$shaextsupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_shaext" != "yes" ; then shaextsupport="no (unsupported by compiler)" fi fi if test x"$pclmulsupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_pclmul" != "yes" ; then pclmulsupport="no (unsupported by compiler)" fi fi if test x"$sse41support" = xyes ; then if test "$gcry_cv_gcc_inline_asm_sse41" != "yes" ; then sse41support="no (unsupported by compiler)" fi fi if test x"$avxsupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_avx" != "yes" ; then avxsupport="no (unsupported by compiler)" fi fi if test x"$avx2support" = xyes ; then if test "$gcry_cv_gcc_inline_asm_avx2" != "yes" ; then avx2support="no (unsupported by compiler)" fi fi if test x"$neonsupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_neon" != "yes" ; then if test "$gcry_cv_gcc_inline_asm_aarch64_neon" != "yes" ; then neonsupport="no (unsupported by compiler)" fi fi fi if test x"$armcryptosupport" = xyes ; then if test "$gcry_cv_gcc_inline_asm_aarch32_crypto" != "yes" ; then if test "$gcry_cv_gcc_inline_asm_aarch64_crypto" != "yes" ; then armcryptosupport="no (unsupported by compiler)" fi fi fi if test x"$aesnisupport" = xyes ; then AC_DEFINE(ENABLE_AESNI_SUPPORT, 1, [Enable support for Intel AES-NI instructions.]) fi if test x"$shaextsupport" = xyes ; then AC_DEFINE(ENABLE_SHAEXT_SUPPORT, 1, [Enable support for Intel SHAEXT instructions.]) fi if test x"$pclmulsupport" = xyes ; then AC_DEFINE(ENABLE_PCLMUL_SUPPORT, 1, [Enable support for Intel PCLMUL instructions.]) fi if test x"$sse41support" = xyes ; then AC_DEFINE(ENABLE_SSE41_SUPPORT, 1, [Enable support for Intel SSE4.1 instructions.]) fi if test x"$avxsupport" = xyes ; then AC_DEFINE(ENABLE_AVX_SUPPORT,1, [Enable support for Intel AVX instructions.]) fi if test x"$avx2support" = xyes ; then AC_DEFINE(ENABLE_AVX2_SUPPORT,1, [Enable support for Intel AVX2 instructions.]) fi if test x"$neonsupport" = xyes ; then AC_DEFINE(ENABLE_NEON_SUPPORT,1, [Enable support for ARM NEON instructions.]) fi if test x"$armcryptosupport" = xyes ; then AC_DEFINE(ENABLE_ARM_CRYPTO_SUPPORT,1, [Enable support for ARMv8 Crypto Extension instructions.]) fi if test x"$ppccryptosupport" = xyes ; then AC_DEFINE(ENABLE_PPC_CRYPTO_SUPPORT,1, [Enable support for POWER 8 (PowerISA 2.07) crypto extension.]) fi if test x"$jentsupport" = xyes ; then AC_DEFINE(ENABLE_JENT_SUPPORT, 1, [Enable support for the jitter entropy collector.]) fi if test x"$padlocksupport" = xyes ; then AC_DEFINE(ENABLE_PADLOCK_SUPPORT, 1, [Enable support for the PadLock engine.]) fi if test x"$drngsupport" = xyes ; then AC_DEFINE(ENABLE_DRNG_SUPPORT, 1, [Enable support for Intel DRNG (RDRAND instruction).]) fi if test x"$force_soft_hwfeatures" = xyes ; then AC_DEFINE(ENABLE_FORCE_SOFT_HWFEATURES, 1, [Enable forcing 'soft' HW feature bits on (for testing).]) fi # Define conditional sources and config.h
symbols depending on the # selected ciphers, pubkey-ciphers, digests, kdfs, and random modules. LIST_MEMBER(arcfour, $enabled_ciphers) if test "$found" = "1"; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS arcfour.lo" AC_DEFINE(USE_ARCFOUR, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS arcfour-amd64.lo" ;; esac fi LIST_MEMBER(blowfish, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS blowfish.lo" AC_DEFINE(USE_BLOWFISH, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS blowfish-amd64.lo" ;; arm*-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS blowfish-arm.lo" ;; esac fi LIST_MEMBER(cast5, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS cast5.lo" AC_DEFINE(USE_CAST5, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS cast5-amd64.lo" ;; arm*-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS cast5-arm.lo" ;; esac fi LIST_MEMBER(des, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS des.lo" AC_DEFINE(USE_DES, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS des-amd64.lo" ;; esac fi LIST_MEMBER(aes, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael.lo" AC_DEFINE(USE_AES, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-amd64.lo" # Build with the SSSE3 implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ssse3-amd64.lo" GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ssse3-amd64-asm.lo" ;; arm*-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-arm.lo" # Build with the ARMv8/AArch32 CE implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-armv8-ce.lo" GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-armv8-aarch32-ce.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-aarch64.lo" # Build with the ARMv8/AArch64 CE implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-armv8-ce.lo" GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-armv8-aarch64-ce.lo" ;; powerpc64le-*-*) # Build with the crypto extension implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ppc.lo" GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ppc9le.lo" ;; powerpc64-*-*) # Big-Endian. # Build with the crypto extension implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ppc.lo" ;; powerpc-*-*) # Big-Endian. # Build with the crypto extension implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ppc.lo" ;; s390x-*-*) # Big-Endian. 
# Build with the crypto extension implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-s390x.lo" ;; esac case "$mpi_cpu_arch" in x86) # Build with the AES-NI implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-aesni.lo" # Build with the Padlock implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-padlock.lo" ;; esac fi LIST_MEMBER(twofish, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish.lo" AC_DEFINE(USE_TWOFISH, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-amd64.lo" if test x"$avx2support" = xyes ; then # Build with the AVX2 implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-avx2-amd64.lo" fi ;; arm*-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-arm.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-aarch64.lo" ;; esac fi LIST_MEMBER(serpent, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS serpent.lo" AC_DEFINE(USE_SERPENT, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the SSE2 implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS serpent-sse2-amd64.lo" ;; esac if test x"$avx2support" = xyes ; then # Build with the AVX2 implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS serpent-avx2-amd64.lo" fi if test x"$neonsupport" = xyes ; then # Build with the NEON implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS serpent-armv7-neon.lo" fi fi LIST_MEMBER(rfc2268, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS rfc2268.lo" AC_DEFINE(USE_RFC2268, 1, [Defined if this module should be included]) fi LIST_MEMBER(seed, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS seed.lo" AC_DEFINE(USE_SEED, 1, [Defined if this module should be included]) fi LIST_MEMBER(camellia, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia.lo camellia-glue.lo" AC_DEFINE(USE_CAMELLIA, 1, [Defined if this module should be included]) case "${host}" in arm*-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia-arm.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia-aarch64.lo" ;; esac if test x"$avxsupport" = xyes ; then if test x"$aesnisupport" = xyes ; then # Build with the AES-NI/AVX implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia-aesni-avx-amd64.lo" fi fi if test x"$avx2support" = xyes ; then if test x"$aesnisupport" = xyes ; then # Build with the AES-NI/AVX2 implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia-aesni-avx2-amd64.lo" fi fi fi LIST_MEMBER(idea, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS idea.lo" AC_DEFINE(USE_IDEA, 1, [Defined if this module should be included]) fi LIST_MEMBER(salsa20, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS salsa20.lo" AC_DEFINE(USE_SALSA20, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS salsa20-amd64.lo" ;; esac if test x"$neonsupport" = xyes ; then # Build with the NEON implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS salsa20-armv7-neon.lo" fi fi LIST_MEMBER(gost28147, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS gost28147.lo" AC_DEFINE(USE_GOST28147, 1, [Defined if this module 
should be included]) fi LIST_MEMBER(chacha20, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20.lo" AC_DEFINE(USE_CHACHA20, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-amd64-ssse3.lo" GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-amd64-avx2.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-aarch64.lo" ;; powerpc64le-*-*) # Build with the ppc8 vector implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-ppc.lo" ;; powerpc64-*-*) # Build with the ppc8 vector implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-ppc.lo" ;; powerpc-*-*) # Build with the ppc8 vector implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-ppc.lo" ;; s390x-*-*) # Build with the s390x/zSeries vector implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-s390x.lo" ;; esac if test x"$neonsupport" = xyes ; then # Build with the NEON implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-armv7-neon.lo" fi fi LIST_MEMBER(sm4, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS sm4.lo" AC_DEFINE(USE_SM4, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS sm4-aesni-avx-amd64.lo" GCRYPT_CIPHERS="$GCRYPT_CIPHERS sm4-aesni-avx2-amd64.lo" ;; esac fi LIST_MEMBER(dsa, $enabled_pubkey_ciphers) if test "$found" = "1" ; then GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS dsa.lo" AC_DEFINE(USE_DSA, 1, [Defined if this module should be included]) fi LIST_MEMBER(rsa, $enabled_pubkey_ciphers) if test "$found" = "1" ; then GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS rsa.lo" AC_DEFINE(USE_RSA, 1, [Defined if this module should be included]) fi LIST_MEMBER(elgamal, $enabled_pubkey_ciphers) if test "$found" = "1" ; then GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS elgamal.lo" AC_DEFINE(USE_ELGAMAL, 1, [Defined if this module should be included]) fi LIST_MEMBER(ecc, $enabled_pubkey_ciphers) if test "$found" = "1" ; then GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS \ ecc.lo ecc-curves.lo ecc-misc.lo \ ecc-ecdh.lo ecc-ecdsa.lo ecc-eddsa.lo ecc-gost.lo \ ecc-sm2.lo" AC_DEFINE(USE_ECC, 1, [Defined if this module should be included]) fi LIST_MEMBER(crc, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc.lo" AC_DEFINE(USE_CRC, 1, [Defined if this module should be included]) case "${host}" in i?86-*-* | x86_64-*-*) # Build with the assembly implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc-intel-pclmul.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS crc-armv8-ce.lo" GCRYPT_CIPHERS="$GCRYPT_CIPHERS crc-armv8-aarch64-ce.lo" ;; powerpc64le-*-*) GCRYPT_CIPHERS="$GCRYPT_CIPHERS crc-ppc.lo" ;; powerpc64-*-*) GCRYPT_CIPHERS="$GCRYPT_CIPHERS crc-ppc.lo" ;; powerpc-*-*) GCRYPT_CIPHERS="$GCRYPT_CIPHERS crc-ppc.lo" ;; esac fi LIST_MEMBER(gostr3411-94, $enabled_digests) if test "$found" = "1" ; then # GOST R 34.11-94 internally uses GOST 28147-89 LIST_MEMBER(gost28147, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS gostr3411-94.lo" AC_DEFINE(USE_GOST_R_3411_94, 1, [Defined if this module should be included]) fi fi LIST_MEMBER(stribog, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS stribog.lo" AC_DEFINE(USE_GOST_R_3411_12, 1, [Defined if this module should be included]) fi 
LIST_MEMBER(md2, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS md2.lo" AC_DEFINE(USE_MD2, 1, [Defined if this module should be included]) fi LIST_MEMBER(md4, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS md4.lo" AC_DEFINE(USE_MD4, 1, [Defined if this module should be included]) fi LIST_MEMBER(md5, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS md5.lo" AC_DEFINE(USE_MD5, 1, [Defined if this module should be included]) fi LIST_MEMBER(rmd160, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS rmd160.lo" AC_DEFINE(USE_RMD160, 1, [Defined if this module should be included]) fi LIST_MEMBER(sha256, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256.lo" AC_DEFINE(USE_SHA256, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-ssse3-amd64.lo" GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-avx-amd64.lo" GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-avx2-bmi2-amd64.lo" ;; arm*-*-*) # Build with the assembly implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-armv8-aarch32-ce.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-armv8-aarch64-ce.lo" ;; powerpc64le-*-*) # Build with the crypto extension implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS sha256-ppc.lo" ;; powerpc64-*-*) # Big-Endian. # Build with the crypto extension implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS sha256-ppc.lo" ;; powerpc-*-*) # Big-Endian. # Build with the crypto extension implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS sha256-ppc.lo" esac case "$mpi_cpu_arch" in x86) # Build with the SHAEXT implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-intel-shaext.lo" ;; esac fi LIST_MEMBER(sha512, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512.lo" AC_DEFINE(USE_SHA512, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-ssse3-amd64.lo" GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-avx-amd64.lo" GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-avx2-bmi2-amd64.lo" ;; i?86-*-*) # Build with the assembly implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-ssse3-i386.lo" ;; arm*-*-*) # Build with the assembly implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-arm.lo" ;; powerpc64le-*-*) # Build with the crypto extension implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS sha512-ppc.lo" ;; powerpc64-*-*) # Big-Endian. # Build with the crypto extension implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS sha512-ppc.lo" ;; powerpc-*-*) # Big-Endian. 
# Build with the crypto extension implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS sha512-ppc.lo" esac if test x"$neonsupport" = xyes ; then # Build with the NEON implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-armv7-neon.lo" fi fi LIST_MEMBER(sha3, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS keccak.lo" AC_DEFINE(USE_SHA3, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation : ;; esac if test x"$neonsupport" = xyes ; then # Build with the NEON implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS keccak-armv7-neon.lo" fi fi LIST_MEMBER(tiger, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS tiger.lo" AC_DEFINE(USE_TIGER, 1, [Defined if this module should be included]) fi LIST_MEMBER(whirlpool, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS whirlpool.lo" AC_DEFINE(USE_WHIRLPOOL, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS whirlpool-sse2-amd64.lo" ;; esac fi LIST_MEMBER(blake2, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS blake2.lo" AC_DEFINE(USE_BLAKE2, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS blake2b-amd64-avx2.lo" GCRYPT_DIGESTS="$GCRYPT_DIGESTS blake2s-amd64-avx.lo" ;; esac fi # SHA-1 needs to be included always for example because it is used by # random-csprng.c. GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1.lo" AC_DEFINE(USE_SHA1, 1, [Defined if this module should be included]) case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-ssse3-amd64.lo" GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-avx-amd64.lo" GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-avx-bmi2-amd64.lo" GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-avx2-bmi2-amd64.lo" ;; arm*-*-*) # Build with the assembly implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-armv7-neon.lo" GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-armv8-aarch32-ce.lo" ;; aarch64-*-*) # Build with the assembly implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-armv8-aarch64-ce.lo" ;; esac case "$mpi_cpu_arch" in x86) # Build with the SHAEXT implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-intel-shaext.lo" ;; esac LIST_MEMBER(sm3, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS sm3.lo" AC_DEFINE(USE_SM3, 1, [Defined if this module should be included]) fi LIST_MEMBER(scrypt, $enabled_kdfs) if test "$found" = "1" ; then GCRYPT_KDFS="$GCRYPT_KDFS scrypt.lo" AC_DEFINE(USE_SCRYPT, 1, [Defined if this module should be included]) fi LIST_MEMBER(linux, $random_modules) if test "$found" = "1" ; then GCRYPT_RANDOM="$GCRYPT_RANDOM rndlinux.lo" AC_DEFINE(USE_RNDLINUX, 1, [Defined if the /dev/random RNG should be used.]) fi LIST_MEMBER(unix, $random_modules) if test "$found" = "1" ; then GCRYPT_RANDOM="$GCRYPT_RANDOM rndunix.lo" AC_DEFINE(USE_RNDUNIX, 1, [Defined if the default Unix RNG should be used.]) fi LIST_MEMBER(egd, $random_modules) if test "$found" = "1" ; then GCRYPT_RANDOM="$GCRYPT_RANDOM rndegd.lo" AC_DEFINE(USE_RNDEGD, 1, [Defined if the EGD based RNG should be used.]) fi LIST_MEMBER(w32, $random_modules) if test "$found" = "1" ; then GCRYPT_RANDOM="$GCRYPT_RANDOM rndw32.lo" AC_DEFINE(USE_RNDW32, 1, [Defined if the Windows specific RNG should be used.]) fi LIST_MEMBER(w32ce, $random_modules) if test 
"$found" = "1" ; then GCRYPT_RANDOM="$GCRYPT_RANDOM rndw32ce.lo" AC_DEFINE(USE_RNDW32CE, 1, [Defined if the WindowsCE specific RNG should be used.]) fi AC_SUBST([GCRYPT_CIPHERS]) AC_SUBST([GCRYPT_PUBKEY_CIPHERS]) AC_SUBST([GCRYPT_DIGESTS]) AC_SUBST([GCRYPT_KDFS]) AC_SUBST([GCRYPT_RANDOM]) AC_SUBST(LIBGCRYPT_CIPHERS, $enabled_ciphers) AC_SUBST(LIBGCRYPT_PUBKEY_CIPHERS, $enabled_pubkey_ciphers) AC_SUBST(LIBGCRYPT_DIGESTS, $enabled_digests) # For printing the configuration we need a colon separated list of # algorithm names. tmp=`echo "$enabled_ciphers" | tr ' ' : ` AC_DEFINE_UNQUOTED(LIBGCRYPT_CIPHERS, "$tmp", [List of available cipher algorithms]) tmp=`echo "$enabled_pubkey_ciphers" | tr ' ' : ` AC_DEFINE_UNQUOTED(LIBGCRYPT_PUBKEY_CIPHERS, "$tmp", [List of available public key cipher algorithms]) tmp=`echo "$enabled_digests" | tr ' ' : ` AC_DEFINE_UNQUOTED(LIBGCRYPT_DIGESTS, "$tmp", [List of available digest algorithms]) tmp=`echo "$enabled_kdfs" | tr ' ' : ` AC_DEFINE_UNQUOTED(LIBGCRYPT_KDFS, "$tmp", [List of available KDF algorithms]) # # Define conditional sources depending on the used hardware platform. # Note that all possible modules must also be listed in # src/Makefile.am (EXTRA_libgcrypt_la_SOURCES). # GCRYPT_HWF_MODULES= case "$mpi_cpu_arch" in x86) AC_DEFINE(HAVE_CPU_ARCH_X86, 1, [Defined for the x86 platforms]) GCRYPT_HWF_MODULES="libgcrypt_la-hwf-x86.lo" ;; alpha) AC_DEFINE(HAVE_CPU_ARCH_ALPHA, 1, [Defined for Alpha platforms]) ;; sparc) AC_DEFINE(HAVE_CPU_ARCH_SPARC, 1, [Defined for SPARC platforms]) ;; mips) AC_DEFINE(HAVE_CPU_ARCH_MIPS, 1, [Defined for MIPS platforms]) ;; m68k) AC_DEFINE(HAVE_CPU_ARCH_M68K, 1, [Defined for M68k platforms]) ;; ppc) AC_DEFINE(HAVE_CPU_ARCH_PPC, 1, [Defined for PPC platforms]) GCRYPT_HWF_MODULES="libgcrypt_la-hwf-ppc.lo" ;; arm) AC_DEFINE(HAVE_CPU_ARCH_ARM, 1, [Defined for ARM platforms]) GCRYPT_HWF_MODULES="libgcrypt_la-hwf-arm.lo" ;; aarch64) AC_DEFINE(HAVE_CPU_ARCH_ARM, 1, [Defined for ARM AArch64 platforms]) GCRYPT_HWF_MODULES="libgcrypt_la-hwf-arm.lo" ;; s390x) AC_DEFINE(HAVE_CPU_ARCH_S390X, 1, [Defined for s390x/zSeries platforms]) GCRYPT_HWF_MODULES="libgcrypt_la-hwf-s390x.lo" ;; esac AC_SUBST([GCRYPT_HWF_MODULES]) # # Option to disable building of doc file # build_doc=yes AC_ARG_ENABLE([doc], AS_HELP_STRING([--disable-doc], [do not build the documentation]), build_doc=$enableval, build_doc=yes) AM_CONDITIONAL([BUILD_DOC], [test "x$build_doc" != xno]) # # Provide information about the build. # BUILD_REVISION="mym4_revision" AC_SUBST(BUILD_REVISION) AC_DEFINE_UNQUOTED(BUILD_REVISION, "$BUILD_REVISION", [GIT commit id revision used to build this package]) changequote(,)dnl BUILD_VERSION=`echo "$PACKAGE_VERSION" | sed 's/\([0-9.]*\).*/\1./'` changequote([,])dnl BUILD_VERSION="${BUILD_VERSION}mym4_revision_dec" BUILD_FILEVERSION=`echo "${BUILD_VERSION}" | tr . ,` AC_SUBST(BUILD_VERSION) AC_SUBST(BUILD_FILEVERSION) AC_ARG_ENABLE([build-timestamp], AS_HELP_STRING([--enable-build-timestamp], [set an explicit build timestamp for reproducibility. (default is the current time in ISO-8601 format)]), [if test "$enableval" = "yes"; then BUILD_TIMESTAMP=`date -u +%Y-%m-%dT%H:%M+0000 2>/dev/null || date` else BUILD_TIMESTAMP="$enableval" fi], [BUILD_TIMESTAMP=""]) AC_SUBST(BUILD_TIMESTAMP) AC_DEFINE_UNQUOTED(BUILD_TIMESTAMP, "$BUILD_TIMESTAMP", [The time this package was configured for a build]) # And create the files. 
# And create the files.
AC_CONFIG_FILES([
Makefile
m4/Makefile
compat/Makefile
mpi/Makefile
cipher/Makefile
random/Makefile
doc/Makefile
src/Makefile
src/gcrypt.h
src/libgcrypt-config
src/libgcrypt.pc
src/versioninfo.rc
tests/Makefile
])
AC_CONFIG_FILES([tests/hashtest-256g], [chmod +x tests/hashtest-256g])
AC_CONFIG_FILES([tests/basic-disable-all-hwf], [chmod +x tests/basic-disable-all-hwf])
AC_OUTPUT

detection_module="${GCRYPT_HWF_MODULES%.lo}"
test -n "$detection_module" || detection_module="none"

# Give some feedback
GCRY_MSG_SHOW([],[])
GCRY_MSG_SHOW([Libgcrypt],[v${VERSION} has been configured as follows:])
GCRY_MSG_SHOW([],[])
GCRY_MSG_SHOW([Platform:                 ],[$PRINTABLE_OS_NAME ($host)])
GCRY_MSG_SHOW([Hardware detection module:],[$detection_module])
GCRY_MSG_WRAP([Enabled cipher algorithms:],[$enabled_ciphers])
GCRY_MSG_WRAP([Enabled digest algorithms:],[$enabled_digests])
GCRY_MSG_WRAP([Enabled kdf algorithms:   ],[$enabled_kdfs])
GCRY_MSG_WRAP([Enabled pubkey algorithms:],[$enabled_pubkey_ciphers])
GCRY_MSG_SHOW([Random number generator:  ],[$random])
GCRY_MSG_SHOW([Try using jitter entropy: ],[$jentsupport])
GCRY_MSG_SHOW([Using linux capabilities: ],[$use_capabilities])
GCRY_MSG_SHOW([Try using Padlock crypto: ],[$padlocksupport])
GCRY_MSG_SHOW([Try using AES-NI crypto:  ],[$aesnisupport])
GCRY_MSG_SHOW([Try using Intel SHAEXT:   ],[$shaextsupport])
GCRY_MSG_SHOW([Try using Intel PCLMUL:   ],[$pclmulsupport])
GCRY_MSG_SHOW([Try using Intel SSE4.1:   ],[$sse41support])
GCRY_MSG_SHOW([Try using DRNG (RDRAND):  ],[$drngsupport])
GCRY_MSG_SHOW([Try using Intel AVX:      ],[$avxsupport])
GCRY_MSG_SHOW([Try using Intel AVX2:     ],[$avx2support])
GCRY_MSG_SHOW([Try using ARM NEON:       ],[$neonsupport])
GCRY_MSG_SHOW([Try using ARMv8 crypto:   ],[$armcryptosupport])
GCRY_MSG_SHOW([Try using PPC crypto:     ],[$ppccryptosupport])
GCRY_MSG_SHOW([],[])

if test "x${gpg_config_script_warn}" != x; then
cat <