diff --git a/cipher/serpent-armv7-neon.S b/cipher/serpent-armv7-neon.S
index adff6394..4179ba2c 100644
--- a/cipher/serpent-armv7-neon.S
+++ b/cipher/serpent-armv7-neon.S
@@ -1,1124 +1,1180 @@
/* serpent-armv7-neon.S - ARM/NEON assembly implementation of Serpent cipher
 *
 * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include <config.h>

#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
    defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
    defined(HAVE_GCC_INLINE_ASM_NEON)

.text

.syntax unified
.fpu neon
.arm

/* ARM registers */
#define RROUND r0

/* NEON vector registers */
#define RA0 q0
#define RA1 q1
#define RA2 q2
#define RA3 q3
#define RA4 q4
#define RB0 q5
#define RB1 q6
#define RB2 q7
#define RB3 q8
#define RB4 q9
#define RT0 q10
#define RT1 q11
#define RT2 q12
#define RT3 q13

#define RA0d0 d0
#define RA0d1 d1
#define RA1d0 d2
#define RA1d1 d3
#define RA2d0 d4
#define RA2d1 d5
#define RA3d0 d6
#define RA3d1 d7
#define RA4d0 d8
#define RA4d1 d9
#define RB0d0 d10
#define RB0d1 d11
#define RB1d0 d12
#define RB1d1 d13
#define RB2d0 d14
#define RB2d1 d15
#define RB3d0 d16
#define RB3d1 d17
#define RB4d0 d18
#define RB4d1 d19
#define RT0d0 d20
#define RT0d1 d21
#define RT1d0 d22
#define RT1d1 d23
#define RT2d0 d24
#define RT2d1 d25

/**********************************************************************
  helper macros
 **********************************************************************/

#define transpose_4x4(_q0, _q1, _q2, _q3) \
	vtrn.32 _q0, _q1;	\
	vtrn.32 _q2, _q3;	\
	vswp _q0##d1, _q2##d0;	\
	vswp _q1##d1, _q3##d0;

/**********************************************************************
  8-way serpent
 **********************************************************************/

/*
 * These are the S-Boxes of Serpent from following research paper.
 *
 * D. A. Osvik, “Speeding up Serpent,” in Third AES Candidate Conference,
 * (New York, New York, USA), p. 317–329, National Institute of Standards and
 * Technology, 2000.
* * Paper is also available at: http://www.ii.uib.no/~osvik/pub/aes3.pdf * */ #define SBOX0(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ veor a3, a3, a0; veor b3, b3, b0; vmov a4, a1; vmov b4, b1; \ vand a1, a1, a3; vand b1, b1, b3; veor a4, a4, a2; veor b4, b4, b2; \ veor a1, a1, a0; veor b1, b1, b0; vorr a0, a0, a3; vorr b0, b0, b3; \ veor a0, a0, a4; veor b0, b0, b4; veor a4, a4, a3; veor b4, b4, b3; \ veor a3, a3, a2; veor b3, b3, b2; vorr a2, a2, a1; vorr b2, b2, b1; \ veor a2, a2, a4; veor b2, b2, b4; vmvn a4, a4; vmvn b4, b4; \ vorr a4, a4, a1; vorr b4, b4, b1; veor a1, a1, a3; veor b1, b1, b3; \ veor a1, a1, a4; veor b1, b1, b4; vorr a3, a3, a0; vorr b3, b3, b0; \ veor a1, a1, a3; veor b1, b1, b3; veor a4, a3; veor b4, b3; #define SBOX0_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ vmvn a2, a2; vmvn b2, b2; vmov a4, a1; vmov b4, b1; \ vorr a1, a1, a0; vorr b1, b1, b0; vmvn a4, a4; vmvn b4, b4; \ veor a1, a1, a2; veor b1, b1, b2; vorr a2, a2, a4; vorr b2, b2, b4; \ veor a1, a1, a3; veor b1, b1, b3; veor a0, a0, a4; veor b0, b0, b4; \ veor a2, a2, a0; veor b2, b2, b0; vand a0, a0, a3; vand b0, b0, b3; \ veor a4, a4, a0; veor b4, b4, b0; vorr a0, a0, a1; vorr b0, b0, b1; \ veor a0, a0, a2; veor b0, b0, b2; veor a3, a3, a4; veor b3, b3, b4; \ veor a2, a2, a1; veor b2, b2, b1; veor a3, a3, a0; veor b3, b3, b0; \ veor a3, a3, a1; veor b3, b3, b1;\ vand a2, a2, a3; vand b2, b2, b3;\ veor a4, a2; veor b4, b2; #define SBOX1(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ vmvn a0, a0; vmvn b0, b0; vmvn a2, a2; vmvn b2, b2; \ vmov a4, a0; vmov b4, b0; vand a0, a0, a1; vand b0, b0, b1; \ veor a2, a2, a0; veor b2, b2, b0; vorr a0, a0, a3; vorr b0, b0, b3; \ veor a3, a3, a2; veor b3, b3, b2; veor a1, a1, a0; veor b1, b1, b0; \ veor a0, a0, a4; veor b0, b0, b4; vorr a4, a4, a1; vorr b4, b4, b1; \ veor a1, a1, a3; veor b1, b1, b3; vorr a2, a2, a0; vorr b2, b2, b0; \ vand a2, a2, a4; vand b2, b2, b4; veor a0, a0, a1; veor b0, b0, b1; \ vand a1, a1, a2; vand b1, b1, b2;\ veor a1, a1, a0; veor b1, b1, b0; vand a0, a0, a2; vand b0, b0, b2; \ veor a0, a4; veor b0, b4; #define SBOX1_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ vmov a4, a1; vmov b4, b1; veor a1, a1, a3; veor b1, b1, b3; \ vand a3, a3, a1; vand b3, b3, b1; veor a4, a4, a2; veor b4, b4, b2; \ veor a3, a3, a0; veor b3, b3, b0; vorr a0, a0, a1; vorr b0, b0, b1; \ veor a2, a2, a3; veor b2, b2, b3; veor a0, a0, a4; veor b0, b0, b4; \ vorr a0, a0, a2; vorr b0, b0, b2; veor a1, a1, a3; veor b1, b1, b3; \ veor a0, a0, a1; veor b0, b0, b1; vorr a1, a1, a3; vorr b1, b1, b3; \ veor a1, a1, a0; veor b1, b1, b0; vmvn a4, a4; vmvn b4, b4; \ veor a4, a4, a1; veor b4, b4, b1; vorr a1, a1, a0; vorr b1, b1, b0; \ veor a1, a1, a0; veor b1, b1, b0;\ vorr a1, a1, a4; vorr b1, b1, b4;\ veor a3, a1; veor b3, b1; #define SBOX2(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ vmov a4, a0; vmov b4, b0; vand a0, a0, a2; vand b0, b0, b2; \ veor a0, a0, a3; veor b0, b0, b3; veor a2, a2, a1; veor b2, b2, b1; \ veor a2, a2, a0; veor b2, b2, b0; vorr a3, a3, a4; vorr b3, b3, b4; \ veor a3, a3, a1; veor b3, b3, b1; veor a4, a4, a2; veor b4, b4, b2; \ vmov a1, a3; vmov b1, b3; vorr a3, a3, a4; vorr b3, b3, b4; \ veor a3, a3, a0; veor b3, b3, b0; vand a0, a0, a1; vand b0, b0, b1; \ veor a4, a4, a0; veor b4, b4, b0; veor a1, a1, a3; veor b1, b1, b3; \ veor a1, a1, a4; veor b1, b1, b4; vmvn a4, a4; vmvn b4, b4; #define SBOX2_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ veor a2, a2, a3; veor b2, b2, b3; veor a3, a3, a0; veor b3, b3, b0; \ vmov a4, a3; vmov b4, b3; vand a3, a3, 
a2; vand b3, b3, b2; \ veor a3, a3, a1; veor b3, b3, b1; vorr a1, a1, a2; vorr b1, b1, b2; \ veor a1, a1, a4; veor b1, b1, b4; vand a4, a4, a3; vand b4, b4, b3; \ veor a2, a2, a3; veor b2, b2, b3; vand a4, a4, a0; vand b4, b4, b0; \ veor a4, a4, a2; veor b4, b4, b2; vand a2, a2, a1; vand b2, b2, b1; \ vorr a2, a2, a0; vorr b2, b2, b0; vmvn a3, a3; vmvn b3, b3; \ veor a2, a2, a3; veor b2, b2, b3; veor a0, a0, a3; veor b0, b0, b3; \ vand a0, a0, a1; vand b0, b0, b1; veor a3, a3, a4; veor b3, b3, b4; \ veor a3, a0; veor b3, b0; #define SBOX3(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ vmov a4, a0; vmov b4, b0; vorr a0, a0, a3; vorr b0, b0, b3; \ veor a3, a3, a1; veor b3, b3, b1; vand a1, a1, a4; vand b1, b1, b4; \ veor a4, a4, a2; veor b4, b4, b2; veor a2, a2, a3; veor b2, b2, b3; \ vand a3, a3, a0; vand b3, b3, b0; vorr a4, a4, a1; vorr b4, b4, b1; \ veor a3, a3, a4; veor b3, b3, b4; veor a0, a0, a1; veor b0, b0, b1; \ vand a4, a4, a0; vand b4, b4, b0; veor a1, a1, a3; veor b1, b1, b3; \ veor a4, a4, a2; veor b4, b4, b2; vorr a1, a1, a0; vorr b1, b1, b0; \ veor a1, a1, a2; veor b1, b1, b2; veor a0, a0, a3; veor b0, b0, b3; \ vmov a2, a1; vmov b2, b1; vorr a1, a1, a3; vorr b1, b1, b3; \ veor a1, a0; veor b1, b0; #define SBOX3_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ vmov a4, a2; vmov b4, b2; veor a2, a2, a1; veor b2, b2, b1; \ veor a0, a0, a2; veor b0, b0, b2; vand a4, a4, a2; vand b4, b4, b2; \ veor a4, a4, a0; veor b4, b4, b0; vand a0, a0, a1; vand b0, b0, b1; \ veor a1, a1, a3; veor b1, b1, b3; vorr a3, a3, a4; vorr b3, b3, b4; \ veor a2, a2, a3; veor b2, b2, b3; veor a0, a0, a3; veor b0, b0, b3; \ veor a1, a1, a4; veor b1, b1, b4; vand a3, a3, a2; vand b3, b3, b2; \ veor a3, a3, a1; veor b3, b3, b1; veor a1, a1, a0; veor b1, b1, b0; \ vorr a1, a1, a2; vorr b1, b1, b2; veor a0, a0, a3; veor b0, b0, b3; \ veor a1, a1, a4; veor b1, b1, b4;\ veor a0, a1; veor b0, b1; #define SBOX4(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ veor a1, a1, a3; veor b1, b1, b3; vmvn a3, a3; vmvn b3, b3; \ veor a2, a2, a3; veor b2, b2, b3; veor a3, a3, a0; veor b3, b3, b0; \ vmov a4, a1; vmov b4, b1; vand a1, a1, a3; vand b1, b1, b3; \ veor a1, a1, a2; veor b1, b1, b2; veor a4, a4, a3; veor b4, b4, b3; \ veor a0, a0, a4; veor b0, b0, b4; vand a2, a2, a4; vand b2, b2, b4; \ veor a2, a2, a0; veor b2, b2, b0; vand a0, a0, a1; vand b0, b0, b1; \ veor a3, a3, a0; veor b3, b3, b0; vorr a4, a4, a1; vorr b4, b4, b1; \ veor a4, a4, a0; veor b4, b4, b0; vorr a0, a0, a3; vorr b0, b0, b3; \ veor a0, a0, a2; veor b0, b0, b2; vand a2, a2, a3; vand b2, b2, b3; \ vmvn a0, a0; vmvn b0, b0; veor a4, a2; veor b4, b2; #define SBOX4_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ vmov a4, a2; vmov b4, b2; vand a2, a2, a3; vand b2, b2, b3; \ veor a2, a2, a1; veor b2, b2, b1; vorr a1, a1, a3; vorr b1, b1, b3; \ vand a1, a1, a0; vand b1, b1, b0; veor a4, a4, a2; veor b4, b4, b2; \ veor a4, a4, a1; veor b4, b4, b1; vand a1, a1, a2; vand b1, b1, b2; \ vmvn a0, a0; vmvn b0, b0; veor a3, a3, a4; veor b3, b3, b4; \ veor a1, a1, a3; veor b1, b1, b3; vand a3, a3, a0; vand b3, b3, b0; \ veor a3, a3, a2; veor b3, b3, b2; veor a0, a0, a1; veor b0, b0, b1; \ vand a2, a2, a0; vand b2, b2, b0; veor a3, a3, a0; veor b3, b3, b0; \ veor a2, a2, a4; veor b2, b2, b4;\ vorr a2, a2, a3; vorr b2, b2, b3; veor a3, a3, a0; veor b3, b3, b0; \ veor a2, a1; veor b2, b1; #define SBOX5(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ veor a0, a0, a1; veor b0, b0, b1; veor a1, a1, a3; veor b1, b1, b3; \ vmvn a3, a3; vmvn b3, b3; vmov a4, a1; vmov b4, b1; \ vand a1, 
a1, a0; vand b1, b1, b0; veor a2, a2, a3; veor b2, b2, b3; \ veor a1, a1, a2; veor b1, b1, b2; vorr a2, a2, a4; vorr b2, b2, b4; \ veor a4, a4, a3; veor b4, b4, b3; vand a3, a3, a1; vand b3, b3, b1; \ veor a3, a3, a0; veor b3, b3, b0; veor a4, a4, a1; veor b4, b4, b1; \ veor a4, a4, a2; veor b4, b4, b2; veor a2, a2, a0; veor b2, b2, b0; \ vand a0, a0, a3; vand b0, b0, b3; vmvn a2, a2; vmvn b2, b2; \ veor a0, a0, a4; veor b0, b0, b4; vorr a4, a4, a3; vorr b4, b4, b3; \ veor a2, a4; veor b2, b4; #define SBOX5_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ vmvn a1, a1; vmvn b1, b1; vmov a4, a3; vmov b4, b3; \ veor a2, a2, a1; veor b2, b2, b1; vorr a3, a3, a0; vorr b3, b3, b0; \ veor a3, a3, a2; veor b3, b3, b2; vorr a2, a2, a1; vorr b2, b2, b1; \ vand a2, a2, a0; vand b2, b2, b0; veor a4, a4, a3; veor b4, b4, b3; \ veor a2, a2, a4; veor b2, b2, b4; vorr a4, a4, a0; vorr b4, b4, b0; \ veor a4, a4, a1; veor b4, b4, b1; vand a1, a1, a2; vand b1, b1, b2; \ veor a1, a1, a3; veor b1, b1, b3; veor a4, a4, a2; veor b4, b4, b2; \ vand a3, a3, a4; vand b3, b3, b4; veor a4, a4, a1; veor b4, b4, b1; \ veor a3, a3, a4; veor b3, b3, b4; vmvn a4, a4; vmvn b4, b4; \ veor a3, a0; veor b3, b0; #define SBOX6(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ vmvn a2, a2; vmvn b2, b2; vmov a4, a3; vmov b4, b3; \ vand a3, a3, a0; vand b3, b3, b0; veor a0, a0, a4; veor b0, b0, b4; \ veor a3, a3, a2; veor b3, b3, b2; vorr a2, a2, a4; vorr b2, b2, b4; \ veor a1, a1, a3; veor b1, b1, b3; veor a2, a2, a0; veor b2, b2, b0; \ vorr a0, a0, a1; vorr b0, b0, b1; veor a2, a2, a1; veor b2, b2, b1; \ veor a4, a4, a0; veor b4, b4, b0; vorr a0, a0, a3; vorr b0, b0, b3; \ veor a0, a0, a2; veor b0, b0, b2; veor a4, a4, a3; veor b4, b4, b3; \ veor a4, a4, a0; veor b4, b4, b0; vmvn a3, a3; vmvn b3, b3; \ vand a2, a2, a4; vand b2, b2, b4;\ veor a2, a3; veor b2, b3; #define SBOX6_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ veor a0, a0, a2; veor b0, b0, b2; vmov a4, a2; vmov b4, b2; \ vand a2, a2, a0; vand b2, b2, b0; veor a4, a4, a3; veor b4, b4, b3; \ vmvn a2, a2; vmvn b2, b2; veor a3, a3, a1; veor b3, b3, b1; \ veor a2, a2, a3; veor b2, b2, b3; vorr a4, a4, a0; vorr b4, b4, b0; \ veor a0, a0, a2; veor b0, b0, b2; veor a3, a3, a4; veor b3, b3, b4; \ veor a4, a4, a1; veor b4, b4, b1; vand a1, a1, a3; vand b1, b1, b3; \ veor a1, a1, a0; veor b1, b1, b0; veor a0, a0, a3; veor b0, b0, b3; \ vorr a0, a0, a2; vorr b0, b0, b2; veor a3, a3, a1; veor b3, b3, b1; \ veor a4, a0; veor b4, b0; #define SBOX7(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ vmov a4, a1; vmov b4, b1; vorr a1, a1, a2; vorr b1, b1, b2; \ veor a1, a1, a3; veor b1, b1, b3; veor a4, a4, a2; veor b4, b4, b2; \ veor a2, a2, a1; veor b2, b2, b1; vorr a3, a3, a4; vorr b3, b3, b4; \ vand a3, a3, a0; vand b3, b3, b0; veor a4, a4, a2; veor b4, b4, b2; \ veor a3, a3, a1; veor b3, b3, b1; vorr a1, a1, a4; vorr b1, b1, b4; \ veor a1, a1, a0; veor b1, b1, b0; vorr a0, a0, a4; vorr b0, b0, b4; \ veor a0, a0, a2; veor b0, b0, b2; veor a1, a1, a4; veor b1, b1, b4; \ veor a2, a2, a1; veor b2, b2, b1; vand a1, a1, a0; vand b1, b1, b0; \ veor a1, a1, a4; veor b1, b1, b4; vmvn a2, a2; vmvn b2, b2; \ vorr a2, a2, a0; vorr b2, b2, b0;\ veor a4, a2; veor b4, b2; #define SBOX7_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ vmov a4, a2; vmov b4, b2; veor a2, a2, a0; veor b2, b2, b0; \ vand a0, a0, a3; vand b0, b0, b3; vorr a4, a4, a3; vorr b4, b4, b3; \ vmvn a2, a2; vmvn b2, b2; veor a3, a3, a1; veor b3, b3, b1; \ vorr a1, a1, a0; vorr b1, b1, b0; veor a0, a0, a2; veor b0, b0, b2; \ vand a2, a2, 
a4; vand b2, b2, b4; vand a3, a3, a4; vand b3, b3, b4; \ veor a1, a1, a2; veor b1, b1, b2; veor a2, a2, a0; veor b2, b2, b0; \ vorr a0, a0, a2; vorr b0, b0, b2; veor a4, a4, a1; veor b4, b4, b1; \ veor a0, a0, a3; veor b0, b0, b3; veor a3, a3, a4; veor b3, b3, b4; \ vorr a4, a4, a0; vorr b4, b4, b0; veor a3, a3, a2; veor b3, b3, b2; \ veor a4, a2; veor b4, b2; /* Apply SBOX number WHICH to to the block. */ #define SBOX(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ SBOX##which (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) /* Apply inverse SBOX number WHICH to to the block. */ #define SBOX_INVERSE(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ SBOX##which##_INVERSE (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) /* XOR round key into block state in a0,a1,a2,a3. a4 used as temporary. */ #define BLOCK_XOR_KEY(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ vdup.32 RT3, RT0d0[0]; \ vdup.32 RT1, RT0d0[1]; \ vdup.32 RT2, RT0d1[0]; \ vdup.32 RT0, RT0d1[1]; \ veor a0, a0, RT3; veor b0, b0, RT3; \ veor a1, a1, RT1; veor b1, b1, RT1; \ veor a2, a2, RT2; veor b2, b2, RT2; \ veor a3, a3, RT0; veor b3, b3, RT0; #define BLOCK_LOAD_KEY_ENC() \ vld1.8 {RT0d0, RT0d1}, [RROUND]!; #define BLOCK_LOAD_KEY_DEC() \ vld1.8 {RT0d0, RT0d1}, [RROUND]; \ sub RROUND, RROUND, #16 /* Apply the linear transformation to BLOCK. */ #define LINEAR_TRANSFORMATION(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ vshl.u32 a4, a0, #13; vshl.u32 b4, b0, #13; \ vshr.u32 a0, a0, #(32-13); vshr.u32 b0, b0, #(32-13); \ veor a0, a0, a4; veor b0, b0, b4; \ vshl.u32 a4, a2, #3; vshl.u32 b4, b2, #3; \ vshr.u32 a2, a2, #(32-3); vshr.u32 b2, b2, #(32-3); \ veor a2, a2, a4; veor b2, b2, b4; \ veor a1, a0, a1; veor b1, b0, b1; \ veor a1, a2, a1; veor b1, b2, b1; \ vshl.u32 a4, a0, #3; vshl.u32 b4, b0, #3; \ veor a3, a2, a3; veor b3, b2, b3; \ veor a3, a4, a3; veor b3, b4, b3; \ vshl.u32 a4, a1, #1; vshl.u32 b4, b1, #1; \ vshr.u32 a1, a1, #(32-1); vshr.u32 b1, b1, #(32-1); \ veor a1, a1, a4; veor b1, b1, b4; \ vshl.u32 a4, a3, #7; vshl.u32 b4, b3, #7; \ vshr.u32 a3, a3, #(32-7); vshr.u32 b3, b3, #(32-7); \ veor a3, a3, a4; veor b3, b3, b4; \ veor a0, a1, a0; veor b0, b1, b0; \ veor a0, a3, a0; veor b0, b3, b0; \ vshl.u32 a4, a1, #7; vshl.u32 b4, b1, #7; \ veor a2, a3, a2; veor b2, b3, b2; \ veor a2, a4, a2; veor b2, b4, b2; \ vshl.u32 a4, a0, #5; vshl.u32 b4, b0, #5; \ vshr.u32 a0, a0, #(32-5); vshr.u32 b0, b0, #(32-5); \ veor a0, a0, a4; veor b0, b0, b4; \ vshl.u32 a4, a2, #22; vshl.u32 b4, b2, #22; \ vshr.u32 a2, a2, #(32-22); vshr.u32 b2, b2, #(32-22); \ veor a2, a2, a4; veor b2, b2, b4; /* Apply the inverse linear transformation to BLOCK. 
*/ #define LINEAR_TRANSFORMATION_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ vshr.u32 a4, a2, #22; vshr.u32 b4, b2, #22; \ vshl.u32 a2, a2, #(32-22); vshl.u32 b2, b2, #(32-22); \ veor a2, a2, a4; veor b2, b2, b4; \ vshr.u32 a4, a0, #5; vshr.u32 b4, b0, #5; \ vshl.u32 a0, a0, #(32-5); vshl.u32 b0, b0, #(32-5); \ veor a0, a0, a4; veor b0, b0, b4; \ vshl.u32 a4, a1, #7; vshl.u32 b4, b1, #7; \ veor a2, a3, a2; veor b2, b3, b2; \ veor a2, a4, a2; veor b2, b4, b2; \ veor a0, a1, a0; veor b0, b1, b0; \ veor a0, a3, a0; veor b0, b3, b0; \ vshr.u32 a4, a3, #7; vshr.u32 b4, b3, #7; \ vshl.u32 a3, a3, #(32-7); vshl.u32 b3, b3, #(32-7); \ veor a3, a3, a4; veor b3, b3, b4; \ vshr.u32 a4, a1, #1; vshr.u32 b4, b1, #1; \ vshl.u32 a1, a1, #(32-1); vshl.u32 b1, b1, #(32-1); \ veor a1, a1, a4; veor b1, b1, b4; \ vshl.u32 a4, a0, #3; vshl.u32 b4, b0, #3; \ veor a3, a2, a3; veor b3, b2, b3; \ veor a3, a4, a3; veor b3, b4, b3; \ veor a1, a0, a1; veor b1, b0, b1; \ veor a1, a2, a1; veor b1, b2, b1; \ vshr.u32 a4, a2, #3; vshr.u32 b4, b2, #3; \ vshl.u32 a2, a2, #(32-3); vshl.u32 b2, b2, #(32-3); \ veor a2, a2, a4; veor b2, b2, b4; \ vshr.u32 a4, a0, #13; vshr.u32 b4, b0, #13; \ vshl.u32 a0, a0, #(32-13); vshl.u32 b0, b0, #(32-13); \ veor a0, a0, a4; veor b0, b0, b4; /* Apply a Serpent round to eight parallel blocks. This macro increments `round'. */ #define ROUND(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \ BLOCK_LOAD_KEY_ENC (); \ SBOX (which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \ LINEAR_TRANSFORMATION (na0, na1, na2, na3, na4, nb0, nb1, nb2, nb3, nb4); /* Apply the last Serpent round to eight parallel blocks. This macro increments `round'. */ #define ROUND_LAST(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \ BLOCK_LOAD_KEY_ENC (); \ SBOX (which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, nb0, nb1, nb2, nb3, nb4); /* Apply an inverse Serpent round to eight parallel blocks. This macro increments `round'. */ #define ROUND_INVERSE(round, which, a0, a1, a2, a3, a4, \ na0, na1, na2, na3, na4, \ b0, b1, b2, b3, b4, \ nb0, nb1, nb2, nb3, nb4) \ LINEAR_TRANSFORMATION_INVERSE (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \ SBOX_INVERSE (which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, nb0, nb1, nb2, nb3, nb4); \ BLOCK_LOAD_KEY_DEC (); /* Apply the first inverse Serpent round to eight parallel blocks. This macro increments `round'. 
*/ #define ROUND_FIRST_INVERSE(round, which, a0, a1, a2, a3, a4, \ na0, na1, na2, na3, na4, \ b0, b1, b2, b3, b4, \ nb0, nb1, nb2, nb3, nb4) \ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \ BLOCK_LOAD_KEY_DEC (); \ SBOX_INVERSE (which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, nb0, nb1, nb2, nb3, nb4); \ BLOCK_LOAD_KEY_DEC (); .align 3 .type __serpent_enc_blk8,%function; __serpent_enc_blk8: /* input: * r0: round key pointer * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext * blocks * output: * RA4, RA1, RA2, RA0, RB4, RB1, RB2, RB0: eight parallel * ciphertext blocks */ transpose_4x4(RA0, RA1, RA2, RA3); BLOCK_LOAD_KEY_ENC (); transpose_4x4(RB0, RB1, RB2, RB3); ROUND (0, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3, RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3); ROUND (1, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3, RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3); ROUND (2, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2, RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2); ROUND (3, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0, RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0); ROUND (4, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3, RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3); ROUND (5, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3, RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3); ROUND (6, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4, RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4); ROUND (7, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3, RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3); ROUND (8, 0, RA4, RA1, RA2, RA0, RA3, RA1, RA3, RA2, RA4, RA0, RB4, RB1, RB2, RB0, RB3, RB1, RB3, RB2, RB4, RB0); ROUND (9, 1, RA1, RA3, RA2, RA4, RA0, RA2, RA1, RA4, RA3, RA0, RB1, RB3, RB2, RB4, RB0, RB2, RB1, RB4, RB3, RB0); ROUND (10, 2, RA2, RA1, RA4, RA3, RA0, RA4, RA3, RA1, RA0, RA2, RB2, RB1, RB4, RB3, RB0, RB4, RB3, RB1, RB0, RB2); ROUND (11, 3, RA4, RA3, RA1, RA0, RA2, RA3, RA1, RA0, RA2, RA4, RB4, RB3, RB1, RB0, RB2, RB3, RB1, RB0, RB2, RB4); ROUND (12, 4, RA3, RA1, RA0, RA2, RA4, RA1, RA4, RA3, RA2, RA0, RB3, RB1, RB0, RB2, RB4, RB1, RB4, RB3, RB2, RB0); ROUND (13, 5, RA1, RA4, RA3, RA2, RA0, RA4, RA2, RA1, RA3, RA0, RB1, RB4, RB3, RB2, RB0, RB4, RB2, RB1, RB3, RB0); ROUND (14, 6, RA4, RA2, RA1, RA3, RA0, RA4, RA2, RA0, RA1, RA3, RB4, RB2, RB1, RB3, RB0, RB4, RB2, RB0, RB1, RB3); ROUND (15, 7, RA4, RA2, RA0, RA1, RA3, RA3, RA1, RA2, RA4, RA0, RB4, RB2, RB0, RB1, RB3, RB3, RB1, RB2, RB4, RB0); ROUND (16, 0, RA3, RA1, RA2, RA4, RA0, RA1, RA0, RA2, RA3, RA4, RB3, RB1, RB2, RB4, RB0, RB1, RB0, RB2, RB3, RB4); ROUND (17, 1, RA1, RA0, RA2, RA3, RA4, RA2, RA1, RA3, RA0, RA4, RB1, RB0, RB2, RB3, RB4, RB2, RB1, RB3, RB0, RB4); ROUND (18, 2, RA2, RA1, RA3, RA0, RA4, RA3, RA0, RA1, RA4, RA2, RB2, RB1, RB3, RB0, RB4, RB3, RB0, RB1, RB4, RB2); ROUND (19, 3, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA4, RA2, RA3, RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB4, RB2, RB3); ROUND (20, 4, RA0, RA1, RA4, RA2, RA3, RA1, RA3, RA0, RA2, RA4, RB0, RB1, RB4, RB2, RB3, RB1, RB3, RB0, RB2, RB4); ROUND (21, 5, RA1, RA3, RA0, RA2, RA4, RA3, RA2, RA1, RA0, RA4, RB1, RB3, RB0, RB2, RB4, RB3, RB2, RB1, RB0, RB4); ROUND (22, 6, RA3, RA2, RA1, RA0, RA4, RA3, RA2, RA4, RA1, RA0, RB3, RB2, RB1, RB0, RB4, RB3, RB2, RB4, RB1, RB0); ROUND (23, 7, RA3, RA2, RA4, RA1, RA0, RA0, RA1, RA2, RA3, RA4, RB3, RB2, RB4, RB1, RB0, RB0, RB1, RB2, RB3, RB4); ROUND (24, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3, 
RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3); ROUND (25, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3, RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3); ROUND (26, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2, RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2); ROUND (27, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0, RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0); ROUND (28, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3, RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3); ROUND (29, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3, RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3); ROUND (30, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4, RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4); ROUND_LAST (31, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3, RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3); transpose_4x4(RA4, RA1, RA2, RA0); transpose_4x4(RB4, RB1, RB2, RB0); bx lr; .size __serpent_enc_blk8,.-__serpent_enc_blk8; .align 3 .type __serpent_dec_blk8,%function; __serpent_dec_blk8: /* input: * r0: round key pointer * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel * ciphertext blocks * output: * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext * blocks */ add RROUND, RROUND, #(32*16); transpose_4x4(RA0, RA1, RA2, RA3); BLOCK_LOAD_KEY_DEC (); transpose_4x4(RB0, RB1, RB2, RB3); ROUND_FIRST_INVERSE (31, 7, RA0, RA1, RA2, RA3, RA4, RA3, RA0, RA1, RA4, RA2, RB0, RB1, RB2, RB3, RB4, RB3, RB0, RB1, RB4, RB2); ROUND_INVERSE (30, 6, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA2, RA4, RA3, RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB2, RB4, RB3); ROUND_INVERSE (29, 5, RA0, RA1, RA2, RA4, RA3, RA1, RA3, RA4, RA2, RA0, RB0, RB1, RB2, RB4, RB3, RB1, RB3, RB4, RB2, RB0); ROUND_INVERSE (28, 4, RA1, RA3, RA4, RA2, RA0, RA1, RA2, RA4, RA0, RA3, RB1, RB3, RB4, RB2, RB0, RB1, RB2, RB4, RB0, RB3); ROUND_INVERSE (27, 3, RA1, RA2, RA4, RA0, RA3, RA4, RA2, RA0, RA1, RA3, RB1, RB2, RB4, RB0, RB3, RB4, RB2, RB0, RB1, RB3); ROUND_INVERSE (26, 2, RA4, RA2, RA0, RA1, RA3, RA2, RA3, RA0, RA1, RA4, RB4, RB2, RB0, RB1, RB3, RB2, RB3, RB0, RB1, RB4); ROUND_INVERSE (25, 1, RA2, RA3, RA0, RA1, RA4, RA4, RA2, RA1, RA0, RA3, RB2, RB3, RB0, RB1, RB4, RB4, RB2, RB1, RB0, RB3); ROUND_INVERSE (24, 0, RA4, RA2, RA1, RA0, RA3, RA4, RA3, RA2, RA0, RA1, RB4, RB2, RB1, RB0, RB3, RB4, RB3, RB2, RB0, RB1); ROUND_INVERSE (23, 7, RA4, RA3, RA2, RA0, RA1, RA0, RA4, RA3, RA1, RA2, RB4, RB3, RB2, RB0, RB1, RB0, RB4, RB3, RB1, RB2); ROUND_INVERSE (22, 6, RA0, RA4, RA3, RA1, RA2, RA4, RA3, RA2, RA1, RA0, RB0, RB4, RB3, RB1, RB2, RB4, RB3, RB2, RB1, RB0); ROUND_INVERSE (21, 5, RA4, RA3, RA2, RA1, RA0, RA3, RA0, RA1, RA2, RA4, RB4, RB3, RB2, RB1, RB0, RB3, RB0, RB1, RB2, RB4); ROUND_INVERSE (20, 4, RA3, RA0, RA1, RA2, RA4, RA3, RA2, RA1, RA4, RA0, RB3, RB0, RB1, RB2, RB4, RB3, RB2, RB1, RB4, RB0); ROUND_INVERSE (19, 3, RA3, RA2, RA1, RA4, RA0, RA1, RA2, RA4, RA3, RA0, RB3, RB2, RB1, RB4, RB0, RB1, RB2, RB4, RB3, RB0); ROUND_INVERSE (18, 2, RA1, RA2, RA4, RA3, RA0, RA2, RA0, RA4, RA3, RA1, RB1, RB2, RB4, RB3, RB0, RB2, RB0, RB4, RB3, RB1); ROUND_INVERSE (17, 1, RA2, RA0, RA4, RA3, RA1, RA1, RA2, RA3, RA4, RA0, RB2, RB0, RB4, RB3, RB1, RB1, RB2, RB3, RB4, RB0); ROUND_INVERSE (16, 0, RA1, RA2, RA3, RA4, RA0, RA1, RA0, RA2, RA4, RA3, RB1, RB2, RB3, RB4, RB0, RB1, RB0, RB2, RB4, RB3); ROUND_INVERSE (15, 7, RA1, RA0, RA2, RA4, RA3, RA4, RA1, RA0, RA3, RA2, RB1, RB0, RB2, RB4, RB3, RB4, RB1, RB0, RB3, RB2); ROUND_INVERSE (14, 6, RA4, RA1, RA0, RA3, RA2, RA1, RA0, RA2, RA3, RA4, RB4, RB1, 
RB0, RB3, RB2, RB1, RB0, RB2, RB3, RB4); ROUND_INVERSE (13, 5, RA1, RA0, RA2, RA3, RA4, RA0, RA4, RA3, RA2, RA1, RB1, RB0, RB2, RB3, RB4, RB0, RB4, RB3, RB2, RB1); ROUND_INVERSE (12, 4, RA0, RA4, RA3, RA2, RA1, RA0, RA2, RA3, RA1, RA4, RB0, RB4, RB3, RB2, RB1, RB0, RB2, RB3, RB1, RB4); ROUND_INVERSE (11, 3, RA0, RA2, RA3, RA1, RA4, RA3, RA2, RA1, RA0, RA4, RB0, RB2, RB3, RB1, RB4, RB3, RB2, RB1, RB0, RB4); ROUND_INVERSE (10, 2, RA3, RA2, RA1, RA0, RA4, RA2, RA4, RA1, RA0, RA3, RB3, RB2, RB1, RB0, RB4, RB2, RB4, RB1, RB0, RB3); ROUND_INVERSE (9, 1, RA2, RA4, RA1, RA0, RA3, RA3, RA2, RA0, RA1, RA4, RB2, RB4, RB1, RB0, RB3, RB3, RB2, RB0, RB1, RB4); ROUND_INVERSE (8, 0, RA3, RA2, RA0, RA1, RA4, RA3, RA4, RA2, RA1, RA0, RB3, RB2, RB0, RB1, RB4, RB3, RB4, RB2, RB1, RB0); ROUND_INVERSE (7, 7, RA3, RA4, RA2, RA1, RA0, RA1, RA3, RA4, RA0, RA2, RB3, RB4, RB2, RB1, RB0, RB1, RB3, RB4, RB0, RB2); ROUND_INVERSE (6, 6, RA1, RA3, RA4, RA0, RA2, RA3, RA4, RA2, RA0, RA1, RB1, RB3, RB4, RB0, RB2, RB3, RB4, RB2, RB0, RB1); ROUND_INVERSE (5, 5, RA3, RA4, RA2, RA0, RA1, RA4, RA1, RA0, RA2, RA3, RB3, RB4, RB2, RB0, RB1, RB4, RB1, RB0, RB2, RB3); ROUND_INVERSE (4, 4, RA4, RA1, RA0, RA2, RA3, RA4, RA2, RA0, RA3, RA1, RB4, RB1, RB0, RB2, RB3, RB4, RB2, RB0, RB3, RB1); ROUND_INVERSE (3, 3, RA4, RA2, RA0, RA3, RA1, RA0, RA2, RA3, RA4, RA1, RB4, RB2, RB0, RB3, RB1, RB0, RB2, RB3, RB4, RB1); ROUND_INVERSE (2, 2, RA0, RA2, RA3, RA4, RA1, RA2, RA1, RA3, RA4, RA0, RB0, RB2, RB3, RB4, RB1, RB2, RB1, RB3, RB4, RB0); ROUND_INVERSE (1, 1, RA2, RA1, RA3, RA4, RA0, RA0, RA2, RA4, RA3, RA1, RB2, RB1, RB3, RB4, RB0, RB0, RB2, RB4, RB3, RB1); ROUND_INVERSE (0, 0, RA0, RA2, RA4, RA3, RA1, RA0, RA1, RA2, RA3, RA4, RB0, RB2, RB4, RB3, RB1, RB0, RB1, RB2, RB3, RB4); transpose_4x4(RA0, RA1, RA2, RA3); transpose_4x4(RB0, RB1, RB2, RB3); bx lr; .size __serpent_dec_blk8,.-__serpent_dec_blk8; +.align 3 +.globl _gcry_serpent_neon_blk8 +.type _gcry_serpent_neon_blk8,%function; +_gcry_serpent_neon_blk8: + /* input: + * r0: ctx, CTX + * r1: dst (8 blocks) + * r2: src (8 blocks) + * r3: encrypt + */ + + push {lr}; + vpush {RA4-RB2}; + + cmp r3, #0 + + vld1.8 {RA0, RA1}, [r2]!; + vld1.8 {RA2, RA3}, [r2]!; + vld1.8 {RB0, RB1}, [r2]!; + vld1.8 {RB2, RB3}, [r2]!; + + beq .Lblk8_dec; + bl __serpent_enc_blk8; + vst1.8 {RA4}, [r1]!; + vst1.8 {RA1, RA2}, [r1]!; + vst1.8 {RA0}, [r1]!; + vst1.8 {RB4}, [r1]!; + vst1.8 {RB1, RB2}, [r1]!; + vst1.8 {RB0}, [r1]!; + b .Lblk8_end; + .Lblk8_dec: + bl __serpent_dec_blk8; + vst1.8 {RA0, RA1}, [r1]!; + vst1.8 {RA2, RA3}, [r1]!; + vst1.8 {RB0, RB1}, [r1]!; + vst1.8 {RB2, RB3}, [r1]!; + +.Lblk8_end: + /* clear the used registers */ + veor RA0, RA0; + veor RA1, RA1; + veor RA2, RA2; + veor RA3, RA3; + + vpop {RA4-RB2}; + + veor RB3, RB3; + veor RB4, RB4; + veor RT0, RT0; + veor RT1, RT1; + veor RT2, RT2; + veor RT3, RT3; + + pop {pc}; +.size _gcry_serpent_neon_cbc_dec,.-_gcry_serpent_neon_cbc_dec; + .align 3 .globl _gcry_serpent_neon_ctr_enc .type _gcry_serpent_neon_ctr_enc,%function; _gcry_serpent_neon_ctr_enc: /* input: * r0: ctx, CTX * r1: dst (8 blocks) * r2: src (8 blocks) * r3: iv */ vmov.u8 RT1d0, #0xff; /* u64: -1 */ push {r4,lr}; vadd.u64 RT2d0, RT1d0, RT1d0; /* u64: -2 */ vpush {RA4-RB2}; /* load IV and byteswap */ vld1.8 {RA0}, [r3]; vrev64.u8 RT0, RA0; /* be => le */ ldr r4, [r3, #8]; /* construct IVs */ vsub.u64 RA2d1, RT0d1, RT2d0; /* +2 */ vsub.u64 RA1d1, RT0d1, RT1d0; /* +1 */ cmp r4, #-1; vsub.u64 RB0d1, RA2d1, RT2d0; /* +4 */ vsub.u64 RA3d1, RA2d1, RT1d0; /* +3 */ ldr r4, [r3, #12]; vsub.u64 RB2d1, 
RB0d1, RT2d0; /* +6 */ vsub.u64 RB1d1, RB0d1, RT1d0; /* +5 */ vsub.u64 RT2d1, RB2d1, RT2d0; /* +8 */ vsub.u64 RB3d1, RB2d1, RT1d0; /* +7 */ vmov RA1d0, RT0d0; vmov RA2d0, RT0d0; vmov RA3d0, RT0d0; vmov RB0d0, RT0d0; rev r4, r4; vmov RB1d0, RT0d0; vmov RB2d0, RT0d0; vmov RB3d0, RT0d0; vmov RT2d0, RT0d0; /* check need for handling 64-bit overflow and carry */ beq .Ldo_ctr_carry; .Lctr_carry_done: /* le => be */ vrev64.u8 RA1, RA1; vrev64.u8 RA2, RA2; vrev64.u8 RA3, RA3; vrev64.u8 RB0, RB0; vrev64.u8 RT2, RT2; vrev64.u8 RB1, RB1; vrev64.u8 RB2, RB2; vrev64.u8 RB3, RB3; /* store new IV */ vst1.8 {RT2}, [r3]; bl __serpent_enc_blk8; vld1.8 {RT0, RT1}, [r2]!; vld1.8 {RT2, RT3}, [r2]!; veor RA4, RA4, RT0; veor RA1, RA1, RT1; vld1.8 {RT0, RT1}, [r2]!; veor RA2, RA2, RT2; veor RA0, RA0, RT3; vld1.8 {RT2, RT3}, [r2]!; veor RB4, RB4, RT0; veor RT0, RT0; veor RB1, RB1, RT1; veor RT1, RT1; veor RB2, RB2, RT2; veor RT2, RT2; veor RB0, RB0, RT3; veor RT3, RT3; vst1.8 {RA4}, [r1]!; vst1.8 {RA1}, [r1]!; veor RA1, RA1; vst1.8 {RA2}, [r1]!; veor RA2, RA2; vst1.8 {RA0}, [r1]!; veor RA0, RA0; vst1.8 {RB4}, [r1]!; veor RB4, RB4; vst1.8 {RB1}, [r1]!; vst1.8 {RB2}, [r1]!; vst1.8 {RB0}, [r1]!; vpop {RA4-RB2}; /* clear the used registers */ veor RA3, RA3; veor RB3, RB3; pop {r4,pc}; .Ldo_ctr_carry: cmp r4, #-8; blo .Lctr_carry_done; beq .Lcarry_RT2; cmp r4, #-6; blo .Lcarry_RB3; beq .Lcarry_RB2; cmp r4, #-4; blo .Lcarry_RB1; beq .Lcarry_RB0; cmp r4, #-2; blo .Lcarry_RA3; beq .Lcarry_RA2; vsub.u64 RA1d0, RT1d0; .Lcarry_RA2: vsub.u64 RA2d0, RT1d0; .Lcarry_RA3: vsub.u64 RA3d0, RT1d0; .Lcarry_RB0: vsub.u64 RB0d0, RT1d0; .Lcarry_RB1: vsub.u64 RB1d0, RT1d0; .Lcarry_RB2: vsub.u64 RB2d0, RT1d0; .Lcarry_RB3: vsub.u64 RB3d0, RT1d0; .Lcarry_RT2: vsub.u64 RT2d0, RT1d0; b .Lctr_carry_done; .size _gcry_serpent_neon_ctr_enc,.-_gcry_serpent_neon_ctr_enc; .align 3 .globl _gcry_serpent_neon_cfb_dec .type _gcry_serpent_neon_cfb_dec,%function; _gcry_serpent_neon_cfb_dec: /* input: * r0: ctx, CTX * r1: dst (8 blocks) * r2: src (8 blocks) * r3: iv */ push {lr}; vpush {RA4-RB2}; /* Load input */ vld1.8 {RA0}, [r3]; vld1.8 {RA1, RA2}, [r2]!; vld1.8 {RA3}, [r2]!; vld1.8 {RB0}, [r2]!; vld1.8 {RB1, RB2}, [r2]!; vld1.8 {RB3}, [r2]!; /* Update IV */ vld1.8 {RT0}, [r2]!; vst1.8 {RT0}, [r3]; mov r3, lr; sub r2, r2, #(8*16); bl __serpent_enc_blk8; vld1.8 {RT0, RT1}, [r2]!; vld1.8 {RT2, RT3}, [r2]!; veor RA4, RA4, RT0; veor RA1, RA1, RT1; vld1.8 {RT0, RT1}, [r2]!; veor RA2, RA2, RT2; veor RA0, RA0, RT3; vld1.8 {RT2, RT3}, [r2]!; veor RB4, RB4, RT0; veor RT0, RT0; veor RB1, RB1, RT1; veor RT1, RT1; veor RB2, RB2, RT2; veor RT2, RT2; veor RB0, RB0, RT3; veor RT3, RT3; vst1.8 {RA4}, [r1]!; vst1.8 {RA1}, [r1]!; veor RA1, RA1; vst1.8 {RA2}, [r1]!; veor RA2, RA2; vst1.8 {RA0}, [r1]!; veor RA0, RA0; vst1.8 {RB4}, [r1]!; veor RB4, RB4; vst1.8 {RB1}, [r1]!; vst1.8 {RB2}, [r1]!; vst1.8 {RB0}, [r1]!; vpop {RA4-RB2}; /* clear the used registers */ veor RA3, RA3; veor RB3, RB3; pop {pc}; .size _gcry_serpent_neon_cfb_dec,.-_gcry_serpent_neon_cfb_dec; .align 3 .globl _gcry_serpent_neon_cbc_dec .type _gcry_serpent_neon_cbc_dec,%function; _gcry_serpent_neon_cbc_dec: /* input: * r0: ctx, CTX * r1: dst (8 blocks) * r2: src (8 blocks) * r3: iv */ push {lr}; vpush {RA4-RB2}; vld1.8 {RA0, RA1}, [r2]!; vld1.8 {RA2, RA3}, [r2]!; vld1.8 {RB0, RB1}, [r2]!; vld1.8 {RB2, RB3}, [r2]!; sub r2, r2, #(8*16); bl __serpent_dec_blk8; vld1.8 {RB4}, [r3]; vld1.8 {RT0, RT1}, [r2]!; vld1.8 {RT2, RT3}, [r2]!; veor RA0, RA0, RB4; veor RA1, RA1, RT0; veor RA2, RA2, RT1; vld1.8 {RT0, RT1}, 
[r2]!; veor RA3, RA3, RT2; veor RB0, RB0, RT3; vld1.8 {RT2, RT3}, [r2]!; veor RB1, RB1, RT0; veor RT0, RT0; veor RB2, RB2, RT1; veor RT1, RT1; veor RB3, RB3, RT2; veor RT2, RT2; vst1.8 {RT3}, [r3]; /* store new IV */ veor RT3, RT3; vst1.8 {RA0, RA1}, [r1]!; veor RA0, RA0; veor RA1, RA1; vst1.8 {RA2, RA3}, [r1]!; veor RA2, RA2; vst1.8 {RB0, RB1}, [r1]!; veor RA3, RA3; vst1.8 {RB2, RB3}, [r1]!; veor RB3, RB3; vpop {RA4-RB2}; /* clear the used registers */ veor RB4, RB4; pop {pc}; .size _gcry_serpent_neon_cbc_dec,.-_gcry_serpent_neon_cbc_dec; .align 3 .globl _gcry_serpent_neon_ocb_enc .type _gcry_serpent_neon_ocb_enc,%function; _gcry_serpent_neon_ocb_enc: /* input: * r0 : ctx, CTX * r1 : dst (8 blocks) * r2 : src (8 blocks) * r3 : offset * sp+0: checksum * sp+4: L pointers (void *L[8]) */ push {r4-r11, ip, lr}; add ip, sp, #(10*4); vpush {RA4-RB2}; ldm ip, {r4, lr}; vld1.8 {RT0}, [r3]; vld1.8 {RT1}, [r4]; /* Load L pointers */ ldm lr!, {r5, r6, r7, r8}; ldm lr, {r9, r10, r11, ip}; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Checksum_i = Checksum_{i-1} xor P_i */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ vld1.8 {RA0, RA1}, [r2]!; vld1.8 {RA2, RA3}, [r2]!; vld1.8 {RB0, RB1}, [r2]!; vld1.8 {RB2, RB3}, [r2]; #define OCB_INPUT(lreg, vreg) \ vld1.8 {RT3}, [lreg]; \ veor RT0, RT3; \ veor RT1, vreg; \ veor vreg, RT0; \ vst1.8 {RT0}, [r1]!; OCB_INPUT(r5, RA0); OCB_INPUT(r6, RA1); OCB_INPUT(r7, RA2); OCB_INPUT(r8, RA3); OCB_INPUT(r9, RB0); OCB_INPUT(r10, RB1); OCB_INPUT(r11, RB2); OCB_INPUT(ip, RB3); #undef OCB_INPUT sub r1, r1, #(8*16); vst1.8 {RT0}, [r3]; vst1.8 {RT1}, [r4]; mov r2, r1; bl __serpent_enc_blk8; vld1.8 {RT0, RT1}, [r1]!; veor RT0, RA4, RT0; veor RT1, RA1, RT1; vld1.8 {RT2, RT3}, [r1]!; vst1.8 {RT0, RT1}, [r2]!; veor RT2, RA2, RT2; veor RT3, RA0, RT3; vld1.8 {RT0, RT1}, [r1]!; vst1.8 {RT2, RT3}, [r2]!; veor RT0, RB4, RT0; veor RT1, RB1, RT1; vld1.8 {RT2, RT3}, [r1]!; vst1.8 {RT0, RT1}, [r2]!; veor RT2, RB2, RT2; veor RT3, RB0, RT3; vst1.8 {RT2, RT3}, [r2]!; vpop {RA4-RB2}; /* clear the used registers */ veor RA3, RA3; veor RB3, RB3; pop {r4-r11, ip, pc}; .size _gcry_serpent_neon_ocb_enc,.-_gcry_serpent_neon_ocb_enc; .align 3 .globl _gcry_serpent_neon_ocb_dec .type _gcry_serpent_neon_ocb_dec,%function; _gcry_serpent_neon_ocb_dec: /* input: * r0 : ctx, CTX * r1 : dst (8 blocks) * r2 : src (8 blocks) * r3 : offset * sp+0: checksum * sp+4: L pointers (void *L[8]) */ push {r4-r11, ip, lr}; add ip, sp, #(10*4); vpush {RA4-RB2}; ldm ip, {r4, lr}; vld1.8 {RT0}, [r3]; /* Load L pointers */ ldm lr!, {r5, r6, r7, r8}; ldm lr, {r9, r10, r11, ip}; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ vld1.8 {RA0, RA1}, [r2]!; vld1.8 {RA2, RA3}, [r2]!; vld1.8 {RB0, RB1}, [r2]!; vld1.8 {RB2, RB3}, [r2]; #define OCB_INPUT(lreg, vreg) \ vld1.8 {RT3}, [lreg]; \ veor RT0, RT3; \ veor vreg, RT0; \ vst1.8 {RT0}, [r1]!; OCB_INPUT(r5, RA0); OCB_INPUT(r6, RA1); OCB_INPUT(r7, RA2); OCB_INPUT(r8, RA3); OCB_INPUT(r9, RB0); OCB_INPUT(r10, RB1); OCB_INPUT(r11, RB2); OCB_INPUT(ip, RB3); #undef OCB_INPUT sub r1, r1, #(8*16); vst1.8 {RT0}, [r3]; mov r2, r1; bl __serpent_dec_blk8; /* Checksum_i = Checksum_{i-1} xor P_i */ vld1.8 {RA4}, [r4]; vld1.8 {RT0, RT1}, [r1]!; veor RA0, RA0, RT0; veor RA1, RA1, RT1; vld1.8 {RT2, RT3}, [r1]!; veor RA4, RA4, RA0; vst1.8 {RA0, RA1}, [r2]!; veor RA4, RA4, RA1; veor RA2, RA2, RT2; veor RA3, RA3, RT3; vld1.8 {RT0, RT1}, [r1]!; veor RA4, RA4, RA2; vst1.8 {RA2, RA3}, [r2]!; veor RA4, RA4, RA3; veor RB0, RB0, RT0; veor RB1, 
RB1, RT1; vld1.8 {RT2, RT3}, [r1]!; veor RA4, RA4, RB0; vst1.8 {RB0, RB1}, [r2]!; veor RA4, RA4, RB1; veor RB2, RB2, RT2; veor RB3, RB3, RT3; veor RA4, RA4, RB2; vst1.8 {RB2, RB3}, [r2]!; veor RA4, RA4, RB3; vst1.8 {RA4}, [r4]; vpop {RA4-RB2}; /* clear the used registers */ veor RB4, RB4; pop {r4-r11, ip, pc}; .size _gcry_serpent_neon_ocb_dec,.-_gcry_serpent_neon_ocb_dec; .align 3 .globl _gcry_serpent_neon_ocb_auth .type _gcry_serpent_neon_ocb_auth,%function; _gcry_serpent_neon_ocb_auth: /* input: * r0 : ctx, CTX * r1 : abuf (8 blocks) * r2 : offset * r3 : checksum * sp+0: L pointers (void *L[8]) */ push {r5-r11, ip, lr}; ldr lr, [sp, #(9*4)]; vpush {RA4-RB2}; vld1.8 {RT0}, [r2]; /* Load L pointers */ ldm lr!, {r5, r6, r7, r8}; ldm lr, {r9, r10, r11, ip}; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ vld1.8 {RA0, RA1}, [r1]!; vld1.8 {RA2, RA3}, [r1]!; vld1.8 {RB0, RB1}, [r1]!; vld1.8 {RB2, RB3}, [r1]; #define OCB_INPUT(lreg, vreg) \ vld1.8 {RT3}, [lreg]; \ veor RT0, RT3; \ veor vreg, RT0; OCB_INPUT(r5, RA0); OCB_INPUT(r6, RA1); OCB_INPUT(r7, RA2); OCB_INPUT(r8, RA3); OCB_INPUT(r9, RB0); OCB_INPUT(r10, RB1); OCB_INPUT(r11, RB2); OCB_INPUT(ip, RB3); #undef OCB_INPUT vst1.8 {RT0}, [r2]; bl __serpent_enc_blk8; /* Checksum_i = Checksum_{i-1} xor P_i */ vld1.8 {RT0}, [r3]; veor RA4, RB4; veor RA1, RB1; veor RA2, RB2; veor RA0, RB0; veor RA2, RT0; veor RA1, RA4; veor RA0, RA2; veor RA0, RA1; vst1.8 {RA0}, [r3]; vpop {RA4-RB2}; /* clear the used registers */ veor RA3, RA3; veor RB3, RB3; pop {r5-r11, ip, pc}; .size _gcry_serpent_neon_ocb_auth,.-_gcry_serpent_neon_ocb_auth; #endif diff --git a/cipher/serpent-avx2-amd64.S b/cipher/serpent-avx2-amd64.S index d3515a21..54ff61e4 100644 --- a/cipher/serpent-avx2-amd64.S +++ b/cipher/serpent-avx2-amd64.S @@ -1,1160 +1,1210 @@ /* serpent-avx2-amd64.S - AVX2 implementation of Serpent cipher * * Copyright (C) 2013-2015 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
 */

#include <config.h>

#ifdef __x86_64
#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_SERPENT) && \
    defined(ENABLE_AVX2_SUPPORT)

#include "asm-common-amd64.h"

/* struct serpent_context: */
#define ctx_keys 0

/* register macros */
#define CTX %rdi

/* vector registers */
#define RA0 %ymm0
#define RA1 %ymm1
#define RA2 %ymm2
#define RA3 %ymm3
#define RA4 %ymm4
#define RB0 %ymm5
#define RB1 %ymm6
#define RB2 %ymm7
#define RB3 %ymm8
#define RB4 %ymm9
#define RNOT %ymm10
#define RTMP0 %ymm11
#define RTMP1 %ymm12
#define RTMP2 %ymm13
#define RTMP3 %ymm14
#define RTMP4 %ymm15

#define RNOTx %xmm10
#define RTMP0x %xmm11
#define RTMP1x %xmm12
#define RTMP2x %xmm13
#define RTMP3x %xmm14
#define RTMP4x %xmm15

/**********************************************************************
  helper macros
 **********************************************************************/

/* vector 32-bit rotation to left */
#define vec_rol(reg, nleft, tmp) \
	vpslld $(nleft), reg, tmp;		\
	vpsrld $(32 - (nleft)), reg, reg;	\
	vpor tmp, reg, reg;

/* vector 32-bit rotation to right */
#define vec_ror(reg, nright, tmp) \
	vec_rol(reg, 32 - nright, tmp)

/* 4x4 32-bit integer matrix transpose */
#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;

/**********************************************************************
  16-way serpent
 **********************************************************************/

/*
 * These are the S-Boxes of Serpent from following research paper.
 *
 * D. A. Osvik, “Speeding up Serpent,” in Third AES Candidate Conference,
 * (New York, New York, USA), p. 317–329, National Institute of Standards and
 * Technology, 2000.
* * Paper is also available at: http://www.ii.uib.no/~osvik/pub/aes3.pdf * */ #define SBOX0(r0, r1, r2, r3, r4) \ vpxor r0, r3, r3; vmovdqa r1, r4; \ vpand r3, r1, r1; vpxor r2, r4, r4; \ vpxor r0, r1, r1; vpor r3, r0, r0; \ vpxor r4, r0, r0; vpxor r3, r4, r4; \ vpxor r2, r3, r3; vpor r1, r2, r2; \ vpxor r4, r2, r2; vpxor RNOT, r4, r4; \ vpor r1, r4, r4; vpxor r3, r1, r1; \ vpxor r4, r1, r1; vpor r0, r3, r3; \ vpxor r3, r1, r1; vpxor r3, r4, r4; #define SBOX0_INVERSE(r0, r1, r2, r3, r4) \ vpxor RNOT, r2, r2; vmovdqa r1, r4; \ vpor r0, r1, r1; vpxor RNOT, r4, r4; \ vpxor r2, r1, r1; vpor r4, r2, r2; \ vpxor r3, r1, r1; vpxor r4, r0, r0; \ vpxor r0, r2, r2; vpand r3, r0, r0; \ vpxor r0, r4, r4; vpor r1, r0, r0; \ vpxor r2, r0, r0; vpxor r4, r3, r3; \ vpxor r1, r2, r2; vpxor r0, r3, r3; \ vpxor r1, r3, r3; \ vpand r3, r2, r2; \ vpxor r2, r4, r4; #define SBOX1(r0, r1, r2, r3, r4) \ vpxor RNOT, r0, r0; vpxor RNOT, r2, r2; \ vmovdqa r0, r4; vpand r1, r0, r0; \ vpxor r0, r2, r2; vpor r3, r0, r0; \ vpxor r2, r3, r3; vpxor r0, r1, r1; \ vpxor r4, r0, r0; vpor r1, r4, r4; \ vpxor r3, r1, r1; vpor r0, r2, r2; \ vpand r4, r2, r2; vpxor r1, r0, r0; \ vpand r2, r1, r1; \ vpxor r0, r1, r1; vpand r2, r0, r0; \ vpxor r4, r0, r0; #define SBOX1_INVERSE(r0, r1, r2, r3, r4) \ vmovdqa r1, r4; vpxor r3, r1, r1; \ vpand r1, r3, r3; vpxor r2, r4, r4; \ vpxor r0, r3, r3; vpor r1, r0, r0; \ vpxor r3, r2, r2; vpxor r4, r0, r0; \ vpor r2, r0, r0; vpxor r3, r1, r1; \ vpxor r1, r0, r0; vpor r3, r1, r1; \ vpxor r0, r1, r1; vpxor RNOT, r4, r4; \ vpxor r1, r4, r4; vpor r0, r1, r1; \ vpxor r0, r1, r1; \ vpor r4, r1, r1; \ vpxor r1, r3, r3; #define SBOX2(r0, r1, r2, r3, r4) \ vmovdqa r0, r4; vpand r2, r0, r0; \ vpxor r3, r0, r0; vpxor r1, r2, r2; \ vpxor r0, r2, r2; vpor r4, r3, r3; \ vpxor r1, r3, r3; vpxor r2, r4, r4; \ vmovdqa r3, r1; vpor r4, r3, r3; \ vpxor r0, r3, r3; vpand r1, r0, r0; \ vpxor r0, r4, r4; vpxor r3, r1, r1; \ vpxor r4, r1, r1; vpxor RNOT, r4, r4; #define SBOX2_INVERSE(r0, r1, r2, r3, r4) \ vpxor r3, r2, r2; vpxor r0, r3, r3; \ vmovdqa r3, r4; vpand r2, r3, r3; \ vpxor r1, r3, r3; vpor r2, r1, r1; \ vpxor r4, r1, r1; vpand r3, r4, r4; \ vpxor r3, r2, r2; vpand r0, r4, r4; \ vpxor r2, r4, r4; vpand r1, r2, r2; \ vpor r0, r2, r2; vpxor RNOT, r3, r3; \ vpxor r3, r2, r2; vpxor r3, r0, r0; \ vpand r1, r0, r0; vpxor r4, r3, r3; \ vpxor r0, r3, r3; #define SBOX3(r0, r1, r2, r3, r4) \ vmovdqa r0, r4; vpor r3, r0, r0; \ vpxor r1, r3, r3; vpand r4, r1, r1; \ vpxor r2, r4, r4; vpxor r3, r2, r2; \ vpand r0, r3, r3; vpor r1, r4, r4; \ vpxor r4, r3, r3; vpxor r1, r0, r0; \ vpand r0, r4, r4; vpxor r3, r1, r1; \ vpxor r2, r4, r4; vpor r0, r1, r1; \ vpxor r2, r1, r1; vpxor r3, r0, r0; \ vmovdqa r1, r2; vpor r3, r1, r1; \ vpxor r0, r1, r1; #define SBOX3_INVERSE(r0, r1, r2, r3, r4) \ vmovdqa r2, r4; vpxor r1, r2, r2; \ vpxor r2, r0, r0; vpand r2, r4, r4; \ vpxor r0, r4, r4; vpand r1, r0, r0; \ vpxor r3, r1, r1; vpor r4, r3, r3; \ vpxor r3, r2, r2; vpxor r3, r0, r0; \ vpxor r4, r1, r1; vpand r2, r3, r3; \ vpxor r1, r3, r3; vpxor r0, r1, r1; \ vpor r2, r1, r1; vpxor r3, r0, r0; \ vpxor r4, r1, r1; \ vpxor r1, r0, r0; #define SBOX4(r0, r1, r2, r3, r4) \ vpxor r3, r1, r1; vpxor RNOT, r3, r3; \ vpxor r3, r2, r2; vpxor r0, r3, r3; \ vmovdqa r1, r4; vpand r3, r1, r1; \ vpxor r2, r1, r1; vpxor r3, r4, r4; \ vpxor r4, r0, r0; vpand r4, r2, r2; \ vpxor r0, r2, r2; vpand r1, r0, r0; \ vpxor r0, r3, r3; vpor r1, r4, r4; \ vpxor r0, r4, r4; vpor r3, r0, r0; \ vpxor r2, r0, r0; vpand r3, r2, r2; \ vpxor RNOT, r0, r0; vpxor r2, r4, r4; 
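The S-box macros above (and their NEON counterparts in the first file) are direct transcriptions of the word-oriented boolean forms from the Osvik paper: after transpose_4x4, each state register holds the same 32-bit word position of several blocks, one block per 32-bit lane, so one pass through a macro applies the S-box to all of those blocks at once. For reference, SBOX0 written out on plain uint32_t words reads as the C sketch below (variable and function names are illustrative; RNOT corresponds to the all-ones constant):

#include <stdint.h>

/* Word-level equivalent of the SBOX0 macro; the vector code runs the same
 * sequence on every 32-bit lane in parallel.  r4 is the scratch word.  */
static inline void
sbox0_words (uint32_t *x0, uint32_t *x1, uint32_t *x2, uint32_t *x3,
             uint32_t *x4)
{
  uint32_t r0 = *x0, r1 = *x1, r2 = *x2, r3 = *x3, r4;

  r3 ^= r0; r4  = r1;   /* vpxor r0,r3,r3;  vmovdqa r1,r4    */
  r1 &= r3; r4 ^= r2;   /* vpand r3,r1,r1;  vpxor r2,r4,r4   */
  r1 ^= r0; r0 |= r3;   /* vpxor r0,r1,r1;  vpor r3,r0,r0    */
  r0 ^= r4; r4 ^= r3;   /* vpxor r4,r0,r0;  vpxor r3,r4,r4   */
  r3 ^= r2; r2 |= r1;   /* vpxor r2,r3,r3;  vpor r1,r2,r2    */
  r2 ^= r4; r4 = ~r4;   /* vpxor r4,r2,r2;  vpxor RNOT,r4,r4 */
  r4 |= r1; r1 ^= r3;   /* vpor r1,r4,r4;   vpxor r3,r1,r1   */
  r1 ^= r4; r3 |= r0;   /* vpxor r4,r1,r1;  vpor r0,r3,r3    */
  r1 ^= r3; r4 ^= r3;   /* vpxor r3,r1,r1;  vpxor r3,r4,r4   */

  *x0 = r0; *x1 = r1; *x2 = r2; *x3 = r3; *x4 = r4;
}

The permuted output words end up in (r1, r4, r2, r0), which is why the ROUND invocations further down pass a shuffled register list as the next-state arguments instead of moving words back into place.
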
#define SBOX4_INVERSE(r0, r1, r2, r3, r4) \ vmovdqa r2, r4; vpand r3, r2, r2; \ vpxor r1, r2, r2; vpor r3, r1, r1; \ vpand r0, r1, r1; vpxor r2, r4, r4; \ vpxor r1, r4, r4; vpand r2, r1, r1; \ vpxor RNOT, r0, r0; vpxor r4, r3, r3; \ vpxor r3, r1, r1; vpand r0, r3, r3; \ vpxor r2, r3, r3; vpxor r1, r0, r0; \ vpand r0, r2, r2; vpxor r0, r3, r3; \ vpxor r4, r2, r2; \ vpor r3, r2, r2; vpxor r0, r3, r3; \ vpxor r1, r2, r2; #define SBOX5(r0, r1, r2, r3, r4) \ vpxor r1, r0, r0; vpxor r3, r1, r1; \ vpxor RNOT, r3, r3; vmovdqa r1, r4; \ vpand r0, r1, r1; vpxor r3, r2, r2; \ vpxor r2, r1, r1; vpor r4, r2, r2; \ vpxor r3, r4, r4; vpand r1, r3, r3; \ vpxor r0, r3, r3; vpxor r1, r4, r4; \ vpxor r2, r4, r4; vpxor r0, r2, r2; \ vpand r3, r0, r0; vpxor RNOT, r2, r2; \ vpxor r4, r0, r0; vpor r3, r4, r4; \ vpxor r4, r2, r2; #define SBOX5_INVERSE(r0, r1, r2, r3, r4) \ vpxor RNOT, r1, r1; vmovdqa r3, r4; \ vpxor r1, r2, r2; vpor r0, r3, r3; \ vpxor r2, r3, r3; vpor r1, r2, r2; \ vpand r0, r2, r2; vpxor r3, r4, r4; \ vpxor r4, r2, r2; vpor r0, r4, r4; \ vpxor r1, r4, r4; vpand r2, r1, r1; \ vpxor r3, r1, r1; vpxor r2, r4, r4; \ vpand r4, r3, r3; vpxor r1, r4, r4; \ vpxor r4, r3, r3; vpxor RNOT, r4, r4; \ vpxor r0, r3, r3; #define SBOX6(r0, r1, r2, r3, r4) \ vpxor RNOT, r2, r2; vmovdqa r3, r4; \ vpand r0, r3, r3; vpxor r4, r0, r0; \ vpxor r2, r3, r3; vpor r4, r2, r2; \ vpxor r3, r1, r1; vpxor r0, r2, r2; \ vpor r1, r0, r0; vpxor r1, r2, r2; \ vpxor r0, r4, r4; vpor r3, r0, r0; \ vpxor r2, r0, r0; vpxor r3, r4, r4; \ vpxor r0, r4, r4; vpxor RNOT, r3, r3; \ vpand r4, r2, r2; \ vpxor r3, r2, r2; #define SBOX6_INVERSE(r0, r1, r2, r3, r4) \ vpxor r2, r0, r0; vmovdqa r2, r4; \ vpand r0, r2, r2; vpxor r3, r4, r4; \ vpxor RNOT, r2, r2; vpxor r1, r3, r3; \ vpxor r3, r2, r2; vpor r0, r4, r4; \ vpxor r2, r0, r0; vpxor r4, r3, r3; \ vpxor r1, r4, r4; vpand r3, r1, r1; \ vpxor r0, r1, r1; vpxor r3, r0, r0; \ vpor r2, r0, r0; vpxor r1, r3, r3; \ vpxor r0, r4, r4; #define SBOX7(r0, r1, r2, r3, r4) \ vmovdqa r1, r4; vpor r2, r1, r1; \ vpxor r3, r1, r1; vpxor r2, r4, r4; \ vpxor r1, r2, r2; vpor r4, r3, r3; \ vpand r0, r3, r3; vpxor r2, r4, r4; \ vpxor r1, r3, r3; vpor r4, r1, r1; \ vpxor r0, r1, r1; vpor r4, r0, r0; \ vpxor r2, r0, r0; vpxor r4, r1, r1; \ vpxor r1, r2, r2; vpand r0, r1, r1; \ vpxor r4, r1, r1; vpxor RNOT, r2, r2; \ vpor r0, r2, r2; \ vpxor r2, r4, r4; #define SBOX7_INVERSE(r0, r1, r2, r3, r4) \ vmovdqa r2, r4; vpxor r0, r2, r2; \ vpand r3, r0, r0; vpor r3, r4, r4; \ vpxor RNOT, r2, r2; vpxor r1, r3, r3; \ vpor r0, r1, r1; vpxor r2, r0, r0; \ vpand r4, r2, r2; vpand r4, r3, r3; \ vpxor r2, r1, r1; vpxor r0, r2, r2; \ vpor r2, r0, r0; vpxor r1, r4, r4; \ vpxor r3, r0, r0; vpxor r4, r3, r3; \ vpor r0, r4, r4; vpxor r2, r3, r3; \ vpxor r2, r4, r4; /* Apply SBOX number WHICH to to the block. */ #define SBOX(which, r0, r1, r2, r3, r4) \ SBOX##which (r0, r1, r2, r3, r4) /* Apply inverse SBOX number WHICH to to the block. */ #define SBOX_INVERSE(which, r0, r1, r2, r3, r4) \ SBOX##which##_INVERSE (r0, r1, r2, r3, r4) /* XOR round key into block state in r0,r1,r2,r3. r4 used as temporary. */ #define BLOCK_XOR_KEY(r0, r1, r2, r3, r4, round) \ vpbroadcastd (ctx_keys + (round) * 16 + 0 * 4)(CTX), r4; \ vpxor r4, r0, r0; \ vpbroadcastd (ctx_keys + (round) * 16 + 1 * 4)(CTX), r4; \ vpxor r4, r1, r1; \ vpbroadcastd (ctx_keys + (round) * 16 + 2 * 4)(CTX), r4; \ vpxor r4, r2, r2; \ vpbroadcastd (ctx_keys + (round) * 16 + 3 * 4)(CTX), r4; \ vpxor r4, r3, r3; /* Apply the linear transformation to BLOCK. 
*/ #define LINEAR_TRANSFORMATION(r0, r1, r2, r3, r4) \ vec_rol(r0, 13, r4); \ vec_rol(r2, 3, r4); \ vpxor r0, r1, r1; \ vpxor r2, r1, r1; \ vpslld $3, r0, r4; \ vpxor r2, r3, r3; \ vpxor r4, r3, r3; \ vec_rol(r1, 1, r4); \ vec_rol(r3, 7, r4); \ vpxor r1, r0, r0; \ vpxor r3, r0, r0; \ vpslld $7, r1, r4; \ vpxor r3, r2, r2; \ vpxor r4, r2, r2; \ vec_rol(r0, 5, r4); \ vec_rol(r2, 22, r4); /* Apply the inverse linear transformation to BLOCK. */ #define LINEAR_TRANSFORMATION_INVERSE(r0, r1, r2, r3, r4) \ vec_ror(r2, 22, r4); \ vec_ror(r0, 5, r4); \ vpslld $7, r1, r4; \ vpxor r3, r2, r2; \ vpxor r4, r2, r2; \ vpxor r1, r0, r0; \ vpxor r3, r0, r0; \ vec_ror(r3, 7, r4); \ vec_ror(r1, 1, r4); \ vpslld $3, r0, r4; \ vpxor r2, r3, r3; \ vpxor r4, r3, r3; \ vpxor r0, r1, r1; \ vpxor r2, r1, r1; \ vec_ror(r2, 3, r4); \ vec_ror(r0, 13, r4); /* Apply a Serpent round to sixteen parallel blocks. This macro increments `round'. */ #define ROUND(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ SBOX (which, a0, a1, a2, a3, a4); \ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ SBOX (which, b0, b1, b2, b3, b4); \ LINEAR_TRANSFORMATION (na0, na1, na2, na3, na4); \ LINEAR_TRANSFORMATION (nb0, nb1, nb2, nb3, nb4); /* Apply the last Serpent round to sixteen parallel blocks. This macro increments `round'. */ #define ROUND_LAST(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ SBOX (which, a0, a1, a2, a3, a4); \ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ SBOX (which, b0, b1, b2, b3, b4); \ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, ((round) + 1)); \ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, ((round) + 1)); /* Apply an inverse Serpent round to sixteen parallel blocks. This macro increments `round'. */ #define ROUND_INVERSE(round, which, a0, a1, a2, a3, a4, \ na0, na1, na2, na3, na4, \ b0, b1, b2, b3, b4, \ nb0, nb1, nb2, nb3, nb4) \ LINEAR_TRANSFORMATION_INVERSE (a0, a1, a2, a3, a4); \ LINEAR_TRANSFORMATION_INVERSE (b0, b1, b2, b3, b4); \ SBOX_INVERSE (which, a0, a1, a2, a3, a4); \ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round); \ SBOX_INVERSE (which, b0, b1, b2, b3, b4); \ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round); /* Apply the first inverse Serpent round to sixteen parallel blocks. This macro increments `round'. 
*/ #define ROUND_FIRST_INVERSE(round, which, a0, a1, a2, a3, a4, \ na0, na1, na2, na3, na4, \ b0, b1, b2, b3, b4, \ nb0, nb1, nb2, nb3, nb4) \ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, ((round) + 1)); \ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, ((round) + 1)); \ SBOX_INVERSE (which, a0, a1, a2, a3, a4); \ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round); \ SBOX_INVERSE (which, b0, b1, b2, b3, b4); \ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round); .text .align 8 ELF(.type __serpent_enc_blk16,@function;) __serpent_enc_blk16: /* input: * %rdi: ctx, CTX * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel * plaintext blocks * output: * RA4, RA1, RA2, RA0, RB4, RB1, RB2, RB0: sixteen parallel * ciphertext blocks */ CFI_STARTPROC(); vpcmpeqd RNOT, RNOT, RNOT; transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1); transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); ROUND (0, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3, RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3); ROUND (1, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3, RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3); ROUND (2, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2, RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2); ROUND (3, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0, RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0); ROUND (4, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3, RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3); ROUND (5, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3, RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3); ROUND (6, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4, RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4); ROUND (7, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3, RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3); ROUND (8, 0, RA4, RA1, RA2, RA0, RA3, RA1, RA3, RA2, RA4, RA0, RB4, RB1, RB2, RB0, RB3, RB1, RB3, RB2, RB4, RB0); ROUND (9, 1, RA1, RA3, RA2, RA4, RA0, RA2, RA1, RA4, RA3, RA0, RB1, RB3, RB2, RB4, RB0, RB2, RB1, RB4, RB3, RB0); ROUND (10, 2, RA2, RA1, RA4, RA3, RA0, RA4, RA3, RA1, RA0, RA2, RB2, RB1, RB4, RB3, RB0, RB4, RB3, RB1, RB0, RB2); ROUND (11, 3, RA4, RA3, RA1, RA0, RA2, RA3, RA1, RA0, RA2, RA4, RB4, RB3, RB1, RB0, RB2, RB3, RB1, RB0, RB2, RB4); ROUND (12, 4, RA3, RA1, RA0, RA2, RA4, RA1, RA4, RA3, RA2, RA0, RB3, RB1, RB0, RB2, RB4, RB1, RB4, RB3, RB2, RB0); ROUND (13, 5, RA1, RA4, RA3, RA2, RA0, RA4, RA2, RA1, RA3, RA0, RB1, RB4, RB3, RB2, RB0, RB4, RB2, RB1, RB3, RB0); ROUND (14, 6, RA4, RA2, RA1, RA3, RA0, RA4, RA2, RA0, RA1, RA3, RB4, RB2, RB1, RB3, RB0, RB4, RB2, RB0, RB1, RB3); ROUND (15, 7, RA4, RA2, RA0, RA1, RA3, RA3, RA1, RA2, RA4, RA0, RB4, RB2, RB0, RB1, RB3, RB3, RB1, RB2, RB4, RB0); ROUND (16, 0, RA3, RA1, RA2, RA4, RA0, RA1, RA0, RA2, RA3, RA4, RB3, RB1, RB2, RB4, RB0, RB1, RB0, RB2, RB3, RB4); ROUND (17, 1, RA1, RA0, RA2, RA3, RA4, RA2, RA1, RA3, RA0, RA4, RB1, RB0, RB2, RB3, RB4, RB2, RB1, RB3, RB0, RB4); ROUND (18, 2, RA2, RA1, RA3, RA0, RA4, RA3, RA0, RA1, RA4, RA2, RB2, RB1, RB3, RB0, RB4, RB3, RB0, RB1, RB4, RB2); ROUND (19, 3, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA4, RA2, RA3, RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB4, RB2, RB3); ROUND (20, 4, RA0, RA1, RA4, RA2, RA3, RA1, RA3, RA0, RA2, RA4, RB0, RB1, RB4, RB2, RB3, RB1, RB3, RB0, RB2, RB4); ROUND (21, 5, RA1, RA3, RA0, RA2, RA4, RA3, RA2, RA1, RA0, RA4, RB1, RB3, RB0, RB2, RB4, RB3, RB2, RB1, RB0, RB4); ROUND (22, 6, RA3, RA2, RA1, RA0, RA4, RA3, RA2, RA4, RA1, RA0, RB3, RB2, RB1, RB0, RB4, RB3, RB2, RB4, RB1, RB0); ROUND (23, 7, RA3, RA2, RA4, RA1, RA0, RA0, RA1, RA2, 
RA3, RA4, RB3, RB2, RB4, RB1, RB0, RB0, RB1, RB2, RB3, RB4); ROUND (24, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3, RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3); ROUND (25, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3, RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3); ROUND (26, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2, RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2); ROUND (27, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0, RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0); ROUND (28, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3, RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3); ROUND (29, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3, RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3); ROUND (30, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4, RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4); ROUND_LAST (31, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3, RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3); transpose_4x4(RA4, RA1, RA2, RA0, RA3, RTMP0, RTMP1); transpose_4x4(RB4, RB1, RB2, RB0, RB3, RTMP0, RTMP1); ret_spec_stop; CFI_ENDPROC(); ELF(.size __serpent_enc_blk16,.-__serpent_enc_blk16;) .align 8 ELF(.type __serpent_dec_blk16,@function;) __serpent_dec_blk16: /* input: * %rdi: ctx, CTX * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel * ciphertext blocks * output: * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel * plaintext blocks */ CFI_STARTPROC(); vpcmpeqd RNOT, RNOT, RNOT; transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1); transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); ROUND_FIRST_INVERSE (31, 7, RA0, RA1, RA2, RA3, RA4, RA3, RA0, RA1, RA4, RA2, RB0, RB1, RB2, RB3, RB4, RB3, RB0, RB1, RB4, RB2); ROUND_INVERSE (30, 6, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA2, RA4, RA3, RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB2, RB4, RB3); ROUND_INVERSE (29, 5, RA0, RA1, RA2, RA4, RA3, RA1, RA3, RA4, RA2, RA0, RB0, RB1, RB2, RB4, RB3, RB1, RB3, RB4, RB2, RB0); ROUND_INVERSE (28, 4, RA1, RA3, RA4, RA2, RA0, RA1, RA2, RA4, RA0, RA3, RB1, RB3, RB4, RB2, RB0, RB1, RB2, RB4, RB0, RB3); ROUND_INVERSE (27, 3, RA1, RA2, RA4, RA0, RA3, RA4, RA2, RA0, RA1, RA3, RB1, RB2, RB4, RB0, RB3, RB4, RB2, RB0, RB1, RB3); ROUND_INVERSE (26, 2, RA4, RA2, RA0, RA1, RA3, RA2, RA3, RA0, RA1, RA4, RB4, RB2, RB0, RB1, RB3, RB2, RB3, RB0, RB1, RB4); ROUND_INVERSE (25, 1, RA2, RA3, RA0, RA1, RA4, RA4, RA2, RA1, RA0, RA3, RB2, RB3, RB0, RB1, RB4, RB4, RB2, RB1, RB0, RB3); ROUND_INVERSE (24, 0, RA4, RA2, RA1, RA0, RA3, RA4, RA3, RA2, RA0, RA1, RB4, RB2, RB1, RB0, RB3, RB4, RB3, RB2, RB0, RB1); ROUND_INVERSE (23, 7, RA4, RA3, RA2, RA0, RA1, RA0, RA4, RA3, RA1, RA2, RB4, RB3, RB2, RB0, RB1, RB0, RB4, RB3, RB1, RB2); ROUND_INVERSE (22, 6, RA0, RA4, RA3, RA1, RA2, RA4, RA3, RA2, RA1, RA0, RB0, RB4, RB3, RB1, RB2, RB4, RB3, RB2, RB1, RB0); ROUND_INVERSE (21, 5, RA4, RA3, RA2, RA1, RA0, RA3, RA0, RA1, RA2, RA4, RB4, RB3, RB2, RB1, RB0, RB3, RB0, RB1, RB2, RB4); ROUND_INVERSE (20, 4, RA3, RA0, RA1, RA2, RA4, RA3, RA2, RA1, RA4, RA0, RB3, RB0, RB1, RB2, RB4, RB3, RB2, RB1, RB4, RB0); ROUND_INVERSE (19, 3, RA3, RA2, RA1, RA4, RA0, RA1, RA2, RA4, RA3, RA0, RB3, RB2, RB1, RB4, RB0, RB1, RB2, RB4, RB3, RB0); ROUND_INVERSE (18, 2, RA1, RA2, RA4, RA3, RA0, RA2, RA0, RA4, RA3, RA1, RB1, RB2, RB4, RB3, RB0, RB2, RB0, RB4, RB3, RB1); ROUND_INVERSE (17, 1, RA2, RA0, RA4, RA3, RA1, RA1, RA2, RA3, RA4, RA0, RB2, RB0, RB4, RB3, RB1, RB1, RB2, RB3, RB4, RB0); ROUND_INVERSE (16, 0, RA1, RA2, RA3, RA4, RA0, RA1, RA0, RA2, RA4, RA3, RB1, RB2, RB3, RB4, RB0, RB1, 
RB0, RB2, RB4, RB3); ROUND_INVERSE (15, 7, RA1, RA0, RA2, RA4, RA3, RA4, RA1, RA0, RA3, RA2, RB1, RB0, RB2, RB4, RB3, RB4, RB1, RB0, RB3, RB2); ROUND_INVERSE (14, 6, RA4, RA1, RA0, RA3, RA2, RA1, RA0, RA2, RA3, RA4, RB4, RB1, RB0, RB3, RB2, RB1, RB0, RB2, RB3, RB4); ROUND_INVERSE (13, 5, RA1, RA0, RA2, RA3, RA4, RA0, RA4, RA3, RA2, RA1, RB1, RB0, RB2, RB3, RB4, RB0, RB4, RB3, RB2, RB1); ROUND_INVERSE (12, 4, RA0, RA4, RA3, RA2, RA1, RA0, RA2, RA3, RA1, RA4, RB0, RB4, RB3, RB2, RB1, RB0, RB2, RB3, RB1, RB4); ROUND_INVERSE (11, 3, RA0, RA2, RA3, RA1, RA4, RA3, RA2, RA1, RA0, RA4, RB0, RB2, RB3, RB1, RB4, RB3, RB2, RB1, RB0, RB4); ROUND_INVERSE (10, 2, RA3, RA2, RA1, RA0, RA4, RA2, RA4, RA1, RA0, RA3, RB3, RB2, RB1, RB0, RB4, RB2, RB4, RB1, RB0, RB3); ROUND_INVERSE (9, 1, RA2, RA4, RA1, RA0, RA3, RA3, RA2, RA0, RA1, RA4, RB2, RB4, RB1, RB0, RB3, RB3, RB2, RB0, RB1, RB4); ROUND_INVERSE (8, 0, RA3, RA2, RA0, RA1, RA4, RA3, RA4, RA2, RA1, RA0, RB3, RB2, RB0, RB1, RB4, RB3, RB4, RB2, RB1, RB0); ROUND_INVERSE (7, 7, RA3, RA4, RA2, RA1, RA0, RA1, RA3, RA4, RA0, RA2, RB3, RB4, RB2, RB1, RB0, RB1, RB3, RB4, RB0, RB2); ROUND_INVERSE (6, 6, RA1, RA3, RA4, RA0, RA2, RA3, RA4, RA2, RA0, RA1, RB1, RB3, RB4, RB0, RB2, RB3, RB4, RB2, RB0, RB1); ROUND_INVERSE (5, 5, RA3, RA4, RA2, RA0, RA1, RA4, RA1, RA0, RA2, RA3, RB3, RB4, RB2, RB0, RB1, RB4, RB1, RB0, RB2, RB3); ROUND_INVERSE (4, 4, RA4, RA1, RA0, RA2, RA3, RA4, RA2, RA0, RA3, RA1, RB4, RB1, RB0, RB2, RB3, RB4, RB2, RB0, RB3, RB1); ROUND_INVERSE (3, 3, RA4, RA2, RA0, RA3, RA1, RA0, RA2, RA3, RA4, RA1, RB4, RB2, RB0, RB3, RB1, RB0, RB2, RB3, RB4, RB1); ROUND_INVERSE (2, 2, RA0, RA2, RA3, RA4, RA1, RA2, RA1, RA3, RA4, RA0, RB0, RB2, RB3, RB4, RB1, RB2, RB1, RB3, RB4, RB0); ROUND_INVERSE (1, 1, RA2, RA1, RA3, RA4, RA0, RA0, RA2, RA4, RA3, RA1, RB2, RB1, RB3, RB4, RB0, RB0, RB2, RB4, RB3, RB1); ROUND_INVERSE (0, 0, RA0, RA2, RA4, RA3, RA1, RA0, RA1, RA2, RA3, RA4, RB0, RB2, RB4, RB3, RB1, RB0, RB1, RB2, RB3, RB4); transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1); transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); ret_spec_stop; CFI_ENDPROC(); ELF(.size __serpent_dec_blk16,.-__serpent_dec_blk16;) +.align 8 +.globl _gcry_serpent_avx2_blk16 +ELF(.type _gcry_serpent_avx2_blk16,@function;) +_gcry_serpent_avx2_blk16: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (8 blocks) + * %rdx: src (8 blocks) + * %ecx: encrypt + */ + CFI_STARTPROC(); + + vmovdqu (0 * 32)(%rdx), RA0; + vmovdqu (1 * 32)(%rdx), RA1; + vmovdqu (2 * 32)(%rdx), RA2; + vmovdqu (3 * 32)(%rdx), RA3; + vmovdqu (4 * 32)(%rdx), RB0; + vmovdqu (5 * 32)(%rdx), RB1; + vmovdqu (6 * 32)(%rdx), RB2; + vmovdqu (7 * 32)(%rdx), RB3; + + testl %ecx, %ecx; + jz .Lblk16_dec; + call __serpent_enc_blk16; + vmovdqu RA4, (0 * 32)(%rsi); + vmovdqu RA1, (1 * 32)(%rsi); + vmovdqu RA2, (2 * 32)(%rsi); + vmovdqu RA0, (3 * 32)(%rsi); + vmovdqu RB4, (4 * 32)(%rsi); + vmovdqu RB1, (5 * 32)(%rsi); + vmovdqu RB2, (6 * 32)(%rsi); + vmovdqu RB0, (7 * 32)(%rsi); + jmp .Lblk16_end; + .Lblk16_dec: + call __serpent_dec_blk16; + vmovdqu RA0, (0 * 32)(%rsi); + vmovdqu RA1, (1 * 32)(%rsi); + vmovdqu RA2, (2 * 32)(%rsi); + vmovdqu RA3, (3 * 32)(%rsi); + vmovdqu RB0, (4 * 32)(%rsi); + vmovdqu RB1, (5 * 32)(%rsi); + vmovdqu RB2, (6 * 32)(%rsi); + vmovdqu RB3, (7 * 32)(%rsi); + +.Lblk16_end: + vzeroall; + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_serpent_avx2_blk16,.-_gcry_serpent_avx2_blk16;) + #define inc_le128(x, minus_one, tmp) \ vpcmpeqq minus_one, x, tmp; \ vpsubq minus_one, x, x; \ vpslldq $8, tmp, tmp; \ vpsubq tmp, x, x; 
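The inc_le128 macro just defined adds one to a 128-bit counter whose two 64-bit lanes have already been byte-swapped into little-endian lane order: subtracting the {-1, 0} constant adds one to the low lane, and the vpcmpeqq/vpslldq pair turns a low lane that is about to wrap into a carry for the high lane. A minimal scalar sketch of the same computation, with illustrative names that are not part of the patch:

#include <stdint.h>

/* Scalar model of inc_le128(x, minus_one, tmp); the counter is kept as
   two 64-bit lanes, lo holding bits 0..63. */
typedef struct { uint64_t lo, hi; } u128_lanes;

static void
inc_le128_model (u128_lanes *x)
{
  /* vpcmpeqq against the {-1, 0} constant flags a low lane equal to ~0;
     vpslldq $8 moves that flag into the high-lane position. */
  uint64_t carry = (x->lo == ~(uint64_t)0);

  x->lo += 1;      /* vpsubq of the -1 low lane adds one */
  x->hi += carry;  /* vpsubq of the shifted mask adds the carry */
}

The fast path of the CTR routine that follows leans on the same trick: once the cmpq has ruled out a 64-bit wrap within the next 16 blocks, subtracting the {-2, 0} constant advances both 128-bit halves of a ymm register by two without any carry handling.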
.align 8 .globl _gcry_serpent_avx2_ctr_enc ELF(.type _gcry_serpent_avx2_ctr_enc,@function;) _gcry_serpent_avx2_ctr_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: iv (big endian, 128bit) */ CFI_STARTPROC(); movq 8(%rcx), %rax; bswapq %rax; vzeroupper; vbroadcasti128 .Lbswap128_mask rRIP, RTMP3; vpcmpeqd RNOT, RNOT, RNOT; vpsrldq $8, RNOT, RNOT; /* ab: -1:0 ; cd: -1:0 */ vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */ /* load IV and byteswap */ vmovdqu (%rcx), RTMP4x; vpshufb RTMP3x, RTMP4x, RTMP4x; vmovdqa RTMP4x, RTMP0x; inc_le128(RTMP4x, RNOTx, RTMP1x); vinserti128 $1, RTMP4x, RTMP0, RTMP0; vpshufb RTMP3, RTMP0, RA0; /* +1 ; +0 */ /* check need for handling 64-bit overflow and carry */ cmpq $(0xffffffffffffffff - 16), %rax; ja .Lhandle_ctr_carry; /* construct IVs */ vpsubq RTMP2, RTMP0, RTMP0; /* +3 ; +2 */ vpshufb RTMP3, RTMP0, RA1; vpsubq RTMP2, RTMP0, RTMP0; /* +5 ; +4 */ vpshufb RTMP3, RTMP0, RA2; vpsubq RTMP2, RTMP0, RTMP0; /* +7 ; +6 */ vpshufb RTMP3, RTMP0, RA3; vpsubq RTMP2, RTMP0, RTMP0; /* +9 ; +8 */ vpshufb RTMP3, RTMP0, RB0; vpsubq RTMP2, RTMP0, RTMP0; /* +11 ; +10 */ vpshufb RTMP3, RTMP0, RB1; vpsubq RTMP2, RTMP0, RTMP0; /* +13 ; +12 */ vpshufb RTMP3, RTMP0, RB2; vpsubq RTMP2, RTMP0, RTMP0; /* +15 ; +14 */ vpshufb RTMP3, RTMP0, RB3; vpsubq RTMP2, RTMP0, RTMP0; /* +16 */ vpshufb RTMP3x, RTMP0x, RTMP0x; jmp .Lctr_carry_done; .Lhandle_ctr_carry: /* construct IVs */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RA1; /* +3 ; +2 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RA2; /* +5 ; +4 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RA3; /* +7 ; +6 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RB0; /* +9 ; +8 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RB1; /* +11 ; +10 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RB2; /* +13 ; +12 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RB3; /* +15 ; +14 */ inc_le128(RTMP0, RNOT, RTMP1); vextracti128 $1, RTMP0, RTMP0x; vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */ .align 4 .Lctr_carry_done: /* store new IV */ vmovdqu RTMP0x, (%rcx); call __serpent_enc_blk16; vpxor (0 * 32)(%rdx), RA4, RA4; vpxor (1 * 32)(%rdx), RA1, RA1; vpxor (2 * 32)(%rdx), RA2, RA2; vpxor (3 * 32)(%rdx), RA0, RA0; vpxor (4 * 32)(%rdx), RB4, RB4; vpxor (5 * 32)(%rdx), RB1, RB1; vpxor (6 * 32)(%rdx), RB2, RB2; vpxor (7 * 32)(%rdx), RB0, RB0; vmovdqu RA4, (0 * 32)(%rsi); vmovdqu RA1, (1 * 32)(%rsi); vmovdqu RA2, (2 * 32)(%rsi); vmovdqu RA0, (3 * 32)(%rsi); vmovdqu RB4, (4 * 32)(%rsi); vmovdqu RB1, (5 * 32)(%rsi); vmovdqu RB2, (6 * 32)(%rsi); vmovdqu RB0, (7 * 32)(%rsi); vzeroall; ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ctr_enc,.-_gcry_serpent_avx2_ctr_enc;) .align 8 .globl _gcry_serpent_avx2_cbc_dec ELF(.type _gcry_serpent_avx2_cbc_dec,@function;) _gcry_serpent_avx2_cbc_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: iv */ CFI_STARTPROC(); vzeroupper; vmovdqu (0 * 32)(%rdx), RA0; vmovdqu (1 * 32)(%rdx), RA1; vmovdqu (2 * 32)(%rdx), RA2; vmovdqu (3 * 32)(%rdx), RA3; vmovdqu (4 * 32)(%rdx), RB0; vmovdqu (5 * 32)(%rdx), RB1; vmovdqu (6 * 32)(%rdx), RB2; vmovdqu (7 * 32)(%rdx), RB3; call __serpent_dec_blk16; vmovdqu (%rcx), RNOTx; vinserti128 $1, (%rdx), RNOT, RNOT; vpxor RNOT, RA0, RA0; vpxor (0 
* 32 + 16)(%rdx), RA1, RA1; vpxor (1 * 32 + 16)(%rdx), RA2, RA2; vpxor (2 * 32 + 16)(%rdx), RA3, RA3; vpxor (3 * 32 + 16)(%rdx), RB0, RB0; vpxor (4 * 32 + 16)(%rdx), RB1, RB1; vpxor (5 * 32 + 16)(%rdx), RB2, RB2; vpxor (6 * 32 + 16)(%rdx), RB3, RB3; vmovdqu (7 * 32 + 16)(%rdx), RNOTx; vmovdqu RNOTx, (%rcx); /* store new IV */ vmovdqu RA0, (0 * 32)(%rsi); vmovdqu RA1, (1 * 32)(%rsi); vmovdqu RA2, (2 * 32)(%rsi); vmovdqu RA3, (3 * 32)(%rsi); vmovdqu RB0, (4 * 32)(%rsi); vmovdqu RB1, (5 * 32)(%rsi); vmovdqu RB2, (6 * 32)(%rsi); vmovdqu RB3, (7 * 32)(%rsi); vzeroall; ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_cbc_dec,.-_gcry_serpent_avx2_cbc_dec;) .align 8 .globl _gcry_serpent_avx2_cfb_dec ELF(.type _gcry_serpent_avx2_cfb_dec,@function;) _gcry_serpent_avx2_cfb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: iv */ CFI_STARTPROC(); vzeroupper; /* Load input */ vmovdqu (%rcx), RNOTx; vinserti128 $1, (%rdx), RNOT, RA0; vmovdqu (0 * 32 + 16)(%rdx), RA1; vmovdqu (1 * 32 + 16)(%rdx), RA2; vmovdqu (2 * 32 + 16)(%rdx), RA3; vmovdqu (3 * 32 + 16)(%rdx), RB0; vmovdqu (4 * 32 + 16)(%rdx), RB1; vmovdqu (5 * 32 + 16)(%rdx), RB2; vmovdqu (6 * 32 + 16)(%rdx), RB3; /* Update IV */ vmovdqu (7 * 32 + 16)(%rdx), RNOTx; vmovdqu RNOTx, (%rcx); call __serpent_enc_blk16; vpxor (0 * 32)(%rdx), RA4, RA4; vpxor (1 * 32)(%rdx), RA1, RA1; vpxor (2 * 32)(%rdx), RA2, RA2; vpxor (3 * 32)(%rdx), RA0, RA0; vpxor (4 * 32)(%rdx), RB4, RB4; vpxor (5 * 32)(%rdx), RB1, RB1; vpxor (6 * 32)(%rdx), RB2, RB2; vpxor (7 * 32)(%rdx), RB0, RB0; vmovdqu RA4, (0 * 32)(%rsi); vmovdqu RA1, (1 * 32)(%rsi); vmovdqu RA2, (2 * 32)(%rsi); vmovdqu RA0, (3 * 32)(%rsi); vmovdqu RB4, (4 * 32)(%rsi); vmovdqu RB1, (5 * 32)(%rsi); vmovdqu RB2, (6 * 32)(%rsi); vmovdqu RB0, (7 * 32)(%rsi); vzeroall; ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_cfb_dec,.-_gcry_serpent_avx2_cfb_dec;) .align 8 .globl _gcry_serpent_avx2_ocb_enc ELF(.type _gcry_serpent_avx2_ocb_enc,@function;) _gcry_serpent_avx2_ocb_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[16]) */ CFI_STARTPROC(); vzeroupper; subq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); CFI_REL_OFFSET(%r10, 0 * 8); CFI_REL_OFFSET(%r11, 1 * 8); CFI_REL_OFFSET(%r12, 2 * 8); CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rcx), RTMP0x; vmovdqu (%r8), RTMP1x; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Checksum_i = Checksum_{i-1} xor P_i */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ #define OCB_INPUT(n, l0reg, l1reg, yreg) \ vmovdqu (n * 32)(%rdx), yreg; \ vpxor (l0reg), RTMP0x, RNOTx; \ vpxor (l1reg), RNOTx, RTMP0x; \ vinserti128 $1, RTMP0x, RNOT, RNOT; \ vpxor yreg, RTMP1, RTMP1; \ vpxor yreg, RNOT, yreg; \ vmovdqu RNOT, (n * 32)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq (2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, %r11, RA0); OCB_INPUT(1, %r12, %r13, RA1); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(2, %r10, %r11, RA2); OCB_INPUT(3, %r12, %r13, RA3); movq (8 * 8)(%r9), %r10; movq (9 * 8)(%r9), %r11; movq (10 * 8)(%r9), %r12; movq (11 * 8)(%r9), %r13; OCB_INPUT(4, %r10, %r11, RB0); OCB_INPUT(5, %r12, %r13, RB1); movq (12 * 8)(%r9), %r10; movq (13 * 8)(%r9), %r11; movq (14 * 8)(%r9), %r12; movq (15 * 8)(%r9), %r13; OCB_INPUT(6, %r10, %r11, RB2); 
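The OCB_INPUT macro used in this routine handles two blocks per invocation: it advances the running offset twice (Offset_i = Offset_{i-1} xor L_{ntz(i)}), folds both plaintext blocks into the running checksum, pre-whitens them with their offsets, and parks the two offsets in the destination buffer so they can be XORed back onto the output of __serpent_enc_blk16 afterwards. A scalar sketch of the net effect of one invocation follows; the helper names are illustrative only:

#include <stdint.h>
#include <string.h>

static void
xor_block16 (uint8_t *r, const uint8_t *a, const uint8_t *b)
{
  for (int i = 0; i < 16; i++)
    r[i] = a[i] ^ b[i];
}

/* Model of OCB_INPUT(n, l0reg, l1reg, yreg): blk[0..1] are plaintext
   blocks 2n and 2n+1, L0/L1 the matching precomputed L_{ntz} values,
   dst the two 16-byte slots at (n * 32)(%rsi). */
static void
ocb_input_model (uint8_t offset[16], uint8_t checksum[16],
                 uint8_t dst[2][16], uint8_t blk[2][16],
                 const uint8_t L0[16], const uint8_t L1[16])
{
  xor_block16 (offset, offset, L0);          /* Offset for block 2n   */
  memcpy (dst[0], offset, 16);               /* parked for later XOR  */
  xor_block16 (checksum, checksum, blk[0]);  /* Checksum ^= P_i       */
  xor_block16 (blk[0], blk[0], dst[0]);      /* pre-whiten P_i        */

  xor_block16 (offset, offset, L1);          /* Offset for block 2n+1 */
  memcpy (dst[1], offset, 16);
  xor_block16 (checksum, checksum, blk[1]);
  xor_block16 (blk[1], blk[1], dst[1]);
}

In the vector code the checksum lives in a full ymm register, i.e. as two 128-bit halves that are only folded together (vextracti128 plus vpxor) right before being written back to the checksum pointer.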
OCB_INPUT(7, %r12, %r13, RB3); #undef OCB_INPUT vextracti128 $1, RTMP1, RNOTx; vmovdqu RTMP0x, (%rcx); vpxor RNOTx, RTMP1x, RTMP1x; vmovdqu RTMP1x, (%r8); movq (0 * 8)(%rsp), %r10; movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; CFI_RESTORE(%r10); CFI_RESTORE(%r11); CFI_RESTORE(%r12); CFI_RESTORE(%r13); call __serpent_enc_blk16; addq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(-4 * 8); vpxor (0 * 32)(%rsi), RA4, RA4; vpxor (1 * 32)(%rsi), RA1, RA1; vpxor (2 * 32)(%rsi), RA2, RA2; vpxor (3 * 32)(%rsi), RA0, RA0; vpxor (4 * 32)(%rsi), RB4, RB4; vpxor (5 * 32)(%rsi), RB1, RB1; vpxor (6 * 32)(%rsi), RB2, RB2; vpxor (7 * 32)(%rsi), RB0, RB0; vmovdqu RA4, (0 * 32)(%rsi); vmovdqu RA1, (1 * 32)(%rsi); vmovdqu RA2, (2 * 32)(%rsi); vmovdqu RA0, (3 * 32)(%rsi); vmovdqu RB4, (4 * 32)(%rsi); vmovdqu RB1, (5 * 32)(%rsi); vmovdqu RB2, (6 * 32)(%rsi); vmovdqu RB0, (7 * 32)(%rsi); vzeroall; ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ocb_enc,.-_gcry_serpent_avx2_ocb_enc;) .align 8 .globl _gcry_serpent_avx2_ocb_dec ELF(.type _gcry_serpent_avx2_ocb_dec,@function;) _gcry_serpent_avx2_ocb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[16]) */ CFI_STARTPROC(); vzeroupper; subq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); CFI_REL_OFFSET(%r10, 0 * 8); CFI_REL_OFFSET(%r11, 1 * 8); CFI_REL_OFFSET(%r12, 2 * 8); CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rcx), RTMP0x; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ #define OCB_INPUT(n, l0reg, l1reg, yreg) \ vmovdqu (n * 32)(%rdx), yreg; \ vpxor (l0reg), RTMP0x, RNOTx; \ vpxor (l1reg), RNOTx, RTMP0x; \ vinserti128 $1, RTMP0x, RNOT, RNOT; \ vpxor yreg, RNOT, yreg; \ vmovdqu RNOT, (n * 32)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq (2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, %r11, RA0); OCB_INPUT(1, %r12, %r13, RA1); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(2, %r10, %r11, RA2); OCB_INPUT(3, %r12, %r13, RA3); movq (8 * 8)(%r9), %r10; movq (9 * 8)(%r9), %r11; movq (10 * 8)(%r9), %r12; movq (11 * 8)(%r9), %r13; OCB_INPUT(4, %r10, %r11, RB0); OCB_INPUT(5, %r12, %r13, RB1); movq (12 * 8)(%r9), %r10; movq (13 * 8)(%r9), %r11; movq (14 * 8)(%r9), %r12; movq (15 * 8)(%r9), %r13; OCB_INPUT(6, %r10, %r11, RB2); OCB_INPUT(7, %r12, %r13, RB3); #undef OCB_INPUT vmovdqu RTMP0x, (%rcx); movq (0 * 8)(%rsp), %r10; movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; CFI_RESTORE(%r10); CFI_RESTORE(%r11); CFI_RESTORE(%r12); CFI_RESTORE(%r13); call __serpent_dec_blk16; addq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(-4 * 8); vmovdqu (%r8), RTMP1x; vpxor (0 * 32)(%rsi), RA0, RA0; vpxor (1 * 32)(%rsi), RA1, RA1; vpxor (2 * 32)(%rsi), RA2, RA2; vpxor (3 * 32)(%rsi), RA3, RA3; vpxor (4 * 32)(%rsi), RB0, RB0; vpxor (5 * 32)(%rsi), RB1, RB1; vpxor (6 * 32)(%rsi), RB2, RB2; vpxor (7 * 32)(%rsi), RB3, RB3; /* Checksum_i = Checksum_{i-1} xor P_i */ vmovdqu RA0, (0 * 32)(%rsi); vpxor RA0, RTMP1, RTMP1; vmovdqu RA1, (1 * 32)(%rsi); vpxor RA1, RTMP1, RTMP1; vmovdqu RA2, (2 * 32)(%rsi); vpxor RA2, RTMP1, RTMP1; vmovdqu RA3, (3 * 32)(%rsi); vpxor RA3, RTMP1, RTMP1; vmovdqu RB0, (4 * 32)(%rsi); vpxor RB0, RTMP1, RTMP1; vmovdqu RB1, (5 * 32)(%rsi); vpxor RB1, RTMP1, RTMP1; vmovdqu RB2, (6 * 
32)(%rsi); vpxor RB2, RTMP1, RTMP1; vmovdqu RB3, (7 * 32)(%rsi); vpxor RB3, RTMP1, RTMP1; vextracti128 $1, RTMP1, RNOTx; vpxor RNOTx, RTMP1x, RTMP1x; vmovdqu RTMP1x, (%r8); vzeroall; ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ocb_dec,.-_gcry_serpent_avx2_ocb_dec;) .align 8 .globl _gcry_serpent_avx2_ocb_auth ELF(.type _gcry_serpent_avx2_ocb_auth,@function;) _gcry_serpent_avx2_ocb_auth: /* input: * %rdi: ctx, CTX * %rsi: abuf (16 blocks) * %rdx: offset * %rcx: checksum * %r8 : L pointers (void *L[16]) */ CFI_STARTPROC(); vzeroupper; subq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); CFI_REL_OFFSET(%r10, 0 * 8); CFI_REL_OFFSET(%r11, 1 * 8); CFI_REL_OFFSET(%r12, 2 * 8); CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rdx), RTMP0x; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ #define OCB_INPUT(n, l0reg, l1reg, yreg) \ vmovdqu (n * 32)(%rsi), yreg; \ vpxor (l0reg), RTMP0x, RNOTx; \ vpxor (l1reg), RNOTx, RTMP0x; \ vinserti128 $1, RTMP0x, RNOT, RNOT; \ vpxor yreg, RNOT, yreg; movq (0 * 8)(%r8), %r10; movq (1 * 8)(%r8), %r11; movq (2 * 8)(%r8), %r12; movq (3 * 8)(%r8), %r13; OCB_INPUT(0, %r10, %r11, RA0); OCB_INPUT(1, %r12, %r13, RA1); movq (4 * 8)(%r8), %r10; movq (5 * 8)(%r8), %r11; movq (6 * 8)(%r8), %r12; movq (7 * 8)(%r8), %r13; OCB_INPUT(2, %r10, %r11, RA2); OCB_INPUT(3, %r12, %r13, RA3); movq (8 * 8)(%r8), %r10; movq (9 * 8)(%r8), %r11; movq (10 * 8)(%r8), %r12; movq (11 * 8)(%r8), %r13; OCB_INPUT(4, %r10, %r11, RB0); OCB_INPUT(5, %r12, %r13, RB1); movq (12 * 8)(%r8), %r10; movq (13 * 8)(%r8), %r11; movq (14 * 8)(%r8), %r12; movq (15 * 8)(%r8), %r13; OCB_INPUT(6, %r10, %r11, RB2); OCB_INPUT(7, %r12, %r13, RB3); #undef OCB_INPUT vmovdqu RTMP0x, (%rdx); movq (0 * 8)(%rsp), %r10; movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; CFI_RESTORE(%r10); CFI_RESTORE(%r11); CFI_RESTORE(%r12); CFI_RESTORE(%r13); call __serpent_enc_blk16; addq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(-4 * 8); vpxor RA4, RB4, RA4; vpxor RA1, RB1, RA1; vpxor RA2, RB2, RA2; vpxor RA0, RB0, RA0; vpxor RA4, RA1, RA1; vpxor RA2, RA0, RA0; vpxor RA1, RA0, RTMP1; vextracti128 $1, RTMP1, RNOTx; vpxor (%rcx), RTMP1x, RTMP1x; vpxor RNOTx, RTMP1x, RTMP1x; vmovdqu RTMP1x, (%rcx); vzeroall; ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ocb_auth,.-_gcry_serpent_avx2_ocb_auth;) .align 16 /* For CTR-mode IV byteswap */ .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 #endif /*defined(USE_SERPENT) && defined(ENABLE_AVX2_SUPPORT)*/ #endif /*__x86_64*/ diff --git a/cipher/serpent-sse2-amd64.S b/cipher/serpent-sse2-amd64.S index b5935095..01723a2a 100644 --- a/cipher/serpent-sse2-amd64.S +++ b/cipher/serpent-sse2-amd64.S @@ -1,1211 +1,1276 @@ /* serpent-sse2-amd64.S - SSE2 implementation of Serpent cipher * * Copyright (C) 2013-2015 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_SERPENT) #include "asm-common-amd64.h" /* struct serpent_context: */ #define ctx_keys 0 /* register macros */ #define CTX %rdi /* vector registers */ #define RA0 %xmm0 #define RA1 %xmm1 #define RA2 %xmm2 #define RA3 %xmm3 #define RA4 %xmm4 #define RB0 %xmm5 #define RB1 %xmm6 #define RB2 %xmm7 #define RB3 %xmm8 #define RB4 %xmm9 #define RNOT %xmm10 #define RTMP0 %xmm11 #define RTMP1 %xmm12 #define RTMP2 %xmm13 /********************************************************************** helper macros **********************************************************************/ /* vector 32-bit rotation to left */ #define vec_rol(reg, nleft, tmp) \ movdqa reg, tmp; \ pslld $(nleft), tmp; \ psrld $(32 - (nleft)), reg; \ por tmp, reg; /* vector 32-bit rotation to right */ #define vec_ror(reg, nright, tmp) \ vec_rol(reg, 32 - nright, tmp) /* 4x4 32-bit integer matrix transpose */ #define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \ movdqa x0, t2; \ punpckhdq x1, t2; \ punpckldq x1, x0; \ \ movdqa x2, t1; \ punpckldq x3, t1; \ punpckhdq x3, x2; \ \ movdqa x0, x1; \ punpckhqdq t1, x1; \ punpcklqdq t1, x0; \ \ movdqa t2, x3; \ punpckhqdq x2, x3; \ punpcklqdq x2, t2; \ movdqa t2, x2; /* fill xmm register with 32-bit value from memory */ #define pbroadcastd(mem32, xreg) \ movd mem32, xreg; \ pshufd $0, xreg, xreg; /* xor with unaligned memory operand */ #define pxor_u(umem128, xreg, t) \ movdqu umem128, t; \ pxor t, xreg; /* 128-bit wide byte swap */ #define pbswap(xreg, t0) \ /* reorder 32-bit words, [a,b,c,d] => [d,c,b,a] */ \ pshufd $0x1b, xreg, xreg; \ /* reorder high&low 16-bit words, [d0,d1,c0,c1] => [d1,d0,c1,c0] */ \ pshuflw $0xb1, xreg, xreg; \ pshufhw $0xb1, xreg, xreg; \ /* reorder bytes in 16-bit words */ \ movdqa xreg, t0; \ psrlw $8, t0; \ psllw $8, xreg; \ por t0, xreg; /********************************************************************** 8-way serpent **********************************************************************/ /* * These are the S-Boxes of Serpent from following research paper. * * D. A. Osvik, “Speeding up Serpent,” in Third AES Candidate Conference, * (New York, New York, USA), p. 317–329, National Institute of Standards and * Technology, 2000. 
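As an aside on the helper macros defined earlier in this file: SSE2 has neither a vector rotate nor a byte-reversal instruction, so vec_rol/vec_ror build the per-lane 32-bit rotate from two shifts and an OR, and pbswap byte-reverses each 128-bit block in three shuffle stages. A scalar sketch of what they compute (function names are illustrative):

#include <stdint.h>

/* Net effect of vec_rol(reg, nleft, tmp) on each 32-bit lane:
   pslld/psrld/por together implement a rotate left by nleft bits. */
static uint32_t
vec_rol_model (uint32_t x, unsigned nleft)
{
  return (x << nleft) | (x >> (32 - nleft));
}

/* Net effect of pbswap(xreg, t0): the macro reverses the dwords
   (pshufd), then the words within each dword (pshuflw/pshufhw), then
   the bytes within each word (psrlw/psllw/por); combined, that is a
   plain reversal of all 16 bytes, modeled directly here. */
static void
pbswap_model (uint8_t b[16])
{
  for (int i = 0; i < 8; i++)
    {
      uint8_t t = b[i];
      b[i] = b[15 - i];
      b[15 - i] = t;
    }
}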
* * Paper is also available at: http://www.ii.uib.no/~osvik/pub/aes3.pdf * */ #define SBOX0(r0, r1, r2, r3, r4) \ pxor r0, r3; movdqa r1, r4; \ pand r3, r1; pxor r2, r4; \ pxor r0, r1; por r3, r0; \ pxor r4, r0; pxor r3, r4; \ pxor r2, r3; por r1, r2; \ pxor r4, r2; pxor RNOT, r4; \ por r1, r4; pxor r3, r1; \ pxor r4, r1; por r0, r3; \ pxor r3, r1; pxor r3, r4; #define SBOX0_INVERSE(r0, r1, r2, r3, r4) \ pxor RNOT, r2; movdqa r1, r4; \ por r0, r1; pxor RNOT, r4; \ pxor r2, r1; por r4, r2; \ pxor r3, r1; pxor r4, r0; \ pxor r0, r2; pand r3, r0; \ pxor r0, r4; por r1, r0; \ pxor r2, r0; pxor r4, r3; \ pxor r1, r2; pxor r0, r3; \ pxor r1, r3; \ pand r3, r2; \ pxor r2, r4; #define SBOX1(r0, r1, r2, r3, r4) \ pxor RNOT, r0; pxor RNOT, r2; \ movdqa r0, r4; pand r1, r0; \ pxor r0, r2; por r3, r0; \ pxor r2, r3; pxor r0, r1; \ pxor r4, r0; por r1, r4; \ pxor r3, r1; por r0, r2; \ pand r4, r2; pxor r1, r0; \ pand r2, r1; \ pxor r0, r1; pand r2, r0; \ pxor r4, r0; #define SBOX1_INVERSE(r0, r1, r2, r3, r4) \ movdqa r1, r4; pxor r3, r1; \ pand r1, r3; pxor r2, r4; \ pxor r0, r3; por r1, r0; \ pxor r3, r2; pxor r4, r0; \ por r2, r0; pxor r3, r1; \ pxor r1, r0; por r3, r1; \ pxor r0, r1; pxor RNOT, r4; \ pxor r1, r4; por r0, r1; \ pxor r0, r1; \ por r4, r1; \ pxor r1, r3; #define SBOX2(r0, r1, r2, r3, r4) \ movdqa r0, r4; pand r2, r0; \ pxor r3, r0; pxor r1, r2; \ pxor r0, r2; por r4, r3; \ pxor r1, r3; pxor r2, r4; \ movdqa r3, r1; por r4, r3; \ pxor r0, r3; pand r1, r0; \ pxor r0, r4; pxor r3, r1; \ pxor r4, r1; pxor RNOT, r4; #define SBOX2_INVERSE(r0, r1, r2, r3, r4) \ pxor r3, r2; pxor r0, r3; \ movdqa r3, r4; pand r2, r3; \ pxor r1, r3; por r2, r1; \ pxor r4, r1; pand r3, r4; \ pxor r3, r2; pand r0, r4; \ pxor r2, r4; pand r1, r2; \ por r0, r2; pxor RNOT, r3; \ pxor r3, r2; pxor r3, r0; \ pand r1, r0; pxor r4, r3; \ pxor r0, r3; #define SBOX3(r0, r1, r2, r3, r4) \ movdqa r0, r4; por r3, r0; \ pxor r1, r3; pand r4, r1; \ pxor r2, r4; pxor r3, r2; \ pand r0, r3; por r1, r4; \ pxor r4, r3; pxor r1, r0; \ pand r0, r4; pxor r3, r1; \ pxor r2, r4; por r0, r1; \ pxor r2, r1; pxor r3, r0; \ movdqa r1, r2; por r3, r1; \ pxor r0, r1; #define SBOX3_INVERSE(r0, r1, r2, r3, r4) \ movdqa r2, r4; pxor r1, r2; \ pxor r2, r0; pand r2, r4; \ pxor r0, r4; pand r1, r0; \ pxor r3, r1; por r4, r3; \ pxor r3, r2; pxor r3, r0; \ pxor r4, r1; pand r2, r3; \ pxor r1, r3; pxor r0, r1; \ por r2, r1; pxor r3, r0; \ pxor r4, r1; \ pxor r1, r0; #define SBOX4(r0, r1, r2, r3, r4) \ pxor r3, r1; pxor RNOT, r3; \ pxor r3, r2; pxor r0, r3; \ movdqa r1, r4; pand r3, r1; \ pxor r2, r1; pxor r3, r4; \ pxor r4, r0; pand r4, r2; \ pxor r0, r2; pand r1, r0; \ pxor r0, r3; por r1, r4; \ pxor r0, r4; por r3, r0; \ pxor r2, r0; pand r3, r2; \ pxor RNOT, r0; pxor r2, r4; #define SBOX4_INVERSE(r0, r1, r2, r3, r4) \ movdqa r2, r4; pand r3, r2; \ pxor r1, r2; por r3, r1; \ pand r0, r1; pxor r2, r4; \ pxor r1, r4; pand r2, r1; \ pxor RNOT, r0; pxor r4, r3; \ pxor r3, r1; pand r0, r3; \ pxor r2, r3; pxor r1, r0; \ pand r0, r2; pxor r0, r3; \ pxor r4, r2; \ por r3, r2; pxor r0, r3; \ pxor r1, r2; #define SBOX5(r0, r1, r2, r3, r4) \ pxor r1, r0; pxor r3, r1; \ pxor RNOT, r3; movdqa r1, r4; \ pand r0, r1; pxor r3, r2; \ pxor r2, r1; por r4, r2; \ pxor r3, r4; pand r1, r3; \ pxor r0, r3; pxor r1, r4; \ pxor r2, r4; pxor r0, r2; \ pand r3, r0; pxor RNOT, r2; \ pxor r4, r0; por r3, r4; \ pxor r4, r2; #define SBOX5_INVERSE(r0, r1, r2, r3, r4) \ pxor RNOT, r1; movdqa r3, r4; \ pxor r1, r2; por r0, r3; \ pxor r2, r3; por r1, r2; \ pand r0, r2; pxor r3, r4; 
\ pxor r4, r2; por r0, r4; \ pxor r1, r4; pand r2, r1; \ pxor r3, r1; pxor r2, r4; \ pand r4, r3; pxor r1, r4; \ pxor r4, r3; pxor RNOT, r4; \ pxor r0, r3; #define SBOX6(r0, r1, r2, r3, r4) \ pxor RNOT, r2; movdqa r3, r4; \ pand r0, r3; pxor r4, r0; \ pxor r2, r3; por r4, r2; \ pxor r3, r1; pxor r0, r2; \ por r1, r0; pxor r1, r2; \ pxor r0, r4; por r3, r0; \ pxor r2, r0; pxor r3, r4; \ pxor r0, r4; pxor RNOT, r3; \ pand r4, r2; \ pxor r3, r2; #define SBOX6_INVERSE(r0, r1, r2, r3, r4) \ pxor r2, r0; movdqa r2, r4; \ pand r0, r2; pxor r3, r4; \ pxor RNOT, r2; pxor r1, r3; \ pxor r3, r2; por r0, r4; \ pxor r2, r0; pxor r4, r3; \ pxor r1, r4; pand r3, r1; \ pxor r0, r1; pxor r3, r0; \ por r2, r0; pxor r1, r3; \ pxor r0, r4; #define SBOX7(r0, r1, r2, r3, r4) \ movdqa r1, r4; por r2, r1; \ pxor r3, r1; pxor r2, r4; \ pxor r1, r2; por r4, r3; \ pand r0, r3; pxor r2, r4; \ pxor r1, r3; por r4, r1; \ pxor r0, r1; por r4, r0; \ pxor r2, r0; pxor r4, r1; \ pxor r1, r2; pand r0, r1; \ pxor r4, r1; pxor RNOT, r2; \ por r0, r2; \ pxor r2, r4; #define SBOX7_INVERSE(r0, r1, r2, r3, r4) \ movdqa r2, r4; pxor r0, r2; \ pand r3, r0; por r3, r4; \ pxor RNOT, r2; pxor r1, r3; \ por r0, r1; pxor r2, r0; \ pand r4, r2; pand r4, r3; \ pxor r2, r1; pxor r0, r2; \ por r2, r0; pxor r1, r4; \ pxor r3, r0; pxor r4, r3; \ por r0, r4; pxor r2, r3; \ pxor r2, r4; /* Apply SBOX number WHICH to to the block. */ #define SBOX(which, r0, r1, r2, r3, r4) \ SBOX##which (r0, r1, r2, r3, r4) /* Apply inverse SBOX number WHICH to to the block. */ #define SBOX_INVERSE(which, r0, r1, r2, r3, r4) \ SBOX##which##_INVERSE (r0, r1, r2, r3, r4) /* XOR round key into block state in r0,r1,r2,r3. r4 used as temporary. */ #define BLOCK_XOR_KEY(r0, r1, r2, r3, r4, round) \ pbroadcastd ((ctx_keys + (round) * 16 + 0 * 4)(CTX), r4); \ pxor r4, r0; \ pbroadcastd ((ctx_keys + (round) * 16 + 1 * 4)(CTX), r4); \ pxor r4, r1; \ pbroadcastd ((ctx_keys + (round) * 16 + 2 * 4)(CTX), r4); \ pxor r4, r2; \ pbroadcastd ((ctx_keys + (round) * 16 + 3 * 4)(CTX), r4); \ pxor r4, r3; /* Apply the linear transformation to BLOCK. */ #define LINEAR_TRANSFORMATION(r0, r1, r2, r3, r4) \ vec_rol(r0, 13, r4); \ vec_rol(r2, 3, r4); \ pxor r0, r1; \ pxor r2, r1; \ movdqa r0, r4; \ pslld $3, r4; \ pxor r2, r3; \ pxor r4, r3; \ vec_rol(r1, 1, r4); \ vec_rol(r3, 7, r4); \ pxor r1, r0; \ pxor r3, r0; \ movdqa r1, r4; \ pslld $7, r4; \ pxor r3, r2; \ pxor r4, r2; \ vec_rol(r0, 5, r4); \ vec_rol(r2, 22, r4); /* Apply the inverse linear transformation to BLOCK. */ #define LINEAR_TRANSFORMATION_INVERSE(r0, r1, r2, r3, r4) \ vec_ror(r2, 22, r4); \ vec_ror(r0, 5, r4); \ movdqa r1, r4; \ pslld $7, r4; \ pxor r3, r2; \ pxor r4, r2; \ pxor r1, r0; \ pxor r3, r0; \ vec_ror(r3, 7, r4); \ vec_ror(r1, 1, r4); \ movdqa r0, r4; \ pslld $3, r4; \ pxor r2, r3; \ pxor r4, r3; \ pxor r0, r1; \ pxor r2, r1; \ vec_ror(r2, 3, r4); \ vec_ror(r0, 13, r4); /* Apply a Serpent round to eight parallel blocks. This macro increments `round'. */ #define ROUND(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ SBOX (which, a0, a1, a2, a3, a4); \ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ SBOX (which, b0, b1, b2, b3, b4); \ LINEAR_TRANSFORMATION (na0, na1, na2, na3, na4); \ LINEAR_TRANSFORMATION (nb0, nb1, nb2, nb3, nb4); /* Apply the last Serpent round to eight parallel blocks. This macro increments `round'. 
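For reference, the LINEAR_TRANSFORMATION macro above is Serpent's mixing layer applied independently to each 32-bit lane; in scalar form it is the familiar sequence of rotates, shifts and XORs over the four state words. A C sketch of the same transformation (rol32 and the function name are illustrative):

#include <stdint.h>

static uint32_t
rol32 (uint32_t x, unsigned n)
{
  return (x << n) | (x >> (32 - n));
}

/* Scalar model of LINEAR_TRANSFORMATION(r0, r1, r2, r3, r4): the same
   steps the macro performs on every 32-bit lane of the xmm registers. */
static void
serpent_lt_model (uint32_t *r0, uint32_t *r1, uint32_t *r2, uint32_t *r3)
{
  *r0 = rol32 (*r0, 13);
  *r2 = rol32 (*r2, 3);
  *r1 ^= *r0 ^ *r2;
  *r3 ^= *r2 ^ (*r0 << 3);
  *r1 = rol32 (*r1, 1);
  *r3 = rol32 (*r3, 7);
  *r0 ^= *r1 ^ *r3;
  *r2 ^= *r3 ^ (*r1 << 7);
  *r0 = rol32 (*r0, 5);
  *r2 = rol32 (*r2, 22);
}

LINEAR_TRANSFORMATION_INVERSE undoes these steps in reverse order, which is why the ROUND_INVERSE macro applies it before the inverse S-box.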
*/ #define ROUND_LAST(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ SBOX (which, a0, a1, a2, a3, a4); \ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ SBOX (which, b0, b1, b2, b3, b4); \ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, ((round) + 1)); \ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, ((round) + 1)); /* Apply an inverse Serpent round to eight parallel blocks. This macro increments `round'. */ #define ROUND_INVERSE(round, which, a0, a1, a2, a3, a4, \ na0, na1, na2, na3, na4, \ b0, b1, b2, b3, b4, \ nb0, nb1, nb2, nb3, nb4) \ LINEAR_TRANSFORMATION_INVERSE (a0, a1, a2, a3, a4); \ LINEAR_TRANSFORMATION_INVERSE (b0, b1, b2, b3, b4); \ SBOX_INVERSE (which, a0, a1, a2, a3, a4); \ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round); \ SBOX_INVERSE (which, b0, b1, b2, b3, b4); \ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round); /* Apply the first inverse Serpent round to eight parallel blocks. This macro increments `round'. */ #define ROUND_FIRST_INVERSE(round, which, a0, a1, a2, a3, a4, \ na0, na1, na2, na3, na4, \ b0, b1, b2, b3, b4, \ nb0, nb1, nb2, nb3, nb4) \ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, ((round) + 1)); \ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, ((round) + 1)); \ SBOX_INVERSE (which, a0, a1, a2, a3, a4); \ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round); \ SBOX_INVERSE (which, b0, b1, b2, b3, b4); \ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round); .text .align 8 ELF(.type __serpent_enc_blk8,@function;) __serpent_enc_blk8: /* input: * %rdi: ctx, CTX * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext * blocks * output: * RA4, RA1, RA2, RA0, RB4, RB1, RB2, RB0: eight parallel * ciphertext blocks */ CFI_STARTPROC(); pcmpeqd RNOT, RNOT; transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1); transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); ROUND (0, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3, RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3); ROUND (1, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3, RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3); ROUND (2, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2, RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2); ROUND (3, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0, RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0); ROUND (4, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3, RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3); ROUND (5, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3, RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3); ROUND (6, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4, RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4); ROUND (7, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3, RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3); ROUND (8, 0, RA4, RA1, RA2, RA0, RA3, RA1, RA3, RA2, RA4, RA0, RB4, RB1, RB2, RB0, RB3, RB1, RB3, RB2, RB4, RB0); ROUND (9, 1, RA1, RA3, RA2, RA4, RA0, RA2, RA1, RA4, RA3, RA0, RB1, RB3, RB2, RB4, RB0, RB2, RB1, RB4, RB3, RB0); ROUND (10, 2, RA2, RA1, RA4, RA3, RA0, RA4, RA3, RA1, RA0, RA2, RB2, RB1, RB4, RB3, RB0, RB4, RB3, RB1, RB0, RB2); ROUND (11, 3, RA4, RA3, RA1, RA0, RA2, RA3, RA1, RA0, RA2, RA4, RB4, RB3, RB1, RB0, RB2, RB3, RB1, RB0, RB2, RB4); ROUND (12, 4, RA3, RA1, RA0, RA2, RA4, RA1, RA4, RA3, RA2, RA0, RB3, RB1, RB0, RB2, RB4, RB1, RB4, RB3, RB2, RB0); ROUND (13, 5, RA1, RA4, RA3, RA2, RA0, RA4, RA2, RA1, RA3, RA0, RB1, RB4, RB3, RB2, RB0, RB4, RB2, RB1, RB3, RB0); ROUND (14, 6, RA4, RA2, RA1, RA3, RA0, RA4, RA2, RA0, RA1, RA3, RB4, RB2, RB1, 
RB3, RB0, RB4, RB2, RB0, RB1, RB3); ROUND (15, 7, RA4, RA2, RA0, RA1, RA3, RA3, RA1, RA2, RA4, RA0, RB4, RB2, RB0, RB1, RB3, RB3, RB1, RB2, RB4, RB0); ROUND (16, 0, RA3, RA1, RA2, RA4, RA0, RA1, RA0, RA2, RA3, RA4, RB3, RB1, RB2, RB4, RB0, RB1, RB0, RB2, RB3, RB4); ROUND (17, 1, RA1, RA0, RA2, RA3, RA4, RA2, RA1, RA3, RA0, RA4, RB1, RB0, RB2, RB3, RB4, RB2, RB1, RB3, RB0, RB4); ROUND (18, 2, RA2, RA1, RA3, RA0, RA4, RA3, RA0, RA1, RA4, RA2, RB2, RB1, RB3, RB0, RB4, RB3, RB0, RB1, RB4, RB2); ROUND (19, 3, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA4, RA2, RA3, RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB4, RB2, RB3); ROUND (20, 4, RA0, RA1, RA4, RA2, RA3, RA1, RA3, RA0, RA2, RA4, RB0, RB1, RB4, RB2, RB3, RB1, RB3, RB0, RB2, RB4); ROUND (21, 5, RA1, RA3, RA0, RA2, RA4, RA3, RA2, RA1, RA0, RA4, RB1, RB3, RB0, RB2, RB4, RB3, RB2, RB1, RB0, RB4); ROUND (22, 6, RA3, RA2, RA1, RA0, RA4, RA3, RA2, RA4, RA1, RA0, RB3, RB2, RB1, RB0, RB4, RB3, RB2, RB4, RB1, RB0); ROUND (23, 7, RA3, RA2, RA4, RA1, RA0, RA0, RA1, RA2, RA3, RA4, RB3, RB2, RB4, RB1, RB0, RB0, RB1, RB2, RB3, RB4); ROUND (24, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3, RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3); ROUND (25, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3, RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3); ROUND (26, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2, RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2); ROUND (27, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0, RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0); ROUND (28, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3, RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3); ROUND (29, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3, RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3); ROUND (30, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4, RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4); ROUND_LAST (31, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3, RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3); transpose_4x4(RA4, RA1, RA2, RA0, RA3, RTMP0, RTMP1); transpose_4x4(RB4, RB1, RB2, RB0, RB3, RTMP0, RTMP1); ret_spec_stop; CFI_ENDPROC(); ELF(.size __serpent_enc_blk8,.-__serpent_enc_blk8;) .align 8 ELF(.type __serpent_dec_blk8,@function;) __serpent_dec_blk8: /* input: * %rdi: ctx, CTX * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel * ciphertext blocks * output: * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext * blocks */ CFI_STARTPROC(); pcmpeqd RNOT, RNOT; transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1); transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); ROUND_FIRST_INVERSE (31, 7, RA0, RA1, RA2, RA3, RA4, RA3, RA0, RA1, RA4, RA2, RB0, RB1, RB2, RB3, RB4, RB3, RB0, RB1, RB4, RB2); ROUND_INVERSE (30, 6, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA2, RA4, RA3, RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB2, RB4, RB3); ROUND_INVERSE (29, 5, RA0, RA1, RA2, RA4, RA3, RA1, RA3, RA4, RA2, RA0, RB0, RB1, RB2, RB4, RB3, RB1, RB3, RB4, RB2, RB0); ROUND_INVERSE (28, 4, RA1, RA3, RA4, RA2, RA0, RA1, RA2, RA4, RA0, RA3, RB1, RB3, RB4, RB2, RB0, RB1, RB2, RB4, RB0, RB3); ROUND_INVERSE (27, 3, RA1, RA2, RA4, RA0, RA3, RA4, RA2, RA0, RA1, RA3, RB1, RB2, RB4, RB0, RB3, RB4, RB2, RB0, RB1, RB3); ROUND_INVERSE (26, 2, RA4, RA2, RA0, RA1, RA3, RA2, RA3, RA0, RA1, RA4, RB4, RB2, RB0, RB1, RB3, RB2, RB3, RB0, RB1, RB4); ROUND_INVERSE (25, 1, RA2, RA3, RA0, RA1, RA4, RA4, RA2, RA1, RA0, RA3, RB2, RB3, RB0, RB1, RB4, RB4, RB2, RB1, RB0, RB3); ROUND_INVERSE (24, 0, RA4, RA2, RA1, RA0, RA3, RA4, RA3, RA2, RA0, RA1, RB4, RB2, RB1, RB0, 
RB3, RB4, RB3, RB2, RB0, RB1); ROUND_INVERSE (23, 7, RA4, RA3, RA2, RA0, RA1, RA0, RA4, RA3, RA1, RA2, RB4, RB3, RB2, RB0, RB1, RB0, RB4, RB3, RB1, RB2); ROUND_INVERSE (22, 6, RA0, RA4, RA3, RA1, RA2, RA4, RA3, RA2, RA1, RA0, RB0, RB4, RB3, RB1, RB2, RB4, RB3, RB2, RB1, RB0); ROUND_INVERSE (21, 5, RA4, RA3, RA2, RA1, RA0, RA3, RA0, RA1, RA2, RA4, RB4, RB3, RB2, RB1, RB0, RB3, RB0, RB1, RB2, RB4); ROUND_INVERSE (20, 4, RA3, RA0, RA1, RA2, RA4, RA3, RA2, RA1, RA4, RA0, RB3, RB0, RB1, RB2, RB4, RB3, RB2, RB1, RB4, RB0); ROUND_INVERSE (19, 3, RA3, RA2, RA1, RA4, RA0, RA1, RA2, RA4, RA3, RA0, RB3, RB2, RB1, RB4, RB0, RB1, RB2, RB4, RB3, RB0); ROUND_INVERSE (18, 2, RA1, RA2, RA4, RA3, RA0, RA2, RA0, RA4, RA3, RA1, RB1, RB2, RB4, RB3, RB0, RB2, RB0, RB4, RB3, RB1); ROUND_INVERSE (17, 1, RA2, RA0, RA4, RA3, RA1, RA1, RA2, RA3, RA4, RA0, RB2, RB0, RB4, RB3, RB1, RB1, RB2, RB3, RB4, RB0); ROUND_INVERSE (16, 0, RA1, RA2, RA3, RA4, RA0, RA1, RA0, RA2, RA4, RA3, RB1, RB2, RB3, RB4, RB0, RB1, RB0, RB2, RB4, RB3); ROUND_INVERSE (15, 7, RA1, RA0, RA2, RA4, RA3, RA4, RA1, RA0, RA3, RA2, RB1, RB0, RB2, RB4, RB3, RB4, RB1, RB0, RB3, RB2); ROUND_INVERSE (14, 6, RA4, RA1, RA0, RA3, RA2, RA1, RA0, RA2, RA3, RA4, RB4, RB1, RB0, RB3, RB2, RB1, RB0, RB2, RB3, RB4); ROUND_INVERSE (13, 5, RA1, RA0, RA2, RA3, RA4, RA0, RA4, RA3, RA2, RA1, RB1, RB0, RB2, RB3, RB4, RB0, RB4, RB3, RB2, RB1); ROUND_INVERSE (12, 4, RA0, RA4, RA3, RA2, RA1, RA0, RA2, RA3, RA1, RA4, RB0, RB4, RB3, RB2, RB1, RB0, RB2, RB3, RB1, RB4); ROUND_INVERSE (11, 3, RA0, RA2, RA3, RA1, RA4, RA3, RA2, RA1, RA0, RA4, RB0, RB2, RB3, RB1, RB4, RB3, RB2, RB1, RB0, RB4); ROUND_INVERSE (10, 2, RA3, RA2, RA1, RA0, RA4, RA2, RA4, RA1, RA0, RA3, RB3, RB2, RB1, RB0, RB4, RB2, RB4, RB1, RB0, RB3); ROUND_INVERSE (9, 1, RA2, RA4, RA1, RA0, RA3, RA3, RA2, RA0, RA1, RA4, RB2, RB4, RB1, RB0, RB3, RB3, RB2, RB0, RB1, RB4); ROUND_INVERSE (8, 0, RA3, RA2, RA0, RA1, RA4, RA3, RA4, RA2, RA1, RA0, RB3, RB2, RB0, RB1, RB4, RB3, RB4, RB2, RB1, RB0); ROUND_INVERSE (7, 7, RA3, RA4, RA2, RA1, RA0, RA1, RA3, RA4, RA0, RA2, RB3, RB4, RB2, RB1, RB0, RB1, RB3, RB4, RB0, RB2); ROUND_INVERSE (6, 6, RA1, RA3, RA4, RA0, RA2, RA3, RA4, RA2, RA0, RA1, RB1, RB3, RB4, RB0, RB2, RB3, RB4, RB2, RB0, RB1); ROUND_INVERSE (5, 5, RA3, RA4, RA2, RA0, RA1, RA4, RA1, RA0, RA2, RA3, RB3, RB4, RB2, RB0, RB1, RB4, RB1, RB0, RB2, RB3); ROUND_INVERSE (4, 4, RA4, RA1, RA0, RA2, RA3, RA4, RA2, RA0, RA3, RA1, RB4, RB1, RB0, RB2, RB3, RB4, RB2, RB0, RB3, RB1); ROUND_INVERSE (3, 3, RA4, RA2, RA0, RA3, RA1, RA0, RA2, RA3, RA4, RA1, RB4, RB2, RB0, RB3, RB1, RB0, RB2, RB3, RB4, RB1); ROUND_INVERSE (2, 2, RA0, RA2, RA3, RA4, RA1, RA2, RA1, RA3, RA4, RA0, RB0, RB2, RB3, RB4, RB1, RB2, RB1, RB3, RB4, RB0); ROUND_INVERSE (1, 1, RA2, RA1, RA3, RA4, RA0, RA0, RA2, RA4, RA3, RA1, RB2, RB1, RB3, RB4, RB0, RB0, RB2, RB4, RB3, RB1); ROUND_INVERSE (0, 0, RA0, RA2, RA4, RA3, RA1, RA0, RA1, RA2, RA3, RA4, RB0, RB2, RB4, RB3, RB1, RB0, RB1, RB2, RB3, RB4); transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1); transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); ret_spec_stop; CFI_ENDPROC(); ELF(.size __serpent_dec_blk8,.-__serpent_dec_blk8;) +.align 8 +.globl _gcry_serpent_sse2_blk8 +ELF(.type _gcry_serpent_sse2_blk8,@function;) +_gcry_serpent_sse2_blk8: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (8 blocks) + * %rdx: src (8 blocks) + * %ecx: encrypt + */ + CFI_STARTPROC(); + + movdqu (0 * 16)(%rdx), RA0; + movdqu (1 * 16)(%rdx), RA1; + movdqu (2 * 16)(%rdx), RA2; + movdqu (3 * 16)(%rdx), RA3; + movdqu (4 * 16)(%rdx), RB0; + 
movdqu (5 * 16)(%rdx), RB1; + movdqu (6 * 16)(%rdx), RB2; + movdqu (7 * 16)(%rdx), RB3; + + testl %ecx, %ecx; + jz .Lblk8_dec; + call __serpent_enc_blk8; + movdqu RA4, (0 * 16)(%rsi); + movdqu RA1, (1 * 16)(%rsi); + movdqu RA2, (2 * 16)(%rsi); + movdqu RA0, (3 * 16)(%rsi); + movdqu RB4, (4 * 16)(%rsi); + movdqu RB1, (5 * 16)(%rsi); + movdqu RB2, (6 * 16)(%rsi); + movdqu RB0, (7 * 16)(%rsi); + jmp .Lblk8_end; + .Lblk8_dec: + call __serpent_dec_blk8; + movdqu RA0, (0 * 16)(%rsi); + movdqu RA1, (1 * 16)(%rsi); + movdqu RA2, (2 * 16)(%rsi); + movdqu RA3, (3 * 16)(%rsi); + movdqu RB0, (4 * 16)(%rsi); + movdqu RB1, (5 * 16)(%rsi); + movdqu RB2, (6 * 16)(%rsi); + movdqu RB3, (7 * 16)(%rsi); + +.Lblk8_end: + /* clear the used registers */ + pxor RA0, RA0; + pxor RA1, RA1; + pxor RA2, RA2; + pxor RA3, RA3; + pxor RA4, RA4; + pxor RB0, RB0; + pxor RB1, RB1; + pxor RB2, RB2; + pxor RB3, RB3; + pxor RB4, RB4; + pxor RTMP0, RTMP0; + pxor RTMP1, RTMP1; + pxor RTMP2, RTMP2; + pxor RNOT, RNOT; + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_serpent_sse2_blk8,.-_gcry_serpent_sse2_blk8;) + .align 8 .globl _gcry_serpent_sse2_ctr_enc ELF(.type _gcry_serpent_sse2_ctr_enc,@function;) _gcry_serpent_sse2_ctr_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (8 blocks) * %rdx: src (8 blocks) * %rcx: iv (big endian, 128bit) */ CFI_STARTPROC(); /* load IV and byteswap */ movdqu (%rcx), RA0; movdqa RA0, RTMP0; pbswap(RTMP0, RTMP1); /* be => le */ pcmpeqd RNOT, RNOT; psrldq $8, RNOT; /* low: -1, high: 0 */ movdqa RNOT, RTMP2; paddq RTMP2, RTMP2; /* low: -2, high: 0 */ /* construct IVs */ movdqa RTMP0, RTMP1; psubq RNOT, RTMP0; /* +1 */ movdqa RTMP0, RA1; psubq RTMP2, RTMP1; /* +2 */ movdqa RTMP1, RA2; psubq RTMP2, RTMP0; /* +3 */ movdqa RTMP0, RA3; psubq RTMP2, RTMP1; /* +4 */ movdqa RTMP1, RB0; psubq RTMP2, RTMP0; /* +5 */ movdqa RTMP0, RB1; psubq RTMP2, RTMP1; /* +6 */ movdqa RTMP1, RB2; psubq RTMP2, RTMP0; /* +7 */ movdqa RTMP0, RB3; psubq RTMP2, RTMP1; /* +8 */ /* check need for handling 64-bit overflow and carry */ cmpl $0xffffffff, 8(%rcx); jne .Lno_ctr_carry; movl 12(%rcx), %eax; bswapl %eax; cmpl $-8, %eax; jb .Lno_ctr_carry; pslldq $8, RNOT; /* low: 0, high: -1 */ je .Lcarry_RTMP0; cmpl $-6, %eax; jb .Lcarry_RB3; je .Lcarry_RB2; cmpl $-4, %eax; jb .Lcarry_RB1; je .Lcarry_RB0; cmpl $-2, %eax; jb .Lcarry_RA3; je .Lcarry_RA2; psubq RNOT, RA1; .Lcarry_RA2: psubq RNOT, RA2; .Lcarry_RA3: psubq RNOT, RA3; .Lcarry_RB0: psubq RNOT, RB0; .Lcarry_RB1: psubq RNOT, RB1; .Lcarry_RB2: psubq RNOT, RB2; .Lcarry_RB3: psubq RNOT, RB3; .Lcarry_RTMP0: psubq RNOT, RTMP1; .Lno_ctr_carry: /* le => be */ pbswap(RA1, RTMP0); pbswap(RA2, RTMP0); pbswap(RA3, RTMP0); pbswap(RB0, RTMP0); pbswap(RB1, RTMP0); pbswap(RB2, RTMP0); pbswap(RB3, RTMP0); pbswap(RTMP1, RTMP0); /* store new IV */ movdqu RTMP1, (%rcx); call __serpent_enc_blk8; pxor_u((0 * 16)(%rdx), RA4, RTMP0); pxor_u((1 * 16)(%rdx), RA1, RTMP0); pxor_u((2 * 16)(%rdx), RA2, RTMP0); pxor_u((3 * 16)(%rdx), RA0, RTMP0); pxor_u((4 * 16)(%rdx), RB4, RTMP0); pxor_u((5 * 16)(%rdx), RB1, RTMP0); pxor_u((6 * 16)(%rdx), RB2, RTMP0); pxor_u((7 * 16)(%rdx), RB0, RTMP0); movdqu RA4, (0 * 16)(%rsi); movdqu RA1, (1 * 16)(%rsi); movdqu RA2, (2 * 16)(%rsi); movdqu RA0, (3 * 16)(%rsi); movdqu RB4, (4 * 16)(%rsi); movdqu RB1, (5 * 16)(%rsi); movdqu RB2, (6 * 16)(%rsi); movdqu RB0, (7 * 16)(%rsi); /* clear the used registers */ pxor RA0, RA0; pxor RA1, RA1; pxor RA2, RA2; pxor RA3, RA3; pxor RA4, RA4; pxor RB0, RB0; pxor RB1, RB1; pxor RB2, RB2; pxor RB3, RB3; pxor RB4, RB4; pxor RTMP0, RTMP0; pxor 
RTMP1, RTMP1; pxor RTMP2, RTMP2; pxor RNOT, RNOT; ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_ctr_enc,.-_gcry_serpent_sse2_ctr_enc;) .align 8 .globl _gcry_serpent_sse2_cbc_dec ELF(.type _gcry_serpent_sse2_cbc_dec,@function;) _gcry_serpent_sse2_cbc_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (8 blocks) * %rdx: src (8 blocks) * %rcx: iv */ CFI_STARTPROC(); movdqu (0 * 16)(%rdx), RA0; movdqu (1 * 16)(%rdx), RA1; movdqu (2 * 16)(%rdx), RA2; movdqu (3 * 16)(%rdx), RA3; movdqu (4 * 16)(%rdx), RB0; movdqu (5 * 16)(%rdx), RB1; movdqu (6 * 16)(%rdx), RB2; movdqu (7 * 16)(%rdx), RB3; call __serpent_dec_blk8; movdqu (7 * 16)(%rdx), RNOT; pxor_u((%rcx), RA0, RTMP0); pxor_u((0 * 16)(%rdx), RA1, RTMP0); pxor_u((1 * 16)(%rdx), RA2, RTMP0); pxor_u((2 * 16)(%rdx), RA3, RTMP0); pxor_u((3 * 16)(%rdx), RB0, RTMP0); pxor_u((4 * 16)(%rdx), RB1, RTMP0); pxor_u((5 * 16)(%rdx), RB2, RTMP0); pxor_u((6 * 16)(%rdx), RB3, RTMP0); movdqu RNOT, (%rcx); /* store new IV */ movdqu RA0, (0 * 16)(%rsi); movdqu RA1, (1 * 16)(%rsi); movdqu RA2, (2 * 16)(%rsi); movdqu RA3, (3 * 16)(%rsi); movdqu RB0, (4 * 16)(%rsi); movdqu RB1, (5 * 16)(%rsi); movdqu RB2, (6 * 16)(%rsi); movdqu RB3, (7 * 16)(%rsi); /* clear the used registers */ pxor RA0, RA0; pxor RA1, RA1; pxor RA2, RA2; pxor RA3, RA3; pxor RA4, RA4; pxor RB0, RB0; pxor RB1, RB1; pxor RB2, RB2; pxor RB3, RB3; pxor RB4, RB4; pxor RTMP0, RTMP0; pxor RTMP1, RTMP1; pxor RTMP2, RTMP2; pxor RNOT, RNOT; ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_cbc_dec,.-_gcry_serpent_sse2_cbc_dec;) .align 8 .globl _gcry_serpent_sse2_cfb_dec ELF(.type _gcry_serpent_sse2_cfb_dec,@function;) _gcry_serpent_sse2_cfb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (8 blocks) * %rdx: src (8 blocks) * %rcx: iv */ CFI_STARTPROC(); /* Load input */ movdqu (%rcx), RA0; movdqu 0 * 16(%rdx), RA1; movdqu 1 * 16(%rdx), RA2; movdqu 2 * 16(%rdx), RA3; movdqu 3 * 16(%rdx), RB0; movdqu 4 * 16(%rdx), RB1; movdqu 5 * 16(%rdx), RB2; movdqu 6 * 16(%rdx), RB3; /* Update IV */ movdqu 7 * 16(%rdx), RNOT; movdqu RNOT, (%rcx); call __serpent_enc_blk8; pxor_u((0 * 16)(%rdx), RA4, RTMP0); pxor_u((1 * 16)(%rdx), RA1, RTMP0); pxor_u((2 * 16)(%rdx), RA2, RTMP0); pxor_u((3 * 16)(%rdx), RA0, RTMP0); pxor_u((4 * 16)(%rdx), RB4, RTMP0); pxor_u((5 * 16)(%rdx), RB1, RTMP0); pxor_u((6 * 16)(%rdx), RB2, RTMP0); pxor_u((7 * 16)(%rdx), RB0, RTMP0); movdqu RA4, (0 * 16)(%rsi); movdqu RA1, (1 * 16)(%rsi); movdqu RA2, (2 * 16)(%rsi); movdqu RA0, (3 * 16)(%rsi); movdqu RB4, (4 * 16)(%rsi); movdqu RB1, (5 * 16)(%rsi); movdqu RB2, (6 * 16)(%rsi); movdqu RB0, (7 * 16)(%rsi); /* clear the used registers */ pxor RA0, RA0; pxor RA1, RA1; pxor RA2, RA2; pxor RA3, RA3; pxor RA4, RA4; pxor RB0, RB0; pxor RB1, RB1; pxor RB2, RB2; pxor RB3, RB3; pxor RB4, RB4; pxor RTMP0, RTMP0; pxor RTMP1, RTMP1; pxor RTMP2, RTMP2; pxor RNOT, RNOT; ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_cfb_dec,.-_gcry_serpent_sse2_cfb_dec;) .align 8 .globl _gcry_serpent_sse2_ocb_enc ELF(.type _gcry_serpent_sse2_ocb_enc,@function;) _gcry_serpent_sse2_ocb_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (8 blocks) * %rdx: src (8 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[8]) */ CFI_STARTPROC(); subq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); CFI_REL_OFFSET(%r10, 0 * 8); CFI_REL_OFFSET(%r11, 1 * 8); CFI_REL_OFFSET(%r12, 2 * 8); CFI_REL_OFFSET(%r13, 3 * 8); movdqu (%rcx), RTMP0; movdqu (%r8), RTMP1; /* Offset_i = 
Offset_{i-1} xor L_{ntz(i)} */ /* Checksum_i = Checksum_{i-1} xor P_i */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ #define OCB_INPUT(n, lreg, xreg) \ movdqu (n * 16)(%rdx), xreg; \ movdqu (lreg), RNOT; \ pxor RNOT, RTMP0; \ pxor xreg, RTMP1; \ pxor RTMP0, xreg; \ movdqu RTMP0, (n * 16)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq (2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, RA0); OCB_INPUT(1, %r11, RA1); OCB_INPUT(2, %r12, RA2); OCB_INPUT(3, %r13, RA3); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(4, %r10, RB0); OCB_INPUT(5, %r11, RB1); OCB_INPUT(6, %r12, RB2); OCB_INPUT(7, %r13, RB3); #undef OCB_INPUT movdqu RTMP0, (%rcx); movdqu RTMP1, (%r8); movq (0 * 8)(%rsp), %r10; movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; CFI_RESTORE(%r10); CFI_RESTORE(%r11); CFI_RESTORE(%r12); CFI_RESTORE(%r13); call __serpent_enc_blk8; addq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(-4 * 8); pxor_u((0 * 16)(%rsi), RA4, RTMP0); pxor_u((1 * 16)(%rsi), RA1, RTMP0); pxor_u((2 * 16)(%rsi), RA2, RTMP0); pxor_u((3 * 16)(%rsi), RA0, RTMP0); pxor_u((4 * 16)(%rsi), RB4, RTMP0); pxor_u((5 * 16)(%rsi), RB1, RTMP0); pxor_u((6 * 16)(%rsi), RB2, RTMP0); pxor_u((7 * 16)(%rsi), RB0, RTMP0); movdqu RA4, (0 * 16)(%rsi); movdqu RA1, (1 * 16)(%rsi); movdqu RA2, (2 * 16)(%rsi); movdqu RA0, (3 * 16)(%rsi); movdqu RB4, (4 * 16)(%rsi); movdqu RB1, (5 * 16)(%rsi); movdqu RB2, (6 * 16)(%rsi); movdqu RB0, (7 * 16)(%rsi); /* clear the used registers */ pxor RA0, RA0; pxor RA1, RA1; pxor RA2, RA2; pxor RA3, RA3; pxor RA4, RA4; pxor RB0, RB0; pxor RB1, RB1; pxor RB2, RB2; pxor RB3, RB3; pxor RB4, RB4; pxor RTMP0, RTMP0; pxor RTMP1, RTMP1; pxor RTMP2, RTMP2; pxor RNOT, RNOT; ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_ocb_enc,.-_gcry_serpent_sse2_ocb_enc;) .align 8 .globl _gcry_serpent_sse2_ocb_dec ELF(.type _gcry_serpent_sse2_ocb_dec,@function;) _gcry_serpent_sse2_ocb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (8 blocks) * %rdx: src (8 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[8]) */ CFI_STARTPROC(); subq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); CFI_REL_OFFSET(%r10, 0 * 8); CFI_REL_OFFSET(%r11, 1 * 8); CFI_REL_OFFSET(%r12, 2 * 8); CFI_REL_OFFSET(%r13, 3 * 8); movdqu (%rcx), RTMP0; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ #define OCB_INPUT(n, lreg, xreg) \ movdqu (n * 16)(%rdx), xreg; \ movdqu (lreg), RNOT; \ pxor RNOT, RTMP0; \ pxor RTMP0, xreg; \ movdqu RTMP0, (n * 16)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq (2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, RA0); OCB_INPUT(1, %r11, RA1); OCB_INPUT(2, %r12, RA2); OCB_INPUT(3, %r13, RA3); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(4, %r10, RB0); OCB_INPUT(5, %r11, RB1); OCB_INPUT(6, %r12, RB2); OCB_INPUT(7, %r13, RB3); #undef OCB_INPUT movdqu RTMP0, (%rcx); movq (0 * 8)(%rsp), %r10; movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; CFI_RESTORE(%r10); CFI_RESTORE(%r11); CFI_RESTORE(%r12); CFI_RESTORE(%r13); call __serpent_dec_blk8; addq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(-4 * 8); movdqu (%r8), RTMP0; pxor_u((0 * 16)(%rsi), RA0, RTMP1); pxor_u((1 * 16)(%rsi), RA1, RTMP1); pxor_u((2 * 16)(%rsi), RA2, RTMP1); pxor_u((3 * 
16)(%rsi), RA3, RTMP1); pxor_u((4 * 16)(%rsi), RB0, RTMP1); pxor_u((5 * 16)(%rsi), RB1, RTMP1); pxor_u((6 * 16)(%rsi), RB2, RTMP1); pxor_u((7 * 16)(%rsi), RB3, RTMP1); /* Checksum_i = Checksum_{i-1} xor P_i */ movdqu RA0, (0 * 16)(%rsi); pxor RA0, RTMP0; movdqu RA1, (1 * 16)(%rsi); pxor RA1, RTMP0; movdqu RA2, (2 * 16)(%rsi); pxor RA2, RTMP0; movdqu RA3, (3 * 16)(%rsi); pxor RA3, RTMP0; movdqu RB0, (4 * 16)(%rsi); pxor RB0, RTMP0; movdqu RB1, (5 * 16)(%rsi); pxor RB1, RTMP0; movdqu RB2, (6 * 16)(%rsi); pxor RB2, RTMP0; movdqu RB3, (7 * 16)(%rsi); pxor RB3, RTMP0; movdqu RTMP0, (%r8); /* clear the used registers */ pxor RA0, RA0; pxor RA1, RA1; pxor RA2, RA2; pxor RA3, RA3; pxor RA4, RA4; pxor RB0, RB0; pxor RB1, RB1; pxor RB2, RB2; pxor RB3, RB3; pxor RB4, RB4; pxor RTMP0, RTMP0; pxor RTMP1, RTMP1; pxor RTMP2, RTMP2; pxor RNOT, RNOT; ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_ocb_dec,.-_gcry_serpent_sse2_ocb_dec;) .align 8 .globl _gcry_serpent_sse2_ocb_auth ELF(.type _gcry_serpent_sse2_ocb_auth,@function;) _gcry_serpent_sse2_ocb_auth: /* input: * %rdi: ctx, CTX * %rsi: abuf (8 blocks) * %rdx: offset * %rcx: checksum * %r8 : L pointers (void *L[8]) */ CFI_STARTPROC(); subq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); CFI_REL_OFFSET(%r10, 0 * 8); CFI_REL_OFFSET(%r11, 1 * 8); CFI_REL_OFFSET(%r12, 2 * 8); CFI_REL_OFFSET(%r13, 3 * 8); movdqu (%rdx), RTMP0; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ #define OCB_INPUT(n, lreg, xreg) \ movdqu (n * 16)(%rsi), xreg; \ movdqu (lreg), RNOT; \ pxor RNOT, RTMP0; \ pxor RTMP0, xreg; movq (0 * 8)(%r8), %r10; movq (1 * 8)(%r8), %r11; movq (2 * 8)(%r8), %r12; movq (3 * 8)(%r8), %r13; OCB_INPUT(0, %r10, RA0); OCB_INPUT(1, %r11, RA1); OCB_INPUT(2, %r12, RA2); OCB_INPUT(3, %r13, RA3); movq (4 * 8)(%r8), %r10; movq (5 * 8)(%r8), %r11; movq (6 * 8)(%r8), %r12; movq (7 * 8)(%r8), %r13; OCB_INPUT(4, %r10, RB0); OCB_INPUT(5, %r11, RB1); OCB_INPUT(6, %r12, RB2); OCB_INPUT(7, %r13, RB3); #undef OCB_INPUT movdqu RTMP0, (%rdx); movq (0 * 8)(%rsp), %r10; movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; CFI_RESTORE(%r10); CFI_RESTORE(%r11); CFI_RESTORE(%r12); CFI_RESTORE(%r13); call __serpent_enc_blk8; addq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(-4 * 8); movdqu (%rcx), RTMP0; pxor RB4, RA4; pxor RB1, RA1; pxor RB2, RA2; pxor RB0, RA0; pxor RTMP0, RA2; pxor RA4, RA1; pxor RA2, RA0; pxor RA1, RA0; movdqu RA0, (%rcx); /* clear the used registers */ pxor RA0, RA0; pxor RA1, RA1; pxor RA2, RA2; pxor RA3, RA3; pxor RA4, RA4; pxor RB0, RB0; pxor RB1, RB1; pxor RB2, RB2; pxor RB3, RB3; pxor RB4, RB4; pxor RTMP0, RTMP0; pxor RTMP1, RTMP1; pxor RTMP2, RTMP2; pxor RNOT, RNOT; ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_ocb_auth,.-_gcry_serpent_sse2_ocb_auth;) #endif /*defined(USE_SERPENT)*/ #endif /*__x86_64*/ diff --git a/cipher/serpent.c b/cipher/serpent.c index 93c561c5..0a9ed27c 100644 --- a/cipher/serpent.c +++ b/cipher/serpent.c @@ -1,1692 +1,1837 @@ /* serpent.c - Implementation of the Serpent encryption algorithm. * Copyright (C) 2003, 2004, 2005 Free Software Foundation, Inc. * * This file is part of Libgcrypt. 
* * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser general Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 02111-1307, USA. */ #include #include #include #include "types.h" #include "g10lib.h" #include "cipher.h" #include "bithelp.h" #include "bufhelp.h" #include "cipher-internal.h" #include "bulkhelp.h" /* USE_SSE2 indicates whether to compile with AMD64 SSE2 code. */ #undef USE_SSE2 #if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_SSE2 1 #endif /* USE_AVX2 indicates whether to compile with AMD64 AVX2 code. */ #undef USE_AVX2 #if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # if defined(ENABLE_AVX2_SUPPORT) # define USE_AVX2 1 # endif #endif /* USE_NEON indicates whether to enable ARM NEON assembly code. */ #undef USE_NEON #ifdef ENABLE_NEON_SUPPORT # if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \ && defined(HAVE_GCC_INLINE_ASM_NEON) # define USE_NEON 1 # endif #endif /*ENABLE_NEON_SUPPORT*/ /* Number of rounds per Serpent encrypt/decrypt operation. */ #define ROUNDS 32 /* Magic number, used during generating of the subkeys. */ #define PHI 0x9E3779B9 /* Serpent works on 128 bit blocks. */ typedef u32 serpent_block_t[4]; /* Serpent key, provided by the user. If the original key is shorter than 256 bits, it is padded. */ typedef u32 serpent_key_t[8]; /* The key schedule consists of 33 128 bit subkeys. */ typedef u32 serpent_subkeys_t[ROUNDS + 1][4]; /* A Serpent context. */ typedef struct serpent_context { serpent_subkeys_t keys; /* Generated subkeys. */ #ifdef USE_AVX2 int use_avx2; #endif #ifdef USE_NEON int use_neon; #endif } serpent_context_t; /* Assembly implementations use SystemV ABI, ABI conversion and additional * stack to store XMM6-XMM15 needed on Win64. */ #undef ASM_FUNC_ABI #if defined(USE_SSE2) || defined(USE_AVX2) # ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS # define ASM_FUNC_ABI __attribute__((sysv_abi)) # else # define ASM_FUNC_ABI # endif #endif #ifdef USE_SSE2 /* Assembler implementations of Serpent using SSE2. Process 8 block in parallel. 
*/ extern void _gcry_serpent_sse2_ctr_enc(serpent_context_t *ctx, unsigned char *out, const unsigned char *in, unsigned char *ctr) ASM_FUNC_ABI; extern void _gcry_serpent_sse2_cbc_dec(serpent_context_t *ctx, unsigned char *out, const unsigned char *in, unsigned char *iv) ASM_FUNC_ABI; extern void _gcry_serpent_sse2_cfb_dec(serpent_context_t *ctx, unsigned char *out, const unsigned char *in, unsigned char *iv) ASM_FUNC_ABI; extern void _gcry_serpent_sse2_ocb_enc(serpent_context_t *ctx, unsigned char *out, const unsigned char *in, unsigned char *offset, unsigned char *checksum, const u64 Ls[8]) ASM_FUNC_ABI; extern void _gcry_serpent_sse2_ocb_dec(serpent_context_t *ctx, unsigned char *out, const unsigned char *in, unsigned char *offset, unsigned char *checksum, const u64 Ls[8]) ASM_FUNC_ABI; extern void _gcry_serpent_sse2_ocb_auth(serpent_context_t *ctx, const unsigned char *abuf, unsigned char *offset, unsigned char *checksum, const u64 Ls[8]) ASM_FUNC_ABI; + +extern void _gcry_serpent_sse2_blk8(const serpent_context_t *c, byte *out, + const byte *in, int encrypt) ASM_FUNC_ABI; #endif #ifdef USE_AVX2 /* Assembler implementations of Serpent using AVX2. Process 16 block in parallel. */ extern void _gcry_serpent_avx2_ctr_enc(serpent_context_t *ctx, unsigned char *out, const unsigned char *in, unsigned char *ctr) ASM_FUNC_ABI; extern void _gcry_serpent_avx2_cbc_dec(serpent_context_t *ctx, unsigned char *out, const unsigned char *in, unsigned char *iv) ASM_FUNC_ABI; extern void _gcry_serpent_avx2_cfb_dec(serpent_context_t *ctx, unsigned char *out, const unsigned char *in, unsigned char *iv) ASM_FUNC_ABI; extern void _gcry_serpent_avx2_ocb_enc(serpent_context_t *ctx, unsigned char *out, const unsigned char *in, unsigned char *offset, unsigned char *checksum, const u64 Ls[16]) ASM_FUNC_ABI; extern void _gcry_serpent_avx2_ocb_dec(serpent_context_t *ctx, unsigned char *out, const unsigned char *in, unsigned char *offset, unsigned char *checksum, const u64 Ls[16]) ASM_FUNC_ABI; extern void _gcry_serpent_avx2_ocb_auth(serpent_context_t *ctx, const unsigned char *abuf, unsigned char *offset, unsigned char *checksum, const u64 Ls[16]) ASM_FUNC_ABI; + +extern void _gcry_serpent_avx2_blk16(const serpent_context_t *c, byte *out, + const byte *in, int encrypt) ASM_FUNC_ABI; #endif #ifdef USE_NEON /* Assembler implementations of Serpent using ARM NEON. Process 8 block in parallel. */ extern void _gcry_serpent_neon_ctr_enc(serpent_context_t *ctx, unsigned char *out, const unsigned char *in, unsigned char *ctr); extern void _gcry_serpent_neon_cbc_dec(serpent_context_t *ctx, unsigned char *out, const unsigned char *in, unsigned char *iv); extern void _gcry_serpent_neon_cfb_dec(serpent_context_t *ctx, unsigned char *out, const unsigned char *in, unsigned char *iv); extern void _gcry_serpent_neon_ocb_enc(serpent_context_t *ctx, unsigned char *out, const unsigned char *in, unsigned char *offset, unsigned char *checksum, const void *Ls[8]); extern void _gcry_serpent_neon_ocb_dec(serpent_context_t *ctx, unsigned char *out, const unsigned char *in, unsigned char *offset, unsigned char *checksum, const void *Ls[8]); extern void _gcry_serpent_neon_ocb_auth(serpent_context_t *ctx, const unsigned char *abuf, unsigned char *offset, unsigned char *checksum, const void *Ls[8]); + +extern void _gcry_serpent_neon_blk8(const serpent_context_t *c, byte *out, + const byte *in, int encrypt); #endif /* Prototypes. 
*/ static const char *serpent_test (void); static void _gcry_serpent_ctr_enc (void *context, unsigned char *ctr, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); static void _gcry_serpent_cbc_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); static void _gcry_serpent_cfb_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); static size_t _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); static size_t _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks); +static void _gcry_serpent_xts_crypt (void *context, unsigned char *tweak, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int encrypt); +static void _gcry_serpent_ecb_crypt (void *context, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, + int encrypt); /* * These are the S-Boxes of Serpent from following research paper. * * D. A. Osvik, “Speeding up Serpent,” in Third AES Candidate Conference, * (New York, New York, USA), p. 317–329, National Institute of Standards and * Technology, 2000. * * Paper is also available at: http://www.ii.uib.no/~osvik/pub/aes3.pdf * */ #define SBOX0(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r3 ^= r0; r4 = r1; \ r1 &= r3; r4 ^= r2; \ r1 ^= r0; r0 |= r3; \ r0 ^= r4; r4 ^= r3; \ r3 ^= r2; r2 |= r1; \ r2 ^= r4; r4 = ~r4; \ r4 |= r1; r1 ^= r3; \ r1 ^= r4; r3 |= r0; \ r1 ^= r3; r4 ^= r3; \ \ w = r1; x = r4; y = r2; z = r0; \ } #define SBOX0_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r2 = ~r2; r4 = r1; \ r1 |= r0; r4 = ~r4; \ r1 ^= r2; r2 |= r4; \ r1 ^= r3; r0 ^= r4; \ r2 ^= r0; r0 &= r3; \ r4 ^= r0; r0 |= r1; \ r0 ^= r2; r3 ^= r4; \ r2 ^= r1; r3 ^= r0; \ r3 ^= r1; \ r2 &= r3; \ r4 ^= r2; \ \ w = r0; x = r4; y = r1; z = r3; \ } #define SBOX1(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r0 = ~r0; r2 = ~r2; \ r4 = r0; r0 &= r1; \ r2 ^= r0; r0 |= r3; \ r3 ^= r2; r1 ^= r0; \ r0 ^= r4; r4 |= r1; \ r1 ^= r3; r2 |= r0; \ r2 &= r4; r0 ^= r1; \ r1 &= r2; \ r1 ^= r0; r0 &= r2; \ r0 ^= r4; \ \ w = r2; x = r0; y = r3; z = r1; \ } #define SBOX1_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r4 = r1; r1 ^= r3; \ r3 &= r1; r4 ^= r2; \ r3 ^= r0; r0 |= r1; \ r2 ^= r3; r0 ^= r4; \ r0 |= r2; r1 ^= r3; \ r0 ^= r1; r1 |= r3; \ r1 ^= r0; r4 = ~r4; \ r4 ^= r1; r1 |= r0; \ r1 ^= r0; \ r1 |= r4; \ r3 ^= r1; \ \ w = r4; x = r0; y = r3; z = r2; \ } #define SBOX2(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r4 = r0; r0 &= r2; \ r0 ^= r3; r2 ^= r1; \ r2 ^= r0; r3 |= r4; \ r3 ^= r1; r4 ^= r2; \ r1 = r3; r3 |= r4; \ r3 ^= r0; r0 &= r1; \ r4 ^= r0; r1 ^= r3; \ r1 ^= r4; r4 = ~r4; \ \ w = r2; x = r3; y = r1; z = r4; \ } #define SBOX2_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r2 ^= r3; r3 ^= r0; \ r4 = r3; r3 &= r2; \ r3 ^= r1; r1 |= r2; \ r1 ^= r4; r4 &= r3; \ r2 ^= r3; r4 &= r0; \ r4 ^= r2; r2 &= r1; \ r2 |= r0; r3 = ~r3; \ r2 ^= r3; r0 ^= r3; \ r0 &= r1; r3 ^= r4; \ r3 ^= r0; \ \ w = r1; x = r4; y = r2; z = r3; \ } #define SBOX3(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r4 = r0; r0 |= r3; \ r3 ^= r1; r1 &= r4; \ r4 ^= r2; r2 ^= r3; \ r3 &= r0; r4 |= r1; \ r3 ^= r4; r0 ^= r1; \ r4 &= r0; r1 ^= r3; \ r4 ^= r2; r1 |= r0; \ r1 ^= r2; r0 ^= r3; \ r2 = r1; r1 |= r3; \ r1 ^= r0; \ \ w = r1; x = r2; y = r3; z = r4; \ } #define SBOX3_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r4 = r2; r2 ^= r1; \ r0 ^= r2; r4 &= r2; \ r4 ^= r0; r0 &= r1; \ r1 ^= r3; r3 |= r4; \ r2 ^= r3; r0 ^= r3; \ r1 ^= r4; 
r3 &= r2; \ r3 ^= r1; r1 ^= r0; \ r1 |= r2; r0 ^= r3; \ r1 ^= r4; \ r0 ^= r1; \ \ w = r2; x = r1; y = r3; z = r0; \ } #define SBOX4(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r1 ^= r3; r3 = ~r3; \ r2 ^= r3; r3 ^= r0; \ r4 = r1; r1 &= r3; \ r1 ^= r2; r4 ^= r3; \ r0 ^= r4; r2 &= r4; \ r2 ^= r0; r0 &= r1; \ r3 ^= r0; r4 |= r1; \ r4 ^= r0; r0 |= r3; \ r0 ^= r2; r2 &= r3; \ r0 = ~r0; r4 ^= r2; \ \ w = r1; x = r4; y = r0; z = r3; \ } #define SBOX4_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r4 = r2; r2 &= r3; \ r2 ^= r1; r1 |= r3; \ r1 &= r0; r4 ^= r2; \ r4 ^= r1; r1 &= r2; \ r0 = ~r0; r3 ^= r4; \ r1 ^= r3; r3 &= r0; \ r3 ^= r2; r0 ^= r1; \ r2 &= r0; r3 ^= r0; \ r2 ^= r4; \ r2 |= r3; r3 ^= r0; \ r2 ^= r1; \ \ w = r0; x = r3; y = r2; z = r4; \ } #define SBOX5(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r0 ^= r1; r1 ^= r3; \ r3 = ~r3; r4 = r1; \ r1 &= r0; r2 ^= r3; \ r1 ^= r2; r2 |= r4; \ r4 ^= r3; r3 &= r1; \ r3 ^= r0; r4 ^= r1; \ r4 ^= r2; r2 ^= r0; \ r0 &= r3; r2 = ~r2; \ r0 ^= r4; r4 |= r3; \ r2 ^= r4; \ \ w = r1; x = r3; y = r0; z = r2; \ } #define SBOX5_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r1 = ~r1; r4 = r3; \ r2 ^= r1; r3 |= r0; \ r3 ^= r2; r2 |= r1; \ r2 &= r0; r4 ^= r3; \ r2 ^= r4; r4 |= r0; \ r4 ^= r1; r1 &= r2; \ r1 ^= r3; r4 ^= r2; \ r3 &= r4; r4 ^= r1; \ r3 ^= r4; r4 = ~r4; \ r3 ^= r0; \ \ w = r1; x = r4; y = r3; z = r2; \ } #define SBOX6(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r2 = ~r2; r4 = r3; \ r3 &= r0; r0 ^= r4; \ r3 ^= r2; r2 |= r4; \ r1 ^= r3; r2 ^= r0; \ r0 |= r1; r2 ^= r1; \ r4 ^= r0; r0 |= r3; \ r0 ^= r2; r4 ^= r3; \ r4 ^= r0; r3 = ~r3; \ r2 &= r4; \ r2 ^= r3; \ \ w = r0; x = r1; y = r4; z = r2; \ } #define SBOX6_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r0 ^= r2; r4 = r2; \ r2 &= r0; r4 ^= r3; \ r2 = ~r2; r3 ^= r1; \ r2 ^= r3; r4 |= r0; \ r0 ^= r2; r3 ^= r4; \ r4 ^= r1; r1 &= r3; \ r1 ^= r0; r0 ^= r3; \ r0 |= r2; r3 ^= r1; \ r4 ^= r0; \ \ w = r1; x = r2; y = r4; z = r3; \ } #define SBOX7(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r4 = r1; r1 |= r2; \ r1 ^= r3; r4 ^= r2; \ r2 ^= r1; r3 |= r4; \ r3 &= r0; r4 ^= r2; \ r3 ^= r1; r1 |= r4; \ r1 ^= r0; r0 |= r4; \ r0 ^= r2; r1 ^= r4; \ r2 ^= r1; r1 &= r0; \ r1 ^= r4; r2 = ~r2; \ r2 |= r0; \ r4 ^= r2; \ \ w = r4; x = r3; y = r1; z = r0; \ } #define SBOX7_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r4 = r2; r2 ^= r0; \ r0 &= r3; r4 |= r3; \ r2 = ~r2; r3 ^= r1; \ r1 |= r0; r0 ^= r2; \ r2 &= r4; r3 &= r4; \ r1 ^= r2; r2 ^= r0; \ r0 |= r2; r4 ^= r1; \ r0 ^= r3; r3 ^= r4; \ r4 |= r0; r3 ^= r2; \ r4 ^= r2; \ \ w = r3; x = r0; y = r1; z = r4; \ } /* XOR BLOCK1 into BLOCK0. */ #define BLOCK_XOR(block0, block1) \ { \ block0[0] ^= block1[0]; \ block0[1] ^= block1[1]; \ block0[2] ^= block1[2]; \ block0[3] ^= block1[3]; \ } /* Copy BLOCK_SRC to BLOCK_DST. */ #define BLOCK_COPY(block_dst, block_src) \ { \ block_dst[0] = block_src[0]; \ block_dst[1] = block_src[1]; \ block_dst[2] = block_src[2]; \ block_dst[3] = block_src[3]; \ } /* Apply SBOX number WHICH to to the block found in ARRAY0, writing the output to the block found in ARRAY1. */ #define SBOX(which, array0, array1) \ SBOX##which (array0[0], array0[1], array0[2], array0[3], \ array1[0], array1[1], array1[2], array1[3]); /* Apply inverse SBOX number WHICH to to the block found in ARRAY0, writing the output to the block found in ARRAY1. 
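   For example, with a serpent_block_t b and a scratch block b_next,

     SBOX_INVERSE (0, b, b_next);

   expands to

     SBOX0_INVERSE (b[0], b[1], b[2], b[3],
                    b_next[0], b_next[1], b_next[2], b_next[3]);

   The S-boxes work in bitsliced form: bit position i of the four 32-bit words
   together holds one 4-bit S-box input, so all 32 positions are transformed in
   parallel using only XOR/AND/OR/NOT word operations.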
*/ #define SBOX_INVERSE(which, array0, array1) \ SBOX##which##_INVERSE (array0[0], array0[1], array0[2], array0[3], \ array1[0], array1[1], array1[2], array1[3]); /* Apply the linear transformation to BLOCK. */ #define LINEAR_TRANSFORMATION(block) \ { \ block[0] = rol (block[0], 13); \ block[2] = rol (block[2], 3); \ block[1] = block[1] ^ block[0] ^ block[2]; \ block[3] = block[3] ^ block[2] ^ (block[0] << 3); \ block[1] = rol (block[1], 1); \ block[3] = rol (block[3], 7); \ block[0] = block[0] ^ block[1] ^ block[3]; \ block[2] = block[2] ^ block[3] ^ (block[1] << 7); \ block[0] = rol (block[0], 5); \ block[2] = rol (block[2], 22); \ } /* Apply the inverse linear transformation to BLOCK. */ #define LINEAR_TRANSFORMATION_INVERSE(block) \ { \ block[2] = ror (block[2], 22); \ block[0] = ror (block[0] , 5); \ block[2] = block[2] ^ block[3] ^ (block[1] << 7); \ block[0] = block[0] ^ block[1] ^ block[3]; \ block[3] = ror (block[3], 7); \ block[1] = ror (block[1], 1); \ block[3] = block[3] ^ block[2] ^ (block[0] << 3); \ block[1] = block[1] ^ block[0] ^ block[2]; \ block[2] = ror (block[2], 3); \ block[0] = ror (block[0], 13); \ } /* Apply a Serpent round to BLOCK, using the SBOX number WHICH and the subkeys contained in SUBKEYS. Use BLOCK_TMP as temporary storage. This macro increments `round'. */ #define ROUND(which, subkeys, block, block_tmp) \ { \ BLOCK_XOR (block, subkeys[round]); \ round++; \ SBOX (which, block, block_tmp); \ LINEAR_TRANSFORMATION (block_tmp); \ BLOCK_COPY (block, block_tmp); \ } /* Apply the last Serpent round to BLOCK, using the SBOX number WHICH and the subkeys contained in SUBKEYS. Use BLOCK_TMP as temporary storage. The result will be stored in BLOCK_TMP. This macro increments `round'. */ #define ROUND_LAST(which, subkeys, block, block_tmp) \ { \ BLOCK_XOR (block, subkeys[round]); \ round++; \ SBOX (which, block, block_tmp); \ BLOCK_XOR (block_tmp, subkeys[round]); \ round++; \ } /* Apply an inverse Serpent round to BLOCK, using the SBOX number WHICH and the subkeys contained in SUBKEYS. Use BLOCK_TMP as temporary storage. This macro increments `round'. */ #define ROUND_INVERSE(which, subkey, block, block_tmp) \ { \ LINEAR_TRANSFORMATION_INVERSE (block); \ SBOX_INVERSE (which, block, block_tmp); \ BLOCK_XOR (block_tmp, subkey[round]); \ round--; \ BLOCK_COPY (block, block_tmp); \ } /* Apply the first Serpent round to BLOCK, using the SBOX number WHICH and the subkeys contained in SUBKEYS. Use BLOCK_TMP as temporary storage. The result will be stored in BLOCK_TMP. This macro increments `round'. */ #define ROUND_FIRST_INVERSE(which, subkeys, block, block_tmp) \ { \ BLOCK_XOR (block, subkeys[round]); \ round--; \ SBOX_INVERSE (which, block, block_tmp); \ BLOCK_XOR (block_tmp, subkeys[round]); \ round--; \ } /* Convert the user provided key KEY of KEY_LENGTH bytes into the internally used format. */ static void serpent_key_prepare (const byte *key, unsigned int key_length, serpent_key_t key_prepared) { int i; /* Copy key. */ key_length /= 4; for (i = 0; i < key_length; i++) key_prepared[i] = buf_get_le32 (key + i * 4); if (i < 8) { /* Key must be padded according to the Serpent specification. */ key_prepared[i] = 0x00000001; for (i++; i < 8; i++) key_prepared[i] = 0; } } /* Derive the 33 subkeys from KEY and store them in SUBKEYS. */ static void serpent_subkeys_generate (serpent_key_t key, serpent_subkeys_t subkeys) { u32 w[8]; /* The `prekey'. */ u32 ws[4]; u32 wt[4]; /* Initialize with key values. 
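   w[0..7] receive the padded user key produced by serpent_key_prepare above;
   e.g. a 128-bit all-zero key gives

     w[0..3] = 0x00000000, w[4] = 0x00000001, w[5..7] = 0x00000000

   per the Serpent padding rule (a single 1 bit followed by zeros).  The prekey
   is then expanded by the standard affine recurrence

     w[i] = rol (w[i-8] ^ w[i-5] ^ w[i-3] ^ w[i-1] ^ PHI ^ i, 11)

   (with the user key occupying the eight preceding positions), which is what
   EXPAND_KEY4 below implements on the cyclic 8-word buffer.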
*/ w[0] = key[0]; w[1] = key[1]; w[2] = key[2]; w[3] = key[3]; w[4] = key[4]; w[5] = key[5]; w[6] = key[6]; w[7] = key[7]; /* Expand to intermediate key using the affine recurrence. */ #define EXPAND_KEY4(wo, r) \ wo[0] = w[(r+0)%8] = \ rol (w[(r+0)%8] ^ w[(r+3)%8] ^ w[(r+5)%8] ^ w[(r+7)%8] ^ PHI ^ (r+0), 11); \ wo[1] = w[(r+1)%8] = \ rol (w[(r+1)%8] ^ w[(r+4)%8] ^ w[(r+6)%8] ^ w[(r+0)%8] ^ PHI ^ (r+1), 11); \ wo[2] = w[(r+2)%8] = \ rol (w[(r+2)%8] ^ w[(r+5)%8] ^ w[(r+7)%8] ^ w[(r+1)%8] ^ PHI ^ (r+2), 11); \ wo[3] = w[(r+3)%8] = \ rol (w[(r+3)%8] ^ w[(r+6)%8] ^ w[(r+0)%8] ^ w[(r+2)%8] ^ PHI ^ (r+3), 11); #define EXPAND_KEY(r) \ EXPAND_KEY4(ws, (r)); \ EXPAND_KEY4(wt, (r + 4)); /* Calculate subkeys via S-Boxes, in bitslice mode. */ EXPAND_KEY (0); SBOX (3, ws, subkeys[0]); SBOX (2, wt, subkeys[1]); EXPAND_KEY (8); SBOX (1, ws, subkeys[2]); SBOX (0, wt, subkeys[3]); EXPAND_KEY (16); SBOX (7, ws, subkeys[4]); SBOX (6, wt, subkeys[5]); EXPAND_KEY (24); SBOX (5, ws, subkeys[6]); SBOX (4, wt, subkeys[7]); EXPAND_KEY (32); SBOX (3, ws, subkeys[8]); SBOX (2, wt, subkeys[9]); EXPAND_KEY (40); SBOX (1, ws, subkeys[10]); SBOX (0, wt, subkeys[11]); EXPAND_KEY (48); SBOX (7, ws, subkeys[12]); SBOX (6, wt, subkeys[13]); EXPAND_KEY (56); SBOX (5, ws, subkeys[14]); SBOX (4, wt, subkeys[15]); EXPAND_KEY (64); SBOX (3, ws, subkeys[16]); SBOX (2, wt, subkeys[17]); EXPAND_KEY (72); SBOX (1, ws, subkeys[18]); SBOX (0, wt, subkeys[19]); EXPAND_KEY (80); SBOX (7, ws, subkeys[20]); SBOX (6, wt, subkeys[21]); EXPAND_KEY (88); SBOX (5, ws, subkeys[22]); SBOX (4, wt, subkeys[23]); EXPAND_KEY (96); SBOX (3, ws, subkeys[24]); SBOX (2, wt, subkeys[25]); EXPAND_KEY (104); SBOX (1, ws, subkeys[26]); SBOX (0, wt, subkeys[27]); EXPAND_KEY (112); SBOX (7, ws, subkeys[28]); SBOX (6, wt, subkeys[29]); EXPAND_KEY (120); SBOX (5, ws, subkeys[30]); SBOX (4, wt, subkeys[31]); EXPAND_KEY4 (ws, 128); SBOX (3, ws, subkeys[32]); wipememory (ws, sizeof (ws)); wipememory (wt, sizeof (wt)); wipememory (w, sizeof (w)); } /* Initialize CONTEXT with the key KEY of KEY_LENGTH bits. */ static gcry_err_code_t serpent_setkey_internal (serpent_context_t *context, const byte *key, unsigned int key_length) { serpent_key_t key_prepared; if (key_length > 32) return GPG_ERR_INV_KEYLEN; serpent_key_prepare (key, key_length, key_prepared); serpent_subkeys_generate (key_prepared, context->keys); #ifdef USE_AVX2 context->use_avx2 = 0; if ((_gcry_get_hw_features () & HWF_INTEL_AVX2)) { context->use_avx2 = 1; } #endif #ifdef USE_NEON context->use_neon = 0; if ((_gcry_get_hw_features () & HWF_ARM_NEON)) { context->use_neon = 1; } #endif wipememory (key_prepared, sizeof(key_prepared)); return 0; } /* Initialize CTX with the key KEY of KEY_LENGTH bytes. */ static gcry_err_code_t serpent_setkey (void *ctx, const byte *key, unsigned int key_length, cipher_bulk_ops_t *bulk_ops) { serpent_context_t *context = ctx; static const char *serpent_test_ret; static int serpent_init_done; gcry_err_code_t ret = GPG_ERR_NO_ERROR; if (! serpent_init_done) { /* Execute a self-test the first time, Serpent is used. */ serpent_init_done = 1; serpent_test_ret = serpent_test (); if (serpent_test_ret) log_error ("Serpent test failure: %s\n", serpent_test_ret); } /* Setup bulk encryption routines. 
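   The generic code in cipher.c invokes these handlers for multi-block requests
   in the corresponding modes; single-block calls still go through
   serpent_encrypt/serpent_decrypt from the cipher specs at the end of this
   file.  As a worked example of the chunking done inside the handlers: a
   21-block CTR request is processed as one 16-block AVX2 chunk plus 5 generic
   single-block iterations, or as two 8-block SSE2/NEON chunks plus 5 generic
   iterations when AVX2 is not available.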
*/ memset (bulk_ops, 0, sizeof(*bulk_ops)); bulk_ops->cbc_dec = _gcry_serpent_cbc_dec; bulk_ops->cfb_dec = _gcry_serpent_cfb_dec; bulk_ops->ctr_enc = _gcry_serpent_ctr_enc; bulk_ops->ocb_crypt = _gcry_serpent_ocb_crypt; - bulk_ops->ocb_auth = _gcry_serpent_ocb_auth; + bulk_ops->ocb_auth = _gcry_serpent_ocb_auth; + bulk_ops->xts_crypt = _gcry_serpent_xts_crypt; + bulk_ops->ecb_crypt = _gcry_serpent_ecb_crypt; if (serpent_test_ret) ret = GPG_ERR_SELFTEST_FAILED; else ret = serpent_setkey_internal (context, key, key_length); return ret; } static void serpent_encrypt_internal (serpent_context_t *context, const byte *input, byte *output) { serpent_block_t b, b_next; int round = 0; b[0] = buf_get_le32 (input + 0); b[1] = buf_get_le32 (input + 4); b[2] = buf_get_le32 (input + 8); b[3] = buf_get_le32 (input + 12); ROUND (0, context->keys, b, b_next); ROUND (1, context->keys, b, b_next); ROUND (2, context->keys, b, b_next); ROUND (3, context->keys, b, b_next); ROUND (4, context->keys, b, b_next); ROUND (5, context->keys, b, b_next); ROUND (6, context->keys, b, b_next); ROUND (7, context->keys, b, b_next); ROUND (0, context->keys, b, b_next); ROUND (1, context->keys, b, b_next); ROUND (2, context->keys, b, b_next); ROUND (3, context->keys, b, b_next); ROUND (4, context->keys, b, b_next); ROUND (5, context->keys, b, b_next); ROUND (6, context->keys, b, b_next); ROUND (7, context->keys, b, b_next); ROUND (0, context->keys, b, b_next); ROUND (1, context->keys, b, b_next); ROUND (2, context->keys, b, b_next); ROUND (3, context->keys, b, b_next); ROUND (4, context->keys, b, b_next); ROUND (5, context->keys, b, b_next); ROUND (6, context->keys, b, b_next); ROUND (7, context->keys, b, b_next); ROUND (0, context->keys, b, b_next); ROUND (1, context->keys, b, b_next); ROUND (2, context->keys, b, b_next); ROUND (3, context->keys, b, b_next); ROUND (4, context->keys, b, b_next); ROUND (5, context->keys, b, b_next); ROUND (6, context->keys, b, b_next); ROUND_LAST (7, context->keys, b, b_next); buf_put_le32 (output + 0, b_next[0]); buf_put_le32 (output + 4, b_next[1]); buf_put_le32 (output + 8, b_next[2]); buf_put_le32 (output + 12, b_next[3]); } static void serpent_decrypt_internal (serpent_context_t *context, const byte *input, byte *output) { serpent_block_t b, b_next; int round = ROUNDS; b_next[0] = buf_get_le32 (input + 0); b_next[1] = buf_get_le32 (input + 4); b_next[2] = buf_get_le32 (input + 8); b_next[3] = buf_get_le32 (input + 12); ROUND_FIRST_INVERSE (7, context->keys, b_next, b); ROUND_INVERSE (6, context->keys, b, b_next); ROUND_INVERSE (5, context->keys, b, b_next); ROUND_INVERSE (4, context->keys, b, b_next); ROUND_INVERSE (3, context->keys, b, b_next); ROUND_INVERSE (2, context->keys, b, b_next); ROUND_INVERSE (1, context->keys, b, b_next); ROUND_INVERSE (0, context->keys, b, b_next); ROUND_INVERSE (7, context->keys, b, b_next); ROUND_INVERSE (6, context->keys, b, b_next); ROUND_INVERSE (5, context->keys, b, b_next); ROUND_INVERSE (4, context->keys, b, b_next); ROUND_INVERSE (3, context->keys, b, b_next); ROUND_INVERSE (2, context->keys, b, b_next); ROUND_INVERSE (1, context->keys, b, b_next); ROUND_INVERSE (0, context->keys, b, b_next); ROUND_INVERSE (7, context->keys, b, b_next); ROUND_INVERSE (6, context->keys, b, b_next); ROUND_INVERSE (5, context->keys, b, b_next); ROUND_INVERSE (4, context->keys, b, b_next); ROUND_INVERSE (3, context->keys, b, b_next); ROUND_INVERSE (2, context->keys, b, b_next); ROUND_INVERSE (1, context->keys, b, b_next); ROUND_INVERSE (0, context->keys, b, b_next); 
ROUND_INVERSE (7, context->keys, b, b_next); ROUND_INVERSE (6, context->keys, b, b_next); ROUND_INVERSE (5, context->keys, b, b_next); ROUND_INVERSE (4, context->keys, b, b_next); ROUND_INVERSE (3, context->keys, b, b_next); ROUND_INVERSE (2, context->keys, b, b_next); ROUND_INVERSE (1, context->keys, b, b_next); ROUND_INVERSE (0, context->keys, b, b_next); buf_put_le32 (output + 0, b_next[0]); buf_put_le32 (output + 4, b_next[1]); buf_put_le32 (output + 8, b_next[2]); buf_put_le32 (output + 12, b_next[3]); } static unsigned int serpent_encrypt (void *ctx, byte *buffer_out, const byte *buffer_in) { serpent_context_t *context = ctx; serpent_encrypt_internal (context, buffer_in, buffer_out); return /*burn_stack*/ (2 * sizeof (serpent_block_t)); } static unsigned int serpent_decrypt (void *ctx, byte *buffer_out, const byte *buffer_in) { serpent_context_t *context = ctx; serpent_decrypt_internal (context, buffer_in, buffer_out); return /*burn_stack*/ (2 * sizeof (serpent_block_t)); } /* Bulk encryption of complete blocks in CTR mode. This function is only intended for the bulk encryption feature of cipher.c. CTR is expected to be of size sizeof(serpent_block_t). */ static void _gcry_serpent_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { serpent_context_t *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; unsigned char tmpbuf[sizeof(serpent_block_t)]; int burn_stack_depth = 2 * sizeof (serpent_block_t); #ifdef USE_AVX2 if (ctx->use_avx2) { int did_use_avx2 = 0; /* Process data in 16 block chunks. */ while (nblocks >= 16) { _gcry_serpent_avx2_ctr_enc(ctx, outbuf, inbuf, ctr); nblocks -= 16; outbuf += 16 * sizeof(serpent_block_t); inbuf += 16 * sizeof(serpent_block_t); did_use_avx2 = 1; } if (did_use_avx2) { /* serpent-avx2 assembly code does not use stack */ if (nblocks == 0) burn_stack_depth = 0; } /* Use generic/sse2 code to handle smaller chunks... */ /* TODO: use caching instead? */ } #endif #ifdef USE_SSE2 { int did_use_sse2 = 0; /* Process data in 8 block chunks. */ while (nblocks >= 8) { _gcry_serpent_sse2_ctr_enc(ctx, outbuf, inbuf, ctr); nblocks -= 8; outbuf += 8 * sizeof(serpent_block_t); inbuf += 8 * sizeof(serpent_block_t); did_use_sse2 = 1; } if (did_use_sse2) { /* serpent-sse2 assembly code does not use stack */ if (nblocks == 0) burn_stack_depth = 0; } /* Use generic code to handle smaller chunks... */ /* TODO: use caching instead? */ } #endif #ifdef USE_NEON if (ctx->use_neon) { int did_use_neon = 0; /* Process data in 8 block chunks. */ while (nblocks >= 8) { _gcry_serpent_neon_ctr_enc(ctx, outbuf, inbuf, ctr); nblocks -= 8; outbuf += 8 * sizeof(serpent_block_t); inbuf += 8 * sizeof(serpent_block_t); did_use_neon = 1; } if (did_use_neon) { /* serpent-neon assembly code does not use stack */ if (nblocks == 0) burn_stack_depth = 0; } /* Use generic code to handle smaller chunks... */ /* TODO: use caching instead? */ } #endif for ( ;nblocks; nblocks-- ) { /* Encrypt the counter. */ serpent_encrypt_internal(ctx, ctr, tmpbuf); /* XOR the input with the encrypted counter and store in output. */ cipher_block_xor(outbuf, tmpbuf, inbuf, sizeof(serpent_block_t)); outbuf += sizeof(serpent_block_t); inbuf += sizeof(serpent_block_t); /* Increment the counter. */ cipher_block_add(ctr, 1, sizeof(serpent_block_t)); } wipememory(tmpbuf, sizeof(tmpbuf)); _gcry_burn_stack(burn_stack_depth); } /* Bulk decryption of complete blocks in CBC mode. 
This function is only intended for the bulk encryption feature of cipher.c. */ static void _gcry_serpent_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { serpent_context_t *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; unsigned char savebuf[sizeof(serpent_block_t)]; int burn_stack_depth = 2 * sizeof (serpent_block_t); #ifdef USE_AVX2 if (ctx->use_avx2) { int did_use_avx2 = 0; /* Process data in 16 block chunks. */ while (nblocks >= 16) { _gcry_serpent_avx2_cbc_dec(ctx, outbuf, inbuf, iv); nblocks -= 16; outbuf += 16 * sizeof(serpent_block_t); inbuf += 16 * sizeof(serpent_block_t); did_use_avx2 = 1; } if (did_use_avx2) { /* serpent-avx2 assembly code does not use stack */ if (nblocks == 0) burn_stack_depth = 0; } /* Use generic/sse2 code to handle smaller chunks... */ } #endif #ifdef USE_SSE2 { int did_use_sse2 = 0; /* Process data in 8 block chunks. */ while (nblocks >= 8) { _gcry_serpent_sse2_cbc_dec(ctx, outbuf, inbuf, iv); nblocks -= 8; outbuf += 8 * sizeof(serpent_block_t); inbuf += 8 * sizeof(serpent_block_t); did_use_sse2 = 1; } if (did_use_sse2) { /* serpent-sse2 assembly code does not use stack */ if (nblocks == 0) burn_stack_depth = 0; } /* Use generic code to handle smaller chunks... */ } #endif #ifdef USE_NEON if (ctx->use_neon) { int did_use_neon = 0; /* Process data in 8 block chunks. */ while (nblocks >= 8) { _gcry_serpent_neon_cbc_dec(ctx, outbuf, inbuf, iv); nblocks -= 8; outbuf += 8 * sizeof(serpent_block_t); inbuf += 8 * sizeof(serpent_block_t); did_use_neon = 1; } if (did_use_neon) { /* serpent-neon assembly code does not use stack */ if (nblocks == 0) burn_stack_depth = 0; } /* Use generic code to handle smaller chunks... */ } #endif for ( ;nblocks; nblocks-- ) { /* INBUF is needed later and it may be identical to OUTBUF, so store the intermediate result to SAVEBUF. */ serpent_decrypt_internal (ctx, inbuf, savebuf); cipher_block_xor_n_copy_2(outbuf, savebuf, iv, inbuf, sizeof(serpent_block_t)); inbuf += sizeof(serpent_block_t); outbuf += sizeof(serpent_block_t); } wipememory(savebuf, sizeof(savebuf)); _gcry_burn_stack(burn_stack_depth); } /* Bulk decryption of complete blocks in CFB mode. This function is only intended for the bulk encryption feature of cipher.c. */ static void _gcry_serpent_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { serpent_context_t *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; int burn_stack_depth = 2 * sizeof (serpent_block_t); #ifdef USE_AVX2 if (ctx->use_avx2) { int did_use_avx2 = 0; /* Process data in 16 block chunks. */ while (nblocks >= 16) { _gcry_serpent_avx2_cfb_dec(ctx, outbuf, inbuf, iv); nblocks -= 16; outbuf += 16 * sizeof(serpent_block_t); inbuf += 16 * sizeof(serpent_block_t); did_use_avx2 = 1; } if (did_use_avx2) { /* serpent-avx2 assembly code does not use stack */ if (nblocks == 0) burn_stack_depth = 0; } /* Use generic/sse2 code to handle smaller chunks... */ } #endif #ifdef USE_SSE2 { int did_use_sse2 = 0; /* Process data in 8 block chunks. */ while (nblocks >= 8) { _gcry_serpent_sse2_cfb_dec(ctx, outbuf, inbuf, iv); nblocks -= 8; outbuf += 8 * sizeof(serpent_block_t); inbuf += 8 * sizeof(serpent_block_t); did_use_sse2 = 1; } if (did_use_sse2) { /* serpent-sse2 assembly code does not use stack */ if (nblocks == 0) burn_stack_depth = 0; } /* Use generic code to handle smaller chunks... 
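   For each leftover block, the per-block fallback at the end of this function
   computes

     out = E_k (iv) ^ in;   iv = in;

   (via serpent_encrypt_internal and cipher_block_xor_n_copy), which is exactly
   CFB decryption: the previous ciphertext block is re-encrypted and XORed with
   the current ciphertext block.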
*/ } #endif #ifdef USE_NEON if (ctx->use_neon) { int did_use_neon = 0; /* Process data in 8 block chunks. */ while (nblocks >= 8) { _gcry_serpent_neon_cfb_dec(ctx, outbuf, inbuf, iv); nblocks -= 8; outbuf += 8 * sizeof(serpent_block_t); inbuf += 8 * sizeof(serpent_block_t); did_use_neon = 1; } if (did_use_neon) { /* serpent-neon assembly code does not use stack */ if (nblocks == 0) burn_stack_depth = 0; } /* Use generic code to handle smaller chunks... */ } #endif for ( ;nblocks; nblocks-- ) { serpent_encrypt_internal(ctx, iv, iv); cipher_block_xor_n_copy(outbuf, iv, inbuf, sizeof(serpent_block_t)); outbuf += sizeof(serpent_block_t); inbuf += sizeof(serpent_block_t); } _gcry_burn_stack(burn_stack_depth); } /* Bulk encryption/decryption of complete blocks in OCB mode. */ static size_t _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt) { #if defined(USE_AVX2) || defined(USE_SSE2) || defined(USE_NEON) serpent_context_t *ctx = (void *)&c->context.c; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; int burn_stack_depth = 2 * sizeof (serpent_block_t); u64 blkn = c->u_mode.ocb.data_nblocks; #else (void)c; (void)outbuf_arg; (void)inbuf_arg; (void)encrypt; #endif #ifdef USE_AVX2 if (ctx->use_avx2) { int did_use_avx2 = 0; u64 Ls[16]; u64 *l; if (nblocks >= 16) { l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn); /* Process data in 16 block chunks. */ while (nblocks >= 16) { blkn += 16; *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16); if (encrypt) _gcry_serpent_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, Ls); else _gcry_serpent_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, Ls); nblocks -= 16; outbuf += 16 * sizeof(serpent_block_t); inbuf += 16 * sizeof(serpent_block_t); did_use_avx2 = 1; } } if (did_use_avx2) { /* serpent-avx2 assembly code does not use stack */ if (nblocks == 0) burn_stack_depth = 0; } /* Use generic code to handle smaller chunks... */ } #endif #ifdef USE_SSE2 { int did_use_sse2 = 0; u64 Ls[8]; u64 *l; if (nblocks >= 8) { l = bulk_ocb_prepare_L_pointers_array_blk8 (c, Ls, blkn); /* Process data in 8 block chunks. */ while (nblocks >= 8) { blkn += 8; *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 8); if (encrypt) _gcry_serpent_sse2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, Ls); else _gcry_serpent_sse2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, Ls); nblocks -= 8; outbuf += 8 * sizeof(serpent_block_t); inbuf += 8 * sizeof(serpent_block_t); did_use_sse2 = 1; } } if (did_use_sse2) { /* serpent-sse2 assembly code does not use stack */ if (nblocks == 0) burn_stack_depth = 0; } /* Use generic code to handle smaller chunks... */ } #endif #ifdef USE_NEON if (ctx->use_neon) { int did_use_neon = 0; uintptr_t Ls[8]; uintptr_t *l; if (nblocks >= 8) { l = bulk_ocb_prepare_L_pointers_array_blk8 (c, Ls, blkn); /* Process data in 8 block chunks. */ while (nblocks >= 8) { blkn += 8; *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 8); if (encrypt) _gcry_serpent_neon_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, (const void **)Ls); else _gcry_serpent_neon_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, (const void **)Ls); nblocks -= 8; outbuf += 8 * sizeof(serpent_block_t); inbuf += 8 * sizeof(serpent_block_t); did_use_neon = 1; } } if (did_use_neon) { /* serpent-neon assembly code does not use stack */ if (nblocks == 0) burn_stack_depth = 0; } /* Use generic code to handle smaller chunks... 
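   Unlike the CTR/CBC/CFB helpers above, no per-block fallback is done here;
   the number of unprocessed blocks is simply returned below, and the caller in
   cipher.c finishes those with its generic OCB code.  Roughly, from the
   caller's side:

     size_t nleft = bulk_ops->ocb_crypt (c, outbuf, inbuf, nblocks, encrypt);

   after which the nleft trailing blocks go through the generic per-block OCB
   path.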
*/ } #endif #if defined(USE_AVX2) || defined(USE_SSE2) || defined(USE_NEON) c->u_mode.ocb.data_nblocks = blkn; if (burn_stack_depth) _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *)); #endif return nblocks; } /* Bulk authentication of complete blocks in OCB mode. */ static size_t _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks) { #if defined(USE_AVX2) || defined(USE_SSE2) || defined(USE_NEON) serpent_context_t *ctx = (void *)&c->context.c; const unsigned char *abuf = abuf_arg; int burn_stack_depth = 2 * sizeof(serpent_block_t); u64 blkn = c->u_mode.ocb.aad_nblocks; #else (void)c; (void)abuf_arg; #endif #ifdef USE_AVX2 if (ctx->use_avx2) { int did_use_avx2 = 0; u64 Ls[16]; u64 *l; if (nblocks >= 16) { l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn); /* Process data in 16 block chunks. */ while (nblocks >= 16) { blkn += 16; *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16); _gcry_serpent_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset, c->u_mode.ocb.aad_sum, Ls); nblocks -= 16; abuf += 16 * sizeof(serpent_block_t); did_use_avx2 = 1; } } if (did_use_avx2) { /* serpent-avx2 assembly code does not use stack */ if (nblocks == 0) burn_stack_depth = 0; } /* Use generic code to handle smaller chunks... */ } #endif #ifdef USE_SSE2 { int did_use_sse2 = 0; u64 Ls[8]; u64 *l; if (nblocks >= 8) { l = bulk_ocb_prepare_L_pointers_array_blk8 (c, Ls, blkn); /* Process data in 8 block chunks. */ while (nblocks >= 8) { blkn += 8; *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 8); _gcry_serpent_sse2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset, c->u_mode.ocb.aad_sum, Ls); nblocks -= 8; abuf += 8 * sizeof(serpent_block_t); did_use_sse2 = 1; } } if (did_use_sse2) { /* serpent-avx2 assembly code does not use stack */ if (nblocks == 0) burn_stack_depth = 0; } /* Use generic code to handle smaller chunks... */ } #endif #ifdef USE_NEON if (ctx->use_neon) { int did_use_neon = 0; uintptr_t Ls[8]; uintptr_t *l; if (nblocks >= 8) { l = bulk_ocb_prepare_L_pointers_array_blk8 (c, Ls, blkn); /* Process data in 8 block chunks. */ while (nblocks >= 8) { blkn += 8; *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 8); _gcry_serpent_neon_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset, c->u_mode.ocb.aad_sum, (const void **)Ls); nblocks -= 8; abuf += 8 * sizeof(serpent_block_t); did_use_neon = 1; } } if (did_use_neon) { /* serpent-neon assembly code does not use stack */ if (nblocks == 0) burn_stack_depth = 0; } /* Use generic code to handle smaller chunks... 
*/ } #endif #if defined(USE_AVX2) || defined(USE_SSE2) || defined(USE_NEON) c->u_mode.ocb.aad_nblocks = blkn; if (burn_stack_depth) _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *)); #endif return nblocks; } + +static unsigned int +serpent_crypt_blk1_16(const void *context, byte *out, const byte *in, + unsigned int num_blks, int encrypt) +{ + const serpent_context_t *ctx = context; + unsigned int burn, burn_stack_depth = 0; + +#ifdef USE_AVX2 + if (num_blks == 16 && ctx->use_avx2) + { + _gcry_serpent_avx2_blk16 (ctx, out, in, encrypt); + return 0; + } +#endif + +#ifdef USE_SSE2 + while (num_blks >= 8) + { + _gcry_serpent_sse2_blk8 (ctx, out, in, encrypt); + out += 8 * sizeof(serpent_block_t); + in += 8 * sizeof(serpent_block_t); + num_blks -= 8; + } +#endif + +#ifdef USE_NEON + if (ctx->use_neon) + { + while (num_blks >= 8) + { + _gcry_serpent_neon_blk8 (ctx, out, in, encrypt); + out += 8 * sizeof(serpent_block_t); + in += 8 * sizeof(serpent_block_t); + num_blks -= 8; + } + } +#endif + + while (num_blks >= 1) + { + if (encrypt) + serpent_encrypt_internal((void *)ctx, in, out); + else + serpent_decrypt_internal((void *)ctx, in, out); + + burn = 2 * sizeof(serpent_block_t); + burn_stack_depth = (burn > burn_stack_depth) ? burn : burn_stack_depth; + out += sizeof(serpent_block_t); + in += sizeof(serpent_block_t); + num_blks--; + } + + return burn_stack_depth; +} + +static unsigned int +serpent_encrypt_blk1_16(const void *ctx, byte *out, const byte *in, + unsigned int num_blks) +{ + return serpent_crypt_blk1_16 (ctx, out, in, num_blks, 1); +} + +static unsigned int +serpent_decrypt_blk1_16(const void *ctx, byte *out, const byte *in, + unsigned int num_blks) +{ + return serpent_crypt_blk1_16 (ctx, out, in, num_blks, 0); +} + + +/* Bulk encryption/decryption of complete blocks in XTS mode. */ +static void +_gcry_serpent_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, int encrypt) +{ + serpent_context_t *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + int burn_stack_depth = 0; + + /* Process remaining blocks. */ + if (nblocks) + { + unsigned char tmpbuf[16 * 16]; + unsigned int tmp_used = 16; + size_t nburn; + + nburn = bulk_xts_crypt_128(ctx, encrypt ? serpent_encrypt_blk1_16 + : serpent_decrypt_blk1_16, + outbuf, inbuf, nblocks, + tweak, tmpbuf, sizeof(tmpbuf) / 16, + &tmp_used); + burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; + + wipememory(tmpbuf, tmp_used); + } + + if (burn_stack_depth) + _gcry_burn_stack(burn_stack_depth); +} + + +/* Bulk encryption/decryption in ECB mode. */ +static void +_gcry_serpent_ecb_crypt (void *context, void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int encrypt) +{ + serpent_context_t *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + int burn_stack_depth = 0; + + /* Process remaining blocks. */ + if (nblocks) + { + size_t nburn; + + nburn = bulk_ecb_crypt_128(ctx, encrypt ? serpent_encrypt_blk1_16 + : serpent_decrypt_blk1_16, + outbuf, inbuf, nblocks, 16); + burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; + } + + if (burn_stack_depth) + _gcry_burn_stack(burn_stack_depth); +} + /* Serpent test. 
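   The self-test below runs known-answer checks for 128-, 192- and 256-bit keys
   through the generic C path.  A minimal standalone version of one such check,
   using the same internal functions and a key/plaintext/ciphertext triple like
   the test_data entries below, would be:

     serpent_context_t ctx;
     unsigned char scratch[16];

     serpent_setkey_internal (&ctx, key, 16);
     serpent_encrypt_internal (&ctx, plaintext, scratch);
     if (memcmp (scratch, ciphertext, sizeof (serpent_block_t)))
       ... report failure ...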
*/ static const char * serpent_test (void) { serpent_context_t context; unsigned char scratch[16]; unsigned int i; static struct test { int key_length; unsigned char key[32]; unsigned char text_plain[16]; unsigned char text_cipher[16]; } test_data[] = { { 16, "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", "\xD2\x9D\x57\x6F\xCE\xA3\xA3\xA7\xED\x90\x99\xF2\x92\x73\xD7\x8E", "\xB2\x28\x8B\x96\x8A\xE8\xB0\x86\x48\xD1\xCE\x96\x06\xFD\x99\x2D" }, { 24, "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" "\x00\x00\x00\x00\x00\x00\x00\x00", "\xD2\x9D\x57\x6F\xCE\xAB\xA3\xA7\xED\x98\x99\xF2\x92\x7B\xD7\x8E", "\x13\x0E\x35\x3E\x10\x37\xC2\x24\x05\xE8\xFA\xEF\xB2\xC3\xC3\xE9" }, { 32, "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", "\xD0\x95\x57\x6F\xCE\xA3\xE3\xA7\xED\x98\xD9\xF2\x90\x73\xD7\x8E", "\xB9\x0E\xE5\x86\x2D\xE6\x91\x68\xF2\xBD\xD5\x12\x5B\x45\x47\x2B" }, { 32, "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", "\x00\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x03\x00\x00\x00", "\x20\x61\xA4\x27\x82\xBD\x52\xEC\x69\x1E\xC3\x83\xB0\x3B\xA7\x7C" }, { 0 }, }; for (i = 0; test_data[i].key_length; i++) { serpent_setkey_internal (&context, test_data[i].key, test_data[i].key_length); serpent_encrypt_internal (&context, test_data[i].text_plain, scratch); if (memcmp (scratch, test_data[i].text_cipher, sizeof (serpent_block_t))) switch (test_data[i].key_length) { case 16: return "Serpent-128 test encryption failed."; case 24: return "Serpent-192 test encryption failed."; case 32: return "Serpent-256 test encryption failed."; } serpent_decrypt_internal (&context, test_data[i].text_cipher, scratch); if (memcmp (scratch, test_data[i].text_plain, sizeof (serpent_block_t))) switch (test_data[i].key_length) { case 16: return "Serpent-128 test decryption failed."; case 24: return "Serpent-192 test decryption failed."; case 32: return "Serpent-256 test decryption failed."; } } return NULL; } static const gcry_cipher_oid_spec_t serpent128_oids[] = { {"1.3.6.1.4.1.11591.13.2.1", GCRY_CIPHER_MODE_ECB }, {"1.3.6.1.4.1.11591.13.2.2", GCRY_CIPHER_MODE_CBC }, {"1.3.6.1.4.1.11591.13.2.3", GCRY_CIPHER_MODE_OFB }, {"1.3.6.1.4.1.11591.13.2.4", GCRY_CIPHER_MODE_CFB }, { NULL } }; static const gcry_cipher_oid_spec_t serpent192_oids[] = { {"1.3.6.1.4.1.11591.13.2.21", GCRY_CIPHER_MODE_ECB }, {"1.3.6.1.4.1.11591.13.2.22", GCRY_CIPHER_MODE_CBC }, {"1.3.6.1.4.1.11591.13.2.23", GCRY_CIPHER_MODE_OFB }, {"1.3.6.1.4.1.11591.13.2.24", GCRY_CIPHER_MODE_CFB }, { NULL } }; static const gcry_cipher_oid_spec_t serpent256_oids[] = { {"1.3.6.1.4.1.11591.13.2.41", GCRY_CIPHER_MODE_ECB }, {"1.3.6.1.4.1.11591.13.2.42", GCRY_CIPHER_MODE_CBC }, {"1.3.6.1.4.1.11591.13.2.43", GCRY_CIPHER_MODE_OFB }, {"1.3.6.1.4.1.11591.13.2.44", GCRY_CIPHER_MODE_CFB }, { NULL } }; static const char *serpent128_aliases[] = { "SERPENT", "SERPENT-128", NULL }; static const char *serpent192_aliases[] = { "SERPENT-192", NULL }; static const char *serpent256_aliases[] = { "SERPENT-256", NULL }; gcry_cipher_spec_t _gcry_cipher_spec_serpent128 = { GCRY_CIPHER_SERPENT128, {0, 0}, "SERPENT128", serpent128_aliases, serpent128_oids, 16, 128, sizeof (serpent_context_t), serpent_setkey, serpent_encrypt, serpent_decrypt }; gcry_cipher_spec_t _gcry_cipher_spec_serpent192 = { GCRY_CIPHER_SERPENT192, {0, 0}, "SERPENT192", serpent192_aliases, serpent192_oids, 16, 192, 
sizeof (serpent_context_t), serpent_setkey, serpent_encrypt, serpent_decrypt }; gcry_cipher_spec_t _gcry_cipher_spec_serpent256 = { GCRY_CIPHER_SERPENT256, {0, 0}, "SERPENT256", serpent256_aliases, serpent256_oids, 16, 256, sizeof (serpent_context_t), serpent_setkey, serpent_encrypt, serpent_decrypt };
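/* Usage sketch (illustrative only, not part of the cipher code): with the
   specs above registered, the new XTS and ECB bulk paths in this file are
   reached through the ordinary libgcrypt application API.  For XTS, for
   example (hd, key, tweak, in, out and len are placeholders; error handling
   omitted):

     gcry_cipher_hd_t hd;

     gcry_cipher_open (&hd, GCRY_CIPHER_SERPENT128, GCRY_CIPHER_MODE_XTS, 0);
     gcry_cipher_setkey (hd, key, 32);        XTS takes a double-length key
     gcry_cipher_setiv (hd, tweak, 16);       the 16-byte XTS tweak
     gcry_cipher_encrypt (hd, out, len, in, len);
     gcry_cipher_close (hd);

   Multi-block XTS and ECB requests are then routed to the
   _gcry_serpent_xts_crypt and _gcry_serpent_ecb_crypt handlers registered in
   serpent_setkey.  */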