diff --git a/cipher/chacha20-ppc.c b/cipher/chacha20-ppc.c
index 17e2f090..985f2fcd 100644
--- a/cipher/chacha20-ppc.c
+++ b/cipher/chacha20-ppc.c
@@ -1,624 +1,625 @@
 /* chacha20-ppc.c - PowerPC vector implementation of ChaCha20
  * Copyright (C) 2019 Jussi Kivilinna
  *
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
  * Libgcrypt is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with this program; if not, see <http://www.gnu.org/licenses/>.
  */

 #include <config.h>

 #if defined(ENABLE_PPC_CRYPTO_SUPPORT) && \
     defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
     defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && \
     defined(USE_CHACHA20) && \
     __GNUC__ >= 4

 #include <altivec.h>
 #include "bufhelp.h"
 #include "poly1305-internal.h"

 #include "mpi-internal.h"
 #include "longlong.h"

 typedef vector unsigned char vector16x_u8;
 typedef vector unsigned int vector4x_u32;
 typedef vector unsigned long long vector2x_u64;

 #define ALWAYS_INLINE inline __attribute__((always_inline))
 #define NO_INLINE __attribute__((noinline))
 #define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))

 #define ASM_FUNC_ATTR          NO_INSTRUMENT_FUNCTION
 #define ASM_FUNC_ATTR_INLINE   ASM_FUNC_ATTR ALWAYS_INLINE
 #define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE

 #ifdef WORDS_BIGENDIAN
 static const vector16x_u8 le_bswap_const =
   { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
 #endif

 static ASM_FUNC_ATTR_INLINE vector4x_u32
 vec_rol_elems(vector4x_u32 v, unsigned int idx)
 {
 #ifndef WORDS_BIGENDIAN
   return vec_sld (v, v, (16 - (4 * idx)) & 15);
 #else
   return vec_sld (v, v, (4 * idx) & 15);
 #endif
 }

 static ASM_FUNC_ATTR_INLINE vector4x_u32
 vec_load_le(unsigned long offset, const unsigned char *ptr)
 {
   vector4x_u32 vec;
   vec = vec_vsx_ld (offset, (const u32 *)ptr);
 #ifdef WORDS_BIGENDIAN
   vec = (vector4x_u32)vec_perm((vector16x_u8)vec, (vector16x_u8)vec,
                                le_bswap_const);
 #endif
   return vec;
 }

 static ASM_FUNC_ATTR_INLINE void
 vec_store_le(vector4x_u32 vec, unsigned long offset, unsigned char *ptr)
 {
 #ifdef WORDS_BIGENDIAN
   vec = (vector4x_u32)vec_perm((vector16x_u8)vec, (vector16x_u8)vec,
                                le_bswap_const);
 #endif
   vec_vsx_st (vec, offset, (u32 *)ptr);
 }

 /**********************************************************************
   2-way && 1-way chacha20
  **********************************************************************/

 #define ROTATE(v1,rolv) \
         __asm__ ("vrlw %0,%1,%2\n\t" : "=v" (v1) : "v" (v1), "v" (rolv))

 #define WORD_ROL(v1,c) \
         ((v1) = vec_rol_elems((v1), (c)))

 #define XOR(ds,s) \
         ((ds) ^= (s))

 #define PLUS(ds,s) \
         ((ds) += (s))

 #define QUARTERROUND4(x0,x1,x2,x3,rol_x1,rol_x2,rol_x3) \
         PLUS(x0, x1); XOR(x3, x0); ROTATE(x3, rotate_16); \
         PLUS(x2, x3); XOR(x1, x2); ROTATE(x1, rotate_12); \
         PLUS(x0, x1); XOR(x3, x0); ROTATE(x3, rotate_8); \
         PLUS(x2, x3); \
         WORD_ROL(x3, rol_x3); \
         XOR(x1, x2); \
         WORD_ROL(x2, rol_x2); \
         ROTATE(x1, rotate_7); \
         WORD_ROL(x1, rol_x1);

 unsigned int ASM_FUNC_ATTR
 _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, const byte *src,
                             size_t nblks)
 {
   vector4x_u32 counter_1 = { 1, 0, 0, 0 };
   vector4x_u32 rotate_16 = { 16, 16, 16, 16 };
   vector4x_u32 rotate_12 = { 12, 12, 12, 12 };
   vector4x_u32 rotate_8 = { 8, 8, 8, 8 };
   vector4x_u32 rotate_7 = { 7, 7, 7, 7 };
   vector4x_u32 state0, state1, state2, state3;
   vector4x_u32 v0, v1, v2, v3;
   vector4x_u32 v4, v5, v6, v7;
   int i;

   /* force preload of constants to vector registers */
   __asm__ ("": "+v" (counter_1) :: "memory");
   __asm__ ("": "+v" (rotate_16) :: "memory");
   __asm__ ("": "+v" (rotate_12) :: "memory");
   __asm__ ("": "+v" (rotate_8) :: "memory");
   __asm__ ("": "+v" (rotate_7) :: "memory");

   state0 = vec_vsx_ld(0 * 16, state);
   state1 = vec_vsx_ld(1 * 16, state);
   state2 = vec_vsx_ld(2 * 16, state);
   state3 = vec_vsx_ld(3 * 16, state);

   while (nblks >= 2)
     {
       v0 = state0;
       v1 = state1;
       v2 = state2;
       v3 = state3;

       v4 = state0;
       v5 = state1;
       v6 = state2;
       v7 = state3;
       v7 += counter_1;

       for (i = 20; i > 0; i -= 2)
         {
           QUARTERROUND4(v0, v1, v2, v3, 1, 2, 3);
           QUARTERROUND4(v4, v5, v6, v7, 1, 2, 3);
           QUARTERROUND4(v0, v1, v2, v3, 3, 2, 1);
           QUARTERROUND4(v4, v5, v6, v7, 3, 2, 1);
         }

       v0 += state0;
       v1 += state1;
       v2 += state2;
       v3 += state3;
       state3 += counter_1; /* update counter */
       v4 += state0;
       v5 += state1;
       v6 += state2;
       v7 += state3;
       state3 += counter_1; /* update counter */

       v0 ^= vec_load_le(0 * 16, src);
       v1 ^= vec_load_le(1 * 16, src);
       v2 ^= vec_load_le(2 * 16, src);
       v3 ^= vec_load_le(3 * 16, src);
       vec_store_le(v0, 0 * 16, dst);
       vec_store_le(v1, 1 * 16, dst);
       vec_store_le(v2, 2 * 16, dst);
       vec_store_le(v3, 3 * 16, dst);
       src += 64;
       dst += 64;
       v4 ^= vec_load_le(0 * 16, src);
       v5 ^= vec_load_le(1 * 16, src);
       v6 ^= vec_load_le(2 * 16, src);
       v7 ^= vec_load_le(3 * 16, src);
       vec_store_le(v4, 0 * 16, dst);
       vec_store_le(v5, 1 * 16, dst);
       vec_store_le(v6, 2 * 16, dst);
       vec_store_le(v7, 3 * 16, dst);
       src += 64;
       dst += 64;

       nblks -= 2;
     }

   while (nblks)
     {
       v0 = state0;
       v1 = state1;
       v2 = state2;
       v3 = state3;

       for (i = 20; i > 0; i -= 2)
         {
           QUARTERROUND4(v0, v1, v2, v3, 1, 2, 3);
           QUARTERROUND4(v0, v1, v2, v3, 3, 2, 1);
         }

       v0 += state0;
       v1 += state1;
       v2 += state2;
       v3 += state3;
       state3 += counter_1; /* update counter */

       v0 ^= vec_load_le(0 * 16, src);
       v1 ^= vec_load_le(1 * 16, src);
       v2 ^= vec_load_le(2 * 16, src);
       v3 ^= vec_load_le(3 * 16, src);
       vec_store_le(v0, 0 * 16, dst);
       vec_store_le(v1, 1 * 16, dst);
       vec_store_le(v2, 2 * 16, dst);
       vec_store_le(v3, 3 * 16, dst);
       src += 64;
       dst += 64;

       nblks--;
     }

   vec_vsx_st(state3, 3 * 16, state); /* store counter */

   return 0;
 }

 /**********************************************************************
   4-way chacha20
  **********************************************************************/

 /* 4x4 32-bit integer matrix transpose */
 #define transpose_4x4(x0, x1, x2, x3) ({ \
         vector4x_u32 t1 = vec_mergeh(x0, x2); \
         vector4x_u32 t2 = vec_mergel(x0, x2); \
         vector4x_u32 t3 = vec_mergeh(x1, x3); \
         x3 = vec_mergel(x1, x3); \
         x0 = vec_mergeh(t1, t3); \
         x1 = vec_mergel(t1, t3); \
         x2 = vec_mergeh(t2, x3); \
         x3 = vec_mergel(t2, x3); \
       })

 #define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2) \
         PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \
             ROTATE(d1, rotate_16); ROTATE(d2, rotate_16); \
         PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \
             ROTATE(b1, rotate_12); ROTATE(b2, rotate_12); \
         PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \
             ROTATE(d1, rotate_8); ROTATE(d2, rotate_8); \
         PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \
             ROTATE(b1, rotate_7); ROTATE(b2, rotate_7);

 unsigned int ASM_FUNC_ATTR
 _gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
                             size_t nblks)
 {
   vector4x_u32 counters_0123 = { 0, 1, 2, 3 };
   vector4x_u32 counter_4 = { 4, 0, 0, 0 };
   vector4x_u32 rotate_16 = { 16, 16, 16, 16 };
   vector4x_u32 rotate_12 = { 12, 12, 12, 12 };
   vector4x_u32 rotate_8 = { 8, 8, 8, 8 };
   vector4x_u32 rotate_7 = { 7, 7, 7, 7 };
   vector4x_u32 state0, state1, state2, state3;
   vector4x_u32 v0, v1, v2, v3, v4, v5, v6, v7;
   vector4x_u32 v8, v9, v10, v11, v12, v13, v14, v15;
   vector4x_u32 tmp;
   int i;

   /* force preload of constants to vector registers */
   __asm__ ("": "+v" (counters_0123) :: "memory");
   __asm__ ("": "+v" (counter_4) :: "memory");
   __asm__ ("": "+v" (rotate_16) :: "memory");
   __asm__ ("": "+v" (rotate_12) :: "memory");
   __asm__ ("": "+v" (rotate_8) :: "memory");
   __asm__ ("": "+v" (rotate_7) :: "memory");

   state0 = vec_vsx_ld(0 * 16, state);
   state1 = vec_vsx_ld(1 * 16, state);
   state2 = vec_vsx_ld(2 * 16, state);
   state3 = vec_vsx_ld(3 * 16, state);

   do
     {
       v0 = vec_splat(state0, 0);
       v1 = vec_splat(state0, 1);
       v2 = vec_splat(state0, 2);
       v3 = vec_splat(state0, 3);
       v4 = vec_splat(state1, 0);
       v5 = vec_splat(state1, 1);
       v6 = vec_splat(state1, 2);
       v7 = vec_splat(state1, 3);
       v8 = vec_splat(state2, 0);
       v9 = vec_splat(state2, 1);
       v10 = vec_splat(state2, 2);
       v11 = vec_splat(state2, 3);
       v12 = vec_splat(state3, 0);
       v13 = vec_splat(state3, 1);
       v14 = vec_splat(state3, 2);
       v15 = vec_splat(state3, 3);

       v12 += counters_0123;
       v13 -= vec_cmplt(v12, counters_0123);

       for (i = 20; i > 0; i -= 2)
         {
           QUARTERROUND2(v0, v4, v8, v12, v1, v5, v9, v13)
           QUARTERROUND2(v2, v6, v10, v14, v3, v7, v11, v15)
           QUARTERROUND2(v0, v5, v10, v15, v1, v6, v11, v12)
           QUARTERROUND2(v2, v7, v8, v13, v3, v4, v9, v14)
         }

       v0 += vec_splat(state0, 0);
       v1 += vec_splat(state0, 1);
       v2 += vec_splat(state0, 2);
       v3 += vec_splat(state0, 3);
       v4 += vec_splat(state1, 0);
       v5 += vec_splat(state1, 1);
       v6 += vec_splat(state1, 2);
       v7 += vec_splat(state1, 3);
       v8 += vec_splat(state2, 0);
       v9 += vec_splat(state2, 1);
       v10 += vec_splat(state2, 2);
       v11 += vec_splat(state2, 3);
       tmp = vec_splat(state3, 0);
       tmp += counters_0123;
       v12 += tmp;
       v13 += vec_splat(state3, 1) - vec_cmplt(tmp, counters_0123);
       v14 += vec_splat(state3, 2);
       v15 += vec_splat(state3, 3);
       state3 += counter_4; /* update counter */

       transpose_4x4(v0, v1, v2, v3);
       transpose_4x4(v4, v5, v6, v7);
       transpose_4x4(v8, v9, v10, v11);
       transpose_4x4(v12, v13, v14, v15);

       v0 ^= vec_load_le((64 * 0 + 16 * 0), src);
       v1 ^= vec_load_le((64 * 1 + 16 * 0), src);
       v2 ^= vec_load_le((64 * 2 + 16 * 0), src);
       v3 ^= vec_load_le((64 * 3 + 16 * 0), src);
       v4 ^= vec_load_le((64 * 0 + 16 * 1), src);
       v5 ^= vec_load_le((64 * 1 + 16 * 1), src);
       v6 ^= vec_load_le((64 * 2 + 16 * 1), src);
       v7 ^= vec_load_le((64 * 3 + 16 * 1), src);
       v8 ^= vec_load_le((64 * 0 + 16 * 2), src);
       v9 ^= vec_load_le((64 * 1 + 16 * 2), src);
       v10 ^= vec_load_le((64 * 2 + 16 * 2), src);
       v11 ^= vec_load_le((64 * 3 + 16 * 2), src);
       v12 ^= vec_load_le((64 * 0 + 16 * 3), src);
       v13 ^= vec_load_le((64 * 1 + 16 * 3), src);
       v14 ^= vec_load_le((64 * 2 + 16 * 3), src);
       v15 ^= vec_load_le((64 * 3 + 16 * 3), src);
       vec_store_le(v0, (64 * 0 + 16 * 0), dst);
       vec_store_le(v1, (64 * 1 + 16 * 0), dst);
       vec_store_le(v2, (64 * 2 + 16 * 0), dst);
       vec_store_le(v3, (64 * 3 + 16 * 0), dst);
       vec_store_le(v4, (64 * 0 + 16 * 1), dst);
       vec_store_le(v5, (64 * 1 + 16 * 1), dst);
       vec_store_le(v6, (64 * 2 + 16 * 1), dst);
       vec_store_le(v7, (64 * 3 + 16 * 1), dst);
       vec_store_le(v8, (64 * 0 + 16 * 2), dst);
       vec_store_le(v9, (64 * 1 + 16 * 2), dst);
       vec_store_le(v10, (64 * 2 + 16 * 2), dst);
       vec_store_le(v11, (64 * 3 + 16 * 2), dst);
       vec_store_le(v12, (64 * 0 + 16 * 3), dst);
       vec_store_le(v13, (64 * 1 + 16 * 3), dst);
       vec_store_le(v14, (64 * 2 + 16 * 3), dst);
       vec_store_le(v15, (64 * 3 + 16 * 3), dst);

       src += 4*64;
       dst += 4*64;

       nblks -= 4;
     }
   while (nblks);
   vec_vsx_st(state3, 3 * 16, state); /* store counter */

   return 0;
 }

 #if SIZEOF_UNSIGNED_LONG == 8

 /**********************************************************************
   4-way stitched chacha20-poly1305
  **********************************************************************/

 #define ADD_1305_64(A2, A1, A0, B2, B1, B0) \
       __asm__ ("addc %0, %3, %0\n" \
                "adde %1, %4, %1\n" \
                "adde %2, %5, %2\n" \
                : "+r" (A0), "+r" (A1), "+r" (A2) \
                : "r" (B0), "r" (B1), "r" (B2) \
                : "cc" )

 #define MUL_MOD_1305_64_PART1(H2, H1, H0, R1, R0, R1_MULT5) do { \
     /* x = a * r (partial mod 2^130-5) */ \
     umul_ppmm(x0_hi, x0_lo, H0, R0);  /* h0 * r0 */ \
     umul_ppmm(x1_hi, x1_lo, H0, R1);  /* h0 * r1 */ \
     \
     umul_ppmm(t0_hi, t0_lo, H1, R1_MULT5); /* h1 * r1 mod 2^130-5 */ \
   } while (0)

 #define MUL_MOD_1305_64_PART2(H2, H1, H0, R1, R0, R1_MULT5) do { \
     add_ssaaaa(x0_hi, x0_lo, x0_hi, x0_lo, t0_hi, t0_lo); \
     umul_ppmm(t1_hi, t1_lo, H1, R0); /* h1 * r0 */ \
     add_ssaaaa(x1_hi, x1_lo, x1_hi, x1_lo, t1_hi, t1_lo); \
     \
     t1_lo = H2 * R1_MULT5; /* h2 * r1 mod 2^130-5 */ \
     t1_hi = H2 * R0;       /* h2 * r0 */ \
     add_ssaaaa(H0, H1, x1_hi, x1_lo, t1_hi, t1_lo); \
     \
     /* carry propagation */ \
     H2 = H0 & 3; \
     H0 = (H0 >> 2) * 5; /* msb mod 2^130-5 */ \
     ADD_1305_64(H2, H1, H0, (u64)0, x0_hi, x0_lo); \
   } while (0)

 #define POLY1305_BLOCK_PART1(in_pos) do { \
     m0 = buf_get_le64(poly1305_src + (in_pos) + 0); \
     m1 = buf_get_le64(poly1305_src + (in_pos) + 8); \
     /* a = h + m */ \
     ADD_1305_64(h2, h1, h0, m2, m1, m0); \
     /* h = a * r (partial mod 2^130-5) */ \
     MUL_MOD_1305_64_PART1(h2, h1, h0, r1, r0, r1_mult5); \
   } while (0)

 #define POLY1305_BLOCK_PART2(in_pos) do { \
     MUL_MOD_1305_64_PART2(h2, h1, h0, r1, r0, r1_mult5); \
   } while (0)

 unsigned int ASM_FUNC_ATTR
 _gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
                                      size_t nblks, POLY1305_STATE *st,
                                      const byte *poly1305_src)
 {
   vector4x_u32 counters_0123 = { 0, 1, 2, 3 };
   vector4x_u32 counter_4 = { 4, 0, 0, 0 };
   vector4x_u32 rotate_16 = { 16, 16, 16, 16 };
   vector4x_u32 rotate_12 = { 12, 12, 12, 12 };
   vector4x_u32 rotate_8 = { 8, 8, 8, 8 };
   vector4x_u32 rotate_7 = { 7, 7, 7, 7 };
   vector4x_u32 state0, state1, state2, state3;
   vector4x_u32 v0, v1, v2, v3, v4, v5, v6, v7;
   vector4x_u32 v8, v9, v10, v11, v12, v13, v14, v15;
   vector4x_u32 tmp;
   u64 r0, r1, r1_mult5;
   u64 h0, h1, h2;
   u64 m0, m1, m2;
   u64 x0_lo, x0_hi, x1_lo, x1_hi;
   u64 t0_lo, t0_hi, t1_lo, t1_hi;
-  int i;
+  unsigned int i, o;

   /* load poly1305 state */
   m2 = 1;
   h0 = st->h[0] + ((u64)st->h[1] << 32);
   h1 = st->h[2] + ((u64)st->h[3] << 32);
   h2 = st->h[4];
   r0 = st->r[0] + ((u64)st->r[1] << 32);
   r1 = st->r[2] + ((u64)st->r[3] << 32);
   r1_mult5 = (r1 >> 2) + r1;

   /* force preload of constants to vector registers */
   __asm__ ("": "+v" (counters_0123) :: "memory");
   __asm__ ("": "+v" (counter_4) :: "memory");
   __asm__ ("": "+v" (rotate_16) :: "memory");
   __asm__ ("": "+v" (rotate_12) :: "memory");
   __asm__ ("": "+v" (rotate_8) :: "memory");
   __asm__ ("": "+v" (rotate_7) :: "memory");

   state0 = vec_vsx_ld(0 * 16, state);
   state1 = vec_vsx_ld(1 * 16, state);
   state2 = vec_vsx_ld(2 * 16, state);
   state3 = vec_vsx_ld(3 * 16, state);

   do
     {
       v0 = vec_splat(state0, 0);
       v1 = vec_splat(state0, 1);
       v2 = vec_splat(state0, 2);
       v3 = vec_splat(state0, 3);
       v4 = vec_splat(state1, 0);
       v5 = vec_splat(state1, 1);
       v6 = vec_splat(state1, 2);
       v7 = vec_splat(state1, 3);
       v8 = vec_splat(state2, 0);
       v9 = vec_splat(state2, 1);
       v10 = vec_splat(state2, 2);
       v11 = vec_splat(state2, 3);
       v12 = vec_splat(state3, 0);
       v13 = vec_splat(state3, 1);
       v14 = vec_splat(state3, 2);
       v15 = vec_splat(state3, 3);

       v12 += counters_0123;
       v13 -= vec_cmplt(v12, counters_0123);

-      for (i = 0; i < 16; i += 2)
-        {
-          POLY1305_BLOCK_PART1((i + 0) * 16);
-          QUARTERROUND2(v0, v4, v8, v12, v1, v5, v9, v13)
-          POLY1305_BLOCK_PART2();
-          QUARTERROUND2(v2, v6, v10, v14, v3, v7, v11, v15)
-          POLY1305_BLOCK_PART1((i + 1) * 16);
-          QUARTERROUND2(v0, v5, v10, v15, v1, v6, v11, v12)
-          POLY1305_BLOCK_PART2();
-          QUARTERROUND2(v2, v7, v8, v13, v3, v4, v9, v14)
-        }
-      for (; i < 20; i += 2)
+      for (o = 20; o; o -= 10)
         {
+          for (i = 8; i; i -= 2)
+            {
+              POLY1305_BLOCK_PART1(0 * 16);
+              QUARTERROUND2(v0, v4, v8, v12, v1, v5, v9, v13)
+              POLY1305_BLOCK_PART2();
+              QUARTERROUND2(v2, v6, v10, v14, v3, v7, v11, v15)
+              POLY1305_BLOCK_PART1(1 * 16);
+              poly1305_src += 2 * 16;
+              QUARTERROUND2(v0, v5, v10, v15, v1, v6, v11, v12)
+              POLY1305_BLOCK_PART2();
+              QUARTERROUND2(v2, v7, v8, v13, v3, v4, v9, v14)
+            }
+
           QUARTERROUND2(v0, v4, v8, v12, v1, v5, v9, v13)
           QUARTERROUND2(v2, v6, v10, v14, v3, v7, v11, v15)
           QUARTERROUND2(v0, v5, v10, v15, v1, v6, v11, v12)
           QUARTERROUND2(v2, v7, v8, v13, v3, v4, v9, v14)
         }

       v0 += vec_splat(state0, 0);
       v1 += vec_splat(state0, 1);
       v2 += vec_splat(state0, 2);
       v3 += vec_splat(state0, 3);
       v4 += vec_splat(state1, 0);
       v5 += vec_splat(state1, 1);
       v6 += vec_splat(state1, 2);
       v7 += vec_splat(state1, 3);
       v8 += vec_splat(state2, 0);
       v9 += vec_splat(state2, 1);
       v10 += vec_splat(state2, 2);
       v11 += vec_splat(state2, 3);
       tmp = vec_splat(state3, 0);
       tmp += counters_0123;
       v12 += tmp;
       v13 += vec_splat(state3, 1) - vec_cmplt(tmp, counters_0123);
       v14 += vec_splat(state3, 2);
       v15 += vec_splat(state3, 3);
       state3 += counter_4; /* update counter */

       transpose_4x4(v0, v1, v2, v3);
       transpose_4x4(v4, v5, v6, v7);
       transpose_4x4(v8, v9, v10, v11);
       transpose_4x4(v12, v13, v14, v15);

       v0 ^= vec_load_le((64 * 0 + 16 * 0), src);
       v1 ^= vec_load_le((64 * 1 + 16 * 0), src);
       v2 ^= vec_load_le((64 * 2 + 16 * 0), src);
       v3 ^= vec_load_le((64 * 3 + 16 * 0), src);
       v4 ^= vec_load_le((64 * 0 + 16 * 1), src);
       v5 ^= vec_load_le((64 * 1 + 16 * 1), src);
       v6 ^= vec_load_le((64 * 2 + 16 * 1), src);
       v7 ^= vec_load_le((64 * 3 + 16 * 1), src);
       v8 ^= vec_load_le((64 * 0 + 16 * 2), src);
       v9 ^= vec_load_le((64 * 1 + 16 * 2), src);
       v10 ^= vec_load_le((64 * 2 + 16 * 2), src);
       v11 ^= vec_load_le((64 * 3 + 16 * 2), src);
       v12 ^= vec_load_le((64 * 0 + 16 * 3), src);
       v13 ^= vec_load_le((64 * 1 + 16 * 3), src);
       v14 ^= vec_load_le((64 * 2 + 16 * 3), src);
       v15 ^= vec_load_le((64 * 3 + 16 * 3), src);
       vec_store_le(v0, (64 * 0 + 16 * 0), dst);
       vec_store_le(v1, (64 * 1 + 16 * 0), dst);
       vec_store_le(v2, (64 * 2 + 16 * 0), dst);
       vec_store_le(v3, (64 * 3 + 16 * 0), dst);
       vec_store_le(v4, (64 * 0 + 16 * 1), dst);
       vec_store_le(v5, (64 * 1 + 16 * 1), dst);
       vec_store_le(v6, (64 * 2 + 16 * 1), dst);
       vec_store_le(v7, (64 * 3 + 16 * 1), dst);
       vec_store_le(v8, (64 * 0 + 16 * 2), dst);
       vec_store_le(v9, (64 * 1 + 16 * 2), dst);
       vec_store_le(v10, (64 * 2 + 16 * 2), dst);
       vec_store_le(v11, (64 * 3 + 16 * 2), dst);
       vec_store_le(v12, (64 * 0 + 16 * 3), dst);
       vec_store_le(v13, (64 * 1 + 16 * 3), dst);
       vec_store_le(v14, (64 * 2 + 16 * 3), dst);
       vec_store_le(v15, (64 * 3 + 16 * 3), dst);

       src += 4*64;
       dst += 4*64;

-      poly1305_src += 16*16;
       nblks -= 4;
     }
   while (nblks);

   vec_vsx_st(state3, 3 * 16, state); /* store counter */

   /* store poly1305 state */
   st->h[0] = h0;
   st->h[1] = h0 >> 32;
   st->h[2] = h1;
   st->h[3] = h1 >> 32;
   st->h[4] = h2;

   return 0;
 }

 #endif /* SIZEOF_UNSIGNED_LONG == 8 */

 #endif /* ENABLE_PPC_CRYPTO_SUPPORT */