Index: cipher/rijndael.c
===================================================================
--- cipher/rijndael.c
+++ cipher/rijndael.c
@@ -201,28 +201,711 @@
 #ifdef USE_PPC_ASM
 /* POWER 8 AES extensions */
-extern void aes_p8_encrypt (const unsigned char *in,
-                            unsigned char *out,
-                            const RIJNDAEL_context *ctx);
+#include <altivec.h>
+
+typedef vector unsigned char block;
+vector unsigned char backwards = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+
+#ifdef __LITTLE_ENDIAN__
+#define swap_if_le(a) \
+  vec_perm (a, a, backwards)
+#elif __BIG_ENDIAN__
+#define swap_if_le(a) (a)
+#else
+#error "What endianness?"
+#endif
+
+/* Single-block cipher cores.  The block is passed and returned in an
+ * AltiVec register in big-endian byte order.  Compilers do not unroll
+ * an outer block loop into these static functions to use more
+ * registers, so the bulk functions below unroll by hand to keep an
+ * out-of-order, multi-issue pipeline busy.  */
+static block _gcry_aes_ppc8_encrypt_altivec (const RIJNDAEL_context *ctx,
+                                             block a)
+{
+  int r;
+  int rounds = ctx->rounds;
+  block *rk = (block*)ctx->keyschenc;
+
+  a = rk[0] ^ a;
+  for (r = 1; r < rounds; r++)
+    {
+      __asm__ volatile ("vcipher %0, %0, %1\n\t"
+        :"+v" (a) :"v" (rk[r]));
+    }
+  __asm__ volatile ("vcipherlast %0, %0, %1\n\t"
+    :"+v" (a) :"v" (rk[r]));
+  return a;
+}
+
+static block _gcry_aes_ppc8_decrypt_altivec (const RIJNDAEL_context *ctx,
+                                             block a)
+{
+  int r;
+  int rounds = ctx->rounds;
+  block *rk = (block*)ctx->keyschdec;
+
+  a = rk[0] ^ a;
+  for (r = 1; r < rounds; r++)
+    {
+      __asm__ volatile ("vncipher %0, %0, %1\n\t"
+        :"+v" (a) :"v" (rk[r]));
+    }
+  __asm__ volatile ("vncipherlast %0, %0, %1\n\t"
+    :"+v" (a) :"v" (rk[r]));
+  return a;
+}
+
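+/* Single-block entry points used by the generic code.  The input may
+ * be unaligned, so it is loaded via vec_lvsl/vec_perm when needed and
+ * byte-swapped into big-endian order before calling the AltiVec core;
+ * the result is written back either with vec_vsx_st after swapping
+ * back, or with stxvb16x, which stores the register in big-endian
+ * byte order on either endianness.  */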
 static unsigned int _gcry_aes_ppc8_encrypt (const RIJNDAEL_context *ctx,
-                                            unsigned char *out,
-                                            const unsigned char *in)
+                                            unsigned char *b,
+                                            const unsigned char *a)
 {
-  /* When I tried to switch these registers in the assembly it broke. */
-  aes_p8_encrypt (in, out, ctx);
+  uintptr_t zero = 0;
+  block sa;
+
+  if ((uintptr_t)a % 16 == 0)
+    {
+      sa = vec_ld (0, a);
+    }
+  else
+    {
+      block unalignedprev, unalignedcur;
+      unalignedprev = vec_ld (0, a);
+      unalignedcur = vec_ld (16, a);
+      sa = vec_perm (unalignedprev, unalignedcur, vec_lvsl (0, a));
+    }
+
+  sa = swap_if_le (sa);
+  sa = _gcry_aes_ppc8_encrypt_altivec (ctx, sa);
+
+  __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
+    :
+    : "wa" (sa), "r" (zero), "r" ((uintptr_t)b));
+
   return 0; /* does not use stack */
 }
 
-/* this is the decryption key part of context */
-extern void aes_p8_decrypt (const unsigned char *in,
-                            unsigned char *out,
-                            const void *sboxes);
 static unsigned int _gcry_aes_ppc8_decrypt (const RIJNDAEL_context *ctx,
-                                            unsigned char *out,
-                                            const unsigned char *in)
+                                            unsigned char *b,
+                                            const unsigned char *a)
 {
-  aes_p8_decrypt (in, out, &ctx->u2);
+  uintptr_t zero = 0;
+  block sa, unalignedprev, unalignedcur;
+
+  if ((uintptr_t)a % 16 == 0)
+    {
+      sa = vec_ld (0, a);
+    }
+  else
+    {
+      unalignedprev = vec_ld (0, a);
+      unalignedcur = vec_ld (16, a);
+      sa = vec_perm (unalignedprev, unalignedcur, vec_lvsl (0, a));
+    }
+
+  sa = swap_if_le (sa);
+  sa = _gcry_aes_ppc8_decrypt_altivec (ctx, sa);
+
+  if ((uintptr_t)b % 16 == 0)
+    vec_vsx_st (swap_if_le (sa), 0, b);
+  else
+    {
+      __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
+        :
+        : "wa" (sa), "r" (zero), "r" ((uintptr_t)b));
+    }
   return 0; /* does not use stack */
 }
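+
+/* Bulk OCB en/decryption of NBLOCKS 16-byte blocks.  For each block:
+ *   Offset_i   = Offset_{i-1} xor L_{ntz(i)}
+ *   C_i        = Offset_i xor ENCIPHER(K, P_i xor Offset_i)
+ *   Checksum_i = Checksum_{i-1} xor P_i
+ * Eight blocks are processed per iteration of the main loop so the
+ * vcipher/vncipher instructions of independent blocks can overlap in
+ * the pipeline; a tail loop handles the remaining blocks.  */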
+size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+                                 const void *inbuf_arg, size_t nblocks,
+                                 int encrypt)
+{
+  RIJNDAEL_context *ctx = (void *)&c->context.c;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+
+  block *in = (block*)inbuf;
+  block *out = (block*)outbuf;
+  uintptr_t zero = 0;
+  int r;
+  int rounds = ctx->rounds;
+
+  if (encrypt)
+    {
+      const int unroll = 8;
+      block unalignedprev, ctr, iv;
+      if (((uintptr_t)inbuf % 16) != 0)
+        {
+          unalignedprev = vec_ld (0, in++);
+        }
+
+      iv = vec_ld (0, (block*)&c->u_iv.iv);
+      ctr = vec_ld (0, (block*)&c->u_ctr.ctr);
+
+      for ( ; nblocks >= unroll; nblocks -= unroll)
+        {
+          u64 i = c->u_mode.ocb.data_nblocks + 1;
+          block l0, l1, l2, l3, l4, l5, l6, l7;
+          block b0, b1, b2, b3, b4, b5, b6, b7;
+          block iv0, iv1, iv2, iv3, iv4, iv5, iv6, iv7;
+          const block *rk = (block*)&ctx->keyschenc;
+
+          c->u_mode.ocb.data_nblocks += unroll;
+
+          iv0 = iv;
+          if ((uintptr_t)inbuf % 16 == 0)
+            {
+              b0 = vec_ld (0, in++);
+              b1 = vec_ld (0, in++);
+              b2 = vec_ld (0, in++);
+              b3 = vec_ld (0, in++);
+              b4 = vec_ld (0, in++);
+              b5 = vec_ld (0, in++);
+              b6 = vec_ld (0, in++);
+              b7 = vec_ld (0, in++);
+            }
+          else
+            {
+              block unaligned0, unaligned1, unaligned2,
+                    unaligned3, unaligned4, unaligned5, unaligned6;
+              unaligned0 = vec_ld (0, in++);
+              unaligned1 = vec_ld (0, in++);
+              unaligned2 = vec_ld (0, in++);
+              unaligned3 = vec_ld (0, in++);
+              unaligned4 = vec_ld (0, in++);
+              unaligned5 = vec_ld (0, in++);
+              unaligned6 = vec_ld (0, in++);
+              b0 = vec_perm (unalignedprev, unaligned0, vec_lvsl (0, inbuf));
+              unalignedprev = vec_ld (0, in++);
+              b1 = vec_perm (unaligned0, unaligned1, vec_lvsl (0, inbuf));
+              b2 = vec_perm (unaligned1, unaligned2, vec_lvsl (0, inbuf));
+              b3 = vec_perm (unaligned2, unaligned3, vec_lvsl (0, inbuf));
+              b4 = vec_perm (unaligned3, unaligned4, vec_lvsl (0, inbuf));
+              b5 = vec_perm (unaligned4, unaligned5, vec_lvsl (0, inbuf));
+              b6 = vec_perm (unaligned5, unaligned6, vec_lvsl (0, inbuf));
+              b7 = vec_perm (unaligned6, unalignedprev, vec_lvsl (0, inbuf));
+            }
+
+          l0 = *(block*)ocb_get_l (c, i++);
+          l1 = *(block*)ocb_get_l (c, i++);
+          l2 = *(block*)ocb_get_l (c, i++);
+          l3 = *(block*)ocb_get_l (c, i++);
+          l4 = *(block*)ocb_get_l (c, i++);
+          l5 = *(block*)ocb_get_l (c, i++);
+          l6 = *(block*)ocb_get_l (c, i++);
+          l7 = *(block*)ocb_get_l (c, i++);
+
+          ctr ^= b0 ^ b1 ^ b2 ^ b3 ^ b4 ^ b5 ^ b6 ^ b7;
+
+          iv0 ^= l0;
+          b0 ^= iv0;
+          iv1 = iv0 ^ l1;
+          b1 ^= iv1;
+          iv2 = iv1 ^ l2;
+          b2 ^= iv2;
+          iv3 = iv2 ^ l3;
+          b3 ^= iv3;
+          iv4 = iv3 ^ l4;
+          b4 ^= iv4;
+          iv5 = iv4 ^ l5;
+          b5 ^= iv5;
+          iv6 = iv5 ^ l6;
+          b6 ^= iv6;
+          iv7 = iv6 ^ l7;
+          b7 ^= iv7;
+
+          b0 = swap_if_le (b0);
+          b1 = swap_if_le (b1);
+          b2 = swap_if_le (b2);
+          b3 = swap_if_le (b3);
+          b4 = swap_if_le (b4);
+          b5 = swap_if_le (b5);
+          b6 = swap_if_le (b6);
+          b7 = swap_if_le (b7);
+
+          b0 ^= rk[0];
+          b1 ^= rk[0];
+          b2 ^= rk[0];
+          b3 ^= rk[0];
+          b4 ^= rk[0];
+          b5 ^= rk[0];
+          b6 ^= rk[0];
+          b7 ^= rk[0];
+
+          for (r = 1; r < rounds; r++)
+            {
+              __asm__ volatile ("vcipher %0, %0, %1\n\t"
+                :"+v" (b0) :"v" (rk[r]));
+              __asm__ volatile ("vcipher %0, %0, %1\n\t"
+                :"+v" (b1) :"v" (rk[r]));
+              __asm__ volatile ("vcipher %0, %0, %1\n\t"
+                :"+v" (b2) :"v" (rk[r]));
+              __asm__ volatile ("vcipher %0, %0, %1\n\t"
+                :"+v" (b3) :"v" (rk[r]));
+              __asm__ volatile ("vcipher %0, %0, %1\n\t"
+                :"+v" (b4) :"v" (rk[r]));
+              __asm__ volatile ("vcipher %0, %0, %1\n\t"
+                :"+v" (b5) :"v" (rk[r]));
+              __asm__ volatile ("vcipher %0, %0, %1\n\t"
+                :"+v" (b6) :"v" (rk[r]));
+              __asm__ volatile ("vcipher %0, %0, %1\n\t"
+                :"+v" (b7) :"v" (rk[r]));
+            }
+          __asm__ volatile ("vcipherlast %0, %0, %1\n\t"
+            :"+v" (b0) :"v" (rk[r]));
+          __asm__ volatile ("vcipherlast %0, %0, %1\n\t"
+            :"+v" (b1) :"v" (rk[r]));
+          __asm__ volatile ("vcipherlast %0, %0, %1\n\t"
+            :"+v" (b2) :"v" (rk[r]));
+          __asm__ volatile ("vcipherlast %0, %0, %1\n\t"
+            :"+v" (b3) :"v" (rk[r]));
+          __asm__ volatile ("vcipherlast %0, %0, %1\n\t"
+            :"+v" (b4) :"v" (rk[r]));
+          __asm__ volatile ("vcipherlast %0, %0, %1\n\t"
+            :"+v" (b5) :"v" (rk[r]));
+          __asm__ volatile ("vcipherlast %0, %0, %1\n\t"
+            :"+v" (b6) :"v" (rk[r]));
+          __asm__ volatile ("vcipherlast %0, %0, %1\n\t"
+            :"+v" (b7) :"v" (rk[r]));
+
+          iv = iv7;
+
+          /* The unaligned store stxvb16x writes the register in
+           * big-endian byte order, so in the unaligned case we swap
+           * the offsets instead of the data.  */
+          if ((uintptr_t)outbuf % 16 == 0)
+            {
+              vec_vsx_st (swap_if_le (b0) ^ iv0, 0, out++);
+              vec_vsx_st (swap_if_le (b1) ^ iv1, 0, out++);
+              vec_vsx_st (swap_if_le (b2) ^ iv2, 0, out++);
+              vec_vsx_st (swap_if_le (b3) ^ iv3, 0, out++);
+              vec_vsx_st (swap_if_le (b4) ^ iv4, 0, out++);
+              vec_vsx_st (swap_if_le (b5) ^ iv5, 0, out++);
+              vec_vsx_st (swap_if_le (b6) ^ iv6, 0, out++);
+              vec_vsx_st (swap_if_le (b7) ^ iv7, 0, out++);
+            }
+          else
+            {
+              b0 ^= swap_if_le (iv0);
+              b1 ^= swap_if_le (iv1);
+              b2 ^= swap_if_le (iv2);
+              b3 ^= swap_if_le (iv3);
+              b4 ^= swap_if_le (iv4);
+              b5 ^= swap_if_le (iv5);
+              b6 ^= swap_if_le (iv6);
+              b7 ^= swap_if_le (iv7);
+              __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
+                :: "wa" (b0), "r" (zero), "r" ((uintptr_t)(out++)));
+              __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
+                :: "wa" (b1), "r" (zero), "r" ((uintptr_t)(out++)));
+              __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
+                :: "wa" (b2), "r" (zero), "r" ((uintptr_t)(out++)));
+              __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
+                :: "wa" (b3), "r" (zero), "r" ((uintptr_t)(out++)));
+              __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
+                :: "wa" (b4), "r" (zero), "r" ((uintptr_t)(out++)));
+              __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
+                :: "wa" (b5), "r" (zero), "r" ((uintptr_t)(out++)));
+              __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
+                :: "wa" (b6), "r" (zero), "r" ((uintptr_t)(out++)));
+              __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
+                :: "wa" (b7), "r" (zero), "r" ((uintptr_t)(out++)));
+            }
+        }
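+
+      /* Process the remaining blocks (fewer than eight) one at a time
+       * with the single-block AltiVec core.  */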
+      for ( ; nblocks; nblocks-- )
+        {
+          block b;
+          u64 i = ++c->u_mode.ocb.data_nblocks;
+          const block l = *(block*)ocb_get_l (c, i);
+
+          /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+          iv ^= l;
+          if ((uintptr_t)in % 16 == 0)
+            {
+              b = vec_ld (0, in++);
+            }
+          else
+            {
+              block unalignedprevprev;
+              unalignedprevprev = unalignedprev;
+              unalignedprev = vec_ld (0, in++);
+              b = vec_perm (unalignedprevprev, unalignedprev, vec_lvsl (0, inbuf));
+            }
+          /* Checksum_i = Checksum_{i-1} xor P_i */
+          ctr ^= b;
+          /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+          b ^= iv;
+          b = swap_if_le (b);
+          b = _gcry_aes_ppc8_encrypt_altivec (ctx, b);
+          if ((uintptr_t)out % 16 == 0)
+            vec_vsx_st (swap_if_le (b) ^ iv, 0, out++);
+          else
+            {
+              b ^= swap_if_le (iv);
+              __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
+                :
+                : "wa" (b), "r" (zero), "r" ((uintptr_t)out++));
+            }
+        }
+
+      /* iv and ctr were loaded with vec_ld, which on little-endian
+       * yields the reverse byte order of what stxvb16x stores, so swap
+       * them back before writing them out.  */
+      iv = swap_if_le (iv);
+      __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
+        :: "wa" (iv), "r" (zero), "r" ((uintptr_t)&c->u_iv.iv));
+      ctr = swap_if_le (ctr);
+      __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
+        :: "wa" (ctr), "r" (zero), "r" ((uintptr_t)&c->u_ctr.ctr));
+    }
+  else
+    {
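+      /* Decryption mirrors the code above: the offsets are chained the
+       * same way, vncipher/vncipherlast replace vcipher/vcipherlast,
+       * and the checksum is accumulated over the decrypted plaintext
+       * instead of the input.  */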
+      const int unroll = 8;
+      block unalignedprev, ctr, iv;
+      if (((uintptr_t)inbuf % 16) != 0)
+        {
+          unalignedprev = vec_ld (0, in++);
+        }
+
+      iv = vec_ld (0, (block*)&c->u_iv.iv);
+      ctr = vec_ld (0, (block*)&c->u_ctr.ctr);
+
+      for ( ; nblocks >= unroll; nblocks -= unroll)
+        {
+          u64 i = c->u_mode.ocb.data_nblocks + 1;
+          block l0, l1, l2, l3, l4, l5, l6, l7;
+          block b0, b1, b2, b3, b4, b5, b6, b7;
+          block iv0, iv1, iv2, iv3, iv4, iv5, iv6, iv7;
+          const block *rk = (block*)&ctx->keyschdec;
+
+          c->u_mode.ocb.data_nblocks += unroll;
+
+          iv0 = iv;
+          if ((uintptr_t)inbuf % 16 == 0)
+            {
+              b0 = vec_ld (0, in++);
+              b1 = vec_ld (0, in++);
+              b2 = vec_ld (0, in++);
+              b3 = vec_ld (0, in++);
+              b4 = vec_ld (0, in++);
+              b5 = vec_ld (0, in++);
+              b6 = vec_ld (0, in++);
+              b7 = vec_ld (0, in++);
+            }
+          else
+            {
+              block unaligned0, unaligned1, unaligned2,
+                    unaligned3, unaligned4, unaligned5, unaligned6;
+              unaligned0 = vec_ld (0, in++);
+              unaligned1 = vec_ld (0, in++);
+              unaligned2 = vec_ld (0, in++);
+              unaligned3 = vec_ld (0, in++);
+              unaligned4 = vec_ld (0, in++);
+              unaligned5 = vec_ld (0, in++);
+              unaligned6 = vec_ld (0, in++);
+              b0 = vec_perm (unalignedprev, unaligned0, vec_lvsl (0, inbuf));
+              unalignedprev = vec_ld (0, in++);
+              b1 = vec_perm (unaligned0, unaligned1, vec_lvsl (0, inbuf));
+              b2 = vec_perm (unaligned1, unaligned2, vec_lvsl (0, inbuf));
+              b3 = vec_perm (unaligned2, unaligned3, vec_lvsl (0, inbuf));
+              b4 = vec_perm (unaligned3, unaligned4, vec_lvsl (0, inbuf));
+              b5 = vec_perm (unaligned4, unaligned5, vec_lvsl (0, inbuf));
+              b6 = vec_perm (unaligned5, unaligned6, vec_lvsl (0, inbuf));
+              b7 = vec_perm (unaligned6, unalignedprev, vec_lvsl (0, inbuf));
+            }
+
+          l0 = *(block*)ocb_get_l (c, i++);
+          l1 = *(block*)ocb_get_l (c, i++);
+          l2 = *(block*)ocb_get_l (c, i++);
+          l3 = *(block*)ocb_get_l (c, i++);
+          l4 = *(block*)ocb_get_l (c, i++);
+          l5 = *(block*)ocb_get_l (c, i++);
+          l6 = *(block*)ocb_get_l (c, i++);
+          l7 = *(block*)ocb_get_l (c, i++);
+
+          iv0 ^= l0;
+          b0 ^= iv0;
+          iv1 = iv0 ^ l1;
+          b1 ^= iv1;
+          iv2 = iv1 ^ l2;
+          b2 ^= iv2;
+          iv3 = iv2 ^ l3;
+          b3 ^= iv3;
+          iv4 = iv3 ^ l4;
+          b4 ^= iv4;
+          iv5 = iv4 ^ l5;
+          b5 ^= iv5;
+          iv6 = iv5 ^ l6;
+          b6 ^= iv6;
+          iv7 = iv6 ^ l7;
+          b7 ^= iv7;
+
+          b0 = swap_if_le (b0);
+          b1 = swap_if_le (b1);
+          b2 = swap_if_le (b2);
+          b3 = swap_if_le (b3);
+          b4 = swap_if_le (b4);
+          b5 = swap_if_le (b5);
+          b6 = swap_if_le (b6);
+          b7 = swap_if_le (b7);
+
+          b0 ^= rk[0];
+          b1 ^= rk[0];
+          b2 ^= rk[0];
+          b3 ^= rk[0];
+          b4 ^= rk[0];
+          b5 ^= rk[0];
+          b6 ^= rk[0];
+          b7 ^= rk[0];
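+
+          /* Run all eight blocks through the inverse rounds in
+           * lockstep so the vncipher issues of independent blocks can
+           * overlap in the pipeline.  */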
+          for (r = 1; r < rounds; r++)
+            {
+              __asm__ volatile ("vncipher %0, %0, %1\n\t"
+                :"+v" (b0) :"v" (rk[r]));
+              __asm__ volatile ("vncipher %0, %0, %1\n\t"
+                :"+v" (b1) :"v" (rk[r]));
+              __asm__ volatile ("vncipher %0, %0, %1\n\t"
+                :"+v" (b2) :"v" (rk[r]));
+              __asm__ volatile ("vncipher %0, %0, %1\n\t"
+                :"+v" (b3) :"v" (rk[r]));
+              __asm__ volatile ("vncipher %0, %0, %1\n\t"
+                :"+v" (b4) :"v" (rk[r]));
+              __asm__ volatile ("vncipher %0, %0, %1\n\t"
+                :"+v" (b5) :"v" (rk[r]));
+              __asm__ volatile ("vncipher %0, %0, %1\n\t"
+                :"+v" (b6) :"v" (rk[r]));
+              __asm__ volatile ("vncipher %0, %0, %1\n\t"
+                :"+v" (b7) :"v" (rk[r]));
+            }
+          __asm__ volatile ("vncipherlast %0, %0, %1\n\t"
+            :"+v" (b0) :"v" (rk[r]));
+          __asm__ volatile ("vncipherlast %0, %0, %1\n\t"
+            :"+v" (b1) :"v" (rk[r]));
+          __asm__ volatile ("vncipherlast %0, %0, %1\n\t"
+            :"+v" (b2) :"v" (rk[r]));
+          __asm__ volatile ("vncipherlast %0, %0, %1\n\t"
+            :"+v" (b3) :"v" (rk[r]));
+          __asm__ volatile ("vncipherlast %0, %0, %1\n\t"
+            :"+v" (b4) :"v" (rk[r]));
+          __asm__ volatile ("vncipherlast %0, %0, %1\n\t"
+            :"+v" (b5) :"v" (rk[r]));
+          __asm__ volatile ("vncipherlast %0, %0, %1\n\t"
+            :"+v" (b6) :"v" (rk[r]));
+          __asm__ volatile ("vncipherlast %0, %0, %1\n\t"
+            :"+v" (b7) :"v" (rk[r]));
+
+          iv = iv7;
+
+          b0 = swap_if_le (b0) ^ iv0;
+          b1 = swap_if_le (b1) ^ iv1;
+          b2 = swap_if_le (b2) ^ iv2;
+          b3 = swap_if_le (b3) ^ iv3;
+          b4 = swap_if_le (b4) ^ iv4;
+          b5 = swap_if_le (b5) ^ iv5;
+          b6 = swap_if_le (b6) ^ iv6;
+          b7 = swap_if_le (b7) ^ iv7;
+
+          ctr ^= b0 ^ b1 ^ b2 ^ b3 ^ b4 ^ b5 ^ b6 ^ b7;
+
+          /* The unaligned store stxvb16x writes the register in
+           * big-endian byte order.  */
+          if ((uintptr_t)outbuf % 16 == 0)
+            {
+              vec_vsx_st (b0, 0, out++);
+              vec_vsx_st (b1, 0, out++);
+              vec_vsx_st (b2, 0, out++);
+              vec_vsx_st (b3, 0, out++);
+              vec_vsx_st (b4, 0, out++);
+              vec_vsx_st (b5, 0, out++);
+              vec_vsx_st (b6, 0, out++);
+              vec_vsx_st (b7, 0, out++);
+            }
+          else
+            {
+              b0 = swap_if_le (b0);
+              b1 = swap_if_le (b1);
+              b2 = swap_if_le (b2);
+              b3 = swap_if_le (b3);
+              b4 = swap_if_le (b4);
+              b5 = swap_if_le (b5);
+              b6 = swap_if_le (b6);
+              b7 = swap_if_le (b7);
+              __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
+                :: "wa" (b0), "r" (zero), "r" ((uintptr_t)(out++)));
+              __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
+                :: "wa" (b1), "r" (zero), "r" ((uintptr_t)(out++)));
+              __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
+                :: "wa" (b2), "r" (zero), "r" ((uintptr_t)(out++)));
+              __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
+                :: "wa" (b3), "r" (zero), "r" ((uintptr_t)(out++)));
+              __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
+                :: "wa" (b4), "r" (zero), "r" ((uintptr_t)(out++)));
+              __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
+                :: "wa" (b5), "r" (zero), "r" ((uintptr_t)(out++)));
+              __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
+                :: "wa" (b6), "r" (zero), "r" ((uintptr_t)(out++)));
+              __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
+                :: "wa" (b7), "r" (zero), "r" ((uintptr_t)(out++)));
+            }
+        }
+
+      for ( ; nblocks; nblocks-- )
+        {
+          block b;
+          u64 i = ++c->u_mode.ocb.data_nblocks;
+          const block l = *(block*)ocb_get_l (c, i);
+
+          /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+          iv ^= l;
+          if ((uintptr_t)in % 16 == 0)
+            {
+              b = vec_ld (0, in++);
+            }
+          else
+            {
+              block unalignedprevprev;
+              unalignedprevprev = unalignedprev;
+              unalignedprev = vec_ld (0, in++);
+              b = vec_perm (unalignedprevprev, unalignedprev, vec_lvsl (0, inbuf));
+            }
+          /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+          b ^= iv;
+          b = swap_if_le (b);
+          b = _gcry_aes_ppc8_decrypt_altivec (ctx, b);
+          b = swap_if_le (b) ^ iv;
+          /* Checksum_i = Checksum_{i-1} xor P_i */
+          ctr ^= b;
+          if ((uintptr_t)out % 16 == 0)
+            vec_vsx_st (b, 0, out++);
+          else
+            {
+              b = swap_if_le (b);
+              __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
+                :
+                : "wa" (b), "r" (zero), "r" ((uintptr_t)out++));
+            }
+        }
+
+      /* iv and ctr were loaded with vec_ld, which on little-endian
+       * yields the reverse byte order of what stxvb16x stores, so swap
+       * them back before writing them out.  */
+      iv = swap_if_le (iv);
+      __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
+        :: "wa" (iv), "r" (zero), "r" ((uintptr_t)&c->u_iv.iv));
+      ctr = swap_if_le (ctr);
+      __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
+        :: "wa" (ctr), "r" (zero), "r" ((uintptr_t)&c->u_ctr.ctr));
+    }
+  return 0;
+}
+
 extern int aes_p8_set_encrypt_key (const unsigned char *userKey, const int bits,
                                    RIJNDAEL_context *key);
 extern int aes_p8_set_decrypt_key (const unsigned char *userKey, const int bits,
@@ -297,7 +980,7 @@
   const RIJNDAEL_context *ctx = context;
   const uint64_t two32 = 1ULL << 32;
   int overflow;
-  u64 s[2], e[2];
+  u64 s[2];
   s[0] = buf_get_be64(ctr + 8);
   overflow = two32 - (s[0] % two32) < nblocks;
 #ifdef __builtin_expect
@@ -566,6 +1249,7 @@
       hd->bulk.cbc_enc = _gcry_aes_ppc8_cbc_enc;
       hd->bulk.xts_crypt = _gcry_aes_ppc8_xts_crypt;
       hd->bulk.ctr_enc = _gcry_aes_ppc8_ctr_enc;
+      hd->bulk.ocb_crypt = _gcry_aes_ppc8_ocb_crypt;
     }
 }
 #endif
@@ -1539,6 +2223,12 @@
       return _gcry_aes_armv8_ce_ocb_crypt (c, outbuf, inbuf, nblocks, encrypt);
     }
 #endif /*USE_ARM_CE*/
+#ifdef USE_PPC_ASM
+  else if (ctx->use_ppc_asm)
+    {
+      return _gcry_aes_ppc8_ocb_crypt (c, outbuf, inbuf, nblocks, encrypt);
+    }
+#endif /*USE_PPC_ASM*/
   else if (encrypt)
     {
       union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } l_tmp;
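
For reference, the new bulk routine is reached through the ordinary libgcrypt
OCB API once it is wired into the bulk-function table (last hunks above).
A minimal sketch of a caller, assuming a libgcrypt build that contains this
patch running on POWER8 hardware; the key, nonce and plaintext are
placeholders and error handling is omitted.  A 128-byte plaintext is eight
blocks, so it exercises the unrolled path:

    #include <stdio.h>
    #include <gcrypt.h>

    int main (void)
    {
      gcry_cipher_hd_t hd;
      unsigned char key[32] = { 0 };      /* demo key only */
      unsigned char nonce[12] = { 0 };    /* OCB nonce, 1..15 bytes */
      unsigned char pt[128] = { 0 }, ct[128], tag[16];
      int i;

      if (!gcry_check_version (GCRYPT_VERSION))
        return 1;
      gcry_control (GCRYCTL_INITIALIZATION_FINISHED, 0);

      gcry_cipher_open (&hd, GCRY_CIPHER_AES256, GCRY_CIPHER_MODE_OCB, 0);
      gcry_cipher_setkey (hd, key, sizeof key);
      gcry_cipher_setiv (hd, nonce, sizeof nonce);
      gcry_cipher_final (hd);             /* mark the last data chunk */
      gcry_cipher_encrypt (hd, ct, sizeof ct, pt, sizeof pt);
      gcry_cipher_gettag (hd, tag, sizeof tag);
      gcry_cipher_close (hd);

      for (i = 0; i < 16; i++)
        printf ("%02x", tag[i]);
      putchar ('\n');
      return 0;
    }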