diff --git a/cipher/rijndael-riscv-zvkned.c b/cipher/rijndael-riscv-zvkned.c
index e3ba6769..703950e1 100644
--- a/cipher/rijndael-riscv-zvkned.c
+++ b/cipher/rijndael-riscv-zvkned.c
@@ -1,1608 +1,1606 @@
/* rijndael-riscv-zvkned.c - RISC-V vector crypto implementation of AES
* Copyright (C) 2025 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*
* This file is part of Libgcrypt.
*
* Libgcrypt is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* Libgcrypt is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#if defined (__riscv) && \
defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS) && \
defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_CRYPTO_INTRINSICS)
#include "g10lib.h"
#include "simd-common-riscv.h"
#include "rijndael-internal.h"
#include "cipher-internal.h"
#include <riscv_vector.h>
#define ALWAYS_INLINE inline __attribute__((always_inline))
#define NO_INLINE __attribute__((noinline))
#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION
#define ASM_FUNC_ATTR_INLINE ALWAYS_INLINE ASM_FUNC_ATTR
#define ASM_FUNC_ATTR_NOINLINE NO_INLINE ASM_FUNC_ATTR
#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE
# define FUNC_ATTR_OPT_O2 __attribute__((optimize("-O2")))
#else
# define FUNC_ATTR_OPT_O2
#endif
/*
* Helper macro and functions
*/
#define cast_u8m1_u32m1(a) __riscv_vreinterpret_v_u8m1_u32m1(a)
#define cast_u8m1_u64m1(a) __riscv_vreinterpret_v_u8m1_u64m1(a)
#define cast_u32m1_u8m1(a) __riscv_vreinterpret_v_u32m1_u8m1(a)
#define cast_u32m1_u64m1(a) __riscv_vreinterpret_v_u32m1_u64m1(a)
#define cast_u64m1_u8m1(a) __riscv_vreinterpret_v_u64m1_u8m1(a)
#define cast_u8m2_u32m2(a) __riscv_vreinterpret_v_u8m2_u32m2(a)
#define cast_u32m2_u8m2(a) __riscv_vreinterpret_v_u32m2_u8m2(a)
#define cast_u8m4_u32m4(a) __riscv_vreinterpret_v_u8m4_u32m4(a)
#define cast_u32m4_u8m4(a) __riscv_vreinterpret_v_u32m4_u8m4(a)
#define cast_u64m1_u32m1(a) __riscv_vreinterpret_v_u64m1_u32m1(a)
#define cast_u64m1_i64m1(a) __riscv_vreinterpret_v_u64m1_i64m1(a)
#define cast_i64m1_u64m1(a) __riscv_vreinterpret_v_i64m1_u64m1(a)
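/* The empty asm below ties a vector register operand to a memory
 * clobber; it keeps preloaded vector constants pinned in registers so
 * the compiler cannot rematerialize the loads inside the block loops. */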
#define memory_barrier_with_vec(a) __asm__("" : "+vr"(a) :: "memory")
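/* Byte-reverse one 128-bit block (a vrgather through a reversed index
 * vector); used to convert the big-endian CTR counter to and from host
 * order for 64-bit lane arithmetic. */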
static ASM_FUNC_ATTR_INLINE vuint32m1_t
bswap128_u32m1(vuint32m1_t vec, size_t vl_u32)
{
static const byte bswap128_arr[16] =
{ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
size_t vl_bytes = vl_u32 * 4;
vuint8m1_t bswap128 = __riscv_vle8_v_u8m1(bswap128_arr, vl_bytes);
return cast_u8m1_u32m1(
__riscv_vrgather_vv_u8m1(cast_u32m1_u8m1(vec), bswap128, vl_bytes));
}
static ASM_FUNC_ATTR_INLINE vuint32m1_t
unaligned_load_u32m1(const void *ptr, size_t vl_u32)
{
size_t vl_bytes = vl_u32 * 4;
return cast_u8m1_u32m1(__riscv_vle8_v_u8m1(ptr, vl_bytes));
}
static ASM_FUNC_ATTR_INLINE void
unaligned_store_u32m1(void *ptr, vuint32m1_t vec, size_t vl_u32)
{
size_t vl_bytes = vl_u32 * 4;
__riscv_vse8_v_u8m1(ptr, cast_u32m1_u8m1(vec), vl_bytes);
}
static ASM_FUNC_ATTR_INLINE vuint32m4_t
unaligned_load_u32m4(const void *ptr, size_t vl_u32)
{
size_t vl_bytes = vl_u32 * 4;
return cast_u8m4_u32m4(__riscv_vle8_v_u8m4(ptr, vl_bytes));
}
static ASM_FUNC_ATTR_INLINE void
unaligned_store_u32m4(void *ptr, vuint32m4_t vec, size_t vl_u32)
{
size_t vl_bytes = vl_u32 * 4;
__riscv_vse8_v_u8m4(ptr, cast_u32m4_u8m4(vec), vl_bytes);
}
static ASM_FUNC_ATTR_INLINE vuint32m1_t
vxor_u8_u32m1(vuint32m1_t a, vuint32m1_t b, size_t vl_u32)
{
size_t vl_bytes = vl_u32 * 4;
return cast_u8m1_u32m1(__riscv_vxor_vv_u8m1(cast_u32m1_u8m1(a),
cast_u32m1_u8m1(b), vl_bytes));
}
static ASM_FUNC_ATTR_INLINE vuint32m4_t
vxor_u8_u32m4(vuint32m4_t a, vuint32m4_t b, size_t vl_u32)
{
size_t vl_bytes = vl_u32 * 4;
return cast_u8m4_u32m4(__riscv_vxor_vv_u8m4(cast_u32m4_u8m4(a),
cast_u32m4_u8m4(b), vl_bytes));
}
/*
* HW support detection
*/
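/* The Zvkned code needs a full 128-bit AES block per vector register:
 * vsetvl with e32/m1 must grant all four requested elements, i.e.
 * VLEN >= 128. */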
int ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2
_gcry_aes_riscv_zvkned_setup_acceleration(RIJNDAEL_context *ctx)
{
(void)ctx;
return (__riscv_vsetvl_e32m1(4) == 4);
}
/*
* Key expansion
*/
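/* vaeskf1 and vaeskf2 generate the AES-128 and AES-256 schedules
 * directly, four words per invocation; AES-192 needs the scalar help
 * below because its schedule advances six words per iteration. */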
static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
aes128_riscv_setkey (RIJNDAEL_context *ctx, const byte *key)
{
size_t vl = 4;
vuint32m1_t round_key = unaligned_load_u32m1 (key, vl);
__riscv_vse32_v_u32m1 (&ctx->keyschenc32[0][0], round_key, vl);
round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 1, vl);
__riscv_vse32_v_u32m1 (&ctx->keyschenc32[1][0], round_key, vl);
round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 2, vl);
__riscv_vse32_v_u32m1 (&ctx->keyschenc32[2][0], round_key, vl);
round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 3, vl);
__riscv_vse32_v_u32m1 (&ctx->keyschenc32[3][0], round_key, vl);
round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 4, vl);
__riscv_vse32_v_u32m1 (&ctx->keyschenc32[4][0], round_key, vl);
round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 5, vl);
__riscv_vse32_v_u32m1 (&ctx->keyschenc32[5][0], round_key, vl);
round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 6, vl);
__riscv_vse32_v_u32m1 (&ctx->keyschenc32[6][0], round_key, vl);
round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 7, vl);
__riscv_vse32_v_u32m1 (&ctx->keyschenc32[7][0], round_key, vl);
round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 8, vl);
__riscv_vse32_v_u32m1 (&ctx->keyschenc32[8][0], round_key, vl);
round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 9, vl);
__riscv_vse32_v_u32m1 (&ctx->keyschenc32[9][0], round_key, vl);
round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 10, vl);
__riscv_vse32_v_u32m1 (&ctx->keyschenc32[10][0], round_key, vl);
clear_vec_regs();
}
static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
aes192_riscv_setkey (RIJNDAEL_context *ctx, const byte *key)
{
size_t vl = 4;
u32 *w = &ctx->keyschenc32[0][0];
u32 wr;
vuint32m1_t rk_0_7;
vuint32m1_t rk_4_11;
rk_0_7 = unaligned_load_u32m1 (&key[0], vl);
rk_4_11 = unaligned_load_u32m1 (&key[8], vl);
__riscv_vse32_v_u32m1 (&w[0], rk_0_7, vl);
__riscv_vse32_v_u32m1 (&w[2], rk_4_11, vl);
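/* AES-192's six-word schedule does not map onto vaeskf1's four-word
 * rounds, so vaeskf1 is used only to compute SubWord(RotWord(w)) ^ rcon:
 * the input word is slid into element 3 of a zeroed vector (staying in
 * vector registers, with no stack temporary) and element 0 of the result
 * is extracted; the remaining schedule words are XOR-chained in scalar
 * code. */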
#define AES192_KF1_GEN(out, input, round192, vl) \
({ \
- u32 temp_array[4] = { 0, 0, 0, 0 }; \
- vuint32m1_t temp_vec; \
- temp_array[3] = (input); \
- temp_vec = __riscv_vle32_v_u32m1(temp_array, (vl)); \
+ vuint32m1_t temp_vec = __riscv_vmv_v_x_u32m1(0, (vl)); \
+ temp_vec = __riscv_vslide1down_vx_u32m1(temp_vec, (input), (vl)); \
temp_vec = __riscv_vaeskf1_vi_u32m1(temp_vec, (round192), (vl)); \
(out) = __riscv_vmv_x_s_u32m1_u32(temp_vec); \
})
#define AES192_EXPAND_BLOCK(w, round192, wr, last) \
({ \
(w)[(round192) * 6 + 0] = (w)[(round192) * 6 - 6] ^ (wr); \
(w)[(round192) * 6 + 1] = (w)[(round192) * 6 - 5] ^ (w)[(round192) * 6 + 0]; \
(w)[(round192) * 6 + 2] = (w)[(round192) * 6 - 4] ^ (w)[(round192) * 6 + 1]; \
(w)[(round192) * 6 + 3] = (w)[(round192) * 6 - 3] ^ (w)[(round192) * 6 + 2]; \
if (!(last)) \
{ \
(w)[(round192) * 6 + 4] = (w)[(round192) * 6 - 2] ^ (w)[(round192) * 6 + 3]; \
(w)[(round192) * 6 + 5] = (w)[(round192) * 6 - 1] ^ (w)[(round192) * 6 + 4]; \
} \
})
AES192_KF1_GEN(wr, w[5], 1, vl);
AES192_EXPAND_BLOCK(w, 1, wr, 0);
AES192_KF1_GEN(wr, w[11], 2, vl);
AES192_EXPAND_BLOCK(w, 2, wr, 0);
AES192_KF1_GEN(wr, w[17], 3, vl);
AES192_EXPAND_BLOCK(w, 3, wr, 0);
AES192_KF1_GEN(wr, w[23], 4, vl);
AES192_EXPAND_BLOCK(w, 4, wr, 0);
AES192_KF1_GEN(wr, w[29], 5, vl);
AES192_EXPAND_BLOCK(w, 5, wr, 0);
AES192_KF1_GEN(wr, w[35], 6, vl);
AES192_EXPAND_BLOCK(w, 6, wr, 0);
AES192_KF1_GEN(wr, w[41], 7, vl);
AES192_EXPAND_BLOCK(w, 7, wr, 0);
AES192_KF1_GEN(wr, w[47], 8, vl);
AES192_EXPAND_BLOCK(w, 8, wr, 1);
#undef AES192_KF1_GEN
#undef AES192_EXPAND_BLOCK
clear_vec_regs();
}
static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
aes256_riscv_setkey (RIJNDAEL_context *ctx, const byte *key)
{
size_t vl = 4;
vuint32m1_t rk_a = unaligned_load_u32m1 (&key[0], vl);
vuint32m1_t rk_b = unaligned_load_u32m1 (&key[16], vl);
__riscv_vse32_v_u32m1(&ctx->keyschenc32[0][0], rk_a, vl);
__riscv_vse32_v_u32m1(&ctx->keyschenc32[1][0], rk_b, vl);
rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 2, vl);
__riscv_vse32_v_u32m1(&ctx->keyschenc32[2][0], rk_a, vl);
rk_b = __riscv_vaeskf2_vi_u32m1(rk_b, rk_a, 3, vl);
__riscv_vse32_v_u32m1(&ctx->keyschenc32[3][0], rk_b, vl);
rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 4, vl);
__riscv_vse32_v_u32m1(&ctx->keyschenc32[4][0], rk_a, vl);
rk_b = __riscv_vaeskf2_vi_u32m1(rk_b, rk_a, 5, vl);
__riscv_vse32_v_u32m1(&ctx->keyschenc32[5][0], rk_b, vl);
rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 6, vl);
__riscv_vse32_v_u32m1(&ctx->keyschenc32[6][0], rk_a, vl);
rk_b = __riscv_vaeskf2_vi_u32m1(rk_b, rk_a, 7, vl);
__riscv_vse32_v_u32m1(&ctx->keyschenc32[7][0], rk_b, vl);
rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 8, vl);
__riscv_vse32_v_u32m1(&ctx->keyschenc32[8][0], rk_a, vl);
rk_b = __riscv_vaeskf2_vi_u32m1(rk_b, rk_a, 9, vl);
__riscv_vse32_v_u32m1(&ctx->keyschenc32[9][0], rk_b, vl);
rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 10, vl);
__riscv_vse32_v_u32m1(&ctx->keyschenc32[10][0], rk_a, vl);
rk_b = __riscv_vaeskf2_vi_u32m1(rk_b, rk_a, 11, vl);
__riscv_vse32_v_u32m1(&ctx->keyschenc32[11][0], rk_b, vl);
rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 12, vl);
__riscv_vse32_v_u32m1(&ctx->keyschenc32[12][0], rk_a, vl);
rk_b = __riscv_vaeskf2_vi_u32m1(rk_b, rk_a, 13, vl);
__riscv_vse32_v_u32m1(&ctx->keyschenc32[13][0], rk_b, vl);
rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 14, vl);
__riscv_vse32_v_u32m1(&ctx->keyschenc32[14][0], rk_a, vl);
clear_vec_regs();
}
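/* Dispatch on the round count (10/12/14 for AES-128/192/256); only the
 * AES-192 path spills scalar temporaries to the stack, hence the
 * _gcry_burn_stack call there. */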
void ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2
_gcry_aes_riscv_zvkned_setkey (RIJNDAEL_context *ctx, const byte *key)
{
unsigned int rounds = ctx->rounds;
if (rounds < 12)
{
aes128_riscv_setkey(ctx, key);
}
else if (rounds == 12)
{
aes192_riscv_setkey(ctx, key);
_gcry_burn_stack(64);
}
else
{
aes256_riscv_setkey(ctx, key);
}
}
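/* Zvkned's vaesdm applies the round key before InvMixColumns, so the
 * decryption schedule is simply the encryption round keys in reverse
 * order; no InvMixColumns transform of the keys is needed (unlike the
 * x86 AES-NI equivalent-inverse-cipher construction). */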
static ASM_FUNC_ATTR_INLINE void
do_prepare_decryption(RIJNDAEL_context *ctx)
{
u32 *ekey = (u32 *)(void *)ctx->keyschenc;
u32 *dkey = (u32 *)(void *)ctx->keyschdec;
int rounds = ctx->rounds;
size_t vl = 4;
int rr;
int r;
for (r = 0, rr = rounds; r <= rounds; r++, rr--)
{
__riscv_vse32_v_u32m1(dkey + r * 4,
__riscv_vle32_v_u32m1(ekey + rr * 4, vl),
vl);
}
}
void ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2
_gcry_aes_riscv_zvkned_prepare_decryption(RIJNDAEL_context *ctx)
{
do_prepare_decryption(ctx);
clear_vec_regs();
}
/*
* Encryption / Decryption
*/
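/* The whole key schedule is preloaded into vector registers once per
 * call; rk10..rk13 are left undefined for shorter keys, and AES_CRYPT
 * branches on the round count (10/12/14) at run time. */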
#define ROUND_KEY_VARIABLES \
vuint32m1_t rk0, rk1, rk2, rk3, rk4, rk5, rk6, rk7, rk8; \
vuint32m1_t rk9, rk10, rk11, rk12, rk13, rk_last;
#define PRELOAD_ROUND_KEYS(rk, nrounds, vl) \
do { \
rk0 = __riscv_vle32_v_u32m1(rk + 0 * 4, vl); \
rk1 = __riscv_vle32_v_u32m1(rk + 1 * 4, vl); \
rk2 = __riscv_vle32_v_u32m1(rk + 2 * 4, vl); \
rk3 = __riscv_vle32_v_u32m1(rk + 3 * 4, vl); \
rk4 = __riscv_vle32_v_u32m1(rk + 4 * 4, vl); \
rk5 = __riscv_vle32_v_u32m1(rk + 5 * 4, vl); \
rk6 = __riscv_vle32_v_u32m1(rk + 6 * 4, vl); \
rk7 = __riscv_vle32_v_u32m1(rk + 7 * 4, vl); \
rk8 = __riscv_vle32_v_u32m1(rk + 8 * 4, vl); \
rk9 = __riscv_vle32_v_u32m1(rk + 9 * 4, vl); \
if (UNLIKELY(nrounds >= 12)) \
{ \
rk10 = __riscv_vle32_v_u32m1(rk + 10 * 4, vl); \
rk11 = __riscv_vle32_v_u32m1(rk + 11 * 4, vl); \
if (LIKELY(nrounds > 12)) \
{ \
rk12 = __riscv_vle32_v_u32m1(rk + 12 * 4, vl); \
rk13 = __riscv_vle32_v_u32m1(rk + 13 * 4, vl); \
} \
else \
{ \
rk12 = __riscv_vundefined_u32m1(); \
rk13 = __riscv_vundefined_u32m1(); \
} \
} \
else \
{ \
rk10 = __riscv_vundefined_u32m1(); \
rk11 = __riscv_vundefined_u32m1(); \
rk12 = __riscv_vundefined_u32m1(); \
rk13 = __riscv_vundefined_u32m1(); \
} \
rk_last = __riscv_vle32_v_u32m1(rk + nrounds * 4, vl); \
} while (0)
#ifdef HAVE_BROKEN_VAES_VS_INTRINSIC
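/* Fallback for compilers whose vaes*.vs intrinsics are miscompiled:
 * issue the same round sequence as the intrinsic version in the #else
 * branch through inline assembly, branching over the AES-192/256-only
 * rounds at run time. */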
#define AES_CRYPT(e_d, mx, nrounds, blk, vlen) \
asm ( "vsetvli zero,%[vl],e32,"#mx",ta,ma;\n\t" \
"vaesz.vs %[block],%[rk0];\n\t" \
"vaes"#e_d"m.vs %[block],%[rk1];\n\t" \
"vaes"#e_d"m.vs %[block],%[rk2];\n\t" \
"vaes"#e_d"m.vs %[block],%[rk3];\n\t" \
"vaes"#e_d"m.vs %[block],%[rk4];\n\t" \
"vaes"#e_d"m.vs %[block],%[rk5];\n\t" \
"vaes"#e_d"m.vs %[block],%[rk6];\n\t" \
"vaes"#e_d"m.vs %[block],%[rk7];\n\t" \
"vaes"#e_d"m.vs %[block],%[rk8];\n\t" \
"vaes"#e_d"m.vs %[block],%[rk9];\n\t" \
"blt %[rounds],%[num12],.Lcryptlast%=;\n\t" \
"vaes"#e_d"m.vs %[block],%[rk10];\n\t" \
"vaes"#e_d"m.vs %[block],%[rk11];\n\t" \
"beq %[rounds],%[num12],.Lcryptlast%=;\n\t" \
"vaes"#e_d"m.vs %[block],%[rk12];\n\t" \
"vaes"#e_d"m.vs %[block],%[rk13];\n\t" \
".Lcryptlast%=:\n\t" \
"vaes"#e_d"f.vs %[block],%[rk_last];\n\t" \
: [block] "+vr" (blk) \
: [vl] "r" (vlen), [rounds] "r" (nrounds), [num12] "r" (12), \
[rk0] "vr" (rk0), [rk1] "vr" (rk1), [rk2] "vr" (rk2), \
[rk3] "vr" (rk3), [rk4] "vr" (rk4), [rk5] "vr" (rk5), \
[rk6] "vr" (rk6), [rk7] "vr" (rk7), [rk8] "vr" (rk8), \
[rk9] "vr" (rk9), [rk10] "vr" (rk10), [rk11] "vr" (rk11), \
[rk12] "vr" (rk12), [rk13] "vr" (rk13), \
[rk_last] "vr" (rk_last) \
: "vl")
#else
#define AES_CRYPT(e_d, mx, rounds, block, vl) \
({ \
(block) = __riscv_vaesz_vs_u32m1_u32##mx((block), rk0, (vl)); \
(block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk1, (vl)); \
(block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk2, (vl)); \
(block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk3, (vl)); \
(block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk4, (vl)); \
(block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk5, (vl)); \
(block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk6, (vl)); \
(block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk7, (vl)); \
(block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk8, (vl)); \
(block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk9, (vl)); \
if (UNLIKELY((rounds) >= 12)) \
{ \
(block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk10, (vl)); \
(block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk11, (vl)); \
if (LIKELY((rounds) > 12)) \
{ \
(block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk12, (vl)); \
(block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk13, (vl)); \
} \
} \
(block) = __riscv_vaes##e_d##f_vs_u32m1_u32##mx((block), rk_last, (vl)); \
})
#endif
unsigned int ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2
_gcry_aes_riscv_zvkned_encrypt (const RIJNDAEL_context *ctx, unsigned char *out,
const unsigned char *in)
{
const u32 *rk = ctx->keyschenc32[0];
int rounds = ctx->rounds;
size_t vl = 4;
vuint32m1_t block;
ROUND_KEY_VARIABLES;
PRELOAD_ROUND_KEYS (rk, rounds, vl);
block = unaligned_load_u32m1(in, vl);
AES_CRYPT(e, m1, rounds, block, vl);
unaligned_store_u32m1(out, block, vl);
clear_vec_regs();
return 0; /* does not use stack */
}
unsigned int ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2
_gcry_aes_riscv_zvkned_decrypt (const RIJNDAEL_context *ctx, unsigned char *out,
const unsigned char *in)
{
const u32 *rk = ctx->keyschdec32[0];
int rounds = ctx->rounds;
size_t vl = 4;
vuint32m1_t block;
ROUND_KEY_VARIABLES;
PRELOAD_ROUND_KEYS (rk, rounds, vl);
block = unaligned_load_u32m1(in, vl);
AES_CRYPT(d, m1, rounds, block, vl);
unaligned_store_u32m1(out, block, vl);
clear_vec_regs();
return 0; /* does not use stack */
}
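/* ECB has no chaining between blocks, so the bulk loop handles four
 * blocks per iteration at LMUL=4; the .vs forms broadcast the single
 * 128-bit round key group to all four block groups. */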
static ASM_FUNC_ATTR_INLINE void
aes_riscv_zvkned_ecb_crypt (void *context, void *outbuf_arg,
const void *inbuf_arg, size_t nblocks, int encrypt)
{
RIJNDAEL_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
const u32 *rk = encrypt ? ctx->keyschenc32[0] : ctx->keyschdec32[0];
int rounds = ctx->rounds;
size_t vl = 4;
ROUND_KEY_VARIABLES;
if (!encrypt && !ctx->decryption_prepared)
{
do_prepare_decryption(ctx);
ctx->decryption_prepared = 1;
}
PRELOAD_ROUND_KEYS (rk, rounds, vl);
for (; nblocks >= 4; nblocks -= 4)
{
vuint32m4_t blocks;
blocks = unaligned_load_u32m4(inbuf, vl * 4);
if (encrypt)
AES_CRYPT(e, m4, rounds, blocks, vl * 4);
else
AES_CRYPT(d, m4, rounds, blocks, vl * 4);
unaligned_store_u32m4(outbuf, blocks, vl * 4);
inbuf += 4 * BLOCKSIZE;
outbuf += 4 * BLOCKSIZE;
}
for (; nblocks; nblocks--)
{
vuint32m1_t block;
block = unaligned_load_u32m1(inbuf, vl);
if (encrypt)
AES_CRYPT(e, m1, rounds, block, vl);
else
AES_CRYPT(d, m1, rounds, block, vl);
unaligned_store_u32m1(outbuf, block, vl);
inbuf += BLOCKSIZE;
outbuf += BLOCKSIZE;
}
clear_vec_regs();
}
static void ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2
aes_riscv_zvkned_ecb_enc (void *context, void *outbuf_arg,
const void *inbuf_arg, size_t nblocks)
{
aes_riscv_zvkned_ecb_crypt (context, outbuf_arg, inbuf_arg, nblocks, 1);
}
static void ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2
aes_riscv_zvkned_ecb_dec (void *context, void *outbuf_arg,
const void *inbuf_arg, size_t nblocks)
{
aes_riscv_zvkned_ecb_crypt (context, outbuf_arg, inbuf_arg, nblocks, 0);
}
void ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2
_gcry_aes_riscv_zvkned_ecb_crypt (void *context, void *outbuf_arg,
const void *inbuf_arg, size_t nblocks,
int encrypt)
{
if (encrypt)
aes_riscv_zvkned_ecb_enc (context, outbuf_arg, inbuf_arg, nblocks);
else
aes_riscv_zvkned_ecb_dec (context, outbuf_arg, inbuf_arg, nblocks);
}
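/* CFB encryption is inherently serial: each keystream block is the
 * encryption of the previous ciphertext block, so only a single-block
 * loop is possible. */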
ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
_gcry_aes_riscv_zvkned_cfb_enc (void *context, unsigned char *iv_arg,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks)
{
RIJNDAEL_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
const u32 *rk = ctx->keyschenc32[0];
int rounds = ctx->rounds;
size_t vl = 4;
size_t vl_bytes = vl * 4;
vuint32m1_t iv;
ROUND_KEY_VARIABLES;
PRELOAD_ROUND_KEYS (rk, rounds, vl);
iv = unaligned_load_u32m1(iv_arg, vl);
for (; nblocks; nblocks--)
{
vuint8m1_t data = __riscv_vle8_v_u8m1(inbuf, vl_bytes);
AES_CRYPT(e, m1, rounds, iv, vl);
data = __riscv_vxor_vv_u8m1(cast_u32m1_u8m1(iv), data, vl_bytes);
__riscv_vse8_v_u8m1(outbuf, data, vl_bytes);
iv = cast_u8m1_u32m1(data);
outbuf += BLOCKSIZE;
inbuf += BLOCKSIZE;
}
unaligned_store_u32m1(iv_arg, iv, vl);
clear_vec_regs();
}
ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
_gcry_aes_riscv_zvkned_cbc_enc (void *context, unsigned char *iv_arg,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks, int cbc_mac)
{
RIJNDAEL_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
size_t outbuf_add = (!cbc_mac) * BLOCKSIZE;
const u32 *rk = ctx->keyschenc32[0];
int rounds = ctx->rounds;
size_t vl = 4;
size_t vl_bytes = vl * 4;
vuint32m1_t iv;
ROUND_KEY_VARIABLES;
PRELOAD_ROUND_KEYS (rk, rounds, vl);
iv = unaligned_load_u32m1(iv_arg, vl);
for (; nblocks; nblocks--)
{
vuint8m1_t data = __riscv_vle8_v_u8m1(inbuf, vl_bytes);
iv = cast_u8m1_u32m1(
__riscv_vxor_vv_u8m1(data, cast_u32m1_u8m1(iv), vl_bytes));
AES_CRYPT(e, m1, rounds, iv, vl);
__riscv_vse8_v_u8m1(outbuf, cast_u32m1_u8m1(iv), vl_bytes);
inbuf += BLOCKSIZE;
outbuf += outbuf_add;
}
unaligned_store_u32m1(iv_arg, iv, vl);
clear_vec_regs();
}
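/* CTR mode with a 128-bit big-endian counter.  The fast path increments
 * the counter with plain byte-wise additions of the precomputed +1..+4
 * constants; when the low counter byte wraps, a slow path byte-swaps the
 * counter and redoes the additions in 64-bit lanes with explicit carry
 * propagation. */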
ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
_gcry_aes_riscv_zvkned_ctr_enc (void *context, unsigned char *ctr_arg,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks)
{
static const byte add_u8_array[4][16] =
{
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 },
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 },
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 }
};
static const u64 carry_add[2] = { 1, 1 };
static const u64 nocarry_add[2] = { 1, 0 };
RIJNDAEL_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
const u32 *rk = ctx->keyschenc32[0];
int rounds = ctx->rounds;
size_t vl = 4;
size_t vl_bytes = vl * 4;
u64 ctrlow;
vuint32m1_t ctr;
vuint8m1_t add1;
ROUND_KEY_VARIABLES;
PRELOAD_ROUND_KEYS (rk, rounds, vl);
add1 = __riscv_vle8_v_u8m1(add_u8_array[0], vl_bytes);
ctr = unaligned_load_u32m1(ctr_arg, vl);
ctrlow = __riscv_vmv_x_s_u64m1_u64(cast_u32m1_u64m1(bswap128_u32m1(ctr, vl)));
memory_barrier_with_vec(add1);
if (nblocks >= 4)
{
vuint8m1_t add2 = __riscv_vle8_v_u8m1(add_u8_array[1], vl_bytes);
vuint8m1_t add3 = __riscv_vle8_v_u8m1(add_u8_array[2], vl_bytes);
vuint8m1_t add4 = __riscv_vle8_v_u8m1(add_u8_array[3], vl_bytes);
memory_barrier_with_vec(add2);
memory_barrier_with_vec(add3);
memory_barrier_with_vec(add4);
for (; nblocks >= 4; nblocks -= 4)
{
vuint8m4_t data4blks;
vuint32m4_t ctr4blks;
/* detect if 8-bit carry handling is needed */
if (UNLIKELY(((ctrlow += 4) & 0xff) <= 3))
{
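/* After "ctrlow += 4", a low byte of 0..3 means the 8-bit wrap happened
 * within these four blocks.  If the full 64-bit ctrlow wrapped as well
 * (ctrlow <= 3), idx selects which of the four increments must carry
 * into the high 64-bit lane (carry_add adds 1 to both lanes); idx == 4
 * covers a plain byte wrap that the 64-bit addition handles by itself. */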
static const u64 *adders[5][4] =
{
{ nocarry_add, nocarry_add, nocarry_add, carry_add },
{ nocarry_add, nocarry_add, carry_add, nocarry_add },
{ nocarry_add, carry_add, nocarry_add, nocarry_add },
{ carry_add, nocarry_add, nocarry_add, nocarry_add },
{ nocarry_add, nocarry_add, nocarry_add, nocarry_add }
};
unsigned int idx = ctrlow <= 3 ? ctrlow : 4;
vuint64m1_t ctr_u64;
vuint32m1_t ctr_u32_1;
vuint32m1_t ctr_u32_2;
vuint32m1_t ctr_u32_3;
vuint32m1_t ctr_u32_4;
vuint64m1_t add_u64;
/* Byte swap counter */
ctr_u64 = cast_u32m1_u64m1(bswap128_u32m1(ctr, vl));
/* Addition with carry handling */
add_u64 = __riscv_vle64_v_u64m1(adders[idx][0], vl / 2);
ctr_u64 = __riscv_vadd_vv_u64m1(ctr_u64, add_u64, vl / 2);
ctr_u32_1 = cast_u64m1_u32m1(ctr_u64);
add_u64 = __riscv_vle64_v_u64m1(adders[idx][1], vl / 2);
ctr_u64 = __riscv_vadd_vv_u64m1(ctr_u64, add_u64, vl / 2);
ctr_u32_2 = cast_u64m1_u32m1(ctr_u64);
add_u64 = __riscv_vle64_v_u64m1(adders[idx][2], vl / 2);
ctr_u64 = __riscv_vadd_vv_u64m1(ctr_u64, add_u64, vl / 2);
ctr_u32_3 = cast_u64m1_u32m1(ctr_u64);
add_u64 = __riscv_vle64_v_u64m1(adders[idx][3], vl / 2);
ctr_u64 = __riscv_vadd_vv_u64m1(ctr_u64, add_u64, vl / 2);
ctr_u32_4 = cast_u64m1_u32m1(ctr_u64);
/* Byte swap counters */
ctr_u32_1 = bswap128_u32m1(ctr_u32_1, vl);
ctr_u32_2 = bswap128_u32m1(ctr_u32_2, vl);
ctr_u32_3 = bswap128_u32m1(ctr_u32_3, vl);
ctr_u32_4 = bswap128_u32m1(ctr_u32_4, vl);
ctr4blks = __riscv_vundefined_u32m4();
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 0, ctr);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 1, ctr_u32_1);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 2, ctr_u32_2);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 3, ctr_u32_3);
ctr = ctr_u32_4;
}
else
{
/* Fast path addition without carry handling */
vuint8m1_t ctr_u8 = cast_u32m1_u8m1(ctr);
vuint8m1_t ctr1 = __riscv_vadd_vv_u8m1(ctr_u8, add1, vl_bytes);
vuint8m1_t ctr2 = __riscv_vadd_vv_u8m1(ctr_u8, add2, vl_bytes);
vuint8m1_t ctr3 = __riscv_vadd_vv_u8m1(ctr_u8, add3, vl_bytes);
vuint8m4_t ctr0123_u8 = __riscv_vundefined_u8m4();
ctr = cast_u8m1_u32m1(__riscv_vadd_vv_u8m1(ctr_u8, add4,
vl_bytes));
ctr0123_u8 = __riscv_vset_v_u8m1_u8m4(ctr0123_u8, 0, ctr_u8);
ctr0123_u8 = __riscv_vset_v_u8m1_u8m4(ctr0123_u8, 1, ctr1);
ctr0123_u8 = __riscv_vset_v_u8m1_u8m4(ctr0123_u8, 2, ctr2);
ctr0123_u8 = __riscv_vset_v_u8m1_u8m4(ctr0123_u8, 3, ctr3);
ctr4blks = cast_u8m4_u32m4(ctr0123_u8);
}
data4blks = __riscv_vle8_v_u8m4(inbuf, vl_bytes * 4);
AES_CRYPT(e, m4, rounds, ctr4blks, vl * 4);
data4blks = __riscv_vxor_vv_u8m4(cast_u32m4_u8m4(ctr4blks), data4blks,
vl_bytes * 4);
__riscv_vse8_v_u8m4(outbuf, data4blks, vl_bytes * 4);
inbuf += 4 * BLOCKSIZE;
outbuf += 4 * BLOCKSIZE;
}
}
for (; nblocks; nblocks--)
{
vuint32m1_t block = ctr;
vuint8m1_t data = __riscv_vle8_v_u8m1(inbuf, vl_bytes);
/* detect if 8-bit carry handling is needed */
if (UNLIKELY((++ctrlow & 0xff) == 0))
{
const u64 *add_arr = UNLIKELY(ctrlow == 0) ? carry_add : nocarry_add;
vuint64m1_t add_val = __riscv_vle64_v_u64m1(add_arr, vl / 2);
/* Byte swap counter */
ctr = bswap128_u32m1(ctr, vl);
/* Addition with carry handling */
ctr = cast_u64m1_u32m1(__riscv_vadd_vv_u64m1(cast_u32m1_u64m1(ctr),
add_val, vl / 2));
/* Byte swap counter */
ctr = bswap128_u32m1(ctr, vl);
}
else
{
/* Fast path addition without carry handling */
ctr = cast_u8m1_u32m1(__riscv_vadd_vv_u8m1(cast_u32m1_u8m1(ctr),
add1, vl_bytes));
}
AES_CRYPT(e, m1, rounds, block, vl);
data = __riscv_vxor_vv_u8m1(cast_u32m1_u8m1(block), data, vl_bytes);
__riscv_vse8_v_u8m1(outbuf, data, vl_bytes);
inbuf += BLOCKSIZE;
outbuf += BLOCKSIZE;
}
unaligned_store_u32m1(ctr_arg, ctr, vl);
clear_vec_regs();
}
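/* Little-endian 32-bit counter (as used by GCM-SIV): only the low u32
 * word is incremented, wrapping mod 2^32, so a plain 32-bit vadd on
 * element 0 suffices and no carry handling is needed. */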
ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
_gcry_aes_riscv_zvkned_ctr32le_enc (void *context, unsigned char *ctr_arg,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks)
{
static const u32 add_u32_array[4][4] =
{
{ 1, }, { 2, }, { 3, }, { 4, }
};
RIJNDAEL_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
const u32 *rk = ctx->keyschenc32[0];
int rounds = ctx->rounds;
size_t vl = 4;
size_t vl_bytes = vl * 4;
vuint32m1_t ctr;
vuint32m1_t add1;
ROUND_KEY_VARIABLES;
PRELOAD_ROUND_KEYS (rk, rounds, vl);
add1 = __riscv_vle32_v_u32m1(add_u32_array[0], vl);
ctr = unaligned_load_u32m1(ctr_arg, vl);
memory_barrier_with_vec(add1);
if (nblocks >= 4)
{
vuint32m1_t add2 = __riscv_vle32_v_u32m1(add_u32_array[1], vl);
vuint32m1_t add3 = __riscv_vle32_v_u32m1(add_u32_array[2], vl);
vuint32m1_t add4 = __riscv_vle32_v_u32m1(add_u32_array[3], vl);
memory_barrier_with_vec(add2);
memory_barrier_with_vec(add3);
memory_barrier_with_vec(add4);
for (; nblocks >= 4; nblocks -= 4)
{
vuint32m1_t ctr1 = __riscv_vadd_vv_u32m1(ctr, add1, vl);
vuint32m1_t ctr2 = __riscv_vadd_vv_u32m1(ctr, add2, vl);
vuint32m1_t ctr3 = __riscv_vadd_vv_u32m1(ctr, add3, vl);
vuint32m4_t ctr4blks = __riscv_vundefined_u32m4();
vuint8m4_t data4blks;
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 0, ctr);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 1, ctr1);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 2, ctr2);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 3, ctr3);
ctr = __riscv_vadd_vv_u32m1(ctr, add4, vl);
data4blks = __riscv_vle8_v_u8m4(inbuf, vl_bytes * 4);
AES_CRYPT(e, m4, rounds, ctr4blks, vl * 4);
data4blks = __riscv_vxor_vv_u8m4(cast_u32m4_u8m4(ctr4blks), data4blks,
vl_bytes * 4);
__riscv_vse8_v_u8m4(outbuf, data4blks, vl_bytes * 4);
inbuf += 4 * BLOCKSIZE;
outbuf += 4 * BLOCKSIZE;
}
}
for (; nblocks; nblocks--)
{
vuint32m1_t block = ctr;
vuint8m1_t data = __riscv_vle8_v_u8m1(inbuf, vl_bytes);
ctr = __riscv_vadd_vv_u32m1(ctr, add1, vl);
AES_CRYPT(e, m1, rounds, block, vl);
data = __riscv_vxor_vv_u8m1(cast_u32m1_u8m1(block), data, vl_bytes);
__riscv_vse8_v_u8m1(outbuf, data, vl_bytes);
inbuf += BLOCKSIZE;
outbuf += BLOCKSIZE;
}
unaligned_store_u32m1(ctr_arg, ctr, vl);
clear_vec_regs();
}
ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
_gcry_aes_riscv_zvkned_cfb_dec (void *context, unsigned char *iv_arg,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks)
{
RIJNDAEL_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
const u32 *rk = ctx->keyschenc32[0];
int rounds = ctx->rounds;
size_t vl = 4;
vuint32m1_t iv;
ROUND_KEY_VARIABLES;
PRELOAD_ROUND_KEYS (rk, rounds, vl);
iv = unaligned_load_u32m1(iv_arg, vl);
for (; nblocks >= 4; nblocks -= 4)
{
vuint32m4_t data4blks = unaligned_load_u32m4(inbuf, vl * 4);
vuint32m1_t iv1 = __riscv_vget_v_u32m4_u32m1(data4blks, 0);
vuint32m1_t iv2 = __riscv_vget_v_u32m4_u32m1(data4blks, 1);
vuint32m1_t iv3 = __riscv_vget_v_u32m4_u32m1(data4blks, 2);
vuint32m1_t iv4 = __riscv_vget_v_u32m4_u32m1(data4blks, 3);
vuint32m4_t iv4blks = __riscv_vundefined_u32m4();
iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 0, iv);
iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 1, iv1);
iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 2, iv2);
iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 3, iv3);
iv = iv4;
AES_CRYPT(e, m4, rounds, iv4blks, vl * 4);
data4blks = vxor_u8_u32m4(iv4blks, data4blks, vl * 4);
unaligned_store_u32m4(outbuf, data4blks, vl * 4);
inbuf += 4 * BLOCKSIZE;
outbuf += 4 * BLOCKSIZE;
}
for (; nblocks; nblocks--)
{
vuint32m1_t data = unaligned_load_u32m1(inbuf, vl);
vuint32m1_t new_iv = data;
AES_CRYPT(e, m1, rounds, iv, vl);
data = vxor_u8_u32m1(iv, data, vl);
unaligned_store_u32m1(outbuf, data, vl);
iv = new_iv;
inbuf += BLOCKSIZE;
outbuf += BLOCKSIZE;
}
unaligned_store_u32m1(iv_arg, iv, vl);
clear_vec_regs();
}
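/* CBC decryption parallelizes: four ciphertext blocks are deciphered at
 * once and XORed against the preceding ciphertexts, which are extracted
 * from the loaded input vector before it is overwritten. */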
ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
_gcry_aes_riscv_zvkned_cbc_dec (void *context, unsigned char *iv_arg,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks)
{
RIJNDAEL_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
const u32 *rk = ctx->keyschdec32[0];
int rounds = ctx->rounds;
size_t vl = 4;
vuint32m1_t iv;
ROUND_KEY_VARIABLES;
if (!ctx->decryption_prepared)
{
do_prepare_decryption(ctx);
ctx->decryption_prepared = 1;
}
PRELOAD_ROUND_KEYS (rk, rounds, vl);
iv = unaligned_load_u32m1(iv_arg, vl);
for (; nblocks >= 4; nblocks -= 4)
{
vuint32m4_t data4blks = unaligned_load_u32m4(inbuf, vl * 4);
vuint32m1_t iv1 = __riscv_vget_v_u32m4_u32m1(data4blks, 0);
vuint32m1_t iv2 = __riscv_vget_v_u32m4_u32m1(data4blks, 1);
vuint32m1_t iv3 = __riscv_vget_v_u32m4_u32m1(data4blks, 2);
vuint32m1_t iv4 = __riscv_vget_v_u32m4_u32m1(data4blks, 3);
vuint32m4_t iv4blks = __riscv_vundefined_u32m4();
iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 0, iv);
iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 1, iv1);
iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 2, iv2);
iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 3, iv3);
AES_CRYPT(d, m4, rounds, data4blks, vl * 4);
data4blks = vxor_u8_u32m4(iv4blks, data4blks, vl * 4);
unaligned_store_u32m4(outbuf, data4blks, vl * 4);
iv = iv4;
inbuf += 4 * BLOCKSIZE;
outbuf += 4 * BLOCKSIZE;
}
for (; nblocks; nblocks--)
{
vuint32m1_t data = unaligned_load_u32m1(inbuf, vl);
vuint32m1_t new_iv = data;
AES_CRYPT(d, m1, rounds, data, vl);
data = vxor_u8_u32m1(iv, data, vl);
unaligned_store_u32m1(outbuf, data, vl);
iv = new_iv;
inbuf += BLOCKSIZE;
outbuf += BLOCKSIZE;
}
unaligned_store_u32m1(iv_arg, iv, vl);
clear_vec_regs();
}
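/* OCB bulk path: the checksum accumulates in four independent 128-bit
 * lanes of ctr4blks and is folded into one lane after the loop; the
 * offsets are chained one block at a time (each Offset_i depends on the
 * previous one) and collected into an m4 register for the XORs. */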
static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 size_t
aes_riscv_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
const void *inbuf_arg, size_t nblocks)
{
RIJNDAEL_context *ctx = (void *)&c->context.c;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
u64 n = c->u_mode.ocb.data_nblocks;
const u32 *rk = ctx->keyschenc32[0];
int rounds = ctx->rounds;
size_t vl = 4;
size_t vl_bytes = vl * 4;
vuint32m1_t iv;
vuint32m1_t ctr;
ROUND_KEY_VARIABLES;
PRELOAD_ROUND_KEYS (rk, rounds, vl);
/* Preload Offset and Checksum */
iv = unaligned_load_u32m1(c->u_iv.iv, vl);
ctr = unaligned_load_u32m1(c->u_ctr.ctr, vl);
if (nblocks >= 4)
{
vuint32m4_t ctr4blks = __riscv_vundefined_u32m4();
vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0, vl);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 0, ctr);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 1, zero);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 2, zero);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 3, zero);
for (; nblocks >= 4; nblocks -= 4)
{
const unsigned char *l;
vuint8m1_t l_ntzi;
vuint32m4_t data4blks = unaligned_load_u32m4(inbuf, vl * 4);
vuint32m4_t offsets = __riscv_vundefined_u32m4();
/* Checksum_i = Checksum_{i-1} xor P_i */
ctr4blks = vxor_u8_u32m4(ctr4blks, data4blks, vl * 4);
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
l = ocb_get_l(c, ++n);
l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
offsets = __riscv_vset_v_u32m1_u32m4(offsets, 0, iv);
l = ocb_get_l(c, ++n);
l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
offsets = __riscv_vset_v_u32m1_u32m4(offsets, 1, iv);
l = ocb_get_l(c, ++n);
l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
offsets = __riscv_vset_v_u32m1_u32m4(offsets, 2, iv);
l = ocb_get_l(c, ++n);
l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
offsets = __riscv_vset_v_u32m1_u32m4(offsets, 3, iv);
data4blks = vxor_u8_u32m4(offsets, data4blks, vl * 4);
AES_CRYPT(e, m4, rounds, data4blks, vl * 4);
data4blks = vxor_u8_u32m4(offsets, data4blks, vl * 4);
unaligned_store_u32m4(outbuf, data4blks, vl * 4);
inbuf += 4 * BLOCKSIZE;
outbuf += 4 * BLOCKSIZE;
}
/* Checksum_i = Checksum_{i-1} xor P_i */
ctr = vxor_u8_u32m1(__riscv_vget_v_u32m4_u32m1(ctr4blks, 0),
__riscv_vget_v_u32m4_u32m1(ctr4blks, 1), vl);
ctr = vxor_u8_u32m1(ctr, __riscv_vget_v_u32m4_u32m1(ctr4blks, 2), vl);
ctr = vxor_u8_u32m1(ctr, __riscv_vget_v_u32m4_u32m1(ctr4blks, 3), vl);
}
for (; nblocks; nblocks--)
{
const unsigned char *l;
vuint8m1_t l_ntzi;
vuint32m1_t data;
data = unaligned_load_u32m1(inbuf, vl);
/* Checksum_i = Checksum_{i-1} xor P_i */
ctr = vxor_u8_u32m1(ctr, data, vl);
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
l = ocb_get_l(c, ++n);
l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
data = vxor_u8_u32m1(data, iv, vl);
AES_CRYPT(e, m1, rounds, data, vl);
data = vxor_u8_u32m1(iv, data, vl);
unaligned_store_u32m1(outbuf, data, vl);
inbuf += BLOCKSIZE;
outbuf += BLOCKSIZE;
}
c->u_mode.ocb.data_nblocks = n;
unaligned_store_u32m1(c->u_iv.iv, iv, vl);
unaligned_store_u32m1(c->u_ctr.ctr, ctr, vl);
clear_vec_regs();
return 0;
}
static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 size_t
aes_riscv_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
const void *inbuf_arg, size_t nblocks)
{
RIJNDAEL_context *ctx = (void *)&c->context.c;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
u64 n = c->u_mode.ocb.data_nblocks;
const u32 *rk = ctx->keyschdec32[0];
int rounds = ctx->rounds;
size_t vl = 4;
size_t vl_bytes = vl * 4;
vuint32m1_t iv;
vuint32m1_t ctr;
ROUND_KEY_VARIABLES;
if (!ctx->decryption_prepared)
{
do_prepare_decryption(ctx);
ctx->decryption_prepared = 1;
}
PRELOAD_ROUND_KEYS (rk, rounds, vl);
/* Preload Offset and Checksum */
iv = unaligned_load_u32m1(c->u_iv.iv, vl);
ctr = unaligned_load_u32m1(c->u_ctr.ctr, vl);
if (nblocks >= 4)
{
vuint32m4_t ctr4blks = __riscv_vundefined_u32m4();
vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0, vl);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 0, ctr);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 1, zero);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 2, zero);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 3, zero);
for (; nblocks >= 4; nblocks -= 4)
{
const unsigned char *l;
vuint8m1_t l_ntzi;
vuint32m4_t data4blks = unaligned_load_u32m4(inbuf, vl * 4);
vuint32m4_t offsets = __riscv_vundefined_u32m4();
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* P_i = Offset_i xor ENCIPHER(K, C_i xor Offset_i) */
l = ocb_get_l(c, ++n);
l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
offsets = __riscv_vset_v_u32m1_u32m4(offsets, 0, iv);
l = ocb_get_l(c, ++n);
l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
offsets = __riscv_vset_v_u32m1_u32m4(offsets, 1, iv);
l = ocb_get_l(c, ++n);
l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
offsets = __riscv_vset_v_u32m1_u32m4(offsets, 2, iv);
l = ocb_get_l(c, ++n);
l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
offsets = __riscv_vset_v_u32m1_u32m4(offsets, 3, iv);
data4blks = vxor_u8_u32m4(offsets, data4blks, vl * 4);
AES_CRYPT(d, m4, rounds, data4blks, vl * 4);
data4blks = vxor_u8_u32m4(offsets, data4blks, vl * 4);
unaligned_store_u32m4(outbuf, data4blks, vl * 4);
/* Checksum_i = Checksum_{i-1} xor P_i */
ctr4blks = vxor_u8_u32m4(ctr4blks, data4blks, vl * 4);
inbuf += 4 * BLOCKSIZE;
outbuf += 4 * BLOCKSIZE;
}
/* Checksum_i = Checksum_{i-1} xor P_i */
ctr = vxor_u8_u32m1(__riscv_vget_v_u32m4_u32m1(ctr4blks, 0),
__riscv_vget_v_u32m4_u32m1(ctr4blks, 1), vl);
ctr = vxor_u8_u32m1(ctr, __riscv_vget_v_u32m4_u32m1(ctr4blks, 2), vl);
ctr = vxor_u8_u32m1(ctr, __riscv_vget_v_u32m4_u32m1(ctr4blks, 3), vl);
}
for (; nblocks; nblocks--)
{
const unsigned char *l;
vuint8m1_t l_ntzi;
vuint8m1_t data;
vuint32m1_t block;
l = ocb_get_l(c, ++n);
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
data = __riscv_vle8_v_u8m1(inbuf, vl_bytes);
iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
data = __riscv_vxor_vv_u8m1(data, cast_u32m1_u8m1(iv), vl_bytes);
block = cast_u8m1_u32m1(data);
AES_CRYPT(d, m1, rounds, block, vl);
block = vxor_u8_u32m1(iv, block, vl);
unaligned_store_u32m1(outbuf, block, vl);
/* Checksum_i = Checksum_{i-1} xor P_i */
ctr = vxor_u8_u32m1(ctr, block, vl);
inbuf += BLOCKSIZE;
outbuf += BLOCKSIZE;
}
c->u_mode.ocb.data_nblocks = n;
unaligned_store_u32m1(c->u_iv.iv, iv, vl);
unaligned_store_u32m1(c->u_ctr.ctr, ctr, vl);
clear_vec_regs();
return 0;
}
size_t ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2
_gcry_aes_riscv_zvkned_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
const void *inbuf_arg, size_t nblocks,
int encrypt)
{
if (encrypt)
return aes_riscv_ocb_enc(c, outbuf_arg, inbuf_arg, nblocks);
else
return aes_riscv_ocb_dec(c, outbuf_arg, inbuf_arg, nblocks);
}
size_t ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2
_gcry_aes_riscv_zvkned_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
size_t nblocks)
{
RIJNDAEL_context *ctx = (void *)&c->context.c;
const unsigned char *abuf = abuf_arg;
u64 n = c->u_mode.ocb.aad_nblocks;
const u32 *rk = ctx->keyschenc32[0];
int rounds = ctx->rounds;
size_t vl = 4;
size_t vl_bytes = vl * 4;
vuint32m1_t iv;
vuint32m1_t ctr;
ROUND_KEY_VARIABLES;
PRELOAD_ROUND_KEYS (rk, rounds, vl);
/* Preload Offset and Sum */
iv = unaligned_load_u32m1(c->u_mode.ocb.aad_offset, vl);
ctr = unaligned_load_u32m1(c->u_mode.ocb.aad_sum, vl);
if (nblocks >= 4)
{
vuint32m4_t ctr4blks = __riscv_vundefined_u32m4();
vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0, vl);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 0, ctr);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 1, zero);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 2, zero);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 3, zero);
for (; nblocks >= 4; nblocks -= 4)
{
const unsigned char *l;
vuint8m1_t l_ntzi;
vuint32m4_t data4blks = unaligned_load_u32m4(abuf, vl * 4);
vuint32m4_t offsets = __riscv_vundefined_u32m4();
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
l = ocb_get_l(c, ++n);
l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
offsets = __riscv_vset_v_u32m1_u32m4(offsets, 0, iv);
l = ocb_get_l(c, ++n);
l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
offsets = __riscv_vset_v_u32m1_u32m4(offsets, 1, iv);
l = ocb_get_l(c, ++n);
l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
offsets = __riscv_vset_v_u32m1_u32m4(offsets, 2, iv);
l = ocb_get_l(c, ++n);
l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
offsets = __riscv_vset_v_u32m1_u32m4(offsets, 3, iv);
data4blks = vxor_u8_u32m4(offsets, data4blks, vl * 4);
AES_CRYPT(e, m4, rounds, data4blks, vl * 4);
ctr4blks = vxor_u8_u32m4(ctr4blks, data4blks, vl * 4);
abuf += 4 * BLOCKSIZE;
}
/* Checksum_i = Checksum_{i-1} xor P_i */
ctr = vxor_u8_u32m1(__riscv_vget_v_u32m4_u32m1(ctr4blks, 0),
__riscv_vget_v_u32m4_u32m1(ctr4blks, 1), vl);
ctr = vxor_u8_u32m1(ctr, __riscv_vget_v_u32m4_u32m1(ctr4blks, 2), vl);
ctr = vxor_u8_u32m1(ctr, __riscv_vget_v_u32m4_u32m1(ctr4blks, 3), vl);
}
for (; nblocks; nblocks--)
{
const unsigned char *l;
vuint8m1_t l_ntzi;
vuint32m1_t data;
data = unaligned_load_u32m1(abuf, vl);
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
l = ocb_get_l(c, ++n);
l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
data = vxor_u8_u32m1(data, iv, vl);
AES_CRYPT(e, m1, rounds, data, vl);
ctr = vxor_u8_u32m1(ctr, data, vl);
abuf += BLOCKSIZE;
}
c->u_mode.ocb.aad_nblocks = n;
unaligned_store_u32m1(c->u_mode.ocb.aad_offset, iv, vl);
unaligned_store_u32m1(c->u_mode.ocb.aad_sum, ctr, vl);
clear_vec_regs();
return 0;
}
static const u64 xts_gfmul_const[2] = { 0x87, 0x01 };
static const u64 xts_swap64_const[2] = { 1, 0 };
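/* Multiply the 128-bit tweak by x in GF(2^128) with the XTS polynomial
 * x^128 + x^7 + x^2 + x + 1: both 64-bit lanes are shifted left by one
 * (vadd of the value with itself), while the vrgather/vsra pair builds
 * all-ones masks from the opposite lane's sign bit so that a single vand
 * supplies both the cross-lane carry (0x01 into the high lane) and the
 * 0x87 reduction (into the low lane). */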
static ASM_FUNC_ATTR_INLINE vuint32m1_t
xts_gfmul_byA (vuint32m1_t vec_in, vuint64m1_t xts_gfmul,
vuint64m1_t xts_swap64, size_t vl)
{
vuint64m1_t in_u64 = cast_u32m1_u64m1(vec_in);
vuint64m1_t tmp1;
tmp1 =
__riscv_vrgather_vv_u64m1(cast_u32m1_u64m1(vec_in), xts_swap64, vl / 2);
tmp1 = cast_i64m1_u64m1(
__riscv_vsra_vx_i64m1(cast_u64m1_i64m1(tmp1), 63, vl / 2));
in_u64 = __riscv_vadd_vv_u64m1(in_u64, in_u64, vl / 2);
tmp1 = __riscv_vand_vv_u64m1(tmp1, xts_gfmul, vl / 2);
return cast_u64m1_u32m1(__riscv_vxor_vv_u64m1(in_u64, tmp1, vl / 2));
}
static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
aes_riscv_xts_enc (void *context, unsigned char *tweak_arg, void *outbuf_arg,
const void *inbuf_arg, size_t nblocks)
{
RIJNDAEL_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
const u32 *rk = ctx->keyschenc32[0];
int rounds = ctx->rounds;
size_t vl = 4;
vuint32m1_t tweak;
vuint64m1_t xts_gfmul = __riscv_vle64_v_u64m1(xts_gfmul_const, vl / 2);
vuint64m1_t xts_swap64 = __riscv_vle64_v_u64m1(xts_swap64_const, vl / 2);
ROUND_KEY_VARIABLES;
PRELOAD_ROUND_KEYS (rk, rounds, vl);
/* Preload tweak */
tweak = unaligned_load_u32m1(tweak_arg, vl);
memory_barrier_with_vec(xts_gfmul);
memory_barrier_with_vec(xts_swap64);
for (; nblocks >= 4; nblocks -= 4)
{
vuint32m4_t data4blks = unaligned_load_u32m4(inbuf, vl * 4);
vuint32m4_t tweaks = __riscv_vundefined_u32m4();
tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 0, tweak);
tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl);
tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 1, tweak);
tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl);
tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 2, tweak);
tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl);
tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 3, tweak);
tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl);
data4blks = vxor_u8_u32m4(tweaks, data4blks, vl * 4);
AES_CRYPT(e, m4, rounds, data4blks, vl * 4);
data4blks = vxor_u8_u32m4(tweaks, data4blks, vl * 4);
unaligned_store_u32m4(outbuf, data4blks, vl * 4);
inbuf += 4 * BLOCKSIZE;
outbuf += 4 * BLOCKSIZE;
}
for (; nblocks; nblocks--)
{
vuint32m1_t data = unaligned_load_u32m1(inbuf, vl);
vuint32m1_t tweak0 = tweak;
data = vxor_u8_u32m1(data, tweak0, vl);
tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl);
AES_CRYPT(e, m1, rounds, data, vl);
data = vxor_u8_u32m1(data, tweak0, vl);
unaligned_store_u32m1(outbuf, data, vl);
inbuf += BLOCKSIZE;
outbuf += BLOCKSIZE;
}
unaligned_store_u32m1(tweak_arg, tweak, vl);
clear_vec_regs();
}
static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
aes_riscv_xts_dec (void *context, unsigned char *tweak_arg, void *outbuf_arg,
const void *inbuf_arg, size_t nblocks)
{
RIJNDAEL_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
const u32 *rk = ctx->keyschdec32[0];
int rounds = ctx->rounds;
size_t vl = 4;
vuint32m1_t tweak;
vuint64m1_t xts_gfmul = __riscv_vle64_v_u64m1(xts_gfmul_const, vl / 2);
vuint64m1_t xts_swap64 = __riscv_vle64_v_u64m1(xts_swap64_const, vl / 2);
ROUND_KEY_VARIABLES;
if (!ctx->decryption_prepared)
{
do_prepare_decryption(ctx);
ctx->decryption_prepared = 1;
}
PRELOAD_ROUND_KEYS (rk, rounds, vl);
/* Preload tweak */
tweak = unaligned_load_u32m1(tweak_arg, vl);
memory_barrier_with_vec(xts_gfmul);
memory_barrier_with_vec(xts_swap64);
for (; nblocks >= 4; nblocks -= 4)
{
vuint32m4_t data4blks = unaligned_load_u32m4(inbuf, vl * 4);
vuint32m4_t tweaks = __riscv_vundefined_u32m4();
tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 0, tweak);
tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl);
tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 1, tweak);
tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl);
tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 2, tweak);
tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl);
tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 3, tweak);
tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl);
data4blks = vxor_u8_u32m4(tweaks, data4blks, vl * 4);
AES_CRYPT(d, m4, rounds, data4blks, vl * 4);
data4blks = vxor_u8_u32m4(tweaks, data4blks, vl * 4);
unaligned_store_u32m4(outbuf, data4blks, vl * 4);
inbuf += 4 * BLOCKSIZE;
outbuf += 4 * BLOCKSIZE;
}
for (; nblocks; nblocks--)
{
vuint32m1_t data = unaligned_load_u32m1(inbuf, vl);
vuint32m1_t tweak0 = tweak;
data = vxor_u8_u32m1(data, tweak0, vl);
tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl);
AES_CRYPT(d, m1, rounds, data, vl);
data = vxor_u8_u32m1(data, tweak0, vl);
unaligned_store_u32m1(outbuf, data, vl);
inbuf += BLOCKSIZE;
outbuf += BLOCKSIZE;
}
unaligned_store_u32m1(tweak_arg, tweak, vl);
clear_vec_regs();
}
ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
_gcry_aes_riscv_zvkned_xts_crypt (void *context, unsigned char *tweak_arg,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks, int encrypt)
{
if (encrypt)
aes_riscv_xts_enc(context, tweak_arg, outbuf_arg, inbuf_arg, nblocks);
else
aes_riscv_xts_dec(context, tweak_arg, outbuf_arg, inbuf_arg, nblocks);
}
#endif /* __riscv && HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS && HAVE_COMPATIBLE_CC_RISCV_VECTOR_CRYPTO_INTRINSICS */