diff --git a/cipher/rijndael-riscv-zvkned.c b/cipher/rijndael-riscv-zvkned.c
index e3ba6769..703950e1 100644
--- a/cipher/rijndael-riscv-zvkned.c
+++ b/cipher/rijndael-riscv-zvkned.c
@@ -1,1608 +1,1606 @@
/* rijndael-riscv-zvkned.c - RISC-V vector crypto implementation of AES
* Copyright (C) 2025 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*
* This file is part of Libgcrypt.
*
* Libgcrypt is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* Libgcrypt is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#if defined (__riscv) && \
defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS) && \
defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_CRYPTO_INTRINSICS)
#include "g10lib.h"
#include "simd-common-riscv.h"
#include "rijndael-internal.h"
#include "cipher-internal.h"
#include <riscv_vector.h>
#define ALWAYS_INLINE inline __attribute__((always_inline))
#define NO_INLINE __attribute__((noinline))
#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION
#define ASM_FUNC_ATTR_INLINE ALWAYS_INLINE ASM_FUNC_ATTR
#define ASM_FUNC_ATTR_NOINLINE NO_INLINE ASM_FUNC_ATTR
#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE
# define FUNC_ATTR_OPT_O2 __attribute__((optimize("-O2")))
#else
# define FUNC_ATTR_OPT_O2
#endif
/*
* Helper macro and functions
*/
#define cast_u8m1_u32m1(a) __riscv_vreinterpret_v_u8m1_u32m1(a)
#define cast_u8m1_u64m1(a) __riscv_vreinterpret_v_u8m1_u64m1(a)
#define cast_u32m1_u8m1(a) __riscv_vreinterpret_v_u32m1_u8m1(a)
#define cast_u32m1_u64m1(a) __riscv_vreinterpret_v_u32m1_u64m1(a)
#define cast_u64m1_u8m1(a) __riscv_vreinterpret_v_u64m1_u8m1(a)
#define cast_u8m2_u32m2(a) __riscv_vreinterpret_v_u8m2_u32m2(a)
#define cast_u32m2_u8m2(a) __riscv_vreinterpret_v_u32m2_u8m2(a)
#define cast_u8m4_u32m4(a) __riscv_vreinterpret_v_u8m4_u32m4(a)
#define cast_u32m4_u8m4(a) __riscv_vreinterpret_v_u32m4_u8m4(a)
#define cast_u64m1_u32m1(a) __riscv_vreinterpret_v_u64m1_u32m1(a)
#define cast_u64m1_i64m1(a) __riscv_vreinterpret_v_u64m1_i64m1(a)
#define cast_i64m1_u64m1(a) __riscv_vreinterpret_v_i64m1_u64m1(a)
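/* The empty asm below ties a vector register operand to a memory
 * clobber; it keeps preloaded vector constants pinned in registers so
 * the compiler cannot rematerialize the loads inside the block loops. */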
#define memory_barrier_with_vec(a) __asm__("" : "+vr"(a) :: "memory")
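/* Byte-reverse one 128-bit block (a vrgather through a reversed index
 * vector); used to convert the big-endian CTR counter to and from host
 * order for 64-bit lane arithmetic. */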
static ASM_FUNC_ATTR_INLINE vuint32m1_t
bswap128_u32m1(vuint32m1_t vec, size_t vl_u32)
{
static const byte bswap128_arr[16] =
{ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
size_t vl_bytes = vl_u32 * 4;
vuint8m1_t bswap128 = __riscv_vle8_v_u8m1(bswap128_arr, vl_bytes);
return cast_u8m1_u32m1(
__riscv_vrgather_vv_u8m1(cast_u32m1_u8m1(vec), bswap128, vl_bytes));
}
static ASM_FUNC_ATTR_INLINE vuint32m1_t
unaligned_load_u32m1(const void *ptr, size_t vl_u32)
{
size_t vl_bytes = vl_u32 * 4;
return cast_u8m1_u32m1(__riscv_vle8_v_u8m1(ptr, vl_bytes));
}
static ASM_FUNC_ATTR_INLINE void
unaligned_store_u32m1(void *ptr, vuint32m1_t vec, size_t vl_u32)
{
size_t vl_bytes = vl_u32 * 4;
__riscv_vse8_v_u8m1(ptr, cast_u32m1_u8m1(vec), vl_bytes);
}
static ASM_FUNC_ATTR_INLINE vuint32m4_t
unaligned_load_u32m4(const void *ptr, size_t vl_u32)
{
size_t vl_bytes = vl_u32 * 4;
return cast_u8m4_u32m4(__riscv_vle8_v_u8m4(ptr, vl_bytes));
}
static ASM_FUNC_ATTR_INLINE void
unaligned_store_u32m4(void *ptr, vuint32m4_t vec, size_t vl_u32)
{
size_t vl_bytes = vl_u32 * 4;
__riscv_vse8_v_u8m4(ptr, cast_u32m4_u8m4(vec), vl_bytes);
}
static ASM_FUNC_ATTR_INLINE vuint32m1_t
vxor_u8_u32m1(vuint32m1_t a, vuint32m1_t b, size_t vl_u32)
{
size_t vl_bytes = vl_u32 * 4;
return cast_u8m1_u32m1(__riscv_vxor_vv_u8m1(cast_u32m1_u8m1(a),
cast_u32m1_u8m1(b), vl_bytes));
}
static ASM_FUNC_ATTR_INLINE vuint32m4_t
vxor_u8_u32m4(vuint32m4_t a, vuint32m4_t b, size_t vl_u32)
{
size_t vl_bytes = vl_u32 * 4;
return cast_u8m4_u32m4(__riscv_vxor_vv_u8m4(cast_u32m4_u8m4(a),
cast_u32m4_u8m4(b), vl_bytes));
}
/*
* HW support detection
*/
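/* The Zvkned code needs a full 128-bit AES block per vector register:
 * vsetvl with e32/m1 must grant all four requested elements, i.e.
 * VLEN >= 128. */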
int ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2
_gcry_aes_riscv_zvkned_setup_acceleration(RIJNDAEL_context *ctx)
{
(void)ctx;
return (__riscv_vsetvl_e32m1(4) == 4);
}
/*
* Key expansion
*/
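/* vaeskf1 and vaeskf2 generate the AES-128 and AES-256 schedules
 * directly, four words per invocation; AES-192 needs the scalar help
 * below because its schedule advances six words per iteration. */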
static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
aes128_riscv_setkey (RIJNDAEL_context *ctx, const byte *key)
{
size_t vl = 4;
vuint32m1_t round_key = unaligned_load_u32m1 (key, vl);
__riscv_vse32_v_u32m1 (&ctx->keyschenc32[0][0], round_key, vl);
round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 1, vl);
__riscv_vse32_v_u32m1 (&ctx->keyschenc32[1][0], round_key, vl);
round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 2, vl);
__riscv_vse32_v_u32m1 (&ctx->keyschenc32[2][0], round_key, vl);
round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 3, vl);
__riscv_vse32_v_u32m1 (&ctx->keyschenc32[3][0], round_key, vl);
round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 4, vl);
__riscv_vse32_v_u32m1 (&ctx->keyschenc32[4][0], round_key, vl);
round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 5, vl);
__riscv_vse32_v_u32m1 (&ctx->keyschenc32[5][0], round_key, vl);
round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 6, vl);
__riscv_vse32_v_u32m1 (&ctx->keyschenc32[6][0], round_key, vl);
round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 7, vl);
__riscv_vse32_v_u32m1 (&ctx->keyschenc32[7][0], round_key, vl);
round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 8, vl);
__riscv_vse32_v_u32m1 (&ctx->keyschenc32[8][0], round_key, vl);
round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 9, vl);
__riscv_vse32_v_u32m1 (&ctx->keyschenc32[9][0], round_key, vl);
round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 10, vl);
__riscv_vse32_v_u32m1 (&ctx->keyschenc32[10][0], round_key, vl);
clear_vec_regs();
}
static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
aes192_riscv_setkey (RIJNDAEL_context *ctx, const byte *key)
{
size_t vl = 4;
u32 *w = &ctx->keyschenc32[0][0];
u32 wr;
vuint32m1_t rk_0_7;
vuint32m1_t rk_4_11;
rk_0_7 = unaligned_load_u32m1 (&key[0], vl);
rk_4_11 = unaligned_load_u32m1 (&key[8], vl);
__riscv_vse32_v_u32m1 (&w[0], rk_0_7, vl);
__riscv_vse32_v_u32m1 (&w[2], rk_4_11, vl);
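/* AES-192's six-word schedule does not map onto vaeskf1's four-word
 * rounds, so vaeskf1 is used only to compute SubWord(RotWord(w)) ^ rcon:
 * the input word is slid into element 3 of a zeroed vector (staying in
 * vector registers, with no stack temporary) and element 0 of the result
 * is extracted; the remaining schedule words are XOR-chained in scalar
 * code. */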
#define AES192_KF1_GEN(out, input, round192, vl) \
({ \
- u32 temp_array[4] = { 0, 0, 0, 0 }; \
- vuint32m1_t temp_vec; \
- temp_array[3] = (input); \
- temp_vec = __riscv_vle32_v_u32m1(temp_array, (vl)); \
+ vuint32m1_t temp_vec = __riscv_vmv_v_x_u32m1(0, (vl)); \
+ temp_vec = __riscv_vslide1down_vx_u32m1(temp_vec, (input), (vl)); \
temp_vec = __riscv_vaeskf1_vi_u32m1(temp_vec, (round192), (vl)); \
(out) = __riscv_vmv_x_s_u32m1_u32(temp_vec); \
})
#define AES192_EXPAND_BLOCK(w, round192, wr, last) \
({ \
(w)[(round192) * 6 + 0] = (w)[(round192) * 6 - 6] ^ (wr); \
(w)[(round192) * 6 + 1] = (w)[(round192) * 6 - 5] ^ (w)[(round192) * 6 + 0]; \
(w)[(round192) * 6 + 2] = (w)[(round192) * 6 - 4] ^ (w)[(round192) * 6 + 1]; \
(w)[(round192) * 6 + 3] = (w)[(round192) * 6 - 3] ^ (w)[(round192) * 6 + 2]; \
if (!(last)) \
{ \
(w)[(round192) * 6 + 4] = (w)[(round192) * 6 - 2] ^ (w)[(round192) * 6 + 3]; \
(w)[(round192) * 6 + 5] = (w)[(round192) * 6 - 1] ^ (w)[(round192) * 6 + 4]; \
} \
})
AES192_KF1_GEN(wr, w[5], 1, vl);
AES192_EXPAND_BLOCK(w, 1, wr, 0);
AES192_KF1_GEN(wr, w[11], 2, vl);
AES192_EXPAND_BLOCK(w, 2, wr, 0);
AES192_KF1_GEN(wr, w[17], 3, vl);
AES192_EXPAND_BLOCK(w, 3, wr, 0);
AES192_KF1_GEN(wr, w[23], 4, vl);
AES192_EXPAND_BLOCK(w, 4, wr, 0);
AES192_KF1_GEN(wr, w[29], 5, vl);
AES192_EXPAND_BLOCK(w, 5, wr, 0);
AES192_KF1_GEN(wr, w[35], 6, vl);
AES192_EXPAND_BLOCK(w, 6, wr, 0);
AES192_KF1_GEN(wr, w[41], 7, vl);
AES192_EXPAND_BLOCK(w, 7, wr, 0);
AES192_KF1_GEN(wr, w[47], 8, vl);
AES192_EXPAND_BLOCK(w, 8, wr, 1);
#undef AES192_KF1_GEN
#undef AES192_EXPAND_BLOCK
clear_vec_regs();
}
static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
aes256_riscv_setkey (RIJNDAEL_context *ctx, const byte *key)
{
size_t vl = 4;
vuint32m1_t rk_a = unaligned_load_u32m1 (&key[0], vl);
vuint32m1_t rk_b = unaligned_load_u32m1 (&key[16], vl);
__riscv_vse32_v_u32m1(&ctx->keyschenc32[0][0], rk_a, vl);
__riscv_vse32_v_u32m1(&ctx->keyschenc32[1][0], rk_b, vl);
rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 2, vl);
__riscv_vse32_v_u32m1(&ctx->keyschenc32[2][0], rk_a, vl);
rk_b = __riscv_vaeskf2_vi_u32m1(rk_b, rk_a, 3, vl);
__riscv_vse32_v_u32m1(&ctx->keyschenc32[3][0], rk_b, vl);
rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 4, vl);
__riscv_vse32_v_u32m1(&ctx->keyschenc32[4][0], rk_a, vl);
rk_b = __riscv_vaeskf2_vi_u32m1(rk_b, rk_a, 5, vl);
__riscv_vse32_v_u32m1(&ctx->keyschenc32[5][0], rk_b, vl);
rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 6, vl);
__riscv_vse32_v_u32m1(&ctx->keyschenc32[6][0], rk_a, vl);
rk_b = __riscv_vaeskf2_vi_u32m1(rk_b, rk_a, 7, vl);
__riscv_vse32_v_u32m1(&ctx->keyschenc32[7][0], rk_b, vl);
rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 8, vl);
__riscv_vse32_v_u32m1(&ctx->keyschenc32[8][0], rk_a, vl);
rk_b = __riscv_vaeskf2_vi_u32m1(rk_b, rk_a, 9, vl);
__riscv_vse32_v_u32m1(&ctx->keyschenc32[9][0], rk_b, vl);
rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 10, vl);
__riscv_vse32_v_u32m1(&ctx->keyschenc32[10][0], rk_a, vl);
rk_b = __riscv_vaeskf2_vi_u32m1(rk_b, rk_a, 11, vl);
__riscv_vse32_v_u32m1(&ctx->keyschenc32[11][0], rk_b, vl);
rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 12, vl);
__riscv_vse32_v_u32m1(&ctx->keyschenc32[12][0], rk_a, vl);
rk_b = __riscv_vaeskf2_vi_u32m1(rk_b, rk_a, 13, vl);
__riscv_vse32_v_u32m1(&ctx->keyschenc32[13][0], rk_b, vl);
rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 14, vl);
__riscv_vse32_v_u32m1(&ctx->keyschenc32[14][0], rk_a, vl);
clear_vec_regs();
}
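/* Dispatch on the round count (10/12/14 for AES-128/192/256); only the
 * AES-192 path spills scalar temporaries to the stack, hence the
 * _gcry_burn_stack call there. */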
void ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2
_gcry_aes_riscv_zvkned_setkey (RIJNDAEL_context *ctx, const byte *key)
{
unsigned int rounds = ctx->rounds;
if (rounds < 12)
{
aes128_riscv_setkey(ctx, key);
}
else if (rounds == 12)
{
aes192_riscv_setkey(ctx, key);
_gcry_burn_stack(64);
}
else
{
aes256_riscv_setkey(ctx, key);
}
}
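/* Zvkned's vaesdm applies the round key before InvMixColumns, so the
 * decryption schedule is simply the encryption round keys in reverse
 * order; no InvMixColumns transform of the keys is needed (unlike the
 * x86 AES-NI equivalent-inverse-cipher construction). */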
static ASM_FUNC_ATTR_INLINE void
do_prepare_decryption(RIJNDAEL_context *ctx)
{
u32 *ekey = (u32 *)(void *)ctx->keyschenc;
u32 *dkey = (u32 *)(void *)ctx->keyschdec;
int rounds = ctx->rounds;
size_t vl = 4;
int rr;
int r;
for (r = 0, rr = rounds; r <= rounds; r++, rr--)
{
__riscv_vse32_v_u32m1(dkey + r * 4,
__riscv_vle32_v_u32m1(ekey + rr * 4, vl),
vl);
}
}
void ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2
_gcry_aes_riscv_zvkned_prepare_decryption(RIJNDAEL_context *ctx)
{
do_prepare_decryption(ctx);
clear_vec_regs();
}
/*
* Encryption / Decryption
*/
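/* The whole key schedule is preloaded into vector registers once per
 * call; rk10..rk13 are left undefined for shorter keys, and AES_CRYPT
 * branches on the round count (10/12/14) at run time. */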
#define ROUND_KEY_VARIABLES \
vuint32m1_t rk0, rk1, rk2, rk3, rk4, rk5, rk6, rk7, rk8; \
vuint32m1_t rk9, rk10, rk11, rk12, rk13, rk_last;
#define PRELOAD_ROUND_KEYS(rk, nrounds, vl) \
do { \
rk0 = __riscv_vle32_v_u32m1(rk + 0 * 4, vl); \
rk1 = __riscv_vle32_v_u32m1(rk + 1 * 4, vl); \
rk2 = __riscv_vle32_v_u32m1(rk + 2 * 4, vl); \
rk3 = __riscv_vle32_v_u32m1(rk + 3 * 4, vl); \
rk4 = __riscv_vle32_v_u32m1(rk + 4 * 4, vl); \
rk5 = __riscv_vle32_v_u32m1(rk + 5 * 4, vl); \
rk6 = __riscv_vle32_v_u32m1(rk + 6 * 4, vl); \
rk7 = __riscv_vle32_v_u32m1(rk + 7 * 4, vl); \
rk8 = __riscv_vle32_v_u32m1(rk + 8 * 4, vl); \
rk9 = __riscv_vle32_v_u32m1(rk + 9 * 4, vl); \
if (UNLIKELY(nrounds >= 12)) \
{ \
rk10 = __riscv_vle32_v_u32m1(rk + 10 * 4, vl); \
rk11 = __riscv_vle32_v_u32m1(rk + 11 * 4, vl); \
if (LIKELY(nrounds > 12)) \
{ \
rk12 = __riscv_vle32_v_u32m1(rk + 12 * 4, vl); \
rk13 = __riscv_vle32_v_u32m1(rk + 13 * 4, vl); \
} \
else \
{ \
rk12 = __riscv_vundefined_u32m1(); \
rk13 = __riscv_vundefined_u32m1(); \
} \
} \
else \
{ \
rk10 = __riscv_vundefined_u32m1(); \
rk11 = __riscv_vundefined_u32m1(); \
rk12 = __riscv_vundefined_u32m1(); \
rk13 = __riscv_vundefined_u32m1(); \
} \
rk_last = __riscv_vle32_v_u32m1(rk + nrounds * 4, vl); \
} while (0)
#ifdef HAVE_BROKEN_VAES_VS_INTRINSIC
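/* Fallback for compilers whose vaes*.vs intrinsics are miscompiled:
 * issue the same round sequence as the intrinsic version in the #else
 * branch through inline assembly, branching over the AES-192/256-only
 * rounds at run time. */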
#define AES_CRYPT(e_d, mx, nrounds, blk, vlen) \
asm ( "vsetvli zero,%[vl],e32,"#mx",ta,ma;\n\t" \
"vaesz.vs %[block],%[rk0];\n\t" \
"vaes"#e_d"m.vs %[block],%[rk1];\n\t" \
"vaes"#e_d"m.vs %[block],%[rk2];\n\t" \
"vaes"#e_d"m.vs %[block],%[rk3];\n\t" \
"vaes"#e_d"m.vs %[block],%[rk4];\n\t" \
"vaes"#e_d"m.vs %[block],%[rk5];\n\t" \
"vaes"#e_d"m.vs %[block],%[rk6];\n\t" \
"vaes"#e_d"m.vs %[block],%[rk7];\n\t" \
"vaes"#e_d"m.vs %[block],%[rk8];\n\t" \
"vaes"#e_d"m.vs %[block],%[rk9];\n\t" \
"blt %[rounds],%[num12],.Lcryptlast%=;\n\t" \
"vaes"#e_d"m.vs %[block],%[rk10];\n\t" \
"vaes"#e_d"m.vs %[block],%[rk11];\n\t" \
"beq %[rounds],%[num12],.Lcryptlast%=;\n\t" \
"vaes"#e_d"m.vs %[block],%[rk12];\n\t" \
"vaes"#e_d"m.vs %[block],%[rk13];\n\t" \
".Lcryptlast%=:\n\t" \
"vaes"#e_d"f.vs %[block],%[rk_last];\n\t" \
: [block] "+vr" (blk) \
: [vl] "r" (vlen), [rounds] "r" (nrounds), [num12] "r" (12), \
[rk0] "vr" (rk0), [rk1] "vr" (rk1), [rk2] "vr" (rk2), \
[rk3] "vr" (rk3), [rk4] "vr" (rk4), [rk5] "vr" (rk5), \
[rk6] "vr" (rk6), [rk7] "vr" (rk7), [rk8] "vr" (rk8), \
[rk9] "vr" (rk9), [rk10] "vr" (rk10), [rk11] "vr" (rk11), \
[rk12] "vr" (rk12), [rk13] "vr" (rk13), \
[rk_last] "vr" (rk_last) \
: "vl")
#else
#define AES_CRYPT(e_d, mx, rounds, block, vl) \
({ \
(block) = __riscv_vaesz_vs_u32m1_u32##mx((block), rk0, (vl)); \
(block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk1, (vl)); \
(block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk2, (vl)); \
(block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk3, (vl)); \
(block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk4, (vl)); \
(block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk5, (vl)); \
(block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk6, (vl)); \
(block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk7, (vl)); \
(block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk8, (vl)); \
(block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk9, (vl)); \
if (UNLIKELY((rounds) >= 12)) \
{ \
(block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk10, (vl)); \
(block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk11, (vl)); \
if (LIKELY((rounds) > 12)) \
{ \
(block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk12, (vl)); \
(block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk13, (vl)); \
} \
} \
(block) = __riscv_vaes##e_d##f_vs_u32m1_u32##mx((block), rk_last, (vl)); \
})
#endif
unsigned int ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2
_gcry_aes_riscv_zvkned_encrypt (const RIJNDAEL_context *ctx, unsigned char *out,
const unsigned char *in)
{
const u32 *rk = ctx->keyschenc32[0];
int rounds = ctx->rounds;
size_t vl = 4;
vuint32m1_t block;
ROUND_KEY_VARIABLES;
PRELOAD_ROUND_KEYS (rk, rounds, vl);
block = unaligned_load_u32m1(in, vl);
AES_CRYPT(e, m1, rounds, block, vl);
unaligned_store_u32m1(out, block, vl);
clear_vec_regs();
return 0; /* does not use stack */
}
unsigned int ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2
_gcry_aes_riscv_zvkned_decrypt (const RIJNDAEL_context *ctx, unsigned char *out,
const unsigned char *in)
{
const u32 *rk = ctx->keyschdec32[0];
int rounds = ctx->rounds;
size_t vl = 4;
vuint32m1_t block;
ROUND_KEY_VARIABLES;
PRELOAD_ROUND_KEYS (rk, rounds, vl);
block = unaligned_load_u32m1(in, vl);
AES_CRYPT(d, m1, rounds, block, vl);
unaligned_store_u32m1(out, block, vl);
clear_vec_regs();
return 0; /* does not use stack */
}
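/* ECB has no chaining between blocks, so the bulk loop handles four
 * blocks per iteration at LMUL=4; the .vs forms broadcast the single
 * 128-bit round key group to all four block groups. */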
static ASM_FUNC_ATTR_INLINE void
aes_riscv_zvkned_ecb_crypt (void *context, void *outbuf_arg,
const void *inbuf_arg, size_t nblocks, int encrypt)
{
RIJNDAEL_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
const u32 *rk = encrypt ? ctx->keyschenc32[0] : ctx->keyschdec32[0];
int rounds = ctx->rounds;
size_t vl = 4;
ROUND_KEY_VARIABLES;
if (!encrypt && !ctx->decryption_prepared)
{
do_prepare_decryption(ctx);
ctx->decryption_prepared = 1;
}
PRELOAD_ROUND_KEYS (rk, rounds, vl);
for (; nblocks >= 4; nblocks -= 4)
{
vuint32m4_t blocks;
blocks = unaligned_load_u32m4(inbuf, vl * 4);
if (encrypt)
AES_CRYPT(e, m4, rounds, blocks, vl * 4);
else
AES_CRYPT(d, m4, rounds, blocks, vl * 4);
unaligned_store_u32m4(outbuf, blocks, vl * 4);
inbuf += 4 * BLOCKSIZE;
outbuf += 4 * BLOCKSIZE;
}
for (; nblocks; nblocks--)
{
vuint32m1_t block;
block = unaligned_load_u32m1(inbuf, vl);
if (encrypt)
AES_CRYPT(e, m1, rounds, block, vl);
else
AES_CRYPT(d, m1, rounds, block, vl);
unaligned_store_u32m1(outbuf, block, vl);
inbuf += BLOCKSIZE;
outbuf += BLOCKSIZE;
}
clear_vec_regs();
}
static void ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2
aes_riscv_zvkned_ecb_enc (void *context, void *outbuf_arg,
const void *inbuf_arg, size_t nblocks)
{
aes_riscv_zvkned_ecb_crypt (context, outbuf_arg, inbuf_arg, nblocks, 1);
}
static void ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2
aes_riscv_zvkned_ecb_dec (void *context, void *outbuf_arg,
const void *inbuf_arg, size_t nblocks)
{
aes_riscv_zvkned_ecb_crypt (context, outbuf_arg, inbuf_arg, nblocks, 0);
}
void ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2
_gcry_aes_riscv_zvkned_ecb_crypt (void *context, void *outbuf_arg,
const void *inbuf_arg, size_t nblocks,
int encrypt)
{
if (encrypt)
aes_riscv_zvkned_ecb_enc (context, outbuf_arg, inbuf_arg, nblocks);
else
aes_riscv_zvkned_ecb_dec (context, outbuf_arg, inbuf_arg, nblocks);
}
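/* CFB encryption is inherently serial: each keystream block is the
 * encryption of the previous ciphertext block, so only a single-block
 * loop is possible. */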
ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
_gcry_aes_riscv_zvkned_cfb_enc (void *context, unsigned char *iv_arg,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks)
{
RIJNDAEL_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
const u32 *rk = ctx->keyschenc32[0];
int rounds = ctx->rounds;
size_t vl = 4;
size_t vl_bytes = vl * 4;
vuint32m1_t iv;
ROUND_KEY_VARIABLES;
PRELOAD_ROUND_KEYS (rk, rounds, vl);
iv = unaligned_load_u32m1(iv_arg, vl);
for (; nblocks; nblocks--)
{
vuint8m1_t data = __riscv_vle8_v_u8m1(inbuf, vl_bytes);
AES_CRYPT(e, m1, rounds, iv, vl);
data = __riscv_vxor_vv_u8m1(cast_u32m1_u8m1(iv), data, vl_bytes);
__riscv_vse8_v_u8m1(outbuf, data, vl_bytes);
iv = cast_u8m1_u32m1(data);
outbuf += BLOCKSIZE;
inbuf += BLOCKSIZE;
}
unaligned_store_u32m1(iv_arg, iv, vl);
clear_vec_regs();
}
ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
_gcry_aes_riscv_zvkned_cbc_enc (void *context, unsigned char *iv_arg,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks, int cbc_mac)
{
RIJNDAEL_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
size_t outbuf_add = (!cbc_mac) * BLOCKSIZE;
const u32 *rk = ctx->keyschenc32[0];
int rounds = ctx->rounds;
size_t vl = 4;
size_t vl_bytes = vl * 4;
vuint32m1_t iv;
ROUND_KEY_VARIABLES;
PRELOAD_ROUND_KEYS (rk, rounds, vl);
iv = unaligned_load_u32m1(iv_arg, vl);
for (; nblocks; nblocks--)
{
vuint8m1_t data = __riscv_vle8_v_u8m1(inbuf, vl_bytes);
iv = cast_u8m1_u32m1(
__riscv_vxor_vv_u8m1(data, cast_u32m1_u8m1(iv), vl_bytes));
AES_CRYPT(e, m1, rounds, iv, vl);
__riscv_vse8_v_u8m1(outbuf, cast_u32m1_u8m1(iv), vl_bytes);
inbuf += BLOCKSIZE;
outbuf += outbuf_add;
}
unaligned_store_u32m1(iv_arg, iv, vl);
clear_vec_regs();
}
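/* CTR mode with a 128-bit big-endian counter.  The fast path increments
 * the counter with plain byte-wise additions of the precomputed +1..+4
 * constants; when the low counter byte wraps, a slow path byte-swaps the
 * counter and redoes the additions in 64-bit lanes with explicit carry
 * propagation. */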
ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
_gcry_aes_riscv_zvkned_ctr_enc (void *context, unsigned char *ctr_arg,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks)
{
static const byte add_u8_array[4][16] =
{
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 },
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 },
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 }
};
static const u64 carry_add[2] = { 1, 1 };
static const u64 nocarry_add[2] = { 1, 0 };
RIJNDAEL_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
const u32 *rk = ctx->keyschenc32[0];
int rounds = ctx->rounds;
size_t vl = 4;
size_t vl_bytes = vl * 4;
u64 ctrlow;
vuint32m1_t ctr;
vuint8m1_t add1;
ROUND_KEY_VARIABLES;
PRELOAD_ROUND_KEYS (rk, rounds, vl);
add1 = __riscv_vle8_v_u8m1(add_u8_array[0], vl_bytes);
ctr = unaligned_load_u32m1(ctr_arg, vl);
ctrlow = __riscv_vmv_x_s_u64m1_u64(cast_u32m1_u64m1(bswap128_u32m1(ctr, vl)));
memory_barrier_with_vec(add1);
if (nblocks >= 4)
{
vuint8m1_t add2 = __riscv_vle8_v_u8m1(add_u8_array[1], vl_bytes);
vuint8m1_t add3 = __riscv_vle8_v_u8m1(add_u8_array[2], vl_bytes);
vuint8m1_t add4 = __riscv_vle8_v_u8m1(add_u8_array[3], vl_bytes);
memory_barrier_with_vec(add2);
memory_barrier_with_vec(add3);
memory_barrier_with_vec(add4);
for (; nblocks >= 4; nblocks -= 4)
{
vuint8m4_t data4blks;
vuint32m4_t ctr4blks;
/* detect if 8-bit carry handling is needed */
if (UNLIKELY(((ctrlow += 4) & 0xff) <= 3))
{
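/* After "ctrlow += 4", a low byte of 0..3 means the 8-bit wrap happened
 * within these four blocks.  If the full 64-bit ctrlow wrapped as well
 * (ctrlow <= 3), idx selects which of the four increments must carry
 * into the high 64-bit lane (carry_add adds 1 to both lanes); idx == 4
 * covers a plain byte wrap that the 64-bit addition handles by itself. */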
static const u64 *adders[5][4] =
{
{ nocarry_add, nocarry_add, nocarry_add, carry_add },
{ nocarry_add, nocarry_add, carry_add, nocarry_add },
{ nocarry_add, carry_add, nocarry_add, nocarry_add },
{ carry_add, nocarry_add, nocarry_add, nocarry_add },
{ nocarry_add, nocarry_add, nocarry_add, nocarry_add }
};
unsigned int idx = ctrlow <= 3 ? ctrlow : 4;
vuint64m1_t ctr_u64;
vuint32m1_t ctr_u32_1;
vuint32m1_t ctr_u32_2;
vuint32m1_t ctr_u32_3;
vuint32m1_t ctr_u32_4;
vuint64m1_t add_u64;
/* Byte swap counter */
ctr_u64 = cast_u32m1_u64m1(bswap128_u32m1(ctr, vl));
/* Addition with carry handling */
add_u64 = __riscv_vle64_v_u64m1(adders[idx][0], vl / 2);
ctr_u64 = __riscv_vadd_vv_u64m1(ctr_u64, add_u64, vl / 2);
ctr_u32_1 = cast_u64m1_u32m1(ctr_u64);
add_u64 = __riscv_vle64_v_u64m1(adders[idx][1], vl / 2);
ctr_u64 = __riscv_vadd_vv_u64m1(ctr_u64, add_u64, vl / 2);
ctr_u32_2 = cast_u64m1_u32m1(ctr_u64);
add_u64 = __riscv_vle64_v_u64m1(adders[idx][2], vl / 2);
ctr_u64 = __riscv_vadd_vv_u64m1(ctr_u64, add_u64, vl / 2);
ctr_u32_3 = cast_u64m1_u32m1(ctr_u64);
add_u64 = __riscv_vle64_v_u64m1(adders[idx][3], vl / 2);
ctr_u64 = __riscv_vadd_vv_u64m1(ctr_u64, add_u64, vl / 2);
ctr_u32_4 = cast_u64m1_u32m1(ctr_u64);
/* Byte swap counters */
ctr_u32_1 = bswap128_u32m1(ctr_u32_1, vl);
ctr_u32_2 = bswap128_u32m1(ctr_u32_2, vl);
ctr_u32_3 = bswap128_u32m1(ctr_u32_3, vl);
ctr_u32_4 = bswap128_u32m1(ctr_u32_4, vl);
ctr4blks = __riscv_vundefined_u32m4();
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 0, ctr);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 1, ctr_u32_1);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 2, ctr_u32_2);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 3, ctr_u32_3);
ctr = ctr_u32_4;
}
else
{
/* Fast path addition without carry handling */
vuint8m1_t ctr_u8 = cast_u32m1_u8m1(ctr);
vuint8m1_t ctr1 = __riscv_vadd_vv_u8m1(ctr_u8, add1, vl_bytes);
vuint8m1_t ctr2 = __riscv_vadd_vv_u8m1(ctr_u8, add2, vl_bytes);
vuint8m1_t ctr3 = __riscv_vadd_vv_u8m1(ctr_u8, add3, vl_bytes);
vuint8m4_t ctr0123_u8 = __riscv_vundefined_u8m4();
ctr = cast_u8m1_u32m1(__riscv_vadd_vv_u8m1(ctr_u8, add4,
vl_bytes));
ctr0123_u8 = __riscv_vset_v_u8m1_u8m4(ctr0123_u8, 0, ctr_u8);
ctr0123_u8 = __riscv_vset_v_u8m1_u8m4(ctr0123_u8, 1, ctr1);
ctr0123_u8 = __riscv_vset_v_u8m1_u8m4(ctr0123_u8, 2, ctr2);
ctr0123_u8 = __riscv_vset_v_u8m1_u8m4(ctr0123_u8, 3, ctr3);
ctr4blks = cast_u8m4_u32m4(ctr0123_u8);
}
data4blks = __riscv_vle8_v_u8m4(inbuf, vl_bytes * 4);
AES_CRYPT(e, m4, rounds, ctr4blks, vl * 4);
data4blks = __riscv_vxor_vv_u8m4(cast_u32m4_u8m4(ctr4blks), data4blks,
vl_bytes * 4);
__riscv_vse8_v_u8m4(outbuf, data4blks, vl_bytes * 4);
inbuf += 4 * BLOCKSIZE;
outbuf += 4 * BLOCKSIZE;
}
}
for (; nblocks; nblocks--)
{
vuint32m1_t block = ctr;
vuint8m1_t data = __riscv_vle8_v_u8m1(inbuf, vl_bytes);
/* detect if 8-bit carry handling is needed */
if (UNLIKELY((++ctrlow & 0xff) == 0))
{
const u64 *add_arr = UNLIKELY(ctrlow == 0) ? carry_add : nocarry_add;
vuint64m1_t add_val = __riscv_vle64_v_u64m1(add_arr, vl / 2);
/* Byte swap counter */
ctr = bswap128_u32m1(ctr, vl);
/* Addition with carry handling */
ctr = cast_u64m1_u32m1(__riscv_vadd_vv_u64m1(cast_u32m1_u64m1(ctr),
add_val, vl / 2));
/* Byte swap counter */
ctr = bswap128_u32m1(ctr, vl);
}
else
{
/* Fast path addition without carry handling */
ctr = cast_u8m1_u32m1(__riscv_vadd_vv_u8m1(cast_u32m1_u8m1(ctr),
add1, vl_bytes));
}
AES_CRYPT(e, m1, rounds, block, vl);
data = __riscv_vxor_vv_u8m1(cast_u32m1_u8m1(block), data, vl_bytes);
__riscv_vse8_v_u8m1(outbuf, data, vl_bytes);
inbuf += BLOCKSIZE;
outbuf += BLOCKSIZE;
}
unaligned_store_u32m1(ctr_arg, ctr, vl);
clear_vec_regs();
}
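/* Little-endian 32-bit counter (as used by GCM-SIV): only the low u32
 * word is incremented, wrapping mod 2^32, so a plain 32-bit vadd on
 * element 0 suffices and no carry handling is needed. */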
ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
_gcry_aes_riscv_zvkned_ctr32le_enc (void *context, unsigned char *ctr_arg,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks)
{
static const u32 add_u32_array[4][4] =
{
{ 1, }, { 2, }, { 3, }, { 4, }
};
RIJNDAEL_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
const u32 *rk = ctx->keyschenc32[0];
int rounds = ctx->rounds;
size_t vl = 4;
size_t vl_bytes = vl * 4;
vuint32m1_t ctr;
vuint32m1_t add1;
ROUND_KEY_VARIABLES;
PRELOAD_ROUND_KEYS (rk, rounds, vl);
add1 = __riscv_vle32_v_u32m1(add_u32_array[0], vl);
ctr = unaligned_load_u32m1(ctr_arg, vl);
memory_barrier_with_vec(add1);
if (nblocks >= 4)
{
vuint32m1_t add2 = __riscv_vle32_v_u32m1(add_u32_array[1], vl);
vuint32m1_t add3 = __riscv_vle32_v_u32m1(add_u32_array[2], vl);
vuint32m1_t add4 = __riscv_vle32_v_u32m1(add_u32_array[3], vl);
memory_barrier_with_vec(add2);
memory_barrier_with_vec(add3);
memory_barrier_with_vec(add4);
for (; nblocks >= 4; nblocks -= 4)
{
vuint32m1_t ctr1 = __riscv_vadd_vv_u32m1(ctr, add1, vl);
vuint32m1_t ctr2 = __riscv_vadd_vv_u32m1(ctr, add2, vl);
vuint32m1_t ctr3 = __riscv_vadd_vv_u32m1(ctr, add3, vl);
vuint32m4_t ctr4blks = __riscv_vundefined_u32m4();
vuint8m4_t data4blks;
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 0, ctr);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 1, ctr1);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 2, ctr2);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 3, ctr3);
ctr = __riscv_vadd_vv_u32m1(ctr, add4, vl);
data4blks = __riscv_vle8_v_u8m4(inbuf, vl_bytes * 4);
AES_CRYPT(e, m4, rounds, ctr4blks, vl * 4);
data4blks = __riscv_vxor_vv_u8m4(cast_u32m4_u8m4(ctr4blks), data4blks,
vl_bytes * 4);
__riscv_vse8_v_u8m4(outbuf, data4blks, vl_bytes * 4);
inbuf += 4 * BLOCKSIZE;
outbuf += 4 * BLOCKSIZE;
}
}
for (; nblocks; nblocks--)
{
vuint32m1_t block = ctr;
vuint8m1_t data = __riscv_vle8_v_u8m1(inbuf, vl_bytes);
ctr = __riscv_vadd_vv_u32m1(ctr, add1, vl);
AES_CRYPT(e, m1, rounds, block, vl);
data = __riscv_vxor_vv_u8m1(cast_u32m1_u8m1(block), data, vl_bytes);
__riscv_vse8_v_u8m1(outbuf, data, vl_bytes);
inbuf += BLOCKSIZE;
outbuf += BLOCKSIZE;
}
unaligned_store_u32m1(ctr_arg, ctr, vl);
clear_vec_regs();
}
ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
_gcry_aes_riscv_zvkned_cfb_dec (void *context, unsigned char *iv_arg,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks)
{
RIJNDAEL_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
const u32 *rk = ctx->keyschenc32[0];
int rounds = ctx->rounds;
size_t vl = 4;
vuint32m1_t iv;
ROUND_KEY_VARIABLES;
PRELOAD_ROUND_KEYS (rk, rounds, vl);
iv = unaligned_load_u32m1(iv_arg, vl);
for (; nblocks >= 4; nblocks -= 4)
{
vuint32m4_t data4blks = unaligned_load_u32m4(inbuf, vl * 4);
vuint32m1_t iv1 = __riscv_vget_v_u32m4_u32m1(data4blks, 0);
vuint32m1_t iv2 = __riscv_vget_v_u32m4_u32m1(data4blks, 1);
vuint32m1_t iv3 = __riscv_vget_v_u32m4_u32m1(data4blks, 2);
vuint32m1_t iv4 = __riscv_vget_v_u32m4_u32m1(data4blks, 3);
vuint32m4_t iv4blks = __riscv_vundefined_u32m4();
iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 0, iv);
iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 1, iv1);
iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 2, iv2);
iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 3, iv3);
iv = iv4;
AES_CRYPT(e, m4, rounds, iv4blks, vl * 4);
data4blks = vxor_u8_u32m4(iv4blks, data4blks, vl * 4);
unaligned_store_u32m4(outbuf, data4blks, vl * 4);
inbuf += 4 * BLOCKSIZE;
outbuf += 4 * BLOCKSIZE;
}
for (; nblocks; nblocks--)
{
vuint32m1_t data = unaligned_load_u32m1(inbuf, vl);
vuint32m1_t new_iv = data;
AES_CRYPT(e, m1, rounds, iv, vl);
data = vxor_u8_u32m1(iv, data, vl);
unaligned_store_u32m1(outbuf, data, vl);
iv = new_iv;
inbuf += BLOCKSIZE;
outbuf += BLOCKSIZE;
}
unaligned_store_u32m1(iv_arg, iv, vl);
clear_vec_regs();
}
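/* CBC decryption parallelizes: four ciphertext blocks are deciphered at
 * once and XORed against the preceding ciphertexts, which are extracted
 * from the loaded input vector before it is overwritten. */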
ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
_gcry_aes_riscv_zvkned_cbc_dec (void *context, unsigned char *iv_arg,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks)
{
RIJNDAEL_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
const u32 *rk = ctx->keyschdec32[0];
int rounds = ctx->rounds;
size_t vl = 4;
vuint32m1_t iv;
ROUND_KEY_VARIABLES;
if (!ctx->decryption_prepared)
{
do_prepare_decryption(ctx);
ctx->decryption_prepared = 1;
}
PRELOAD_ROUND_KEYS (rk, rounds, vl);
iv = unaligned_load_u32m1(iv_arg, vl);
for (; nblocks >= 4; nblocks -= 4)
{
vuint32m4_t data4blks = unaligned_load_u32m4(inbuf, vl * 4);
vuint32m1_t iv1 = __riscv_vget_v_u32m4_u32m1(data4blks, 0);
vuint32m1_t iv2 = __riscv_vget_v_u32m4_u32m1(data4blks, 1);
vuint32m1_t iv3 = __riscv_vget_v_u32m4_u32m1(data4blks, 2);
vuint32m1_t iv4 = __riscv_vget_v_u32m4_u32m1(data4blks, 3);
vuint32m4_t iv4blks = __riscv_vundefined_u32m4();
iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 0, iv);
iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 1, iv1);
iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 2, iv2);
iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 3, iv3);
AES_CRYPT(d, m4, rounds, data4blks, vl * 4);
data4blks = vxor_u8_u32m4(iv4blks, data4blks, vl * 4);
unaligned_store_u32m4(outbuf, data4blks, vl * 4);
iv = iv4;
inbuf += 4 * BLOCKSIZE;
outbuf += 4 * BLOCKSIZE;
}
for (; nblocks; nblocks--)
{
vuint32m1_t data = unaligned_load_u32m1(inbuf, vl);
vuint32m1_t new_iv = data;
AES_CRYPT(d, m1, rounds, data, vl);
data = vxor_u8_u32m1(iv, data, vl);
unaligned_store_u32m1(outbuf, data, vl);
iv = new_iv;
inbuf += BLOCKSIZE;
outbuf += BLOCKSIZE;
}
unaligned_store_u32m1(iv_arg, iv, vl);
clear_vec_regs();
}
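/* OCB bulk path: the checksum accumulates in four independent 128-bit
 * lanes of ctr4blks and is folded into one lane after the loop; the
 * offsets are chained one block at a time (each Offset_i depends on the
 * previous one) and collected into an m4 register for the XORs. */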
static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 size_t
aes_riscv_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
const void *inbuf_arg, size_t nblocks)
{
RIJNDAEL_context *ctx = (void *)&c->context.c;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
u64 n = c->u_mode.ocb.data_nblocks;
const u32 *rk = ctx->keyschenc32[0];
int rounds = ctx->rounds;
size_t vl = 4;
size_t vl_bytes = vl * 4;
vuint32m1_t iv;
vuint32m1_t ctr;
ROUND_KEY_VARIABLES;
PRELOAD_ROUND_KEYS (rk, rounds, vl);
/* Preload Offset and Checksum */
iv = unaligned_load_u32m1(c->u_iv.iv, vl);
ctr = unaligned_load_u32m1(c->u_ctr.ctr, vl);
if (nblocks >= 4)
{
vuint32m4_t ctr4blks = __riscv_vundefined_u32m4();
vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0, vl);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 0, ctr);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 1, zero);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 2, zero);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 3, zero);
for (; nblocks >= 4; nblocks -= 4)
{
const unsigned char *l;
vuint8m1_t l_ntzi;
vuint32m4_t data4blks = unaligned_load_u32m4(inbuf, vl * 4);
vuint32m4_t offsets = __riscv_vundefined_u32m4();
/* Checksum_i = Checksum_{i-1} xor P_i */
ctr4blks = vxor_u8_u32m4(ctr4blks, data4blks, vl * 4);
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
l = ocb_get_l(c, ++n);
l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
offsets = __riscv_vset_v_u32m1_u32m4(offsets, 0, iv);
l = ocb_get_l(c, ++n);
l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
offsets = __riscv_vset_v_u32m1_u32m4(offsets, 1, iv);
l = ocb_get_l(c, ++n);
l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
offsets = __riscv_vset_v_u32m1_u32m4(offsets, 2, iv);
l = ocb_get_l(c, ++n);
l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
offsets = __riscv_vset_v_u32m1_u32m4(offsets, 3, iv);
data4blks = vxor_u8_u32m4(offsets, data4blks, vl * 4);
AES_CRYPT(e, m4, rounds, data4blks, vl * 4);
data4blks = vxor_u8_u32m4(offsets, data4blks, vl * 4);
unaligned_store_u32m4(outbuf, data4blks, vl * 4);
inbuf += 4 * BLOCKSIZE;
outbuf += 4 * BLOCKSIZE;
}
/* Checksum_i = Checksum_{i-1} xor P_i */
ctr = vxor_u8_u32m1(__riscv_vget_v_u32m4_u32m1(ctr4blks, 0),
__riscv_vget_v_u32m4_u32m1(ctr4blks, 1), vl);
ctr = vxor_u8_u32m1(ctr, __riscv_vget_v_u32m4_u32m1(ctr4blks, 2), vl);
ctr = vxor_u8_u32m1(ctr, __riscv_vget_v_u32m4_u32m1(ctr4blks, 3), vl);
}
for (; nblocks; nblocks--)
{
const unsigned char *l;
vuint8m1_t l_ntzi;
vuint32m1_t data;
data = unaligned_load_u32m1(inbuf, vl);
/* Checksum_i = Checksum_{i-1} xor P_i */
ctr = vxor_u8_u32m1(ctr, data, vl);
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
l = ocb_get_l(c, ++n);
l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
data = vxor_u8_u32m1(data, iv, vl);
AES_CRYPT(e, m1, rounds, data, vl);
data = vxor_u8_u32m1(iv, data, vl);
unaligned_store_u32m1(outbuf, data, vl);
inbuf += BLOCKSIZE;
outbuf += BLOCKSIZE;
}
c->u_mode.ocb.data_nblocks = n;
unaligned_store_u32m1(c->u_iv.iv, iv, vl);
unaligned_store_u32m1(c->u_ctr.ctr, ctr, vl);
clear_vec_regs();
return 0;
}
static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 size_t
aes_riscv_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
const void *inbuf_arg, size_t nblocks)
{
RIJNDAEL_context *ctx = (void *)&c->context.c;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
u64 n = c->u_mode.ocb.data_nblocks;
const u32 *rk = ctx->keyschdec32[0];
int rounds = ctx->rounds;
size_t vl = 4;
size_t vl_bytes = vl * 4;
vuint32m1_t iv;
vuint32m1_t ctr;
ROUND_KEY_VARIABLES;
if (!ctx->decryption_prepared)
{
do_prepare_decryption(ctx);
ctx->decryption_prepared = 1;
}
PRELOAD_ROUND_KEYS (rk, rounds, vl);
/* Preload Offset and Checksum */
iv = unaligned_load_u32m1(c->u_iv.iv, vl);
ctr = unaligned_load_u32m1(c->u_ctr.ctr, vl);
if (nblocks >= 4)
{
vuint32m4_t ctr4blks = __riscv_vundefined_u32m4();
vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0, vl);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 0, ctr);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 1, zero);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 2, zero);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 3, zero);
for (; nblocks >= 4; nblocks -= 4)
{
const unsigned char *l;
vuint8m1_t l_ntzi;
vuint32m4_t data4blks = unaligned_load_u32m4(inbuf, vl * 4);
vuint32m4_t offsets = __riscv_vundefined_u32m4();
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* P_i = Offset_i xor ENCIPHER(K, C_i xor Offset_i) */
l = ocb_get_l(c, ++n);
l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
offsets = __riscv_vset_v_u32m1_u32m4(offsets, 0, iv);
l = ocb_get_l(c, ++n);
l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
offsets = __riscv_vset_v_u32m1_u32m4(offsets, 1, iv);
l = ocb_get_l(c, ++n);
l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
offsets = __riscv_vset_v_u32m1_u32m4(offsets, 2, iv);
l = ocb_get_l(c, ++n);
l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
offsets = __riscv_vset_v_u32m1_u32m4(offsets, 3, iv);
data4blks = vxor_u8_u32m4(offsets, data4blks, vl * 4);
AES_CRYPT(d, m4, rounds, data4blks, vl * 4);
data4blks = vxor_u8_u32m4(offsets, data4blks, vl * 4);
unaligned_store_u32m4(outbuf, data4blks, vl * 4);
/* Checksum_i = Checksum_{i-1} xor P_i */
ctr4blks = vxor_u8_u32m4(ctr4blks, data4blks, vl * 4);
inbuf += 4 * BLOCKSIZE;
outbuf += 4 * BLOCKSIZE;
}
/* Checksum_i = Checksum_{i-1} xor P_i */
ctr = vxor_u8_u32m1(__riscv_vget_v_u32m4_u32m1(ctr4blks, 0),
__riscv_vget_v_u32m4_u32m1(ctr4blks, 1), vl);
ctr = vxor_u8_u32m1(ctr, __riscv_vget_v_u32m4_u32m1(ctr4blks, 2), vl);
ctr = vxor_u8_u32m1(ctr, __riscv_vget_v_u32m4_u32m1(ctr4blks, 3), vl);
}
for (; nblocks; nblocks--)
{
const unsigned char *l;
vuint8m1_t l_ntzi;
vuint8m1_t data;
vuint32m1_t block;
l = ocb_get_l(c, ++n);
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
data = __riscv_vle8_v_u8m1(inbuf, vl_bytes);
iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
data = __riscv_vxor_vv_u8m1(data, cast_u32m1_u8m1(iv), vl_bytes);
block = cast_u8m1_u32m1(data);
AES_CRYPT(d, m1, rounds, block, vl);
block = vxor_u8_u32m1(iv, block, vl);
unaligned_store_u32m1(outbuf, block, vl);
/* Checksum_i = Checksum_{i-1} xor P_i */
ctr = vxor_u8_u32m1(ctr, block, vl);
inbuf += BLOCKSIZE;
outbuf += BLOCKSIZE;
}
c->u_mode.ocb.data_nblocks = n;
unaligned_store_u32m1(c->u_iv.iv, iv, vl);
unaligned_store_u32m1(c->u_ctr.ctr, ctr, vl);
clear_vec_regs();
return 0;
}
size_t ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2
_gcry_aes_riscv_zvkned_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
const void *inbuf_arg, size_t nblocks,
int encrypt)
{
if (encrypt)
return aes_riscv_ocb_enc(c, outbuf_arg, inbuf_arg, nblocks);
else
return aes_riscv_ocb_dec(c, outbuf_arg, inbuf_arg, nblocks);
}
size_t ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2
_gcry_aes_riscv_zvkned_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
size_t nblocks)
{
RIJNDAEL_context *ctx = (void *)&c->context.c;
const unsigned char *abuf = abuf_arg;
u64 n = c->u_mode.ocb.aad_nblocks;
const u32 *rk = ctx->keyschenc32[0];
int rounds = ctx->rounds;
size_t vl = 4;
size_t vl_bytes = vl * 4;
vuint32m1_t iv;
vuint32m1_t ctr;
ROUND_KEY_VARIABLES;
PRELOAD_ROUND_KEYS (rk, rounds, vl);
/* Preload Offset and Sum */
iv = unaligned_load_u32m1(c->u_mode.ocb.aad_offset, vl);
ctr = unaligned_load_u32m1(c->u_mode.ocb.aad_sum, vl);
if (nblocks >= 4)
{
vuint32m4_t ctr4blks = __riscv_vundefined_u32m4();
vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0, vl);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 0, ctr);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 1, zero);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 2, zero);
ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 3, zero);
for (; nblocks >= 4; nblocks -= 4)
{
const unsigned char *l;
vuint8m1_t l_ntzi;
vuint32m4_t data4blks = unaligned_load_u32m4(abuf, vl * 4);
vuint32m4_t offsets = __riscv_vundefined_u32m4();
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
l = ocb_get_l(c, ++n);
l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
offsets = __riscv_vset_v_u32m1_u32m4(offsets, 0, iv);
l = ocb_get_l(c, ++n);
l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
offsets = __riscv_vset_v_u32m1_u32m4(offsets, 1, iv);
l = ocb_get_l(c, ++n);
l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
offsets = __riscv_vset_v_u32m1_u32m4(offsets, 2, iv);
l = ocb_get_l(c, ++n);
l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
offsets = __riscv_vset_v_u32m1_u32m4(offsets, 3, iv);
data4blks = vxor_u8_u32m4(offsets, data4blks, vl * 4);
AES_CRYPT(e, m4, rounds, data4blks, vl * 4);
ctr4blks = vxor_u8_u32m4(ctr4blks, data4blks, vl * 4);
abuf += 4 * BLOCKSIZE;
}
/* Checksum_i = Checksum_{i-1} xor P_i */
ctr = vxor_u8_u32m1(__riscv_vget_v_u32m4_u32m1(ctr4blks, 0),
__riscv_vget_v_u32m4_u32m1(ctr4blks, 1), vl);
ctr = vxor_u8_u32m1(ctr, __riscv_vget_v_u32m4_u32m1(ctr4blks, 2), vl);
ctr = vxor_u8_u32m1(ctr, __riscv_vget_v_u32m4_u32m1(ctr4blks, 3), vl);
}
for (; nblocks; nblocks--)
{
const unsigned char *l;
vuint8m1_t l_ntzi;
vuint32m1_t data;
data = unaligned_load_u32m1(abuf, vl);
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
l = ocb_get_l(c, ++n);
l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
data = vxor_u8_u32m1(data, iv, vl);
AES_CRYPT(e, m1, rounds, data, vl);
ctr = vxor_u8_u32m1(ctr, data, vl);
abuf += BLOCKSIZE;
}
c->u_mode.ocb.aad_nblocks = n;
unaligned_store_u32m1(c->u_mode.ocb.aad_offset, iv, vl);
unaligned_store_u32m1(c->u_mode.ocb.aad_sum, ctr, vl);
clear_vec_regs();
return 0;
}
static const u64 xts_gfmul_const[2] = { 0x87, 0x01 };
static const u64 xts_swap64_const[2] = { 1, 0 };
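/* Multiply the 128-bit tweak by x in GF(2^128) with the XTS polynomial
 * x^128 + x^7 + x^2 + x + 1: both 64-bit lanes are shifted left by one
 * (vadd of the value with itself), while the vrgather/vsra pair builds
 * all-ones masks from the opposite lane's sign bit so that a single vand
 * supplies both the cross-lane carry (0x01 into the high lane) and the
 * 0x87 reduction (into the low lane). */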
static ASM_FUNC_ATTR_INLINE vuint32m1_t
xts_gfmul_byA (vuint32m1_t vec_in, vuint64m1_t xts_gfmul,
vuint64m1_t xts_swap64, size_t vl)
{
vuint64m1_t in_u64 = cast_u32m1_u64m1(vec_in);
vuint64m1_t tmp1;
tmp1 =
__riscv_vrgather_vv_u64m1(cast_u32m1_u64m1(vec_in), xts_swap64, vl / 2);
tmp1 = cast_i64m1_u64m1(
__riscv_vsra_vx_i64m1(cast_u64m1_i64m1(tmp1), 63, vl / 2));
in_u64 = __riscv_vadd_vv_u64m1(in_u64, in_u64, vl / 2);
tmp1 = __riscv_vand_vv_u64m1(tmp1, xts_gfmul, vl / 2);
return cast_u64m1_u32m1(__riscv_vxor_vv_u64m1(in_u64, tmp1, vl / 2));
}
static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
aes_riscv_xts_enc (void *context, unsigned char *tweak_arg, void *outbuf_arg,
const void *inbuf_arg, size_t nblocks)
{
RIJNDAEL_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
const u32 *rk = ctx->keyschenc32[0];
int rounds = ctx->rounds;
size_t vl = 4;
vuint32m1_t tweak;
vuint64m1_t xts_gfmul = __riscv_vle64_v_u64m1(xts_gfmul_const, vl / 2);
vuint64m1_t xts_swap64 = __riscv_vle64_v_u64m1(xts_swap64_const, vl / 2);
ROUND_KEY_VARIABLES;
PRELOAD_ROUND_KEYS (rk, rounds, vl);
/* Preload tweak */
tweak = unaligned_load_u32m1(tweak_arg, vl);
memory_barrier_with_vec(xts_gfmul);
memory_barrier_with_vec(xts_swap64);
for (; nblocks >= 4; nblocks -= 4)
{
vuint32m4_t data4blks = unaligned_load_u32m4(inbuf, vl * 4);
vuint32m4_t tweaks = __riscv_vundefined_u32m4();
tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 0, tweak);
tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl);
tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 1, tweak);
tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl);
tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 2, tweak);
tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl);
tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 3, tweak);
tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl);
data4blks = vxor_u8_u32m4(tweaks, data4blks, vl * 4);
AES_CRYPT(e, m4, rounds, data4blks, vl * 4);
data4blks = vxor_u8_u32m4(tweaks, data4blks, vl * 4);
unaligned_store_u32m4(outbuf, data4blks, vl * 4);
inbuf += 4 * BLOCKSIZE;
outbuf += 4 * BLOCKSIZE;
}
for (; nblocks; nblocks--)
{
vuint32m1_t data = unaligned_load_u32m1(inbuf, vl);
vuint32m1_t tweak0 = tweak;
data = vxor_u8_u32m1(data, tweak0, vl);
tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl);
AES_CRYPT(e, m1, rounds, data, vl);
data = vxor_u8_u32m1(data, tweak0, vl);
unaligned_store_u32m1(outbuf, data, vl);
inbuf += BLOCKSIZE;
outbuf += BLOCKSIZE;
}
unaligned_store_u32m1(tweak_arg, tweak, vl);
clear_vec_regs();
}
static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
aes_riscv_xts_dec (void *context, unsigned char *tweak_arg, void *outbuf_arg,
const void *inbuf_arg, size_t nblocks)
{
RIJNDAEL_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
const u32 *rk = ctx->keyschdec32[0];
int rounds = ctx->rounds;
size_t vl = 4;
vuint32m1_t tweak;
vuint64m1_t xts_gfmul = __riscv_vle64_v_u64m1(xts_gfmul_const, vl / 2);
vuint64m1_t xts_swap64 = __riscv_vle64_v_u64m1(xts_swap64_const, vl / 2);
ROUND_KEY_VARIABLES;
if (!ctx->decryption_prepared)
{
do_prepare_decryption(ctx);
ctx->decryption_prepared = 1;
}
PRELOAD_ROUND_KEYS (rk, rounds, vl);
/* Preload tweak */
tweak = unaligned_load_u32m1(tweak_arg, vl);
memory_barrier_with_vec(xts_gfmul);
memory_barrier_with_vec(xts_swap64);
for (; nblocks >= 4; nblocks -= 4)
{
vuint32m4_t data4blks = unaligned_load_u32m4(inbuf, vl * 4);
vuint32m4_t tweaks = __riscv_vundefined_u32m4();
tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 0, tweak);
tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl);
tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 1, tweak);
tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl);
tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 2, tweak);
tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl);
tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 3, tweak);
tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl);
data4blks = vxor_u8_u32m4(tweaks, data4blks, vl * 4);
AES_CRYPT(d, m4, rounds, data4blks, vl * 4);
data4blks = vxor_u8_u32m4(tweaks, data4blks, vl * 4);
unaligned_store_u32m4(outbuf, data4blks, vl * 4);
inbuf += 4 * BLOCKSIZE;
outbuf += 4 * BLOCKSIZE;
}
for (; nblocks; nblocks--)
{
vuint32m1_t data = unaligned_load_u32m1(inbuf, vl);
vuint32m1_t tweak0 = tweak;
data = vxor_u8_u32m1(data, tweak0, vl);
tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl);
AES_CRYPT(d, m1, rounds, data, vl);
data = vxor_u8_u32m1(data, tweak0, vl);
unaligned_store_u32m1(outbuf, data, vl);
inbuf += BLOCKSIZE;
outbuf += BLOCKSIZE;
}
unaligned_store_u32m1(tweak_arg, tweak, vl);
clear_vec_regs();
}
ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
_gcry_aes_riscv_zvkned_xts_crypt (void *context, unsigned char *tweak_arg,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks, int encrypt)
{
if (encrypt)
aes_riscv_xts_enc(context, tweak_arg, outbuf_arg, inbuf_arg, nblocks);
else
aes_riscv_xts_dec(context, tweak_arg, outbuf_arg, inbuf_arg, nblocks);
}
#endif /* __riscv && HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS && HAVE_COMPATIBLE_CC_RISCV_VECTOR_CRYPTO_INTRINSICS */