diff --git a/cipher/poly1305.c b/cipher/poly1305.c
index cded7cb2..69805085 100644
--- a/cipher/poly1305.c
+++ b/cipher/poly1305.c
@@ -1,687 +1,700 @@
 /* poly1305.c  -  Poly1305 internals and generic implementation
  * Copyright (C) 2014,2017,2018 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser general Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
  * Libgcrypt is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #include <config.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
 #include "types.h"
 #include "g10lib.h"
 #include "cipher.h"
 #include "bufhelp.h"
 #include "poly1305-internal.h"
 
 #include "mpi-internal.h"
 #include "longlong.h"
 
 
 static const char *selftest (void);
 
 
 #undef USE_MPI_64BIT
 #undef USE_MPI_32BIT
 #if BYTES_PER_MPI_LIMB == 8 && defined(HAVE_U64_TYPEDEF)
 # define USE_MPI_64BIT 1
 #elif BYTES_PER_MPI_LIMB == 4
 # define USE_MPI_32BIT 1
 #else
 # error please implement for this limb size.
 #endif
 
 
 static void poly1305_init (poly1305_context_t *ctx,
 			   const byte key[POLY1305_KEYLEN])
 {
   POLY1305_STATE *st = &ctx->state;
 
   ctx->leftover = 0;
 
   st->h[0] = 0;
   st->h[1] = 0;
   st->h[2] = 0;
   st->h[3] = 0;
   st->h[4] = 0;
 
   st->r[0] = buf_get_le32(key + 0)  & 0x0fffffff;
   st->r[1] = buf_get_le32(key + 4)  & 0x0ffffffc;
   st->r[2] = buf_get_le32(key + 8)  & 0x0ffffffc;
   st->r[3] = buf_get_le32(key + 12) & 0x0ffffffc;
 
   st->k[0] = buf_get_le32(key + 16);
   st->k[1] = buf_get_le32(key + 20);
   st->k[2] = buf_get_le32(key + 24);
   st->k[3] = buf_get_le32(key + 28);
 }
 
 
 #ifdef USE_MPI_64BIT
 
 #if defined (__aarch64__) && __GNUC__ >= 4
 
 /* A += B (armv8/aarch64) */
 #define ADD_1305_64(A2, A1, A0, B2, B1, B0) \
       __asm__ ("adds %0, %3, %0\n" \
 	       "adcs %1, %4, %1\n" \
 	       "adc  %2, %5, %2\n" \
 	       : "+r" (A0), "+r" (A1), "+r" (A2) \
 	       : "r" (B0), "r" (B1), "r" (B2) \
 	       : "cc" )
 
 #endif /* __aarch64__ */
 
 #if defined (__x86_64__) && __GNUC__ >= 4
 
 /* A += B (x86-64) */
 #define ADD_1305_64(A2, A1, A0, B2, B1, B0) \
       __asm__ ("addq %3, %0\n" \
 	       "adcq %4, %1\n" \
 	       "adcq %5, %2\n" \
 	       : "+r" (A0), "+r" (A1), "+r" (A2) \
 	       : "g" (B0), "g" (B1), "g" (B2) \
 	       : "cc" )
 
 #endif /* __x86_64__ */
 
+#if defined (__powerpc__) && __GNUC__ >= 4
+
+/* A += B (ppc64) */
+#define ADD_1305_64(A2, A1, A0, B2, B1, B0) \
+      __asm__ ("addc %0, %3, %0\n" \
+	       "adde %1, %4, %1\n" \
+	       "adde %2, %5, %2\n" \
+	       : "+r" (A0), "+r" (A1), "+r" (A2) \
+	       : "r" (B0), "r" (B1), "r" (B2) \
+	       : "cc" )
+
+#endif /* __powerpc__ */
+
 #ifndef ADD_1305_64
 /* A += B (generic, mpi) */
 #  define ADD_1305_64(A2, A1, A0, B2, B1, B0) do { \
     u64 carry; \
     add_ssaaaa(carry, A0, 0, A0, 0, B0); \
     add_ssaaaa(A2, A1, A2, A1, B2, B1); \
     add_ssaaaa(A2, A1, A2, A1, 0, carry); \
   } while (0)
 #endif
 
 /* H = H * R mod 2¹³⁰-5 */
 #define MUL_MOD_1305_64(H2, H1, H0, R1, R0, R1_MULT5) do { \
     u64 x0_lo, x0_hi, x1_lo, x1_hi; \
     u64 t0_lo, t0_hi, t1_lo, t1_hi; \
     \
     /* x = a * r (partial mod 2^130-5) */ \
     umul_ppmm(x0_hi, x0_lo, H0, R0);  /* h0 * r0 */ \
     umul_ppmm(x1_hi, x1_lo, H0, R1);  /* h0 * r1 */ \
     \
     umul_ppmm(t0_hi, t0_lo, H1, R1_MULT5); /* h1 * r1 mod 2^130-5 */ \
     add_ssaaaa(x0_hi, x0_lo, x0_hi, x0_lo, t0_hi, t0_lo); \
     umul_ppmm(t1_hi, t1_lo, H1, R0);       /* h1 * r0 */ \
     add_ssaaaa(x1_hi, x1_lo, x1_hi, x1_lo, t1_hi, t1_lo); \
     \
     t1_lo = H2 * R1_MULT5; /* h2 * r1 mod 2^130-5 */ \
     t1_hi = H2 * R0;       /* h2 * r0 */ \
     add_ssaaaa(H0, H1, x1_hi, x1_lo, t1_hi, t1_lo); \
     \
     /* carry propagation */ \
     H2 = H0 & 3; \
     H0 = (H0 >> 2) * 5; /* msb mod 2^130-5 */ \
     ADD_1305_64(H2, H1, H0, (u64)0, x0_hi, x0_lo); \
   } while (0)
 
 static unsigned int
 poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
 		 byte high_pad)
 {
   POLY1305_STATE *st = &ctx->state;
   u64 r0, r1, r1_mult5;
   u64 h0, h1, h2;
   u64 m0, m1, m2;
 
   m2 = high_pad;
 
   h0 = st->h[0] + ((u64)st->h[1] << 32);
   h1 = st->h[2] + ((u64)st->h[3] << 32);
   h2 = st->h[4];
 
   r0 = st->r[0] + ((u64)st->r[1] << 32);
   r1 = st->r[2] + ((u64)st->r[3] << 32);
 
   r1_mult5 = (r1 >> 2) + r1;
 
   m0 = buf_get_le64(buf + 0);
   m1 = buf_get_le64(buf + 8);
   buf += POLY1305_BLOCKSIZE;
   len -= POLY1305_BLOCKSIZE;
 
   while (len >= POLY1305_BLOCKSIZE)
     {
       /* a = h + m */
       ADD_1305_64(h2, h1, h0, m2, m1, m0);
 
       m0 = buf_get_le64(buf + 0);
       m1 = buf_get_le64(buf + 8);
 
       /* h = a * r (partial mod 2^130-5) */
       MUL_MOD_1305_64(h2, h1, h0, r1, r0, r1_mult5);
 
       buf += POLY1305_BLOCKSIZE;
       len -= POLY1305_BLOCKSIZE;
     }
 
   /* a = h + m */
   ADD_1305_64(h2, h1, h0, m2, m1, m0);
 
   /* h = a * r (partial mod 2^130-5) */
   MUL_MOD_1305_64(h2, h1, h0, r1, r0, r1_mult5);
 
   st->h[0] = h0;
   st->h[1] = h0 >> 32;
   st->h[2] = h1;
   st->h[3] = h1 >> 32;
   st->h[4] = h2;
 
   return 6 * sizeof (void *) + 18 * sizeof (u64);
 }
 
 static unsigned int poly1305_final (poly1305_context_t *ctx,
 				    byte mac[POLY1305_TAGLEN])
 {
   POLY1305_STATE *st = &ctx->state;
   unsigned int burn = 0;
   u64 u, carry;
   u64 k0, k1;
   u64 h0, h1;
   u64 h2;
 
   /* process the remaining block */
   if (ctx->leftover)
     {
       ctx->buffer[ctx->leftover++] = 1;
       if (ctx->leftover < POLY1305_BLOCKSIZE)
 	{
 	  memset (&ctx->buffer[ctx->leftover], 0,
 		  POLY1305_BLOCKSIZE - ctx->leftover);
 	  ctx->leftover = POLY1305_BLOCKSIZE;
 	}
       burn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 0);
     }
 
   h0 = st->h[0] + ((u64)st->h[1] << 32);
   h1 = st->h[2] + ((u64)st->h[3] << 32);
   h2 = st->h[4];
 
   k0 = st->k[0] + ((u64)st->k[1] << 32);
   k1 = st->k[2] + ((u64)st->k[3] << 32);
 
   /* check if h is more than 2^130-5, by adding 5. */
   add_ssaaaa(carry, u, 0, h0, 0, 5);
   add_ssaaaa(carry, u, 0, carry, 0, h1);
   u = (carry + h2) >> 2; /* u == 0 or 1 */
 
   /* minus 2^130-5 ... (+5) */
   u = (-u) & 5;
   add_ssaaaa(h1, h0, h1, h0, 0, u);
 
   /* add high part of key + h */
   add_ssaaaa(h1, h0, h1, h0, k1, k0);
   buf_put_le64(mac + 0, h0);
   buf_put_le64(mac + 8, h1);
 
   /* burn_stack */
   return 4 * sizeof (void *) + 7 * sizeof (u64) + burn;
 }
 
 #endif /* USE_MPI_64BIT */
 
 #ifdef USE_MPI_32BIT
 
 #ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
 
 /* HI:LO += A * B (arm) */
 #define UMUL_ADD_32(HI, LO, A, B) \
       __asm__ ("umlal %1, %0, %4, %5" \
 	       : "=r" (HI), "=r" (LO) \
 	       : "0" (HI), "1" (LO), "r" (A), "r" (B) )
 
 /* A += B (arm) */
 #define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) \
       __asm__ ("adds %0, %0, %5\n" \
 	       "adcs %1, %1, %6\n" \
 	       "adcs %2, %2, %7\n" \
 	       "adcs %3, %3, %8\n" \
 	       "adc %4, %4, %9\n" \
 	       : "+r" (A0), "+r" (A1), "+r" (A2), "+r" (A3), "+r" (A4) \
 	       : "r" (B0), "r" (B1), "r" (B2), "r" (B3), "r" (B4) \
 	       : "cc" )
 
 #endif /* HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS */
 
 #if defined (__i386__) && __GNUC__ >= 4
 
 /* A += B (i386) */
 #define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) \
       __asm__ ("addl %5, %0\n" \
 	       "adcl %6, %1\n" \
 	       "adcl %7, %2\n" \
 	       "adcl %8, %3\n" \
 	       "adcl %9, %4\n" \
 	       : "+r" (A0), "+r" (A1), "+r" (A2), "+r" (A3), "+r" (A4) \
 	       : "g" (B0), "g" (B1), "g" (B2), "g" (B3), "g" (B4) \
 	       : "cc" )
 
 #endif /* __i386__ */
 
 #ifndef UMUL_ADD_32
 /* HI:LO += A * B (generic, mpi) */
 #  define UMUL_ADD_32(HI, LO, A, B) do { \
     u32 t_lo, t_hi; \
     umul_ppmm(t_hi, t_lo, A, B); \
     add_ssaaaa(HI, LO, HI, LO, t_hi, t_lo); \
   } while (0)
 #endif
 
 #ifndef ADD_1305_32
 /* A += B (generic, mpi) */
 #  define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) do { \
     u32 carry0, carry1, carry2; \
     add_ssaaaa(carry0, A0, 0, A0, 0, B0); \
     add_ssaaaa(carry1, A1, 0, A1, 0, B1); \
     add_ssaaaa(carry1, A1, carry1, A1, 0, carry0); \
     add_ssaaaa(carry2, A2, 0, A2, 0, B2); \
     add_ssaaaa(carry2, A2, carry2, A2, 0, carry1); \
     add_ssaaaa(A4, A3, A4, A3, B4, B3); \
     add_ssaaaa(A4, A3, A4, A3, 0, carry2); \
   } while (0)
 #endif
 
 /* H = H * R mod 2¹³⁰-5 */
 #define MUL_MOD_1305_32(H4, H3, H2, H1, H0, R3, R2, R1, R0, \
                         R3_MULT5, R2_MULT5, R1_MULT5) do { \
     u32 x0_lo, x0_hi, x1_lo, x1_hi, x2_lo, x2_hi, x3_lo, x3_hi; \
     u32 t0_lo, t0_hi; \
     \
     /* x = a * r (partial mod 2^130-5) */ \
     umul_ppmm(x0_hi, x0_lo, H0, R0);  /* h0 * r0 */ \
     umul_ppmm(x1_hi, x1_lo, H0, R1);  /* h0 * r1 */ \
     umul_ppmm(x2_hi, x2_lo, H0, R2);  /* h0 * r2 */ \
     umul_ppmm(x3_hi, x3_lo, H0, R3);  /* h0 * r3 */ \
     \
     UMUL_ADD_32(x0_hi, x0_lo, H1, R3_MULT5); /* h1 * r3 mod 2^130-5 */ \
     UMUL_ADD_32(x1_hi, x1_lo, H1, R0);       /* h1 * r0 */ \
     UMUL_ADD_32(x2_hi, x2_lo, H1, R1);       /* h1 * r1 */ \
     UMUL_ADD_32(x3_hi, x3_lo, H1, R2);       /* h1 * r2 */ \
     \
     UMUL_ADD_32(x0_hi, x0_lo, H2, R2_MULT5); /* h2 * r2 mod 2^130-5 */ \
     UMUL_ADD_32(x1_hi, x1_lo, H2, R3_MULT5); /* h2 * r3 mod 2^130-5 */ \
     UMUL_ADD_32(x2_hi, x2_lo, H2, R0);       /* h2 * r0 */ \
     UMUL_ADD_32(x3_hi, x3_lo, H2, R1);       /* h2 * r1 */ \
     \
     UMUL_ADD_32(x0_hi, x0_lo, H3, R1_MULT5); /* h3 * r1 mod 2^130-5 */ \
     H1 = x0_hi; \
     UMUL_ADD_32(x1_hi, x1_lo, H3, R2_MULT5); /* h3 * r2 mod 2^130-5 */ \
     UMUL_ADD_32(x2_hi, x2_lo, H3, R3_MULT5); /* h3 * r3 mod 2^130-5 */ \
     UMUL_ADD_32(x3_hi, x3_lo, H3, R0);       /* h3 * r0 */ \
     \
     t0_lo = H4 * R1_MULT5; /* h4 * r1 mod 2^130-5 */ \
     t0_hi = H4 * R2_MULT5; /* h4 * r2 mod 2^130-5 */ \
     add_ssaaaa(H2, x1_lo, x1_hi, x1_lo, 0, t0_lo); \
     add_ssaaaa(H3, x2_lo, x2_hi, x2_lo, 0, t0_hi); \
     t0_lo = H4 * R3_MULT5; /* h4 * r3 mod 2^130-5 */ \
     t0_hi = H4 * R0;       /* h4 * r0 */ \
     add_ssaaaa(H4, x3_lo, x3_hi, x3_lo, t0_hi, t0_lo); \
     \
     /* carry propagation */ \
     H0 = (H4 >> 2) * 5; /* msb mod 2^130-5 */ \
     H4 = H4 & 3; \
     ADD_1305_32(H4, H3, H2, H1, H0, 0, x3_lo, x2_lo, x1_lo, x0_lo); \
   } while (0)
 
 static unsigned int
 poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
 		 byte high_pad)
 {
   POLY1305_STATE *st = &ctx->state;
   u32 r1_mult5, r2_mult5, r3_mult5;
   u32 h0, h1, h2, h3, h4;
   u32 m0, m1, m2, m3, m4;
 
   m4 = high_pad;
 
   h0 = st->h[0];
   h1 = st->h[1];
   h2 = st->h[2];
   h3 = st->h[3];
   h4 = st->h[4];
 
   r1_mult5 = (st->r[1] >> 2) + st->r[1];
   r2_mult5 = (st->r[2] >> 2) + st->r[2];
   r3_mult5 = (st->r[3] >> 2) + st->r[3];
 
   while (len >= POLY1305_BLOCKSIZE)
     {
       m0 = buf_get_le32(buf + 0);
       m1 = buf_get_le32(buf + 4);
       m2 = buf_get_le32(buf + 8);
       m3 = buf_get_le32(buf + 12);
 
       /* a = h + m */
       ADD_1305_32(h4, h3, h2, h1, h0, m4, m3, m2, m1, m0);
 
       /* h = a * r (partial mod 2^130-5) */
       MUL_MOD_1305_32(h4, h3, h2, h1, h0,
 		      st->r[3], st->r[2], st->r[1], st->r[0],
 		      r3_mult5, r2_mult5, r1_mult5);
 
       buf += POLY1305_BLOCKSIZE;
       len -= POLY1305_BLOCKSIZE;
     }
 
   st->h[0] = h0;
   st->h[1] = h1;
   st->h[2] = h2;
   st->h[3] = h3;
   st->h[4] = h4;
 
   return 6 * sizeof (void *) + 28 * sizeof (u32);
 }
 
 static unsigned int poly1305_final (poly1305_context_t *ctx,
 				    byte mac[POLY1305_TAGLEN])
 {
   POLY1305_STATE *st = &ctx->state;
   unsigned int burn = 0;
   u32 carry, tmp0, tmp1, tmp2, u;
   u32 h4, h3, h2, h1, h0;
 
   /* process the remaining block */
   if (ctx->leftover)
     {
       ctx->buffer[ctx->leftover++] = 1;
       if (ctx->leftover < POLY1305_BLOCKSIZE)
 	{
 	  memset (&ctx->buffer[ctx->leftover], 0,
 		  POLY1305_BLOCKSIZE - ctx->leftover);
 	  ctx->leftover = POLY1305_BLOCKSIZE;
 	}
       burn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 0);
     }
 
   h0 = st->h[0];
   h1 = st->h[1];
   h2 = st->h[2];
   h3 = st->h[3];
   h4 = st->h[4];
 
   /* check if h is more than 2^130-5, by adding 5. */
   add_ssaaaa(carry, tmp0, 0, h0, 0, 5);
   add_ssaaaa(carry, tmp0, 0, carry, 0, h1);
   add_ssaaaa(carry, tmp0, 0, carry, 0, h2);
   add_ssaaaa(carry, tmp0, 0, carry, 0, h3);
   u = (carry + h4) >> 2; /* u == 0 or 1 */
 
   /* minus 2^130-5 ... (+5) */
   u = (-u) & 5;
   add_ssaaaa(carry, h0, 0, h0, 0, u);
   add_ssaaaa(carry, h1, 0, h1, 0, carry);
   add_ssaaaa(carry, h2, 0, h2, 0, carry);
   add_ssaaaa(carry, h3, 0, h3, 0, carry);
 
   /* add high part of key + h */
   add_ssaaaa(tmp0, h0, 0, h0, 0, st->k[0]);
   add_ssaaaa(tmp1, h1, 0, h1, 0, st->k[1]);
   add_ssaaaa(tmp1, h1, tmp1, h1, 0, tmp0);
   add_ssaaaa(tmp2, h2, 0, h2, 0, st->k[2]);
   add_ssaaaa(tmp2, h2, tmp2, h2, 0, tmp1);
   add_ssaaaa(carry, h3, 0, h3, 0, st->k[3]);
   h3 += tmp2;
 
   buf_put_le32(mac + 0, h0);
   buf_put_le32(mac + 4, h1);
   buf_put_le32(mac + 8, h2);
   buf_put_le32(mac + 12, h3);
 
   /* burn_stack */
   return 4 * sizeof (void *) + 10 * sizeof (u32) + burn;
 }
 
 #endif /* USE_MPI_32BIT */
 
 
 unsigned int
 _gcry_poly1305_update_burn (poly1305_context_t *ctx, const byte *m,
 			    size_t bytes)
 {
   unsigned int burn = 0;
 
   /* handle leftover */
   if (ctx->leftover)
     {
       size_t want = (POLY1305_BLOCKSIZE - ctx->leftover);
       if (want > bytes)
 	want = bytes;
       buf_cpy (ctx->buffer + ctx->leftover, m, want);
       bytes -= want;
       m += want;
       ctx->leftover += want;
       if (ctx->leftover < POLY1305_BLOCKSIZE)
 	return 0;
       burn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 1);
       ctx->leftover = 0;
     }
 
   /* process full blocks */
   if (bytes >= POLY1305_BLOCKSIZE)
     {
       size_t nblks = bytes / POLY1305_BLOCKSIZE;
       burn = poly1305_blocks (ctx, m, nblks * POLY1305_BLOCKSIZE, 1);
       m += nblks * POLY1305_BLOCKSIZE;
       bytes -= nblks * POLY1305_BLOCKSIZE;
     }
 
   /* store leftover */
   if (bytes)
     {
       buf_cpy (ctx->buffer + ctx->leftover, m, bytes);
       ctx->leftover += bytes;
     }
 
   return burn;
 }
 
 
 void
 _gcry_poly1305_update (poly1305_context_t *ctx, const byte *m, size_t bytes)
 {
   unsigned int burn;
 
   burn = _gcry_poly1305_update_burn (ctx, m, bytes);
 
   if (burn)
     _gcry_burn_stack (burn);
 }
 
 
 void
 _gcry_poly1305_finish (poly1305_context_t *ctx, byte mac[POLY1305_TAGLEN])
 {
   unsigned int burn;
 
   burn = poly1305_final (ctx, mac);
 
   _gcry_burn_stack (burn);
 }
 
 
 gcry_err_code_t
 _gcry_poly1305_init (poly1305_context_t * ctx, const byte * key,
 		     size_t keylen)
 {
   static int initialized;
   static const char *selftest_failed;
 
   if (!initialized)
     {
       initialized = 1;
       selftest_failed = selftest ();
       if (selftest_failed)
 	log_error ("Poly1305 selftest failed (%s)\n", selftest_failed);
     }
 
   if (keylen != POLY1305_KEYLEN)
     return GPG_ERR_INV_KEYLEN;
 
   if (selftest_failed)
     return GPG_ERR_SELFTEST_FAILED;
 
   poly1305_init (ctx, key);
 
   return 0;
 }
 
 
 static void
 poly1305_auth (byte mac[POLY1305_TAGLEN], const byte * m, size_t bytes,
 	       const byte * key)
 {
   poly1305_context_t ctx;
 
   memset (&ctx, 0, sizeof (ctx));
 
   _gcry_poly1305_init (&ctx, key, POLY1305_KEYLEN);
   _gcry_poly1305_update (&ctx, m, bytes);
   _gcry_poly1305_finish (&ctx, mac);
 
   wipememory (&ctx, sizeof (ctx));
 }
 
 
 static const char *
 selftest (void)
 {
   /* example from nacl */
   static const byte nacl_key[POLY1305_KEYLEN] = {
     0xee, 0xa6, 0xa7, 0x25, 0x1c, 0x1e, 0x72, 0x91,
     0x6d, 0x11, 0xc2, 0xcb, 0x21, 0x4d, 0x3c, 0x25,
     0x25, 0x39, 0x12, 0x1d, 0x8e, 0x23, 0x4e, 0x65,
     0x2d, 0x65, 0x1f, 0xa4, 0xc8, 0xcf, 0xf8, 0x80,
   };
 
   static const byte nacl_msg[131] = {
     0x8e, 0x99, 0x3b, 0x9f, 0x48, 0x68, 0x12, 0x73,
     0xc2, 0x96, 0x50, 0xba, 0x32, 0xfc, 0x76, 0xce,
     0x48, 0x33, 0x2e, 0xa7, 0x16, 0x4d, 0x96, 0xa4,
     0x47, 0x6f, 0xb8, 0xc5, 0x31, 0xa1, 0x18, 0x6a,
     0xc0, 0xdf, 0xc1, 0x7c, 0x98, 0xdc, 0xe8, 0x7b,
     0x4d, 0xa7, 0xf0, 0x11, 0xec, 0x48, 0xc9, 0x72,
     0x71, 0xd2, 0xc2, 0x0f, 0x9b, 0x92, 0x8f, 0xe2,
     0x27, 0x0d, 0x6f, 0xb8, 0x63, 0xd5, 0x17, 0x38,
     0xb4, 0x8e, 0xee, 0xe3, 0x14, 0xa7, 0xcc, 0x8a,
     0xb9, 0x32, 0x16, 0x45, 0x48, 0xe5, 0x26, 0xae,
     0x90, 0x22, 0x43, 0x68, 0x51, 0x7a, 0xcf, 0xea,
     0xbd, 0x6b, 0xb3, 0x73, 0x2b, 0xc0, 0xe9, 0xda,
     0x99, 0x83, 0x2b, 0x61, 0xca, 0x01, 0xb6, 0xde,
     0x56, 0x24, 0x4a, 0x9e, 0x88, 0xd5, 0xf9, 0xb3,
     0x79, 0x73, 0xf6, 0x22, 0xa4, 0x3d, 0x14, 0xa6,
     0x59, 0x9b, 0x1f, 0x65, 0x4c, 0xb4, 0x5a, 0x74,
     0xe3, 0x55, 0xa5
   };
 
   static const byte nacl_mac[16] = {
     0xf3, 0xff, 0xc7, 0x70, 0x3f, 0x94, 0x00, 0xe5,
     0x2a, 0x7d, 0xfb, 0x4b, 0x3d, 0x33, 0x05, 0xd9
   };
 
   /* generates a final value of (2^130 - 2) == 3 */
   static const byte wrap_key[POLY1305_KEYLEN] = {
     0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   };
 
   static const byte wrap_msg[16] = {
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
   };
 
   static const byte wrap_mac[16] = {
     0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   };
 
   /* mac of the macs of messages of length 0 to 256, where the key and messages
    * have all their values set to the length
    */
   static const byte total_key[POLY1305_KEYLEN] = {
     0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
     0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
   };
 
   static const byte total_mac[16] = {
     0x64, 0xaf, 0xe2, 0xe8, 0xd6, 0xad, 0x7b, 0xbd,
     0xd2, 0x87, 0xf9, 0x7c, 0x44, 0x62, 0x3d, 0x39
   };
 
   poly1305_context_t ctx;
   poly1305_context_t total_ctx;
   byte all_key[POLY1305_KEYLEN];
   byte all_msg[256];
   byte mac[16];
   size_t i, j;
 
   memset (&ctx, 0, sizeof (ctx));
   memset (&total_ctx, 0, sizeof (total_ctx));
 
   memset (mac, 0, sizeof (mac));
   poly1305_auth (mac, nacl_msg, sizeof (nacl_msg), nacl_key);
   if (memcmp (nacl_mac, mac, sizeof (nacl_mac)) != 0)
     return "Poly1305 test 1 failed.";
 
   /* SSE2/AVX have a 32 byte block size, but also support 64 byte blocks, so
    * make sure everything still works varying between them */
   memset (mac, 0, sizeof (mac));
   _gcry_poly1305_init (&ctx, nacl_key, POLY1305_KEYLEN);
   _gcry_poly1305_update (&ctx, nacl_msg + 0, 32);
   _gcry_poly1305_update (&ctx, nacl_msg + 32, 64);
   _gcry_poly1305_update (&ctx, nacl_msg + 96, 16);
   _gcry_poly1305_update (&ctx, nacl_msg + 112, 8);
   _gcry_poly1305_update (&ctx, nacl_msg + 120, 4);
   _gcry_poly1305_update (&ctx, nacl_msg + 124, 2);
   _gcry_poly1305_update (&ctx, nacl_msg + 126, 1);
   _gcry_poly1305_update (&ctx, nacl_msg + 127, 1);
   _gcry_poly1305_update (&ctx, nacl_msg + 128, 1);
   _gcry_poly1305_update (&ctx, nacl_msg + 129, 1);
   _gcry_poly1305_update (&ctx, nacl_msg + 130, 1);
   _gcry_poly1305_finish (&ctx, mac);
   if (memcmp (nacl_mac, mac, sizeof (nacl_mac)) != 0)
     return "Poly1305 test 2 failed.";
 
   memset (mac, 0, sizeof (mac));
   poly1305_auth (mac, wrap_msg, sizeof (wrap_msg), wrap_key);
   if (memcmp (wrap_mac, mac, sizeof (nacl_mac)) != 0)
     return "Poly1305 test 3 failed.";
 
   _gcry_poly1305_init (&total_ctx, total_key, POLY1305_KEYLEN);
   for (i = 0; i < 256; i++)
     {
       /* set key and message to 'i,i,i..' */
       for (j = 0; j < sizeof (all_key); j++)
 	all_key[j] = i;
       for (j = 0; j < i; j++)
 	all_msg[j] = i;
       poly1305_auth (mac, all_msg, i, all_key);
       _gcry_poly1305_update (&total_ctx, mac, 16);
     }
   _gcry_poly1305_finish (&total_ctx, mac);
   if (memcmp (total_mac, mac, sizeof (total_mac)) != 0)
     return "Poly1305 test 4 failed.";
 
   return NULL;
 }