diff --git a/src/3rdparty/monocypher/AUTHORS.md b/src/3rdparty/monocypher/AUTHORS.md
index 007c4c6f8c..65e2c01685 100644
--- a/src/3rdparty/monocypher/AUTHORS.md
+++ b/src/3rdparty/monocypher/AUTHORS.md
@@ -1,21 +1,21 @@
 Designers
 ---------
-- **Chacha20:** Daniel J. Bernstein.
+- **ChaCha20:** Daniel J. Bernstein.
 - **Poly1305:** Daniel J. Bernstein.
-- **Blake2:** Jean-Philippe Aumasson, Christian Winnerlein, Samuel Neves,
-  and Zooko Wilcox-O'Hearn
-- **Argon2:** Alex Biryukov, Daniel Dinu, and Dmitry Khovratovich
-- **X25519:** Daniel J. Bernstein
+- **BLAKE2:** Jean-Philippe Aumasson, Christian Winnerlein, Samuel Neves,
+  and Zooko Wilcox-O'Hearn.
+- **Argon2:** Alex Biryukov, Daniel Dinu, and Dmitry Khovratovich.
+- **X25519:** Daniel J. Bernstein.
 - **EdDSA:** Daniel J. Bernstein, Bo-Yin Yang, Niels Duif, Peter
-  Schwabe, and Tanja Lange
+  Schwabe, and Tanja Lange.
 
 Implementors
 ------------
-- **Chacha20:** Loup Vaillant, implemented from spec.
+- **ChaCha20:** Loup Vaillant, implemented from spec.
 - **Poly1305:** Loup Vaillant, implemented from spec.
-- **Blake2b:** Loup Vaillant, implemented from spec.
+- **BLAKE2b:** Loup Vaillant, implemented from spec.
 - **Argon2i:** Loup Vaillant, implemented from spec.
 - **X25519:** Daniel J. Bernstein, taken and packaged from SUPERCOP
   ref10.
@@ -42,7 +42,7 @@ Thanks
 Fabio Scotoni provided much needed advice about testing, interface,
 packaging, and the general direction of the whole project. He also
-redesigned monocypher.org style sheets.
+redesigned the monocypher.org style sheets.
 
 Mike Pechkin and André Maroneze found bugs in earlier versions of
 Monocypher.
@@ -55,3 +55,9 @@
 Mike Hamburg explained comb algorithms, including the signed
 all-bits-set comb described in his 2012 paper, Fast and compact
 elliptic-curve cryptography. This made EdDSA signatures over twice as
 fast.
+
+Samuel Lucas found many typos in both the manual and the website.
+
+Jens Alfke added some #ifdefs that enabled Monocypher to compile into
+a C++ namespace, preventing symbol collisions with similarly-named
+functions in other crypto libraries.
diff --git a/src/3rdparty/monocypher/CHANGELOG.md b/src/3rdparty/monocypher/CHANGELOG.md
index ddbf926b64..fb5d3a5159 100644
--- a/src/3rdparty/monocypher/CHANGELOG.md
+++ b/src/3rdparty/monocypher/CHANGELOG.md
@@ -1,3 +1,18 @@
+3.1.3
+-----
+2022/04/25
+
+- Fixed many typos in the documentation.
+- Fixed buffer overflow in speed benchmarks.
+- Fixed some MSVC warnings.
+- Fixed a minor violation of the Elligator2 reverse map specs.
+- Added `change-prefix.sh` to help changing the `crypto_` prefix.
+- Added the `MONOCYPHER_CPP_NAMESPACE` preprocessor definition to
+  support namespaces for C++.
+- Deprecated `crypto_key_exchange()`.
+- Use GitHub Actions to automate the regular test suite.
+
+
 3.1.2
 -----
 2020/12/27
@@ -32,7 +47,7 @@
 - Added Elligator 2 mappings (hash to curve, curve to hash).
 - Added OPRF support (with scalar inversion).
-- Added Edwards25519 -> Curve25519 conversions
+- Added Edwards25519 -> Curve25519 conversions.
 
 
 3.0.0
@@ -47,8 +62,8 @@
 - `crypto_x25519()` and `crypto_key_exchange()` now return `void`.
 - Added a custom hash interface to EdDSA. Several instances of EdDSA
   can share the same binary.
-- Added optional support for HMAC SHA-512
-- Moved all SHA-512 operations to `src/optional/monocypher-ed25519.(h|c)`
+- Added optional support for HMAC SHA-512.
+- Moved SHA-512 operations to `src/optional/monocypher-ed25519.(h|c)`.
 - Optional support for Ed25519 no longer requires a preprocessor flag.
Add `src/optional/monocypher-ed25519.(h|c)` to your project instead. @@ -57,9 +72,9 @@ ----- 2019/10/21 -- Added the `BLAKE2_NO_UNROLLING` preprocessor definition. Activating it - makes the binary about 5KB smaller, and speeds up processing times on - many embedded processors. +- Added the `BLAKE2_NO_UNROLLING` preprocessor definition. Activating + it makes the binary about 5KB smaller and speeds up processing times + on many embedded processors. - Reduced the stack usage of signature verification by about 40%. Signature verification now fits in smaller machines. - Fixed many implicit casts warnings. @@ -78,8 +93,8 @@ ----- 2018/06/24 -- Corrected a critical vulnerability in EdDSA, where crypto_check() was - accepting invalid signatures. (Found by Mike Pechkin.) The current +- Corrected a critical vulnerability found by Mike Pechkin in EdDSA, + where crypto_check() was accepting invalid signatures. The current fix removes a buggy optimisation, effectively halving the performance of EdDSA. - The test suite no longer tries to allocate zero bytes (some platforms @@ -89,8 +104,8 @@ ----- 2018/06/16 -- Corrected undefined behaviour in Blake2b -- Improved the test suite (faster, better coverage) +- Corrected undefined behaviour in BLAKE2b. +- Improved the test suite (faster, better coverage). 2.0.2 ----- @@ -118,13 +133,13 @@ - Changed the authenticated encryption format. It now conforms to RFC 7539, with one exception: it uses XChacha20 initialisation instead of the IETF version of Chacha20. This new format conforms to - Libsodium's `crypto_aead_xchacha20poly1305_ietf_encrypt`. + libsodium's `crypto_aead_xchacha20poly1305_ietf_encrypt`. - Removed `crypto_lock_encrypt()` and `crypto_lock_auth()`. - Renamed `crypto_lock_aead_auth()` to `crypto_lock_auth_ad()`. - Renamed `crypto_unlock_aead_auth()` to `crypto_unlock_auth_ad()`. -- Added `crypto_lock_auth_message()` and `crypto_unlock_auth_message()` -- Renamed `crypto_aead_lock` to `crypto_lock_aead`; -- Renamed `crypto_aead_unlock` to `crypto_unlock_aead`; +- Added `crypto_lock_auth_message()` and `crypto_unlock_auth_message()`. +- Renamed `crypto_aead_lock` to `crypto_lock_aead`. +- Renamed `crypto_aead_unlock` to `crypto_unlock_aead`. The format change facilitates optimisation by aligning data to block boundaries. The API changes increase consistency. @@ -144,8 +159,8 @@ boundaries. The API changes increase consistency. 2017/07/23 - Optimised the loading and unloading code of the symmetric crypto - (Blake2b, sha512, Chacha20, and Poly1305). -- Fused self contained tests together for easier analysis with Frama-C + (BLAKE2b, SHA-512, Chacha20, and Poly1305). +- Fused self-contained tests together for easier analysis with Frama-C and the TIS interpreter. 1.0 @@ -164,7 +179,7 @@ boundaries. The API changes increase consistency. - Added about a hundred lines of code to improve performance of public key cryptography. Diffie-Hellman is now 20% faster than before. - (The effects are less pronounces for EdDSA). + The effects are less pronounced for EdDSA. - Added random self-consistency tests. - Added a speed benchmark against libsodium. @@ -175,30 +190,30 @@ boundaries. The API changes increase consistency. - Slightly changed the authenticated encryption API. Functions are now all in "detached" mode. The reason is better support for authenticated encryption _without_ additional data. -- Rewrote Blake2b from spec, so it can use the same licence as +- Rewrote BLAKE2b from spec so it can use the same licence as everything else. 
- Added random tests that compare Monocypher with libsodium and ed25519-donna. - Added explicit support for Frama-C analysis (this doesn't affect the - source code) + source code). 0.6 --- 2017/03/17 -- Fixed incorrect poly1305 output on empty messages. (Found by Mike +- Fixed incorrect Poly1305 output on empty messages. (Found by Mike Pechkin.) 0.5 --- 2017/03/10 -- Fixed many undefined behaviours in curve25519, that occur whenever +- Fixed many undefined behaviours in Curve25519 that occur whenever we perform a left shift on a signed negative integer. It doesn't affect the generated code, but you never know. (Found with Frama-C by André Maroneze.) -Fun fact: TweetNaCl and ref10 have the same bug. Libsodium have +Fun fact: TweetNaCl and ref10 have the same bug. Libsodium has corrected the issue, though. For those who don't comprehend the magnitude of this madness, the @@ -210,10 +225,10 @@ section 6.5.7(§4) of the C11 standard. 2017/03/09 - Fixed critical bug causing Argon2i to fail whenever it uses more - than 512 blocks. It was reading uninitialised memory, and the + than 512 blocks. It was reading uninitialised memory and the results were incorrect. (Found by Mike Pechkin.) -- Fixed an undefined behaviour in curve25519 (`fe_tobytes()`). It was - accessing uninitialised memory, before throwing it away. It didn't +- Fixed an undefined behaviour in Curve25519 (`fe_tobytes()`). It was + accessing uninitialised memory before throwing it away. It didn't affect the compiled code nor the results, but you never know. (Found with [Frama-C](http://frama-c.com) by André Maroneze.) @@ -221,10 +236,10 @@ section 6.5.7(§4) of the C11 standard. --- 2017/02/27 -- Got the invariants of poly1305 right, put them in the comments. +- Got the invariants of Poly1305 right and put them in the comments. There was no bug, but that was lucky (turned out the IETF test vectors were designed to trigger the bugs I was afraid of). -- Simplified poly1305 finalisation (replaced conditional subtraction +- Simplified Poly1305 finalisation (replaced conditional subtraction by a carry propagation). - Made a few cosmetic changes here and there. @@ -235,9 +250,9 @@ section 6.5.7(§4) of the C11 standard. - Public interface significantly reworked. Removed redundant, hard to mess up constructions. - Added AEAD. -- Sped up curve25519 by a factor of more than 6 (switched to ref10 - arithmetic) -- Added various test vectors, completed the consistency tests. +- Sped up Curve25519 by a factor of more than 6 (switched to ref10 + arithmetic). +- Added various test vectors and completed the consistency tests. 0.1 --- diff --git a/src/3rdparty/monocypher/README.md b/src/3rdparty/monocypher/README.md index 15e6b85be1..9b168e70d2 100644 --- a/src/3rdparty/monocypher/README.md +++ b/src/3rdparty/monocypher/README.md @@ -3,14 +3,37 @@ Monocypher Monocypher is an easy to use, easy to deploy, auditable crypto library written in portable C. It approaches the size of [TweetNaCl][] and the -speed of [Libsodium][]. +speed of [libsodium][]. [Official site.](https://monocypher.org/) [Official releases.](https://monocypher.org/download/) -[Libsodium]: https://libsodium.org +[libsodium]: https://libsodium.org [TweetNaCl]: https://tweetnacl.cr.yp.to/ + +Features +-------- + +- [Authenticated Encryption][AEAD] with XChaCha20 and Poly1305 + (RFC 8439). +- [Hashing][HASH] with BLAKE2b. +- [Password Hashing][PWH] with Argon2i. +- [Public Key Cryptography][PKC] with X25519 (key exchange). +- [Public Key Signatures][PKS] with EdDSA (RFC 8032) and Ed25519. 
+- [Steganography support][STEG] with Elligator 2.
+- [OPRF and PAKE support][PAKE] with Elligator 2 and scalar
+  inversion.
+
+[AEAD]: https://monocypher.org/manual/aead
+[HASH]: https://monocypher.org/manual/hash
+[PWH]: https://monocypher.org/manual/argon2i
+[PKC]: https://monocypher.org/manual/key_exchange
+[PKS]: https://monocypher.org/manual/sign
+[STEG]: https://monocypher.org/manual/advanced/elligator
+[PAKE]: https://monocypher.org/manual/advanced/x25519_inverse
+
+
 Manual
 ------
@@ -123,14 +146,14 @@ Speed benchmark
 
 This will give you an idea how fast Monocypher is on your machine. Make
 sure you run it on the target platform if performance is a concern. If
-Monocypher is too slow, try Libsodium. If you're not sure, you can
+Monocypher is too slow, try libsodium. If you're not sure, you can
 always switch later.
 
 Note: the speed benchmark currently requires the POSIX
 `clock_gettime()` function.
 
-There are similar benchmarks for Libsodium, TweetNaCl, LibHydrogen,
+There are similar benchmarks for libsodium, TweetNaCl, LibHydrogen,
 c25519, and ed25519-donna (the portable, 32-bit version):
 
    $ make speed-sodium
@@ -169,10 +192,23 @@ is activated by compiling monocypher.c with the
 `-DBLAKE2_NO_UNROLLING` option.
 
 The `-DBLAKE2_NO_UNROLLING` option is a performance tweak. By default,
-Monocypher unrolls the Blake2b inner loop, because doing so is over 25%
+Monocypher unrolls the BLAKE2b inner loop, because doing so is over 25%
 faster on modern processors. Some embedded processors however, run the
 unrolled loop _slower_ (possibly because of the cost of fetching 5KB of
 additional code). If you're using an embedded platform, try this
 option. The binary will be about 5KB smaller, and in some cases faster.
 
+The `MONOCYPHER_CPP_NAMESPACE` preprocessor definition allows C++ users
+who compile Monocypher as C++ to wrap it in a namespace. When it is not
+defined (the default), we assume Monocypher is compiled as C, and an
+`extern "C"` declaration is added when we detect that the header is
+included in C++ code.
+
+The `change-prefix.sh` script can rename all functions by replacing
+"crypto_" with a chosen prefix, so you can avoid name clashes. For
+instance, the following command changes all instances of "crypto_" to
+"foobar_" (note the absence of the underscore):
+
+   ./change-prefix.sh foobar
+
diff --git a/src/3rdparty/monocypher/monocypher.c b/src/3rdparty/monocypher/monocypher.c
index 4d5162e3fe..bd73306f9f 100644
--- a/src/3rdparty/monocypher/monocypher.c
+++ b/src/3rdparty/monocypher/monocypher.c
@@ -1,4 +1,4 @@
-// Monocypher version 3.1.2
+// Monocypher version 3.1.3
 //
 // This file is dual-licensed. Choose whichever licence you want from
 // the two licences listed below.
@@ -53,13 +53,17 @@
 
 #include "monocypher.h"
 
+#ifdef MONOCYPHER_CPP_NAMESPACE
+namespace MONOCYPHER_CPP_NAMESPACE {
+#endif
+
 /////////////////
 /// Utilities ///
 /////////////////
 #define FOR_T(type, i, start, end) for (type i = (start); i < (end); i++)
 #define FOR(i, start, end) FOR_T(size_t, i, start, end)
-#define COPY(dst, src, size) FOR(i, 0, size) (dst)[i] = (src)[i]
-#define ZERO(buf, size) FOR(i, 0, size) (buf)[i] = 0
+#define COPY(dst, src, size) FOR(i__, 0, size) (dst)[i__] = (src)[i__]
+#define ZERO(buf, size) FOR(i__, 0, size) (buf)[i__] = 0
 #define WIPE_CTX(ctx) crypto_wipe(ctx , sizeof(*(ctx)))
 #define WIPE_BUFFER(buffer) crypto_wipe(buffer, sizeof(buffer))
 #define MIN(a, b) ((a) <= (b) ?
(a) : (b)) @@ -192,19 +196,17 @@ static void chacha20_rounds(u32 out[16], const u32 in[16]) out[12] = t12; out[13] = t13; out[14] = t14; out[15] = t15; } -static void chacha20_init_key(u32 block[16], const u8 key[32]) -{ - load32_le_buf(block , (const u8*)"expand 32-byte k", 4); // constant - load32_le_buf(block+4, key , 8); // key -} +const u8 *chacha20_constant = (const u8*)"expand 32-byte k"; // 16 bytes void crypto_hchacha20(u8 out[32], const u8 key[32], const u8 in [16]) { u32 block[16]; - chacha20_init_key(block, key); - // input - load32_le_buf(block + 12, in, 4); + load32_le_buf(block , chacha20_constant, 4); + load32_le_buf(block + 4, key , 8); + load32_le_buf(block + 12, in , 4); + chacha20_rounds(block, block); + // prevent reversal of the rounds by revealing only half of the buffer. store32_le_buf(out , block , 4); // constant store32_le_buf(out+16, block+12, 4); // counter and nonce @@ -216,10 +218,11 @@ u64 crypto_chacha20_ctr(u8 *cipher_text, const u8 *plain_text, u64 ctr) { u32 input[16]; - chacha20_init_key(input, key); + load32_le_buf(input , chacha20_constant, 4); + load32_le_buf(input + 4, key , 8); + load32_le_buf(input + 14, nonce , 2); input[12] = (u32) ctr; input[13] = (u32)(ctr >> 32); - load32_le_buf(input+14, nonce, 2); // Whole blocks u32 pool[16]; @@ -316,18 +319,21 @@ void crypto_xchacha20(u8 *cipher_text, const u8 *plain_text, size_t text_size, // h = (h + c) * r // preconditions: // ctx->h <= 4_ffffffff_ffffffff_ffffffff_ffffffff -// ctx->c <= 1_ffffffff_ffffffff_ffffffff_ffffffff // ctx->r <= 0ffffffc_0ffffffc_0ffffffc_0fffffff +// end <= 1 // Postcondition: // ctx->h <= 4_ffffffff_ffffffff_ffffffff_ffffffff -static void poly_block(crypto_poly1305_ctx *ctx) +static void poly_block(crypto_poly1305_ctx *ctx, const u8 in[16], unsigned end) { + u32 s[4]; + load32_le_buf(s, in, 4); + // s = h + c, without carry propagation - const u64 s0 = ctx->h[0] + (u64)ctx->c[0]; // s0 <= 1_fffffffe - const u64 s1 = ctx->h[1] + (u64)ctx->c[1]; // s1 <= 1_fffffffe - const u64 s2 = ctx->h[2] + (u64)ctx->c[2]; // s2 <= 1_fffffffe - const u64 s3 = ctx->h[3] + (u64)ctx->c[3]; // s3 <= 1_fffffffe - const u32 s4 = ctx->h[4] + ctx->c[4]; // s4 <= 5 + const u64 s0 = ctx->h[0] + (u64)s[0]; // s0 <= 1_fffffffe + const u64 s1 = ctx->h[1] + (u64)s[1]; // s1 <= 1_fffffffe + const u64 s2 = ctx->h[2] + (u64)s[2]; // s2 <= 1_fffffffe + const u64 s3 = ctx->h[3] + (u64)s[3]; // s3 <= 1_fffffffe + const u32 s4 = ctx->h[4] + end; // s4 <= 5 // Local all the things! 
const u32 r0 = ctx->r[0]; // r0 <= 0fffffff @@ -362,40 +368,10 @@ static void poly_block(crypto_poly1305_ctx *ctx) ctx->h[4] = (u32)u4; // u4 <= 4 } -// (re-)initialises the input counter and input buffer -static void poly_clear_c(crypto_poly1305_ctx *ctx) -{ - ZERO(ctx->c, 4); - ctx->c_idx = 0; -} - -static void poly_take_input(crypto_poly1305_ctx *ctx, u8 input) -{ - size_t word = ctx->c_idx >> 2; - size_t byte = ctx->c_idx & 3; - ctx->c[word] |= (u32)input << (byte * 8); - ctx->c_idx++; -} - -static void poly_update(crypto_poly1305_ctx *ctx, - const u8 *message, size_t message_size) -{ - FOR (i, 0, message_size) { - poly_take_input(ctx, message[i]); - if (ctx->c_idx == 16) { - poly_block(ctx); - poly_clear_c(ctx); - } - } -} - void crypto_poly1305_init(crypto_poly1305_ctx *ctx, const u8 key[32]) { - // Initial hash is zero - ZERO(ctx->h, 5); - // add 2^130 to every input block - ctx->c[4] = 1; - poly_clear_c(ctx); + ZERO(ctx->h, 5); // Initial hash is zero + ctx->c_idx = 0; // load r and pad (r has some of its bits cleared) load32_le_buf(ctx->r , key , 4); load32_le_buf(ctx->pad, key+16, 4); @@ -406,41 +382,45 @@ void crypto_poly1305_init(crypto_poly1305_ctx *ctx, const u8 key[32]) void crypto_poly1305_update(crypto_poly1305_ctx *ctx, const u8 *message, size_t message_size) { - if (message_size == 0) { - return; - } // Align ourselves with block boundaries size_t aligned = MIN(align(ctx->c_idx, 16), message_size); - poly_update(ctx, message, aligned); - message += aligned; - message_size -= aligned; + FOR (i, 0, aligned) { + ctx->c[ctx->c_idx] = *message; + ctx->c_idx++; + message++; + message_size--; + } + + // If block is complete, process it + if (ctx->c_idx == 16) { + poly_block(ctx, ctx->c, 1); + ctx->c_idx = 0; + } // Process the message block by block size_t nb_blocks = message_size >> 4; FOR (i, 0, nb_blocks) { - load32_le_buf(ctx->c, message, 4); - poly_block(ctx); + poly_block(ctx, message, 1); message += 16; } - if (nb_blocks > 0) { - poly_clear_c(ctx); - } message_size &= 15; - // remaining bytes - poly_update(ctx, message, message_size); + // remaining bytes (we never complete a block here) + FOR (i, 0, message_size) { + ctx->c[ctx->c_idx] = message[i]; + ctx->c_idx++; + } } void crypto_poly1305_final(crypto_poly1305_ctx *ctx, u8 mac[16]) { // Process the last block (if any) + // We move the final 1 according to remaining input length + // (this will add less than 2^130 to the last input block) if (ctx->c_idx != 0) { - // move the final 1 according to remaining input length - // (We may add less than 2^130 to the last input block) - ctx->c[4] = 0; - poly_take_input(ctx, 1); - // one last hash update - poly_block(ctx); + ZERO(ctx->c + ctx->c_idx, 16 - ctx->c_idx); + ctx->c[ctx->c_idx] = 1; + poly_block(ctx, ctx->c, 0); } // check if we should subtract 2^130-5 by performing the @@ -471,7 +451,7 @@ void crypto_poly1305(u8 mac[16], const u8 *message, } //////////////// -/// Blake2 b /// +/// BLAKE2 b /// //////////////// static const u64 iv[8] = { 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, @@ -480,17 +460,6 @@ static const u64 iv[8] = { 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, }; -// increment the input offset -static void blake2b_incr(crypto_blake2b_ctx *ctx) -{ - u64 *x = ctx->input_offset; - size_t y = ctx->input_idx; - x[0] += y; - if (x[0] < y) { - x[1]++; - } -} - static void blake2b_compress(crypto_blake2b_ctx *ctx, int is_last_block) { static const u8 sigma[12][16] = { @@ -508,6 +477,14 @@ static void blake2b_compress(crypto_blake2b_ctx *ctx, int is_last_block) { 14, 10, 4, 
8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, }; + // increment input offset + u64 *x = ctx->input_offset; + size_t y = ctx->input_idx; + x[0] += y; + if (x[0] < y) { + x[1]++; + } + // init work vector u64 v0 = ctx->hash[0]; u64 v8 = iv[0]; u64 v1 = ctx->hash[1]; u64 v9 = iv[1]; @@ -560,26 +537,6 @@ static void blake2b_set_input(crypto_blake2b_ctx *ctx, u8 input, size_t index) size_t word = index >> 3; size_t byte = index & 7; ctx->input[word] |= (u64)input << (byte << 3); - -} - -static void blake2b_end_block(crypto_blake2b_ctx *ctx) -{ - if (ctx->input_idx == 128) { // If buffer is full, - blake2b_incr(ctx); // update the input offset - blake2b_compress(ctx, 0); // and compress the (not last) block - ctx->input_idx = 0; - } -} - -static void blake2b_update(crypto_blake2b_ctx *ctx, - const u8 *message, size_t message_size) -{ - FOR (i, 0, message_size) { - blake2b_end_block(ctx); - blake2b_set_input(ctx, message[i], ctx->input_idx); - ctx->input_idx++; - } } void crypto_blake2b_general_init(crypto_blake2b_ctx *ctx, size_t hash_size, @@ -612,26 +569,39 @@ void crypto_blake2b_init(crypto_blake2b_ctx *ctx) void crypto_blake2b_update(crypto_blake2b_ctx *ctx, const u8 *message, size_t message_size) { - if (message_size == 0) { - return; - } // Align ourselves with block boundaries + // The block that may result is not compressed yet size_t aligned = MIN(align(ctx->input_idx, 128), message_size); - blake2b_update(ctx, message, aligned); - message += aligned; - message_size -= aligned; + FOR (i, 0, aligned) { + blake2b_set_input(ctx, *message, ctx->input_idx); + ctx->input_idx++; + message++; + message_size--; + } // Process the message block by block - FOR (i, 0, message_size >> 7) { // number of blocks - blake2b_end_block(ctx); + // The last block is not compressed yet. + size_t nb_blocks = message_size >> 7; + FOR (i, 0, nb_blocks) { + if (ctx->input_idx == 128) { + blake2b_compress(ctx, 0); + } load64_le_buf(ctx->input, message, 16); message += 128; ctx->input_idx = 128; } message_size &= 127; - // remaining bytes - blake2b_update(ctx, message, message_size); + // Fill remaining bytes (not the whole buffer) + // The last block is never fully filled + FOR (i, 0, message_size) { + if (ctx->input_idx == 128) { + blake2b_compress(ctx, 0); + ctx->input_idx = 0; + } + blake2b_set_input(ctx, message[i], ctx->input_idx); + ctx->input_idx++; + } } void crypto_blake2b_final(crypto_blake2b_ctx *ctx, u8 *hash) @@ -640,7 +610,6 @@ void crypto_blake2b_final(crypto_blake2b_ctx *ctx, u8 *hash) FOR (i, ctx->input_idx, 128) { blake2b_set_input(ctx, 0, i); } - blake2b_incr(ctx); // update the input offset blake2b_compress(ctx, 1); // compress the last block size_t nb_words = ctx->hash_size >> 3; store64_le_buf(hash, ctx->hash, nb_words); @@ -696,7 +665,7 @@ static void wipe_block(block *b) ZERO(a, 128); } -// updates a Blake2 hash with a 32 bit word, little endian. +// updates a BLAKE2 hash with a 32 bit word, little endian. 
static void blake_update_32(crypto_blake2b_ctx *ctx, u32 input)
{
 u8 buf[4];
@@ -792,37 +761,6 @@
 }
 }
 
-// The compression function G (copy version for the first pass)
-static void g_copy(block *result, const block *x, const block *y, block* tmp)
-{
- copy_block(tmp , x ); // tmp = X
- xor_block (tmp , y ); // tmp = X ^ Y = R
- copy_block(result, tmp); // result = R (only difference with g_xor)
- g_rounds (tmp); // tmp = Z
- xor_block (result, tmp); // result = R ^ Z
-}
-
-// The compression function G (xor version for subsequent passes)
-static void g_xor(block *result, const block *x, const block *y, block *tmp)
-{
- copy_block(tmp , x ); // tmp = X
- xor_block (tmp , y ); // tmp = X ^ Y = R
- xor_block (result, tmp); // result = R ^ old (only difference with g_copy)
- g_rounds (tmp); // tmp = Z
- xor_block (result, tmp); // result = R ^ old ^ Z
-}
-
-// Unary version of the compression function.
-// The missing argument is implied zero.
-// Does the transformation in place.
-static void unary_g(block *work_block, block *tmp)
-{
- // work_block == R
- copy_block(tmp, work_block); // tmp = R
- g_rounds (work_block); // work_block = Z
- xor_block (work_block, tmp); // work_block = Z ^ R
-}
-
 // Argon2i uses a kind of stream cipher to determine which reference
 // block it will take to synthesise the next block. This context holds
 // that stream's state. (It's very similar to Chacha20. The block b
@@ -856,8 +794,12 @@ static void gidx_refresh(gidx_ctx *ctx)
 // Shuffle the block thus: ctx->b = G((G(ctx->b, zero)), zero)
 // (G "square" function), to get cheap pseudo-random numbers.
 block tmp;
- unary_g(&ctx->b, &tmp);
- unary_g(&ctx->b, &tmp);
+ copy_block(&tmp, &ctx->b);
+ g_rounds (&ctx->b);
+ xor_block (&ctx->b, &tmp);
+ copy_block(&tmp, &ctx->b);
+ g_rounds (&ctx->b);
+ xor_block (&ctx->b, &tmp);
 wipe_block(&tmp);
 }
@@ -954,22 +896,18 @@ void crypto_argon2i_general(u8 *hash, u32 hash_size,
 crypto_blake2b_final(&ctx, initial_hash);
 
 // fill first 2 blocks
- block tmp_block;
- u8 hash_area[1024];
+ u8 hash_area[1024];
 store32_le(initial_hash + 64, 0); // first additional word
 store32_le(initial_hash + 68, 0); // second additional word
 extended_hash(hash_area, 1024, initial_hash, 72);
- load_block(&tmp_block, hash_area);
- copy_block(blocks, &tmp_block);
+ load_block(blocks, hash_area);
 
 store32_le(initial_hash + 64, 1); // slight modification
 extended_hash(hash_area, 1024, initial_hash, 72);
- load_block(&tmp_block, hash_area);
- copy_block(blocks + 1, &tmp_block);
+ load_block(blocks + 1, hash_area);
 
 WIPE_BUFFER(initial_hash);
 WIPE_BUFFER(hash_area);
- wipe_block(&tmp_block);
 }
 
 // Actual number of blocks
@@ -992,15 +930,19 @@ void crypto_argon2i_general(u8 *hash, u32 hash_size,
 u32 segment_start = segment * segment_size + start_offset;
 u32 segment_end = (segment + 1) * segment_size;
 FOR_T (u32, current_block, segment_start, segment_end) {
- u32 reference_block = gidx_next(&ctx);
- u32 previous_block = current_block == 0
- ? nb_blocks - 1
- : current_block - 1;
- block *c = blocks + current_block;
- block *p = blocks + previous_block;
- block *r = blocks + reference_block;
- if (first_pass) { g_copy(c, p, r, &tmp); }
- else { g_xor (c, p, r, &tmp); }
+ block *reference = blocks + gidx_next(&ctx);
+ block *current = blocks + current_block;
+ block *previous = current_block == 0
+ ? blocks + nb_blocks - 1
+ : blocks + current_block - 1;
+ // Apply the compression function G,
+ // and copy it (or XOR it) to the current block.
+ copy_block(&tmp, previous); + xor_block (&tmp, reference); + if (first_pass) { copy_block(current, &tmp); } + else { xor_block (current, &tmp); } + g_rounds (&tmp); + xor_block (current, &tmp); } } } @@ -1212,8 +1154,15 @@ static void fe_ccopy(fe f, const fe g, int b) h[0]=(i32)t0; h[1]=(i32)t1; h[2]=(i32)t2; h[3]=(i32)t3; h[4]=(i32)t4; \ h[5]=(i32)t5; h[6]=(i32)t6; h[7]=(i32)t7; h[8]=(i32)t8; h[9]=(i32)t9 -static void fe_frombytes(fe h, const u8 s[32]) +// Decodes a field element from a byte buffer. +// mask specifies how many bits we ignore. +// Traditionally we ignore 1. It's useful for EdDSA, +// which uses that bit to denote the sign of x. +// Elligator however uses positive representatives, +// which means ignoring 2 bits instead. +static void fe_frombytes_mask(fe h, const u8 s[32], unsigned nb_mask) { + i32 mask = 0xffffff >> nb_mask; i64 t0 = load32_le(s); // t0 < 2^32 i64 t1 = load24_le(s + 4) << 6; // t1 < 2^30 i64 t2 = load24_le(s + 7) << 5; // t2 < 2^29 @@ -1223,10 +1172,16 @@ static void fe_frombytes(fe h, const u8 s[32]) i64 t6 = load24_le(s + 20) << 7; // t6 < 2^31 i64 t7 = load24_le(s + 23) << 5; // t7 < 2^29 i64 t8 = load24_le(s + 26) << 4; // t8 < 2^28 - i64 t9 = (load24_le(s + 29) & 0x7fffff) << 2; // t9 < 2^25 - FE_CARRY; // Carry recondition OK + i64 t9 = (load24_le(s + 29) & mask) << 2; // t9 < 2^25 + FE_CARRY; // Carry precondition OK } +static void fe_frombytes(fe h, const u8 s[32]) +{ + fe_frombytes_mask(h, s, 1); +} + + // Precondition // |h[0]|, |h[2]|, |h[4]|, |h[6]|, |h[8]| < 1.1 * 2^25 // |h[1]|, |h[3]|, |h[5]|, |h[7]|, |h[9]| < 1.1 * 2^24 @@ -1404,65 +1359,6 @@ static void fe_sq(fe h, const fe f) FE_CARRY; } -// h = 2 * (f^2) -// -// Precondition -// ------------- -// |f0|, |f2|, |f4|, |f6|, |f8| < 1.65 * 2^26 -// |f1|, |f3|, |f5|, |f7|, |f9| < 1.65 * 2^25 -// -// Note: we could implement fe_sq2() by copying fe_sq(), multiplying -// each limb by 2, *then* perform the carry. This saves one carry. -// However, doing so with the stated preconditions does not work (t2 -// would overflow). There are 3 ways to solve this: -// -// 1. Show that t2 actually never overflows (it really does not). -// 2. Accept an additional carry, at a small lost of performance. -// 3. Make sure the input of fe_sq2() is freshly carried. -// -// SUPERCOP ref10 relies on (1). -// Monocypher chose (2) and (3), mostly to save code. 
-static void fe_sq2(fe h, const fe f) -{ - fe_sq(h, f); - fe_mul_small(h, h, 2); -} - -// This could be simplified, but it would be slower -static void fe_pow22523(fe out, const fe z) -{ - fe t0, t1, t2; - fe_sq(t0, z); - fe_sq(t1,t0); fe_sq(t1, t1); fe_mul(t1, z, t1); - fe_mul(t0, t0, t1); - fe_sq(t0, t0); fe_mul(t0, t1, t0); - fe_sq(t1, t0); FOR (i, 1, 5) fe_sq(t1, t1); fe_mul(t0, t1, t0); - fe_sq(t1, t0); FOR (i, 1, 10) fe_sq(t1, t1); fe_mul(t1, t1, t0); - fe_sq(t2, t1); FOR (i, 1, 20) fe_sq(t2, t2); fe_mul(t1, t2, t1); - fe_sq(t1, t1); FOR (i, 1, 10) fe_sq(t1, t1); fe_mul(t0, t1, t0); - fe_sq(t1, t0); FOR (i, 1, 50) fe_sq(t1, t1); fe_mul(t1, t1, t0); - fe_sq(t2, t1); FOR (i, 1, 100) fe_sq(t2, t2); fe_mul(t1, t2, t1); - fe_sq(t1, t1); FOR (i, 1, 50) fe_sq(t1, t1); fe_mul(t0, t1, t0); - fe_sq(t0, t0); FOR (i, 1, 2) fe_sq(t0, t0); fe_mul(out, t0, z); - WIPE_BUFFER(t0); - WIPE_BUFFER(t1); - WIPE_BUFFER(t2); -} - -// Inverting means multiplying by 2^255 - 21 -// 2^255 - 21 = (2^252 - 3) * 8 + 3 -// So we reuse the multiplication chain of fe_pow22523 -static void fe_invert(fe out, const fe z) -{ - fe tmp; - fe_pow22523(tmp, z); - // tmp2^8 * z^3 - fe_sq(tmp, tmp); // 0 - fe_sq(tmp, tmp); fe_mul(tmp, tmp, z); // 1 - fe_sq(tmp, tmp); fe_mul(out, tmp, z); // 1 - WIPE_BUFFER(tmp); -} - // Parity check. Returns 0 if even, 1 if odd static int fe_isodd(const fe f) { @@ -1487,9 +1383,9 @@ static int fe_isequal(const fe f, const fe g) } // Inverse square root. -// Returns true if x is a non zero square, false otherwise. +// Returns true if x is a square, false otherwise. // After the call: -// isr = sqrt(1/x) if x is non-zero square. +// isr = sqrt(1/x) if x is a non-zero square. // isr = sqrt(sqrt(-1)/x) if x is not a square. // isr = 0 if x is zero. // We do not guarantee the sign of the square root. @@ -1545,19 +1441,63 @@ static int fe_isequal(const fe f, const fe g) // x^((p-5)/8) * sqrt(-1) = -sqrt(sqrt(-1)/x) or sqrt(sqrt(-1)/x) static int invsqrt(fe isr, const fe x) { - fe check, quartic; - fe_copy(check, x); - fe_pow22523(isr, check); - fe_sq (quartic, isr); - fe_mul(quartic, quartic, check); + fe t0, t1, t2; + + // t0 = x^((p-5)/8) + // Can be achieved with a simple double & add ladder, + // but it would be slower. 
+ fe_sq(t0, x); + fe_sq(t1,t0); fe_sq(t1, t1); fe_mul(t1, x, t1); + fe_mul(t0, t0, t1); + fe_sq(t0, t0); fe_mul(t0, t1, t0); + fe_sq(t1, t0); FOR (i, 1, 5) fe_sq(t1, t1); fe_mul(t0, t1, t0); + fe_sq(t1, t0); FOR (i, 1, 10) fe_sq(t1, t1); fe_mul(t1, t1, t0); + fe_sq(t2, t1); FOR (i, 1, 20) fe_sq(t2, t2); fe_mul(t1, t2, t1); + fe_sq(t1, t1); FOR (i, 1, 10) fe_sq(t1, t1); fe_mul(t0, t1, t0); + fe_sq(t1, t0); FOR (i, 1, 50) fe_sq(t1, t1); fe_mul(t1, t1, t0); + fe_sq(t2, t1); FOR (i, 1, 100) fe_sq(t2, t2); fe_mul(t1, t2, t1); + fe_sq(t1, t1); FOR (i, 1, 50) fe_sq(t1, t1); fe_mul(t0, t1, t0); + fe_sq(t0, t0); FOR (i, 1, 2) fe_sq(t0, t0); fe_mul(t0, t0, x); + + // quartic = x^((p-1)/4) + i32 *quartic = t1; + fe_sq (quartic, t0); + fe_mul(quartic, quartic, x); + + i32 *check = t2; + fe_0 (check); int z0 = fe_isequal(x , check); fe_1 (check); int p1 = fe_isequal(quartic, check); fe_neg(check, check ); int m1 = fe_isequal(quartic, check); fe_neg(check, sqrtm1); int ms = fe_isequal(quartic, check); - fe_mul(check, isr, sqrtm1); - fe_ccopy(isr, check, m1 | ms); - WIPE_BUFFER(quartic); - WIPE_BUFFER(check); - return p1 | m1; + + // if quartic == -1 or sqrt(-1) + // then isr = x^((p-1)/4) * sqrt(-1) + // else isr = x^((p-1)/4) + fe_mul(isr, t0, sqrtm1); + fe_ccopy(isr, t0, 1 - (m1 | ms)); + + WIPE_BUFFER(t0); + WIPE_BUFFER(t1); + WIPE_BUFFER(t2); + return p1 | m1 | z0; +} + +// Inverse in terms of inverse square root. +// Requires two additional squarings to get rid of the sign. +// +// 1/x = x * (+invsqrt(x^2))^2 +// = x * (-invsqrt(x^2))^2 +// +// A fully optimised exponentiation by p-1 would save 6 field +// multiplications, but it would require more code. +static void fe_invert(fe out, const fe x) +{ + fe tmp; + fe_sq(tmp, x); + invsqrt(tmp, tmp); + fe_sq(tmp, tmp); + fe_mul(out, tmp, x); + WIPE_BUFFER(tmp); } // trim a scalar for scalar multiplication @@ -1682,10 +1622,10 @@ static int is_above_l(const u32 x[8]) // (-L == ~L + 1) u64 carry = 1; FOR (i, 0, 8) { - carry += (u64)x[i] + ~L[i]; + carry += (u64)x[i] + (~L[i] & 0xffffffff); carry >>= 32; } - return (int)carry; + return (int)carry; // carry is either 0 or 1 } // Final reduction modulo L, by conditionally removing L. @@ -1735,7 +1675,7 @@ static void mod_l(u8 reduced[32], const u32 x[16]) // xr = x - xr u64 carry = 1; FOR (i, 0, 8) { - carry += (u64)x[i] + ~xr[i]; + carry += (u64)x[i] + (~xr[i] & 0xffffffff); xr[i] = (u32)carry; carry >>= 32; } @@ -1759,9 +1699,7 @@ static void mul_add(u8 r[32], const u8 a[32], const u8 b[32], const u8 c[32]) { u32 A[8]; load32_le_buf(A, a, 8); u32 B[8]; load32_le_buf(B, b, 8); - u32 p[16]; - load32_le_buf(p, c, 8); - ZERO(p + 8, 8); + u32 p[16]; load32_le_buf(p, c, 8); ZERO(p + 8, 8); multiply(p, A, B); mod_l(r, p); WIPE_BUFFER(p); @@ -1804,7 +1742,7 @@ static void ge_tobytes(u8 s[32], const ge *h) WIPE_BUFFER(y); } -// h = s, where s is a point encoded in 32 bytes +// h = -s, where s is a point encoded in 32 bytes // // Variable time! Inputs must not be secret! // => Use only to *check* signatures. @@ -1831,7 +1769,7 @@ static void ge_tobytes(u8 s[32], const ge *h) // isr = invsqrt(num * den) // abort if not square // x = num * isr // Finally, negate x if its sign is not as specified. 
-static int ge_frombytes_vartime(ge *h, const u8 s[32]) +static int ge_frombytes_neg_vartime(ge *h, const u8 s[32]) { fe_frombytes(h->Y, s); fe_1(h->Z); @@ -1845,7 +1783,7 @@ static int ge_frombytes_vartime(ge *h, const u8 s[32]) return -1; // Not on the curve, abort } fe_mul(h->X, h->T, h->X); // x = sqrt((y^2 - 1) / (d*y^2 + 1)) - if (fe_isodd(h->X) != (s[31] >> 7)) { + if (fe_isodd(h->X) == (s[31] >> 7)) { fe_neg(h->X, h->X); } fe_mul(h->T, h->X, h->Y); @@ -1916,31 +1854,23 @@ static void ge_madd(ge *s, const ge *p, const ge_precomp *q, fe a, fe b) fe_mul(s->Z, a , b ); } +// Internal buffers are not wiped! Inputs must not be secret! +// => Use only to *check* signatures. static void ge_msub(ge *s, const ge *p, const ge_precomp *q, fe a, fe b) { - fe_add(a , p->Y, p->X ); - fe_sub(b , p->Y, p->X ); - fe_mul(a , a , q->Ym); - fe_mul(b , b , q->Yp); - fe_add(s->Y, a , b ); - fe_sub(s->X, a , b ); - - fe_add(s->Z, p->Z, p->Z ); - fe_mul(s->T, p->T, q->T2); - fe_sub(a , s->Z, s->T ); - fe_add(b , s->Z, s->T ); - - fe_mul(s->T, s->X, s->Y); - fe_mul(s->X, s->X, b ); - fe_mul(s->Y, s->Y, a ); - fe_mul(s->Z, a , b ); + ge_precomp neg; + fe_copy(neg.Ym, q->Yp); + fe_copy(neg.Yp, q->Ym); + fe_neg (neg.T2, q->T2); + ge_madd(s, p, &neg, a, b); } static void ge_double(ge *s, const ge *p, ge *q) { fe_sq (q->X, p->X); fe_sq (q->Y, p->Y); - fe_sq2(q->Z, p->Z); + fe_sq (q->Z, p->Z); // qZ = pZ^2 + fe_mul_small(q->Z, q->Z, 2); // qZ = pZ^2 * 2 fe_add(q->T, p->X, p->Y); fe_sq (s->T, q->T); fe_add(q->T, q->Y, q->X); @@ -2102,26 +2032,6 @@ static void ge_double_scalarmult_vartime(ge *P, const u8 p[32], const u8 b[32]) } } -// R_check = s[B] - h_ram[pk], where B is the base point -// -// Variable time! Internal buffers are not wiped! Inputs must not be secret! -// => Use only to *check* signatures. -static int ge_r_check(u8 R_check[32], u8 s[32], u8 h_ram[32], u8 pk[32]) -{ - ge A; // not secret, not wiped - u32 s32[8]; // not secret, not wiped - load32_le_buf(s32, s, 8); - if (ge_frombytes_vartime(&A, pk) || // A = pk - is_above_l(s32)) { // prevent s malleability - return -1; - } - fe_neg(A.X, A.X); - fe_neg(A.T, A.T); // A = -pk - ge_double_scalarmult_vartime(&A, h_ram, s); // A = [s]B - [h_ram]pk - ge_tobytes(R_check, &A); // R_check = A - return 0; -} - // 5-bit signed comb in cached format (Niels coordinates, Z=1) static const ge_precomp b_comb_low[8] = { {{-6816601,-2324159,-22559413,124364,18015490, @@ -2283,7 +2193,7 @@ static void ge_scalarmult_base(ge *p, const u8 scalar[32]) lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_high, s_scalar, i+128); } // Note: we could save one addition at the end if we assumed the - // scalar fit in 252 bit. Which it does in practice if it is + // scalar fit in 252 bits. Which it does in practice if it is // selected at random. However, non-random, non-hashed scalars // *can* overflow 252 bits in practice. Better account for that // than leaving that kind of subtle corner case. 
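A note on the limb arithmetic earlier in this file: the new `& 0xffffffff` masks in `is_above_l()` and `mod_l()` implement a 256-bit subtraction through the two's complement identity x - L = x + ~L + 1. On platforms where `int` is wider than 32 bits, `L[i]` promotes to a signed `int`, so `~L[i]` could be negative and sign-extend when added to the 64-bit carry; the mask keeps the complement inside 32 bits. Below is a minimal stand-alone sketch of the idiom — the function name, the two-limb operands, and `main()` are illustrative, not part of Monocypher:

    #include <stdint.h>
    #include <stdio.h>

    // out = a - b over n 32-bit limbs (little-endian limb order),
    // using the two's complement identity a - b == a + ~b + 1.
    // Returns the final carry: 1 when a >= b, 0 when the subtraction
    // borrowed (the same convention as is_above_l()).
    static uint32_t limb_sub(uint32_t *out, const uint32_t *a,
                             const uint32_t *b, int n)
    {
        uint64_t carry = 1;  // the "+ 1" of ~b + 1
        for (int i = 0; i < n; i++) {
            // The mask guards against sign extension on platforms
            // where int is wider than 32 bits.
            carry += (uint64_t)a[i] + (~b[i] & 0xffffffff);
            out[i] = (uint32_t)carry;
            carry >>= 32;
        }
        return (uint32_t)carry;  // 0 or 1
    }

    int main(void)
    {
        uint32_t a[2] = { 5, 1 };  // 2^32 + 5
        uint32_t b[2] = { 7, 0 };  // 7
        uint32_t r[2];
        uint32_t no_borrow = limb_sub(r, a, b, 2);
        // Prints the limbs low-to-high: "fffffffe 00000000, carry 1",
        // i.e. 2^32 + 5 - 7 = 2^32 - 2.
        printf("%08x %08x, carry %u\n",
               (unsigned)r[0], (unsigned)r[1], (unsigned)no_borrow);
        return 0;
    }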
@@ -2429,16 +2339,21 @@ void crypto_check_update(crypto_check_ctx_abstract *ctx, int crypto_check_final(crypto_check_ctx_abstract *ctx) { - u8 h_ram[64]; + u8 *s = ctx->buf + 32; // s + u8 h_ram[64]; + u32 s32[8]; // s (different encoding) + ge A; + ctx->hash->final(ctx, h_ram); reduce(h_ram); - u8 *R = ctx->buf; // R - u8 *s = ctx->buf + 32; // s - u8 *R_check = ctx->pk; // overwrite ctx->pk to save stack space - if (ge_r_check(R_check, s, h_ram, ctx->pk)) { + load32_le_buf(s32, s, 8); + if (ge_frombytes_neg_vartime(&A, ctx->pk) || // A = -pk + is_above_l(s32)) { // prevent s malleability return -1; } - return crypto_verify32(R, R_check); // R == R_check ? OK : fail + ge_double_scalarmult_vartime(&A, h_ram, s); // A = [s]B - [h_ram]pk + ge_tobytes(ctx->pk, &A); // R_check = A + return crypto_verify32(ctx->buf, ctx->pk); // R == R_check ? OK : fail } int crypto_check(const u8 signature[64], const u8 public_key[32], @@ -2553,10 +2468,14 @@ void crypto_x25519_dirty_small(u8 public_key[32], const u8 secret_key[32]) // Base point of order 8*L // Raw scalar multiplication with it does not clear the cofactor, // and the resulting public key will reveal 3 bits of the scalar. + // + // The low order component of this base point has been chosen + // to yield the same results as crypto_x25519_dirty_fast(). static const u8 dirty_base_point[32] = { - 0x34, 0xfc, 0x6c, 0xb7, 0xc8, 0xde, 0x58, 0x97, 0x77, 0x70, 0xd9, 0x52, - 0x16, 0xcc, 0xdc, 0x6c, 0x85, 0x90, 0xbe, 0xcd, 0x91, 0x9c, 0x07, 0x59, - 0x94, 0x14, 0x56, 0x3b, 0x4b, 0xa4, 0x47, 0x0f, }; + 0xd8, 0x86, 0x1a, 0xa2, 0x78, 0x7a, 0xd9, 0x26, 0x8b, 0x74, 0x74, 0xb6, + 0x82, 0xe3, 0xbe, 0xc3, 0xce, 0x36, 0x9a, 0x1e, 0x5e, 0x31, 0x47, 0xa2, + 0x6d, 0x37, 0x7c, 0xfd, 0x20, 0xb5, 0xdf, 0x75, + }; // separate the main factor & the cofactor of the scalar u8 scalar[32]; COPY(scalar, secret_key, 32); @@ -2568,15 +2487,56 @@ void crypto_x25519_dirty_small(u8 public_key[32], const u8 secret_key[32]) // least significant bits however still have a main factor. We must // remove it for X25519 compatibility. // - // We exploit the fact that 5*L = 1 (modulo 8) - // cofactor = lsb * 5 * L (modulo 8*L) - // combined = scalar + cofactor (modulo 8*L) - // combined = scalar + (lsb * 5 * L) (modulo 8*L) - add_xl(scalar, secret_key[0] * 5); + // cofactor = lsb * L (modulo 8*L) + // combined = scalar + cofactor (modulo 8*L) + add_xl(scalar, secret_key[0]); scalarmult(public_key, scalar, dirty_base_point, 256); WIPE_BUFFER(scalar); } +// Select low order point +// We're computing the [cofactor]lop scalar multiplication, where: +// +// cofactor = tweak & 7. +// lop = (lop_x, lop_y) +// lop_x = sqrt((sqrt(d + 1) + 1) / d) +// lop_y = -lop_x * sqrtm1 +// +// The low order point has order 8. There are 4 such points. We've +// chosen the one whose both coordinates are positive (below p/2). +// The 8 low order points are as follows: +// +// [0]lop = ( 0 , 1 ) +// [1]lop = ( lop_x , lop_y) +// [2]lop = ( sqrt(-1), -0 ) +// [3]lop = ( lop_x , -lop_y) +// [4]lop = (-0 , -1 ) +// [5]lop = (-lop_x , -lop_y) +// [6]lop = (-sqrt(-1), 0 ) +// [7]lop = (-lop_x , lop_y) +// +// The x coordinate is either 0, sqrt(-1), lop_x, or their opposite. +// The y coordinate is either 0, -1 , lop_y, or their opposite. 
+// The pattern for both is the same, except for a rotation of 2 (modulo 8) +// +// This helper function captures the pattern, and we can use it thus: +// +// select_lop(x, lop_x, sqrtm1, cofactor); +// select_lop(y, lop_y, fe_one, cofactor + 2); +// +// This is faster than an actual scalar multiplication, +// and requires less code than naive constant time look up. +static void select_lop(fe out, const fe x, const fe k, u8 cofactor) +{ + fe tmp; + fe_0(out); + fe_ccopy(out, k , (cofactor >> 1) & 1); // bit 1 + fe_ccopy(out, x , (cofactor >> 0) & 1); // bit 0 + fe_neg (tmp, out); + fe_ccopy(out, tmp, (cofactor >> 2) & 1); // bit 2 + WIPE_BUFFER(tmp); +} + // "Fast" dirty ephemeral key // We use this one by default. // @@ -2586,38 +2546,17 @@ void crypto_x25519_dirty_small(u8 public_key[32], const u8 secret_key[32]) // The cost is a bigger binary for programs that don't also sign messages. void crypto_x25519_dirty_fast(u8 public_key[32], const u8 secret_key[32]) { + // Compute clean scalar multiplication u8 scalar[32]; ge pk; COPY(scalar, secret_key, 32); trim_scalar(scalar); ge_scalarmult_base(&pk, scalar); - // Select low order point - // We're computing the [cofactor]lop scalar multiplication, where: - // cofactor = tweak & 7. - // lop = (lop_x, lop_y) - // lop_x = sqrt((sqrt(d + 1) + 1) / d) - // lop_y = -lop_x * sqrtm1 - // Notes: - // - A (single) Montgomery ladder would be twice as slow. - // - An actual scalar multiplication would hurt performance. - // - A full table lookup would take more code. - u8 cofactor = secret_key[0] & 7; - int a = (cofactor >> 2) & 1; - int b = (cofactor >> 1) & 1; - int c = (cofactor >> 0) & 1; - fe t1, t2, t3; - fe_0(t1); - fe_ccopy(t1, sqrtm1, b); - fe_ccopy(t1, lop_x , c); - fe_neg (t3, t1); - fe_ccopy(t1, t3, a); - fe_1(t2); - fe_0(t3); - fe_ccopy(t2, t3 , b); - fe_ccopy(t2, lop_y, c); - fe_neg (t3, t2); - fe_ccopy(t2, t3, a^b); + // Compute low order point + fe t1, t2; + select_lop(t1, lop_x, sqrtm1, secret_key[0]); + select_lop(t2, lop_y, fe_one, secret_key[0] + 2); ge_precomp low_order_point; fe_add(low_order_point.Yp, t2, t1); fe_sub(low_order_point.Ym, t2, t1); @@ -2635,9 +2574,9 @@ void crypto_x25519_dirty_fast(u8 public_key[32], const u8 secret_key[32]) fe_tobytes(public_key, t1); - WIPE_BUFFER(t1); WIPE_BUFFER(scalar); - WIPE_BUFFER(t2); WIPE_CTX(&pk); - WIPE_BUFFER(t3); WIPE_CTX(&low_order_point); + WIPE_BUFFER(t1); WIPE_CTX(&pk); + WIPE_BUFFER(t2); WIPE_CTX(&low_order_point); + WIPE_BUFFER(scalar); } /////////////////// @@ -2705,15 +2644,10 @@ static const fe A = {486662}; // u2 = u void crypto_hidden_to_curve(uint8_t curve[32], const uint8_t hidden[32]) { - // Representatives are encoded in 254 bits. - // The two most significant ones are random padding that must be ignored. - u8 clamped[32]; - COPY(clamped, hidden, 32); - clamped[31] &= 0x3f; - fe r, u, t1, t2, t3; - fe_frombytes(r, clamped); - fe_sq2(t1, r); + fe_frombytes_mask(r, hidden, 2); // r is encoded in 254 bits. 
+ fe_sq(r, r); + fe_add(t1, r, r); fe_add(u, t1, fe_one); fe_sq (t2, u); fe_mul(t3, A2, t1); @@ -2722,8 +2656,7 @@ void crypto_hidden_to_curve(uint8_t curve[32], const uint8_t hidden[32]) fe_mul(t1, t2, u); fe_mul(t1, t3, t1); int is_square = invsqrt(t1, t1); - fe_sq(u, r); - fe_mul(u, u, ufactor); + fe_mul(u, r, ufactor); fe_ccopy(u, fe_one, is_square); fe_sq (t1, t1); fe_mul(u, u, A); @@ -2735,7 +2668,7 @@ void crypto_hidden_to_curve(uint8_t curve[32], const uint8_t hidden[32]) WIPE_BUFFER(t1); WIPE_BUFFER(r); WIPE_BUFFER(t2); WIPE_BUFFER(u); - WIPE_BUFFER(t3); WIPE_BUFFER(clamped); + WIPE_BUFFER(t3); } // Elligator inverse map @@ -2755,59 +2688,49 @@ void crypto_hidden_to_curve(uint8_t curve[32], const uint8_t hidden[32]) // Let sq = -non_square * u * (u+A) // if sq is not a square, or u = -A, there is no mapping // Assuming there is a mapping: -// if v is positive: r = sqrt(-(u+A) / u) -// if v is negative: r = sqrt(-u / (u+A)) +// if v is positive: r = sqrt(-u / (non_square * (u+A))) +// if v is negative: r = sqrt(-(u+A) / (non_square * u )) // // We compute isr = invsqrt(-non_square * u * (u+A)) -// if it wasn't a non-zero square, abort. +// if it wasn't a square, abort. // else, isr = sqrt(-1 / (non_square * u * (u+A)) // -// This causes us to abort if u is zero, even though we shouldn't. This -// never happens in practice, because (i) a random point in the curve has -// a negligible chance of being zero, and (ii) scalar multiplication with -// a trimmed scalar *never* yields zero. +// If v is positive, we return isr * u: +// isr * u = sqrt(-1 / (non_square * u * (u+A)) * u +// isr * u = sqrt(-u / (non_square * (u+A)) // -// Since: +// If v is negative, we return isr * (u+A): // isr * (u+A) = sqrt(-1 / (non_square * u * (u+A)) * (u+A) -// isr * (u+A) = sqrt(-(u+A) / (non_square * u * (u+A)) -// and: -// isr = u = sqrt(-1 / (non_square * u * (u+A)) * u -// isr = u = sqrt(-u / (non_square * u * (u+A)) -// Therefore: -// if v is positive: r = isr * (u+A) -// if v is negative: r = isr * u +// isr * (u+A) = sqrt(-(u+A) / (non_square * u) int crypto_curve_to_hidden(u8 hidden[32], const u8 public_key[32], u8 tweak) { fe t1, t2, t3; - fe_frombytes(t1, public_key); + fe_frombytes(t1, public_key); // t1 = u - fe_add(t2, t1, A); + fe_add(t2, t1, A); // t2 = u + A fe_mul(t3, t1, t2); fe_mul_small(t3, t3, -2); - int is_square = invsqrt(t3, t3); - if (!is_square) { + int is_square = invsqrt(t3, t3); // t3 = sqrt(-1 / non_square * u * (u+A)) + if (is_square) { // The only variable time bit. This ultimately reveals how many // tries it took us to find a representable key. // This does not affect security as long as we try keys at random. 
- WIPE_BUFFER(t1); - WIPE_BUFFER(t2); - WIPE_BUFFER(t3); - return -1; - } - fe_ccopy (t1, t2, tweak & 1); - fe_mul (t3, t1, t3); - fe_mul_small(t1, t3, 2); - fe_neg (t2, t3); - fe_ccopy (t3, t2, fe_isodd(t1)); - fe_tobytes(hidden, t3); - // Pad with two random bits - hidden[31] |= tweak & 0xc0; + fe_ccopy (t1, t2, tweak & 1); // multiply by u if v is positive, + fe_mul (t3, t1, t3); // multiply by u+A otherwise + fe_mul_small(t1, t3, 2); + fe_neg (t2, t3); + fe_ccopy (t3, t2, fe_isodd(t1)); + fe_tobytes(hidden, t3); + + // Pad with two random bits + hidden[31] |= tweak & 0xc0; + } WIPE_BUFFER(t1); WIPE_BUFFER(t2); WIPE_BUFFER(t3); - return 0; + return is_square - 1; } void crypto_hidden_key_pair(u8 hidden[32], u8 secret_key[32], u8 seed[32]) @@ -2861,10 +2784,9 @@ void crypto_key_exchange(u8 shared_key[32], // u = (t/r) % L (u is always below 2*L, conditional subtraction is enough) static void redc(u32 u[8], u32 x[16]) { - static const u32 k[8] = { 0x12547e1b, 0xd2b51da3, 0xfdba84ff, 0xb1a206f2, - 0xffa36bea, 0x14e75438, 0x6fe91836, 0x9db6c6f2,}; - static const u32 l[8] = { 0x5cf5d3ed, 0x5812631a, 0xa2f79cd6, 0x14def9de, - 0x00000000, 0x00000000, 0x00000000, 0x10000000,}; + static const u32 k[8] = { 0x12547e1b, 0xd2b51da3, 0xfdba84ff, 0xb1a206f2, + 0xffa36bea, 0x14e75438, 0x6fe91836, 0x9db6c6f2, }; + // s = x * k (modulo 2^256) // This is cheaper than the full multiplication. u32 s[8] = {0}; @@ -2877,7 +2799,7 @@ static void redc(u32 u[8], u32 x[16]) } } u32 t[16] = {0}; - multiply(t, s, l); + multiply(t, s, L); // t = t + x u64 carry = 0; @@ -2890,8 +2812,6 @@ static void redc(u32 u[8], u32 x[16]) // u = (t / 2^256) % L // Note that t / 2^256 is always below 2*L, // So a constant time conditional subtraction is enough - // We work with L directly, in a 2's complement encoding - // (-L == ~L + 1) remove_l(u, t+8); WIPE_BUFFER(s); @@ -2925,6 +2845,7 @@ void crypto_x25519_inverse(u8 blind_salt [32], const u8 private_key[32], WIPE_BUFFER(tmp); // Wipe ASAP to save stack space } + // Compute the inverse u32 product[16]; for (int i = 252; i >= 0; i--) { ZERO(product, 16); @@ -3007,16 +2928,14 @@ int crypto_unlock_aead(u8 *plain_text, const u8 key[32], const u8 nonce[24], u8 real_mac[16]; lock_auth(real_mac, auth_key, ad, ad_size, cipher_text, text_size); WIPE_BUFFER(auth_key); - if (crypto_verify16(mac, real_mac)) { - WIPE_BUFFER(sub_key); - WIPE_BUFFER(real_mac); - return -1; + int mismatch = crypto_verify16(mac, real_mac); + if (!mismatch) { + crypto_chacha20_ctr(plain_text, cipher_text, text_size, + sub_key, nonce + 16, 1); } - crypto_chacha20_ctr(plain_text, cipher_text, text_size, - sub_key, nonce + 16, 1); WIPE_BUFFER(sub_key); WIPE_BUFFER(real_mac); - return 0; + return mismatch; } void crypto_lock(u8 mac[16], u8 *cipher_text, @@ -3033,3 +2952,7 @@ int crypto_unlock(u8 *plain_text, return crypto_unlock_aead(plain_text, key, nonce, mac, 0, 0, cipher_text, text_size); } + +#ifdef MONOCYPHER_CPP_NAMESPACE +} +#endif diff --git a/src/3rdparty/monocypher/monocypher.h b/src/3rdparty/monocypher/monocypher.h index 7d30b44a4b..c7b83968a5 100644 --- a/src/3rdparty/monocypher/monocypher.h +++ b/src/3rdparty/monocypher/monocypher.h @@ -1,4 +1,4 @@ -// Monocypher version 3.1.2 +// Monocypher version 3.1.3 // // This file is dual-licensed. Choose whichever licence you want from // the two licences listed below. 
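To illustrate the `crypto_unlock_aead()` rework above: the MAC is now verified before any decryption, so on a forgery the function returns -1 and the plaintext buffer is never written. A minimal round-trip sketch under illustrative assumptions (hard-coded key and nonce, placeholder buffers; real code must use random keys and unique nonces):

    #include <string.h>
    #include "monocypher.h"

    // One authenticated encryption round trip.
    // Returns 0 on success, -1 if the message was corrupted.
    int roundtrip(void)
    {
        uint8_t key  [32] = {1};  // placeholder; use a CSPRNG
        uint8_t nonce[24] = {2};  // placeholder; never reuse per key
        uint8_t ad   [4]  = {'m', 'e', 't', 'a'};  // authenticated only
        uint8_t text [5]  = {'h', 'e', 'l', 'l', 'o'};
        uint8_t cipher[5];
        uint8_t mac  [16];

        crypto_lock_aead(mac, cipher, key, nonce,
                         ad, sizeof(ad), text, sizeof(text));

        uint8_t plain[5];
        if (crypto_unlock_aead(plain, key, nonce, mac,
                               ad, sizeof(ad), cipher, sizeof(cipher))) {
            return -1;  // forged or corrupted: plain was never written
        }
        return memcmp(plain, text, sizeof(plain)) == 0 ? 0 : -1;
    }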
@@ -57,7 +57,9 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#ifdef __cplusplus
+#ifdef MONOCYPHER_CPP_NAMESPACE
+namespace MONOCYPHER_CPP_NAMESPACE {
+#elif defined(__cplusplus)
 extern "C" {
 #endif
@@ -83,12 +85,12 @@ typedef struct {
 typedef struct {
 uint32_t r[4]; // constant multiplier (from the secret key)
 uint32_t h[5]; // accumulated hash
- uint32_t c[5]; // chunk of the message
+ uint8_t c[16]; // chunk of the message
 uint32_t pad[4]; // random number added at the end (from the secret key)
 size_t c_idx; // How many bytes are there in the chunk.
 } crypto_poly1305_ctx;
 
-// Hash (Blake2b)
+// Hash (BLAKE2b)
 typedef struct {
 uint64_t hash[8];
 uint64_t input_offset[2];
@@ -158,7 +160,7 @@ int crypto_unlock_aead(uint8_t *plain_text,
 const uint8_t *cipher_text, size_t text_size);
 
-// General purpose hash (Blake2b)
+// General purpose hash (BLAKE2b)
 // ------------------------------
 
 // Direct interface
@@ -207,7 +209,7 @@ void crypto_key_exchange(uint8_t shared_key [32],
 const uint8_t their_public_key[32]);
 
-// Signatures (EdDSA with curve25519 + Blake2b)
+// Signatures (EdDSA with Curve25519 + BLAKE2b)
 // --------------------------------------------
 
 // Generate public key
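To close, a usage sketch for the `MONOCYPHER_CPP_NAMESPACE` definition introduced in the hunk above. When the macro is defined and Monocypher is compiled as C++, the `crypto_*` symbols land in the chosen namespace rather than an `extern "C"` block. The namespace name `mc`, the file names, and the compiler invocation below are illustrative assumptions, not part of this change:

    // main.cpp -- assumes monocypher.c is compiled as C++, e.g.:
    //   c++ -x c++ -DMONOCYPHER_CPP_NAMESPACE=mc -c monocypher.c
    //   c++ -DMONOCYPHER_CPP_NAMESPACE=mc main.cpp monocypher.o
    #include "monocypher.h"

    int main()
    {
        uint8_t secret[32] = {42};  // placeholder key material
        // Every crypto_* symbol now lives in namespace mc, so it
        // cannot collide with another library's crypto_* functions.
        mc::crypto_wipe(secret, sizeof(secret));
        return 0;
    }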