diff --git a/Misc/NEWS.d/next/Library/2021-02-20-20-01-01.bpo-43279.5JwOiY.rst b/Misc/NEWS.d/next/Library/2021-02-20-20-01-01.bpo-43279.5JwOiY.rst
new file mode 100644
index 00000000000000..b81cb72e3a8b12
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2021-02-20-20-01-01.bpo-43279.5JwOiY.rst
@@ -0,0 +1 @@
+Update the ``_sha3`` module's code taken from the eXtended Keccak Code Package (XKCP). Patch by Illia Volochii.
diff --git a/Modules/_sha3/README.txt b/Modules/_sha3/README.txt
index e34b1d12f702fa..8e93b002de52f3 100644
--- a/Modules/_sha3/README.txt
+++ b/Modules/_sha3/README.txt
@@ -1,11 +1,9 @@
 Keccak Code Package
 ===================
 
-The files in kcp are taken from the Keccak Code Package. They have been
-slightly to be C89 compatible. The architecture specific header file
-KeccakP-1600-SnP.h ha been renamed to KeccakP-1600-SnP-opt32.h or
-KeccakP-1600-SnP-opt64.h.
-
-The 64bit files were generated with generic64lc/libkeccak.a.pack target, the
-32bit files with generic32lc/libkeccak.a.pack.
+The files in kcp are taken from the eXtended Keccak Code Package.
+The architecture-specific header file KeccakP-1600-SnP.h has been renamed to
+KeccakP-1600-SnP-opt32.h or KeccakP-1600-SnP-opt64.h.
 
+The 64-bit files were generated with the generic64lc/libXKCP.a.pack target, the
+32-bit files with the generic32lc/libXKCP.a.pack target.
diff --git a/Modules/_sha3/cleanup.py b/Modules/_sha3/cleanup.py
index 4f53681b49e67b..17f9372a910cd2 100755
--- a/Modules/_sha3/cleanup.py
+++ b/Modules/_sha3/cleanup.py
@@ -37,6 +37,10 @@ def cleanup(f):
         if "brg_endian.h" in line:
             buf.append("/* %s */\n" % line.strip())
             continue
+        # comment out #include "config.h"
+        if '#include "config.h"' in line:
+            buf.append("/* %s */\n" % line.strip())
+            continue
         # transform C++ comments into ANSI C comments
         line = CPP1.sub(r"/*\1 */\n", line)
         line = CPP2.sub(r" /*\1 */\n", line)
diff --git a/Modules/_sha3/kcp/KeccakHash.c b/Modules/_sha3/kcp/KeccakHash.c
index e09fb43cacea1d..c660f94076ae9d 100644
--- a/Modules/_sha3/kcp/KeccakHash.c
+++ b/Modules/_sha3/kcp/KeccakHash.c
@@ -1,12 +1,13 @@
 /*
-Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
-Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
-denoted as "the implementer".
+The eXtended Keccak Code Package (XKCP)
+https://github.com/XKCP/XKCP
 
-For more information, feedback or questions, please refer to our websites:
-http://keccak.noekeon.org/
-http://keyak.noekeon.org/
-http://ketje.noekeon.org/
+Keccak, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche.
+
+Implementation by the designers, hereby denoted as "the implementer".
+
+For more information, feedback or questions, please refer to the Keccak Team website:
+https://keccak.team/
 
 To the extent possible under law, the implementer has waived all copyright
 and related or neighboring rights to the source code in this file.
@@ -23,30 +24,28 @@ HashReturn Keccak_HashInitialize(Keccak_HashInstance *instance, unsigned int rat
     HashReturn result;
 
     if (delimitedSuffix == 0)
-        return FAIL;
+        return KECCAK_FAIL;
     result = (HashReturn)KeccakWidth1600_SpongeInitialize(&instance->sponge, rate, capacity);
-    if (result != SUCCESS)
+    if (result != KECCAK_SUCCESS)
         return result;
     instance->fixedOutputLength = hashbitlen;
     instance->delimitedSuffix = delimitedSuffix;
-    return SUCCESS;
+    return KECCAK_SUCCESS;
 }
 
 /* ---------------------------------------------------------------- */
 
-HashReturn Keccak_HashUpdate(Keccak_HashInstance *instance, const BitSequence *data, DataLength databitlen)
+HashReturn Keccak_HashUpdate(Keccak_HashInstance *instance, const BitSequence *data, BitLength databitlen)
 {
     if ((databitlen % 8) == 0)
         return (HashReturn)KeccakWidth1600_SpongeAbsorb(&instance->sponge, data, databitlen/8);
     else {
         HashReturn ret = (HashReturn)KeccakWidth1600_SpongeAbsorb(&instance->sponge, data, databitlen/8);
-        if (ret == SUCCESS) {
+        if (ret == KECCAK_SUCCESS) {
             /* The last partial byte is assumed to be aligned on the least significant bits */
-
             unsigned char lastByte = data[databitlen/8];
             /* Concatenate the last few bits provided here with those of the suffix */
-
-            unsigned short delimitedLastBytes = (unsigned short)((unsigned short)lastByte | ((unsigned short)instance->delimitedSuffix << (databitlen % 8)));
+            unsigned short delimitedLastBytes = (unsigned short)((unsigned short)(lastByte & ((1 << (databitlen % 8)) - 1)) | ((unsigned short)instance->delimitedSuffix << (databitlen % 8)));
             if ((delimitedLastBytes & 0xFF00) == 0x0000) {
                 instance->delimitedSuffix = delimitedLastBytes & 0xFF;
             }
@@ -66,7 +65,7 @@ HashReturn Keccak_HashUpdate(Keccak_HashInstance *instance, const BitSequence *d
 HashReturn Keccak_HashFinal(Keccak_HashInstance *instance, BitSequence *hashval)
 {
     HashReturn ret = (HashReturn)KeccakWidth1600_SpongeAbsorbLastFewBits(&instance->sponge, instance->delimitedSuffix);
-    if (ret == SUCCESS)
+    if (ret == KECCAK_SUCCESS)
         return (HashReturn)KeccakWidth1600_SpongeSqueeze(&instance->sponge, hashval, instance->fixedOutputLength/8);
     else
         return ret;
@@ -74,9 +73,9 @@ HashReturn Keccak_HashFinal(Keccak_HashInstance *instance, BitSequence *hashval)
 
 /* ---------------------------------------------------------------- */
 
-HashReturn Keccak_HashSqueeze(Keccak_HashInstance *instance, BitSequence *data, DataLength databitlen)
+HashReturn Keccak_HashSqueeze(Keccak_HashInstance *instance, BitSequence *data, BitLength databitlen)
 {
     if ((databitlen % 8) != 0)
-        return FAIL;
+        return KECCAK_FAIL;
     return (HashReturn)KeccakWidth1600_SpongeSqueeze(&instance->sponge, data, databitlen/8);
 }
diff --git a/Modules/_sha3/kcp/KeccakHash.h b/Modules/_sha3/kcp/KeccakHash.h
index bbd3dc64a2285b..3c6222bb0907db 100644
--- a/Modules/_sha3/kcp/KeccakHash.h
+++ b/Modules/_sha3/kcp/KeccakHash.h
@@ -1,12 +1,13 @@
 /*
-Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
-Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
-denoted as "the implementer".
+The eXtended Keccak Code Package (XKCP)
+https://github.com/XKCP/XKCP
 
-For more information, feedback or questions, please refer to our websites:
-http://keccak.noekeon.org/
-http://keyak.noekeon.org/
-http://ketje.noekeon.org/
+Keccak, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche.
+
+Implementation by the designers, hereby denoted as "the implementer".
+
+For more information, feedback or questions, please refer to the Keccak Team website:
+https://keccak.team/
 
 To the extent possible under law, the implementer has waived all copyright
 and related or neighboring rights to the source code in this file.
@@ -16,14 +17,21 @@ and related or neighboring rights to the source code in this file.
 #ifndef _KeccakHashInterface_h_
 #define _KeccakHashInterface_h_
 
-#ifndef KeccakP1600_excluded
+/* #include "config.h" */
+#ifdef XKCP_has_KeccakP1600
 
-#include "KeccakSponge.h"
+#include <stdint.h>
 #include <string.h>
+#include "KeccakSponge.h"
+
+#ifndef _Keccak_BitTypes_
+#define _Keccak_BitTypes_
+typedef uint8_t BitSequence;
+
+typedef size_t BitLength;
+#endif
 
-typedef unsigned char BitSequence;
-typedef size_t DataLength;
-typedef enum { SUCCESS = 0, FAIL = 1, BAD_HASHLEN = 2 } HashReturn;
+typedef enum { KECCAK_SUCCESS = 0, KECCAK_FAIL = 1, KECCAK_BAD_HASHLEN = 2 } HashReturn;
 
 typedef struct {
     KeccakWidth1600_SpongeInstance sponge;
@@ -44,7 +52,7 @@ typedef struct {
   *                         formatted like the @a delimitedData parameter of
   *                         the Keccak_SpongeAbsorbLastFewBits() function.
   * @pre    One must have r+c=1600 and the rate a multiple of 8 bits in this implementation.
-  * @return SUCCESS if successful, FAIL otherwise.
+  * @return KECCAK_SUCCESS if successful, KECCAK_FAIL otherwise.
   */
 HashReturn Keccak_HashInitialize(Keccak_HashInstance *hashInstance, unsigned int rate, unsigned int capacity, unsigned int hashbitlen, unsigned char delimitedSuffix);
 
@@ -78,11 +86,13 @@ HashReturn Keccak_HashInitialize(Keccak_HashInstance *hashInstance, unsigned int
   * @param  data        Pointer to the input data.
   *                     When @a databitLen is not a multiple of 8, the last bits of data must be
   *                     in the least significant bits of the last byte (little-endian convention).
+  *                     In this case, the (8 - @a databitLen mod 8) most significant bits
+  *                     of the last byte are ignored.
   * @param  databitLen  The number of input bits provided in the input data.
   * @pre    In the previous call to Keccak_HashUpdate(), databitlen was a multiple of 8.
-  * @return SUCCESS if successful, FAIL otherwise.
+  * @return KECCAK_SUCCESS if successful, KECCAK_FAIL otherwise.
   */
-HashReturn Keccak_HashUpdate(Keccak_HashInstance *hashInstance, const BitSequence *data, DataLength databitlen);
+HashReturn Keccak_HashUpdate(Keccak_HashInstance *hashInstance, const BitSequence *data, BitLength databitlen);
 
 /**
   * Function to call after all input blocks have been input and to get
@@ -92,9 +102,8 @@ HashReturn Keccak_HashUpdate(Keccak_HashInstance *hashInstance, const BitSequenc
   *     output bits is equal to @a hashbitlen.
   * If @a hashbitlen was 0 in the call to Keccak_HashInitialize(), the output bits
   *     must be extracted using the Keccak_HashSqueeze() function.
-  * @param  state       Pointer to the state of the sponge function initialized by Init().
   * @param  hashval     Pointer to the buffer where to store the output data.
-  * @return SUCCESS if successful, FAIL otherwise.
+  * @return KECCAK_SUCCESS if successful, KECCAK_FAIL otherwise.
   */
 HashReturn Keccak_HashFinal(Keccak_HashInstance *hashInstance, BitSequence *hashval);
 
@@ -105,10 +114,12 @@ HashReturn Keccak_HashFinal(Keccak_HashInstance *hashInstance, BitSequence *hash
   * @param  databitlen  The number of output bits desired (must be a multiple of 8).
   * @pre    Keccak_HashFinal() must have been already called.
   * @pre    @a databitlen is a multiple of 8.
-  * @return SUCCESS if successful, FAIL otherwise.
+  * @return KECCAK_SUCCESS if successful, KECCAK_FAIL otherwise.
   */
-HashReturn Keccak_HashSqueeze(Keccak_HashInstance *hashInstance, BitSequence *data, DataLength databitlen);
+HashReturn Keccak_HashSqueeze(Keccak_HashInstance *hashInstance, BitSequence *data, BitLength databitlen);
 
+#else
+#error This requires an implementation of Keccak-p[1600]
 #endif
 
 #endif
diff --git a/Modules/_sha3/kcp/KeccakP-1600-64.macros b/Modules/_sha3/kcp/KeccakP-1600-64.macros
index 1f11fe3e79fbba..aabb307ba2f4dc 100644
--- a/Modules/_sha3/kcp/KeccakP-1600-64.macros
+++ b/Modules/_sha3/kcp/KeccakP-1600-64.macros
@@ -1,12 +1,13 @@
 /*
-Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
-Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
-denoted as "the implementer".
+The eXtended Keccak Code Package (XKCP)
+https://github.com/XKCP/XKCP
 
-For more information, feedback or questions, please refer to our websites:
-http://keccak.noekeon.org/
-http://keyak.noekeon.org/
-http://ketje.noekeon.org/
+The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche.
+
+Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer".
+
+For more information, feedback or questions, please refer to the Keccak Team website:
+https://keccak.team/
 
 To the extent possible under law, the implementer has waived all copyright
 and related or neighboring rights to the source code in this file.
@@ -14,23 +15,23 @@ http://creativecommons.org/publicdomain/zero/1.0/
 */
 
 #define declareABCDE \
-    UINT64 Aba, Abe, Abi, Abo, Abu; \
-    UINT64 Aga, Age, Agi, Ago, Agu; \
-    UINT64 Aka, Ake, Aki, Ako, Aku; \
-    UINT64 Ama, Ame, Ami, Amo, Amu; \
-    UINT64 Asa, Ase, Asi, Aso, Asu; \
-    UINT64 Bba, Bbe, Bbi, Bbo, Bbu; \
-    UINT64 Bga, Bge, Bgi, Bgo, Bgu; \
-    UINT64 Bka, Bke, Bki, Bko, Bku; \
-    UINT64 Bma, Bme, Bmi, Bmo, Bmu; \
-    UINT64 Bsa, Bse, Bsi, Bso, Bsu; \
-    UINT64 Ca, Ce, Ci, Co, Cu; \
-    UINT64 Da, De, Di, Do, Du; \
-    UINT64 Eba, Ebe, Ebi, Ebo, Ebu; \
-    UINT64 Ega, Ege, Egi, Ego, Egu; \
-    UINT64 Eka, Eke, Eki, Eko, Eku; \
-    UINT64 Ema, Eme, Emi, Emo, Emu; \
-    UINT64 Esa, Ese, Esi, Eso, Esu; \
+    uint64_t Aba, Abe, Abi, Abo, Abu; \
+    uint64_t Aga, Age, Agi, Ago, Agu; \
+    uint64_t Aka, Ake, Aki, Ako, Aku; \
+    uint64_t Ama, Ame, Ami, Amo, Amu; \
+    uint64_t Asa, Ase, Asi, Aso, Asu; \
+    uint64_t Bba, Bbe, Bbi, Bbo, Bbu; \
+    uint64_t Bga, Bge, Bgi, Bgo, Bgu; \
+    uint64_t Bka, Bke, Bki, Bko, Bku; \
+    uint64_t Bma, Bme, Bmi, Bmo, Bmu; \
+    uint64_t Bsa, Bse, Bsi, Bso, Bsu; \
+    uint64_t Ca, Ce, Ci, Co, Cu; \
+    uint64_t Da, De, Di, Do, Du; \
+    uint64_t Eba, Ebe, Ebi, Ebo, Ebu; \
+    uint64_t Ega, Ege, Egi, Ego, Egu; \
+    uint64_t Eka, Eke, Eki, Eko, Eku; \
+    uint64_t Ema, Eme, Emi, Emo, Emu; \
+    uint64_t Esa, Ese, Esi, Eso, Esu; \
 
 #define prepareTheta \
     Ca = Aba^Aga^Aka^Ama^Asa; \
@@ -41,9 +42,7 @@ http://creativecommons.org/publicdomain/zero/1.0/
 
 #ifdef UseBebigokimisa
 /* --- Code for round, with prepare-theta (lane complementing pattern 'bebigokimisa') */
-
 /* --- 64-bit lanes mapped to 64-bit words */
-
 #define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
     Da = Cu^ROL64(Ce, 1); \
     De = Ca^ROL64(Ci, 1); \
@@ -159,9 +158,7 @@ http://creativecommons.org/publicdomain/zero/1.0/
 \
 
 /* --- Code for round (lane complementing pattern 'bebigokimisa') */
-
 /* --- 64-bit lanes mapped to 64-bit words */
-
 #define thetaRhoPiChiIota(i, A, E) \
     Da = Cu^ROL64(Ce, 1); \
     De = Ca^ROL64(Ci, 1); \
@@ -252,11 +249,8 @@ http://creativecommons.org/publicdomain/zero/1.0/
 \
 
 #else /* UseBebigokimisa */
-
 /* --- Code for round, with prepare-theta */
-
 /* --- 64-bit lanes mapped to 64-bit words */
-
 #define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
     Da = Cu^ROL64(Ce, 1); \
     De = Ca^ROL64(Ci, 1); \
@@ -372,9 +366,7 @@ http://creativecommons.org/publicdomain/zero/1.0/
 \
 
 /* --- Code for round */
-
 /* --- 64-bit lanes mapped to 64-bit words */
-
 #define thetaRhoPiChiIota(i, A, E) \
     Da = Cu^ROL64(Ce, 1); \
     De = Ca^ROL64(Ci, 1); \
@@ -466,7 +458,6 @@ http://creativecommons.org/publicdomain/zero/1.0/
 
 #endif /* UseBebigokimisa */
 
-
 #define copyFromState(X, state) \
     X##ba = state[ 0]; \
     X##be = state[ 1]; \
@@ -548,257 +539,43 @@ http://creativecommons.org/publicdomain/zero/1.0/
     X##so = Y##so; \
     X##su = Y##su; \
 
-#define copyFromStateAndAdd(X, state, input, laneCount) \
-    if (laneCount < 16) { \
-        if (laneCount < 8) { \
-            if (laneCount < 4) { \
-                if (laneCount < 2) { \
-                    if (laneCount < 1) { \
-                        X##ba = state[ 0]; \
-                    } \
-                    else { \
-                        X##ba = state[ 0]^input[ 0]; \
-                    } \
-                    X##be = state[ 1]; \
-                    X##bi = state[ 2]; \
-                } \
-                else { \
-                    X##ba = state[ 0]^input[ 0]; \
-                    X##be = state[ 1]^input[ 1]; \
-                    if (laneCount < 3) { \
-                        X##bi = state[ 2]; \
-                    } \
-                    else { \
-                        X##bi = state[ 2]^input[ 2]; \
-                    } \
-                } \
-                X##bo = state[ 3]; \
-                X##bu = state[ 4]; \
-                X##ga = state[ 5]; \
-                X##ge = state[ 6]; \
-            } \
-            else { \
-                X##ba = state[ 0]^input[ 0]; \
-                X##be = state[ 1]^input[ 1]; \
-                X##bi = state[ 2]^input[ 2]; \
-                X##bo = state[ 3]^input[ 3]; \
-                if (laneCount < 6) { \
-                    if (laneCount < 5) { \
-                        X##bu = state[ 4]; \
-                    } \
-                    else { \
-                        X##bu = state[ 4]^input[ 4]; \
-                    } \
-                    X##ga = state[ 5]; \
-                    X##ge = state[ 6]; \
-                } \
-                else { \
-                    X##bu = state[ 4]^input[ 4]; \
-                    X##ga = state[ 5]^input[ 5]; \
-                    if (laneCount < 7) { \
-                        X##ge = state[ 6]; \
-                    } \
-                    else { \
-                        X##ge = state[ 6]^input[ 6]; \
-                    } \
-                } \
-            } \
-            X##gi = state[ 7]; \
-            X##go = state[ 8]; \
-            X##gu = state[ 9]; \
-            X##ka = state[10]; \
-            X##ke = state[11]; \
-            X##ki = state[12]; \
-            X##ko = state[13]; \
-            X##ku = state[14]; \
-        } \
-        else { \
-            X##ba = state[ 0]^input[ 0]; \
-            X##be = state[ 1]^input[ 1]; \
-            X##bi = state[ 2]^input[ 2]; \
-            X##bo = state[ 3]^input[ 3]; \
-            X##bu = state[ 4]^input[ 4]; \
-            X##ga = state[ 5]^input[ 5]; \
-            X##ge = state[ 6]^input[ 6]; \
-            X##gi = state[ 7]^input[ 7]; \
-            if (laneCount < 12) { \
-                if (laneCount < 10) { \
-                    if (laneCount < 9) { \
-                        X##go = state[ 8]; \
-                    } \
-                    else { \
-                        X##go = state[ 8]^input[ 8]; \
-                    } \
-                    X##gu = state[ 9]; \
-                    X##ka = state[10]; \
-                } \
-                else { \
-                    X##go = state[ 8]^input[ 8]; \
-                    X##gu = state[ 9]^input[ 9]; \
-                    if (laneCount < 11) { \
-                        X##ka = state[10]; \
-                    } \
-                    else { \
-                        X##ka = state[10]^input[10]; \
-                    } \
-                } \
-                X##ke = state[11]; \
-                X##ki = state[12]; \
-                X##ko = state[13]; \
-                X##ku = state[14]; \
-            } \
-            else { \
-                X##go = state[ 8]^input[ 8]; \
-                X##gu = state[ 9]^input[ 9]; \
-                X##ka = state[10]^input[10]; \
-                X##ke = state[11]^input[11]; \
-                if (laneCount < 14) { \
-                    if (laneCount < 13) { \
-                        X##ki = state[12]; \
-                    } \
-                    else { \
-                        X##ki = state[12]^input[12]; \
-                    } \
-                    X##ko = state[13]; \
-                    X##ku = state[14]; \
-                } \
-                else { \
-                    X##ki = state[12]^input[12]; \
-                    X##ko = state[13]^input[13]; \
-                    if (laneCount < 15) { \
-                        X##ku = state[14]; \
-                    } \
-                    else { \
-                        X##ku = state[14]^input[14]; \
-                    } \
-                } \
-            } \
-        } \
-        X##ma = state[15]; \
-        X##me = state[16]; \
-        X##mi = state[17]; \
-        X##mo = state[18]; \
-        X##mu = state[19]; \
-        X##sa = state[20]; \
-        X##se = state[21]; \
-        X##si = state[22]; \
-        X##so = state[23]; \
-        X##su = state[24]; \
-    } \
-    else { \
-        X##ba = state[ 0]^input[ 0]; \
-        X##be = state[ 1]^input[ 1]; \
-        X##bi = state[ 2]^input[ 2]; \
-        X##bo = state[ 3]^input[ 3]; \
-        X##bu = state[ 4]^input[ 4]; \
-        X##ga = state[ 5]^input[ 5]; \
-        X##ge = state[ 6]^input[ 6]; \
-        X##gi = state[ 7]^input[ 7]; \
-        X##go = state[ 8]^input[ 8]; \
-        X##gu = state[ 9]^input[ 9]; \
-        X##ka = state[10]^input[10]; \
-        X##ke = state[11]^input[11]; \
-        X##ki = state[12]^input[12]; \
-        X##ko = state[13]^input[13]; \
-        X##ku = state[14]^input[14]; \
-        X##ma = state[15]^input[15]; \
-        if (laneCount < 24) { \
-            if (laneCount < 20) { \
-                if (laneCount < 18) { \
-                    if (laneCount < 17) { \
-                        X##me = state[16]; \
-                    } \
-                    else { \
-                        X##me = state[16]^input[16]; \
-                    } \
-                    X##mi = state[17]; \
-                    X##mo = state[18]; \
-                } \
-                else { \
-                    X##me = state[16]^input[16]; \
-                    X##mi = state[17]^input[17]; \
-                    if (laneCount < 19) { \
-                        X##mo = state[18]; \
-                    } \
-                    else { \
-                        X##mo = state[18]^input[18]; \
-                    } \
-                } \
-                X##mu = state[19]; \
-                X##sa = state[20]; \
-                X##se = state[21]; \
-                X##si = state[22]; \
-            } \
-            else { \
-                X##me = state[16]^input[16]; \
-                X##mi = state[17]^input[17]; \
-                X##mo = state[18]^input[18]; \
-                X##mu = state[19]^input[19]; \
-                if (laneCount < 22) { \
-                    if (laneCount < 21) { \
-                        X##sa = state[20]; \
-                    } \
-                    else { \
-                        X##sa = state[20]^input[20]; \
-                    } \
-                    X##se = state[21]; \
-                    X##si = state[22]; \
-                } \
-                else { \
-                    X##sa = state[20]^input[20]; \
-                    X##se = state[21]^input[21]; \
-                    if (laneCount < 23) { \
-                        X##si = state[22]; \
-                    } \
-                    else { \
-                        X##si = state[22]^input[22]; \
-                    } \
-                } \
-            } \
-            X##so = state[23]; \
-            X##su = state[24]; \
-        } \
-        else { \
-            X##me = state[16]^input[16]; \
-            X##mi = state[17]^input[17]; \
-            X##mo = state[18]^input[18]; \
-            X##mu = state[19]^input[19]; \
-            X##sa = state[20]^input[20]; \
-            X##se = state[21]^input[21]; \
-            X##si = state[22]^input[22]; \
-            X##so = state[23]^input[23]; \
-            if (laneCount < 25) { \
-                X##su = state[24]; \
-            } \
-            else { \
-                X##su = state[24]^input[24]; \
-            } \
-        } \
-    }
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+#define HTOLE64(x) (x) /* little-endian host: the value is used unchanged */
+#else /* otherwise reverse the eight bytes of x; note x is evaluated eight times */
+#define HTOLE64(x) (\
+  (((x) & 0xff00000000000000ull) >> 56) | \
+  (((x) & 0x00ff000000000000ull) >> 40) | \
+  (((x) & 0x0000ff0000000000ull) >> 24) | \
+  (((x) & 0x000000ff00000000ull) >> 8)  | \
+  (((x) & 0x00000000ff000000ull) << 8)  | \
+  (((x) & 0x0000000000ff0000ull) << 24) | \
+  (((x) & 0x000000000000ff00ull) << 40) | \
+  (((x) & 0x00000000000000ffull) << 56))
+#endif /* PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN */
 
 #define addInput(X, input, laneCount) \
     if (laneCount == 21) { \
-        X##ba ^= input[ 0]; \
-        X##be ^= input[ 1]; \
-        X##bi ^= input[ 2]; \
-        X##bo ^= input[ 3]; \
-        X##bu ^= input[ 4]; \
-        X##ga ^= input[ 5]; \
-        X##ge ^= input[ 6]; \
-        X##gi ^= input[ 7]; \
-        X##go ^= input[ 8]; \
-        X##gu ^= input[ 9]; \
-        X##ka ^= input[10]; \
-        X##ke ^= input[11]; \
-        X##ki ^= input[12]; \
-        X##ko ^= input[13]; \
-        X##ku ^= input[14]; \
-        X##ma ^= input[15]; \
-        X##me ^= input[16]; \
-        X##mi ^= input[17]; \
-        X##mo ^= input[18]; \
-        X##mu ^= input[19]; \
-        X##sa ^= input[20]; \
+        X##ba ^= HTOLE64(input[ 0]); \
+        X##be ^= HTOLE64(input[ 1]); \
+        X##bi ^= HTOLE64(input[ 2]); \
+        X##bo ^= HTOLE64(input[ 3]); \
+        X##bu ^= HTOLE64(input[ 4]); \
+        X##ga ^= HTOLE64(input[ 5]); \
+        X##ge ^= HTOLE64(input[ 6]); \
+        X##gi ^= HTOLE64(input[ 7]); \
+        X##go ^= HTOLE64(input[ 8]); \
+        X##gu ^= HTOLE64(input[ 9]); \
+        X##ka ^= HTOLE64(input[10]); \
+        X##ke ^= HTOLE64(input[11]); \
+        X##ki ^= HTOLE64(input[12]); \
+        X##ko ^= HTOLE64(input[13]); \
+        X##ku ^= HTOLE64(input[14]); \
+        X##ma ^= HTOLE64(input[15]); \
+        X##me ^= HTOLE64(input[16]); \
+        X##mi ^= HTOLE64(input[17]); \
+        X##mo ^= HTOLE64(input[18]); \
+        X##mu ^= HTOLE64(input[19]); \
+        X##sa ^= HTOLE64(input[20]); \
     } \
     else if (laneCount < 16) { \
         if (laneCount < 8) { \
@@ -807,1402 +584,165 @@ http://creativecommons.org/publicdomain/zero/1.0/
                     if (laneCount < 1) { \
                     } \
                     else { \
-                        X##ba ^= input[ 0]; \
-                    } \
-                } \
-                else { \
-                    X##ba ^= input[ 0]; \
-                    X##be ^= input[ 1]; \
-                    if (laneCount < 3) { \
-                    } \
-                    else { \
-                        X##bi ^= input[ 2]; \
-                    } \
-                } \
-            } \
-            else { \
-                X##ba ^= input[ 0]; \
-                X##be ^= input[ 1]; \
-                X##bi ^= input[ 2]; \
-                X##bo ^= input[ 3]; \
-                if (laneCount < 6) { \
-                    if (laneCount < 5) { \
-                    } \
-                    else { \
-                        X##bu ^= input[ 4]; \
-                    } \
-                } \
-                else { \
-                    X##bu ^= input[ 4]; \
-                    X##ga ^= input[ 5]; \
-                    if (laneCount < 7) { \
-                    } \
-                    else { \
-                        X##ge ^= input[ 6]; \
-                    } \
-                } \
-            } \
-        } \
-        else { \
-            X##ba ^= input[ 0]; \
-            X##be ^= input[ 1]; \
-            X##bi ^= input[ 2]; \
-            X##bo ^= input[ 3]; \
-            X##bu ^= input[ 4]; \
-            X##ga ^= input[ 5]; \
-            X##ge ^= input[ 6]; \
-            X##gi ^= input[ 7]; \
-            if (laneCount < 12) { \
-                if (laneCount < 10) { \
-                    if (laneCount < 9) { \
-                    } \
-                    else { \
-                        X##go ^= input[ 8]; \
-                    } \
-                } \
-                else { \
-                    X##go ^= input[ 8]; \
-                    X##gu ^= input[ 9]; \
-                    if (laneCount < 11) { \
-                    } \
-                    else { \
-                        X##ka ^= input[10]; \
-                    } \
-                } \
-            } \
-            else { \
-                X##go ^= input[ 8]; \
-                X##gu ^= input[ 9]; \
-                X##ka ^= input[10]; \
-                X##ke ^= input[11]; \
-                if (laneCount < 14) { \
-                    if (laneCount < 13) { \
-                    } \
-                    else { \
-                        X##ki ^= input[12]; \
-                    } \
-                } \
-                else { \
-                    X##ki ^= input[12]; \
-                    X##ko ^= input[13]; \
-                    if (laneCount < 15) { \
-                    } \
-                    else { \
-                        X##ku ^= input[14]; \
-                    } \
-                } \
-            } \
-        } \
-    } \
-    else { \
-        X##ba ^= input[ 0]; \
-        X##be ^= input[ 1]; \
-        X##bi ^= input[ 2]; \
-        X##bo ^= input[ 3]; \
-        X##bu ^= input[ 4]; \
-        X##ga ^= input[ 5]; \
-        X##ge ^= input[ 6]; \
-        X##gi ^= input[ 7]; \
-        X##go ^= input[ 8]; \
-        X##gu ^= input[ 9]; \
-        X##ka ^= input[10]; \
-        X##ke ^= input[11]; \
-        X##ki ^= input[12]; \
-        X##ko ^= input[13]; \
-        X##ku ^= input[14]; \
-        X##ma ^= input[15]; \
-        if (laneCount < 24) { \
-            if (laneCount < 20) { \
-                if (laneCount < 18) { \
-                    if (laneCount < 17) { \
-                    } \
-                    else { \
-                        X##me ^= input[16]; \
-                    } \
-                } \
-                else { \
-                    X##me ^= input[16]; \
-                    X##mi ^= input[17]; \
-                    if (laneCount < 19) { \
-                    } \
-                    else { \
-                        X##mo ^= input[18]; \
-                    } \
-                } \
-            } \
-            else { \
-                X##me ^= input[16]; \
-                X##mi ^= input[17]; \
-                X##mo ^= input[18]; \
-                X##mu ^= input[19]; \
-                if (laneCount < 22) { \
-                    if (laneCount < 21) { \
-                    } \
-                    else { \
-                        X##sa ^= input[20]; \
-                    } \
-                } \
-                else { \
-                    X##sa ^= input[20]; \
-                    X##se ^= input[21]; \
-                    if (laneCount < 23) { \
-                    } \
-                    else { \
-                        X##si ^= input[22]; \
-                    } \
-                } \
-            } \
-        } \
-        else { \
-            X##me ^= input[16]; \
-            X##mi ^= input[17]; \
-            X##mo ^= input[18]; \
-            X##mu ^= input[19]; \
-            X##sa ^= input[20]; \
-            X##se ^= input[21]; \
-            X##si ^= input[22]; \
-            X##so ^= input[23]; \
-            if (laneCount < 25) { \
-            } \
-            else { \
-                X##su ^= input[24]; \
-            } \
-        } \
-    }
-
-#ifdef UseBebigokimisa
-
-#define copyToStateAndOutput(X, state, output, laneCount) \
-    if (laneCount < 16) { \
-        if (laneCount < 8) { \
-            if (laneCount < 4) { \
-                if (laneCount < 2) { \
-                    state[ 0] = X##ba; \
-                    if (laneCount >= 1) { \
-                        output[ 0] = X##ba; \
-                    } \
-                    state[ 1] = X##be; \
-                    state[ 2] = X##bi; \
-                } \
-                else { \
-                    state[ 0] = X##ba; \
-                    output[ 0] = X##ba; \
-                    state[ 1] = X##be; \
-                    output[ 1] = ~X##be; \
-                    state[ 2] = X##bi; \
-                    if (laneCount >= 3) { \
-                        output[ 2] = ~X##bi; \
-                    } \
-                } \
-                state[ 3] = X##bo; \
-                state[ 4] = X##bu; \
-                state[ 5] = X##ga; \
-                state[ 6] = X##ge; \
-            } \
-            else { \
-                state[ 0] = X##ba; \
-                output[ 0] = X##ba; \
-                state[ 1] = X##be; \
-                output[ 1] = ~X##be; \
-                state[ 2] = X##bi; \
-                output[ 2] = ~X##bi; \
-                state[ 3] = X##bo; \
-                output[ 3] = X##bo; \
-                if (laneCount < 6) { \
-                    state[ 4] = X##bu; \
-                    if (laneCount >= 5) { \
-                        output[ 4] = X##bu; \
-                    } \
-                    state[ 5] = X##ga; \
-                    state[ 6] = X##ge; \
-                } \
-                else { \
-                    state[ 4] = X##bu; \
-                    output[ 4] = X##bu; \
-                    state[ 5] = X##ga; \
-                    output[ 5] = X##ga; \
-                    state[ 6] = X##ge; \
-                    if (laneCount >= 7) { \
-                        output[ 6] = X##ge; \
-                    } \
-                } \
-            } \
-            state[ 7] = X##gi; \
-            state[ 8] = X##go; \
-            state[ 9] = X##gu; \
-            state[10] = X##ka; \
-            state[11] = X##ke; \
-            state[12] = X##ki; \
-            state[13] = X##ko; \
-            state[14] = X##ku; \
-        } \
-        else { \
-            state[ 0] = X##ba; \
-            output[ 0] = X##ba; \
-            state[ 1] = X##be; \
-            output[ 1] = ~X##be; \
-            state[ 2] = X##bi; \
-            output[ 2] = ~X##bi; \
-            state[ 3] = X##bo; \
-            output[ 3] = X##bo; \
-            state[ 4] = X##bu; \
-            output[ 4] = X##bu; \
-            state[ 5] = X##ga; \
-            output[ 5] = X##ga; \
-            state[ 6] = X##ge; \
-            output[ 6] = X##ge; \
-            state[ 7] = X##gi; \
-            output[ 7] = X##gi; \
-            if (laneCount < 12) { \
-                if (laneCount < 10) { \
-                    state[ 8] = X##go; \
-                    if (laneCount >= 9) { \
-                        output[ 8] = ~X##go; \
-                    } \
-                    state[ 9] = X##gu; \
-                    state[10] = X##ka; \
-                } \
-                else { \
-                    state[ 8] = X##go; \
-                    output[ 8] = ~X##go; \
-                    state[ 9] = X##gu; \
-                    output[ 9] = X##gu; \
-                    state[10] = X##ka; \
-                    if (laneCount >= 11) { \
-                        output[10] = X##ka; \
-                    } \
-                } \
-                state[11] = X##ke; \
-                state[12] = X##ki; \
-                state[13] = X##ko; \
-                state[14] = X##ku; \
-            } \
-            else { \
-                state[ 8] = X##go; \
-                output[ 8] = ~X##go; \
-                state[ 9] = X##gu; \
-                output[ 9] = X##gu; \
-                state[10] = X##ka; \
-                output[10] = X##ka; \
-                state[11] = X##ke; \
-                output[11] = X##ke; \
-                if (laneCount < 14) { \
-                    state[12] = X##ki; \
-                    if (laneCount >= 13) { \
-                        output[12] = ~X##ki; \
-                    } \
-                    state[13] = X##ko; \
-                    state[14] = X##ku; \
-                } \
-                else { \
-                    state[12] = X##ki; \
-                    output[12] = ~X##ki; \
-                    state[13] = X##ko; \
-                    output[13] = X##ko; \
-                    state[14] = X##ku; \
-                    if (laneCount >= 15) { \
-                        output[14] = X##ku; \
-                    } \
-                } \
-            } \
-        } \
-        state[15] = X##ma; \
-        state[16] = X##me; \
-        state[17] = X##mi; \
-        state[18] = X##mo; \
-        state[19] = X##mu; \
-        state[20] = X##sa; \
-        state[21] = X##se; \
-        state[22] = X##si; \
-        state[23] = X##so; \
-        state[24] = X##su; \
-    } \
-    else { \
-        state[ 0] = X##ba; \
-        output[ 0] = X##ba; \
-        state[ 1] = X##be; \
-        output[ 1] = ~X##be; \
-        state[ 2] = X##bi; \
-        output[ 2] = ~X##bi; \
-        state[ 3] = X##bo; \
-        output[ 3] = X##bo; \
-        state[ 4] = X##bu; \
-        output[ 4] = X##bu; \
-        state[ 5] = X##ga; \
-        output[ 5] = X##ga; \
-        state[ 6] = X##ge; \
-        output[ 6] = X##ge; \
-        state[ 7] = X##gi; \
-        output[ 7] = X##gi; \
-        state[ 8] = X##go; \
-        output[ 8] = ~X##go; \
-        state[ 9] = X##gu; \
-        output[ 9] = X##gu; \
-        state[10] = X##ka; \
-        output[10] = X##ka; \
-        state[11] = X##ke; \
-        output[11] = X##ke; \
-        state[12] = X##ki; \
-        output[12] = ~X##ki; \
-        state[13] = X##ko; \
-        output[13] = X##ko; \
-        state[14] = X##ku; \
-        output[14] = X##ku; \
-        state[15] = X##ma; \
-        output[15] = X##ma; \
-        if (laneCount < 24) { \
-            if (laneCount < 20) { \
-                if (laneCount < 18) { \
-                    state[16] = X##me; \
-                    if (laneCount >= 17) { \
-                        output[16] = X##me; \
-                    } \
-                    state[17] = X##mi; \
-                    state[18] = X##mo; \
-                } \
-                else { \
-                    state[16] = X##me; \
-                    output[16] = X##me; \
-                    state[17] = X##mi; \
-                    output[17] = ~X##mi; \
-                    state[18] = X##mo; \
-                    if (laneCount >= 19) { \
-                        output[18] = X##mo; \
-                    } \
-                } \
-                state[19] = X##mu; \
-                state[20] = X##sa; \
-                state[21] = X##se; \
-                state[22] = X##si; \
-            } \
-            else { \
-                state[16] = X##me; \
-                output[16] = X##me; \
-                state[17] = X##mi; \
-                output[17] = ~X##mi; \
-                state[18] = X##mo; \
-                output[18] = X##mo; \
-                state[19] = X##mu; \
-                output[19] = X##mu; \
-                if (laneCount < 22) { \
-                    state[20] = X##sa; \
-                    if (laneCount >= 21) { \
-                        output[20] = ~X##sa; \
-                    } \
-                    state[21] = X##se; \
-                    state[22] = X##si; \
-                } \
-                else { \
-                    state[20] = X##sa; \
-                    output[20] = ~X##sa; \
-                    state[21] = X##se; \
-                    output[21] = X##se; \
-                    state[22] = X##si; \
-                    if (laneCount >= 23) { \
-                        output[22] = X##si; \
-                    } \
-                } \
-            } \
-            state[23] = X##so; \
-            state[24] = X##su; \
-        } \
-        else { \
-            state[16] = X##me; \
-            output[16] = X##me; \
-            state[17] = X##mi; \
-            output[17] = ~X##mi; \
-            state[18] = X##mo; \
-            output[18] = X##mo; \
-            state[19] = X##mu; \
-            output[19] = X##mu; \
-            state[20] = X##sa; \
-            output[20] = ~X##sa; \
-            state[21] = X##se; \
-            output[21] = X##se; \
-            state[22] = X##si; \
-            output[22] = X##si; \
-            state[23] = X##so; \
-            output[23] = X##so; \
-            state[24] = X##su; \
-            if (laneCount >= 25) { \
-                output[24] = X##su; \
-            } \
-        } \
-    }
-
-#define output(X, output, laneCount) \
-    if (laneCount < 16) { \
-        if (laneCount < 8) { \
-            if (laneCount < 4) { \
-                if (laneCount < 2) { \
-                    if (laneCount >= 1) { \
-                        output[ 0] = X##ba; \
-                    } \
-                } \
-                else { \
-                    output[ 0] = X##ba; \
-                    output[ 1] = ~X##be; \
-                    if (laneCount >= 3) { \
-                        output[ 2] = ~X##bi; \
-                    } \
-                } \
-            } \
-            else { \
-                output[ 0] = X##ba; \
-                output[ 1] = ~X##be; \
-                output[ 2] = ~X##bi; \
-                output[ 3] = X##bo; \
-                if (laneCount < 6) { \
-                    if (laneCount >= 5) { \
-                        output[ 4] = X##bu; \
-                    } \
-                } \
-                else { \
-                    output[ 4] = X##bu; \
-                    output[ 5] = X##ga; \
-                    if (laneCount >= 7) { \
-                        output[ 6] = X##ge; \
-                    } \
-                } \
-            } \
-        } \
-        else { \
-            output[ 0] = X##ba; \
-            output[ 1] = ~X##be; \
-            output[ 2] = ~X##bi; \
-            output[ 3] = X##bo; \
-            output[ 4] = X##bu; \
-            output[ 5] = X##ga; \
-            output[ 6] = X##ge; \
-            output[ 7] = X##gi; \
-            if (laneCount < 12) { \
-                if (laneCount < 10) { \
-                    if (laneCount >= 9) { \
-                        output[ 8] = ~X##go; \
-                    } \
-                } \
-                else { \
-                    output[ 8] = ~X##go; \
-                    output[ 9] = X##gu; \
-                    if (laneCount >= 11) { \
-                        output[10] = X##ka; \
-                    } \
-                } \
-            } \
-            else { \
-                output[ 8] = ~X##go; \
-                output[ 9] = X##gu; \
-                output[10] = X##ka; \
-                output[11] = X##ke; \
-                if (laneCount < 14) { \
-                    if (laneCount >= 13) { \
-                        output[12] = ~X##ki; \
-                    } \
-                } \
-                else { \
-                    output[12] = ~X##ki; \
-                    output[13] = X##ko; \
-                    if (laneCount >= 15) { \
-                        output[14] = X##ku; \
-                    } \
-                } \
-            } \
-        } \
-    } \
-    else { \
-        output[ 0] = X##ba; \
-        output[ 1] = ~X##be; \
-        output[ 2] = ~X##bi; \
-        output[ 3] = X##bo; \
-        output[ 4] = X##bu; \
-        output[ 5] = X##ga; \
-        output[ 6] = X##ge; \
-        output[ 7] = X##gi; \
-        output[ 8] = ~X##go; \
-        output[ 9] = X##gu; \
-        output[10] = X##ka; \
-        output[11] = X##ke; \
-        output[12] = ~X##ki; \
-        output[13] = X##ko; \
-        output[14] = X##ku; \
-        output[15] = X##ma; \
-        if (laneCount < 24) { \
-            if (laneCount < 20) { \
-                if (laneCount < 18) { \
-                    if (laneCount >= 17) { \
-                        output[16] = X##me; \
-                    } \
-                } \
-                else { \
-                    output[16] = X##me; \
-                    output[17] = ~X##mi; \
-                    if (laneCount >= 19) { \
-                        output[18] = X##mo; \
-                    } \
-                } \
-            } \
-            else { \
-                output[16] = X##me; \
-                output[17] = ~X##mi; \
-                output[18] = X##mo; \
-                output[19] = X##mu; \
-                if (laneCount < 22) { \
-                    if (laneCount >= 21) { \
-                        output[20] = ~X##sa; \
-                    } \
-                } \
-                else { \
-                    output[20] = ~X##sa; \
-                    output[21] = X##se; \
-                    if (laneCount >= 23) { \
-                        output[22] = X##si; \
-                    } \
-                } \
-            } \
-        } \
-        else { \
-            output[16] = X##me; \
-            output[17] = ~X##mi; \
-            output[18] = X##mo; \
-            output[19] = X##mu; \
-            output[20] = ~X##sa; \
-            output[21] = X##se; \
-            output[22] = X##si; \
-            output[23] = X##so; \
-            if (laneCount >= 25) { \
-                output[24] = X##su; \
-            } \
-        } \
-    }
-
-#define wrapOne(X, input, output, index, name) \
-    X##name ^= input[index]; \
-    output[index] = X##name;
-
-#define wrapOneInvert(X, input, output, index, name) \
-    X##name ^= input[index]; \
-    output[index] = ~X##name;
-
-#define unwrapOne(X, input, output, index, name) \
-    output[index] = input[index] ^ X##name; \
-    X##name ^= output[index];
-
-#define unwrapOneInvert(X, input, output, index, name) \
-    output[index] = ~(input[index] ^ X##name); \
-    X##name ^= output[index]; \
-
-#else /* UseBebigokimisa */
-
-
-#define copyToStateAndOutput(X, state, output, laneCount) \
-    if (laneCount < 16) { \
-        if (laneCount < 8) { \
-            if (laneCount < 4) { \
-                if (laneCount < 2) { \
-                    state[ 0] = X##ba; \
-                    if (laneCount >= 1) { \
-                        output[ 0] = X##ba; \
-                    } \
-                    state[ 1] = X##be; \
-                    state[ 2] = X##bi; \
-                } \
-                else { \
-                    state[ 0] = X##ba; \
-                    output[ 0] = X##ba; \
-                    state[ 1] = X##be; \
-                    output[ 1] = X##be; \
-                    state[ 2] = X##bi; \
-                    if (laneCount >= 3) { \
-                        output[ 2] = X##bi; \
-                    } \
-                } \
-                state[ 3] = X##bo; \
-                state[ 4] = X##bu; \
-                state[ 5] = X##ga; \
-                state[ 6] = X##ge; \
-            } \
-            else { \
-                state[ 0] = X##ba; \
-                output[ 0] = X##ba; \
-                state[ 1] = X##be; \
-                output[ 1] = X##be; \
-                state[ 2] = X##bi; \
-                output[ 2] = X##bi; \
-                state[ 3] = X##bo; \
-                output[ 3] = X##bo; \
-                if (laneCount < 6) { \
-                    state[ 4] = X##bu; \
-                    if (laneCount >= 5) { \
-                        output[ 4] = X##bu; \
-                    } \
-                    state[ 5] = X##ga; \
-                    state[ 6] = X##ge; \
-                } \
-                else { \
-                    state[ 4] = X##bu; \
-                    output[ 4] = X##bu; \
-                    state[ 5] = X##ga; \
-                    output[ 5] = X##ga; \
-                    state[ 6] = X##ge; \
-                    if (laneCount >= 7) { \
-                        output[ 6] = X##ge; \
-                    } \
-                } \
-            } \
-            state[ 7] = X##gi; \
-            state[ 8] = X##go; \
-            state[ 9] = X##gu; \
-            state[10] = X##ka; \
-            state[11] = X##ke; \
-            state[12] = X##ki; \
-            state[13] = X##ko; \
-            state[14] = X##ku; \
-        } \
-        else { \
-            state[ 0] = X##ba; \
-            output[ 0] = X##ba; \
-            state[ 1] = X##be; \
-            output[ 1] = X##be; \
-            state[ 2] = X##bi; \
-            output[ 2] = X##bi; \
-            state[ 3] = X##bo; \
-            output[ 3] = X##bo; \
-            state[ 4] = X##bu; \
-            output[ 4] = X##bu; \
-            state[ 5] = X##ga; \
-            output[ 5] = X##ga; \
-            state[ 6] = X##ge; \
-            output[ 6] = X##ge; \
-            state[ 7] = X##gi; \
-            output[ 7] = X##gi; \
-            if (laneCount < 12) { \
-                if (laneCount < 10) { \
-                    state[ 8] = X##go; \
-                    if (laneCount >= 9) { \
-                        output[ 8] = X##go; \
-                    } \
-                    state[ 9] = X##gu; \
-                    state[10] = X##ka; \
-                } \
-                else { \
-                    state[ 8] = X##go; \
-                    output[ 8] = X##go; \
-                    state[ 9] = X##gu; \
-                    output[ 9] = X##gu; \
-                    state[10] = X##ka; \
-                    if (laneCount >= 11) { \
-                        output[10] = X##ka; \
-                    } \
-                } \
-                state[11] = X##ke; \
-                state[12] = X##ki; \
-                state[13] = X##ko; \
-                state[14] = X##ku; \
-            } \
-            else { \
-                state[ 8] = X##go; \
-                output[ 8] = X##go; \
-                state[ 9] = X##gu; \
-                output[ 9] = X##gu; \
-                state[10] = X##ka; \
-                output[10] = X##ka; \
-                state[11] = X##ke; \
-                output[11] = X##ke; \
-                if (laneCount < 14) { \
-                    state[12] = X##ki; \
-                    if (laneCount >= 13) { \
-                        output[12]= X##ki; \
-                    } \
-                    state[13] = X##ko; \
-                    state[14] = X##ku; \
-                } \
-                else { \
-                    state[12] = X##ki; \
-                    output[12]= X##ki; \
-                    state[13] = X##ko; \
-                    output[13] = X##ko; \
-                    state[14] = X##ku; \
-                    if (laneCount >= 15) { \
-                        output[14] = X##ku; \
-                    } \
-                } \
-            } \
-        } \
-        state[15] = X##ma; \
-        state[16] = X##me; \
-        state[17] = X##mi; \
-        state[18] = X##mo; \
-        state[19] = X##mu; \
-        state[20] = X##sa; \
-        state[21] = X##se; \
-        state[22] = X##si; \
-        state[23] = X##so; \
-        state[24] = X##su; \
-    } \
-    else { \
-        state[ 0] = X##ba; \
-        output[ 0] = X##ba; \
-        state[ 1] = X##be; \
-        output[ 1] = X##be; \
-        state[ 2] = X##bi; \
-        output[ 2] = X##bi; \
-        state[ 3] = X##bo; \
-        output[ 3] = X##bo; \
-        state[ 4] = X##bu; \
-        output[ 4] = X##bu; \
-        state[ 5] = X##ga; \
-        output[ 5] = X##ga; \
-        state[ 6] = X##ge; \
-        output[ 6] = X##ge; \
-        state[ 7] = X##gi; \
-        output[ 7] = X##gi; \
-        state[ 8] = X##go; \
-        output[ 8] = X##go; \
-        state[ 9] = X##gu; \
-        output[ 9] = X##gu; \
-        state[10] = X##ka; \
-        output[10] = X##ka; \
-        state[11] = X##ke; \
-        output[11] = X##ke; \
-        state[12] = X##ki; \
-        output[12]= X##ki; \
-        state[13] = X##ko; \
-        output[13] = X##ko; \
-        state[14] = X##ku; \
-        output[14] = X##ku; \
-        state[15] = X##ma; \
-        output[15] = X##ma; \
-        if (laneCount < 24) { \
-            if (laneCount < 20) { \
-                if (laneCount < 18) { \
-                    state[16] = X##me; \
-                    if (laneCount >= 17) { \
-                        output[16] = X##me; \
-                    } \
-                    state[17] = X##mi; \
-                    state[18] = X##mo; \
-                } \
-                else { \
-                    state[16] = X##me; \
-                    output[16] = X##me; \
-                    state[17] = X##mi; \
-                    output[17] = X##mi; \
-                    state[18] = X##mo; \
-                    if (laneCount >= 19) { \
-                        output[18] = X##mo; \
-                    } \
-                } \
-                state[19] = X##mu; \
-                state[20] = X##sa; \
-                state[21] = X##se; \
-                state[22] = X##si; \
-            } \
-            else { \
-                state[16] = X##me; \
-                output[16] = X##me; \
-                state[17] = X##mi; \
-                output[17] = X##mi; \
-                state[18] = X##mo; \
-                output[18] = X##mo; \
-                state[19] = X##mu; \
-                output[19] = X##mu; \
-                if (laneCount < 22) { \
-                    state[20] = X##sa; \
-                    if (laneCount >= 21) { \
-                        output[20] = X##sa; \
-                    } \
-                    state[21] = X##se; \
-                    state[22] = X##si; \
-                } \
-                else { \
-                    state[20] = X##sa; \
-                    output[20] = X##sa; \
-                    state[21] = X##se; \
-                    output[21] = X##se; \
-                    state[22] = X##si; \
-                    if (laneCount >= 23) { \
-                        output[22] = X##si; \
-                    } \
-                } \
-            } \
-            state[23] = X##so; \
-            state[24] = X##su; \
-        } \
-        else { \
-            state[16] = X##me; \
-            output[16] = X##me; \
-            state[17] = X##mi; \
-            output[17] = X##mi; \
-            state[18] = X##mo; \
-            output[18] = X##mo; \
-            state[19] = X##mu; \
-            output[19] = X##mu; \
-            state[20] = X##sa; \
-            output[20] = X##sa; \
-            state[21] = X##se; \
-            output[21] = X##se; \
-            state[22] = X##si; \
-            output[22] = X##si; \
-            state[23] = X##so; \
-            output[23] = X##so; \
-            state[24] = X##su; \
-            if (laneCount >= 25) { \
-                output[24] = X##su; \
-            } \
-        } \
-    }
-
-#define output(X, output, laneCount) \
-    if (laneCount < 16) { \
-        if (laneCount < 8) { \
-            if (laneCount < 4) { \
-                if (laneCount < 2) { \
-                    if (laneCount >= 1) { \
-                        output[ 0] = X##ba; \
-                    } \
-                } \
-                else { \
-                    output[ 0] = X##ba; \
-                    output[ 1] = X##be; \
-                    if (laneCount >= 3) { \
-                        output[ 2] = X##bi; \
-                    } \
-                } \
-            } \
-            else { \
-                output[ 0] = X##ba; \
-                output[ 1] = X##be; \
-                output[ 2] = X##bi; \
-                output[ 3] = X##bo; \
-                if (laneCount < 6) { \
-                    if (laneCount >= 5) { \
-                        output[ 4] = X##bu; \
-                    } \
-                } \
-                else { \
-                    output[ 4] = X##bu; \
-                    output[ 5] = X##ga; \
-                    if (laneCount >= 7) { \
-                        output[ 6] = X##ge; \
-                    } \
-                } \
-            } \
-        } \
-        else { \
-            output[ 0] = X##ba; \
-            output[ 1] = X##be; \
-            output[ 2] = X##bi; \
-            output[ 3] = X##bo; \
-            output[ 4] = X##bu; \
-            output[ 5] = X##ga; \
-            output[ 6] = X##ge; \
-            output[ 7] = X##gi; \
-            if (laneCount < 12) { \
-                if (laneCount < 10) { \
-                    if (laneCount >= 9) { \
-                        output[ 8] = X##go; \
-                    } \
-                } \
-                else { \
-                    output[ 8] = X##go; \
-                    output[ 9] = X##gu; \
-                    if (laneCount >= 11) { \
-                        output[10] = X##ka; \
-                    } \
-                } \
-            } \
-            else { \
-                output[ 8] = X##go; \
-                output[ 9] = X##gu; \
-                output[10] = X##ka; \
-                output[11] = X##ke; \
-                if (laneCount < 14) { \
-                    if (laneCount >= 13) { \
-                        output[12] = X##ki; \
-                    } \
-                } \
-                else { \
-                    output[12] = X##ki; \
-                    output[13] = X##ko; \
-                    if (laneCount >= 15) { \
-                        output[14] = X##ku; \
-                    } \
-                } \
-            } \
-        } \
-    } \
-    else { \
-        output[ 0] = X##ba; \
-        output[ 1] = X##be; \
-        output[ 2] = X##bi; \
-        output[ 3] = X##bo; \
-        output[ 4] = X##bu; \
-        output[ 5] = X##ga; \
-        output[ 6] = X##ge; \
-        output[ 7] = X##gi; \
-        output[ 8] = X##go; \
-        output[ 9] = X##gu; \
-        output[10] = X##ka; \
-        output[11] = X##ke; \
-        output[12] = X##ki; \
-        output[13] = X##ko; \
-        output[14] = X##ku; \
-        output[15] = X##ma; \
-        if (laneCount < 24) { \
-            if (laneCount < 20) { \
-                if (laneCount < 18) { \
-                    if (laneCount >= 17) { \
-                        output[16] = X##me; \
-                    } \
-                } \
-                else { \
-                    output[16] = X##me; \
-                    output[17] = X##mi; \
-                    if (laneCount >= 19) { \
-                        output[18] = X##mo; \
-                    } \
-                } \
-            } \
-            else { \
-                output[16] = X##me; \
-                output[17] = X##mi; \
-                output[18] = X##mo; \
-                output[19] = X##mu; \
-                if (laneCount < 22) { \
-                    if (laneCount >= 21) { \
-                        output[20] = X##sa; \
-                    } \
-                } \
-                else { \
-                    output[20] = X##sa; \
-                    output[21] = X##se; \
-                    if (laneCount >= 23) { \
-                        output[22] = X##si; \
-                    } \
-                } \
-            } \
-        } \
-        else { \
-            output[16] = X##me; \
-            output[17] = X##mi; \
-            output[18] = X##mo; \
-            output[19] = X##mu; \
-            output[20] = X##sa; \
-            output[21] = X##se; \
-            output[22] = X##si; \
-            output[23] = X##so; \
-            if (laneCount >= 25) { \
-                output[24] = X##su; \
-            } \
-        } \
-    }
-
-#define wrapOne(X, input, output, index, name) \
-    X##name ^= input[index]; \
-    output[index] = X##name;
-
-#define wrapOneInvert(X, input, output, index, name) \
-    X##name ^= input[index]; \
-    output[index] = X##name;
-
-#define unwrapOne(X, input, output, index, name) \
-    output[index] = input[index] ^ X##name; \
-    X##name ^= output[index];
-
-#define unwrapOneInvert(X, input, output, index, name) \
-    output[index] = input[index] ^ X##name; \
-    X##name ^= output[index];
-
-#endif
-
-#define wrap(X, input, output, laneCount, trailingBits) \
-    if (laneCount < 16) { \
-        if (laneCount < 8) { \
-            if (laneCount < 4) { \
-                if (laneCount < 2) { \
-                    if (laneCount < 1) { \
-                        X##ba ^= trailingBits; \
-                    } \
-                    else { \
-                        wrapOne(X, input, output, 0, ba) \
-                        X##be ^= trailingBits; \
-                    } \
-                } \
-                else { \
-                    wrapOne(X, input, output, 0, ba) \
-                    wrapOneInvert(X, input, output, 1, be) \
-                    if (laneCount < 3) { \
-                        X##bi ^= trailingBits; \
-                    } \
-                    else { \
-                        wrapOneInvert(X, input, output, 2, bi) \
-                        X##bo ^= trailingBits; \
-                    } \
-                } \
-            } \
-            else { \
-                wrapOne(X, input, output, 0, ba) \
-                wrapOneInvert(X, input, output, 1, be) \
-                wrapOneInvert(X, input, output, 2, bi) \
-                wrapOne(X, input, output, 3, bo) \
-                if (laneCount < 6) { \
-                    if (laneCount < 5) { \
-                        X##bu ^= trailingBits; \
-                    } \
-                    else { \
-                        wrapOne(X, input, output, 4, bu) \
-                        X##ga ^= trailingBits; \
-                    } \
-                } \
-                else { \
-                    wrapOne(X, input, output, 4, bu) \
-                    wrapOne(X, input, output, 5, ga) \
-                    if (laneCount < 7) { \
-                        X##ge ^= trailingBits; \
-                    } \
-                    else { \
-                        wrapOne(X, input, output, 6, ge) \
-                        X##gi ^= trailingBits; \
-                    } \
-                } \
-            } \
-        } \
-        else { \
-            wrapOne(X, input, output, 0, ba) \
-            wrapOneInvert(X, input, output, 1, be) \
-            wrapOneInvert(X, input, output, 2, bi) \
-            wrapOne(X, input, output, 3, bo) \
-            wrapOne(X, input, output, 4, bu) \
-            wrapOne(X, input, output, 5, ga) \
-            wrapOne(X, input, output, 6, ge) \
-            wrapOne(X, input, output, 7, gi) \
-            if (laneCount < 12) { \
-                if (laneCount < 10) { \
-                    if (laneCount < 9) { \
-                        X##go ^= trailingBits; \
-                    } \
-                    else { \
-                        wrapOneInvert(X, input, output, 8, go) \
-                        X##gu ^= trailingBits; \
-                    } \
-                } \
-                else { \
-                    wrapOneInvert(X, input, output, 8, go) \
-                    wrapOne(X, input, output, 9, gu) \
-                    if (laneCount < 11) { \
-                        X##ka ^= trailingBits; \
-                    } \
-                    else { \
-                        wrapOne(X, input, output, 10, ka) \
-                        X##ke ^= trailingBits; \
-                    } \
-                } \
-            } \
-            else { \
-                wrapOneInvert(X, input, output, 8, go) \
-                wrapOne(X, input, output, 9, gu) \
-                wrapOne(X, input, output, 10, ka) \
-                wrapOne(X, input, output, 11, ke) \
-                if (laneCount < 14) { \
-                    if (laneCount < 13) { \
-                        X##ki ^= trailingBits; \
-                    } \
-                    else { \
-                        wrapOneInvert(X, input, output, 12, ki) \
-                        X##ko ^= trailingBits; \
-                    } \
-                } \
-                else { \
-                    wrapOneInvert(X, input, output, 12, ki) \
-                    wrapOne(X, input, output, 13, ko) \
-                    if (laneCount < 15) { \
-                        X##ku ^= trailingBits; \
-                    } \
-                    else { \
-                        wrapOne(X, input, output, 14, ku) \
-                        X##ma ^= trailingBits; \
-                    } \
-                } \
-            } \
-        } \
-    } \
-    else { \
-        wrapOne(X, input, output, 0, ba) \
-        wrapOneInvert(X, input, output, 1, be) \
-        wrapOneInvert(X, input, output, 2, bi) \
-        wrapOne(X, input, output, 3, bo) \
-        wrapOne(X, input, output, 4, bu) \
-        wrapOne(X, input, output, 5, ga) \
-        wrapOne(X, input, output, 6, ge) \
-        wrapOne(X, input, output, 7, gi) \
-        wrapOneInvert(X, input, output, 8, go) \
-        wrapOne(X, input, output, 9, gu) \
-        wrapOne(X, input, output, 10, ka) \
-        wrapOne(X, input, output, 11, ke) \
-        wrapOneInvert(X, input, output, 12, ki) \
-        wrapOne(X, input, output, 13, ko) \
-        wrapOne(X, input, output, 14, ku) \
-        wrapOne(X, input, output, 15, ma) \
-        if (laneCount < 24) { \
-            if (laneCount < 20) { \
-                if (laneCount < 18) { \
-                    if (laneCount < 17) { \
-                        X##me ^= trailingBits; \
-                    } \
-                    else { \
-                        wrapOne(X, input, output, 16, me) \
-                        X##mi ^= trailingBits; \
-                    } \
-                } \
-                else { \
-                    wrapOne(X, input, output, 16, me) \
-                    wrapOneInvert(X, input, output, 17, mi) \
-                    if (laneCount < 19) { \
-                        X##mo ^= trailingBits; \
-                    } \
-                    else { \
-                        wrapOne(X, input, output, 18, mo) \
-                        X##mu ^= trailingBits; \
-                    } \
-                } \
-            } \
-            else { \
-                wrapOne(X, input, output, 16, me) \
-                wrapOneInvert(X, input, output, 17, mi) \
-                wrapOne(X, input, output, 18, mo) \
-                wrapOne(X, input, output, 19, mu) \
-                if (laneCount < 22) { \
-                    if (laneCount < 21) { \
-                        X##sa ^= trailingBits; \
-                    } \
-                    else { \
-                        wrapOneInvert(X, input, output, 20, sa) \
-                        X##se ^= trailingBits; \
-                    } \
-                } \
-                else { \
-                    wrapOneInvert(X, input, output, 20, sa) \
-                    wrapOne(X, input, output, 21, se) \
-                    if (laneCount < 23) { \
-                        X##si ^= trailingBits; \
-                    } \
-                    else { \
-                        wrapOne(X, input, output, 22, si) \
-                        X##so ^= trailingBits; \
-                    } \
-                } \
-            } \
-        } \
-        else { \
-            wrapOne(X, input, output, 16, me) \
-            wrapOneInvert(X, input, output, 17, mi) \
-            wrapOne(X, input, output, 18, mo) \
-            wrapOne(X, input, output, 19, mu) \
-            wrapOneInvert(X, input, output, 20, sa) \
-            wrapOne(X, input, output, 21, se) \
-            wrapOne(X, input, output, 22, si) \
-            wrapOne(X, input, output, 23, so) \
-            if (laneCount < 25) { \
-                X##su ^= trailingBits; \
-            } \
-            else { \
-                wrapOne(X, input, output, 24, su) \
-            } \
-        } \
-    }
-
-#define unwrap(X, input, output, laneCount, trailingBits) \
-    if (laneCount < 16) { \
-        if (laneCount < 8) { \
-            if (laneCount < 4) { \
-                if (laneCount < 2) { \
-                    if (laneCount < 1) { \
-                        X##ba ^= trailingBits; \
-                    } \
-                    else { \
-                        unwrapOne(X, input, output, 0, ba) \
-                        X##be ^= trailingBits; \
+                        X##ba ^= HTOLE64(input[ 0]); \
                     } \
                 } \
                 else { \
-                    unwrapOne(X, input, output, 0, ba) \
-                    unwrapOneInvert(X, input, output, 1, be) \
+                    X##ba ^= HTOLE64(input[ 0]); \
+                    X##be ^= HTOLE64(input[ 1]); \
                     if (laneCount < 3) { \
-                        X##bi ^= trailingBits; \
                     } \
                     else { \
-                        unwrapOneInvert(X, input, output, 2, bi) \
-                        X##bo ^= trailingBits; \
+                        X##bi ^= HTOLE64(input[ 2]); \
                     } \
                 } \
             } \
             else { \
-                unwrapOne(X, input, output, 0, ba) \
-                unwrapOneInvert(X, input, output, 1, be) \
-                unwrapOneInvert(X, input, output, 2, bi) \
-                unwrapOne(X, input, output, 3, bo) \
+                X##ba ^= HTOLE64(input[ 0]); \
+                X##be ^= HTOLE64(input[ 1]); \
+                X##bi ^= HTOLE64(input[ 2]); \
+                X##bo ^= HTOLE64(input[ 3]); \
                 if (laneCount < 6) { \
                     if (laneCount < 5) { \
-                        X##bu ^= trailingBits; \
                     } \
                     else { \
-                        unwrapOne(X, input, output, 4, bu) \
-                        X##ga ^= trailingBits; \
+                        X##bu ^= HTOLE64(input[ 4]); \
                     } \
                 } \
                 else { \
-                    unwrapOne(X, input, output, 4, bu) \
-                    unwrapOne(X, input, output, 5, ga) \
+                    X##bu ^= HTOLE64(input[ 4]); \
+                    X##ga ^= HTOLE64(input[ 5]); \
                     if (laneCount < 7) { \
-                        X##ge ^= trailingBits; \
                     } \
                     else { \
-                        unwrapOne(X, input, output, 6, ge) \
-                        X##gi ^= trailingBits; \
+                        X##ge ^= HTOLE64(input[ 6]); \
                     } \
                 } \
             } \
         } \
         else { \
-            unwrapOne(X, input, output, 0, ba) \
-            unwrapOneInvert(X, input, output, 1, be) \
-            unwrapOneInvert(X, input, output, 2, bi) \
-            unwrapOne(X, input, output, 3, bo) \
-            unwrapOne(X, input, output, 4, bu) \
-            unwrapOne(X, input, output, 5, ga) \
-            unwrapOne(X, input, output, 6, ge) \
-            unwrapOne(X, input, output, 7, gi) \
+            X##ba ^= HTOLE64(input[ 0]); \
+            X##be ^= HTOLE64(input[ 1]); \
+            X##bi ^= HTOLE64(input[ 2]); \
+            X##bo ^= HTOLE64(input[ 3]); \
+            X##bu ^= HTOLE64(input[ 4]); \
+            X##ga ^= HTOLE64(input[ 5]); \
+            X##ge ^= HTOLE64(input[ 6]); \
+            X##gi ^= HTOLE64(input[ 7]); \
             if (laneCount < 12) { \
                 if (laneCount < 10) { \
                     if (laneCount < 9) { \
-                        X##go ^= trailingBits; \
                     } \
                     else { \
-                        unwrapOneInvert(X, input, output, 8, go) \
-                        X##gu ^= trailingBits; \
+                        X##go ^= HTOLE64(input[ 8]); \
                     } \
                 } \
                 else { \
-                    unwrapOneInvert(X, input, output, 8, go) \
-                    unwrapOne(X, input, output, 9, gu) \
+                    X##go ^= HTOLE64(input[ 8]); \
+                    X##gu ^= HTOLE64(input[ 9]); \
                     if (laneCount < 11) { \
-                        X##ka ^= trailingBits; \
                     } \
                     else { \
-                        unwrapOne(X, input, output, 10, ka) \
-                        X##ke ^= trailingBits; \
+                        X##ka ^= HTOLE64(input[10]); \
                     } \
                 } \
             } \
             else { \
-                unwrapOneInvert(X, input, output, 8, go) \
-                unwrapOne(X, input, output, 9, gu) \
-                unwrapOne(X, input, output, 10, ka) \
-                unwrapOne(X, input, output, 11, ke) \
+                X##go ^= HTOLE64(input[ 8]); \
+                X##gu ^= HTOLE64(input[ 9]); \
+                X##ka ^= HTOLE64(input[10]); \
+                X##ke ^= HTOLE64(input[11]); \
                 if (laneCount < 14) { \
                     if (laneCount < 13) { \
-                        X##ki ^= trailingBits; \
                     } \
                     else { \
-                        unwrapOneInvert(X, input, output, 12, ki) \
-                        X##ko ^= trailingBits; \
+                        X##ki ^= HTOLE64(input[12]); \
                     } \
                 } \
                 else { \
-                    unwrapOneInvert(X, input, output, 12, ki) \
-                    unwrapOne(X, input, output, 13, ko) \
+                    X##ki ^= HTOLE64(input[12]); \
+                    X##ko ^= HTOLE64(input[13]); \
                     if (laneCount < 15) { \
-                        X##ku ^= trailingBits; \
                     } \
                     else { \
-                        unwrapOne(X, input, output, 14, ku) \
-                        X##ma ^= trailingBits; \
+                        X##ku ^= HTOLE64(input[14]); \
                     } \
                 } \
             } \
         } \
     } \
     else { \
-        unwrapOne(X, input, output, 0, ba) \
-        unwrapOneInvert(X, input, output, 1, be) \
-        unwrapOneInvert(X, input, output, 2, bi) \
-        unwrapOne(X, input, output, 3, bo) \
-        unwrapOne(X, input, output, 4, bu) \
-        unwrapOne(X, input, output, 5, ga) \
-        unwrapOne(X, input, output, 6, ge) \
-        unwrapOne(X, input, output, 7, gi) \
-        unwrapOneInvert(X, input, output, 8, go) \
-        unwrapOne(X, input, output, 9, gu) \
-        unwrapOne(X, input, output, 10, ka) \
-        unwrapOne(X, input, output, 11, ke) \
-        unwrapOneInvert(X, input, output, 12, ki) \
-        unwrapOne(X, input, output, 13, ko) \
-        unwrapOne(X, input, output, 14, ku) \
-        unwrapOne(X, input, output, 15, ma) \
+        X##ba ^= HTOLE64(input[ 0]); \
+        X##be ^= HTOLE64(input[ 1]); \
+        X##bi ^= HTOLE64(input[ 2]); \
+        X##bo ^= HTOLE64(input[ 3]); \
+        X##bu ^= HTOLE64(input[ 4]); \
+        X##ga ^= HTOLE64(input[ 5]); \
+        X##ge ^= HTOLE64(input[ 6]); \
+        X##gi ^= HTOLE64(input[ 7]); \
+        X##go ^= HTOLE64(input[ 8]); \
+        X##gu ^= HTOLE64(input[ 9]); \
+        X##ka ^= HTOLE64(input[10]); \
+        X##ke ^= HTOLE64(input[11]); \
+        X##ki ^= HTOLE64(input[12]); \
+        X##ko ^= HTOLE64(input[13]); \
+        X##ku ^= HTOLE64(input[14]); \
+        X##ma ^= HTOLE64(input[15]); \
         if (laneCount < 24) { \
             if (laneCount < 20) { \
                 if (laneCount < 18) { \
                     if (laneCount < 17) { \
-                        X##me ^= trailingBits; \
                     } \
                     else { \
-                        unwrapOne(X, input, output, 16, me) \
-                        X##mi ^= trailingBits; \
+                        X##me ^= HTOLE64(input[16]); \
                     } \
                 } \
                 else { \
-                    unwrapOne(X, input, output, 16, me) \
-                    unwrapOneInvert(X, input, output, 17, mi) \
+                    X##me ^= HTOLE64(input[16]); \
+                    X##mi ^= HTOLE64(input[17]); \
                     if (laneCount < 19) { \
-                        X##mo ^= trailingBits; \
                     } \
                     else { \
-                        unwrapOne(X, input, output, 18, mo) \
-                        X##mu ^= trailingBits; \
+                        X##mo ^= HTOLE64(input[18]); \
                     } \
                 } \
             } \
             else { \
-                unwrapOne(X, input, output, 16, me) \
-                unwrapOneInvert(X, input, output, 17, mi) \
-                unwrapOne(X, input, output, 18, mo) \
-                unwrapOne(X, input, output, 19, mu) \
+                X##me ^= HTOLE64(input[16]); \
+                X##mi ^= HTOLE64(input[17]); \
+                X##mo ^= HTOLE64(input[18]); \
+                X##mu ^= HTOLE64(input[19]); \
                 if (laneCount < 22) { \
                     if (laneCount < 21) { \
-                        X##sa ^= trailingBits; \
                     } \
                     else { \
-                        unwrapOneInvert(X, input, output, 20, sa) \
-                        X##se ^= trailingBits; \
+                        X##sa ^= HTOLE64(input[20]); \
                     } \
                 } \
                 else { \
-                    unwrapOneInvert(X, input, output, 20, sa) \
-                    unwrapOne(X, input, output, 21, se) \
+                    X##sa ^= HTOLE64(input[20]); \
+                    X##se ^= HTOLE64(input[21]); \
                     if (laneCount < 23) { \
-                        X##si ^= trailingBits; \
                     } \
                     else { \
-                        unwrapOne(X, input, output, 22, si) \
-                        X##so ^= trailingBits; \
+                        X##si ^= HTOLE64(input[22]); \
                     } \
                 } \
             } \
         } \
         else { \
-            unwrapOne(X, input, output, 16, me) \
-            unwrapOneInvert(X, input, output, 17, mi) \
-            unwrapOne(X, input, output, 18, mo) \
-            unwrapOne(X, input, output, 19, mu) \
-            unwrapOneInvert(X, input, output, 20, sa) \
-            unwrapOne(X, input, output, 21, se) \
-            unwrapOne(X, input, output, 22, si) \
-            unwrapOne(X, input, output, 23, so) \
+            X##me ^= HTOLE64(input[16]); \
+            X##mi ^= HTOLE64(input[17]); \
+            X##mo ^= HTOLE64(input[18]); \
+            X##mu ^= HTOLE64(input[19]); \
+            X##sa ^= HTOLE64(input[20]); \
+            X##se ^= HTOLE64(input[21]); \
+            X##si ^= HTOLE64(input[22]); \
+            X##so ^= HTOLE64(input[23]); \
             if (laneCount < 25) { \
-                X##su ^= trailingBits; \
             } \
             else { \
-                unwrapOne(X, input, output, 24, su) \
+                X##su ^= HTOLE64(input[24]); \
             } \
         } \
     }
diff --git a/Modules/_sha3/kcp/KeccakP-1600-SnP-opt32.h b/Modules/_sha3/kcp/KeccakP-1600-SnP-opt32.h
index 6cf765e6ce11e1..f5ac6b50d3425a 100644
--- a/Modules/_sha3/kcp/KeccakP-1600-SnP-opt32.h
+++ b/Modules/_sha3/kcp/KeccakP-1600-SnP-opt32.h
@@ -1,16 +1,21 @@
 /*
-Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
-Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
-denoted as "the implementer".
+The eXtended Keccak Code Package (XKCP)
+https://github.com/XKCP/XKCP
 
-For more information, feedback or questions, please refer to our websites:
-http://keccak.noekeon.org/
-http://keyak.noekeon.org/
-http://ketje.noekeon.org/
+The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche.
+
+Implementation by Ronny Van Keer, hereby denoted as "the implementer".
+
+For more information, feedback or questions, please refer to the Keccak Team website:
+https://keccak.team/
 
 To the extent possible under law, the implementer has waived all copyright
 and related or neighboring rights to the source code in this file.
 http://creativecommons.org/publicdomain/zero/1.0/
+
+---
+
+Please refer to SnP-documentation.h for more details.
 */
 
 #ifndef _KeccakP_1600_SnP_h_
@@ -29,6 +34,7 @@ void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset);
 void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length);
 void KeccakP1600_OverwriteBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length);
 void KeccakP1600_OverwriteWithZeroes(void *state, unsigned int byteCount);
+void KeccakP1600_Permute_Nrounds(void *state, unsigned int nrounds);
 void KeccakP1600_Permute_12rounds(void *state);
 void KeccakP1600_Permute_24rounds(void *state);
 void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length);
diff --git a/Modules/_sha3/kcp/KeccakP-1600-SnP-opt64.h b/Modules/_sha3/kcp/KeccakP-1600-SnP-opt64.h
index 889a31a79444c5..f904949c9b8d01 100644
--- a/Modules/_sha3/kcp/KeccakP-1600-SnP-opt64.h
+++ b/Modules/_sha3/kcp/KeccakP-1600-SnP-opt64.h
@@ -1,24 +1,26 @@
 /*
-Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
-Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
-denoted as "the implementer".
+The eXtended Keccak Code Package (XKCP)
+https://github.com/XKCP/XKCP
 
-For more information, feedback or questions, please refer to our websites:
-http://keccak.noekeon.org/
-http://keyak.noekeon.org/
-http://ketje.noekeon.org/
+The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche.
+
+Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer".
+
+For more information, feedback or questions, please refer to the Keccak Team website:
+https://keccak.team/
 
 To the extent possible under law, the implementer has waived all copyright
 and related or neighboring rights to the source code in this file.
 http://creativecommons.org/publicdomain/zero/1.0/
+
+---
+
+Please refer to SnP-documentation.h for more details.
 */
 
 #ifndef _KeccakP_1600_SnP_h_
 #define _KeccakP_1600_SnP_h_
 
-/** For the documentation, see SnP-documentation.h.
- */
-
 /* #include "brg_endian.h" */
 #include "KeccakP-1600-opt64-config.h"
 
@@ -26,6 +28,7 @@ and related or neighboring rights to the source code in this file.
 #define KeccakP1600_stateSizeInBytes    200
 #define KeccakP1600_stateAlignment      8
 #define KeccakF1600_FastLoop_supported
+#define KeccakP1600_12rounds_FastLoop_supported
 
 #include <stddef.h>
 
@@ -40,10 +43,12 @@ void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset);
 void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length);
 void KeccakP1600_OverwriteBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length);
 void KeccakP1600_OverwriteWithZeroes(void *state, unsigned int byteCount);
+void KeccakP1600_Permute_Nrounds(void *state, unsigned int nrounds);
 void KeccakP1600_Permute_12rounds(void *state);
 void KeccakP1600_Permute_24rounds(void *state);
 void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length);
 void KeccakP1600_ExtractAndAddBytes(const void *state, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length);
 size_t KeccakF1600_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen);
+size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen);
 
 #endif
diff --git a/Modules/_sha3/kcp/KeccakP-1600-inplace32BI.c b/Modules/_sha3/kcp/KeccakP-1600-inplace32BI.c
index a2f9ffea93259d..ccac7a2d6ba2e2 100644
--- a/Modules/_sha3/kcp/KeccakP-1600-inplace32BI.c
+++ b/Modules/_sha3/kcp/KeccakP-1600-inplace32BI.c
@@ -1,34 +1,36 @@
 /*
-Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
-Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
-denoted as "the implementer".
+The eXtended Keccak Code Package (XKCP)
+https://github.com/XKCP/XKCP
 
-For more information, feedback or questions, please refer to our websites:
-http://keccak.noekeon.org/
-http://keyak.noekeon.org/
-http://ketje.noekeon.org/
+The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche.
+
+Implementation by Ronny Van Keer, hereby denoted as "the implementer".
+
+For more information, feedback or questions, please refer to the Keccak Team website:
+https://keccak.team/
 
 To the extent possible under law, the implementer has waived all copyright
 and related or neighboring rights to the source code in this file.
 http://creativecommons.org/publicdomain/zero/1.0/
+
+---
+
+This file implements Keccak-p[1600] in a SnP-compatible way.
+Please refer to SnP-documentation.h for more details.
+
+This implementation comes with KeccakP-1600-SnP.h in the same folder.
+Please refer to LowLevel.build for the exact list of other files it must be combined with.
 */
 
-#include    <string.h>
+#include <stdint.h>
+#include <string.h>
 /* #include "brg_endian.h" */
 #include "KeccakP-1600-SnP.h"
 #include "SnP-Relaned.h"
 
-typedef unsigned char UINT8;
-typedef unsigned int UINT32;
-/* WARNING: on 8-bit and 16-bit platforms, this should be replaced by: */
-
-/*typedef unsigned long       UINT32; */
-
-
-#define ROL32(a, offset) ((((UINT32)a) << (offset)) ^ (((UINT32)a) >> (32-(offset))))
+#define ROL32(a, offset) ((((uint32_t)a) << (offset)) ^ (((uint32_t)a) >> (32-(offset))))
 
 /* Credit to Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 */
-
 #define prepareToBitInterleaving(low, high, temp, temp0, temp1) \
         temp0 = (low); \
         temp = (temp0 ^ (temp0 >>  1)) & 0x22222222UL;  temp0 = temp0 ^ temp ^ (temp <<  1); \
@@ -57,7 +59,6 @@ typedef unsigned int UINT32;
         odd = (temp0 >> 16) | (temp1 & 0xFFFF0000);
 
 /* Credit to Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 */
-
 #define prepareFromBitInterleaving(even, odd, temp, temp0, temp1) \
         temp0 = (even); \
         temp1 = (odd); \
@@ -85,26 +86,26 @@ typedef unsigned int UINT32;
 
 void KeccakP1600_SetBytesInLaneToZero(void *state, unsigned int lanePosition, unsigned int offset, unsigned int length)
 {
-    UINT8 laneAsBytes[8];
-    UINT32 low, high;
-    UINT32 temp, temp0, temp1;
-    UINT32 *stateAsHalfLanes = (UINT32*)state;
+    uint8_t laneAsBytes[8];
+    uint32_t low, high;
+    uint32_t temp, temp0, temp1;
+    uint32_t *stateAsHalfLanes = (uint32_t*)state;
 
     memset(laneAsBytes, 0xFF, offset);
     memset(laneAsBytes+offset, 0x00, length);
     memset(laneAsBytes+offset+length, 0xFF, 8-offset-length);
 #if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
-    low = *((UINT32*)(laneAsBytes+0));
-    high = *((UINT32*)(laneAsBytes+4));
+    memcpy(&low, laneAsBytes+0, 4);  /* byte copy instead of a uint32_t* cast: no aliasing/alignment UB */
+    memcpy(&high, laneAsBytes+4, 4); /* same 4 bytes the cast would have read on little-endian */
 #else
     low = laneAsBytes[0]
-        | ((UINT32)(laneAsBytes[1]) << 8)
-        | ((UINT32)(laneAsBytes[2]) << 16)
-        | ((UINT32)(laneAsBytes[3]) << 24);
+        | ((uint32_t)(laneAsBytes[1]) << 8)
+        | ((uint32_t)(laneAsBytes[2]) << 16)
+        | ((uint32_t)(laneAsBytes[3]) << 24);
     high = laneAsBytes[4]
-        | ((UINT32)(laneAsBytes[5]) << 8)
-        | ((UINT32)(laneAsBytes[6]) << 16)
-        | ((UINT32)(laneAsBytes[7]) << 24);
+        | ((uint32_t)(laneAsBytes[5]) << 8)
+        | ((uint32_t)(laneAsBytes[6]) << 16)
+        | ((uint32_t)(laneAsBytes[7]) << 24);
 #endif
     toBitInterleavingAndAND(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1);
 }
@@ -122,17 +123,17 @@ void KeccakP1600_AddByte(void *state, unsigned char byte, unsigned int offset)
 {
     unsigned int lanePosition = offset/8;
     unsigned int offsetInLane = offset%8;
-    UINT32 low, high;
-    UINT32 temp, temp0, temp1;
-    UINT32 *stateAsHalfLanes = (UINT32*)state;
+    uint32_t low, high;
+    uint32_t temp, temp0, temp1;
+    uint32_t *stateAsHalfLanes = (uint32_t*)state;
 
     if (offsetInLane < 4) {
-        low = (UINT32)byte << (offsetInLane*8);
+        low = (uint32_t)byte << (offsetInLane*8);
         high = 0;
     }
     else {
         low = 0;
-        high = (UINT32)byte << ((offsetInLane-4)*8);
+        high = (uint32_t)byte << ((offsetInLane-4)*8);
     }
     toBitInterleavingAndXOR(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1);
 }
@@ -141,25 +142,25 @@ void KeccakP1600_AddByte(void *state, unsigned char byte, unsigned int offset)
 
 void KeccakP1600_AddBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length)
 {
-    UINT8 laneAsBytes[8];
-    UINT32 low, high;
-    UINT32 temp, temp0, temp1;
-    UINT32 *stateAsHalfLanes = (UINT32*)state;
+    uint8_t laneAsBytes[8];
+    uint32_t low, high;
+    uint32_t temp, temp0, temp1;
+    uint32_t *stateAsHalfLanes = (uint32_t*)state;
 
     memset(laneAsBytes, 0, 8);
     memcpy(laneAsBytes+offset, data, length);
 #if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
-    low = *((UINT32*)(laneAsBytes+0));
-    high = *((UINT32*)(laneAsBytes+4));
+    memcpy(&low, laneAsBytes+0, 4);  /* byte copy instead of a uint32_t* cast: no aliasing/alignment UB */
+    memcpy(&high, laneAsBytes+4, 4); /* same 4 bytes the cast would have read on little-endian */
 #else
     low = laneAsBytes[0]
-        | ((UINT32)(laneAsBytes[1]) << 8)
-        | ((UINT32)(laneAsBytes[2]) << 16)
-        | ((UINT32)(laneAsBytes[3]) << 24);
+        | ((uint32_t)(laneAsBytes[1]) << 8)
+        | ((uint32_t)(laneAsBytes[2]) << 16)
+        | ((uint32_t)(laneAsBytes[3]) << 24);
     high = laneAsBytes[4]
-        | ((UINT32)(laneAsBytes[5]) << 8)
-        | ((UINT32)(laneAsBytes[6]) << 16)
-        | ((UINT32)(laneAsBytes[7]) << 24);
+        | ((uint32_t)(laneAsBytes[5]) << 8)
+        | ((uint32_t)(laneAsBytes[6]) << 16)
+        | ((uint32_t)(laneAsBytes[7]) << 24);
 #endif
     toBitInterleavingAndXOR(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1);
 }
@@ -169,14 +170,14 @@ void KeccakP1600_AddBytesInLane(void *state, unsigned int lanePosition, const un
 void KeccakP1600_AddLanes(void *state, const unsigned char *data, unsigned int laneCount)
 {
 #if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
-    const UINT32 * pI = (const UINT32 *)data;
-    UINT32 * pS = (UINT32*)state;
-    UINT32 t, x0, x1;
+    const uint32_t * pI = (const uint32_t *)data;
+    uint32_t * pS = (uint32_t*)state;
+    uint32_t t, x0, x1;
     int i;
     for (i = laneCount-1; i >= 0; --i) {
 #ifdef NO_MISALIGNED_ACCESSES
-        UINT32 low;
-        UINT32 high;
+        uint32_t low;
+        uint32_t high;
         memcpy(&low, pI++, 4);
         memcpy(&high, pI++, 4);
         toBitInterleavingAndXOR(low, high, *(pS++), *(pS++), t, x0, x1);
@@ -187,19 +188,18 @@ void KeccakP1600_AddLanes(void *state, const unsigned char *data, unsigned int l
 #else
     unsigned int lanePosition;
     for(lanePosition=0; lanePosition<laneCount; lanePosition++) {
-        UINT8 laneAsBytes[8];
-        UINT32 low, high, temp, temp0, temp1;
-        UINT32 *stateAsHalfLanes;
+        uint8_t laneAsBytes[8];
         memcpy(laneAsBytes, data+lanePosition*8, 8);
-        low = laneAsBytes[0]
-            | ((UINT32)(laneAsBytes[1]) << 8)
-            | ((UINT32)(laneAsBytes[2]) << 16)
-            | ((UINT32)(laneAsBytes[3]) << 24);
-        high = laneAsBytes[4]
-            | ((UINT32)(laneAsBytes[5]) << 8)
-            | ((UINT32)(laneAsBytes[6]) << 16)
-            | ((UINT32)(laneAsBytes[7]) << 24);
-        stateAsHalfLanes = (UINT32*)state;
+        uint32_t low = laneAsBytes[0]
+            | ((uint32_t)(laneAsBytes[1]) << 8)
+            | ((uint32_t)(laneAsBytes[2]) << 16)
+            | ((uint32_t)(laneAsBytes[3]) << 24);
+        uint32_t high = laneAsBytes[4]
+            | ((uint32_t)(laneAsBytes[5]) << 8)
+            | ((uint32_t)(laneAsBytes[6]) << 16)
+            | ((uint32_t)(laneAsBytes[7]) << 24);
+        uint32_t temp, temp0, temp1; /* scratch words handed to toBitInterleavingAndXOR */
+        uint32_t *stateAsHalfLanes = (uint32_t*)state;
         toBitInterleavingAndXOR(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1);
     }
 #endif
@@ -225,14 +225,14 @@ void KeccakP1600_OverwriteBytesInLane(void *state, unsigned int lanePosition, co
 void KeccakP1600_OverwriteLanes(void *state, const unsigned char *data, unsigned int laneCount)
 {
 #if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
-    const UINT32 * pI = (const UINT32 *)data;
-    UINT32 * pS = (UINT32 *)state;
-    UINT32 t, x0, x1;
+    const uint32_t * pI = (const uint32_t *)data;
+    uint32_t * pS = (uint32_t *)state;
+    uint32_t t, x0, x1;
     int i;
     for (i = laneCount-1; i >= 0; --i) {
 #ifdef NO_MISALIGNED_ACCESSES
-        UINT32 low;
-        UINT32 high;
+        uint32_t low;
+        uint32_t high;
         memcpy(&low, pI++, 4);
         memcpy(&high, pI++, 4);
         toBitInterleavingAndSet(low, high, *(pS++), *(pS++), t, x0, x1);
@@ -243,19 +243,18 @@ void KeccakP1600_OverwriteLanes(void *state, const unsigned char *data, unsigned
 #else
     unsigned int lanePosition;
     for(lanePosition=0; lanePosition<laneCount; lanePosition++) {
-        UINT8 laneAsBytes[8];
-        UINT32 low, high, temp, temp0, temp1;
-        UINT32 *stateAsHalfLanes;
+        uint8_t laneAsBytes[8];
         memcpy(laneAsBytes, data+lanePosition*8, 8);
-        low = laneAsBytes[0]
-            | ((UINT32)(laneAsBytes[1]) << 8)
-            | ((UINT32)(laneAsBytes[2]) << 16)
-            | ((UINT32)(laneAsBytes[3]) << 24);
-        high = laneAsBytes[4]
-            | ((UINT32)(laneAsBytes[5]) << 8)
-            | ((UINT32)(laneAsBytes[6]) << 16)
-            | ((UINT32)(laneAsBytes[7]) << 24);
-        stateAsHalfLanes = (UINT32*)state;
+        uint32_t low = laneAsBytes[0]
+            | ((uint32_t)(laneAsBytes[1]) << 8)
+            | ((uint32_t)(laneAsBytes[2]) << 16)
+            | ((uint32_t)(laneAsBytes[3]) << 24);
+        uint32_t high = laneAsBytes[4]
+            | ((uint32_t)(laneAsBytes[5]) << 8)
+            | ((uint32_t)(laneAsBytes[6]) << 16)
+            | ((uint32_t)(laneAsBytes[7]) << 24);
+        uint32_t temp, temp0, temp1; /* scratch words handed to toBitInterleavingAndSet */
+        uint32_t *stateAsHalfLanes = (uint32_t*)state;
         toBitInterleavingAndSet(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1);
     }
 #endif
@@ -272,7 +271,7 @@ void KeccakP1600_OverwriteBytes(void *state, const unsigned char *data, unsigned
 
 void KeccakP1600_OverwriteWithZeroes(void *state, unsigned int byteCount)
 {
-    UINT32 *stateAsHalfLanes = (UINT32*)state;
+    uint32_t *stateAsHalfLanes = (uint32_t*)state;
     unsigned int i;
 
     for(i=0; i<byteCount/8; i++) {
@@ -287,14 +286,14 @@ void KeccakP1600_OverwriteWithZeroes(void *state, unsigned int byteCount)
 
 void KeccakP1600_ExtractBytesInLane(const void *state, unsigned int lanePosition, unsigned char *data, unsigned int offset, unsigned int length)
 {
-    UINT32 *stateAsHalfLanes = (UINT32*)state;
-    UINT32 low, high, temp, temp0, temp1;
-    UINT8 laneAsBytes[8];
+    uint32_t *stateAsHalfLanes = (uint32_t*)state;
+    uint32_t low, high, temp, temp0, temp1;
+    uint8_t laneAsBytes[8];
 
     fromBitInterleaving(stateAsHalfLanes[lanePosition*2], stateAsHalfLanes[lanePosition*2+1], low, high, temp, temp0, temp1);
 #if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
-    *((UINT32*)(laneAsBytes+0)) = low;
-    *((UINT32*)(laneAsBytes+4)) = high;
+    memcpy(laneAsBytes+0, &low, 4);  /* byte copy instead of a uint32_t* cast: no aliasing/alignment UB */
+    memcpy(laneAsBytes+4, &high, 4); /* same 4 bytes the cast would have written on little-endian */
 #else
     laneAsBytes[0] = low & 0xFF;
     laneAsBytes[1] = (low >> 8) & 0xFF;
@@ -313,14 +312,14 @@ void KeccakP1600_ExtractBytesInLane(const void *state, unsigned int lanePosition
 void KeccakP1600_ExtractLanes(const void *state, unsigned char *data, unsigned int laneCount)
 {
 #if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
-    UINT32 * pI = (UINT32 *)data;
-    const UINT32 * pS = ( const UINT32 *)state;
-    UINT32 t, x0, x1;
+    uint32_t * pI = (uint32_t *)data;
+    const uint32_t * pS = ( const uint32_t *)state;
+    uint32_t t, x0, x1;
     int i;
     for (i = laneCount-1; i >= 0; --i) {
 #ifdef NO_MISALIGNED_ACCESSES
-        UINT32 low;
-        UINT32 high;
+        uint32_t low;
+        uint32_t high;
         fromBitInterleaving(*(pS++), *(pS++), low, high, t, x0, x1);
         memcpy(pI++, &low, 4);
         memcpy(pI++, &high, 4);
@@ -331,10 +330,10 @@ void KeccakP1600_ExtractLanes(const void *state, unsigned char *data, unsigned i
 #else
     unsigned int lanePosition;
     for(lanePosition=0; lanePosition<laneCount; lanePosition++) {
-        UINT32 *stateAsHalfLanes = (UINT32*)state;
-        UINT32 low, high, temp, temp0, temp1;
-        UINT8 laneAsBytes[8];
+        uint32_t *stateAsHalfLanes = (uint32_t*)state;
+        uint32_t low, high, temp, temp0, temp1;
         fromBitInterleaving(stateAsHalfLanes[lanePosition*2], stateAsHalfLanes[lanePosition*2+1], low, high, temp, temp0, temp1);
+        uint8_t laneAsBytes[8];
         laneAsBytes[0] = low & 0xFF;
         laneAsBytes[1] = (low >> 8) & 0xFF;
         laneAsBytes[2] = (low >> 16) & 0xFF;
@@ -359,15 +358,15 @@ void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned i
 
 void KeccakP1600_ExtractAndAddBytesInLane(const void *state, unsigned int lanePosition, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
 {
-    UINT32 *stateAsHalfLanes = (UINT32*)state;
-    UINT32 low, high, temp, temp0, temp1;
-    UINT8 laneAsBytes[8];
+    uint32_t *stateAsHalfLanes = (uint32_t*)state;
+    uint32_t low, high, temp, temp0, temp1;
+    uint8_t laneAsBytes[8];
     unsigned int i;
 
     fromBitInterleaving(stateAsHalfLanes[lanePosition*2], stateAsHalfLanes[lanePosition*2+1], low, high, temp, temp0, temp1);
 #if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
-    *((UINT32*)(laneAsBytes+0)) = low;
-    *((UINT32*)(laneAsBytes+4)) = high;
+    *((uint32_t*)(laneAsBytes+0)) = low;
+    *((uint32_t*)(laneAsBytes+4)) = high;
 #else
     laneAsBytes[0] = low & 0xFF;
     laneAsBytes[1] = (low >> 8) & 0xFF;
@@ -387,15 +386,15 @@ void KeccakP1600_ExtractAndAddBytesInLane(const void *state, unsigned int lanePo
 void KeccakP1600_ExtractAndAddLanes(const void *state, const unsigned char *input, unsigned char *output, unsigned int laneCount)
 {
 #if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
-    const UINT32 * pI = (const UINT32 *)input;
-    UINT32 * pO = (UINT32 *)output;
-    const UINT32 * pS = (const UINT32 *)state;
-    UINT32 t, x0, x1;
+    const uint32_t * pI = (const uint32_t *)input;
+    uint32_t * pO = (uint32_t *)output;
+    const uint32_t * pS = (const uint32_t *)state;
+    uint32_t t, x0, x1;
     int i;
     for (i = laneCount-1; i >= 0; --i) {
 #ifdef NO_MISALIGNED_ACCESSES
-        UINT32 low;
-        UINT32 high;
+        uint32_t low;
+        uint32_t high;
         fromBitInterleaving(*(pS++), *(pS++), low, high, t, x0, x1);
         *(pO++) = *(pI++) ^ low;
         *(pO++) = *(pI++) ^ high;
@@ -406,10 +405,10 @@ void KeccakP1600_ExtractAndAddLanes(const void *state, const unsigned char *inpu
 #else
     unsigned int lanePosition;
     for(lanePosition=0; lanePosition<laneCount; lanePosition++) {
-        UINT32 *stateAsHalfLanes = (UINT32*)state;
-        UINT32 low, high, temp, temp0, temp1;
-        UINT8 laneAsBytes[8];
+        uint32_t *stateAsHalfLanes = (uint32_t*)state;
+        uint32_t low, high, temp, temp0, temp1;
         fromBitInterleaving(stateAsHalfLanes[lanePosition*2], stateAsHalfLanes[lanePosition*2+1], low, high, temp, temp0, temp1);
+        uint8_t laneAsBytes[8];
         laneAsBytes[0] = low & 0xFF;
         laneAsBytes[1] = (low >> 8) & 0xFF;
         laneAsBytes[2] = (low >> 16) & 0xFF;
@@ -418,8 +417,8 @@ void KeccakP1600_ExtractAndAddLanes(const void *state, const unsigned char *inpu
         laneAsBytes[5] = (high >> 8) & 0xFF;
         laneAsBytes[6] = (high >> 16) & 0xFF;
         laneAsBytes[7] = (high >> 24) & 0xFF;
-        ((UINT32*)(output+lanePosition*8))[0] = ((UINT32*)(input+lanePosition*8))[0] ^ (*(const UINT32*)(laneAsBytes+0));
-        ((UINT32*)(output+lanePosition*8))[1] = ((UINT32*)(input+lanePosition*8))[0] ^ (*(const UINT32*)(laneAsBytes+4));
+        ((uint32_t*)(output+lanePosition*8))[0] = ((uint32_t*)(input+lanePosition*8))[0] ^ (*(const uint32_t*)(laneAsBytes+0)); /* low word: output = input XOR extracted bytes 0..3 */
+        ((uint32_t*)(output+lanePosition*8))[1] = ((uint32_t*)(input+lanePosition*8))[1] ^ (*(const uint32_t*)(laneAsBytes+4)); /* high word: must read input word 1 (was word 0) */
     }
 #endif
 }
@@ -432,7 +431,7 @@ void KeccakP1600_ExtractAndAddBytes(const void *state, const unsigned char *inpu
 
 /* ---------------------------------------------------------------- */
 
-static const UINT32 KeccakF1600RoundConstants_int2[2*24+1] =
+static const uint32_t KeccakF1600RoundConstants_int2[2*24+1] =
 {
     0x00000001UL,    0x00000000UL,
     0x00000000UL,    0x00000089UL,
@@ -461,690 +460,692 @@ static const UINT32 KeccakF1600RoundConstants_int2[2*24+1] =
     0x000000FFUL
 };
 
-#define KeccakAtoD_round0() \
+#define KeccakRound0() /* Round variant 0 of 4. Expects Cx/Cy/Cz/Cw, the D and B scratch words, the A state-word macros and pRoundConstants in scope. The leading lines XOR five state words into each C word and derive the ten D words from them. */ \
         Cx = Abu0^Agu0^Aku0^Amu0^Asu0; \
         Du1 = Abe1^Age1^Ake1^Ame1^Ase1; \
         Da0 = Cx^ROL32(Du1, 1); \
         Cz = Abu1^Agu1^Aku1^Amu1^Asu1; \
         Du0 = Abe0^Age0^Ake0^Ame0^Ase0; \
         Da1 = Cz^Du0; \
-\
         Cw = Abi0^Agi0^Aki0^Ami0^Asi0; \
         Do0 = Cw^ROL32(Cz, 1); \
         Cy = Abi1^Agi1^Aki1^Ami1^Asi1; \
         Do1 = Cy^Cx; \
-\
         Cx = Aba0^Aga0^Aka0^Ama0^Asa0; \
         De0 = Cx^ROL32(Cy, 1); \
         Cz = Aba1^Aga1^Aka1^Ama1^Asa1; \
         De1 = Cz^Cw; \
-\
         Cy = Abo1^Ago1^Ako1^Amo1^Aso1; \
         Di0 = Du0^ROL32(Cy, 1); \
         Cw = Abo0^Ago0^Ako0^Amo0^Aso0; \
         Di1 = Du1^Cw; \
-\
         Du0 = Cw^ROL32(Cz, 1); \
         Du1 = Cy^Cx; \
-
-#define KeccakAtoD_round1() \
+        /* Ten groups follow: each loads five state words XORed with a D word (most of them rotated) into Ba..Bu, then writes B ^ (~B_next & B_next2) back into the same five state words. */ \
+        Ba = (Aba0^Da0); \
+        Be = ROL32((Age0^De0), 22); \
+        Bi = ROL32((Aki1^Di1), 22); \
+        Bo = ROL32((Amo1^Do1), 11); \
+        Bu = ROL32((Asu0^Du0),  7); \
+        Aba0 =   Ba ^((~Be)&  Bi ); \
+        Aba0 ^= *(pRoundConstants++); /* XOR in one round-constant word, then advance the pointer */ \
+        Age0 =   Be ^((~Bi)&  Bo ); \
+        Aki1 =   Bi ^((~Bo)&  Bu ); \
+        Amo1 =   Bo ^((~Bu)&  Ba ); \
+        Asu0 =   Bu ^((~Ba)&  Be ); \
+        Ba = (Aba1^Da1); \
+        Be = ROL32((Age1^De1), 22); \
+        Bi = ROL32((Aki0^Di0), 21); \
+        Bo = ROL32((Amo0^Do0), 10); \
+        Bu = ROL32((Asu1^Du1),  7); \
+        Aba1 =   Ba ^((~Be)&  Bi ); \
+        Aba1 ^= *(pRoundConstants++); /* second round-constant word of this round */ \
+        Age1 =   Be ^((~Bi)&  Bo ); \
+        Aki0 =   Bi ^((~Bo)&  Bu ); \
+        Amo0 =   Bo ^((~Bu)&  Ba ); \
+        Asu1 =   Bu ^((~Ba)&  Be ); \
+        Bi = ROL32((Aka1^Da1),  2); \
+        Bo = ROL32((Ame1^De1), 23); \
+        Bu = ROL32((Asi1^Di1), 31); \
+        Ba = ROL32((Abo0^Do0), 14); \
+        Be = ROL32((Agu0^Du0), 10); \
+        Aka1 =   Ba ^((~Be)&  Bi ); \
+        Ame1 =   Be ^((~Bi)&  Bo ); \
+        Asi1 =   Bi ^((~Bo)&  Bu ); \
+        Abo0 =   Bo ^((~Bu)&  Ba ); \
+        Agu0 =   Bu ^((~Ba)&  Be ); \
+        Bi = ROL32((Aka0^Da0),  1); \
+        Bo = ROL32((Ame0^De0), 22); \
+        Bu = ROL32((Asi0^Di0), 30); \
+        Ba = ROL32((Abo1^Do1), 14); \
+        Be = ROL32((Agu1^Du1), 10); \
+        Aka0 =   Ba ^((~Be)&  Bi ); \
+        Ame0 =   Be ^((~Bi)&  Bo ); \
+        Asi0 =   Bi ^((~Bo)&  Bu ); \
+        Abo1 =   Bo ^((~Bu)&  Ba ); \
+        Agu1 =   Bu ^((~Ba)&  Be ); \
+        Bu = ROL32((Asa0^Da0),  9); \
+        Ba = ROL32((Abe1^De1),  1); \
+        Be = ROL32((Agi0^Di0),  3); \
+        Bi = ROL32((Ako1^Do1), 13); \
+        Bo = ROL32((Amu0^Du0),  4); \
+        Asa0 =   Ba ^((~Be)&  Bi ); \
+        Abe1 =   Be ^((~Bi)&  Bo ); \
+        Agi0 =   Bi ^((~Bo)&  Bu ); \
+        Ako1 =   Bo ^((~Bu)&  Ba ); \
+        Amu0 =   Bu ^((~Ba)&  Be ); \
+        Bu = ROL32((Asa1^Da1),  9); \
+        Ba = (Abe0^De0); \
+        Be = ROL32((Agi1^Di1),  3); \
+        Bi = ROL32((Ako0^Do0), 12); \
+        Bo = ROL32((Amu1^Du1),  4); \
+        Asa1 =   Ba ^((~Be)&  Bi ); \
+        Abe0 =   Be ^((~Bi)&  Bo ); \
+        Agi1 =   Bi ^((~Bo)&  Bu ); \
+        Ako0 =   Bo ^((~Bu)&  Ba ); \
+        Amu1 =   Bu ^((~Ba)&  Be ); \
+        Be = ROL32((Aga0^Da0), 18); \
+        Bi = ROL32((Ake0^De0),  5); \
+        Bo = ROL32((Ami1^Di1),  8); \
+        Bu = ROL32((Aso0^Do0), 28); \
+        Ba = ROL32((Abu1^Du1), 14); \
+        Aga0 =   Ba ^((~Be)&  Bi ); \
+        Ake0 =   Be ^((~Bi)&  Bo ); \
+        Ami1 =   Bi ^((~Bo)&  Bu ); \
+        Aso0 =   Bo ^((~Bu)&  Ba ); \
+        Abu1 =   Bu ^((~Ba)&  Be ); \
+        Be = ROL32((Aga1^Da1), 18); \
+        Bi = ROL32((Ake1^De1),  5); \
+        Bo = ROL32((Ami0^Di0),  7); \
+        Bu = ROL32((Aso1^Do1), 28); \
+        Ba = ROL32((Abu0^Du0), 13); \
+        Aga1 =   Ba ^((~Be)&  Bi ); \
+        Ake1 =   Be ^((~Bi)&  Bo ); \
+        Ami0 =   Bi ^((~Bo)&  Bu ); \
+        Aso1 =   Bo ^((~Bu)&  Ba ); \
+        Abu0 =   Bu ^((~Ba)&  Be ); \
+        Bo = ROL32((Ama1^Da1), 21); \
+        Bu = ROL32((Ase0^De0),  1); \
+        Ba = ROL32((Abi0^Di0), 31); \
+        Be = ROL32((Ago1^Do1), 28); \
+        Bi = ROL32((Aku1^Du1), 20); \
+        Ama1 =   Ba ^((~Be)&  Bi ); \
+        Ase0 =   Be ^((~Bi)&  Bo ); \
+        Abi0 =   Bi ^((~Bo)&  Bu ); \
+        Ago1 =   Bo ^((~Bu)&  Ba ); \
+        Aku1 =   Bu ^((~Ba)&  Be ); \
+        Bo = ROL32((Ama0^Da0), 20); \
+        Bu = ROL32((Ase1^De1),  1); \
+        Ba = ROL32((Abi1^Di1), 31); \
+        Be = ROL32((Ago0^Do0), 27); \
+        Bi = ROL32((Aku0^Du0), 19); \
+        Ama0 =   Ba ^((~Be)&  Bi ); \
+        Ase1 =   Be ^((~Bi)&  Bo ); \
+        Abi1 =   Bi ^((~Bo)&  Bu ); \
+        Ago0 =   Bo ^((~Bu)&  Ba ); \
+        Aku0 =   Bu ^((~Ba)&  Be ) /* NOTE(review): no trailing ';' here, unlike KeccakRound1..3; the invocation must supply it */
+
+#define KeccakRound1() /* Round variant 1 of 4: same operations and rotation amounts as KeccakRound0, applied to a different arrangement of the state words. Expects the same scratch variables and pRoundConstants in scope. */ \
         Cx = Asu0^Agu0^Amu0^Abu1^Aku1; \
         Du1 = Age1^Ame0^Abe0^Ake1^Ase1; \
         Da0 = Cx^ROL32(Du1, 1); \
         Cz = Asu1^Agu1^Amu1^Abu0^Aku0; \
         Du0 = Age0^Ame1^Abe1^Ake0^Ase0; \
         Da1 = Cz^Du0; \
-\
         Cw = Aki1^Asi1^Agi0^Ami1^Abi0; \
         Do0 = Cw^ROL32(Cz, 1); \
         Cy = Aki0^Asi0^Agi1^Ami0^Abi1; \
         Do1 = Cy^Cx; \
-\
         Cx = Aba0^Aka1^Asa0^Aga0^Ama1; \
         De0 = Cx^ROL32(Cy, 1); \
         Cz = Aba1^Aka0^Asa1^Aga1^Ama0; \
         De1 = Cz^Cw; \
-\
         Cy = Amo0^Abo1^Ako0^Aso1^Ago0; \
         Di0 = Du0^ROL32(Cy, 1); \
         Cw = Amo1^Abo0^Ako1^Aso0^Ago1; \
         Di1 = Du1^Cw; \
-\
         Du0 = Cw^ROL32(Cz, 1); \
         Du1 = Cy^Cx; \
-
-#define KeccakAtoD_round2() \
+        /* Ten groups follow: each loads five state words XORed with a D word (most of them rotated) into Ba..Bu, then writes B ^ (~B_next & B_next2) back into the same five state words. */ \
+        Ba = (Aba0^Da0); \
+        Be = ROL32((Ame1^De0), 22); \
+        Bi = ROL32((Agi1^Di1), 22); \
+        Bo = ROL32((Aso1^Do1), 11); \
+        Bu = ROL32((Aku1^Du0),  7); \
+        Aba0 =   Ba ^((~Be)&  Bi ); \
+        Aba0 ^= *(pRoundConstants++); /* XOR in one round-constant word, then advance the pointer */ \
+        Ame1 =   Be ^((~Bi)&  Bo ); \
+        Agi1 =   Bi ^((~Bo)&  Bu ); \
+        Aso1 =   Bo ^((~Bu)&  Ba ); \
+        Aku1 =   Bu ^((~Ba)&  Be ); \
+        Ba = (Aba1^Da1); \
+        Be = ROL32((Ame0^De1), 22); \
+        Bi = ROL32((Agi0^Di0), 21); \
+        Bo = ROL32((Aso0^Do0), 10); \
+        Bu = ROL32((Aku0^Du1),  7); \
+        Aba1 =   Ba ^((~Be)&  Bi ); \
+        Aba1 ^= *(pRoundConstants++); /* second round-constant word of this round */ \
+        Ame0 =   Be ^((~Bi)&  Bo ); \
+        Agi0 =   Bi ^((~Bo)&  Bu ); \
+        Aso0 =   Bo ^((~Bu)&  Ba ); \
+        Aku0 =   Bu ^((~Ba)&  Be ); \
+        Bi = ROL32((Asa1^Da1),  2); \
+        Bo = ROL32((Ake1^De1), 23); \
+        Bu = ROL32((Abi1^Di1), 31); \
+        Ba = ROL32((Amo1^Do0), 14); \
+        Be = ROL32((Agu0^Du0), 10); \
+        Asa1 =   Ba ^((~Be)&  Bi ); \
+        Ake1 =   Be ^((~Bi)&  Bo ); \
+        Abi1 =   Bi ^((~Bo)&  Bu ); \
+        Amo1 =   Bo ^((~Bu)&  Ba ); \
+        Agu0 =   Bu ^((~Ba)&  Be ); \
+        Bi = ROL32((Asa0^Da0),  1); \
+        Bo = ROL32((Ake0^De0), 22); \
+        Bu = ROL32((Abi0^Di0), 30); \
+        Ba = ROL32((Amo0^Do1), 14); \
+        Be = ROL32((Agu1^Du1), 10); \
+        Asa0 =   Ba ^((~Be)&  Bi ); \
+        Ake0 =   Be ^((~Bi)&  Bo ); \
+        Abi0 =   Bi ^((~Bo)&  Bu ); \
+        Amo0 =   Bo ^((~Bu)&  Ba ); \
+        Agu1 =   Bu ^((~Ba)&  Be ); \
+        Bu = ROL32((Ama1^Da0),  9); \
+        Ba = ROL32((Age1^De1),  1); \
+        Be = ROL32((Asi1^Di0),  3); \
+        Bi = ROL32((Ako0^Do1), 13); \
+        Bo = ROL32((Abu1^Du0),  4); \
+        Ama1 =   Ba ^((~Be)&  Bi ); \
+        Age1 =   Be ^((~Bi)&  Bo ); \
+        Asi1 =   Bi ^((~Bo)&  Bu ); \
+        Ako0 =   Bo ^((~Bu)&  Ba ); \
+        Abu1 =   Bu ^((~Ba)&  Be ); \
+        Bu = ROL32((Ama0^Da1),  9); \
+        Ba = (Age0^De0); \
+        Be = ROL32((Asi0^Di1),  3); \
+        Bi = ROL32((Ako1^Do0), 12); \
+        Bo = ROL32((Abu0^Du1),  4); \
+        Ama0 =   Ba ^((~Be)&  Bi ); \
+        Age0 =   Be ^((~Bi)&  Bo ); \
+        Asi0 =   Bi ^((~Bo)&  Bu ); \
+        Ako1 =   Bo ^((~Bu)&  Ba ); \
+        Abu0 =   Bu ^((~Ba)&  Be ); \
+        Be = ROL32((Aka1^Da0), 18); \
+        Bi = ROL32((Abe1^De0),  5); \
+        Bo = ROL32((Ami0^Di1),  8); \
+        Bu = ROL32((Ago1^Do0), 28); \
+        Ba = ROL32((Asu1^Du1), 14); \
+        Aka1 =   Ba ^((~Be)&  Bi ); \
+        Abe1 =   Be ^((~Bi)&  Bo ); \
+        Ami0 =   Bi ^((~Bo)&  Bu ); \
+        Ago1 =   Bo ^((~Bu)&  Ba ); \
+        Asu1 =   Bu ^((~Ba)&  Be ); \
+        Be = ROL32((Aka0^Da1), 18); \
+        Bi = ROL32((Abe0^De1),  5); \
+        Bo = ROL32((Ami1^Di0),  7); \
+        Bu = ROL32((Ago0^Do1), 28); \
+        Ba = ROL32((Asu0^Du0), 13); \
+        Aka0 =   Ba ^((~Be)&  Bi ); \
+        Abe0 =   Be ^((~Bi)&  Bo ); \
+        Ami1 =   Bi ^((~Bo)&  Bu ); \
+        Ago0 =   Bo ^((~Bu)&  Ba ); \
+        Asu0 =   Bu ^((~Ba)&  Be ); \
+        Bo = ROL32((Aga1^Da1), 21); \
+        Bu = ROL32((Ase0^De0),  1); \
+        Ba = ROL32((Aki1^Di0), 31); \
+        Be = ROL32((Abo1^Do1), 28); \
+        Bi = ROL32((Amu1^Du1), 20); \
+        Aga1 =   Ba ^((~Be)&  Bi ); \
+        Ase0 =   Be ^((~Bi)&  Bo ); \
+        Aki1 =   Bi ^((~Bo)&  Bu ); \
+        Abo1 =   Bo ^((~Bu)&  Ba ); \
+        Amu1 =   Bu ^((~Ba)&  Be ); \
+        Bo = ROL32((Aga0^Da0), 20); \
+        Bu = ROL32((Ase1^De1),  1); \
+        Ba = ROL32((Aki0^Di1), 31); \
+        Be = ROL32((Abo0^Do0), 27); \
+        Bi = ROL32((Amu0^Du0), 19); \
+        Aga0 =   Ba ^((~Be)&  Bi ); \
+        Ase1 =   Be ^((~Bi)&  Bo ); \
+        Aki0 =   Bi ^((~Bo)&  Bu ); \
+        Abo0 =   Bo ^((~Bu)&  Ba ); \
+        Amu0 =   Bu ^((~Ba)&  Be ); /* the trailing ';' is part of this macro, unlike KeccakRound0 */
+
+#define KeccakRound2() /* Round variant 2 of 4: same operations and rotation amounts as KeccakRound0, applied to a different arrangement of the state words. Expects the same scratch variables and pRoundConstants in scope. */ \
         Cx = Aku1^Agu0^Abu1^Asu1^Amu1; \
         Du1 = Ame0^Ake0^Age0^Abe0^Ase1; \
         Da0 = Cx^ROL32(Du1, 1); \
         Cz = Aku0^Agu1^Abu0^Asu0^Amu0; \
         Du0 = Ame1^Ake1^Age1^Abe1^Ase0; \
         Da1 = Cz^Du0; \
-\
         Cw = Agi1^Abi1^Asi1^Ami0^Aki1; \
         Do0 = Cw^ROL32(Cz, 1); \
         Cy = Agi0^Abi0^Asi0^Ami1^Aki0; \
         Do1 = Cy^Cx; \
-\
         Cx = Aba0^Asa1^Ama1^Aka1^Aga1; \
         De0 = Cx^ROL32(Cy, 1); \
         Cz = Aba1^Asa0^Ama0^Aka0^Aga0; \
         De1 = Cz^Cw; \
-\
         Cy = Aso0^Amo0^Ako1^Ago0^Abo0; \
         Di0 = Du0^ROL32(Cy, 1); \
         Cw = Aso1^Amo1^Ako0^Ago1^Abo1; \
         Di1 = Du1^Cw; \
-\
         Du0 = Cw^ROL32(Cz, 1); \
         Du1 = Cy^Cx; \
-
-#define KeccakAtoD_round3() \
+        /* Ten groups follow: each loads five state words XORed with a D word (most of them rotated) into Ba..Bu, then writes B ^ (~B_next & B_next2) back into the same five state words. */ \
+        Ba = (Aba0^Da0); \
+        Be = ROL32((Ake1^De0), 22); \
+        Bi = ROL32((Asi0^Di1), 22); \
+        Bo = ROL32((Ago0^Do1), 11); \
+        Bu = ROL32((Amu1^Du0),  7); \
+        Aba0 =   Ba ^((~Be)&  Bi ); \
+        Aba0 ^= *(pRoundConstants++); /* XOR in one round-constant word, then advance the pointer */ \
+        Ake1 =   Be ^((~Bi)&  Bo ); \
+        Asi0 =   Bi ^((~Bo)&  Bu ); \
+        Ago0 =   Bo ^((~Bu)&  Ba ); \
+        Amu1 =   Bu ^((~Ba)&  Be ); \
+        Ba = (Aba1^Da1); \
+        Be = ROL32((Ake0^De1), 22); \
+        Bi = ROL32((Asi1^Di0), 21); \
+        Bo = ROL32((Ago1^Do0), 10); \
+        Bu = ROL32((Amu0^Du1),  7); \
+        Aba1 =   Ba ^((~Be)&  Bi ); \
+        Aba1 ^= *(pRoundConstants++); /* second round-constant word of this round */ \
+        Ake0 =   Be ^((~Bi)&  Bo ); \
+        Asi1 =   Bi ^((~Bo)&  Bu ); \
+        Ago1 =   Bo ^((~Bu)&  Ba ); \
+        Amu0 =   Bu ^((~Ba)&  Be ); \
+        Bi = ROL32((Ama0^Da1),  2); \
+        Bo = ROL32((Abe0^De1), 23); \
+        Bu = ROL32((Aki0^Di1), 31); \
+        Ba = ROL32((Aso1^Do0), 14); \
+        Be = ROL32((Agu0^Du0), 10); \
+        Ama0 =   Ba ^((~Be)&  Bi ); \
+        Abe0 =   Be ^((~Bi)&  Bo ); \
+        Aki0 =   Bi ^((~Bo)&  Bu ); \
+        Aso1 =   Bo ^((~Bu)&  Ba ); \
+        Agu0 =   Bu ^((~Ba)&  Be ); \
+        Bi = ROL32((Ama1^Da0),  1); \
+        Bo = ROL32((Abe1^De0), 22); \
+        Bu = ROL32((Aki1^Di0), 30); \
+        Ba = ROL32((Aso0^Do1), 14); \
+        Be = ROL32((Agu1^Du1), 10); \
+        Ama1 =   Ba ^((~Be)&  Bi ); \
+        Abe1 =   Be ^((~Bi)&  Bo ); \
+        Aki1 =   Bi ^((~Bo)&  Bu ); \
+        Aso0 =   Bo ^((~Bu)&  Ba ); \
+        Agu1 =   Bu ^((~Ba)&  Be ); \
+        Bu = ROL32((Aga1^Da0),  9); \
+        Ba = ROL32((Ame0^De1),  1); \
+        Be = ROL32((Abi1^Di0),  3); \
+        Bi = ROL32((Ako1^Do1), 13); \
+        Bo = ROL32((Asu1^Du0),  4); \
+        Aga1 =   Ba ^((~Be)&  Bi ); \
+        Ame0 =   Be ^((~Bi)&  Bo ); \
+        Abi1 =   Bi ^((~Bo)&  Bu ); \
+        Ako1 =   Bo ^((~Bu)&  Ba ); \
+        Asu1 =   Bu ^((~Ba)&  Be ); \
+        Bu = ROL32((Aga0^Da1),  9); \
+        Ba = (Ame1^De0); \
+        Be = ROL32((Abi0^Di1),  3); \
+        Bi = ROL32((Ako0^Do0), 12); \
+        Bo = ROL32((Asu0^Du1),  4); \
+        Aga0 =   Ba ^((~Be)&  Bi ); \
+        Ame1 =   Be ^((~Bi)&  Bo ); \
+        Abi0 =   Bi ^((~Bo)&  Bu ); \
+        Ako0 =   Bo ^((~Bu)&  Ba ); \
+        Asu0 =   Bu ^((~Ba)&  Be ); \
+        Be = ROL32((Asa1^Da0), 18); \
+        Bi = ROL32((Age1^De0),  5); \
+        Bo = ROL32((Ami1^Di1),  8); \
+        Bu = ROL32((Abo1^Do0), 28); \
+        Ba = ROL32((Aku0^Du1), 14); \
+        Asa1 =   Ba ^((~Be)&  Bi ); \
+        Age1 =   Be ^((~Bi)&  Bo ); \
+        Ami1 =   Bi ^((~Bo)&  Bu ); \
+        Abo1 =   Bo ^((~Bu)&  Ba ); \
+        Aku0 =   Bu ^((~Ba)&  Be ); \
+        Be = ROL32((Asa0^Da1), 18); \
+        Bi = ROL32((Age0^De1),  5); \
+        Bo = ROL32((Ami0^Di0),  7); \
+        Bu = ROL32((Abo0^Do1), 28); \
+        Ba = ROL32((Aku1^Du0), 13); \
+        Asa0 =   Ba ^((~Be)&  Bi ); \
+        Age0 =   Be ^((~Bi)&  Bo ); \
+        Ami0 =   Bi ^((~Bo)&  Bu ); \
+        Abo0 =   Bo ^((~Bu)&  Ba ); \
+        Aku1 =   Bu ^((~Ba)&  Be ); \
+        Bo = ROL32((Aka0^Da1), 21); \
+        Bu = ROL32((Ase0^De0),  1); \
+        Ba = ROL32((Agi1^Di0), 31); \
+        Be = ROL32((Amo0^Do1), 28); \
+        Bi = ROL32((Abu0^Du1), 20); \
+        Aka0 =   Ba ^((~Be)&  Bi ); \
+        Ase0 =   Be ^((~Bi)&  Bo ); \
+        Agi1 =   Bi ^((~Bo)&  Bu ); \
+        Amo0 =   Bo ^((~Bu)&  Ba ); \
+        Abu0 =   Bu ^((~Ba)&  Be ); \
+        Bo = ROL32((Aka1^Da0), 20); \
+        Bu = ROL32((Ase1^De1),  1); \
+        Ba = ROL32((Agi0^Di1), 31); \
+        Be = ROL32((Amo1^Do0), 27); \
+        Bi = ROL32((Abu1^Du0), 19); \
+        Aka1 =   Ba ^((~Be)&  Bi ); \
+        Ase1 =   Be ^((~Bi)&  Bo ); \
+        Agi0 =   Bi ^((~Bo)&  Bu ); \
+        Amo1 =   Bo ^((~Bu)&  Ba ); \
+        Abu1 =   Bu ^((~Ba)&  Be ); /* the trailing ';' is part of this macro, unlike KeccakRound0 */
+
+#define KeccakRound3() /* Round variant 3 of 4: same operations and rotation amounts as KeccakRound0, applied to a different arrangement of the state words. Expects the same scratch variables and pRoundConstants in scope. */ \
         Cx = Amu1^Agu0^Asu1^Aku0^Abu0; \
         Du1 = Ake0^Abe1^Ame1^Age0^Ase1; \
         Da0 = Cx^ROL32(Du1, 1); \
         Cz = Amu0^Agu1^Asu0^Aku1^Abu1; \
         Du0 = Ake1^Abe0^Ame0^Age1^Ase0; \
         Da1 = Cz^Du0; \
-\
         Cw = Asi0^Aki0^Abi1^Ami1^Agi1; \
         Do0 = Cw^ROL32(Cz, 1); \
         Cy = Asi1^Aki1^Abi0^Ami0^Agi0; \
         Do1 = Cy^Cx; \
-\
         Cx = Aba0^Ama0^Aga1^Asa1^Aka0; \
         De0 = Cx^ROL32(Cy, 1); \
         Cz = Aba1^Ama1^Aga0^Asa0^Aka1; \
         De1 = Cz^Cw; \
-\
         Cy = Ago1^Aso0^Ako0^Abo0^Amo1; \
         Di0 = Du0^ROL32(Cy, 1); \
         Cw = Ago0^Aso1^Ako1^Abo1^Amo0; \
         Di1 = Du1^Cw; \
-\
         Du0 = Cw^ROL32(Cz, 1); \
         Du1 = Cy^Cx; \
+        /* Ten groups follow: each loads five state words XORed with a D word (most of them rotated) into Ba..Bu, then writes B ^ (~B_next & B_next2) back into the same five state words. */ \
+        Ba = (Aba0^Da0); \
+        Be = ROL32((Abe0^De0), 22); \
+        Bi = ROL32((Abi0^Di1), 22); \
+        Bo = ROL32((Abo0^Do1), 11); \
+        Bu = ROL32((Abu0^Du0),  7); \
+        Aba0 =   Ba ^((~Be)&  Bi ); \
+        Aba0 ^= *(pRoundConstants++); /* XOR in one round-constant word, then advance the pointer */ \
+        Abe0 =   Be ^((~Bi)&  Bo ); \
+        Abi0 =   Bi ^((~Bo)&  Bu ); \
+        Abo0 =   Bo ^((~Bu)&  Ba ); \
+        Abu0 =   Bu ^((~Ba)&  Be ); \
+        Ba = (Aba1^Da1); \
+        Be = ROL32((Abe1^De1), 22); \
+        Bi = ROL32((Abi1^Di0), 21); \
+        Bo = ROL32((Abo1^Do0), 10); \
+        Bu = ROL32((Abu1^Du1),  7); \
+        Aba1 =   Ba ^((~Be)&  Bi ); \
+        Aba1 ^= *(pRoundConstants++); /* second round-constant word of this round */ \
+        Abe1 =   Be ^((~Bi)&  Bo ); \
+        Abi1 =   Bi ^((~Bo)&  Bu ); \
+        Abo1 =   Bo ^((~Bu)&  Ba ); \
+        Abu1 =   Bu ^((~Ba)&  Be ); \
+        Bi = ROL32((Aga0^Da1),  2); \
+        Bo = ROL32((Age0^De1), 23); \
+        Bu = ROL32((Agi0^Di1), 31); \
+        Ba = ROL32((Ago0^Do0), 14); \
+        Be = ROL32((Agu0^Du0), 10); \
+        Aga0 =   Ba ^((~Be)&  Bi ); \
+        Age0 =   Be ^((~Bi)&  Bo ); \
+        Agi0 =   Bi ^((~Bo)&  Bu ); \
+        Ago0 =   Bo ^((~Bu)&  Ba ); \
+        Agu0 =   Bu ^((~Ba)&  Be ); \
+        Bi = ROL32((Aga1^Da0),  1); \
+        Bo = ROL32((Age1^De0), 22); \
+        Bu = ROL32((Agi1^Di0), 30); \
+        Ba = ROL32((Ago1^Do1), 14); \
+        Be = ROL32((Agu1^Du1), 10); \
+        Aga1 =   Ba ^((~Be)&  Bi ); \
+        Age1 =   Be ^((~Bi)&  Bo ); \
+        Agi1 =   Bi ^((~Bo)&  Bu ); \
+        Ago1 =   Bo ^((~Bu)&  Ba ); \
+        Agu1 =   Bu ^((~Ba)&  Be ); \
+        Bu = ROL32((Aka0^Da0),  9); \
+        Ba = ROL32((Ake0^De1),  1); \
+        Be = ROL32((Aki0^Di0),  3); \
+        Bi = ROL32((Ako0^Do1), 13); \
+        Bo = ROL32((Aku0^Du0),  4); \
+        Aka0 =   Ba ^((~Be)&  Bi ); \
+        Ake0 =   Be ^((~Bi)&  Bo ); \
+        Aki0 =   Bi ^((~Bo)&  Bu ); \
+        Ako0 =   Bo ^((~Bu)&  Ba ); \
+        Aku0 =   Bu ^((~Ba)&  Be ); \
+        Bu = ROL32((Aka1^Da1),  9); \
+        Ba = (Ake1^De0); \
+        Be = ROL32((Aki1^Di1),  3); \
+        Bi = ROL32((Ako1^Do0), 12); \
+        Bo = ROL32((Aku1^Du1),  4); \
+        Aka1 =   Ba ^((~Be)&  Bi ); \
+        Ake1 =   Be ^((~Bi)&  Bo ); \
+        Aki1 =   Bi ^((~Bo)&  Bu ); \
+        Ako1 =   Bo ^((~Bu)&  Ba ); \
+        Aku1 =   Bu ^((~Ba)&  Be ); \
+        Be = ROL32((Ama0^Da0), 18); \
+        Bi = ROL32((Ame0^De0),  5); \
+        Bo = ROL32((Ami0^Di1),  8); \
+        Bu = ROL32((Amo0^Do0), 28); \
+        Ba = ROL32((Amu0^Du1), 14); \
+        Ama0 =   Ba ^((~Be)&  Bi ); \
+        Ame0 =   Be ^((~Bi)&  Bo ); \
+        Ami0 =   Bi ^((~Bo)&  Bu ); \
+        Amo0 =   Bo ^((~Bu)&  Ba ); \
+        Amu0 =   Bu ^((~Ba)&  Be ); \
+        Be = ROL32((Ama1^Da1), 18); \
+        Bi = ROL32((Ame1^De1),  5); \
+        Bo = ROL32((Ami1^Di0),  7); \
+        Bu = ROL32((Amo1^Do1), 28); \
+        Ba = ROL32((Amu1^Du0), 13); \
+        Ama1 =   Ba ^((~Be)&  Bi ); \
+        Ame1 =   Be ^((~Bi)&  Bo ); \
+        Ami1 =   Bi ^((~Bo)&  Bu ); \
+        Amo1 =   Bo ^((~Bu)&  Ba ); \
+        Amu1 =   Bu ^((~Ba)&  Be ); \
+        Bo = ROL32((Asa0^Da1), 21); \
+        Bu = ROL32((Ase0^De0),  1); \
+        Ba = ROL32((Asi0^Di0), 31); \
+        Be = ROL32((Aso0^Do1), 28); \
+        Bi = ROL32((Asu0^Du1), 20); \
+        Asa0 =   Ba ^((~Be)&  Bi ); \
+        Ase0 =   Be ^((~Bi)&  Bo ); \
+        Asi0 =   Bi ^((~Bo)&  Bu ); \
+        Aso0 =   Bo ^((~Bu)&  Ba ); \
+        Asu0 =   Bu ^((~Ba)&  Be ); \
+        Bo = ROL32((Asa1^Da0), 20); \
+        Bu = ROL32((Ase1^De1),  1); \
+        Ba = ROL32((Asi1^Di1), 31); \
+        Be = ROL32((Aso1^Do0), 27); \
+        Bi = ROL32((Asu1^Du0), 19); \
+        Asa1 =   Ba ^((~Be)&  Bi ); \
+        Ase1 =   Be ^((~Bi)&  Bo ); \
+        Asi1 =   Bi ^((~Bo)&  Bu ); \
+        Aso1 =   Bo ^((~Bu)&  Ba ); \
+        Asu1 =   Bu ^((~Ba)&  Be ); /* the trailing ';' is part of this macro, unlike KeccakRound0 */
 
 void KeccakP1600_Permute_Nrounds(void *state, unsigned int nRounds)
 {
+    uint32_t Da0, De0, Di0, Do0, Du0;
+    uint32_t Da1, De1, Di1, Do1, Du1;
+    uint32_t Ba, Be, Bi, Bo, Bu;
+    uint32_t Cx, Cy, Cz, Cw;
+    const uint32_t *pRoundConstants = KeccakF1600RoundConstants_int2+(24-nRounds)*2;
+    uint32_t *stateAsHalfLanes = (uint32_t*)state;
+    #define Aba0 stateAsHalfLanes[ 0]
+    #define Aba1 stateAsHalfLanes[ 1]
+    #define Abe0 stateAsHalfLanes[ 2]
+    #define Abe1 stateAsHalfLanes[ 3]
+    #define Abi0 stateAsHalfLanes[ 4]
+    #define Abi1 stateAsHalfLanes[ 5]
+    #define Abo0 stateAsHalfLanes[ 6]
+    #define Abo1 stateAsHalfLanes[ 7]
+    #define Abu0 stateAsHalfLanes[ 8]
+    #define Abu1 stateAsHalfLanes[ 9]
+    #define Aga0 stateAsHalfLanes[10]
+    #define Aga1 stateAsHalfLanes[11]
+    #define Age0 stateAsHalfLanes[12]
+    #define Age1 stateAsHalfLanes[13]
+    #define Agi0 stateAsHalfLanes[14]
+    #define Agi1 stateAsHalfLanes[15]
+    #define Ago0 stateAsHalfLanes[16]
+    #define Ago1 stateAsHalfLanes[17]
+    #define Agu0 stateAsHalfLanes[18]
+    #define Agu1 stateAsHalfLanes[19]
+    #define Aka0 stateAsHalfLanes[20]
+    #define Aka1 stateAsHalfLanes[21]
+    #define Ake0 stateAsHalfLanes[22]
+    #define Ake1 stateAsHalfLanes[23]
+    #define Aki0 stateAsHalfLanes[24]
+    #define Aki1 stateAsHalfLanes[25]
+    #define Ako0 stateAsHalfLanes[26]
+    #define Ako1 stateAsHalfLanes[27]
+    #define Aku0 stateAsHalfLanes[28]
+    #define Aku1 stateAsHalfLanes[29]
+    #define Ama0 stateAsHalfLanes[30]
+    #define Ama1 stateAsHalfLanes[31]
+    #define Ame0 stateAsHalfLanes[32]
+    #define Ame1 stateAsHalfLanes[33]
+    #define Ami0 stateAsHalfLanes[34]
+    #define Ami1 stateAsHalfLanes[35]
+    #define Amo0 stateAsHalfLanes[36]
+    #define Amo1 stateAsHalfLanes[37]
+    #define Amu0 stateAsHalfLanes[38]
+    #define Amu1 stateAsHalfLanes[39]
+    #define Asa0 stateAsHalfLanes[40]
+    #define Asa1 stateAsHalfLanes[41]
+    #define Ase0 stateAsHalfLanes[42]
+    #define Ase1 stateAsHalfLanes[43]
+    #define Asi0 stateAsHalfLanes[44]
+    #define Asi1 stateAsHalfLanes[45]
+    #define Aso0 stateAsHalfLanes[46]
+    #define Aso1 stateAsHalfLanes[47]
+    #define Asu0 stateAsHalfLanes[48]
+    #define Asu1 stateAsHalfLanes[49]
+
+    nRounds &= 3; /* keep only nRounds mod 4; pRoundConstants was already derived from the full count */
+    switch ( nRounds ) /* re-arranges state lanes in place; value 0 has no case and leaves the state untouched */
+    {
+        #define I0 Ba /* I0/I1/T0/T1 reuse the Ba/Be/Bi/Bo scratch words */
+        #define I1 Be
+        #define T0 Bi
+        #define T1 Bo
+        #define SwapPI13( in0,in1,in2,in3,eo0,eo1,eo2,eo3 ) /* rotate four lanes: in0<-in1<-in2<-in3<-old in0; eoK==1 also swaps the two half-words written to inK */ \
+            I0 = (in0)[0]; I1 = (in0)[1];       \
+            T0 = (in1)[0]; T1 = (in1)[1];       \
+            (in0)[eo0] = T0; (in0)[eo0^1] = T1; \
+            T0 = (in2)[0]; T1 = (in2)[1];       \
+            (in1)[eo1] = T0; (in1)[eo1^1] = T1; \
+            T0 = (in3)[0]; T1 = (in3)[1];       \
+            (in2)[eo2] = T0; (in2)[eo2^1] = T1; \
+            (in3)[eo3] = I0; (in3)[eo3^1] = I1
+        #define SwapPI2( in0,in1,in2,in3 ) /* exchange in0<->in1 and in2<->in3, swapping the two half-words of every lane */ \
+            I0 = (in0)[0]; I1 = (in0)[1]; \
+            T0 = (in1)[0]; T1 = (in1)[1]; \
+            (in0)[1] = T0; (in0)[0] = T1; \
+            (in1)[1] = I0; (in1)[0] = I1; \
+            I0 = (in2)[0]; I1 = (in2)[1]; \
+            T0 = (in3)[0]; T1 = (in3)[1]; \
+            (in2)[1] = T0; (in2)[0] = T1; \
+            (in3)[1] = I0; (in3)[0] = I1
+        #define SwapEO( even,odd ) T0 = even; even = odd; odd = T0 /* exchange two words via T0 */
+
+        case 1:
+            SwapPI13( &Aga0, &Aka0, &Asa0, &Ama0, 1, 0, 1, 0 );
+            SwapPI13( &Abe0, &Age0, &Ame0, &Ake0, 0, 1, 0, 1 );
+            SwapPI13( &Abi0, &Aki0, &Agi0, &Asi0, 1, 0, 1, 0 );
+            SwapEO( Ami0, Ami1 );
+            SwapPI13( &Abo0, &Amo0, &Aso0, &Ago0, 1, 0, 1, 0 );
+            SwapEO( Ako0, Ako1 );
+            SwapPI13( &Abu0, &Asu0, &Aku0, &Amu0, 0, 1, 0, 1 );
+            break;
+
+        case 2:
+            SwapPI2( &Aga0, &Asa0, &Aka0, &Ama0 );
+            SwapPI2( &Abe0, &Ame0, &Age0, &Ake0 );
+            SwapPI2( &Abi0, &Agi0, &Aki0, &Asi0 );
+            SwapPI2( &Abo0, &Aso0, &Ago0, &Amo0 );
+            SwapPI2( &Abu0, &Aku0, &Amu0, &Asu0 );
+            break;
+
+        case 3:
+            SwapPI13( &Aga0, &Ama0, &Asa0, &Aka0, 0, 1, 0, 1 );
+            SwapPI13( &Abe0, &Ake0, &Ame0, &Age0, 1, 0, 1, 0 );
+            SwapPI13( &Abi0, &Asi0, &Agi0, &Aki0, 0, 1, 0, 1 );
+            SwapEO( Ami0, Ami1 );
+            SwapPI13( &Abo0, &Ago0, &Aso0, &Amo0, 0, 1, 0, 1 );
+            SwapEO( Ako0, Ako1 );
+            SwapPI13( &Abu0, &Amu0, &Aku0, &Asu0, 1, 0, 1, 0 );
+            break;
+        #undef I0
+        #undef I1
+        #undef T0
+        #undef T1
+        #undef SwapPI13
+        #undef SwapPI2
+        #undef SwapEO
+    }
+
+    do
     {
-        UINT32 Da0, De0, Di0, Do0, Du0;
-        UINT32 Da1, De1, Di1, Do1, Du1;
-        UINT32 Ca0, Ce0, Ci0, Co0, Cu0;
-        UINT32 Cx, Cy, Cz, Cw;
-        #define Ba Ca0
-        #define Be Ce0
-        #define Bi Ci0
-        #define Bo Co0
-        #define Bu Cu0
-        const UINT32 *pRoundConstants = KeccakF1600RoundConstants_int2+(24-nRounds)*2;
-        UINT32 *stateAsHalfLanes = (UINT32*)state;
-        #define Aba0 stateAsHalfLanes[ 0]
-        #define Aba1 stateAsHalfLanes[ 1]
-        #define Abe0 stateAsHalfLanes[ 2]
-        #define Abe1 stateAsHalfLanes[ 3]
-        #define Abi0 stateAsHalfLanes[ 4]
-        #define Abi1 stateAsHalfLanes[ 5]
-        #define Abo0 stateAsHalfLanes[ 6]
-        #define Abo1 stateAsHalfLanes[ 7]
-        #define Abu0 stateAsHalfLanes[ 8]
-        #define Abu1 stateAsHalfLanes[ 9]
-        #define Aga0 stateAsHalfLanes[10]
-        #define Aga1 stateAsHalfLanes[11]
-        #define Age0 stateAsHalfLanes[12]
-        #define Age1 stateAsHalfLanes[13]
-        #define Agi0 stateAsHalfLanes[14]
-        #define Agi1 stateAsHalfLanes[15]
-        #define Ago0 stateAsHalfLanes[16]
-        #define Ago1 stateAsHalfLanes[17]
-        #define Agu0 stateAsHalfLanes[18]
-        #define Agu1 stateAsHalfLanes[19]
-        #define Aka0 stateAsHalfLanes[20]
-        #define Aka1 stateAsHalfLanes[21]
-        #define Ake0 stateAsHalfLanes[22]
-        #define Ake1 stateAsHalfLanes[23]
-        #define Aki0 stateAsHalfLanes[24]
-        #define Aki1 stateAsHalfLanes[25]
-        #define Ako0 stateAsHalfLanes[26]
-        #define Ako1 stateAsHalfLanes[27]
-        #define Aku0 stateAsHalfLanes[28]
-        #define Aku1 stateAsHalfLanes[29]
-        #define Ama0 stateAsHalfLanes[30]
-        #define Ama1 stateAsHalfLanes[31]
-        #define Ame0 stateAsHalfLanes[32]
-        #define Ame1 stateAsHalfLanes[33]
-        #define Ami0 stateAsHalfLanes[34]
-        #define Ami1 stateAsHalfLanes[35]
-        #define Amo0 stateAsHalfLanes[36]
-        #define Amo1 stateAsHalfLanes[37]
-        #define Amu0 stateAsHalfLanes[38]
-        #define Amu1 stateAsHalfLanes[39]
-        #define Asa0 stateAsHalfLanes[40]
-        #define Asa1 stateAsHalfLanes[41]
-        #define Ase0 stateAsHalfLanes[42]
-        #define Ase1 stateAsHalfLanes[43]
-        #define Asi0 stateAsHalfLanes[44]
-        #define Asi1 stateAsHalfLanes[45]
-        #define Aso0 stateAsHalfLanes[46]
-        #define Aso1 stateAsHalfLanes[47]
-        #define Asu0 stateAsHalfLanes[48]
-        #define Asu1 stateAsHalfLanes[49]
-
-        do
+        /* Code for 4 rounds, using factor 2 interleaving, 64-bit lanes mapped to 32-bit words */
+        switch ( nRounds )
         {
-            /* --- Code for 4 rounds */
-
-            /* --- using factor 2 interleaving, 64-bit lanes mapped to 32-bit words */
-
-            KeccakAtoD_round0();
-
-            Ba = (Aba0^Da0);
-            Be = ROL32((Age0^De0), 22);
-            Bi = ROL32((Aki1^Di1), 22);
-            Bo = ROL32((Amo1^Do1), 11);
-            Bu = ROL32((Asu0^Du0), 7);
-            Aba0 =   Ba ^((~Be)&  Bi );
-            Aba0 ^= *(pRoundConstants++);
-            Age0 =   Be ^((~Bi)&  Bo );
-            Aki1 =   Bi ^((~Bo)&  Bu );
-            Amo1 =   Bo ^((~Bu)&  Ba );
-            Asu0 =   Bu ^((~Ba)&  Be );
-
-            Ba = (Aba1^Da1);
-            Be = ROL32((Age1^De1), 22);
-            Bi = ROL32((Aki0^Di0), 21);
-            Bo = ROL32((Amo0^Do0), 10);
-            Bu = ROL32((Asu1^Du1), 7);
-            Aba1 =   Ba ^((~Be)&  Bi );
-            Aba1 ^= *(pRoundConstants++);
-            Age1 =   Be ^((~Bi)&  Bo );
-            Aki0 =   Bi ^((~Bo)&  Bu );
-            Amo0 =   Bo ^((~Bu)&  Ba );
-            Asu1 =   Bu ^((~Ba)&  Be );
-
-            Bi = ROL32((Aka1^Da1), 2);
-            Bo = ROL32((Ame1^De1), 23);
-            Bu = ROL32((Asi1^Di1), 31);
-            Ba = ROL32((Abo0^Do0), 14);
-            Be = ROL32((Agu0^Du0), 10);
-            Aka1 =   Ba ^((~Be)&  Bi );
-            Ame1 =   Be ^((~Bi)&  Bo );
-            Asi1 =   Bi ^((~Bo)&  Bu );
-            Abo0 =   Bo ^((~Bu)&  Ba );
-            Agu0 =   Bu ^((~Ba)&  Be );
-
-            Bi = ROL32((Aka0^Da0), 1);
-            Bo = ROL32((Ame0^De0), 22);
-            Bu = ROL32((Asi0^Di0), 30);
-            Ba = ROL32((Abo1^Do1), 14);
-            Be = ROL32((Agu1^Du1), 10);
-            Aka0 =   Ba ^((~Be)&  Bi );
-            Ame0 =   Be ^((~Bi)&  Bo );
-            Asi0 =   Bi ^((~Bo)&  Bu );
-            Abo1 =   Bo ^((~Bu)&  Ba );
-            Agu1 =   Bu ^((~Ba)&  Be );
-
-            Bu = ROL32((Asa0^Da0), 9);
-            Ba = ROL32((Abe1^De1), 1);
-            Be = ROL32((Agi0^Di0), 3);
-            Bi = ROL32((Ako1^Do1), 13);
-            Bo = ROL32((Amu0^Du0), 4);
-            Asa0 =   Ba ^((~Be)&  Bi );
-            Abe1 =   Be ^((~Bi)&  Bo );
-            Agi0 =   Bi ^((~Bo)&  Bu );
-            Ako1 =   Bo ^((~Bu)&  Ba );
-            Amu0 =   Bu ^((~Ba)&  Be );
-
-            Bu = ROL32((Asa1^Da1), 9);
-            Ba = (Abe0^De0);
-            Be = ROL32((Agi1^Di1), 3);
-            Bi = ROL32((Ako0^Do0), 12);
-            Bo = ROL32((Amu1^Du1), 4);
-            Asa1 =   Ba ^((~Be)&  Bi );
-            Abe0 =   Be ^((~Bi)&  Bo );
-            Agi1 =   Bi ^((~Bo)&  Bu );
-            Ako0 =   Bo ^((~Bu)&  Ba );
-            Amu1 =   Bu ^((~Ba)&  Be );
-
-            Be = ROL32((Aga0^Da0), 18);
-            Bi = ROL32((Ake0^De0), 5);
-            Bo = ROL32((Ami1^Di1), 8);
-            Bu = ROL32((Aso0^Do0), 28);
-            Ba = ROL32((Abu1^Du1), 14);
-            Aga0 =   Ba ^((~Be)&  Bi );
-            Ake0 =   Be ^((~Bi)&  Bo );
-            Ami1 =   Bi ^((~Bo)&  Bu );
-            Aso0 =   Bo ^((~Bu)&  Ba );
-            Abu1 =   Bu ^((~Ba)&  Be );
-
-            Be = ROL32((Aga1^Da1), 18);
-            Bi = ROL32((Ake1^De1), 5);
-            Bo = ROL32((Ami0^Di0), 7);
-            Bu = ROL32((Aso1^Do1), 28);
-            Ba = ROL32((Abu0^Du0), 13);
-            Aga1 =   Ba ^((~Be)&  Bi );
-            Ake1 =   Be ^((~Bi)&  Bo );
-            Ami0 =   Bi ^((~Bo)&  Bu );
-            Aso1 =   Bo ^((~Bu)&  Ba );
-            Abu0 =   Bu ^((~Ba)&  Be );
-
-            Bo = ROL32((Ama1^Da1), 21);
-            Bu = ROL32((Ase0^De0), 1);
-            Ba = ROL32((Abi0^Di0), 31);
-            Be = ROL32((Ago1^Do1), 28);
-            Bi = ROL32((Aku1^Du1), 20);
-            Ama1 =   Ba ^((~Be)&  Bi );
-            Ase0 =   Be ^((~Bi)&  Bo );
-            Abi0 =   Bi ^((~Bo)&  Bu );
-            Ago1 =   Bo ^((~Bu)&  Ba );
-            Aku1 =   Bu ^((~Ba)&  Be );
-
-            Bo = ROL32((Ama0^Da0), 20);
-            Bu = ROL32((Ase1^De1), 1);
-            Ba = ROL32((Abi1^Di1), 31);
-            Be = ROL32((Ago0^Do0), 27);
-            Bi = ROL32((Aku0^Du0), 19);
-            Ama0 =   Ba ^((~Be)&  Bi );
-            Ase1 =   Be ^((~Bi)&  Bo );
-            Abi1 =   Bi ^((~Bo)&  Bu );
-            Ago0 =   Bo ^((~Bu)&  Ba );
-            Aku0 =   Bu ^((~Ba)&  Be );
-
-            KeccakAtoD_round1();
-
-            Ba = (Aba0^Da0);
-            Be = ROL32((Ame1^De0), 22);
-            Bi = ROL32((Agi1^Di1), 22);
-            Bo = ROL32((Aso1^Do1), 11);
-            Bu = ROL32((Aku1^Du0), 7);
-            Aba0 =   Ba ^((~Be)&  Bi );
-            Aba0 ^= *(pRoundConstants++);
-            Ame1 =   Be ^((~Bi)&  Bo );
-            Agi1 =   Bi ^((~Bo)&  Bu );
-            Aso1 =   Bo ^((~Bu)&  Ba );
-            Aku1 =   Bu ^((~Ba)&  Be );
-
-            Ba = (Aba1^Da1);
-            Be = ROL32((Ame0^De1), 22);
-            Bi = ROL32((Agi0^Di0), 21);
-            Bo = ROL32((Aso0^Do0), 10);
-            Bu = ROL32((Aku0^Du1), 7);
-            Aba1 =   Ba ^((~Be)&  Bi );
-            Aba1 ^= *(pRoundConstants++);
-            Ame0 =   Be ^((~Bi)&  Bo );
-            Agi0 =   Bi ^((~Bo)&  Bu );
-            Aso0 =   Bo ^((~Bu)&  Ba );
-            Aku0 =   Bu ^((~Ba)&  Be );
-
-            Bi = ROL32((Asa1^Da1), 2);
-            Bo = ROL32((Ake1^De1), 23);
-            Bu = ROL32((Abi1^Di1), 31);
-            Ba = ROL32((Amo1^Do0), 14);
-            Be = ROL32((Agu0^Du0), 10);
-            Asa1 =   Ba ^((~Be)&  Bi );
-            Ake1 =   Be ^((~Bi)&  Bo );
-            Abi1 =   Bi ^((~Bo)&  Bu );
-            Amo1 =   Bo ^((~Bu)&  Ba );
-            Agu0 =   Bu ^((~Ba)&  Be );
-
-            Bi = ROL32((Asa0^Da0), 1);
-            Bo = ROL32((Ake0^De0), 22);
-            Bu = ROL32((Abi0^Di0), 30);
-            Ba = ROL32((Amo0^Do1), 14);
-            Be = ROL32((Agu1^Du1), 10);
-            Asa0 =   Ba ^((~Be)&  Bi );
-            Ake0 =   Be ^((~Bi)&  Bo );
-            Abi0 =   Bi ^((~Bo)&  Bu );
-            Amo0 =   Bo ^((~Bu)&  Ba );
-            Agu1 =   Bu ^((~Ba)&  Be );
-
-            Bu = ROL32((Ama1^Da0), 9);
-            Ba = ROL32((Age1^De1), 1);
-            Be = ROL32((Asi1^Di0), 3);
-            Bi = ROL32((Ako0^Do1), 13);
-            Bo = ROL32((Abu1^Du0), 4);
-            Ama1 =   Ba ^((~Be)&  Bi );
-            Age1 =   Be ^((~Bi)&  Bo );
-            Asi1 =   Bi ^((~Bo)&  Bu );
-            Ako0 =   Bo ^((~Bu)&  Ba );
-            Abu1 =   Bu ^((~Ba)&  Be );
-
-            Bu = ROL32((Ama0^Da1), 9);
-            Ba = (Age0^De0);
-            Be = ROL32((Asi0^Di1), 3);
-            Bi = ROL32((Ako1^Do0), 12);
-            Bo = ROL32((Abu0^Du1), 4);
-            Ama0 =   Ba ^((~Be)&  Bi );
-            Age0 =   Be ^((~Bi)&  Bo );
-            Asi0 =   Bi ^((~Bo)&  Bu );
-            Ako1 =   Bo ^((~Bu)&  Ba );
-            Abu0 =   Bu ^((~Ba)&  Be );
-
-            Be = ROL32((Aka1^Da0), 18);
-            Bi = ROL32((Abe1^De0), 5);
-            Bo = ROL32((Ami0^Di1), 8);
-            Bu = ROL32((Ago1^Do0), 28);
-            Ba = ROL32((Asu1^Du1), 14);
-            Aka1 =   Ba ^((~Be)&  Bi );
-            Abe1 =   Be ^((~Bi)&  Bo );
-            Ami0 =   Bi ^((~Bo)&  Bu );
-            Ago1 =   Bo ^((~Bu)&  Ba );
-            Asu1 =   Bu ^((~Ba)&  Be );
-
-            Be = ROL32((Aka0^Da1), 18);
-            Bi = ROL32((Abe0^De1), 5);
-            Bo = ROL32((Ami1^Di0), 7);
-            Bu = ROL32((Ago0^Do1), 28);
-            Ba = ROL32((Asu0^Du0), 13);
-            Aka0 =   Ba ^((~Be)&  Bi );
-            Abe0 =   Be ^((~Bi)&  Bo );
-            Ami1 =   Bi ^((~Bo)&  Bu );
-            Ago0 =   Bo ^((~Bu)&  Ba );
-            Asu0 =   Bu ^((~Ba)&  Be );
-
-            Bo = ROL32((Aga1^Da1), 21);
-            Bu = ROL32((Ase0^De0), 1);
-            Ba = ROL32((Aki1^Di0), 31);
-            Be = ROL32((Abo1^Do1), 28);
-            Bi = ROL32((Amu1^Du1), 20);
-            Aga1 =   Ba ^((~Be)&  Bi );
-            Ase0 =   Be ^((~Bi)&  Bo );
-            Aki1 =   Bi ^((~Bo)&  Bu );
-            Abo1 =   Bo ^((~Bu)&  Ba );
-            Amu1 =   Bu ^((~Ba)&  Be );
-
-            Bo = ROL32((Aga0^Da0), 20);
-            Bu = ROL32((Ase1^De1), 1);
-            Ba = ROL32((Aki0^Di1), 31);
-            Be = ROL32((Abo0^Do0), 27);
-            Bi = ROL32((Amu0^Du0), 19);
-            Aga0 =   Ba ^((~Be)&  Bi );
-            Ase1 =   Be ^((~Bi)&  Bo );
-            Aki0 =   Bi ^((~Bo)&  Bu );
-            Abo0 =   Bo ^((~Bu)&  Ba );
-            Amu0 =   Bu ^((~Ba)&  Be );
-
-            KeccakAtoD_round2();
-
-            Ba = (Aba0^Da0);
-            Be = ROL32((Ake1^De0), 22);
-            Bi = ROL32((Asi0^Di1), 22);
-            Bo = ROL32((Ago0^Do1), 11);
-            Bu = ROL32((Amu1^Du0), 7);
-            Aba0 =   Ba ^((~Be)&  Bi );
-            Aba0 ^= *(pRoundConstants++);
-            Ake1 =   Be ^((~Bi)&  Bo );
-            Asi0 =   Bi ^((~Bo)&  Bu );
-            Ago0 =   Bo ^((~Bu)&  Ba );
-            Amu1 =   Bu ^((~Ba)&  Be );
-
-            Ba = (Aba1^Da1);
-            Be = ROL32((Ake0^De1), 22);
-            Bi = ROL32((Asi1^Di0), 21);
-            Bo = ROL32((Ago1^Do0), 10);
-            Bu = ROL32((Amu0^Du1), 7);
-            Aba1 =   Ba ^((~Be)&  Bi );
-            Aba1 ^= *(pRoundConstants++);
-            Ake0 =   Be ^((~Bi)&  Bo );
-            Asi1 =   Bi ^((~Bo)&  Bu );
-            Ago1 =   Bo ^((~Bu)&  Ba );
-            Amu0 =   Bu ^((~Ba)&  Be );
-
-            Bi = ROL32((Ama0^Da1), 2);
-            Bo = ROL32((Abe0^De1), 23);
-            Bu = ROL32((Aki0^Di1), 31);
-            Ba = ROL32((Aso1^Do0), 14);
-            Be = ROL32((Agu0^Du0), 10);
-            Ama0 =   Ba ^((~Be)&  Bi );
-            Abe0 =   Be ^((~Bi)&  Bo );
-            Aki0 =   Bi ^((~Bo)&  Bu );
-            Aso1 =   Bo ^((~Bu)&  Ba );
-            Agu0 =   Bu ^((~Ba)&  Be );
-
-            Bi = ROL32((Ama1^Da0), 1);
-            Bo = ROL32((Abe1^De0), 22);
-            Bu = ROL32((Aki1^Di0), 30);
-            Ba = ROL32((Aso0^Do1), 14);
-            Be = ROL32((Agu1^Du1), 10);
-            Ama1 =   Ba ^((~Be)&  Bi );
-            Abe1 =   Be ^((~Bi)&  Bo );
-            Aki1 =   Bi ^((~Bo)&  Bu );
-            Aso0 =   Bo ^((~Bu)&  Ba );
-            Agu1 =   Bu ^((~Ba)&  Be );
-
-            Bu = ROL32((Aga1^Da0), 9);
-            Ba = ROL32((Ame0^De1), 1);
-            Be = ROL32((Abi1^Di0), 3);
-            Bi = ROL32((Ako1^Do1), 13);
-            Bo = ROL32((Asu1^Du0), 4);
-            Aga1 =   Ba ^((~Be)&  Bi );
-            Ame0 =   Be ^((~Bi)&  Bo );
-            Abi1 =   Bi ^((~Bo)&  Bu );
-            Ako1 =   Bo ^((~Bu)&  Ba );
-            Asu1 =   Bu ^((~Ba)&  Be );
-
-            Bu = ROL32((Aga0^Da1), 9);
-            Ba = (Ame1^De0);
-            Be = ROL32((Abi0^Di1), 3);
-            Bi = ROL32((Ako0^Do0), 12);
-            Bo = ROL32((Asu0^Du1), 4);
-            Aga0 =   Ba ^((~Be)&  Bi );
-            Ame1 =   Be ^((~Bi)&  Bo );
-            Abi0 =   Bi ^((~Bo)&  Bu );
-            Ako0 =   Bo ^((~Bu)&  Ba );
-            Asu0 =   Bu ^((~Ba)&  Be );
-
-            Be = ROL32((Asa1^Da0), 18);
-            Bi = ROL32((Age1^De0), 5);
-            Bo = ROL32((Ami1^Di1), 8);
-            Bu = ROL32((Abo1^Do0), 28);
-            Ba = ROL32((Aku0^Du1), 14);
-            Asa1 =   Ba ^((~Be)&  Bi );
-            Age1 =   Be ^((~Bi)&  Bo );
-            Ami1 =   Bi ^((~Bo)&  Bu );
-            Abo1 =   Bo ^((~Bu)&  Ba );
-            Aku0 =   Bu ^((~Ba)&  Be );
-
-            Be = ROL32((Asa0^Da1), 18);
-            Bi = ROL32((Age0^De1), 5);
-            Bo = ROL32((Ami0^Di0), 7);
-            Bu = ROL32((Abo0^Do1), 28);
-            Ba = ROL32((Aku1^Du0), 13);
-            Asa0 =   Ba ^((~Be)&  Bi );
-            Age0 =   Be ^((~Bi)&  Bo );
-            Ami0 =   Bi ^((~Bo)&  Bu );
-            Abo0 =   Bo ^((~Bu)&  Ba );
-            Aku1 =   Bu ^((~Ba)&  Be );
-
-            Bo = ROL32((Aka0^Da1), 21);
-            Bu = ROL32((Ase0^De0), 1);
-            Ba = ROL32((Agi1^Di0), 31);
-            Be = ROL32((Amo0^Do1), 28);
-            Bi = ROL32((Abu0^Du1), 20);
-            Aka0 =   Ba ^((~Be)&  Bi );
-            Ase0 =   Be ^((~Bi)&  Bo );
-            Agi1 =   Bi ^((~Bo)&  Bu );
-            Amo0 =   Bo ^((~Bu)&  Ba );
-            Abu0 =   Bu ^((~Ba)&  Be );
-
-            Bo = ROL32((Aka1^Da0), 20);
-            Bu = ROL32((Ase1^De1), 1);
-            Ba = ROL32((Agi0^Di1), 31);
-            Be = ROL32((Amo1^Do0), 27);
-            Bi = ROL32((Abu1^Du0), 19);
-            Aka1 =   Ba ^((~Be)&  Bi );
-            Ase1 =   Be ^((~Bi)&  Bo );
-            Agi0 =   Bi ^((~Bo)&  Bu );
-            Amo1 =   Bo ^((~Bu)&  Ba );
-            Abu1 =   Bu ^((~Ba)&  Be );
-
-            KeccakAtoD_round3();
-
-            Ba = (Aba0^Da0);
-            Be = ROL32((Abe0^De0), 22);
-            Bi = ROL32((Abi0^Di1), 22);
-            Bo = ROL32((Abo0^Do1), 11);
-            Bu = ROL32((Abu0^Du0), 7);
-            Aba0 =   Ba ^((~Be)&  Bi );
-            Aba0 ^= *(pRoundConstants++);
-            Abe0 =   Be ^((~Bi)&  Bo );
-            Abi0 =   Bi ^((~Bo)&  Bu );
-            Abo0 =   Bo ^((~Bu)&  Ba );
-            Abu0 =   Bu ^((~Ba)&  Be );
-
-            Ba = (Aba1^Da1);
-            Be = ROL32((Abe1^De1), 22);
-            Bi = ROL32((Abi1^Di0), 21);
-            Bo = ROL32((Abo1^Do0), 10);
-            Bu = ROL32((Abu1^Du1), 7);
-            Aba1 =   Ba ^((~Be)&  Bi );
-            Aba1 ^= *(pRoundConstants++);
-            Abe1 =   Be ^((~Bi)&  Bo );
-            Abi1 =   Bi ^((~Bo)&  Bu );
-            Abo1 =   Bo ^((~Bu)&  Ba );
-            Abu1 =   Bu ^((~Ba)&  Be );
-
-            Bi = ROL32((Aga0^Da1), 2);
-            Bo = ROL32((Age0^De1), 23);
-            Bu = ROL32((Agi0^Di1), 31);
-            Ba = ROL32((Ago0^Do0), 14);
-            Be = ROL32((Agu0^Du0), 10);
-            Aga0 =   Ba ^((~Be)&  Bi );
-            Age0 =   Be ^((~Bi)&  Bo );
-            Agi0 =   Bi ^((~Bo)&  Bu );
-            Ago0 =   Bo ^((~Bu)&  Ba );
-            Agu0 =   Bu ^((~Ba)&  Be );
-
-            Bi = ROL32((Aga1^Da0), 1);
-            Bo = ROL32((Age1^De0), 22);
-            Bu = ROL32((Agi1^Di0), 30);
-            Ba = ROL32((Ago1^Do1), 14);
-            Be = ROL32((Agu1^Du1), 10);
-            Aga1 =   Ba ^((~Be)&  Bi );
-            Age1 =   Be ^((~Bi)&  Bo );
-            Agi1 =   Bi ^((~Bo)&  Bu );
-            Ago1 =   Bo ^((~Bu)&  Ba );
-            Agu1 =   Bu ^((~Ba)&  Be );
-
-            Bu = ROL32((Aka0^Da0), 9);
-            Ba = ROL32((Ake0^De1), 1);
-            Be = ROL32((Aki0^Di0), 3);
-            Bi = ROL32((Ako0^Do1), 13);
-            Bo = ROL32((Aku0^Du0), 4);
-            Aka0 =   Ba ^((~Be)&  Bi );
-            Ake0 =   Be ^((~Bi)&  Bo );
-            Aki0 =   Bi ^((~Bo)&  Bu );
-            Ako0 =   Bo ^((~Bu)&  Ba );
-            Aku0 =   Bu ^((~Ba)&  Be );
-
-            Bu = ROL32((Aka1^Da1), 9);
-            Ba = (Ake1^De0);
-            Be = ROL32((Aki1^Di1), 3);
-            Bi = ROL32((Ako1^Do0), 12);
-            Bo = ROL32((Aku1^Du1), 4);
-            Aka1 =   Ba ^((~Be)&  Bi );
-            Ake1 =   Be ^((~Bi)&  Bo );
-            Aki1 =   Bi ^((~Bo)&  Bu );
-            Ako1 =   Bo ^((~Bu)&  Ba );
-            Aku1 =   Bu ^((~Ba)&  Be );
-
-            Be = ROL32((Ama0^Da0), 18);
-            Bi = ROL32((Ame0^De0), 5);
-            Bo = ROL32((Ami0^Di1), 8);
-            Bu = ROL32((Amo0^Do0), 28);
-            Ba = ROL32((Amu0^Du1), 14);
-            Ama0 =   Ba ^((~Be)&  Bi );
-            Ame0 =   Be ^((~Bi)&  Bo );
-            Ami0 =   Bi ^((~Bo)&  Bu );
-            Amo0 =   Bo ^((~Bu)&  Ba );
-            Amu0 =   Bu ^((~Ba)&  Be );
-
-            Be = ROL32((Ama1^Da1), 18);
-            Bi = ROL32((Ame1^De1), 5);
-            Bo = ROL32((Ami1^Di0), 7);
-            Bu = ROL32((Amo1^Do1), 28);
-            Ba = ROL32((Amu1^Du0), 13);
-            Ama1 =   Ba ^((~Be)&  Bi );
-            Ame1 =   Be ^((~Bi)&  Bo );
-            Ami1 =   Bi ^((~Bo)&  Bu );
-            Amo1 =   Bo ^((~Bu)&  Ba );
-            Amu1 =   Bu ^((~Ba)&  Be );
-
-            Bo = ROL32((Asa0^Da1), 21);
-            Bu = ROL32((Ase0^De0), 1);
-            Ba = ROL32((Asi0^Di0), 31);
-            Be = ROL32((Aso0^Do1), 28);
-            Bi = ROL32((Asu0^Du1), 20);
-            Asa0 =   Ba ^((~Be)&  Bi );
-            Ase0 =   Be ^((~Bi)&  Bo );
-            Asi0 =   Bi ^((~Bo)&  Bu );
-            Aso0 =   Bo ^((~Bu)&  Ba );
-            Asu0 =   Bu ^((~Ba)&  Be );
-
-            Bo = ROL32((Asa1^Da0), 20);
-            Bu = ROL32((Ase1^De1), 1);
-            Ba = ROL32((Asi1^Di1), 31);
-            Be = ROL32((Aso1^Do0), 27);
-            Bi = ROL32((Asu1^Du0), 19);
-            Asa1 =   Ba ^((~Be)&  Bi );
-            Ase1 =   Be ^((~Bi)&  Bo );
-            Asi1 =   Bi ^((~Bo)&  Bu );
-            Aso1 =   Bo ^((~Bu)&  Ba );
-            Asu1 =   Bu ^((~Ba)&  Be );
+            case 0: KeccakRound0(); /* fall through */
+            case 3: KeccakRound1(); /* entry when 3 rounds run on this pass; fall through */
+            case 2: KeccakRound2(); /* entry when 2 rounds run on this pass; fall through */
+            case 1: KeccakRound3(); /* entry when 1 round runs on this pass; last case, switch ends */
         }
-        while ( *pRoundConstants != 0xFF );
-
-        #undef Aba0
-        #undef Aba1
-        #undef Abe0
-        #undef Abe1
-        #undef Abi0
-        #undef Abi1
-        #undef Abo0
-        #undef Abo1
-        #undef Abu0
-        #undef Abu1
-        #undef Aga0
-        #undef Aga1
-        #undef Age0
-        #undef Age1
-        #undef Agi0
-        #undef Agi1
-        #undef Ago0
-        #undef Ago1
-        #undef Agu0
-        #undef Agu1
-        #undef Aka0
-        #undef Aka1
-        #undef Ake0
-        #undef Ake1
-        #undef Aki0
-        #undef Aki1
-        #undef Ako0
-        #undef Ako1
-        #undef Aku0
-        #undef Aku1
-        #undef Ama0
-        #undef Ama1
-        #undef Ame0
-        #undef Ame1
-        #undef Ami0
-        #undef Ami1
-        #undef Amo0
-        #undef Amo1
-        #undef Amu0
-        #undef Amu1
-        #undef Asa0
-        #undef Asa1
-        #undef Ase0
-        #undef Ase1
-        #undef Asi0
-        #undef Asi1
-        #undef Aso0
-        #undef Aso1
-        #undef Asu0
-        #undef Asu1
+        nRounds = 0; /* every later pass enters the switch at case 0 and runs all four rounds */
     }
+    while ( *pRoundConstants != 0xFF ); /* stop once the next round constant reads 0xFF (presumably an end-of-table marker -- confirm against the constant table) */
+
+    #undef Aba0
+    #undef Aba1
+    #undef Abe0
+    #undef Abe1
+    #undef Abi0
+    #undef Abi1
+    #undef Abo0
+    #undef Abo1
+    #undef Abu0
+    #undef Abu1
+    #undef Aga0
+    #undef Aga1
+    #undef Age0
+    #undef Age1
+    #undef Agi0
+    #undef Agi1
+    #undef Ago0
+    #undef Ago1
+    #undef Agu0
+    #undef Agu1
+    #undef Aka0
+    #undef Aka1
+    #undef Ake0
+    #undef Ake1
+    #undef Aki0
+    #undef Aki1
+    #undef Ako0
+    #undef Ako1
+    #undef Aku0
+    #undef Aku1
+    #undef Ama0
+    #undef Ama1
+    #undef Ame0
+    #undef Ame1
+    #undef Ami0
+    #undef Ami1
+    #undef Amo0
+    #undef Amo1
+    #undef Amu0
+    #undef Amu1
+    #undef Asa0
+    #undef Asa1
+    #undef Ase0
+    #undef Ase1
+    #undef Asi0
+    #undef Asi1
+    #undef Aso0
+    #undef Aso1
+    #undef Asu0
+    #undef Asu1
 }
 
 /* ---------------------------------------------------------------- */
diff --git a/Modules/_sha3/kcp/KeccakP-1600-opt64-config.h b/Modules/_sha3/kcp/KeccakP-1600-opt64-config.h
index 9501c64b186aa9..e6f16a4becb4b3 100644
--- a/Modules/_sha3/kcp/KeccakP-1600-opt64-config.h
+++ b/Modules/_sha3/kcp/KeccakP-1600-opt64-config.h
@@ -1,3 +1,7 @@
+/*
+This file defines some parameters of the implementation in the parent directory.
+*/
+
 #define KeccakP1600_implementation_config "lane complementing, all rounds unrolled"
 #define KeccakP1600_fullUnrolling
 #define KeccakP1600_useLaneComplementing
diff --git a/Modules/_sha3/kcp/KeccakP-1600-opt64.c b/Modules/_sha3/kcp/KeccakP-1600-opt64.c
index c90010dd9256c1..1673abe877b83e 100644
--- a/Modules/_sha3/kcp/KeccakP-1600-opt64.c
+++ b/Modules/_sha3/kcp/KeccakP-1600-opt64.c
@@ -1,28 +1,33 @@
 /*
-Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
-Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
-denoted as "the implementer".
+The eXtended Keccak Code Package (XKCP)
+https://github.com/XKCP/XKCP
 
-For more information, feedback or questions, please refer to our websites:
-http://keccak.noekeon.org/
-http://keyak.noekeon.org/
-http://ketje.noekeon.org/
+The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche.
+
+Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer".
+
+For more information, feedback or questions, please refer to the Keccak Team website:
+https://keccak.team/
 
 To the extent possible under law, the implementer has waived all copyright
 and related or neighboring rights to the source code in this file.
 http://creativecommons.org/publicdomain/zero/1.0/
+
+---
+
+This file implements Keccak-p[1600] in a SnP-compatible way.
+Please refer to SnP-documentation.h for more details.
+
+This implementation comes with KeccakP-1600-SnP.h in the same folder.
+Please refer to LowLevel.build for the exact list of other files it must be combined with.
 */
 
+#include <stdint.h>
 #include <string.h>
 #include <stdlib.h>
 /* #include "brg_endian.h" */
 #include "KeccakP-1600-opt64-config.h"
 
-#if NOT_PYTHON
-typedef unsigned char UINT8;
-/* typedef unsigned long long int UINT64; */
-#endif
-
 #if defined(KeccakP1600_useLaneComplementing)
 #define UseBebigokimisa
 #endif
@@ -31,13 +36,13 @@ typedef unsigned char UINT8;
 #define ROL64(a, offset) _rotl64(a, offset)
 #elif defined(KeccakP1600_useSHLD)
     #define ROL64(x,N) ({ \
-    register UINT64 __out; \
-    register UINT64 __in = x; \
+    register uint64_t __out; \
+    register uint64_t __in = x; \
     __asm__ ("shld %2,%0,%0" : "=r"(__out) : "0"(__in), "i"(N)); \
     __out; \
     })
 #else
-#define ROL64(a, offset) ((((UINT64)a) << offset) ^ (((UINT64)a) >> (64-offset)))
+#define ROL64(a, offset) ((((uint64_t)(a)) << (offset)) ^ (((uint64_t)(a)) >> (64-(offset)))) /* rotate a left by offset bits; arguments parenthesized so expression arguments expand safely; offset must be 1..63 (0 would shift by 64) */
 #endif
 
 #include "KeccakP-1600-64.macros"
@@ -49,7 +54,7 @@ typedef unsigned char UINT8;
 #include "KeccakP-1600-unrolling.macros"
 #include "SnP-Relaned.h"
 
-static const UINT64 KeccakF1600RoundConstants[24] = {
+static const uint64_t KeccakF1600RoundConstants[24] = {
     0x0000000000000001ULL,
     0x0000000000008082ULL,
     0x800000000000808aULL,
@@ -81,12 +86,12 @@ void KeccakP1600_Initialize(void *state)
 {
     memset(state, 0, 200);
 #ifdef KeccakP1600_useLaneComplementing
-    ((UINT64*)state)[ 1] = ~(UINT64)0;
-    ((UINT64*)state)[ 2] = ~(UINT64)0;
-    ((UINT64*)state)[ 8] = ~(UINT64)0;
-    ((UINT64*)state)[12] = ~(UINT64)0;
-    ((UINT64*)state)[17] = ~(UINT64)0;
-    ((UINT64*)state)[20] = ~(UINT64)0;
+    ((uint64_t*)state)[ 1] = ~(uint64_t)0;
+    ((uint64_t*)state)[ 2] = ~(uint64_t)0;
+    ((uint64_t*)state)[ 8] = ~(uint64_t)0;
+    ((uint64_t*)state)[12] = ~(uint64_t)0;
+    ((uint64_t*)state)[17] = ~(uint64_t)0;
+    ((uint64_t*)state)[20] = ~(uint64_t)0;
 #endif
 }
 
@@ -95,7 +100,7 @@ void KeccakP1600_Initialize(void *state)
 void KeccakP1600_AddBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length)
 {
 #if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
-    UINT64 lane;
+    uint64_t lane;
     if (length == 0)
         return;
     if (length == 1)
@@ -106,12 +111,12 @@ void KeccakP1600_AddBytesInLane(void *state, unsigned int lanePosition, const un
     }
     lane <<= offset*8;
 #else
-    UINT64 lane = 0;
+    uint64_t lane = 0;
     unsigned int i;
     for(i=0; i<length; i++)
-        lane |= ((UINT64)data[i]) << ((i+offset)*8);
+        lane |= ((uint64_t)data[i]) << ((i+offset)*8);
 #endif
-    ((UINT64*)state)[lanePosition] ^= lane;
+    ((uint64_t*)state)[lanePosition] ^= lane;
 }
 
 /* ---------------------------------------------------------------- */
@@ -122,7 +127,6 @@ void KeccakP1600_AddLanes(void *state, const unsigned char *data, unsigned int l
     unsigned int i = 0;
 #ifdef NO_MISALIGNED_ACCESSES
     /* If either pointer is misaligned, fall back to byte-wise xor. */
-
     if (((((uintptr_t)state) & 7) != 0) || ((((uintptr_t)data) & 7) != 0)) {
       for (i = 0; i < laneCount * 8; i++) {
         ((unsigned char*)state)[i] ^= data[i];
@@ -132,44 +136,43 @@ void KeccakP1600_AddLanes(void *state, const unsigned char *data, unsigned int l
 #endif
     {
       /* Otherwise... */
-
       for( ; (i+8)<=laneCount; i+=8) {
-          ((UINT64*)state)[i+0] ^= ((UINT64*)data)[i+0];
-          ((UINT64*)state)[i+1] ^= ((UINT64*)data)[i+1];
-          ((UINT64*)state)[i+2] ^= ((UINT64*)data)[i+2];
-          ((UINT64*)state)[i+3] ^= ((UINT64*)data)[i+3];
-          ((UINT64*)state)[i+4] ^= ((UINT64*)data)[i+4];
-          ((UINT64*)state)[i+5] ^= ((UINT64*)data)[i+5];
-          ((UINT64*)state)[i+6] ^= ((UINT64*)data)[i+6];
-          ((UINT64*)state)[i+7] ^= ((UINT64*)data)[i+7];
+          ((uint64_t*)state)[i+0] ^= ((uint64_t*)data)[i+0];
+          ((uint64_t*)state)[i+1] ^= ((uint64_t*)data)[i+1];
+          ((uint64_t*)state)[i+2] ^= ((uint64_t*)data)[i+2];
+          ((uint64_t*)state)[i+3] ^= ((uint64_t*)data)[i+3];
+          ((uint64_t*)state)[i+4] ^= ((uint64_t*)data)[i+4];
+          ((uint64_t*)state)[i+5] ^= ((uint64_t*)data)[i+5];
+          ((uint64_t*)state)[i+6] ^= ((uint64_t*)data)[i+6];
+          ((uint64_t*)state)[i+7] ^= ((uint64_t*)data)[i+7];
       }
       for( ; (i+4)<=laneCount; i+=4) {
-          ((UINT64*)state)[i+0] ^= ((UINT64*)data)[i+0];
-          ((UINT64*)state)[i+1] ^= ((UINT64*)data)[i+1];
-          ((UINT64*)state)[i+2] ^= ((UINT64*)data)[i+2];
-          ((UINT64*)state)[i+3] ^= ((UINT64*)data)[i+3];
+          ((uint64_t*)state)[i+0] ^= ((uint64_t*)data)[i+0];
+          ((uint64_t*)state)[i+1] ^= ((uint64_t*)data)[i+1];
+          ((uint64_t*)state)[i+2] ^= ((uint64_t*)data)[i+2];
+          ((uint64_t*)state)[i+3] ^= ((uint64_t*)data)[i+3];
       }
       for( ; (i+2)<=laneCount; i+=2) {
-          ((UINT64*)state)[i+0] ^= ((UINT64*)data)[i+0];
-          ((UINT64*)state)[i+1] ^= ((UINT64*)data)[i+1];
+          ((uint64_t*)state)[i+0] ^= ((uint64_t*)data)[i+0];
+          ((uint64_t*)state)[i+1] ^= ((uint64_t*)data)[i+1];
       }
       if (i<laneCount) {
-          ((UINT64*)state)[i+0] ^= ((UINT64*)data)[i+0];
+          ((uint64_t*)state)[i+0] ^= ((uint64_t*)data)[i+0];
       }
     }
 #else
     unsigned int i;
-    UINT8 *curData = data;
+    const uint8_t *curData = data;
     for(i=0; i<laneCount; i++, curData+=8) {
-        UINT64 lane = (UINT64)curData[0]
-            | ((UINT64)curData[1] << 8)
-            | ((UINT64)curData[2] << 16)
-            | ((UINT64)curData[3] << 24)
-            | ((UINT64)curData[4] <<32)
-            | ((UINT64)curData[5] << 40)
-            | ((UINT64)curData[6] << 48)
-            | ((UINT64)curData[7] << 56);
-        ((UINT64*)state)[i] ^= lane;
+        uint64_t lane = (uint64_t)curData[0]
+            | ((uint64_t)curData[1] <<  8)
+            | ((uint64_t)curData[2] << 16)
+            | ((uint64_t)curData[3] << 24)
+            | ((uint64_t)curData[4] << 32)
+            | ((uint64_t)curData[5] << 40)
+            | ((uint64_t)curData[6] << 48)
+            | ((uint64_t)curData[7] << 56);
+        ((uint64_t*)state)[i] ^= lane;
     }
 #endif
 }
@@ -179,9 +182,9 @@ void KeccakP1600_AddLanes(void *state, const unsigned char *data, unsigned int l
 #if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
 void KeccakP1600_AddByte(void *state, unsigned char byte, unsigned int offset)
 {
-    UINT64 lane = byte;
+    uint64_t lane = byte;
     lane <<= (offset%8)*8;
-    ((UINT64*)state)[offset/8] ^= lane;
+    ((uint64_t*)state)[offset/8] ^= lane;
 }
 #endif
 
@@ -209,7 +212,18 @@ void KeccakP1600_OverwriteBytesInLane(void *state, unsigned int lanePosition, co
         memcpy((unsigned char*)state+lanePosition*8+offset, data, length);
     }
 #else
-#error "Not yet implemented"
+    uint64_t lane = ((uint64_t*)state)[lanePosition];
+    unsigned int i;
+    for(i=0; i<length; i++) {
+        lane &= ~((uint64_t)0xFF << ((offset+i)*8));
+#ifdef KeccakP1600_useLaneComplementing
+        if ((lanePosition == 1) || (lanePosition == 2) || (lanePosition == 8) || (lanePosition == 12) || (lanePosition == 17) || (lanePosition == 20))
+            lane |= (uint64_t)(data[i] ^ 0xFF) << ((offset+i)*8);
+        else
+#endif
+            lane |= (uint64_t)data[i] << ((offset+i)*8);
+    }
+    ((uint64_t*)state)[lanePosition] = lane;
 #endif
 }
 
@@ -223,14 +237,31 @@ void KeccakP1600_OverwriteLanes(void *state, const unsigned char *data, unsigned
 
     for(lanePosition=0; lanePosition<laneCount; lanePosition++)
         if ((lanePosition == 1) || (lanePosition == 2) || (lanePosition == 8) || (lanePosition == 12) || (lanePosition == 17) || (lanePosition == 20))
-            ((UINT64*)state)[lanePosition] = ~((const UINT64*)data)[lanePosition];
+            ((uint64_t*)state)[lanePosition] = ~((const uint64_t*)data)[lanePosition];
         else
-            ((UINT64*)state)[lanePosition] = ((const UINT64*)data)[lanePosition];
+            ((uint64_t*)state)[lanePosition] = ((const uint64_t*)data)[lanePosition];
 #else
     memcpy(state, data, laneCount*8);
 #endif
 #else
-#error "Not yet implemented"
+    unsigned int lanePosition;
+    const uint8_t *curData = data;
+    for(lanePosition=0; lanePosition<laneCount; lanePosition++, curData+=8) {
+        uint64_t lane = (uint64_t)curData[0]
+            | ((uint64_t)curData[1] <<  8)
+            | ((uint64_t)curData[2] << 16)
+            | ((uint64_t)curData[3] << 24)
+            | ((uint64_t)curData[4] << 32)
+            | ((uint64_t)curData[5] << 40)
+            | ((uint64_t)curData[6] << 48)
+            | ((uint64_t)curData[7] << 56);
+#ifdef KeccakP1600_useLaneComplementing
+        if ((lanePosition == 1) || (lanePosition == 2) || (lanePosition == 8) || (lanePosition == 12) || (lanePosition == 17) || (lanePosition == 20))
+            ((uint64_t*)state)[lanePosition] = ~lane;
+        else
+#endif
+            ((uint64_t*)state)[lanePosition] = lane;
+    }
 #endif
 }
 
@@ -251,9 +282,9 @@ void KeccakP1600_OverwriteWithZeroes(void *state, unsigned int byteCount)
 
     for(lanePosition=0; lanePosition<byteCount/8; lanePosition++)
         if ((lanePosition == 1) || (lanePosition == 2) || (lanePosition == 8) || (lanePosition == 12) || (lanePosition == 17) || (lanePosition == 20))
-            ((UINT64*)state)[lanePosition] = ~0;
+            ((uint64_t*)state)[lanePosition] = ~0;
         else
-            ((UINT64*)state)[lanePosition] = 0;
+            ((uint64_t*)state)[lanePosition] = 0;
     if (byteCount%8 != 0) {
         lanePosition = byteCount/8;
         if ((lanePosition == 1) || (lanePosition == 2) || (lanePosition == 8) || (lanePosition == 12) || (lanePosition == 17) || (lanePosition == 20))
@@ -265,19 +296,56 @@ void KeccakP1600_OverwriteWithZeroes(void *state, unsigned int byteCount)
     memset(state, 0, byteCount);
 #endif
 #else
-#error "Not yet implemented"
+    unsigned int i, j;
+    for(i=0; i<byteCount; i+=8) {
+        unsigned int lanePosition = i/8;
+        if (i+8 <= byteCount) {
+#ifdef KeccakP1600_useLaneComplementing
+            if ((lanePosition == 1) || (lanePosition == 2) || (lanePosition == 8) || (lanePosition == 12) || (lanePosition == 17) || (lanePosition == 20))
+                ((uint64_t*)state)[lanePosition] = ~(uint64_t)0;
+            else
+#endif
+                ((uint64_t*)state)[lanePosition] = 0;
+        }
+        else {
+            uint64_t lane = ((uint64_t*)state)[lanePosition];
+            for(j=0; j<byteCount%8; j++) {
+#ifdef KeccakP1600_useLaneComplementing
+                if ((lanePosition == 1) || (lanePosition == 2) || (lanePosition == 8) || (lanePosition == 12) || (lanePosition == 17) || (lanePosition == 20))
+                    lane |= (uint64_t)0xFF << (j*8);
+                else
+#endif
+                    lane &= ~((uint64_t)0xFF << (j*8));
+            }
+            ((uint64_t*)state)[lanePosition] = lane;
+        }
+    }
 #endif
 }
 
 /* ---------------------------------------------------------------- */
 
+void KeccakP1600_Permute_Nrounds(void *state, unsigned int nr) /* permute state using roundsN(nr): round count chosen at run time */
+{
+    declareABCDE /* working variables for the round macros; the macro is defined outside this file */
+    unsigned int i; /* declared unconditionally, unlike the 24/12-round variants; presumably roundsN() always uses it as a loop counter -- confirm in the macros */
+    uint64_t *stateAsLanes = (uint64_t*)state; /* view the caller's state as 64-bit lanes */
+
+    copyFromState(A, stateAsLanes) /* presumably loads the lanes into the A working variables */
+    roundsN(nr) /* round loop driven by nr; which rounds are applied is decided inside the macro */
+    copyToState(stateAsLanes, A) /* presumably writes the working variables back into state */
+
+}
+
+/* ---------------------------------------------------------------- */
+
 void KeccakP1600_Permute_24rounds(void *state)
 {
     declareABCDE
     #ifndef KeccakP1600_fullUnrolling
     unsigned int i;
     #endif
-    UINT64 *stateAsLanes = (UINT64*)state;
+    uint64_t *stateAsLanes = (uint64_t*)state;
 
     copyFromState(A, stateAsLanes)
     rounds24
@@ -292,7 +360,7 @@ void KeccakP1600_Permute_12rounds(void *state)
     #ifndef KeccakP1600_fullUnrolling
     unsigned int i;
     #endif
-    UINT64 *stateAsLanes = (UINT64*)state;
+    uint64_t *stateAsLanes = (uint64_t*)state;
 
     copyFromState(A, stateAsLanes)
     rounds12
@@ -303,16 +371,16 @@ void KeccakP1600_Permute_12rounds(void *state)
 
 void KeccakP1600_ExtractBytesInLane(const void *state, unsigned int lanePosition, unsigned char *data, unsigned int offset, unsigned int length)
 {
-    UINT64 lane = ((UINT64*)state)[lanePosition];
+    uint64_t lane = ((uint64_t*)state)[lanePosition];
 #ifdef KeccakP1600_useLaneComplementing
     if ((lanePosition == 1) || (lanePosition == 2) || (lanePosition == 8) || (lanePosition == 12) || (lanePosition == 17) || (lanePosition == 20))
         lane = ~lane;
 #endif
 #if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
     {
-        UINT64 lane1[1];
+        uint64_t lane1[1];
         lane1[0] = lane;
-        memcpy(data, (UINT8*)lane1+offset, length);
+        memcpy(data, (uint8_t*)lane1+offset, length);
     }
 #else
     unsigned int i;
@@ -327,7 +395,7 @@ void KeccakP1600_ExtractBytesInLane(const void *state, unsigned int lanePosition
 /* ---------------------------------------------------------------- */
 
 #if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
-void fromWordToBytes(UINT8 *bytes, const UINT64 word)
+static void fromWordToBytes(uint8_t *bytes, const uint64_t word)
 {
     unsigned int i;
 
@@ -344,21 +412,21 @@ void KeccakP1600_ExtractLanes(const void *state, unsigned char *data, unsigned i
     unsigned int i;
 
     for(i=0; i<laneCount; i++)
-        fromWordToBytes(data+(i*8), ((const UINT64*)state)[i]);
+        fromWordToBytes(data+(i*8), ((const uint64_t*)state)[i]);
 #endif
 #ifdef KeccakP1600_useLaneComplementing
     if (laneCount > 1) {
-        ((UINT64*)data)[ 1] = ~((UINT64*)data)[ 1];
+        ((uint64_t*)data)[ 1] = ~((uint64_t*)data)[ 1];
         if (laneCount > 2) {
-            ((UINT64*)data)[ 2] = ~((UINT64*)data)[ 2];
+            ((uint64_t*)data)[ 2] = ~((uint64_t*)data)[ 2];
             if (laneCount > 8) {
-                ((UINT64*)data)[ 8] = ~((UINT64*)data)[ 8];
+                ((uint64_t*)data)[ 8] = ~((uint64_t*)data)[ 8];
                 if (laneCount > 12) {
-                    ((UINT64*)data)[12] = ~((UINT64*)data)[12];
+                    ((uint64_t*)data)[12] = ~((uint64_t*)data)[12];
                     if (laneCount > 17) {
-                        ((UINT64*)data)[17] = ~((UINT64*)data)[17];
+                        ((uint64_t*)data)[17] = ~((uint64_t*)data)[17];
                         if (laneCount > 20) {
-                            ((UINT64*)data)[20] = ~((UINT64*)data)[20];
+                            ((uint64_t*)data)[20] = ~((uint64_t*)data)[20];
                         }
                     }
                 }
@@ -379,7 +447,7 @@ void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned i
 
 void KeccakP1600_ExtractAndAddBytesInLane(const void *state, unsigned int lanePosition, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
 {
-    UINT64 lane = ((UINT64*)state)[lanePosition];
+    uint64_t lane = ((uint64_t*)state)[lanePosition];
 #ifdef KeccakP1600_useLaneComplementing
     if ((lanePosition == 1) || (lanePosition == 2) || (lanePosition == 8) || (lanePosition == 12) || (lanePosition == 17) || (lanePosition == 20))
         lane = ~lane;
@@ -387,10 +455,10 @@ void KeccakP1600_ExtractAndAddBytesInLane(const void *state, unsigned int lanePo
 #if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
     {
         unsigned int i;
-        UINT64 lane1[1];
+        uint64_t lane1[1];
         lane1[0] = lane;
         for(i=0; i<length; i++)
-            output[i] = input[i] ^ ((UINT8*)lane1)[offset+i];
+            output[i] = input[i] ^ ((uint8_t*)lane1)[offset+i];
     }
 #else
     unsigned int i;
@@ -414,26 +482,26 @@ void KeccakP1600_ExtractAndAddLanes(const void *state, const unsigned char *inpu
 
     for(i=0; i<laneCount; i++) {
 #if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
-        ((UINT64*)output)[i] = ((UINT64*)input)[i] ^ ((const UINT64*)state)[i];
+        ((uint64_t*)output)[i] = ((uint64_t*)input)[i] ^ ((const uint64_t*)state)[i];
 #else
-        fromWordToBytes(temp, ((const UINT64*)state)[i]);
+        fromWordToBytes(temp, ((const uint64_t*)state)[i]);
         for(j=0; j<8; j++)
             output[i*8+j] = input[i*8+j] ^ temp[j];
 #endif
     }
 #ifdef KeccakP1600_useLaneComplementing
     if (laneCount > 1) {
-        ((UINT64*)output)[ 1] = ~((UINT64*)output)[ 1];
+        ((uint64_t*)output)[ 1] = ~((uint64_t*)output)[ 1];
         if (laneCount > 2) {
-            ((UINT64*)output)[ 2] = ~((UINT64*)output)[ 2];
+            ((uint64_t*)output)[ 2] = ~((uint64_t*)output)[ 2];
             if (laneCount > 8) {
-                ((UINT64*)output)[ 8] = ~((UINT64*)output)[ 8];
+                ((uint64_t*)output)[ 8] = ~((uint64_t*)output)[ 8];
                 if (laneCount > 12) {
-                    ((UINT64*)output)[12] = ~((UINT64*)output)[12];
+                    ((uint64_t*)output)[12] = ~((uint64_t*)output)[12];
                     if (laneCount > 17) {
-                        ((UINT64*)output)[17] = ~((UINT64*)output)[17];
+                        ((uint64_t*)output)[17] = ~((uint64_t*)output)[17];
                         if (laneCount > 20) {
-                            ((UINT64*)output)[20] = ~((UINT64*)output)[20];
+                            ((uint64_t*)output)[20] = ~((uint64_t*)output)[20];
                         }
                     }
                 }
@@ -459,8 +527,8 @@ size_t KeccakF1600_FastLoop_Absorb(void *state, unsigned int laneCount, const un
     #ifndef KeccakP1600_fullUnrolling
     unsigned int i;
     #endif
-    UINT64 *stateAsLanes = (UINT64*)state;
-    UINT64 *inDataAsLanes = (UINT64*)data;
+    uint64_t *stateAsLanes = (uint64_t*)state;
+    uint64_t *inDataAsLanes = (uint64_t*)data;
 
     copyFromState(A, stateAsLanes)
     while(dataByteLen >= laneCount*8) {
@@ -472,3 +540,26 @@ size_t KeccakF1600_FastLoop_Absorb(void *state, unsigned int laneCount, const un
     copyToState(stateAsLanes, A)
     return originalDataByteLen - dataByteLen;
 }
+
+/* ---------------------------------------------------------------- */
+
+size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen) /* absorbs whole laneCount*8-byte blocks with rounds12; returns the number of bytes consumed */
+{
+    size_t originalDataByteLen = dataByteLen; /* remembered to compute the consumed byte count at the end */
+    declareABCDE
+    #ifndef KeccakP1600_fullUnrolling
+    unsigned int i; /* loop index used by the loop-based rounds12 variants */
+    #endif
+    uint64_t *stateAsLanes = (uint64_t*)state;
+    uint64_t *inDataAsLanes = (uint64_t*)data; /* NOTE(review): cast drops const and reads the input as 64-bit lanes -- presumably relies on suitable alignment and byte order; confirm */
+
+    copyFromState(A, stateAsLanes)
+    while(dataByteLen >= laneCount*8) { /* one iteration per complete block; laneCount == 0 would never terminate */
+        addInput(A, inDataAsLanes, laneCount) /* presumably combines laneCount input lanes into A -- helper defined elsewhere */
+        rounds12
+        inDataAsLanes += laneCount; /* advance by one block, counted in lanes */
+        dataByteLen -= laneCount*8; /* same block, counted in bytes */
+    }
+    copyToState(stateAsLanes, A)
+    return originalDataByteLen - dataByteLen; /* bytes consumed; a trailing partial block is not processed here */
+}
diff --git a/Modules/_sha3/kcp/KeccakP-1600-unrolling.macros b/Modules/_sha3/kcp/KeccakP-1600-unrolling.macros
index 405ce29724cedd..9f72002262b2ef 100644
--- a/Modules/_sha3/kcp/KeccakP-1600-unrolling.macros
+++ b/Modules/_sha3/kcp/KeccakP-1600-unrolling.macros
@@ -1,12 +1,13 @@
 /*
-Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
-Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
-denoted as "the implementer".
+The eXtended Keccak Code Package (XKCP)
+https://github.com/XKCP/XKCP
 
-For more information, feedback or questions, please refer to our websites:
-http://keccak.noekeon.org/
-http://keyak.noekeon.org/
-http://ketje.noekeon.org/
+The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche.
+
+Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer".
+
+For more information, feedback or questions, please refer to the Keccak Team website:
+https://keccak.team/
 
 To the extent possible under law, the implementer has waived all copyright
 and related or neighboring rights to the source code in this file.
@@ -56,6 +57,22 @@ http://creativecommons.org/publicdomain/zero/1.0/
     thetaRhoPiChiIotaPrepareTheta(22, A, E) \
     thetaRhoPiChiIota(23, E, A) \
 
+#define rounds6 /* round indices 18..23, written out without a loop */ \
+    prepareTheta \
+    thetaRhoPiChiIotaPrepareTheta(18, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(19, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(20, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(21, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(22, A, E) \
+    thetaRhoPiChiIota(23, E, A) /* last round uses the variant without PrepareTheta */ \
+
+#define rounds4 /* round indices 20..23, written out without a loop */ \
+    prepareTheta \
+    thetaRhoPiChiIotaPrepareTheta(20, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(21, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(22, A, E) \
+    thetaRhoPiChiIota(23, E, A) /* last round uses the variant without PrepareTheta */ \
+
 #elif (Unrolling == 12)
 #define rounds24 \
     prepareTheta \
@@ -89,6 +106,22 @@ http://creativecommons.org/publicdomain/zero/1.0/
     thetaRhoPiChiIotaPrepareTheta(22, A, E) \
     thetaRhoPiChiIota(23, E, A) \
 
+#define rounds6 /* Unrolling == 12: round indices 18..23, no loop needed */ \
+    prepareTheta \
+    thetaRhoPiChiIotaPrepareTheta(18, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(19, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(20, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(21, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(22, A, E) \
+    thetaRhoPiChiIota(23, E, A) /* last round uses the variant without PrepareTheta */ \
+
+#define rounds4 /* Unrolling == 12: round indices 20..23, no loop needed */ \
+    prepareTheta \
+    thetaRhoPiChiIotaPrepareTheta(20, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(21, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(22, A, E) \
+    thetaRhoPiChiIota(23, E, A) /* last round uses the variant without PrepareTheta */ \
+
 #elif (Unrolling == 6)
 #define rounds24 \
     prepareTheta \
@@ -112,6 +145,22 @@ http://creativecommons.org/publicdomain/zero/1.0/
         thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
     } \
 
+#define rounds6 /* Unrolling == 6: round indices 18..23 as one straight-line group */ \
+    prepareTheta \
+    thetaRhoPiChiIotaPrepareTheta(18, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(19, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(20, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(21, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(22, A, E) \
+    thetaRhoPiChiIota(23, E, A) /* last round uses the variant without PrepareTheta */ \
+
+#define rounds4 /* Unrolling == 6: round indices 20..23, no loop needed */ \
+    prepareTheta \
+    thetaRhoPiChiIotaPrepareTheta(20, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(21, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(22, A, E) \
+    thetaRhoPiChiIota(23, E, A) /* last round uses the variant without PrepareTheta */ \
+
 #elif (Unrolling == 4)
 #define rounds24 \
     prepareTheta \
@@ -131,6 +180,20 @@ http://creativecommons.org/publicdomain/zero/1.0/
         thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
     } \
 
+#define rounds6 /* Unrolling == 4: 6 is not a multiple of 4, so round indices 18..23 run in pairs */ \
+    prepareTheta \
+    for(i=18; i<24; i+=2) { /* requires i to be declared by the enclosing function */ \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+    } \
+
+#define rounds4 /* Unrolling == 4: round indices 20..23 as one straight-line group */ \
+    prepareTheta \
+    thetaRhoPiChiIotaPrepareTheta(20, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(21, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(22, A, E) \
+    thetaRhoPiChiIota(23, E, A) /* last round uses the variant without PrepareTheta */ \
+
 #elif (Unrolling == 3)
 #define rounds24 \
     prepareTheta \
@@ -150,6 +213,22 @@ http://creativecommons.org/publicdomain/zero/1.0/
         copyStateVariables(A, E) \
     } \
 
+#define rounds6 /* Unrolling == 3: round indices 18..23 in two groups of three */ \
+    prepareTheta \
+    for(i=18; i<24; i+=3) { /* requires i to be declared by the enclosing function */ \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+        copyStateVariables(A, E) /* an odd group ends in E; presumably copies E back into A */ \
+    } \
+
+#define rounds4 /* Unrolling == 3: 4 is not a multiple of 3, so round indices 20..23 run in pairs */ \
+    prepareTheta \
+    for(i=20; i<24; i+=2) { /* requires i to be declared by the enclosing function */ \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+    } \
+
 #elif (Unrolling == 2)
 #define rounds24 \
     prepareTheta \
@@ -165,6 +244,20 @@ http://creativecommons.org/publicdomain/zero/1.0/
         thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
     } \
 
+#define rounds6 /* Unrolling == 2: round indices 18..23, two rounds per iteration */ \
+    prepareTheta \
+    for(i=18; i<24; i+=2) { /* requires i to be declared by the enclosing function */ \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+    } \
+
+#define rounds4 /* Unrolling == 2: round indices 20..23, two rounds per iteration */ \
+    prepareTheta \
+    for(i=20; i<24; i+=2) { /* requires i to be declared by the enclosing function */ \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+    } \
+
 #elif (Unrolling == 1)
 #define rounds24 \
     prepareTheta \
@@ -180,6 +273,33 @@ http://creativecommons.org/publicdomain/zero/1.0/
         copyStateVariables(A, E) \
     } \
 
+#define rounds6 /* Unrolling == 1: round indices 18..23, one round per iteration */ \
+    prepareTheta \
+    for(i=18; i<24; i++) { /* requires i to be declared by the enclosing function */ \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        copyStateVariables(A, E) /* presumably copies E back into A for the next iteration */ \
+    } \
+
+#define rounds4 /* Unrolling == 1: round indices 20..23, one round per iteration */ \
+    prepareTheta \
+    for(i=20; i<24; i++) { /* requires i to be declared by the enclosing function */ \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        copyStateVariables(A, E) /* presumably copies E back into A for the next iteration */ \
+    } \
+
 #else
 #error "Unrolling is not correctly specified!"
 #endif
+
+#define roundsN(__nrounds) /* runs round indices 24-__nrounds .. 23; defined once, after all Unrolling branches */ \
+    prepareTheta \
+    i = 24 - (__nrounds); /* first round index; i must be declared by the enclosing function */ \
+    if ((i&1) != 0) { /* odd start index: run one round alone so the remainder can go in pairs */ \
+        thetaRhoPiChiIotaPrepareTheta(i, A, E) \
+        copyStateVariables(A, E) /* presumably copies E back into A before the paired loop */ \
+        ++i; \
+    } \
+    for( /* empty */; i<24; i+=2) { /* remaining rounds two at a time: A -> E, then E -> A */ \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+    } /* NOTE(review): the parameter name __nrounds is a reserved (double-underscore) identifier in C */
diff --git a/Modules/_sha3/kcp/KeccakSponge.c b/Modules/_sha3/kcp/KeccakSponge.c
index afdb73172f3478..350df772e426dd 100644
--- a/Modules/_sha3/kcp/KeccakSponge.c
+++ b/Modules/_sha3/kcp/KeccakSponge.c
@@ -1,12 +1,13 @@
 /*
-Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
-Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
-denoted as "the implementer".
+The eXtended Keccak Code Package (XKCP)
+https://github.com/XKCP/XKCP
 
-For more information, feedback or questions, please refer to our websites:
-http://keccak.noekeon.org/
-http://keyak.noekeon.org/
-http://ketje.noekeon.org/
+Keccak, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche.
+
+Implementation by the designers, hereby denoted as "the implementer".
+
+For more information, feedback or questions, please refer to the Keccak Team website:
+https://keccak.team/
 
 To the extent possible under law, the implementer has waived all copyright
 and related or neighboring rights to the source code in this file.
@@ -19,7 +20,7 @@ and related or neighboring rights to the source code in this file.
     #include "displayIntermediateValues.h"
 #endif
 
-#ifndef KeccakP200_excluded
+#ifdef XKCP_has_KeccakP200
     #include "KeccakP-200-SnP.h"
 
     #define prefix KeccakWidth200
@@ -37,7 +38,7 @@ and related or neighboring rights to the source code in this file.
     #undef SnP_FastLoop_Absorb
 #endif
 
-#ifndef KeccakP400_excluded
+#ifdef XKCP_has_KeccakP400
     #include "KeccakP-400-SnP.h"
 
     #define prefix KeccakWidth400
@@ -55,7 +56,7 @@ and related or neighboring rights to the source code in this file.
     #undef SnP_FastLoop_Absorb
 #endif
 
-#ifndef KeccakP800_excluded
+#ifdef XKCP_has_KeccakP800
     #include "KeccakP-800-SnP.h"
 
     #define prefix KeccakWidth800
@@ -73,7 +74,7 @@ and related or neighboring rights to the source code in this file.
     #undef SnP_FastLoop_Absorb
 #endif
 
-#ifndef KeccakP1600_excluded
+#ifdef XKCP_has_KeccakP1600
     #include "KeccakP-1600-SnP.h"
 
     #define prefix KeccakWidth1600
@@ -90,3 +91,21 @@ and related or neighboring rights to the source code in this file.
     #undef SnP_Permute
     #undef SnP_FastLoop_Absorb
 #endif
+
+#ifdef XKCP_has_KeccakP1600 /* additional width-1600 instantiation that uses the 12-round permutation */
+    #include "KeccakP-1600-SnP.h"
+
+    #define prefix KeccakWidth1600_12rounds /* presumably the name prefix of the functions KeccakSponge.inc generates */
+    #define SnP KeccakP1600
+    #define SnP_width 1600
+    #define SnP_Permute KeccakP1600_Permute_12rounds /* 12-round permutation instead of the 24-round one */
+    #if defined(KeccakP1600_12rounds_FastLoop_supported)
+        #define SnP_FastLoop_Absorb KeccakP1600_12rounds_FastLoop_Absorb /* optional: KeccakSponge.inc checks #ifdef SnP_FastLoop_Absorb */
+    #endif
+        #include "KeccakSponge.inc" /* expands the sponge code with the settings above */
+    #undef prefix /* clear the per-instantiation settings again */
+    #undef SnP
+    #undef SnP_width
+    #undef SnP_Permute
+    #undef SnP_FastLoop_Absorb
+#endif
diff --git a/Modules/_sha3/kcp/KeccakSponge.h b/Modules/_sha3/kcp/KeccakSponge.h
index 0f4badcac059e9..5ef9bf29728022 100644
--- a/Modules/_sha3/kcp/KeccakSponge.h
+++ b/Modules/_sha3/kcp/KeccakSponge.h
@@ -1,12 +1,13 @@
 /*
-Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
-Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
-denoted as "the implementer".
+The eXtended Keccak Code Package (XKCP)
+https://github.com/XKCP/XKCP
 
-For more information, feedback or questions, please refer to our websites:
-http://keccak.noekeon.org/
-http://keyak.noekeon.org/
-http://ketje.noekeon.org/
+Keccak, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche.
+
+Implementation by the designers, hereby denoted as "the implementer".
+
+For more information, feedback or questions, please refer to the Keccak Team website:
+https://keccak.team/
 
 To the extent possible under law, the implementer has waived all copyright
 and related or neighboring rights to the source code in this file.
@@ -16,121 +17,14 @@ and related or neighboring rights to the source code in this file.
 #ifndef _KeccakSponge_h_
 #define _KeccakSponge_h_
 
-/** General information
-  *
-  * The following type and functions are not actually implemented. Their
-  * documentation is generic, with the prefix Prefix replaced by
-  * - KeccakWidth200 for a sponge function based on Keccak-f[200]
-  * - KeccakWidth400 for a sponge function based on Keccak-f[400]
-  * - KeccakWidth800 for a sponge function based on Keccak-f[800]
-  * - KeccakWidth1600 for a sponge function based on Keccak-f[1600]
-  *
-  * In all these functions, the rate and capacity must sum to the width of the
-  * chosen permutation. For instance, to use the sponge function
-  * Keccak[r=1344, c=256], one must use KeccakWidth1600_Sponge() or a combination
-  * of KeccakWidth1600_SpongeInitialize(), KeccakWidth1600_SpongeAbsorb(),
-  * KeccakWidth1600_SpongeAbsorbLastFewBits() and
-  * KeccakWidth1600_SpongeSqueeze().
-  *
-  * The Prefix_SpongeInstance contains the sponge instance attributes for use
-  * with the Prefix_Sponge* functions.
-  * It gathers the state processed by the permutation as well as the rate,
-  * the position of input/output bytes in the state and the phase
-  * (absorbing or squeezing).
-  */
-
-#ifdef DontReallyInclude_DocumentationOnly
-/** Function to evaluate the sponge function Keccak[r, c] in a single call.
-  * @param  rate        The value of the rate r.
-  * @param  capacity    The value of the capacity c.
-  * @param  input           Pointer to the input message (before the suffix).
-  * @param  inputByteLen    The length of the input message in bytes.
-  * @param  suffix          Byte containing from 0 to 7 suffix bits
-  *                         that must be absorbed after @a input.
-  *                         These <i>n</i> bits must be in the least significant bit positions.
-  *                         These bits must be delimited with a bit 1 at position <i>n</i>
-  *                         (counting from 0=LSB to 7=MSB) and followed by bits 0
-  *                         from position <i>n</i>+1 to position 7.
-  *                         Some examples:
-  *                             - If no bits are to be absorbed, then @a suffix must be 0x01.
-  *                             - If the 2-bit sequence 0,0 is to be absorbed, @a suffix must be 0x04.
-  *                             - If the 5-bit sequence 0,1,0,0,1 is to be absorbed, @a suffix must be 0x32.
-  *                             - If the 7-bit sequence 1,1,0,1,0,0,0 is to be absorbed, @a suffix must be 0x8B.
-  *                         .
-  * @param  output          Pointer to the output buffer.
-  * @param  outputByteLen   The desired number of output bytes.
-  * @pre    One must have r+c equal to the supported width of this implementation
-  *         and the rate a multiple of 8 bits (one byte) in this implementation.
-  * @pre    @a suffix ≠ 0x00
-  * @return Zero if successful, 1 otherwise.
-  */
-int Prefix_Sponge(unsigned int rate, unsigned int capacity, const unsigned char *input, size_t inputByteLen, unsigned char suffix, unsigned char *output, size_t outputByteLen);
-
-/**
-  * Function to initialize the state of the Keccak[r, c] sponge function.
-  * The phase of the sponge function is set to absorbing.
-  * @param  spongeInstance  Pointer to the sponge instance to be initialized.
-  * @param  rate        The value of the rate r.
-  * @param  capacity    The value of the capacity c.
-  * @pre    One must have r+c equal to the supported width of this implementation
-  *         and the rate a multiple of 8 bits (one byte) in this implementation.
-  * @return Zero if successful, 1 otherwise.
-  */
-int Prefix_SpongeInitialize(Prefix_SpongeInstance *spongeInstance, unsigned int rate, unsigned int capacity);
-
-/**
-  * Function to give input data bytes for the sponge function to absorb.
-  * @param  spongeInstance  Pointer to the sponge instance initialized by Prefix_SpongeInitialize().
-  * @param  data        Pointer to the input data.
-  * @param  dataByteLen  The number of input bytes provided in the input data.
-  * @pre    The sponge function must be in the absorbing phase,
-  *         i.e., Prefix_SpongeSqueeze() or Prefix_SpongeAbsorbLastFewBits()
-  *         must not have been called before.
-  * @return Zero if successful, 1 otherwise.
-  */
-int Prefix_SpongeAbsorb(Prefix_SpongeInstance *spongeInstance, const unsigned char *data, size_t dataByteLen);
-
-/**
-  * Function to give input data bits for the sponge function to absorb
-  * and then to switch to the squeezing phase.
-  * @param  spongeInstance  Pointer to the sponge instance initialized by Prefix_SpongeInitialize().
-  * @param  delimitedData   Byte containing from 0 to 7 trailing bits
-  *                     that must be absorbed.
-  *                     These <i>n</i> bits must be in the least significant bit positions.
-  *                     These bits must be delimited with a bit 1 at position <i>n</i>
-  *                     (counting from 0=LSB to 7=MSB) and followed by bits 0
-  *                     from position <i>n</i>+1 to position 7.
-  *                     Some examples:
-  *                         - If no bits are to be absorbed, then @a delimitedData must be 0x01.
-  *                         - If the 2-bit sequence 0,0 is to be absorbed, @a delimitedData must be 0x04.
-  *                         - If the 5-bit sequence 0,1,0,0,1 is to be absorbed, @a delimitedData must be 0x32.
-  *                         - If the 7-bit sequence 1,1,0,1,0,0,0 is to be absorbed, @a delimitedData must be 0x8B.
-  *                     .
-  * @pre    The sponge function must be in the absorbing phase,
-  *         i.e., Prefix_SpongeSqueeze() or Prefix_SpongeAbsorbLastFewBits()
-  *         must not have been called before.
-  * @pre    @a delimitedData ≠ 0x00
-  * @return Zero if successful, 1 otherwise.
-  */
-int Prefix_SpongeAbsorbLastFewBits(Prefix_SpongeInstance *spongeInstance, unsigned char delimitedData);
-
-/**
-  * Function to squeeze output data from the sponge function.
-  * If the sponge function was in the absorbing phase, this function
-  * switches it to the squeezing phase
-  * as if Prefix_SpongeAbsorbLastFewBits(spongeInstance, 0x01) was called.
-  * @param  spongeInstance  Pointer to the sponge instance initialized by Prefix_SpongeInitialize().
-  * @param  data        Pointer to the buffer where to store the output data.
-  * @param  dataByteLen The number of output bytes desired.
-  * @return Zero if successful, 1 otherwise.
-  */
-int Prefix_SpongeSqueeze(Prefix_SpongeInstance *spongeInstance, unsigned char *data, size_t dataByteLen);
-#endif
+/* For the documentation, please follow the link: */
+/* #include "KeccakSponge-documentation.h" */
 
 #include <string.h>
 #include "align.h"
+/* #include "config.h" */
 
-#define KCP_DeclareSpongeStructure(prefix, size, alignment) \
+#define XKCP_DeclareSpongeStructure(prefix, size, alignment) \
     ALIGN(alignment) typedef struct prefix##_SpongeInstanceStruct { \
         unsigned char state[size]; \
         unsigned int rate; \
@@ -138,35 +32,45 @@ int Prefix_SpongeSqueeze(Prefix_SpongeInstance *spongeInstance, unsigned char *d
         int squeezing; \
     } prefix##_SpongeInstance;
 
-#define KCP_DeclareSpongeFunctions(prefix) \
+#define XKCP_DeclareSpongeFunctions(prefix) \
     int prefix##_Sponge(unsigned int rate, unsigned int capacity, const unsigned char *input, size_t inputByteLen, unsigned char suffix, unsigned char *output, size_t outputByteLen); \
     int prefix##_SpongeInitialize(prefix##_SpongeInstance *spongeInstance, unsigned int rate, unsigned int capacity); \
     int prefix##_SpongeAbsorb(prefix##_SpongeInstance *spongeInstance, const unsigned char *data, size_t dataByteLen); \
     int prefix##_SpongeAbsorbLastFewBits(prefix##_SpongeInstance *spongeInstance, unsigned char delimitedData); \
     int prefix##_SpongeSqueeze(prefix##_SpongeInstance *spongeInstance, unsigned char *data, size_t dataByteLen);
 
-#ifndef KeccakP200_excluded
+#ifdef XKCP_has_KeccakP200
     #include "KeccakP-200-SnP.h"
-    KCP_DeclareSpongeStructure(KeccakWidth200, KeccakP200_stateSizeInBytes, KeccakP200_stateAlignment)
-    KCP_DeclareSpongeFunctions(KeccakWidth200)
+    XKCP_DeclareSpongeStructure(KeccakWidth200, KeccakP200_stateSizeInBytes, KeccakP200_stateAlignment)
+    XKCP_DeclareSpongeFunctions(KeccakWidth200)
+    #define XKCP_has_Sponge_Keccak_width200
 #endif
 
-#ifndef KeccakP400_excluded
+#ifdef XKCP_has_KeccakP400
     #include "KeccakP-400-SnP.h"
-    KCP_DeclareSpongeStructure(KeccakWidth400, KeccakP400_stateSizeInBytes, KeccakP400_stateAlignment)
-    KCP_DeclareSpongeFunctions(KeccakWidth400)
+    XKCP_DeclareSpongeStructure(KeccakWidth400, KeccakP400_stateSizeInBytes, KeccakP400_stateAlignment)
+    XKCP_DeclareSpongeFunctions(KeccakWidth400)
+    #define XKCP_has_Sponge_Keccak_width400
 #endif
 
-#ifndef KeccakP800_excluded
+#ifdef XKCP_has_KeccakP800
     #include "KeccakP-800-SnP.h"
-    KCP_DeclareSpongeStructure(KeccakWidth800, KeccakP800_stateSizeInBytes, KeccakP800_stateAlignment)
-    KCP_DeclareSpongeFunctions(KeccakWidth800)
+    XKCP_DeclareSpongeStructure(KeccakWidth800, KeccakP800_stateSizeInBytes, KeccakP800_stateAlignment)
+    XKCP_DeclareSpongeFunctions(KeccakWidth800)
+    #define XKCP_has_Sponge_Keccak_width800
+#endif
+
+#ifdef XKCP_has_KeccakP1600 /* declarations for the sponge over Keccak-p[1600] */
+    #include "KeccakP-1600-SnP.h"
+    XKCP_DeclareSpongeStructure(KeccakWidth1600, KeccakP1600_stateSizeInBytes, KeccakP1600_stateAlignment) /* defines the KeccakWidth1600_SpongeInstance type */
+    XKCP_DeclareSpongeFunctions(KeccakWidth1600) /* prototypes: KeccakWidth1600_Sponge, _SpongeInitialize, _SpongeAbsorb, _SpongeAbsorbLastFewBits, _SpongeSqueeze */
+    #define XKCP_has_Sponge_Keccak_width1600 /* presumably a feature flag other code can test -- confirm */
+#endif
 
-#ifndef KeccakP1600_excluded
+#ifdef XKCP_has_KeccakP1600
     #include "KeccakP-1600-SnP.h"
-    KCP_DeclareSpongeStructure(KeccakWidth1600, KeccakP1600_stateSizeInBytes, KeccakP1600_stateAlignment)
-    KCP_DeclareSpongeFunctions(KeccakWidth1600)
+    XKCP_DeclareSpongeStructure(KeccakWidth1600_12rounds, KeccakP1600_stateSizeInBytes, KeccakP1600_stateAlignment)
+    XKCP_DeclareSpongeFunctions(KeccakWidth1600_12rounds)
 #endif
 
 #endif
diff --git a/Modules/_sha3/kcp/KeccakSponge.inc b/Modules/_sha3/kcp/KeccakSponge.inc
index e10739deafa836..70080923ec7bb2 100644
--- a/Modules/_sha3/kcp/KeccakSponge.inc
+++ b/Modules/_sha3/kcp/KeccakSponge.inc
@@ -1,12 +1,13 @@
 /*
-Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
-Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
-denoted as "the implementer".
+The eXtended Keccak Code Package (XKCP)
+https://github.com/XKCP/XKCP
 
-For more information, feedback or questions, please refer to our websites:
-http://keccak.noekeon.org/
-http://keyak.noekeon.org/
-http://ketje.noekeon.org/
+Keccak, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche.
+
+Implementation by the designers, hereby denoted as "the implementer".
+
+For more information, feedback or questions, please refer to the Keccak Team website:
+https://keccak.team/
 
 To the extent possible under law, the implementer has waived all copyright
 and related or neighboring rights to the source code in this file.
@@ -47,16 +48,13 @@ int Sponge(unsigned int rate, unsigned int capacity, const unsigned char *input,
         return 1;
 
     /* Initialize the state */
-
     SnP_StaticInitialize();
     SnP_Initialize(state);
 
     /* First, absorb whole blocks */
-
 #ifdef SnP_FastLoop_Absorb
     if (((rateInBytes % (SnP_width/200)) == 0) && (inputByteLen >= rateInBytes)) {
         /* fast lane: whole lane rate */
-
         size_t j;
         j = SnP_FastLoop_Absorb(state, rateInBytes/(SnP_width/200), curInput, inputByteLen);
         curInput += j;
@@ -74,7 +72,6 @@ int Sponge(unsigned int rate, unsigned int capacity, const unsigned char *input,
     }
 
     /* Then, absorb what remains */
-
     partialBlock = (unsigned int)inputByteLen;
     #ifdef KeccakReference
     displayBytes(1, "Block to be absorbed (part)", curInput, partialBlock);
@@ -82,7 +79,6 @@ int Sponge(unsigned int rate, unsigned int capacity, const unsigned char *input,
     SnP_AddBytes(state, curInput, 0, partialBlock);
 
     /* Finally, absorb the suffix */
-
     #ifdef KeccakReference
     {
         unsigned char delimitedData1[1];
@@ -91,14 +87,11 @@ int Sponge(unsigned int rate, unsigned int capacity, const unsigned char *input,
     }
     #endif
     /* Last few bits, whose delimiter coincides with first bit of padding */
-
     SnP_AddByte(state, suffix, partialBlock);
     /* If the first bit of padding is at position rate-1, we need a whole new block for the second bit of padding */
-
     if ((suffix >= 0x80) && (partialBlock == (rateInBytes-1)))
         SnP_Permute(state);
     /* Second bit of padding */
-
     SnP_AddByte(state, 0x80, rateInBytes-1);
     #ifdef KeccakReference
     {
@@ -114,7 +107,6 @@ int Sponge(unsigned int rate, unsigned int capacity, const unsigned char *input,
     #endif
 
     /* First, output whole blocks */
-
     while(outputByteLen > (size_t)rateInBytes) {
         SnP_ExtractBytes(state, curOutput, 0, rateInBytes);
         SnP_Permute(state);
@@ -126,7 +118,6 @@ int Sponge(unsigned int rate, unsigned int capacity, const unsigned char *input,
     }
 
     /* Finally, output what remains */
-
     partialBlock = (unsigned int)outputByteLen;
     SnP_ExtractBytes(state, curOutput, 0, partialBlock);
     #ifdef KeccakReference
@@ -167,17 +158,14 @@ int SpongeAbsorb(SpongeInstance *instance, const unsigned char *data, size_t dat
     if (instance->squeezing)
         return 1; /* Too late for additional input */
 
-
     i = 0;
     curData = data;
     while(i < dataByteLen) {
         if ((instance->byteIOIndex == 0) && (dataByteLen >= (i + rateInBytes))) {
 #ifdef SnP_FastLoop_Absorb
             /* processing full blocks first */
-
             if ((rateInBytes % (SnP_width/200)) == 0) {
                 /* fast lane: whole lane rate */
-
                 j = SnP_FastLoop_Absorb(instance->state, rateInBytes/(SnP_width/200), curData, dataByteLen - i);
                 i += j;
                 curData += j;
@@ -199,7 +187,6 @@ int SpongeAbsorb(SpongeInstance *instance, const unsigned char *data, size_t dat
         }
         else {
             /* normal lane: using the message queue */
-
             partialBlock = (unsigned int)(dataByteLen - i);
             if (partialBlock+instance->byteIOIndex > rateInBytes)
                 partialBlock = rateInBytes-instance->byteIOIndex;
@@ -231,7 +218,6 @@ int SpongeAbsorbLastFewBits(SpongeInstance *instance, unsigned char delimitedDat
     if (instance->squeezing)
         return 1; /* Too late for additional input */
 
-
     #ifdef KeccakReference
     {
         unsigned char delimitedData1[1];
@@ -240,14 +226,11 @@ int SpongeAbsorbLastFewBits(SpongeInstance *instance, unsigned char delimitedDat
     }
     #endif
     /* Last few bits, whose delimiter coincides with first bit of padding */
-
     SnP_AddByte(instance->state, delimitedData, instance->byteIOIndex);
     /* If the first bit of padding is at position rate-1, we need a whole new block for the second bit of padding */
-
     if ((delimitedData >= 0x80) && (instance->byteIOIndex == (rateInBytes-1)))
         SnP_Permute(instance->state);
     /* Second bit of padding */
-
     SnP_AddByte(instance->state, 0x80, rateInBytes-1);
     #ifdef KeccakReference
     {
@@ -294,7 +277,6 @@ int SpongeSqueeze(SpongeInstance *instance, unsigned char *data, size_t dataByte
         }
         else {
             /* normal lane: using the message queue */
-
             if (instance->byteIOIndex == rateInBytes) {
                 SnP_Permute(instance->state);
                 instance->byteIOIndex = 0;
diff --git a/Modules/_sha3/kcp/PlSnP-Fallback.inc b/Modules/_sha3/kcp/PlSnP-Fallback.inc
deleted file mode 100644
index 3a9119ab4b6aa8..00000000000000
--- a/Modules/_sha3/kcp/PlSnP-Fallback.inc
+++ /dev/null
@@ -1,257 +0,0 @@
-/*
-Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
-Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
-denoted as "the implementer".
-
-For more information, feedback or questions, please refer to our websites:
-http://keccak.noekeon.org/
-http://keyak.noekeon.org/
-http://ketje.noekeon.org/
-
-To the extent possible under law, the implementer has waived all copyright
-and related or neighboring rights to the source code in this file.
-http://creativecommons.org/publicdomain/zero/1.0/
-*/
-
-/* expect PlSnP_baseParallelism, PlSnP_targetParallelism */
-
-/* expect SnP_stateSizeInBytes, SnP_stateAlignment */
-
-/* expect prefix */
-
-/* expect SnP_* */
-
-
-#define JOIN0(a, b)                     a ## b
-#define JOIN(a, b)                      JOIN0(a, b)
-
-#define PlSnP_StaticInitialize          JOIN(prefix, _StaticInitialize)
-#define PlSnP_InitializeAll             JOIN(prefix, _InitializeAll)
-#define PlSnP_AddByte                   JOIN(prefix, _AddByte)
-#define PlSnP_AddBytes                  JOIN(prefix, _AddBytes)
-#define PlSnP_AddLanesAll               JOIN(prefix, _AddLanesAll)
-#define PlSnP_OverwriteBytes            JOIN(prefix, _OverwriteBytes)
-#define PlSnP_OverwriteLanesAll         JOIN(prefix, _OverwriteLanesAll)
-#define PlSnP_OverwriteWithZeroes       JOIN(prefix, _OverwriteWithZeroes)
-#define PlSnP_ExtractBytes              JOIN(prefix, _ExtractBytes)
-#define PlSnP_ExtractLanesAll           JOIN(prefix, _ExtractLanesAll)
-#define PlSnP_ExtractAndAddBytes        JOIN(prefix, _ExtractAndAddBytes)
-#define PlSnP_ExtractAndAddLanesAll     JOIN(prefix, _ExtractAndAddLanesAll)
-
-#if (PlSnP_baseParallelism == 1)
-    #define SnP_stateSizeInBytes            JOIN(SnP, _stateSizeInBytes)
-    #define SnP_stateAlignment              JOIN(SnP, _stateAlignment)
-#else
-    #define SnP_stateSizeInBytes            JOIN(SnP, _statesSizeInBytes)
-    #define SnP_stateAlignment              JOIN(SnP, _statesAlignment)
-#endif
-#define PlSnP_factor ((PlSnP_targetParallelism)/(PlSnP_baseParallelism))
-#define SnP_stateOffset (((SnP_stateSizeInBytes+(SnP_stateAlignment-1))/SnP_stateAlignment)*SnP_stateAlignment)
-#define stateWithIndex(i) ((unsigned char *)states+((i)*SnP_stateOffset))
-
-#define SnP_StaticInitialize            JOIN(SnP, _StaticInitialize)
-#define SnP_Initialize                  JOIN(SnP, _Initialize)
-#define SnP_InitializeAll               JOIN(SnP, _InitializeAll)
-#define SnP_AddByte                     JOIN(SnP, _AddByte)
-#define SnP_AddBytes                    JOIN(SnP, _AddBytes)
-#define SnP_AddLanesAll                 JOIN(SnP, _AddLanesAll)
-#define SnP_OverwriteBytes              JOIN(SnP, _OverwriteBytes)
-#define SnP_OverwriteLanesAll           JOIN(SnP, _OverwriteLanesAll)
-#define SnP_OverwriteWithZeroes         JOIN(SnP, _OverwriteWithZeroes)
-#define SnP_ExtractBytes                JOIN(SnP, _ExtractBytes)
-#define SnP_ExtractLanesAll             JOIN(SnP, _ExtractLanesAll)
-#define SnP_ExtractAndAddBytes          JOIN(SnP, _ExtractAndAddBytes)
-#define SnP_ExtractAndAddLanesAll       JOIN(SnP, _ExtractAndAddLanesAll)
-
-void PlSnP_StaticInitialize( void )
-{
-    SnP_StaticInitialize();
-}
-
-void PlSnP_InitializeAll(void *states)
-{
-    unsigned int i;
-
-    for(i=0; i<PlSnP_factor; i++)
-    #if (PlSnP_baseParallelism == 1)
-        SnP_Initialize(stateWithIndex(i));
-    #else
-        SnP_InitializeAll(stateWithIndex(i));
-    #endif
-}
-
-void PlSnP_AddByte(void *states, unsigned int instanceIndex, unsigned char byte, unsigned int offset)
-{
-    #if (PlSnP_baseParallelism == 1)
-        SnP_AddByte(stateWithIndex(instanceIndex), byte, offset);
-    #else
-        SnP_AddByte(stateWithIndex(instanceIndex/PlSnP_baseParallelism), instanceIndex%PlSnP_baseParallelism, byte, offset);
-    #endif
-}
-
-void PlSnP_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
-{
-    #if (PlSnP_baseParallelism == 1)
-        SnP_AddBytes(stateWithIndex(instanceIndex), data, offset, length);
-    #else
-        SnP_AddBytes(stateWithIndex(instanceIndex/PlSnP_baseParallelism), instanceIndex%PlSnP_baseParallelism, data, offset, length);
-    #endif
-}
-
-void PlSnP_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
-{
-    unsigned int i;
-
-    for(i=0; i<PlSnP_factor; i++) {
-        #if (PlSnP_baseParallelism == 1)
-            SnP_AddBytes(stateWithIndex(i), data, 0, laneCount*SnP_laneLengthInBytes);
-        #else
-            SnP_AddLanesAll(stateWithIndex(i), data, laneCount, laneOffset);
-        #endif
-        data += PlSnP_baseParallelism*laneOffset*SnP_laneLengthInBytes;
-    }
-}
-
-void PlSnP_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
-{
-    #if (PlSnP_baseParallelism == 1)
-        SnP_OverwriteBytes(stateWithIndex(instanceIndex), data, offset, length);
-    #else
-        SnP_OverwriteBytes(stateWithIndex(instanceIndex/PlSnP_baseParallelism), instanceIndex%PlSnP_baseParallelism, data, offset, length);
-    #endif
-}
-
-void PlSnP_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
-{
-    unsigned int i;
-
-    for(i=0; i<PlSnP_factor; i++) {
-        #if (PlSnP_baseParallelism == 1)
-            SnP_OverwriteBytes(stateWithIndex(i), data, 0, laneCount*SnP_laneLengthInBytes);
-        #else
-            SnP_OverwriteLanesAll(stateWithIndex(i), data, laneCount, laneOffset);
-        #endif
-        data += PlSnP_baseParallelism*laneOffset*SnP_laneLengthInBytes;
-    }
-}
-
-void PlSnP_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount)
-{
-    #if (PlSnP_baseParallelism == 1)
-        SnP_OverwriteWithZeroes(stateWithIndex(instanceIndex), byteCount);
-    #else
-        SnP_OverwriteWithZeroes(stateWithIndex(instanceIndex/PlSnP_baseParallelism), instanceIndex%PlSnP_baseParallelism, byteCount);
-    #endif
-}
-
-void PlSnP_PermuteAll(void *states)
-{
-    unsigned int i;
-
-    for(i=0; i<PlSnP_factor; i++) {
-        #if (PlSnP_baseParallelism == 1)
-            SnP_Permute(stateWithIndex(i));
-        #else
-            SnP_PermuteAll(stateWithIndex(i));
-        #endif
-    }
-}
-
-#if (defined(SnP_Permute_12rounds) || defined(SnP_PermuteAll_12rounds))
-void PlSnP_PermuteAll_12rounds(void *states)
-{
-    unsigned int i;
-
-    for(i=0; i<PlSnP_factor; i++) {
-        #if (PlSnP_baseParallelism == 1)
-            SnP_Permute_12rounds(stateWithIndex(i));
-        #else
-            SnP_PermuteAll_12rounds(stateWithIndex(i));
-        #endif
-    }
-}
-#endif
-
-void PlSnP_ExtractBytes(void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length)
-{
-    #if (PlSnP_baseParallelism == 1)
-        SnP_ExtractBytes(stateWithIndex(instanceIndex), data, offset, length);
-    #else
-        SnP_ExtractBytes(stateWithIndex(instanceIndex/PlSnP_baseParallelism), instanceIndex%PlSnP_baseParallelism, data, offset, length);
-    #endif
-}
-
-void PlSnP_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
-{
-    unsigned int i;
-
-    for(i=0; i<PlSnP_factor; i++) {
-        #if (PlSnP_baseParallelism == 1)
-            SnP_ExtractBytes(stateWithIndex(i), data, 0, laneCount*SnP_laneLengthInBytes);
-        #else
-            SnP_ExtractLanesAll(stateWithIndex(i), data, laneCount, laneOffset);
-        #endif
-        data += laneOffset*SnP_laneLengthInBytes*PlSnP_baseParallelism;
-    }
-}
-
-void PlSnP_ExtractAndAddBytes(void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
-{
-    #if (PlSnP_baseParallelism == 1)
-        SnP_ExtractAndAddBytes(stateWithIndex(instanceIndex), input, output, offset, length);
-    #else
-        SnP_ExtractAndAddBytes(stateWithIndex(instanceIndex/PlSnP_baseParallelism), instanceIndex%PlSnP_baseParallelism, input, output, offset, length);
-    #endif
-}
-
-void PlSnP_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset)
-{
-    unsigned int i;
-
-    for(i=0; i<PlSnP_factor; i++) {
-        #if (PlSnP_baseParallelism == 1)
-            SnP_ExtractAndAddBytes(stateWithIndex(i), input, output, 0, laneCount*SnP_laneLengthInBytes);
-        #else
-            SnP_ExtractAndAddLanesAll(stateWithIndex(i), input, output, laneCount, laneOffset);
-        #endif
-        input += laneOffset*SnP_laneLengthInBytes*PlSnP_baseParallelism;
-        output += laneOffset*SnP_laneLengthInBytes*PlSnP_baseParallelism;
-    }
-}
-
-#undef PlSnP_factor
-#undef SnP_stateOffset
-#undef stateWithIndex
-#undef JOIN0
-#undef JOIN
-#undef PlSnP_StaticInitialize
-#undef PlSnP_InitializeAll
-#undef PlSnP_AddByte
-#undef PlSnP_AddBytes
-#undef PlSnP_AddLanesAll
-#undef PlSnP_OverwriteBytes
-#undef PlSnP_OverwriteLanesAll
-#undef PlSnP_OverwriteWithZeroes
-#undef PlSnP_PermuteAll
-#undef PlSnP_ExtractBytes
-#undef PlSnP_ExtractLanesAll
-#undef PlSnP_ExtractAndAddBytes
-#undef PlSnP_ExtractAndAddLanesAll
-#undef SnP_stateAlignment
-#undef SnP_stateSizeInBytes
-#undef PlSnP_factor
-#undef SnP_stateOffset
-#undef stateWithIndex
-#undef SnP_StaticInitialize
-#undef SnP_Initialize
-#undef SnP_InitializeAll
-#undef SnP_AddByte
-#undef SnP_AddBytes
-#undef SnP_AddLanesAll
-#undef SnP_OverwriteBytes
-#undef SnP_OverwriteWithZeroes
-#undef SnP_OverwriteLanesAll
-#undef SnP_ExtractBytes
-#undef SnP_ExtractLanesAll
-#undef SnP_ExtractAndAddBytes
-#undef SnP_ExtractAndAddLanesAll
diff --git a/Modules/_sha3/kcp/SnP-Relaned.h b/Modules/_sha3/kcp/SnP-Relaned.h
index 086e635ff8f6f3..631fb5ae2a478d 100644
--- a/Modules/_sha3/kcp/SnP-Relaned.h
+++ b/Modules/_sha3/kcp/SnP-Relaned.h
@@ -1,16 +1,23 @@
 /*
-Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
-Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
-denoted as "the implementer".
+The eXtended Keccak Code Package (XKCP)
+https://github.com/XKCP/XKCP
 
-For more information, feedback or questions, please refer to our websites:
-http://keccak.noekeon.org/
-http://keyak.noekeon.org/
-http://ketje.noekeon.org/
+Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer".
+
+For more information, feedback or questions, please refer to the Keccak Team website:
+https://keccak.team/
 
 To the extent possible under law, the implementer has waived all copyright
 and related or neighboring rights to the source code in this file.
 http://creativecommons.org/publicdomain/zero/1.0/
+
+---
+
+This file contains macros that help implement a permutation in a SnP-compatible way.
+It converts an implementation that implements state input/output functions
+in a lane-oriented fashion (i.e., using SnP_AddLanes() and SnP_AddBytesInLane(),
+and similarly for Overwrite, Extract and ExtractAndAdd) to the byte-oriented SnP.
+Please refer to SnP-documentation.h for more details.
 */
 
 #ifndef _SnP_Relaned_h_
diff --git a/Modules/_sha3/kcp/align.h b/Modules/_sha3/kcp/align.h
index 6650fe8c3cf0ef..82ad2f90516cd4 100644
--- a/Modules/_sha3/kcp/align.h
+++ b/Modules/_sha3/kcp/align.h
@@ -1,12 +1,11 @@
 /*
-Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
-Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
-denoted as "the implementer".
+The eXtended Keccak Code Package (XKCP)
+https://github.com/XKCP/XKCP
 
-For more information, feedback or questions, please refer to our websites:
-http://keccak.noekeon.org/
-http://keyak.noekeon.org/
-http://ketje.noekeon.org/
+Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer".
+
+For more information, feedback or questions, please refer to the Keccak Team website:
+https://keccak.team/
 
 To the extent possible under law, the implementer has waived all copyright
 and related or neighboring rights to the source code in this file.
@@ -17,7 +16,6 @@ and related or neighboring rights to the source code in this file.
 #define _align_h_
 
 /* on Mac OS-X and possibly others, ALIGN(x) is defined in param.h, and -Werror chokes on the redef. */
-
 #ifdef ALIGN
 #undef ALIGN
 #endif
diff --git a/Modules/_sha3/sha3module.c b/Modules/_sha3/sha3module.c
index cae10f99d5b8df..f16fb2e3015d04 100644
--- a/Modules/_sha3/sha3module.c
+++ b/Modules/_sha3/sha3module.c
@@ -22,14 +22,13 @@
 /* **************************************************************************
  *                          SHA-3 (Keccak) and SHAKE
  *
- * The code is based on KeccakCodePackage from 2016-04-23
- * commit 647f93079afc4ada3d23737477a6e52511ca41fd
+ * The code is based on eXtended Keccak Code Package from 2021-01-20
+ * commit cf2a63166998349447539626ea08c27a44f4857e
  *
  * The reference implementation is altered in this points:
- *  - C++ comments are converted to ANSI C comments.
  *  - all function names are mangled
- *  - typedef for UINT64 is commented out.
  *  - brg_endian.h is removed
+ *  - config.h is removed
  *
  * *************************************************************************/
 
@@ -37,23 +36,14 @@
   /* opt64 uses un-aligned memory access that causes a BUS error with msg
    * 'invalid address alignment' on SPARC. */
   #define KeccakOpt 32
-#elif PY_BIG_ENDIAN
-  /* opt64 is not yet supported on big endian platforms */
-  #define KeccakOpt 32
 #elif SIZEOF_VOID_P == 8
-  /* opt64 works only on little-endian 64bit platforms with unsigned int64 */
+  /* opt64 works only on 64bit platforms with unsigned int64 */
   #define KeccakOpt 64
 #else
   /* opt32 is used for the remaining 32 and 64bit platforms */
   #define KeccakOpt 32
 #endif
 
-#if KeccakOpt == 64
-  /* 64bit platforms with unsigned int64 */
-  typedef uint64_t UINT64;
-  typedef unsigned char UINT8;
-#endif
-
 /* replacement for brg_endian.h */
 #define IS_LITTLE_ENDIAN 1234
 #define IS_BIG_ENDIAN 4321
@@ -84,6 +74,7 @@
 #define KeccakP1600_OverwriteBytesInLane _PySHA3_KeccakP1600_OverwriteBytesInLane
 #define KeccakP1600_OverwriteLanes _PySHA3_KeccakP1600_OverwriteLanes
 #define KeccakP1600_OverwriteWithZeroes _PySHA3_KeccakP1600_OverwriteWithZeroes
+#define KeccakP1600_Permute_Nrounds _PySHA3_KeccakP1600_Permute_Nrounds
 #define KeccakP1600_Permute_12rounds _PySHA3_KeccakP1600_Permute_12rounds
 #define KeccakP1600_Permute_24rounds _PySHA3_KeccakP1600_Permute_24rounds
 #define KeccakWidth1600_Sponge _PySHA3_KeccakWidth1600_Sponge
@@ -95,12 +86,12 @@
 #define KeccakP1600_AddByte _PySHA3_KeccakP1600_AddByte
 #define KeccakP1600_Permute_Nrounds _PySHA3_KeccakP1600_Permute_Nrounds
 #define KeccakP1600_SetBytesInLaneToZero _PySHA3_KeccakP1600_SetBytesInLaneToZero
+#elif KeccakOpt == 64
+#define KeccakP1600_12rounds_FastLoop_Absorb _PySHA3_KeccakP1600_12rounds_FastLoop_Absorb
 #endif
 
 /* we are only interested in KeccakP1600 */
-#define KeccakP200_excluded 1
-#define KeccakP400_excluded 1
-#define KeccakP800_excluded 1
+#define XKCP_has_KeccakP1600 1
 
 /* inline all Keccak dependencies */
 #include "kcp/KeccakHash.h"
@@ -243,7 +234,7 @@ py_sha3_new_impl(PyTypeObject *type, PyObject *data, int usedforsecurity)
         else {
             res = SHA3_process(&self->hash_state, buf.buf, buf.len * 8);
         }
-        if (res != SUCCESS) {
+        if (res != KECCAK_SUCCESS) {
             PyErr_SetString(PyExc_RuntimeError,
                             "internal error in SHA3 Update()");
             goto error;
@@ -322,7 +313,7 @@ _sha3_sha3_224_digest_impl(SHA3object *self)
     SHA3_copystate(temp, self->hash_state);
     LEAVE_HASHLIB(self);
     res = SHA3_done(&temp, digest);
-    if (res != SUCCESS) {
+    if (res != KECCAK_SUCCESS) {
         PyErr_SetString(PyExc_RuntimeError, "internal error in SHA3 Final()");
         return NULL;
     }
@@ -350,7 +341,7 @@ _sha3_sha3_224_hexdigest_impl(SHA3object *self)
     SHA3_copystate(temp, self->hash_state);
     LEAVE_HASHLIB(self);
     res = SHA3_done(&temp, digest);
-    if (res != SUCCESS) {
+    if (res != KECCAK_SUCCESS) {
         PyErr_SetString(PyExc_RuntimeError, "internal error in SHA3 Final()");
         return NULL;
     }
@@ -396,7 +387,7 @@ _sha3_sha3_224_update(SHA3object *self, PyObject *data)
         res = SHA3_process(&self->hash_state, buf.buf, buf.len * 8);
     }
 
-    if (res != SUCCESS) {
+    if (res != KECCAK_SUCCESS) {
         PyBuffer_Release(&buf);
         PyErr_SetString(PyExc_RuntimeError,
                         "internal error in SHA3 Update()");
@@ -618,12 +609,12 @@ _SHAKE_digest(SHA3object *self, unsigned long digestlen, int hex)
     SHA3_copystate(temp, self->hash_state);
     LEAVE_HASHLIB(self);
     res = SHA3_done(&temp, NULL);
-    if (res != SUCCESS) {
+    if (res != KECCAK_SUCCESS) {
         PyErr_SetString(PyExc_RuntimeError, "internal error in SHA3 done()");
         goto error;
     }
     res = SHA3_squeeze(&temp, digest, digestlen * 8);
-    if (res != SUCCESS) {
+    if (res != KECCAK_SUCCESS) {
         PyErr_SetString(PyExc_RuntimeError, "internal error in SHA3 Squeeze()");
         return NULL;
     }