diff --git a/CMakeLists.txt b/CMakeLists.txt
index c78ad9449..33992dec8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -71,6 +71,7 @@ add_subdirectory(cmake)
 add_subdirectory(dcp)
 add_subdirectory(divider)
 add_subdirectory(dma)
+add_subdirectory(encrypted)
 add_subdirectory(flash)
 add_subdirectory(gpio)
 add_subdirectory(hstx)
diff --git a/README.md b/README.md
index 5a436a348..f718271ea 100644
--- a/README.md
+++ b/README.md
@@ -86,6 +86,12 @@ App|Description
 [channel_irq](dma/channel_irq) | Use an IRQ handler to reconfigure a DMA channel, in order to continuously drive data through a PIO state machine.
 [sniff_crc](dma/sniff_crc) | Use the DMA engine's 'sniff' capability to calculate a CRC32 on a data buffer.
 
+### Encrypted (RP235x Only)
+
+App|Description
+---|---
+[hello_encrypted](encrypted/hello_encrypted) | Create a self-decrypting binary.
+
 ### HSTX (RP235x Only)
 
 App|Description
diff --git a/bootloaders/encrypted/CMakeLists.txt b/bootloaders/encrypted/CMakeLists.txt
index f29f0efe2..2d6d77f0d 100644
--- a/bootloaders/encrypted/CMakeLists.txt
+++ b/bootloaders/encrypted/CMakeLists.txt
@@ -4,17 +4,6 @@ add_executable(enc_bootloader
         aes.S
         )
 
-# Add command to update otp.json if privateaes.bin changes
-add_custom_command(OUTPUT ${CMAKE_CURRENT_LIST_DIR}/otp.json
-    COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_LIST_DIR}/update-key.cmake"
-    DEPENDS ${CMAKE_CURRENT_LIST_DIR}/privateaes.bin)
-# Copy that otp.json file to build directory
-add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/otp.json
-    COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_LIST_DIR}/otp.json" "${CMAKE_CURRENT_BINARY_DIR}/otp.json"
-    DEPENDS ${CMAKE_CURRENT_LIST_DIR}/otp.json)
-add_custom_target(otp_json DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/otp.json)
-add_dependencies(enc_bootloader otp_json)
-
 # pull in common dependencies
 target_link_libraries(enc_bootloader pico_stdlib pico_rand)
 
@@ -46,11 +35,8 @@ function(add_linker_script target origin length)
     pico_set_linker_script(${target} ${CMAKE_CURRENT_BINARY_DIR}/${target}.ld)
 endfunction()
 
-# create linker script to run from 0x20070000
-add_linker_script(enc_bootloader "0x20070000" "64k")
-
-# configure otp output
-pico_set_otp_key_output_file(enc_bootloader ${CMAKE_CURRENT_BINARY_DIR}/otp.json)
+# create linker script to run from 0x20078000
+add_linker_script(enc_bootloader "0x20078000" "32k")
 
 # sign, hash, and clear SRAM
 pico_sign_binary(enc_bootloader ${CMAKE_CURRENT_LIST_DIR}/private.pem)
@@ -86,10 +72,13 @@ pico_set_binary_type(hello_serial_enc no_flash)
 # create linker script to ensure it doesn't overwrite the bootloader at 0x20070000
 add_linker_script(hello_serial_enc "0x20000000" "448k")
 
+# configure otp output
+pico_set_otp_key_output_file(hello_serial_enc ${CMAKE_CURRENT_BINARY_DIR}/otp.json)
+
 # sign, hash, and encrypt
 pico_sign_binary(hello_serial_enc ${CMAKE_CURRENT_LIST_DIR}/private.pem)
 pico_hash_binary(hello_serial_enc)
-pico_encrypt_binary(hello_serial_enc ${CMAKE_CURRENT_LIST_DIR}/privateaes.bin)
+pico_encrypt_binary(hello_serial_enc ${CMAKE_CURRENT_LIST_DIR}/privateaes.bin ${CMAKE_CURRENT_LIST_DIR}/ivsalt.bin)
 
 # package uf2 in flash
 pico_package_uf2_output(hello_serial_enc 0x10000000)
diff --git a/bootloaders/encrypted/README.md b/bootloaders/encrypted/README.md
index f079d9469..0e10e5e3d 100644
--- a/bootloaders/encrypted/README.md
+++ b/bootloaders/encrypted/README.md
@@ -1,15 +1,32 @@
-Replace private.pem and privateaes.bin with your own keys - your signing key must be for the _secp256k1_ curve, in PEM format. You can create a .PEM file with:
+For security you **must** replace private.pem and privateaes.bin with your own keys, and ivsalt.bin with your own per-device salt. Make sure you **don't lose your keys and salts**, otherwise you may not be able to update the code on your device.
+
+Your signing key must be for the _secp256k1_ curve, in PEM format. You can create a .PEM file with:
 
 ```bash
 openssl ecparam -name secp256k1 -genkey -out private.pem
 ```
 
-The AES key is just be a 32 byte binary file - you can create one with
+The AES key is stored as a 4-way share in a 128-byte binary file - you can create one with
+
+```bash
+dd if=/dev/urandom of=privateaes.bin bs=1 count=128
+```
+
+or in Powershell 7
+```powershell
+[byte[]] $(Get-SecureRandom -Maximum 256 -Count 128) | Set-Content privateaes.bin -AsByteStream
+```
+
+The IV salt is just a 16-byte binary file - you can create it the same way, replacing `128` with `16` and `privateaes.bin` with `ivsalt.bin` in the commands above.
 
+You will need to program your OTP using the `otp.json` file generated by the build, which can be found in your build folder.
+NOTE: This will enable secure boot on your device, so only correctly signed binaries can then run, and it will also lock down the OTP pages in which the AES key and IV salt are stored.
 ```bash
-dd if=/dev/urandom of=privateaes.bin bs=1 count=32
+picotool otp load otp.json
 ```
 
+> For more information on security, see chapter 10 of the [RP2350 datasheet](https://datasheets.raspberrypi.com/rp2350/rp2350-datasheet.pdf), and for information on how to sign other binaries to run on a secure chip, see section 5.10
+
 Then either drag & drop the UF2 files to the device in order (enc_bootloader first, then hello_serial_enc) waiting for a reboot in-between, or run
 ```bash
 picotool load enc_bootloader.uf2
diff --git a/bootloaders/encrypted/aes.S b/bootloaders/encrypted/aes.S
index feccaae68..093c4b0f1 100644
--- a/bootloaders/encrypted/aes.S
+++ b/bootloaders/encrypted/aes.S
@@ -1,45 +1,31 @@
+/* MEMORY LAYOUT ASSUMPTIONS
+
+The "chaff" area must be located at the start of Y scratch RAM, 0x20081000: see
+the macro getchaffaddress.
+
+The stack must be located at the end of Y scratch RAM: see the memory
+wiping at the end of ctr_crypt_s where memory between the start of Y
+scratch RAM and the stack pointer is overwritten.
+*/
+
 .syntax unified
 .cpu cortex-m33
 .thumb
 
+#include "config.h"
 #include "hardware/platform_defs.h"
 #include "hardware/regs/addressmap.h"
+#include "hardware/regs/clocks.h"
 #include "hardware/regs/sha256.h"
+#include "hardware/regs/resets.h"
+#include "hardware/regs/rosc.h"
+#include "hardware/regs/trng.h"
 #include "hardware/rcp.h"
 
-#include "config.h"
-
-.global delay
-.global aes_start
-.global aes_end
-.global flush_reg
-.global isr_systick
-.extern systick_data
-
-.global gen_lut_inverse
-.global gen_lut_sbox
-.if NEED_INV_ROUNDS
-.global gen_lut_inv_sbox
-.endif
-
-.if INCLUDE_ENCRYPT_CBC
-.global cbc_encrypt_s
-.endif
-.if INCLUDE_DECRYPT_CBC
-.global cbc_decrypt_s
-.endif
-.if INCLUDE_CRYPT_CTR
-.global ctr_crypt_s
-.endif
-
-.global remap
-.global gen_rand
-.global init_key
+.global decrypt
+.global chaff
 
-.global rkey_s
-.global lut_a,lut_a_map
-.global lut_b,lut_b_map
-.global rstate
+.extern lock_key
 
 @ RCP macros
 
@@ -53,18 +39,23 @@
 #define CTAG7  0x32
 #define CTAG8  0x33
 #define CTAG9  0x34
-#define CTAG10 0x35
-#define CTAG11 0x36
+#define CTAG10 0x35 @ not used
+#define CTAG11 0x36 @ not used
 #define CTAG12 0x37
 #define CTAG13 0x38
 #define CTAG14 0x39
 #define CTAG15 0x3a
 #define CTAG16 0x3b
 #define CTAG17 0x3c
+#define CTAG18 0x3d @ not used
 
-.macro SET_COUNT n
+@ number of blocks from the TRNG processed to initialise rstate_sha
+#define TRNG_BLOCKS 25
+
+@ The lower jitterpriority is, the more the jitter
+.macro SET_COUNT n,jitterpriority
 .if RC_COUNT
-.if RC_JITTER
+.if RC_JITTER > \jitterpriority
  rcp_count_set \n
 .else
  rcp_count_set_nodelay \n
@@ -72,9 +63,9 @@
 .endif
 .endm
 
-.macro CHK_COUNT n
+.macro CHK_COUNT n,jitterpriority
 .if RC_COUNT
-.if RC_JITTER
+.if RC_JITTER > \jitterpriority
  rcp_count_check \n
 .else
  rcp_count_check_nodelay \n
@@ -82,9 +73,9 @@
 .endif
 .endm
 
-.macro GET_CANARY rx,tag
+.macro GET_CANARY rx,tag,jitterpriority
 .if RC_CANARY
-.if RC_JITTER
+.if RC_JITTER > \jitterpriority
  rcp_canary_get \rx,\tag
 .else
  rcp_canary_get_nodelay \rx,\tag
@@ -92,9 +83,9 @@
 .endif
 .endm
 
-.macro CHK_CANARY rx,tag
+.macro CHK_CANARY rx,tag,jitterpriority
 .if RC_CANARY
-.if RC_JITTER
+.if RC_JITTER > \jitterpriority
  rcp_canary_check \rx,\tag
 .else
  rcp_canary_check_nodelay \rx,\tag
@@ -102,25 +93,81 @@
 .endif
 .endm
 
-.macro GET_CANARY_NJ rx,tag  @ with no jitter even if you ask for it (otherwise slows down gen_rand a lot)
-.if RC_CANARY
- rcp_canary_get_nodelay \rx,\tag
-.endif
+@ Clear internal stripe load registers, and r0-r3
+@ 0 <= offset <= 32
+.macro clear03 offset=0
+ getchaffaddress r0,\offset
+ ldmia r0,{r0-r3}
 .endm
 
-.macro CHK_CANARY_NJ rx,tag  @ with no jitter even if you ask for it
-.if RC_CANARY
- rcp_canary_check_nodelay \rx,\tag
-.endif
+.macro clear03_preserve_r3 offset=0
+ getchaffaddress r0,\offset
+ ldmia r0!,{r1-r2}
+ ldmia r0!,{r1-r2}
+.endm
+
+.macro clear01 offset=0
+ getchaffaddress r0,\offset
+ ldmia r0,{r0,r1}
+.endm
+
+@ Put workspace in the second scratch area
+@ The "a"=allocatable attribute (and possibly the %progbits attribute) are necessary to store the murmur3 constants,
+@ otherwise they may end up silently replaced with 0 or 0xffffffff
+.section .scratch_y.aes,"aw",%progbits
+
+workspace_start:
+
+@ chaff has to be at the start of scratch_y = 0x20081000 because this is assumed by the following macro, getchaffaddress
+@ (It seems ADR does not work, nor is it possible to assert that chaff==0x20081000)
+@ getchaffaddress is used by clear03 and clear01 and other sensitive cases which require the first load to be a random one
+@ chaff has to be 0 mod 16 for other reasons
+.macro getchaffaddress rx,offset=0
+@ ldr \rx,=(chaff+\offset)
+ mov \rx,#(0x1000+\offset)
+ movt \rx,#0x2008
 .endm
+chaff:
+.space 48
+
+.balign 16
+rkey_s:                      @ round key shares: 600 bytes = 15 rounds * 2 shares * (4+1) words
+                             @ see comment at init_key_4way for description of layout and meaning of rkey_s
+.space 600
+rkey4way:                    @ scratch area for init_key_4way; could overlap this with other scratch space if need to save space
+.space 128
+.if CT_BPERM
+bperm_rand:                  @ 32 half words that define the oblivious permutation of blocks
+.space 64
+.endif
 
-.section .stack.aes
-@ Regardless of configuration the code uses a single 256-entry LUT. If both
-@ encryption and decryption are enabled then this is a table of inverses
-@ of GF(2⁸) field elements, from which both the S-box and inverse S-box
-@ functions can be derived; otherwise it can be a simple inverse S-box
-@ table.
-@ In either case the LUT is represented as two shares, lut_a and lut_b,
+.balign 16
+permscratch:                 @ Must be 0 mod 16; 16 bytes of scratch space to store permutation(s)
+perm16:
+.space 16
+@ Scratch space of 32 bytes used both by init_key_sbox and map_sbox_s
+.balign 16
+fourway:                     @ Must be 0 mod 16
+shareA:                      @ 0 mod 16
+.space 20                    @ Only need 16 bytes, but choosing shareB!=shareA mod 16
+shareB:                      @ 4 mod 16
+.space 20
+shareC:                      @ 8 mod 16
+.space 4
+statevperm:                  @ 12 mod 16
+.space 4                     @ vperm state rotation: only last two bits are operational; other bits random
+RKshareC:                    @ Round key common share C; see comment at init_key_4way for explanation
+.space 4
+RKshareCchange:              @ Temporary used by ref_roundkey_share_s
+.space 4
+IV0:                         @ 2-way share of IV for block 0
+.space 36                    @ Considering IV0 as a word pointer, the format is IV = IV0[0,1,2,3] ^ (IV0[5,6,7,8],ror#16)
+                             @ The gap at IV0[4] is to defeat unsharing by internal striped memory registers
+                             @ I.e., there are implicit XORs IV0[0]^IV0[4], IV0[1]^IV0[5], ..., that the 1 word offset renders useless
+
+@ Regardless of configuration, the code uses a single 256-entry LUT,
+@ which is a simple S-box table.
+@ The LUT is represented as two shares, lut_a and lut_b,
 @ whose values must be EORed. Furthermore, the contents of each share are
 @ scambled according to a 4-byte "map". The map comprises two bytes that
 @ are EORed into the addressing of the share, and two bytes that are
@@ -133,67 +180,200 @@
 @ shares, namely
 @ lut_a[x ^ a₀ ^ a₁] ^ c₀ ^ d₀  and
 @ lut_b[x ^ b₀ ^ b₁] ^ c₁ ^ d₁
-lut_a:                       @ LUT share A
-.space 256
+.balign 16
+lut_a:                       @ LUT share A (must be 0 mod 16 so that init_key_sbox knows how to mask the lookup)
+.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5,0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76
+.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0,0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0
+.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc,0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15
+.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a,0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75
+.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0,0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84
+.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b,0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf
+.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85,0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8
+.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5,0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2
+.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17,0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73
+.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88,0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb
+.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c,0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79
+.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9,0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08
+.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6,0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a
+.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e,0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e
+.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94,0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf
+.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68,0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
 lut_a_map:                   @ the current scrambling of lut_a; not particularly secret since it can be deduced from the contents of lut_a and lut_b
 .space 4
-.space 4                     @ align to multiple of 8
-lut_b:                       @ LUT share B
+.space 4                     @ align to 8 mod 16
+lut_b:                       @ LUT share B (must be 8 mod 16 so that init_key_sbox knows how to mask the lookup)
 .space 256
 lut_b_map:
 .space 4
 .space 4                     @ align to multiple of 8
-rkey_s:                      @ round key shares
-.if RK_ROR
-.space 600
-.else
-.space 480
-.endif
-.if CT_BPERM
-ctr_scratch:                 @ scratch area for CTR code to use when "decrypting" out-of-range blocks
+
+.balign 16
+rstate_all_start:            @ Mark start of RNG data to allow selective memory wipe
+rstate_sha:                  @ 128-bit SHA random state, to be initialised to TRNG bytes; zeroth byte must be initialised to zero
 .space 16
+jstate:                      @ 32-bit jitter state
+.space 4
+rstate_lfsr:                 @ 32-bit LFSR random state and constant used to step it
+.space 4
+.word 0x1d872b41             @ constant that defines a maximal-length LFSR
+rstate_all_end:              @ Mark end of RNG data to allow selective memory wipe
+
+.if CT_BPERM
+.balign 16
+murmur3_constants:           @ Five constants used in murmur3_32 hash
+.word 0xcc9e2d51
+.word 0x1b873593
+.word 0xe6546b64
+.word 0x85ebca6b
+.word 0xc2b2ae35
 .endif
-rstate:                      @ SHA random state, to be initialised to TRNG bytes; zeroth byte must be initialised to zero
-.space 16
 
-.section .text.aes,"ax",%progbits
+scratch_y_end:
 
-.thumb_func
-aes_start:
- nop
+@ Initialisation code in main .text section
+.section .text,"ax",%progbits
 
-.if GEN_RAND_SHA
+@ The following is copied from the A2 boot ROM code at src/main/arm/varm_boot_path.c with adjustments.
+@ We feed a stream of bits from the TRNG into the SHA hardware accelerator to generate some
+@ random numbers.
+@ Trashes r0-r6
 .balign 4
-.thumb_func
-@ random numbers using SHA256 hardware
-@ preserves r1-r3
-gen_rand:
- GET_CANARY_NJ r0,CTAG1
- push {r0-r3,r14}
- ldr r0,=#SHA256_BASE
-4:
- ldr r2,=#rstate
- ldrb r1,[r2]                @ get word counter from bottom byte of rstate[] (offset into SUM registers)
- subs r3,r1,#4               @ decrement it to previous SUM register
- ble 1f                      @ if the offset was 4 or less we have run out of SUM register values
+init_rstate:
+ CHK_COUNT 24,6
+ ldr r4,=TRNG_BASE+TRNG_RNG_IMR_OFFSET
+ ldr r5,=SHA256_BASE
+ movs r1,#1
+ str r1,[r4,#TRNG_TRNG_SW_RESET_OFFSET     -TRNG_RNG_IMR_OFFSET]
+ ldr r6,[r4,#TRNG_TRNG_SW_RESET_OFFSET     -TRNG_RNG_IMR_OFFSET]     @ reads as 0
+ movw r1,#SHA256_CSR_RESET|SHA256_CSR_START_BITS                     @ initialise SHA internal state by writing START bit
+ str r1,[r5,#SHA256_CSR_OFFSET]
+ str r6,[r4,#TRNG_SAMPLE_CNT1_OFFSET       -TRNG_RNG_IMR_OFFSET]
+ movs r6,#TRNG_BLOCKS*2+1                                            @ odd so that we break out of the loop half-way through loading the SHA hardware, giving
+                                                                     @ time for previous SHA computation to complete
+2:
+ movs r1,#0xff                                                       @ TRNG setup is inside loop in case it is skipped.
+ str r1,[r4,#TRNG_TRNG_DEBUG_CONTROL_OFFSET-TRNG_RNG_IMR_OFFSET]     @ disable checks and bypass decorrelators, to stream raw TRNG ROSC samples
+ str r1,[r4,#TRNG_RND_SOURCE_ENABLE_OFFSET -TRNG_RNG_IMR_OFFSET]     @ start ROSC if it is not already started
+ str r1,[r4,#TRNG_RNG_ICR_OFFSET           -TRNG_RNG_IMR_OFFSET]     @ clear all interrupts (including EHR_VLD)
+ adds r0,r4,#TRNG_EHR_DATA0_OFFSET         -TRNG_RNG_IMR_OFFSET
+ movs r2,#TRNG_TRNG_BUSY_OFFSET            -TRNG_RNG_IMR_OFFSET
+1:
+ ldr r1,[r4,r2]                                                      @ wait for 192 ROSC samples to fill EHR, should take constant time
+ cmp r1,#0
+ bne 1b
+ subs r6,#1                                                          @ done?
+ beq 3f
+ movs r1,#8
+1:
+ ldmia r0!,{r2}                                                      @ copy 6 EHR words to SHA-256, plus garbage (RND_SOURCE_ENABLE and SAMPLE_CNT1)
+ str r2,[r5,#SHA256_WDATA_OFFSET]                                    @ for a total of half a SHA-256 block
+ subs r1,#1
+ bne 1b
+ ldr r2,[r5,#SHA256_SUM0_OFFSET]                                     @ TRNG is now sampling again; use some SHA bits to modulate the chain length
+ str r2,[r4,#TRNG_TRNG_CONFIG_OFFSET       -TRNG_RNG_IMR_OFFSET]
+ b.n 2b
+
+3:
+ CHK_COUNT 25,6
+ str r1,[r4,#TRNG_TRNG_CONFIG_OFFSET       -TRNG_RNG_IMR_OFFSET]     @ turn off rand source and wipe SHA bits left in TRNG config; r1=0
+ str r1,[r4,#TRNG_RND_SOURCE_ENABLE_OFFSET -TRNG_RNG_IMR_OFFSET]
+ adds r5,r5,#SHA256_SUM0_OFFSET
+@ r5=SHA256 SUM0 register (r5+4=SUM1, r4+8=SUM2, etc)
+ ldmia r5,{r0-r3}  @ load first 4 words of the 8 word SHA256 output
+ ldr r6,=rstate_sha
+@ r5=SHA256 SUM0 register (r5+4=SUM1, r4+8=SUM2, etc), r6=rstate_sha
+ stmia r6,{r0-r3}
+ CHK_COUNT 26,6
+ movs r0,#0
+ strb r0,[r6]      @ make sure rstate_sha[0] has byte 0 set to 0, representing "out of data"
+
+@ try to find a non-zero initialiser to create a non-degenerate LFSR random state
+ ldr r1,[r5,#16]   @ SHA SUM4
+ cbnz r1,1f        @ is word 4 non-zero? then use it
+ ldr r1,[r5,#20]   @ SHA SUM5
+ cbnz r1,1f        @ otherwise, is word 5 non-zero? use it
+ mov r1,r6         @ give up and use the address of rstate_sha (which is non-zero); this can't really happen (2^{-64} probability)
+1:
+ str r1,[r6,#rstate_lfsr-rstate_sha]
+ 
+@ try to find a non-zero initialiser to create a non-degenerate ROSC random state
+ ldr r1,[r5,#24]   @ SHA SUM6
+ cbnz r1,1f        @ is word 6 non-zero? then use it
+ ldr r1,[r5,#28]   @ SHA SUM7
+ cbnz r1,1f        @ otherwise, is word 7 non-zero? use it
+ mov r1,r6         @ give up and use the address of rstate_sha (which is non-zero); this can't really happen (2^{-64} probability)
+1:
+ ldr r2,=ROSC_RANDOM_OFFSET+ROSC_BASE
+ str r1,[r2,#0]    @ Initialise ROSC LFSR
+ CHK_COUNT 27,6
+ 
+.if GEN_RAND_SHA
+.if SH_JITTER
+ movs r2,#0
+ str r2,[r6,#jstate-rstate_sha]
+.endif
+.endif
+
+ CHK_COUNT 28,6
+ bx r14
+
+@ Put AES core code in first scratch area
+.section .scratch_x.aes,"ax",%progbits
+
+.if GEN_RAND_SHA
+@ we need SHA256_SUM0_OFFSET==8 (see note below)
 .if SHA256_SUM0_OFFSET!=8
 .err
 .endif
-2:
- ldr r0,[r0,r1]              @ read value from SUM register: note that this relies on SHA256_SUM0_OFFSET==8
- strb r3,[r2]                @ save updated SUM register offset in bottom byte of rstate[]
- pop {r1}
- CHK_CANARY_NJ r1,CTAG1
- pop {r1-r3,r15}
 
+@ Return single random word in r0
+@ Preserves r1-r13
+.balign 4
+gen_rand_sha:
+ push {r14}
+ GET_CANARY r14,CTAG1,2
+ push {r1-r3,r14}
+.if SH_JITTER
+ ldr r2,=rstate_sha
+ ldr r0,[r2,#jstate-rstate_sha]
+ movs r1,#1
+ ands r3,r0,#3
+ movs r3,r3,lsl#2
+ movs r3,r1,lsl r3       @ 1<<(4*(r0&3))
+ udiv r3,r3,r1           @ Takes constant + (r0&3) cycles
+ lsrs r0,r0,#2
+ bne 1f
+ bl gen_rand_sha_nonpres
+ ldr r2,=rstate_sha
+1:
+ str r0,[r2,#jstate-rstate_sha]
+.endif
+ bl gen_rand_sha_nonpres
+ pop {r1-r3,r14}
+ CHK_CANARY r14,CTAG1,0
+ pop {r15}
+
+@ Return single random word in r0
+@ Trashes r1-r3
+.balign 4
+gen_rand_sha_nonpres:
+ ldr r0,=SHA256_BASE
+ ldr r2,=rstate_sha
+ ldrb r1,[r2]                @ get word counter from bottom byte of rstate_sha[] (offset into SUM registers)
+ subs r3,r1,#4               @ decrement it to previous SUM register
+ ble 1f                      @ if the offset was 4 or less we have run out of SUM register values
+ ldr r0,[r0,r1]              @ read value from SUM register: note that this relies on SHA256_SUM0_OFFSET==8
+ strb r3,[r2]                @ save updated SUM register offset in bottom byte of rstate_sha[]
+ bx r14
 1:
+@ [CK_JITTER code was here]
  movs r3,#SHA256_SUM6_OFFSET+1
  strb r3,[r2]                @ reset word counter: the +1 is compensated for later
  movw r1,#(1<<SHA256_CSR_BSWAP_LSB)+(1<<SHA256_CSR_START_LSB)
  str r1,[r0,#SHA256_CSR_OFFSET]        @ start SHA256 hardware
- movs r3,#3                  @ take four words from rstate, incrementing as we go
+ movs r3,#3                  @ take four words from rstate_sha, incrementing as we go
  ldr r1,[r2]
- adds r1,r1,#255             @ overall this adds 256 to the value in rstate and resets the bottom byte to SHA256_SUM6_OFFSET
+ adds r1,r1,#255             @ overall this adds 256 to the value in rstate_sha and resets the bottom byte to SHA256_SUM6_OFFSET
 1:
  str r1,[r2],#4
  str r1,[r0,#SHA256_WDATA_OFFSET]
@@ -203,96 +383,260 @@ gen_rand:
  sub r3,r3,#1                @ preserve the carry
  b 1b
 3:
- ldr r1,=#1223352428         @ 12 more words with a fixed value
- movs r3,#12
-1:
+ movs r1,#0x80               @ End of message bit (with byte-swapped endianity) = start of message padding
  str r1,[r0,#SHA256_WDATA_OFFSET]
- subs r3,r3,#1
+ movs r1,#10
+1:
+ str r3,[r0,#SHA256_WDATA_OFFSET]
+ subs r1,r1,#1
  bne 1b
+ mov r1,#0x80000000          @ Specifies message length = 128 bits (with byte-swapped endianity)
+ str r1,[r0,#SHA256_WDATA_OFFSET]
 1:
  ldr r3,[r0,#SHA256_CSR_OFFSET]
  lsrs r3,r3,#SHA256_CSR_SUM_VLD_LSB+1
  bcc 1b                      @ wait for hardware to finish
  ldr r0,[r0,#SHA256_SUM7_OFFSET]
- pop {r1}
- CHK_CANARY_NJ r1,CTAG1
- pop {r1-r3,r15}
-
-.else
+ bx r14
+.endif
 
-@ preserves r1-r3
+@ simple LFSR rand versions
+@ return a random number in r0
+@ This version preserves all r1-r13
+@ 23 or 24 cycles including branch = 23 or 24 cycles/word
+@ (would be 20 or 21 cycles if written out)
 .balign 4
 .thumb_func
-gen_rand:
- GET_CANARY_NJ r0,CTAG1
- push {r0,r1,r14}
- ldr r14,=rstate
- ldr r0,[r14]
- ldr r1,=0x1d872b41         @ constant for a maximum-length sequence
- and r1,r1,r0,asr#31         @ will we be shifting out a 1? keep the constant, otherwise 0
- eor r0,r1,r0,lsl#1
- str r0,[r14]
- pop {r1}
- CHK_CANARY_NJ r1,CTAG1
- pop {r1,r15}
+.if !GEN_RAND_SHA
+gen_rand_sha:
+gen_rand_lfsr:               @ Not used
+ push {r14}
+ GET_CANARY r14,CTAG2,2
+ push {r1,r2,r14}
+ bl gen_rand_lfsr_nonpres
+ pop {r1,r2,r14}
+ CHK_CANARY r14,CTAG2,0
+ pop {r15}
+.endif
 
+@ Trashes r1,r2
+@ 12 cycles including branch = 12 cycles/word
+.balign 4
+.if !GEN_RAND_SHA
+gen_rand_sha_nonpres:
 .endif
+gen_rand_lfsr_nonpres:
+ ldr r2,=rstate_lfsr
+ ldmia r2,{r0-r1}           @ r0=state_in, r1=0x1d872b41=constant for a maximum-length sequence
+ and r1,r1,r0,asr#31        @ will we be shifting out a 1? keep the constant, otherwise 0
+ eor r0,r1,r0,lsl#1
+ str r0,[r2]
+ bx r14
+
+.macro loadlfsr
+ ldr r2,=rstate_lfsr
+ ldmia r2,{r0-r1}           @ r0=lfsr_state, r1=lfsr_const=0x1d872b41 for a maximum-length sequence
+.endm
+
+.macro steplfsr
+ ands r3,r1,r0,asr#31       @ will we be shifting out a 1? keep the constant, otherwise 0
+ eors r0,r3,r0,lsl#1
+.endm
+
+.macro savelfsr
+ str r0,[r2]
+.endm
+
 .ltorg
 
 .balign 4
 .thumb_func
-gen_lut_inverse:
-@ set lut_a to be a table of GF(2⁸) inverses, using lut_b as temporary storage
-@ return r0=lut_a, r1=lut_b
- ldr r0,=lut_a
- ldr r1,=lut_b
-@ first set lut_a to be a table of antilogarithms, lut_b a table of logarithms
- mov r2,#0
- strb r2,[r0]                @ (*)
- mov r3,#1                   @ we maintain invariant that r2=log(r3)
+decrypt:
+@ r0=4-way key, r1=IV_shareA, r2=IV_shareB, r3=message buffer, [r13]=number of blocks
+ ldr r12,[r13]               @ Pop 5th argument in r12 (which we are allowed to treat as scratch according to AAPCS)
+ push {r14}
+ GET_CANARY r14,CTAG3,6
+ SET_COUNT 23,6
+ push {r4-r11,r14}
+ push {r0-r3,r12}            @ Save the five arguments
+ bl reset_sha_trng
+ bl init_rstate
+@ randomly re-share the LUT contents
+ ldr r4,=lut_a
+ mov r5,#64                  @ 64 words = 256 bytes
 1:
- strb r2,[r0,r3]             @ log table
- strb r3,[r1,r2]             @ antilog table
- lsls r12,r3,#25
- it cs
- eorcs r12,r12,#0x1b000000   @ multiply by x
- eor r3,r3,r12,lsr#24        @ multiply by x+1 ("3"), which is a primitive element
- add r2,r2,#1
- cmp r2,#255
- bls 1b
- movs r2,#255
+ bl gen_rand_sha_nonpres
+ ldr r6,[r4,#lut_b-lut_a]    @ EOR a random word into both shares
+ eors r6,r6,r0
+ str r6,[r4,#lut_b-lut_a]
+ ldr r6,[r4]
+ eors r6,r6,r0
+ stmia r4!,{r6}
+ subs r5,r5,#1
+ bne 1b
+ CHK_COUNT 29,6
+ bl remap                    @ scramble the LUTs
+ pop {r0}                    @ pointer to 4way key data
+ CHK_COUNT 30,6
+ bl init_key_4way
+ CHK_COUNT 31,6
+ bl lock_key
+ pop {r0-r3}                 @ r0=IV_shareA, r1=IV_shareB, r2=message, r3=num blocks
+ bl ctr_crypt_s
+ bl randomisechaff
+ clear03
+ pop {r4-r11,r14}
+ CHK_CANARY r14,CTAG3,6
+ pop {r15}
+
+.balign 4
+.thumb_func
+reset_sha_trng:
+ ldr r1,=RESETS_BASE+RESETS_RESET_OFFSET
+ ldr r2,[r1]
+ ldr r3,=#RESETS_RESET_SHA256_BITS|RESETS_RESET_TRNG_BITS
+ orrs r2,r2,r3
+ str r2,[r1]       @ reset the SHA hardware and the TRNG hardware
+ CHK_COUNT 23,6
+ bics r2,r2,r3
+ str r2,[r1]       @ release the reset
+ bx r14
+
+.balign 4
+.thumb_func
+makesmallperm:
+@ Make a uniformly random permutation of R0 bytes and stores the resulting byte array at R1
+@ Should be very uniform up to R0=10; maybe 11 or 12 are also OK. (10! << 2^32)
+@ To make it valid up to R0=256, move the bl gen_rand_sha inside the loop
+@ Uses inside-out method (slightly more efficient variant of Fisher-Yates)
+@ Trashes r0-r3
+
+ push {r14}
+ GET_CANARY r14,CTAG4,6
+ push {r4-r6,r14}
+ movs r4,r1
+ movs r6,r0
+ movs r1,#0
+ movs r2,#1
+ bl gen_rand_sha
+
+1:
+@ r1,r2=i,i+1,   i=0, 2, 4, ...
+ cmp r1,r6
+ beq 2f
+
+ umull r0,r3,r0,r2
+ ldrb r5,[r4,r3]
+ strb r5,[r4,r1]
+ strb r1,[r4,r3]
+ adds r1,r1,#2
+
+@ r2,r1=i,i+1,   i=1, 3, 5, ...
+ cmp r2,r6
+ beq 2f
+
+ umull r0,r3,r0,r1
+ ldrb r5,[r4,r3]
+ strb r5,[r4,r2]
+ strb r2,[r4,r3]
+ adds r2,r2,#2
+
+ b 1b
+
+2:
+ pop {r4-r6,r14}
+ CHK_CANARY r14,CTAG4,6
+ pop {r15}
+
+.balign 4
+.thumb_func
+makeperm16:
+@ Make a random permutation of 16 things using the inside-out method (slightly more efficient variant of Fisher-Yates)
+@ Store it in the 16 bytes at perm16
+@ More efficient than calling makeperm with R0=16, R1=perm16 - fewer calls to gen_rand_sha
+@ Trashes r0-r5
+
+ GET_CANARY r0,CTAG5,1
+ push {r0,r14}
+ ldr r4,=perm16
+ bl gen_rand_sha_nonpres
+
+@ i=0
+ movs r1,#0
+ movs r2,#1       @ r1,r2=i,i+1
+ strb r1,[r4]
+
+@ i=1
+ adds r1,r1,#2    @ r1,r2=i+1,i
+ umull r0,r3,r0,r1
+ ldrb r5,[r4,r3]
+ strb r5,[r4,r2]
+ strb r2,[r4,r3]
+
+1:
+@ i=2, 4, 6, 8
+ adds r2,r2,#2    @ r1,r2=i,i+1
+ umull r0,r3,r0,r2
+ ldrb r5,[r4,r3]
+ strb r5,[r4,r1]
+ strb r1,[r4,r3]
+
+@ i=3, 5, 7, 9
+ adds r1,r1,#2    @ r1,r2=i+1,i
+ umull r0,r3,r0,r1
+ ldrb r5,[r4,r3]
+ strb r5,[r4,r2]
+ cmp r1,#10
+ strb r2,[r4,r3]
+ bne 1b
+
+@ refresh random number after extracting 10! from it
+@ 10! and 16!/10! are both much less than 2^32, so the permutation will be extremely close to uniform
+ bl gen_rand_sha
+
 1:
- ldrb r3,[r0,r2]             @ for each i≠0, find log,...
- eor r3,r3,#255              @ ... negate...
- ldrb r3,[r1,r3]             @ ... and antilog to get inverse
- strb r3,[r0,r2]
- subs r2,r2,#1
- bne 1b                      @ note that inverse(0)=0 by (*) above
+@ i=10, 12, 14
+ adds r2,r2,#2    @ r1,r2=i,i+1
+ umull r0,r3,r0,r2
+ ldrb r5,[r4,r3]
+ strb r5,[r4,r1]
+ strb r1,[r4,r3]
+
+@ i=11, 13, 15
+ adds r1,r1,#2    @ r1,r2=i+1,i
+ umull r0,r3,r0,r1
+ ldrb r5,[r4,r3]
+ strb r5,[r4,r2]
+ cmp r1,#16
+ strb r2,[r4,r3]
+ bne 1b
+
+ pop {r0,r14}
+ CHK_CANARY r0,CTAG5,4
  bx r14
 
 .balign 4
 .thumb_func
 remap:
 @ do a random remap of the LUTs
-@ preserves r0-r11
- push {r14}
- GET_CANARY r14,CTAG2
- push {r0-r11,r14}
- bl gen_rand
+@ preserves r0-r11; trashes r12
+ GET_CANARY r12,CTAG6,6
+ push {r0-r12,r14}
+ bl gen_rand_sha_nonpres
  ldr r1,=lut_a
  bl remap_1
- bl gen_rand
+ bl gen_rand_sha_nonpres
  ldr r1,=lut_b
  bl remap_1
- pop {r0-r11,r14}
- CHK_CANARY r14,CTAG2
- pop {r15}
+ pop {r0-r12,r14}
+ CHK_CANARY r12,CTAG6,6
+ bx r14
 
 remap_1:
 @ r0: B0:xa B1:xb B2:ya B3:yb
 @ r1: array of 256 bytes, followed by a 4-byte map
 @ shuffle LUT share array such that new[i]=old[i^xa^xb]^ya^yb, update map according to r0
- GET_CANARY r6,CTAG3
+ GET_CANARY r6,CTAG7,6
  push {r6,r14}
  mov r14,0x01010101
  ubfx r6,r0,#16,#8
@@ -337,213 +681,271 @@ remap_1:
  subs r2,r2,#4
  bpl 1b
  pop {r6,r14}
- CHK_CANARY r6,CTAG3
+ CHK_CANARY r6,CTAG7,6
  bx r14
 
-.if NEED_HPERM
+.if RK_ROR
+
+@ "refresh" shares of rkeys by random eor into both shares of each word, and also randomise the single word RKshareC
+@ Trashes r0-r12
+@ If i = word number 0..3,
+@ Aptr=memory word pointer to block of 20 bytes containing H&V-rotated share A roundkey (similarly B), then
+@ vpermA=Aptr[4]>>30, vpermB=Bptr[4]>>30, and
+@ roundkey shareA(i) = Aptr[i+vpermA mod 4] ror ((i+vpermA mod 4)^th byte of Aptr[4])
+@ roundkey shareB(i) = Bptr[i+vpermB mod 4] ror ((i+vpermB mod 4)^th byte of Bptr[4])+16
 .balign 4
 .thumb_func
-hperm:
-@ rotate state within registers
-@ r0: B0: rotate amount for r4,r8; B1: rotate amount for r5,r9; B2: rotate amount for r6,r10; B3: rotate amount for r7,r11
-@ return r0 value required to undo
- movs r1,#0x18               @ constant for subsequent ANDs
- and r2,r1,r0,lsl#3          @ extract amount
- rors r4,r4,r2               @ rotate share A
- rors r8,r8,r2               @ rotate share B
- and r2,r1,r0,lsr#5          @ etc.
- rors r5,r5,r2
- rors r9,r9,r2
- and r2,r1,r0,lsr#13
- rors r6,r6,r2
- rors r10,r10,r2
- and r2,r1,r0,lsr#21
- rors r7,r7,r2
- rors r11,r11,r2
-@ movs r1,#0                 @ not needed as 0x18 has zeros in all the required places to do a two-bit-wise negate
- usub8 r0,r1,r0
+ref_roundkey_shares_s:
+ mov r11,#15                 @ there are 15 expanded keys
+ref_roundkey_shares_s_test:  @ entry point for test code to do fewer than 15 rounds
+ ldr r4,=rkey_s
+ loadlfsr
+ steplfsr                    @ r0=change in RKshareC
+ ldr r2,=RKshareCchange
+ str r0,[r2]
+ ldr r3,=RKshareC
+ ldr r5,[r3]
+ eors r5,r5,r0
+ str r5,[r3]
+ @ r0=lfsr_state, r1=lfsr_const, r4=roundkey_ptr, r11=roundcounter
+
+ref_roundkey_shares_s_loop:
+ ldmia r4!,{r5-r8,r10}       @ r5-r8 = rkey shareA, r10=X_A=vperm+rotations of rkey shareA
+
+ ldr r12,[r4,#16]            @ r12 = X_B=vperm+rotations of rkey shareB
+ mov r2,r12,lsr#30           @ r2 = vpermB
+ sub r9,r2,r10,lsr#30        @ r9 = vpermB - vpermA (|junk)
+ mov r2,r9,lsl#3             @ r2 = 8*(vpermB - vpermA) mod 32
+ mov r12,r12,ror r2
+ usub8 r12,r10,r12           @ r12 = rotsA - (rotsB ror r2)
+
+ @ r2,r3,r10=workspace, r0=lfsr_state, r1=lfsr_const, r4=roundkeyB_ptr, r5-r8=roundkeyA, r9=vpermdiff, r10=rotsA, r11=roundcounter, r12=rotdiff
+ steplfsr; eors r5,r5,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; mov r12,r12,ror#8; str r3,[r4,r9,lsl#2]; adds r9,r9,#1
+ steplfsr; eors r6,r6,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; mov r12,r12,ror#8; str r3,[r4,r9,lsl#2]; adds r9,r9,#1
+ steplfsr; eors r7,r7,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; mov r12,r12,ror#8; str r3,[r4,r9,lsl#2]; adds r9,r9,#1
+ steplfsr; eors r8,r8,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16;                    str r3,[r4,r9,lsl#2]
+
+ ldr r3,=RKshareCchange
+ ldr r3,[r3]
+ movs r2,#0
+ usub8 r10,r2,r10
+ ror r2,r3,r10; mov r10,r10,ror#8; eors r5,r5,r2
+ ror r2,r3,r10; mov r10,r10,ror#8; eors r6,r6,r2
+ ror r2,r3,r10; mov r10,r10,ror#8; eors r7,r7,r2
+ ror r2,r3,r10;                    eors r8,r8,r2
+
+ subs r4,r4,#20
+ stmia r4,{r5-r8}
+ adds r4,r4,#40
+ subs r11,r11,#1
+
+ bne ref_roundkey_shares_s_loop
+ ldr r2,=rstate_lfsr         @ restore rstate_lfsr
+ savelfsr                    @ Save lfsr_state
+ clear03 24
+ref_roundkey_shares_s_exit:
  bx r14
-.endif
 
-.if NEED_VPERM
 .balign 4
 .thumb_func
-vperm:
-@ rotate state registers r4->r5-r6->r7->r4 etc. in constant time
-@ r0: b0..1: rotate amount
-@ returns r0 value required to undo
-@ preserves r2
- and r1,r0,#2
- rsbs r1,r1,#0               @ 0 or fffffffe depending on b1 of r0
- uadd8 r1,r1,r1              @ set/clear all GE flags according to b1 of r0: set if rotate of two places is required
- mov r1,r4
- sel r4,r6,r4
- sel r6,r1,r6
- mov r1,r5
- sel r5,r7,r5
- sel r7,r1,r7
- mov r1,r8
- sel r8,r10,r8
- sel r10,r1,r10
- mov r1,r9
- sel r9,r11,r9
- sel r11,r1,r11
- and r1,r0,#1
- rsbs r1,r1,#0               @ 0 or ffffffff depending on b0 of r0
- uadd8 r1,r1,r1              @ set/clear all GE flags according to b0 of r0: set if rotate of one place is required
- mov r1,r4
- sel r4,r5,r4
- sel r5,r6,r5
- sel r6,r7,r6
- sel r7,r1,r7
- mov r1,r8
- sel r8, r9  ,r8
- sel r9, r10 ,r9
- sel r10,r11,r10
- sel r11,r1 ,r11
- rsbs r0,r0,#0               @ generate control value for inverse operation
+@ Rotates roundkey vperms and RK_ROR rotations by random amounts
+@ Trashes r0-r10
+@ If i = word number 0..3,
+@ Aptr=memory word pointer to block of 20 bytes containing H&V-rotated share A roundkey (similarly B), then
+@ vpermA=Aptr[4]>>30, vpermB=Bptr[4]>>30, and
+@ roundkey shareA(i) = Aptr[i+vpermA mod 4] ror ((i+vpermA mod 4)^th byte of Aptr[4])
+@ roundkey shareB(i) = Bptr[i+vpermB mod 4] ror ((i+vpermB mod 4)^th byte of Bptr[4])+16
+ref_roundkey_hvperms_s:
+ movs r7,#30
+ref_roundkey_hvperms_s_test:  @ entry point for test code to do fewer than 30 key shares
+ GET_CANARY r10,CTAG9,6
+ push {r10,r14}
+ ldr r10,=rkey_s
+ref_roundkey_hvperms_s_loop:
+ bl gen_rand_lfsr_nonpres     @ r0=new vperm high|rotations
+ ldmia r10,{r2-r5,r9}         @ r2-r5=roundkey share A/B, r9=old vperm high|rotations
+ str r0,[r10,#16]             @ save new vperm|rotations word (5th word of the 20-byte share block)
+ mov r8,r0,lsr#30             @ r8=new vperm low
+ sub r6,r8,r9,lsr#30          @ r6=(new vperm low)-(old vperm low) | junk
+ mov r8,r6,lsl#3              @ r8=8*((new vperm low)-(old vperm low)) mod 32
+ mov r0,r0,ror r8             @ rotate new rotations so byte i lines up with old-vperm byte i
+ usub8 r0,r9,r0               @ i^th byte of r0 = (i^th byte of old rotations) - ((i+newvperm-oldvperm)^th byte of new rotations)
+ movs r2,r2,ror r0; ands r6,r6,#3; str r2,[r10,r6,lsl#2]; movs r0,r0,ror#8; adds r6,r6,#1
+ movs r3,r3,ror r0; ands r6,r6,#3; str r3,[r10,r6,lsl#2]; movs r0,r0,ror#8; adds r6,r6,#1
+ movs r4,r4,ror r0; ands r6,r6,#3; str r4,[r10,r6,lsl#2]; movs r0,r0,ror#8; adds r6,r6,#1
+ movs r5,r5,ror r0; ands r6,r6,#3; str r5,[r10,r6,lsl#2]
+ adds r10,r10,#20             @ advance to next 20-byte key share block
+ subs r7,r7,#1
+ bne ref_roundkey_hvperms_s_loop
+ clear03 28
+ref_roundkey_hvperms_s_exit:  @ label the exit point so analysis code can refer to it
+ pop {r10,r14}
+ CHK_CANARY r10,CTAG9,6
  bx r14
-.endif
 
-.if IK_SHUFREAD
-@ randomly shuffle an array n bytes long, n≤65536 a power of 2, by performing k random exchanges, k>0
-@ r0: array pointer p
-@ r1: n
-@ r2: k
-@ does not need to be a subroutine!!!
-array_shuf:
- push {r4-r6,r14}
- mov r4,r0
- subs r5,r1,#1     @ mask for random number generation
- mov r6,r2
-1:
- bl gen_rand
- and r1,r5,r0,lsr#16
- and r0,r5,r0      @ r0,r1 are two random numbers 0..n-1
- ldrb r2,[r4,r0]
- ldrb r3,[r4,r1]
- strb r3,[r4,r0]
- strb r2,[r4,r1]
- subs r6,r6,#1
- bne 1b
- pop {r4-r6,r15}
-.endif
+.else
+
+@ "refresh" shares of rkeys by random eor into both shares of each word, and also randomise the single word RKshareC
+@ Trashes r0-r11
+.balign 4
+.thumb_func
+ref_roundkey_shares_s:
+ mov r11,#15                 @ there are 15 expanded keys
+ref_roundkey_shares_s_test:  @ entry point for test code to do fewer than 15 rounds
+ GET_CANARY r4,CTAG8,6
+ push {r4,r14}
+ ldr r4,=rkey_s
+ loadlfsr
+ steplfsr                    @ r0=change in RKshareC
+ ldr r3,=RKshareC
+ ldr r5,[r3]
+ eors r5,r5,r0
+ str r5,[r3]
+ mov r10,r0
+ref_roundkey_shares_s_loop:
+ ldmia r4!,{r5-r9}           @ r5-r8 = rkey shareA with vperm r9
+
+ @ clear03: would need to do this with, say r2,r3,r12 (reloading r2 later)
+
+ ldr r3,[r4,#16]             @ rkey shareB has a vperm of r10>>30
+ movs r3,r3,lsr#30
+ sub r9,r3,r9,lsr#30         @ r9 = vperm_B - vperm_A (|junk)
+ @ r3,r12=workspace, r0=lfsr_state, r1=lfsr_const, r2=rstate_lfsr, r4=roundkeyB_ptr, r5-r8=roundkeyA, r9=vpermdiff, r10=RKshareCchange, r11=roundcounter
+
+ steplfsr; eors r5,r5,r0; and r9,r9,#3; eors r5,r5,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2]; adds r9,r9,#1
+ steplfsr; eors r6,r6,r0; and r9,r9,#3; eors r6,r6,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2]; adds r9,r9,#1
+ steplfsr; eors r7,r7,r0; and r9,r9,#3; eors r7,r7,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2]; adds r9,r9,#1
+ steplfsr; eors r8,r8,r0; and r9,r9,#3; eors r8,r8,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2]
+
+ subs r4,r4,#20
+ stmia r4,{r5-r8}
+ adds r4,r4,#40
+ subs r11,r11,#1
+
+ @ clear03: would need to do this with, say r3,r5-r8
+
+ bne ref_roundkey_shares_s_loop
+ savelfsr
+ clear03 24
+ref_roundkey_shares_s_exit:
+ pop {r4,r14}
+ CHK_CANARY r4,CTAG8,6
+ bx r14
+
+.balign 4
+.thumb_func
+@ Rotates roundkey vperms by random amounts
+@ Trashes r0-r9
+ref_roundkey_hvperms_s:
+ movs r7,#30
+ref_roundkey_hvperms_s_test:  @ entry point for test code to do fewer than 30 key shares
+ GET_CANARY r0,CTAG9,6
+ push {r0,r14}
+ bl gen_rand_lfsr_nonpres     @ r0=random vperm increments, consumed 2 bits per key share (see note below)
+ ldr r1,=rkey_s
+ref_roundkey_hvperms_s_loop:
+ cmp r7,#15
+ bne 2f
+@ Get a new random r0 after using 15 x 2 bits of the original one
+@ Note that the junk bits (2-31) in the vperms are not adjusted independently, but that's no big loss,
+@ and the gain is only calling gen_rand_lfsr twice instead of 30 times.
+ push {r1}; bl gen_rand_lfsr_nonpres; pop {r1}
+ 2:
+ ldmia r1,{r2-r5,r9}    @ roundkey share A/B=r2-r5, vperm=r9 (including junk bits)
+ mov r8,r9,lsr#30       @ r8=old vperm (low)
+ add r6,r9,r0           @ r6=new vperm (high) | new junk
+ str r6,[r1,#16]        @ save new vperm word
+ rsb  r6,r8,r6,lsr#30   @ r6=(new vperm low)-(old vperm low) | junk bits
+ ands r6,r6,#3; str r2,[r1,r6,lsl#2]; adds r6,r6,#1
+ ands r6,r6,#3; str r3,[r1,r6,lsl#2]; adds r6,r6,#1
+ ands r6,r6,#3; str r4,[r1,r6,lsl#2]; adds r6,r6,#1
+ ands r6,r6,#3; str r5,[r1,r6,lsl#2]
+ adds r1,r1,#20         @ advance to next 20-byte key share block
+ movs r0,r0,ror#2       @ expose the next 2 random bits for the next share's vperm increment
+ subs r7,r7,#1
+ bne ref_roundkey_hvperms_s_loop
+ clear03 28
+ref_roundkey_hvperms_s_exit:  @ label the exit point so analysis code can refer to it
+ pop {r0,r14}
+ CHK_CANARY r0,CTAG9,6
+ bx r14
 
-@ "refresh" shares of rkeys by random eor into both shares of each word
-.if RK_ROR
-@ and randomly change rotate amount on each word of each share
 .endif
-@ preserves r0-r11
+
+.ltorg
+
+.if ST_VPERM
 .balign 4
-ref_round_keys_s:
- push {r14}
- GET_CANARY r14,CTAG4
- push {r0-r11,r14}
- ldr r0,=rkey_s
- mov r1,#15                  @ there are 15 expanded keys
-1:
-.if RK_ROR
- ldmia r0,{r2-r11}
- push {r0-r1}
-
- bl gen_rand                 @ xra=random extra rotates for share A
- usub8 r6,r6,r0              @ ra-=xra bytewise
- rors r2,r2,r0               @ a=ror(a,xra)
- rev16 r0,r0                 @ byte order 2301, i.e. B1 at the bottom
- rors r3,r3,r0               @ a=ror(a,xra)
- rev r0,r0                   @ byte order 1032, i.e. B2 at the bottom
- rors r4,r4,r0               @ a=ror(a,xra)
- rev16 r0,r0                 @ byte order 0123, i.e. B3 at the bottom
- rors r5,r5,r0               @ a=ror(a,xra)
-
- bl gen_rand                 @ xrb=random extra rotates for share B
- usub8 r11,r11,r0            @ rb-=xrb bytewise
- rors r7,r7,r0               @ b=ror(b,xrb)
- rev16 r0,r0
- rors r8,r8,r0               @ b=ror(b,xrb)
- rev r0,r0
- rors r9,r9,r0               @ b=ror(b,xrb)
- rev16 r0,r0
- rors r10,r10,r0             @ b=ror(b,xrb)
- usub8 r1,r6,r11             @ ra-rb bytewise
-
- bl gen_rand                 @ xab=extra exclusive OR into shares
- eors r2,r2,r0               @ a^=xab
- rors r0,r0,r1               @ ror(xab,ra-rb)
- eors r7,r7,r0               @ b^=ror(xab,ra-rb)
- rev16 r1,r1
-
- bl gen_rand                 @ xab
- eors r3,r3,r0               @ a^=xab
- rors r0,r0,r1               @ ror(xab,ra-rb)
- eors r8,r8,r0               @ b^=ror(xab,ra-rb)
- rev r1,r1
-
- bl gen_rand                 @ xab
- eors r4,r4,r0               @ a^=xab
- rors r0,r0,r1               @ ror(xab,ra-rb)
- eors r9,r9,r0               @ b^=ror(xab,ra-rb)
- rev16 r1,r1
-
- bl gen_rand                 @ xab
- eors r5,r5,r0               @ a^=xab
- rors r0,r0,r1               @ ror(xab,ra-rb)
- eors r10,r10,r0             @ b^=ror(xab,ra-rb)
-
- pop {r0-r1}
- stmia r0!,{r2-r11}
-.else
- ldmia r0,{r4-r11}           @ EOR random data into the shares
- push {r0-r1}
- bl gen_rand
- eor r4,r4,r0
- eor r8,r8,r0
- bl gen_rand
- eor r5,r5,r0
- eor r9,r9,r0
- bl gen_rand
- eor r6,r6,r0
- eor r10,r10,r0
- bl gen_rand
- eor r7,r7,r0
- eor r11,r11,r0
- pop {r0-r1}
- stmia r0!,{r4-r11}
+.thumb_func
+@ Cycle share registers r4-r7, r8-r11 (r4->r5->r6->r7->r4 etc.) by an additional amount
+@ given in the bottom two bits of R0 and update the rotation recorded at statevperm.
+@ On entry R1 must point to statevperm.
+@ Trashes r0-r3,r12
+@ Maintains r4=rorig(4+(-!r1)%4), r5=rorig(4+(1-!r1)%4), ...
+@           r8=rorig(8+(-!r1)%4), r9=rorig(8+(1-!r1)%4), ...
+@ Note: only low 2 bits of !r1 are used. The rest are random to add to the noise.
+addstatevperm:
+ ldr r2,[r1]
+ adds r2,r2,r0
+ str r2,[r1]                  @ statevperm += r0
+
+ ldr r1,=shareA               @ spill share A at positions cycled by the low 2 bits of r0...
+ ands r0,r0,#3; str r4,[r1,r0,lsl#2]; adds r0,r0,#1
+ ands r0,r0,#3; str r5,[r1,r0,lsl#2]; adds r0,r0,#1
+ ands r0,r0,#3; str r6,[r1,r0,lsl#2]; adds r0,r0,#1
+ ands r0,r0,#3; str r7,[r1,r0,lsl#2]; adds r0,r0,#1
+ ldmia r1,{r4-r7}             @ ... and reload, completing the register cycle
+
+ getchaffaddress r12          @ Overwrite temporary storage with random numbers
+ ldmia r12!,{r2,r3}
+ stmia r1!,{r2,r3}
+ ldmia r12!,{r2,r3}
+ stmia r1!,{r2,r3}
+
+ ldr r1,=shareB               @ repeat the same cycling for share B
+ ands r0,r0,#3; str r8, [r1,r0,lsl#2]; adds r0,r0,#1
+ ands r0,r0,#3; str r9, [r1,r0,lsl#2]; adds r0,r0,#1
+ ands r0,r0,#3; str r10,[r1,r0,lsl#2]; adds r0,r0,#1
+ ands r0,r0,#3; str r11,[r1,r0,lsl#2]; adds r0,r0,#1
+ ldmia r1,{r8-r11}
+
+ getchaffaddress r0,16        @ Overwrite temporary storage with random numbers
+ ldmia r0!,{r2,r3}
+ stmia r1!,{r2,r3}
+ ldmia r0!,{r2,r3}
+ stmia r1!,{r2,r3}
+
+addstatevperm_exit:           @ label the exit point so analysis code can refer to it
+ bx r14
 .endif
- subs r1,r1,#1
- bne 1b
- pop {r0-r11,r14}
- CHK_CANARY r14,CTAG4
- pop {r15}
 
-@ switch from non-shared to shared state
+@ Conjugate lut_a, lut_b with (state) shareC
+@ I.e., EOR the input and output with shareC.
+@ We need to pick one input for each share A and B, and one output for ONE of the shares A and B
+@ Arbitrarily choosing a0, b1 and d0
 .balign 4
-ns_to_s:
- push {r14}
- GET_CANARY r14,CTAG5
- push {r0-r3,r14}
- bl gen_rand
- mov r8,r0
- bl gen_rand
- mov r9,r0
- bl gen_rand
- mov r10,r0
- bl gen_rand
- mov r11,r0
- eors r4,r4,r8
- eors r5,r5,r9
- eors r6,r6,r10
- eors r7,r7,r11
- pop {r0-r3,r14}
- CHK_CANARY r14,CTAG5
- pop {r15}
+conjshareC:
+.if ST_SHAREC
+ ldr r1,=shareC
+ ldr r0,[r1]                   @ Get shareC as a word (all bytes the same)
+ ldr r1,=lut_a                 @ Need to EOR share C into inputs of both lut_a and lut_b, and one of their outputs...
+ ldr r2,[r1,#0x100]
+ eors r2,r2,r0,lsr#24
+ str r2,[r1,#0x100]
+ movs r0,r0,lsr#16
+ ldr r1,=lut_b                 @ ... (continued) Here we're EORing share C into a0, b1 and d0.
+ ldr r2,[r1,#0x100]
+ eors r2,r2,r0,lsl#8
+ str r2,[r1,#0x100]
+.endif
+ bx r14
 
-.if NEED_ROUNDS
 .balign 4
 .thumb_func
 shift_rows_s:
-@ first "rotate" the two most-significant bytes of the state by two registers
-@ slightly faster (but not shorter?) with ubfx/bfi
+@ First "rotate" the two most-significant bytes of the state by two registers
+@ Trashes r0-r3
+@ Slightly faster (but not shorter?) with ubfx/bfi
  eors r0,r4,r6               @ ta=state[0]^state[2]; ta&=0xffff0000; state[0]^=ta; state[2]^=ta;
  lsrs r0,r0,#16
  lsls r0,r0,#16
@@ -567,18 +969,18 @@ shift_rows_s:
  ands r0,r0,#0xff00ff00
  eors r6,r6,r0
  eors r7,r7,r1               @                                       state[3]^=tb;
-@ repeat for other share
- eors r0,r8,r10              @ ta=state[0]^state[2]; ta&=0xffff0000; state[0]^=ta; state[2]^=ta;
- lsrs r0,r0,#16
+@ repeat for other share, conjugated by ror#16
+ clear01                     @ barrier
+ eors r0,r8,r10              @ ta=state[0]^state[2]; ta&=0x0000ffff; state[0]^=ta; state[2]^=ta;
  lsls r0,r0,#16
+ lsrs r0,r0,#16
  eors r8,r8,r0
  eors r10,r10,r0
- eors r0,r9,r11              @ ta=state[1]^state[3]; ta&=0xffff0000; state[1]^=ta; state[3]^=ta;
- lsrs r0,r0,#16
+ eors r0,r9,r11              @ ta=state[1]^state[3]; ta&=0x0000ffff; state[1]^=ta; state[3]^=ta;
  lsls r0,r0,#16
+ lsrs r0,r0,#16
  eors r9,r9,r0
  eors r11,r11,r0
-
  eors r1,r11,r8              @ tb=state[3]^state[0]; tb&=0xff00ff00;
  ands r1,r1,#0xff00ff00
  eors r0,r8,r9               @ ta=state[0]^state[1]; ta&=0xff00ff00; state[0]^=ta;
@@ -590,64 +992,11 @@ shift_rows_s:
  eors r0,r10,r11             @ ta=state[2]^state[3]; ta&=0xff00ff00; state[2]^=ta;
  ands r0,r0,#0xff00ff00
  eors r10,r10,r0
- eors r11,r11,r1             @                                       state[3]^=tb;
- bx r14
-.endif
-
-.if NEED_INV_ROUNDS
-.balign 4
-.thumb_func
-inv_shift_rows_s:
-@ first half is the same as shift_rows; halves could be done in opposite order for tail chain
- eors r0,r4,r6               @ ta=state[0]^state[2]; ta&=0xffff0000; state[0]^=ta; state[2]^=ta;
- lsrs r0,r0,#16
- lsls r0,r0,#16
- eors r4,r4,r0
- eors r6,r6,r0
- eors r0,r5,r7               @ ta=state[1]^state[3]; ta&=0xffff0000; state[1]^=ta; state[3]^=ta;
- lsrs r0,r0,#16
- lsls r0,r0,#16
- eors r5,r5,r0
- eors r7,r7,r0
-
- eors r1,r7,r4               @ tb=state[3]^state[0]; tb&=0xff00ff00;
- ands r1,r1,#0xff00ff00
- eors r0,r6,r7               @ ta=state[2]^state[3]; ta&=0xff00ff00; state[3]^=ta;
- ands r0,r0,#0xff00ff00
- eors r7,r7,r0
- eors r0,r5,r6               @ ta=state[1]^state[2]; ta&=0xff00ff00; state[2]^=ta;
- ands r0,r0,#0xff00ff00
- eors r6,r6,r0
- eors r0,r4,r5               @ ta=state[0]^state[1]; ta&=0xff00ff00; state[1]^=ta;
- ands r0,r0,#0xff00ff00
- eors r5,r5,r0
- eors r4,r4,r1               @                                       state[0]^=tb;
 
- eors r0,r8,r10              @ ta=state[0]^state[2]; ta&=0xffff0000; state[0]^=ta; state[2]^=ta;
- lsrs r0,r0,#16
- lsls r0,r0,#16
- eors r8,r8,r0
- eors r10,r10,r0
- eors r0,r9,r11              @ ta=state[1]^state[3]; ta&=0xffff0000; state[1]^=ta; state[3]^=ta;
- lsrs r0,r0,#16
- lsls r0,r0,#16
- eors r9,r9,r0
- eors r11,r11,r0
+ eors r11,r11,r1             @                                       state[3]^=tb;
 
- eors r1,r11,r8              @ tb=state[3]^state[0]; tb&=0xff00ff00;
- ands r1,r1,#0xff00ff00
- eors r0,r10,r11             @ ta=state[2]^state[3]; ta&=0xff00ff00; state[3]^=ta;
- ands r0,r0,#0xff00ff00
- eors r11,r11,r0
- eors r0,r9,r10              @ ta=state[1]^state[2]; ta&=0xff00ff00; state[2]^=ta;
- ands r0,r0,#0xff00ff00
- eors r10,r10,r0
- eors r0,r8,r9               @ ta=state[0]^state[1]; ta&=0xff00ff00; state[1]^=ta;
- ands r0,r0,#0xff00ff00
- eors r9,r9,r0
- eors r8,r8,r1               @                                       state[0]^=tb;
+ clear01                     @ barrier
  bx r14
-.endif
 
 @ multiply polynomial over GF(2⁸) by c(x) = 0x03x³ + 0x01x² + 0x01x + 0x02 modulo x⁴+1
 @ r0x00 is a register holding 0x00000000;  r0x1b is a register holding 0x1b1b1b1b
@@ -665,8 +1014,6 @@ inv_shift_rows_s:
 
 @ multiply polynomial over GF(2⁸) by d(x) = 0x0Bx³ + 0x0Dx² + 0x09x + 0x0E modulo x⁴+1; c(x)d(x)=1 modulo x⁴+1
 .macro invmixcol rx,rt,ru,rv,rw,r0x00,r0x1b
-@ !!! can probably save some registers, e.g. allow trashing of r0x00, r0x1b
-@ can possibly also simplify slightly with refactorisation
  uadd8 \rt,\rx,\rx           @ field multiplication by 2 as above
  sel \rw,\r0x1b,\r0x00
  eors \rt,\rt,\rw            @ 2x
@@ -687,9 +1034,9 @@ inv_shift_rows_s:
  eors \rx,\rt,\rw,ror#8      @ 14x ^ 9x ROL #8 ^ 13x ROL #16 ^ 11x ROL #24
 .endm
 
-.if NEED_ROUNDS
 .balign 4
 .thumb_func
+@ Trashes r0-r3,r12
 mix_cols_s:
  mov r2,#0x00000000
  mov r3,#0x1b1b1b1b
@@ -697,1002 +1044,901 @@ mix_cols_s:
  mixcol r5 ,r0,r1,r2,r3
  mixcol r6 ,r0,r1,r2,r3
  mixcol r7 ,r0,r1,r2,r3
+ ldr r12,=chaff
+ ldmia r12!,{r0,r1}          @ overwrite sensitive shareA-related quantities r0,r1 with random numbers
  mixcol r8 ,r0,r1,r2,r3
  mixcol r9 ,r0,r1,r2,r3
  mixcol r10,r0,r1,r2,r3
  mixcol r11,r0,r1,r2,r3
+ ldmia r12!,{r0,r1}          @ overwrite  sensitive shareB-related quantities r0,r1 with random numbers
  bx r14
-.endif
-
-.if NEED_INV_ROUNDS
-.balign 4
-.thumb_func
-inv_mix_cols_s:
- push {r14}
- GET_CANARY r14,CTAG6
- push {r14}
- mov r12,#0x00000000
- mov r14,#0x1b1b1b1b
- invmixcol r4 ,r0,r1,r2,r3,r12,r14     @ apply invmixcol to each state word
- invmixcol r5 ,r0,r1,r2,r3,r12,r14
- invmixcol r6 ,r0,r1,r2,r3,r12,r14
- invmixcol r7 ,r0,r1,r2,r3,r12,r14
- invmixcol r8 ,r0,r1,r2,r3,r12,r14
- invmixcol r9 ,r0,r1,r2,r3,r12,r14
- invmixcol r10,r0,r1,r2,r3,r12,r14
- invmixcol r11,r0,r1,r2,r3,r12,r14
- pop {r14}
- CHK_CANARY r14,CTAG6
- pop {r15}
-.endif
 
-.if SBOX_VIA_INV
-@ bytewise EOR-convolution with constant 0x1f
-.macro conv_0x1f rx,rt,ru
- eors \rt,\rx,\rx,ror#31     @ t=x^ROL(x,1);
- eors \rt,\rt,\rt,ror#30     @ t=t^ROL(t,2);
- eors \rt,\rt,\rx,ror#28     @ t=t^ROL(x,4);     @ convolution with byte boundaries "trashed"
- ands \ru,\rx,#0xf0f0f0f0    @ u=x&0xf0f0f0f0;
- eors \ru,\ru,\ru,ror#31     @ u=u^ROL(u,1);
- eors \ru,\ru,\ru,ror#30     @ u=u^ROL(u,2);
- ands \ru,\ru,#0x87878787    @ u=u&0x87878787;   @ compensation for trashing
- eors \ru,\ru,\ru,ror#24     @ u=u^ROL(u,8);
- eors \rx,\rt,\ru,ror#7      @ t^=ROR(u,7);      @ with trashing fixed
-.endm
-
-@ bytewise EOR-convolution with constant 0x4a
-.macro conv_0x4a rx,rt,ru
- eors \rt,\rx,\rx,ror#30     @ t=x^ROL(x,2);
- eors \rt,\rt,\rx,ror#27     @ t=t^ROL(x,5);
- ands \ru,\rx,#0xf8f8f8f8    @ u=x&0xf8f8f8f8;
- eors \ru,\ru,\ru,ror#29     @ u=u^ROL(u,3);
- ands \ru,\ru,#0xc7c7c7c7    @ u=u&0xc7c7c7c7;
- eors \ru,\ru,\ru,ror#24     @ u=u^ROL(u,8);
- eors \rt,\rt,\ru,ror#6      @ t^=ROR(u,6);
- ands \ru,\rt,#0x80808080    @ t=rorbytes(t,7);
- uadd8 \rt,\rt,\rt
- orrs \rx,\rt,\ru,lsr#7
+@ Lookup each byte of a word, Rtarg, in a table and replace Rtarg with the result (used for SBOX lookups)
+.macro subbytes Rtarg,Rtable,Rspare0,Rspare1,Rspare2,Rspare3
+ ubfx \Rspare0,\Rtarg,#0,  #8
+ ubfx \Rspare1,\Rtarg,#8,  #8
+ ubfx \Rspare2,\Rtarg,#16, #8
+ ubfx \Rspare3,\Rtarg,#24, #8
+
+ ldrb \Rspare0,[\Rtable,\Rspare0]
+ ldrb \Rspare1,[\Rtable,\Rspare1]
+ ldrb \Rspare2,[\Rtable,\Rspare2]
+ ldrb \Rspare3,[\Rtable,\Rspare3]
+ orr \Rspare0,\Rspare0,\Rspare1,lsl#8
+ orr \Rspare2,\Rspare2,\Rspare3,lsl#8
+ orr \Rtarg,\Rspare0,\Rspare2,lsl#16
 .endm
 
+@ map all bytes of the state through the split LUT, lut_a and lut_b
+@ Trashes r0-r3,r12
 .balign 4
 .thumb_func
 map_sbox_s:
- push {r14}
- GET_CANARY r14,CTAG7
- push {r14}
- bl lutmap_state_s           @ the S-box function is an inverse followed by an affine transformation:
- conv_0x1f r4 ,r0,r1         @ see https://en.wikipedia.org/wiki/Rijndael_S-box
- conv_0x1f r5 ,r0,r1
- conv_0x1f r6 ,r0,r1
- conv_0x1f r7 ,r0,r1
- conv_0x1f r8 ,r0,r1
- conv_0x1f r9 ,r0,r1
- conv_0x1f r10,r0,r1
- conv_0x1f r11,r0,r1
- eor r4 ,r4 ,#0xcacacaca     @ scramble the shares slightly: 0x63=0xca^0xa9 etc.
- eor r5 ,r5 ,#0xf5f5f5f5
- eor r6 ,r6 ,#0x0c0c0c0c
- eor r7 ,r7 ,#0xa2a2a2a2
- eor r8 ,r8 ,#0xa9a9a9a9
- eor r9 ,r9 ,#0x96969696
- eor r10,r10,#0x6f6f6f6f
- eor r11,r11,#0xc1c1c1c1
- pop {r14}
- CHK_CANARY r14,CTAG7
- pop {r15}
+ GET_CANARY r12,CTAG12,3
+ push {r12,r14}
+
+ ldr r0,=shareA                 @ Write out state share A to memory
+@ stmia r0,{r4-r7}              @ Used to do a STM
+ getchaffaddress r1
+ ldr r2,[r1]
+ str r4,[r0]                    @ Intersperse with dummy writes to prevent implicit broadcasting of HW(ShareA_word0^ShareA_word1)+cyclic perms,
+ str r2,[r1]                    @ which arise due to internal write buffer. Such a quantity could (without such interspersing) be paired
+ str r5,[r0,#4]                 @ via 2nd order with its share B counterpart, resulting in broadcasting HW(word0^word1)+cyclic.
+ str r2,[r1]                    @ shareC doesn't guard against this, because word0^shareC^word1^shareC=word0^word1.
+ str r6,[r0,#8]                 @ Broadcasting of HW(ShareA_word0)+cyclic on the other hand is not prevented by interspersing, but
+ str r2,[r1]                    @ it isn't useful at 2nd order because shareC kills its relationship with HW(ShareB_word0)+cyclic.
+ str r7,[r0,#12]
+ str r2,[r1]
+
+ ldr r0,=shareB                 @ Write out state share B to memory
+ stmia r0,{r8-r11}              @ Not essential to intersperse share B too because i0B^i1B etc should have nothing in share A to couple with
+
+ bl makeperm16                  @ Rebuild random 16-way permutation. Maybe do this less frequently
+@ Now combine state shares A and B and apply the split sbox to each byte, in the order given by the above random permutation
+
+ bl gen_rand_sha_nonpres
+ mov r11,r0
+ ldr r8,=lut_a
+ ldr r9,=lut_b
+ ldr r0,[r8,#0x100]             @ R0 = a0 | a1<<8 | c0<<16 | c1<<24   (lut_a_map)
+ eors r3,r0,r0,lsr#8            @ R3 = a0^a1 | junk
+ uxtb r10,r3
+ ldr r1,[r9,#0x100]             @ R1 = b0 | b1<<8 | d0<<16 | d1<<24   (lut_b_map)
+ eors r1,r0,r1
+ eors r2,r1,r1,lsr#8
+ movs r12,r1,lsr#16             @ R12 = c0^d0 | (c1^d1)<<8
+ bfi r12,r2,#16,#8              @ R12 = c0^d0 | (c1^d1)<<8 | (a0^a1^b0^b1)<<16
+
+ ldr r4,=perm16
+ ldr r5,=shareA
+ ldr r6,=shareB
+ movs r1,#0;movs r2,#0;movs r3,#0
+@ Using r0=loop counter, r4=perm16, r5=shareA, r6=shareB, r8=lut_a, r9=lut_b, r10=a0^a1, r11=Random, r12=(c0^d0) | (c1^d1)<<8 | (a0^a1^b0^b1)<<16
+ movs r0,#15
+1:                              @ (Ordering instructions to minimise result delays)
+ ldrb r1,[r4,r0]                @ r1 = perm[r0]
+ mov  r11,r11,ror#11            @ Rotate random 32 bits to present a new low 8 bits
+ eors r7,r1,#2                  @ r7 = perm[r0]^2
+ ldrb r2,[r5,r1]                @ r2 = shareA[perm[r0]]
+ eor  r11,r11,r2,ror#8          @ Transfer some of the share-randomness of the input to the output (the share-randomness would otherwise be lost/wasted)
+ ldrb r3,[r6,r7]                @ r3 = shareB[perm[r0]^2]
+ eor  r2,r2,r10                 @ r2 = shareA[perm[r0]]^a0^a1
+ eors r2,r2,r3                  @ r2 = shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]
+ ldrb r3,[r8,r2]                @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]
+ eor  r2,r2,r12,lsr#16          @ r2 = shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]
+ eor  r3,r3,r12                 @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0 | (junk<<8)
+ eor  r3,r3,r11                 @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0^rand | (junk<<8)
+ strb r3,[r5,r1]                @ shareA'[perm[r0]] = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0^rand
+ ldrb r3,[r9,r2]                @ r3 = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]
+ subs r0,r0,#1
+ eor  r3,r3,r11                 @ r3 = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^rand
+ eor  r3,r3,r12,lsr#8           @ r3 = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^rand^c1^d1 | (junk<<8)
+ strb r3,[r6,r7]                @ shareB'[perm[r0]^2] = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^rand^c1^d1
+ bpl 1b
+ clear03 8                      @ barrier
+
+ ldmia r6,{r8-r11}              @ Read state share B back from memory
+ clear03 12                     @ barrier
+ getchaffaddress r0,16
+ bfi r0,r5,#0,#4                @ match chaff pointer (r0) to share A location (R5) mod 16
+ @ldmia r5,{r4-r7}               @ Read state share A back from memory
+ @clear03 16                     @ barrier
+ ldr r4,[r5]                    @ Intersperse barriers to prevent HW(o0^o1)+cyclic being broadcast; see similar note re stores at the start of map_sbox_s
+ ldr r1,[r0]
+ ldr r6,[r5,#8]
+ ldr r1,[r0,#8]
+ ldr r7,[r5,#12]
+ ldr r1,[r0,#12]
+ ldr r5,[r5,#4]                 @ Do r5 last because it's the address register
+ ldr r1,[r0,#4]
+
+@ Refresh state shares because luts only give imperfect share-by-value
+@ Probably not necessary now that we use R11 and input-reuse random resharing during the sbox operation (though the R11 bytes are not fully independent)
+@ loadlfsr
+@ steplfsr; eors r4,r4,r0; mov r12,#0; eor r8,r8,r0,ror#16              @ Barriers between each pair of eors to prevent implicit r4^r8 etc
+@ steplfsr; eors r5,r5,r0; mov r12,#0; eor r9,r9,r0,ror#16
+@ steplfsr; eors r6,r6,r0; mov r12,#0; eor r10,r10,r0,ror#16
+@ steplfsr; eors r7,r7,r0; mov r12,#0; eor r11,r11,r0,ror#16
+@ savelfsr
+
+ pop {r12,r14}
+ CHK_CANARY r12,CTAG12,5
+ bx r14
+
+.ltorg
 
-.if NEED_INV_ROUNDS
 .balign 4
 .thumb_func
-inv_map_sbox_s:
- push {r14}
- GET_CANARY r14,CTAG8
- push {r14}                  @ similarly, the inverse S-box is an affine transformation followed by an inverse
- conv_0x4a r4 ,r0,r1
- conv_0x4a r5 ,r0,r1
- conv_0x4a r6 ,r0,r1
- conv_0x4a r7 ,r0,r1
- conv_0x4a r8 ,r0,r1
- conv_0x4a r9 ,r0,r1
- conv_0x4a r10,r0,r1
- conv_0x4a r11,r0,r1
- eor r4 ,r4 ,#0xd1d1d1d1     @ scramble the shares slightly: 0x05=0xd1^0xd4 etc.
- eor r5 ,r5 ,#0x94949494
- eor r6 ,r6 ,#0xfcfcfcfc
- eor r7 ,r7 ,#0x3a3a3a3a
- eor r8 ,r8 ,#0xd4d4d4d4
- eor r9 ,r9 ,#0x91919191
- eor r10,r10,#0xf9f9f9f9
- eor r11,r11,#0x3f3f3f3f
- bl lutmap_state_s
- pop {r14}
- CHK_CANARY r14,CTAG8
- pop {r15}
-.endif
-
-.else
+randomisechaff:
+@ Randomise 48 bytes of chaff values (random load values)
+@ Uses 12 bytes of permscratch
+@ Trashes r0-3
+ GET_CANARY r0,CTAG13,6
+ push {r0,r14}
+ movs r0,#12
+ ldr r1,=permscratch
+ bl makesmallperm           @ Store the random words in a random order to make 2nd order attacks harder
+ movs r1,#11
+1:
+ push {r1}
+ bl gen_rand_sha_nonpres
+ pop {r1}
+ ldr r2,=permscratch
+ ldrb r2,[r2,r1]
+ getchaffaddress r3
+ str r0,[r3,r2,lsl#2]
+ subs r1,r1,#1
+ bpl 1b
+ pop {r0,r14}
+ CHK_CANARY r0,CTAG13,6
+ bx r14
 
 .balign 4
-.thumb_func
-gen_lut_sbox:
-@ set both lut_a and lut_b to the S-box table
-@ returns r0=lut_a+256, r1=lut_b+256
- push {r14}
- GET_CANARY r14,CTAG9
- push {r14}                  @ similarly, the inverse S-box is an affine transformation followed by an inverse
- bl gen_lut_inverse          @ first generate the table of inverses in lut_a
- mov r14,#256
+refreshchaff_and_lfsr:
+@ Update 48 bytes of chaff values (random load values) using faster RNG than used for randomisechaff
+@ Re-randomise LFSR with SHA
+@ Uses 12 bytes of permscratch
+@ Trashes r0-3,12
+ GET_CANARY r0,CTAG14,6
+ push {r0,r14}
+
+@ Refresh LFSR using SHA to make it harder to reverse-engineer LFSR sequence
+ bl gen_rand_sha_nonpres
+ ldr r1,=rstate_lfsr
+ ldr r2,[r1]
+ adds r2,r2,r0
+ beq 1f           @ Don't update LFSR state to 0
+ str r2,[r1]
 1:
- ldrb r2,[r0]
- eors r3,r2,r2,lsl#1         @ convolve byte with 0x1f
- eors r3,r3,r3,lsl#2
- eors r3,r3,r2,lsl#4
- eors r2,r3,r3,lsr#8
- eor r2,r2,#0x63             @ and add 0x63
- strb r2,[r0],#1
- strb r2,[r1],#1
- subs r14,r14,#1
- bne 1b
- pop {r14}
- CHK_CANARY r14,CTAG9
- pop {r15}
 
-.if NEED_INV_ROUNDS
+@ Choose a random order to update chaff words to make 2nd order attacks harder
+ movs r0,#12
+ ldr r1,=permscratch
+ bl makesmallperm
+ 
+ movs r1,#11
+1:
+ push {r1}
+ bl gen_rand_lfsr_nonpres
+ pop {r1}
+ ldr r2,=permscratch
+ ldr r3,=chaff
+ ldrb r2,[r2,r1]
+ ldr r12,[r3,r2,lsl#2]
+ add r0,r0,r12
+ str r0,[r3,r2,lsl#2]
+ subs r1,r1,#1
+ bpl 1b
+ pop {r0,r14}
+ CHK_CANARY r0,CTAG14,6
+ bx r14
+
 .balign 4
 .thumb_func
-gen_lut_inv_sbox:
-@ set lut_a to the inverse S-box table
- push {r14}
- GET_CANARY r14,CTAG10
- push {r14}
- bl gen_lut_sbox             @ get the forwards S-box
- sub r0,r0,#256
- sub r1,r1,#256
- mov r2,#0
+@ Do sbox on the four bytes of the 4-way share r4-r7
+@ Trashes r0,r8-r12
+init_key_sbox:
+ GET_CANARY r12,CTAG15,6
+ push {r1-r3,r12,r14}
+ bl gen_rand_sha_nonpres; mov r8,r0
+ bl gen_rand_sha_nonpres; mov r9,r0
+ bl gen_rand_sha_nonpres; mov r10,r0
+ bl gen_rand_sha_nonpres; mov r11,r0
+ ldr r0,=fourway                @ Write out 4-way share to memory
+ stmia r0,{r8-r11}              @ Save random values first to obscure saving of state
+ stmia r0,{r4-r7}
+ movs r4,#0                     @ Clear r4-r7 so that they don't interact with makesmallperm
+ movs r5,#0
+ movs r6,#0
+ movs r7,#0
+
+ bl randomisechaff              @ Randomise block of memory mainly used for obscuring loads
+
+ movs r0,#4
+ ldr r1,=permscratch
+ bl makesmallperm               @ Build random 4-way permutation determining order of bytes to be SBOXed
+ ldr r1,=permscratch            @ Write out random addresses in advance to save two registers (reusing permscratch)
+ ldr r4,[r1]
+ ldr r0,=fourway
+ uxtab r5,r0,r4
+ uxtab r6,r0,r4,ror#8
+ uxtab r7,r0,r4,ror#16
+ uxtab r8,r0,r4,ror#24
+ stmia r1,{r5-r8}               @ Store at r1=permscratch: fourway+perm[0], fourway+perm[1], fourway+perm[2], fourway+perm[3]
+
+ bl gen_rand_sha                @ Save some randomness for the resharing operation later
+ movs r7,r0
+ bl gen_rand_sha
+ movs r8,r0
+
+ ldr r2,=lut_a
+ ldr r3,=lut_b
+ ldr r0,[r2,#0x100]             @ R0 = a0 | a1<<8 | c0<<16 | c1<<24   (lut_a_map)
+ eors r10,r0,r0,lsr#8
+ uxtb r10,r10                   @ R10 = a0^a1
+ ldr r1,[r3,#0x100]             @ R1 = b0 | b1<<8 | d0<<16 | d1<<24   (lut_b_map)
+ eors r1,r0,r1
+ eors r4,r1,r1,lsr#8
+ uxtb r11,r4                    @ R11 = a0^a1^b0^b1
+ eor r10,r10,r11,lsl#8          @ R10 = a0^a1 | (a0^a1^b0^b1)<<8
+ movs r12,r1,ror#16             @ R12 = c0^d0 | (c1^d1)<<8 | junk<<16 | junk<<24
+
+ ldr r1,=permscratch
+ ldr r11,=chaff
+@ Using r1=permutedfourwaypointer, r2=lut_a, r3=lut_b, r7,r8=randomness, r10=(a0^a1)|(a0^a1^b0^b1)<<8, r11=chaff, r12=(c0^d0)|(c1^d1)<<8|junk
 1:
- ldrb r3,[r1],#1             @ get y=S-box(x)...
- strb r2,[r0,r3]             @ ... and store x at location y
- adds r2,r2,#1
- cmp r2,#255
- bls 1b
- pop {r14}
- CHK_CANARY r14,CTAG10
- pop {r15}
-.endif
-.endif
+ ands r5,r1,#12
+ adds r5,r11,r5                 @ Align chaff address to r1
+ ldr  r6,[r1],#4                @ r6 = fourway + perm[i] (i=0-3, loop iteration)
+ ldr  r5,[r5]                   @ Random load to mask previous load
+
+ ands r9,r6,#12
+ add  r9,r11,r9                 @ r9 = chaff address aligned to (r6 bic 3) mod 16
+ ldrb r4,[r6,#0]
+ ldr  r14,[r9,#0]               @ Random load to mask previous load
+ eor  r4,r4,r10
+ eor  r4,r4,r14,lsl#8           @ Add in some junk in bits 8-31
+
+ ldrb r5,[r6,#4]
+ ldr  r14,[r9,#4]               @ Random load to mask previous load
+ eors r4,r4,r5
+ eor  r4,r4,r14,lsl#8           @ Add in some junk in bits 8-31
+
+ ldrb r5,[r6,#8]
+ ldr  r14,[r9,#8]               @ Random load to mask previous load
+ eors r4,r4,r5
+ eor  r4,r4,r14,lsl#8           @ Add in some junk in bits 8-31
+
+ ldrb r5,[r6,#12]
+ ldr  r14,[r9,#12]              @ Random load to mask previous load
+ eors r4,r4,r5                  @ r4 = unsharedbyte[perm[i]]^a0^a1 | junk
+ eor  r4,r4,r14,lsl#8           @ Add in some junk in bits 8-31
+
+ ands r14,r4,#255
+ ldrb r5,[r2,r14]               @ r5 = lut_a[unsharedbyte[perm[i]]^a0^a1]
+ and  r14,r4,#15
+ add  r14,r14,#32
+ ldrb r14,[r11,r14]             @ Random load to mask previous load (r2 and r11 are both 0 mod 16)
+ eors r5,r5,r12                 @ r5 = lut_a[unsharedbyte[perm[i]]^a0^a1]^c0^d0 | junk<<8 | junk<<16 | junk<<24
+@ split r5 into two shares and store at [r6,#0] and [r6,#4]
+ strb r7,[r6,#0]
+ eors r5,r5,r7
+ strb r5,[r6,#4]
+
+ mov r5,r10,lsr#8               @ r5=a0^a1^b0^b1
+ ldr  r14,[r11,#44]             @ Need to eor into a random destination register
+ eors r14,r4,r5                 @ r14 = unsharedbyte[perm[i]]^b0^b1 | junk<<8
+ and r14,r14,#255
+
+ ldrb r5,[r3,r14]               @ r5 = lut_b[unsharedbyte[perm[i]]^b0^b1]
+ and  r14,r14,#15
+ add  r4,r11,#24
+ ldrb r14,[r4,r14]              @ Random load to mask previous load (r3==8 and r11==0 mod 16)
+ eor  r5,r5,r12,ror#8           @ r5 = lut_b[unsharedbyte[perm[i]]^b0^b1]^c1^d1 | junk<<8 | junk<<16 | junk<<24
+@ split r5 into two shares and store at [r6,#8] and [r6,#12]
+ strb r8,[r6,#8]
+ eors r5,r5,r8
+ strb r5,[r6,#12]
+
+ movs r7,r7,ror#8
+ movs r8,r8,ror#8
+
+ tst r1,#12                     @ This does 4 loop iterations because permscratch is guaranteed to be 0 mod 16
+ bne 1b
+
+ ldr r0,=fourway
+ ldmia r0,{r4-r7}               @ Load SBOXed values back into register r4-r7
+ ldmia r11,{r8-r12,r14}         @ Random load to mask previous load and to obfuscate registers
+
+ pop {r1-r3,r12,r14}
+ CHK_CANARY r12,CTAG15,6
+ bx r14
 
-@ if we are using direct S-box lookup then [inv_]map_sbox_s is the same as lutmap_state_s
-.if !SBOX_VIA_INV
 .balign 4
 .thumb_func
-map_sbox_s:
-.if NEED_INV_ROUNDS
-.thumb_func
-inv_map_sbox_s:
-.endif
+@ r1 = pointer to 4 x 4-way share (16 words); left unchanged
+@ r3 = rkey_s+40*roundkeynumber; advanced by 40
+@ Trashes r8-r12
+@ If i = word number 0..3,
+@ Aptr=memory word pointer to block of 20 bytes containing H&V-rotated share A roundkey (similarly B), then
+@ vpermA=Aptr[4]>>30, vpermB=Bptr[4]>>30, and
+@ roundkey shareA(i) = Aptr[i+vpermA mod 4] ror #((i+vpermA mod 4)^th byte of Aptr[4])
+@ roundkey shareB(i) = Bptr[i+vpermB mod 4] ror #((i+vpermB mod 4)^th byte of Bptr[4])+16
+storeroundkey:
+ GET_CANARY r8,CTAG16,6
+ push {r2,r8,r14}
+
+@ eor two 4-way share components to make a component of a 2-way share
+@ Note that we load from 4-way share at a random address then convert to 2-way share and
+@ store at a fixed address, rather than the other way around, so that 2-way shares are obscured
+@ by vperm (we don't know which 2-way share is being processed at a particular point in time).
+@ And (if RK_ROR) we rotate first before EORing down to 2-way, so there is never an unrotated 2-way share
+
+ bl gen_rand_sha             @ Get r0 = vperm for shareA of the round key
+ str r0,[r3,#16]
+ mov r8,r0,lsr#30
+ rsb r8,r8,#0                @ r8=-vperm
+.if RK_ROR
+ movs r2,#0
+ usub8 r2,r2,r0              @ r2=-hperms
 .endif
-
-@ map all bytes of the state through the LUT
-.balign 4
-lutmap_state_s:
- push {r14}
- GET_CANARY r14,CTAG11
- push {r14}
- ldr r12,=lut_a
- ldr r14,=lut_b
- mov r0,#0x8000              @ "counter" for bytes of state mapped
+ mov r9,#4
 1:
- ldr r3,[r12,#0x100]         @ lut_a_map
- eor r1,r4,r3                @ share A of x ^ share A of lut_a address map
- eor r1,r1,r8                @ ^ share B of x
- eor r1,r1,r3,ror#8          @ ^ share B of lut_a address map
- uxtb r1,r1
- ldrb r1,[r12,r1]            @ look up in lut_a
- eor r1,r1,r3,ror#16         @ ^ share A of lut_a data map
- ldr r3,[r14,#0x100]         @ lut_b_map
- eor r1,r1,r3,ror#24         @ ^ share B of lut_b data map, generating share A of the result
-
- eor r2,r4,r3                @ share A of x ^ share A of lut_b address map
- eor r2,r2,r8                @ ^ share B of x
- eor r2,r2,r3,ror#8          @ ^ share B of lut_b address map
- uxtb r2,r2
- ldrb r2,[r14,r2]            @ look up in lut_b
- eor r2,r2,r3,ror#16         @ ^ share A of lut_b data map
- ldr r3,[r12,#0x100]         @ lut_a_map
- eor r2,r2,r3,ror#24         @ ^ share B of lut_a data map, generating share B of the result
-
- lsrs r4,#8                  @ shift share A of state down one byte...
- orrs r4,r4,r5,lsl#24
- lsrs r5,#8
- orrs r5,r5,r6,lsl#24
- lsrs r6,#8
- orrs r6,r6,r7,lsl#24
- lsrs r7,#8
- orrs r7,r7,r1,lsl#24        @ and insert share A of mapped byte
-
- lsrs r8,#8                  @ shift share B of state down one byte...
- orrs r8,r8,r9,lsl#24
- lsrs r9,#8
- orrs r9,r9,r10,lsl#24
- lsrs r10,#8
- orrs r10,r10,r11,lsl#24
- lsrs r11,#8
- orrs r11,r11,r2,lsl#24      @ and insert share B of mapped byte
-
- lsrs r0,#1                  @ count 16 iterations
+ and r8,r8,#3
+ adds r0,r1,r8,lsl#4
+
+ ldmia r0,{r10,r11}
+.if RK_ROR
+ mov r10,r10,ror r2
+ mov r11,r11,ror r2
+ movs r2,r2,ror#8
+.endif
+ eor r10,r10,r11
+ str r10,[r3],#4
+ add r8,r8,#1
+ subs r9,r9,#1
  bne 1b
- pop {r14}
- CHK_CANARY r14,CTAG11
- pop {r15}
 
-@ perform one EOR step in round key generation
-@ !!! can we introduce some more randomness into the shares here?
-.balign 4
-grk_s_step:
- ldmia r0!,{r5-r7,r12}       @ from last round key_a but one
- eors r5,r5,r4
- eors r6,r6,r5
- eors r7,r7,r6
- eors r12,r12,r7
- stmia r1!,{r5-r7,r12}
- mov r4,r12
+ adds r1,r1,#8
+ adds r3,r3,#4               @ skip over vperm (already stored)
+
+ bl gen_rand_sha             @ Get r0 = vperm for shareB of the round key
+ str r0,[r3,#16]
+ mov r8,r0,lsr#30
+ rsb r8,r8,#0                @ r8=-vperm
 .if RK_ROR
- movs r12,#0
- str r12,[r0],#4
- str r12,[r1],#4
+ movs r2,#0
+ usub8 r2,r2,r0              @ r2=-hperms
 .endif
- ldmia r0!,{r9-r11,r12}      @ from last round key_a but one
- eors r9,r9,r8
- eors r10,r10,r9
- eors r11,r11,r10
- eors r12,r12,r11
- stmia r1!,{r9-r11,r12}
- mov r8,r12
+ mov r9,#4
+ ldr r12,=RKshareC
+ ldr r12,[r12]
+1:
+ and r8,r8,#3
+ adds r0,r1,r8,lsl#4
+ ldmia r0,{r10,r11}
+ eor r10,r10,r12             @ Mix in RKshareC into round key shareB
 .if RK_ROR
- movs r12,#0
- str r12,[r0],#4
- str r12,[r1],#4
-.endif
- bx r14
+ mov r10,r10,ror r2
+ mov r11,r11,ror r2
+ movs r2,r2,ror#8
+.endif
+ mov r10,r10,ror#16
+ mov r11,r11,ror#16
+ eor r10,r10,r11
+ str r10,[r3],#4
+ add r8,r8,#1
+ subs r9,r9,#1
+ bne 1b
 
-.macro jitter rx
-.if IK_JITTER
- rors \rx,\rx,#1
- bcc \@f
-\@:
-.else
-@ nothing
-.endif
-.endm
+ subs r1,r1,#8               @ Restore r1 = (r1 on entry)
+ adds r3,r3,#4               @ Set     r3 = (r3 on entry) + 40
+
+ pop {r2,r8,r14}
+ CHK_CANARY r8,CTAG16,6
+ bx r14
 
 .balign 4
 .thumb_func
-init_key:
-@ r0: rkeys_s
-@ r1: raw key data (32 bytes)
-.if RK_ROR
-@ rkeys_s is a 40*15=600-byte region
-@ each of the 15 round keys is represented as two 4-word regions rka[0..3] and rkb[0..3], each of which is followed by a word containing
-@ four byte-wide rotate values ra[i] and rb[i]
-@ such that rk[i]=(rka[i] ROR ra[i])^(rkb[i] ROR rb[i]) gives the round keys
-@ rotations always operate mod 32, so we do not bother to mask the rotate amounts to 5 bits
-.else
-@ rkeys_s is a 32*15=480-byte region
-@ each of the 15 round keys is represented as two 4-word regions rka[0..3] and rkb[0..3]
-@ such that rk[i]=rka[i]^rkb[i] gives the round keys
-.endif
- GET_CANARY r12,CTAG12
- push {r4-r12,r14}
-.if IK_JITTER
- push {r0,r1}
- bl gen_rand
- mov r12,r0
- pop {r0,r1}
-.endif
- jitter r12
- mov r4,r0
- mov r5,r1
-.if IK_SHUFREAD
- SET_COUNT 73
- add r6,r4,#128              @ use 64 bytes of temporary space at r0+128 for buf
- mov r7,#0
+init_key_4way:
+@ On entry, r0 points to 4-way shared raw key data (128 bytes)
+@ The format is a0 b0 c0 d0 a1 b1 c1 d1 ... a7 b7 c7 d7
+@ That is, each word, K, of the original 256-bit key is expanded into four words whose exclusive OR is K.
+@
+@ On exit, rkey_s, a 40*15=600-byte region, is filled as follows.
+@ Each of the 15 round keys is represented as two 5-word regions rka[0..4] and rkb[0..4],
+@ each of which consists of 4 words of round key followed by a word encoding vperm and rotation (RK_ROR) information.
+@ In addition a common share word, RKshareC, is set randomly.
+@ For a given round, rk[i] = the i^th word of the actual round key is given by:
+@ vpermA=rka[4]>>30
+@ vpermB=rkb[4]>>30
+@ rka_unrot[i] = rka[i+vpermA mod 4] ror #((i+vpermA mod 4)^th byte of rka[4])
+@ rkb_unrot[i] = rkb[i+vpermB mod 4] ror #((i+vpermB mod 4)^th byte of rkb[4])+16
+@ rk[i] = rka_unrot[i] ^ rkb_unrot[i] ^ RKshareC
+
+ GET_CANARY r12,CTAG17,6
+ push {r0-r12,r14}
+ 
+@ Transfer 4-way key into local workspace, rerandomising the shares
+ mov r5,r0                   @ r5=4-way key input
+ bl randomisechaff
+ ldr r6,=rkey4way
+ movs r7,#8
 1:
- bl gen_rand
- and r0,r0,#0x1f
- strb r0,[r6,#32]            @ buf contains each number 0..31 and 32 more random numbers in that range
- strb r7,[r6],#1             @ so each number at least once...
- adds r7,r7,#1
- cmp r7,#32
+ ldmia r5!,{r1-r4}
+ bl gen_rand_sha; eors r1,r1,r0; eors r4,r4,r0
+ bl gen_rand_sha; eors r2,r2,r0; eors r4,r4,r0
+ bl gen_rand_sha; eors r3,r3,r0; eors r4,r4,r0
+ stmia r6!,{r1-r4}
+ subs r7,r7,#1
  bne 1b
- CHK_COUNT 73
- add r0,r4,#128
- mov r10,r0
- movs r1,#64
- movs r2,#200
- bl array_shuf               @ ... in a random order
- mov r11,#63
- CHK_COUNT 74
-.else
- mov r6,#31
-.endif
+
+@ Now raw key is stored in rkey4way[], construct 2-way share in rkey_s[] for
+@ the 128-bit roundkeys 0 and 1, then expand from 2 to 15 roundkeys.
+ bl gen_rand_sha_nonpres
+ ldr r12,=RKshareC
+ str r0,[r12]                @ Make RKshareC random word
+ ldr r3,=rkey_s              @ r3=rkey_s
+ ldr r1,=rkey4way            @ r1=rkey4way
+ bl storeroundkey            @ Store round key 0 and advance r3 by 40
+ adds r1,r1,#64
+ bl storeroundkey            @ Store round key 1 and advance r3 by 40
+ adds r1,r1,#48
+ ldmia r1!,{r4-r7}           @ r4-r7 = 4-way share of previous round key word
+                             @ r1=rkey4way+128 on entry to main loop
+ movs r2,#0                  @ r2=word counter (0-51), offset from word 8
+
+@ Note that r1-r3 are not sensitive values, so it's safe to stack
+@ them and conditionally branch on them.
+
+@ rkey4way = 8 x 4 consecutive 4-way share words as cyclic buffer of
+@   Rounds 0,1     Rounds 2,3            Rounds 12,13       Round 14
+@   a0 b0 c0 d0 -> a8 b8 c8 d8 -> ... -> a48 b48 c48 d48 -> a56 b56 c56 d56
+@   a1 b1 c1 d1 -> a9 b9 c9 d9           a49 b49 c49 d49    a57 b57 c57 d57
+@   a2 b2 c2 d2    etc                   a50 b50 c50 d50    a58 b58 c58 d58
+@   a3 b3 c3 d3                          a51 b51 c51 d51    a59 b59 c59 d59
+@   a4 b4 c4 d4                          a52 b52 c52 d52    ===============
+@   a5 b5 c5 d5                          a53 b53 c53 d53
+@   a6 b6 c6 d6                          a54 b54 c54 d54
+@   a7 b7 c7 d7                          a55 b55 c55 d55
+
+init_key_expandloop:
+@ r1 = pointer past one of eight 4-way shares of a roundkey word in the above cyclic buffer (r1=rkey4way+16i for i=1,...,8)
+@ r2 = round key word counter (0-51), offset from word 8 (counting expanded roundkey words)
+@ r3 = pointer to rkey_s+40*roundnumber = rkey_s+40*(2+[r2/4])
+@ r4-r7 = 4-way share of previous roundkey word
+
+ tst r2,#7
+ bne 1f
+ subs r1,r1,#128             @ Every 8th word, reset cyclic buffer pointer and do ROTWORD
+ movs r4,r4,ror#8
+ movs r5,r5,ror#8
+ movs r6,r6,ror#8
+ movs r7,r7,ror#8
 1:
- SET_COUNT 104
- jitter r12
-.if IK_SHUFREAD
- ldrb r6,[r10,r11]           @ now process the raw key bytes in the order given by buf, some more than once
-.endif
- lsrs r8,r6,#4
-.if RK_ROR
- add r7,r6,r8,lsl#3
- add r7,r7,r8,lsl#4          @ 0..15 -> 0..15, 16..31 -> 40..55
-.else
- add r7,r6,r8,lsl#4          @ 0..15 -> 0..15, 16..31 -> 32..47
-.endif
- ldrb r9,[r5,r6]             @ fetch key byte
- bl gen_rand                 @ make random shares of round key 0
- CHK_COUNT 104
- eor r9,r9,r0
- strb r9,[r4,r7]
-.if RK_ROR
- adds r7,#20
-.else
- adds r7,#16
-.endif
- strb r0,[r4,r7]
-.if IK_SHUFREAD
- subs r11,r11,#1
-.else
- subs r6,r6,#1
-.endif
- CHK_COUNT 105
- bpl 1b
- CHK_COUNT 106
- mov r0,r4
-.if RK_ROR
- movs r1,#0
- str r1,[r0,#16]
- str r1,[r0,#36]
-.endif
-@ now generate the other round keys
- movs r2,#1                  @ round constant
-.if RK_ROR
- add r1,r0,#80
- ldr r4,[r0,#52]             @ last word from previous round key_a
- ldr r8,[r0,#72]             @ last word from previous round key_b
-.else
- add r1,r0,#64
- ldr r4,[r0,#44]             @ last word from previous round key_a
- ldr r8,[r0,#60]             @ last word from previous round key_b
-.endif
- CHK_COUNT 107
+
+ tst r2,#3
+ bne 1f
+ bl init_key_sbox            @ Every 4th word, do SUBBYTES (sbox) on r4-r7
 1:
- SET_COUNT 42
- rors r4,r4,#8
- rors r8,r8,#8
- push {r0-r3}
-.if IK_JUNK
- bl gen_rand                 @ put some junk in r5-r7, r9-r11
- mov r5,r0
- bl gen_rand
- mov r6,r0
- bl gen_rand
- mov r7,r0
- bl gen_rand
- mov r9,r0
- bl gen_rand
- mov r10,r0
- bl gen_rand
- mov r11,r0
-.endif
- CHK_COUNT 42
-.if IK_REMAP
- bl remap
-.endif
- CHK_COUNT 43
-.if IK_PERM
- bl gen_rand
- bl vperm
- push {r0}
- bl gen_rand
- bl hperm
- push {r0}
- bl map_sbox_s               @ this actually maps all of r4..r7, r8..r11 - i.e., trashes r5, r6, r7, r9, r10, r11
- pop {r0}
- bl hperm
- pop {r0}
- bl vperm
-.else
- bl map_sbox_s               @ this actually maps all of r4..r7, r8..r11 - i.e., trashes r5, r6, r7, r9, r10, r11
-.endif
- CHK_COUNT 44
- pop {r0-r3}
- eors r4,r4,r2               @ round constant
- bl grk_s_step
- CHK_COUNT 45
- lsls r2,#1                  @ step round constant
- cmp r2,#0x40                @ done?
- bhi 2f
- push {r0-r2}
- bl map_sbox_s               @ this actually maps all of r4..r7, r8..r11 - i.e., trashes r5, r6, r7, r9, r10, r11
- CHK_COUNT 46
- pop {r0-r2}
- bl grk_s_step
- CHK_COUNT 47
- b 1b
-2:
- CHK_COUNT 46
- pop {r4-r12,r14}
- CHK_CANARY r12,CTAG12
+
+ tst r2,#7
+ bne 1f
+ movs r0,r2,lsr#3
+ mov r8,#1
+ movs r8,r8,lsl r0
+ eors r4,r4,r8               @ Every 8th word, add in round constant
+1:
+
+ ldmia r1,{r8-r11}           @ eor with key from two rounds ago (r1 is advanced by 16 at the stmia below)
+ eors r4,r4,r8
+ eors r5,r5,r9
+ eors r6,r6,r10
+ eors r7,r7,r11
+ stmia r1!,{r4-r7}
+
+ add r2,r2,#1
+ tst r2,#3
+ bne 1f
+ subs r1,r1,#64
+ bl storeroundkey            @ Store round key 1+r2/4 and advance r3 by 40
+ adds r1,r1,#64
+1:
+
+ cmp r2,#52
+ bne init_key_expandloop
+
+ pop {r0-r12,r14}
+ CHK_CANARY r12,CTAG17,6
  bx r14
 
-@ add the round key shares pointed to by r12 into the state shares
+.ltorg
+
+@ Add the round key shares pointed to by r12 into the state shares
+@ Trashes r0-r3
 .balign 4
 addrkey_s:
- push {r14}
- GET_CANARY r14,CTAG13
- push {r0-r3,r14}
-.if RK_ROR
- ldmia r12!,{r0-r3,r14}      @ share A of round key + ROR data
- rors r0,r0,r14              @ ROR first word
- eors r4,r4,r0               @ add to state
- rev16 r0,r14                @ move byte 1 of ROR data into byte 0
- rors r1,r1,r0
- eors r5,r5,r1
- rev r0,r0                   @ move byte 2 of ROR data into byte 0
- rors r2,r2,r0
- eors r6,r6,r2
- rev16 r0,r0                 @ move byte 3 of ROR data into byte 0
- rors r3,r3,r0
- eors r7,r7,r3
-.else
- ldmia r12!,{r0-r3}          @ share A of round key
- eors r4,r4,r0
- eors r5,r5,r1
- eors r6,r6,r2
- eors r7,r7,r3
-.endif
-.if RK_ROR
- ldmia r12!,{r0-r3,r14}      @ share B of round key + ROR data
- rors r0,r0,r14              @ ROR first word
- eors r8,r8,r0               @ etc., as above
- rev16 r0,r14
- rors r1,r1,r0
- eors r9,r9,r1
- rev r0,r0
- rors r2,r2,r0
- eors r10,r10,r2
- rev16 r0,r0
- rors r3,r3,r0
- eors r11,r11,r3
-.else
- ldmia r12!,{r0-r3}          @ share B of round key
- eors r8 ,r8 ,r0
- eors r9 ,r9 ,r1
- eors r10,r10,r2
- eors r11,r11,r3
-.endif
- pop {r0-r3,r14}
- CHK_CANARY r14,CTAG13
- pop {r15}
-
-.if NEED_ROUNDS
 
-@ perform encryption rounds
-@ r4-r7, r8-r11: state
-@ preserves r0-r3,r12
-.balign 4
-rounds_s:
- push {r14}
- GET_CANARY r14,CTAG14
- push {r0-r3,r12,r14}
- mov r2,#0                   @ round counter
-1:
- ldr r12,=rkey_s
- add r12,r12,r2,lsl#5        @ pointer to key shares for this round
-.if RK_ROR
- add r12,r12,r2,lsl#3
-.endif
- bl addrkey_s
-.if ST_VPERM
- bl gen_rand
- bl vperm                    @ V shuffle
-.endif
- push {r0,r2}                @ save round count
-.if ST_HPERM
- bl gen_rand
- bl hperm                    @ H shuffle
- push {r0}
-.endif
- bl map_sbox_s
-.if ST_HPERM
- pop {r0}
- bl hperm                    @ undo H shuffle
-.endif
- bl shift_rows_s
- ldr r2,[r13,#4]             @ increment round counter on stack
- adds r2,r2,#1
- str r2,[r13,#4]
- cmp r2,#14
- beq 2f                      @ break from loop? (last round has no mix_cols)
- bl mix_cols_s
- pop {r0,r2}
+ ldr r0,=chaff               @ guaranteed 0 mod 16
 .if ST_VPERM
- bl vperm                    @ undo V shuffle
-.endif
- b 1b
-2:
-@ bl inv_mix_cols_s @ or could skip in last round above
- pop {r0,r2}
-.if ST_VPERM
- bl vperm                    @ undo V shuffle
-.endif
-.if RK_ROR
- ldr r12,=rkey_s+14*40      @ final round key shares
+ ldr r3,=statevperm
+ ldr r3,[r3]                 @ r3=vperm state rotation in bottom two bits
+ ldr r2,[r0,#12]             @ barrier load
 .else
- ldr r12,=rkey_s+14*32      @ final round key shares
-.endif
- bl addrkey_s
- pop {r0-r3,r12,r14}
- CHK_CANARY r14,CTAG14
- pop {r15}
+ movs r3,#0
 .endif
+ bfi r0,r12,#0,#4            @ match chaff pointer (r0) to roundkey ptr (r12) mod 16
+ ldr r1,[r12,#16]            @ r1=vperm key rotation in top two bits
+ ldr r2,[r0,#16]             @ barrier load
 
-.if NEED_INV_ROUNDS
-@ perform decryption rounds
-@ r4-r7, r8-r11: state
-@ preserves r0-r2
-.balign 4
-inv_rounds_s:
- push {r14}
- GET_CANARY r14,CTAG15
- push {r0-r2,r14}
+ rsb r2,r3,r1,lsr#30         @ r2=vpermkeyrot-vpermstaterot
+@ Read shareA of roundkey, offset by vpermkeyrot-vpermstaterot, and eor it into shareA of state, offset by -vpermstaterot
+@ r1=rkeyArotdata, r2=vpermkeyrot-vpermstaterot, r3=statevperm, r4-r11=state, r12=roundkeyAptr
 .if RK_ROR
- ldr r12,=rkey_s+14*40      @ final round key shares
+ movs r0,r2,lsl#3
+ movs r1,r1,ror r0
+ ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1;                   rors r0,r0,r1; eors r4,r4,r0
+ ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; movs r1,r1,ror#8; rors r0,r0,r1; eors r5,r5,r0
+ ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; movs r1,r1,ror#8; rors r0,r0,r1; eors r6,r6,r0
+ ands r2,r2,#3; ldr r0,[r12,r2,lsl#2];                movs r1,r1,ror#8; rors r0,r0,r1; eors r7,r7,r0
 .else
- ldr r12,=rkey_s+14*32      @ final round key shares
-.endif
- bl addrkey_s
- mov r2,#13                  @ round counter
- push {r2}
-.if ST_VPERM
- bl gen_rand
- bl vperm                    @ V shuffle
- push {r0}
-.endif
- b 2f                        @ into middle of loop (last round has no mix_cols)
-1:
- push {r2}
-.if ST_VPERM
- bl gen_rand
- bl vperm                    @ V shuffle
- push {r0}
-.endif
- bl inv_mix_cols_s
-2:
- bl inv_shift_rows_s
-.if ST_HPERM
- bl gen_rand
- bl hperm                    @ H shuffle
- push {r0}
-.endif
- bl inv_map_sbox_s
-.if ST_HPERM
- pop {r0}
- bl hperm                    @ undo H shuffle
-.endif
-.if ST_VPERM
- pop {r0}
- bl vperm                    @ undo V shuffle
-.endif
- pop {r2}
- ldr r12,=rkey_s
- add r12,r12,r2,lsl#5        @ pointer to key shares for this round
+ ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; eors r4,r4,r0
+ ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; eors r5,r5,r0
+ ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; eors r6,r6,r0
+ ands r2,r2,#3; ldr r0,[r12,r2,lsl#2];                eors r7,r7,r0
+.endif
+ clear03_preserve_r3
+ add r12,r12,#20
+ @ r0=chaff+16, r3=statevperm, r4-r11=state, r12=roundkeyBptr
+ 
+ bfi r0,r12,#0,#4            @ match chaff pointer (r0) to roundkey ptr (r12) mod 16
+ ldr r1,[r12,#16]            @ r1=vperm key rotation in top two bits
+ ldr r2,[r0,#16]             @ barrier load
+ rsb r2,r3,r1,lsr#30         @ r2=vpermkeyrot-vpermstaterot
+ ldr r3,=RKshareC            @ r3=common round key shareC
+ bfi r0,r3,#0,#4
+ ldr r3,[r3]
+ ldr r0,[r0]                 @ barrier load
+ 
+@ Read shareB of roundkey, offset by vpermkeyrot-vpermstaterot, and eor it into shareB of state, offset by -vpermstaterot
+@ r1=rkeyBrotdata, r2=vpermkeyrot-vpermstaterot, r3=RKshareC, r4-r11=state, r12=roundkeyB ptr
 .if RK_ROR
- add r12,r12,r2,lsl#3
-.endif
- bl addrkey_s
- subs r2,r2,#1
- bpl 1b
- pop {r0-r2,r14}
- CHK_CANARY r14,CTAG15
- pop {r15}
+ movs r0,r2,lsl#3
+ movs r1,r1,ror r0
+ ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r8,r8,r3,ror#16;   adds r2,r2,#1;                   rors r0,r0,r1; eor r8,r8,r0
+ ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r9,r9,r3,ror#16;   adds r2,r2,#1; movs r1,r1,ror#8; rors r0,r0,r1; eor r9,r9,r0
+ ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r10,r10,r3,ror#16; adds r2,r2,#1; movs r1,r1,ror#8; rors r0,r0,r1; eor r10,r10,r0
+ ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r11,r11,r3,ror#16;                movs r1,r1,ror#8; rors r0,r0,r1; eor r11,r11,r0
+.else
+ ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r8,r8,r3,ror#16;   adds r2,r2,#1; eors r8,r8,r0
+ ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r9,r9,r3,ror#16;   adds r2,r2,#1; eors r9,r9,r0
+ ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r10,r10,r3,ror#16; adds r2,r2,#1; eors r10,r10,r0
+ ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r11,r11,r3,ror#16;                eors r11,r11,r0
 .endif
+ clear03
+ bx r14
 
-.if INCLUDE_ENCRYPT_CBC
 .balign 4
 .thumb_func
-@ encrypt data in place
+@ de/encrypt data in place
 @ r0: ivec
 @ r1: buf
-@ r2: number of blocks
-@ this implementation does not scramble the shares properly; consider a better implementation
-@ if security is required in encryption
-cbc_encrypt_s:
- push {r14}
- GET_CANARY r14,CTAG16
- push {r4-r11,r14}
- ldmia r0,{r4-r7}            @ load iv into share a
-2:
- ldmia r1,{r8-r11}           @ load plaintext into share b
- bl rounds_s
- eor r4,r4,r8                @ convert shared to non-shared
- eor r5,r5,r9
- eor r6,r6,r10
- eor r7,r7,r11
- stmia r1!,{r4-r7}
- subs r2,r2,#1
- bne 2b
- pop {r4-r11,r14}
- CHK_CANARY r14,CTAG16
- pop {r15}
+@ r3: n, number of blocks, n>0 (r2 is the buffer; see register list at ctr_crypt_s below)
+.if CT_BPERM
+@ In AES-CTR each block can be independently en/decrypted as the encryption only depends on the IV,
+@ the key, and the block number. We can therefore process them in any order, and using a
+@ random order helps to defeat attacks that work on the output of the AES, since an attacker
+@ wouldn't know what plaintext or ciphertext corresponds to a particular instruction.
 .endif
 
-.if INCLUDE_DECRYPT_CBC
-.balign 4
-.thumb_func
-@ decrypt data in place
-@ r0: ivec
-@ r1: buf
-@ r2: number of blocks
-@ return
-@ r0=0 OK
-@ r0=1: fault detected
-@ could be simplified to use more ldmia:s at the cost of another 8 words of stack
-cbc_decrypt_s:
- push {r14}
- GET_CANARY r14,CTAG17
- push {r4-r11,r14}
- ldmia r0,{r4-r7}            @ load IV
- bl ns_to_s
- push {r4-r11}               @ IV shares on the stack
-2:
- bl remap
- bl ref_round_keys_s         @ refresh the round keys
- ldmia r1,{r4-r7}            @ load the ciphertext
- bl ns_to_s                  @ convert to shares
- bl inv_rounds_s             @ do decryption rounds
-
-.if ROUND_TRIP_TEST
-
-@ compute plaintext {r4-r7}^{r8-r11}^{SP[0..3]}^{SP[4..7]}
-@ as shares {r4-r7}^{SP[0..3]}, {r8-r11}^{SP[4..7]}
- ldrd r0,r3,[r13,#0]
- eor r0,r0,r4
- eor r3,r3,r5
- strd r0,r3,[r13,#0]
- ldrd r0,r3,[r13,#8]
- eor r0,r0,r6
- eor r3,r3,r7
- strd r0,r3,[r13,#8]
- ldrd r0,r3,[r13,#16]
- eor r0,r0,r8
- eor r3,r3,r9
- strd r0,r3,[r13,#16]
- ldrd r0,r3,[r13,#24]
- eor r0,r0,r10
- eor r3,r3,r11
- strd r0,r3,[r13,#24]        @ plaintext_s now on the stack
- bl rounds_s                 @ restore original ciphertext (or we could have saved it)
-
- ldmia r1!,{r0,r3}           @ reload actual ciphertext and compare to check for faults
- eors r0,r0,r4
- eors r0,r0,r8
- bne 1f                      @ mismatch? could repeat this bne or add other protection against its being skipped
- eors r3,r3,r5
- eors r3,r3,r9
- bne 1f
- ldmia r1!,{r0,r3}
- eors r0,r0,r6
- eors r0,r0,r10
- bne 1f
- eors r3,r3,r7
- eors r3,r3,r11
- bne 1f
- subs r1,r1,#16
-
- pop {r0,r3}                 @ now EOR plaintext shares on stack to recover non-shared plaintext
- ldr r14,[sp,#8]
- eors r0,r0,r14
- ldr r14,[sp,#12]
- eors r3,r3,r14
- stmia r1!,{r0,r3}           @ overwrite ciphertext with plaintext
+ctr_crypt_s:
+@ r0=IV_shareA, r1=IV_shareB, r2=cipher/plaintext buffer, r3=number of blocks
+ GET_CANARY r12,CTAG0,6
+ push {r0-r12,r14}           @ save all registers so that when we restore we overwrite any secrets
 
- pop {r0,r3}
- ldr r14,[sp,#8]
- eors r0,r0,r14
- ldr r14,[sp,#12]
- eors r3,r3,r14
- stmia r1!,{r0,r3}           @ overwrite ciphertext with plaintext
+ push {r0-r3}
+ 
+ SET_COUNT 93,6
 
- add r13,#16                 @ first share of plaintext has now been popped; skip the other share
+.if CT_BPERM
+@ Initialise 32 random numbers (which fit in half-words)
+@ r3=number of blocks
+ ldr r4,=bperm_rand
+ movs r5,#32
+1:
+ bl gen_rand_sha
+ umull r0,r2,r0,r3        @ Random number between 0 and n-1 (n=#blocks)
+ strh r2,[r4],#2
+ subs r5,r5,#1
+ bne 1b
+.endif
 
-.else
+ bl randomisechaff
 
-@ compute plaintext {r4-r7}^{r8-r11}^{SP[0..3]}^{SP[4..7]}
-@ as shares {r4-r7}^{SP[0..3]}, {r8-r11}^{SP[4..7]}
- pop {r0,r3}
- eor r4,r0,r4
- eor r5,r3,r5
- pop {r0,r3}
- eor r6,r0,r6
- eor r7,r3,r7
- pop {r0,r3}
- eor r8,r0,r8
- eor r9,r3,r9
- pop {r0,r3}
- eor r10,r0,r10
- eor r11,r3,r11              @ now plaintext_s in r4-r11
- eor r8,r8,r4                @ convert to non-shared
- eor r9,r9,r5
- eor r10,r10,r6
- eor r11,r11,r7              @ now plaintext_ns in r8-r11
- ldmia r1,{r4-r7}            @ ciphertext_ns in r4-r7
- stmia r1!,{r8-r11}          @ overwrite ciphertext_ns with plaintext_ns
- bl ns_to_s                  @ convert non-shared ciphertext to shared
+@ Refresh IVshareA and IVshareB, convert to ror#16 format and store the result at IV0
+@ Not doing shareC or state vperm at this point
+ pop {r0}
+ ldmia r0,{r4-r7}         @ r4-r7 = IVshareA
+ clear03 16
+ pop {r1}
+ ldmia r1,{r8-r11}        @ r8-r11 = IVshareB
+ clear03 32
+ bl gen_rand_sha_nonpres; eors r4,r4,r0; movs r1,#0; mov r8, r8, ror#16; eor r8, r8, r0,ror#16   @ Barriers between shares to prevent implicit r4^r8 etc
+ bl gen_rand_sha_nonpres; eors r5,r5,r0; movs r1,#0; mov r9, r9, ror#16; eor r9, r9, r0,ror#16
+ bl gen_rand_sha_nonpres; eors r6,r6,r0; movs r1,#0; mov r10,r10,ror#16; eor r10,r10,r0,ror#16
+ bl gen_rand_sha_nonpres; eors r7,r7,r0; movs r1,#0; mov r11,r11,ror#16; eor r11,r11,r0,ror#16
+ ldr r0,=IV0
+ stmia r0,{r4-r7}
+ adds r0,r0,#20
+ stmia r0,{r8-r11}
+@ "Decommission" IV0 so that it doesn't get stacked
+ bl gen_rand_sha_nonpres; movs r4,r0
+ bl gen_rand_sha_nonpres; movs r5,r0
+ bl gen_rand_sha_nonpres; movs r6,r0
+ bl gen_rand_sha_nonpres; movs r7,r0
+ bl gen_rand_sha_nonpres; mov  r8,r0
+ bl gen_rand_sha_nonpres; mov  r9,r0
+ bl gen_rand_sha_nonpres; mov r10,r0
+ bl gen_rand_sha_nonpres; mov r11,r0
+ pop {r1,r2}
+@ r1=cipher/plaintext buffer, r2=number of blocks
+
+ movs r3,#0
+ CHK_COUNT 93,6
+
+ctr_crypt_mainloop:
+ SET_COUNT 80,6
+@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter
+
+@ Do as much preparatory stuff as possible that doesn't involve the IV (to reduce interaction with it)
+ push {r1-r3}
+@ It's OK for execution time to depend on the block counter r3 ("public"), but not the block number (secret)
+
+ tst r3,#(REFCHAFF_PERIOD-1)
+ bne 1f
+ bl refreshchaff_and_lfsr
+1:
 
-.endif
+ ldr r3,[r13,#8]             @ get block count off the stack
+ tst r3,#(REMAP_PERIOD-1)
+ bne 1f
+ bl remap                    @ shuffle the LUTs; this preserves R3
+1:
+ CHK_COUNT 80,6
 
- push {r4-r11}               @ push ciphertext_s, replacing iv or previous ciphertext_s on stack
- subs r2,r2,#1               @ count the blocks
- bne 2b
- add r13,#32
- mov r0,#0                   @ return OK status
- pop {r4-r11,r14}
- CHK_CANARY r14,CTAG17
- pop {r15}
+ tst r3,#(REFROUNDKEYSHARES_PERIOD-1)
+ bne 1f
+ bl ref_roundkey_shares_s    @ refresh the round key shares
+1:
 
-.if ROUND_TRIP_TEST
+ ldr r3,[r13,#8]             @ get block count off the stack
+ tst r3,#(REFROUNDKEYHVPERMS_PERIOD-1)
+ bne 1f
+ bl ref_roundkey_hvperms_s   @ refresh the round key vperms
 1:
-@ fault here
- rcp_panic
-.endif
-.endif
 
-.if INCLUDE_CRYPT_CTR
-.balign 4
-.thumb_func
-@ de/encrypt data in place
-@ r0: ivec
-@ r1: buf
-@ r2: n, number of blocks, n>0
-.if CT_BPERM
-@ In AES-CTR each block can be independently en/decrypted as the encryption only depends on
-@ the IV, the key, and the block number. We can therefore process them in any order. Hence
-@ we generate all the residues mod u=2^k such that u≥n in a pseudo-random order using a linear conguential
-@ generator (x_i+1 = a x_i + c mod u), and process the blocks in that order. We choose
-@ x_0 and a randomly (subject to a=5 mod 8), as well as adding an overall random offset
-@ to the sequence, which is equivalent to choosing a random c.
-@
-@ For residues greater than or equal to n we "decrypt" an area of scratch
-@ memory, taking the same time as a real decryption.  The inefficiency
-@ due to rounding up the number of blocks processed to the next power of
-@ two is a factor of 2 in the worst case.
-@ q.v. https://en.wikipedia.org/wiki/Linear_congruential_generator#m_a_power_of_2,_c_%E2%89%A0_0
-.endif
-ctr_crypt_s:
- GET_CANARY r3,CTAG0
- SET_COUNT 171
+ CHK_COUNT 81,6
+
+ pop {r1-r3}
+@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter
+
+@ Now calculate r12 = block number-to-be-deciphered from r3 = block counter
 .if CT_BPERM
- push {r0,r1,r3,r4-r11,r14}
- mvn r4,#0
- subs r5,r2,#1               @ make sure we generate optimal mask for n an exact power of 2
- clz r5,r5
- lsrs r4,r4,r5               @ mask m=2^k-1 s.t. m≥n
- orrs r4,r4,#7               @ m≥7
- bl gen_rand
- bic r5,r0,#7
- adds r5,r5,#5               @ multiplier a, randomly initialised, but make sure it is 5 mod 8
- bl gen_rand
- mov r7,r0                   @ initial block pointer x₀, randomly initialised
- bl gen_rand
- mov r8,r0                   @ sequence offset, randomly initialised: this is equivalent to choosing a random c
- mov r6,r4
+@ Use a "swap-or-not" method to generate an "oblivious" permutation; see makeperm.py version 7
+ push {r1}
+ ldr r0,=murmur3_constants
+ ldmia r0,{r9-r12,r14}       @ load five murmur3_32 hash constants
+ ldr r0,=bperm_rand
+ movs r1,#31
+ movs r4,r3                  @ r4=i
+1:
+ ldrh r5,[r0],#2             @ r5=k
+ subs r5,r5,r4               @ r5=k-i
+ ands r6,r2,r5,asr#31        @ r6=n*(k-i<0)
+ adds r5,r5,r6               @ r5=j=(k-i)%n
+ adds r6,r4,r5               @ r6=i+j
+ subs r7,r4,r5               @ r7=i-j
+ and  r8,r7,r7,asr#31        @ r8=min(i-j,0)
+ sub  r7,r7,r8,lsl#1         @ r7=|i-j|
+ mla  r6,r6,r2,r7            @ r6=n(i+j)+|i-j|, encodes the unordered pair {i,j}
+ eors r6,r6,r1,lsl#27        @ mix with swap-or-not round counter to get different hash functions
+@ Now do murmur3_32 hash of r6
+ mul  r6,r6,r9
+ movs r6,r6,ror#17
+ mul  r6,r6,r10
+ movs r6,r6,ror#19
+ adds r6,r6,r6,lsl#2
+ add  r6,r6,r11
+ eors r6,r6,#4
+ eors r6,r6,r6,lsr#16
+ mul  r6,r6,r12
+ eors r6,r6,r6,lsr#13
+ mul  r6,r6,r14
+ eors r6,r6,r6,lsr#16        @ not actually used here
+@ Now set i to j, conditional on the top bit of r6
+ subs r7,r5,r4               @ r7=j-i
+ ands r7,r7,r6,asr#31        @ r7=(j-i)*(top bit of r6)
+ adds r4,r4,r7               @ r4=j if top bit of r6, else i
+ subs r1,r1,#1
+ bpl 1b
+ pop {r1}
+ mov r12,r4
+.else
+ mov r12,r3
+.endif
+ CHK_COUNT 82,6
+
+@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter (monotonic), r12=block number (block to be deciphered)
+ push {r1-r3,r12}
+@ r4-r11 = IV0, r12=block number
+
+processIV:                   @ non-target label to assist power analysis
+ ldr r8,=IV0
+ ldmia r8,{r4-r7}            @ load IV0_A
+ clear03 16
+ add r8,r8,#20
+ ldmia r8,{r8-r11}           @ load IV0_B
+ clear03 32
+ rev r0,r12
+ eor r7,r7,r0                @ XOR in block number to IV0. IV(block n) = IV0 ^ n, cf standard CTR mode IV0 + n.
+                             @ XOR (vs addition) is compatible with XOR-shares, so stealthier/simpler because don't have to unshare to work out IV(block n)
+@ r4-r11 = IV for the current block
+ CHK_COUNT 83,6
+.if ST_SHAREC
+ bl gen_rand_sha_nonpres     @ Create state share C; all bytes the same
+ ands r0,r0,#255
+ orrs r0,r0,r0,lsl#8
+ orrs r12,r0,r0,lsl#16
+ ldr r1,=shareC
+ str r12,[r1]
 .else
- push {r0,r3,r4-r11,r14}
  movs r12,#0
 .endif
- CHK_COUNT 171
+@ r4-r11 = IV for the current block w/o shareC, r12=shareC
+@ refresh state shares and mix in shareC
+ bl gen_rand_sha_nonpres; eors r4,r4,r0; eor r4,r4,r12; movs r1,#0; eor r8, r8, r0,ror#16   @ Barriers between shares to prevent implicit r4^r8 etc
+ bl gen_rand_sha_nonpres; eors r5,r5,r0; eor r5,r5,r12; movs r1,#0; eor r9, r9, r0,ror#16
+ bl gen_rand_sha_nonpres; eors r6,r6,r0; eor r6,r6,r12; movs r1,#0; eor r10,r10,r0,ror#16
+ bl gen_rand_sha_nonpres; eors r7,r7,r0; eor r7,r7,r12; movs r1,#0; eor r11,r11,r0,ror#16
+.if ST_VPERM
+ bl gen_rand_sha_nonpres
+ ldr r1,=statevperm
+ movs r2,#0
+ str r2,[r1]
+ bl addstatevperm            @ Initialise state vperm (use SHA RNG to start with, later refreshes are with LFSR RNG)
+.endif
+
+ CHK_COUNT 84,6
+ bl conjshareC               @ Add the effect of shareC to lut_a, lut_b
+ CHK_COUNT 85,6
+@ now perform the 15 encryption rounds on (key, state=IV+x)
+@ here r4-r7, r8-r11: state
+ mov r2,#0                   @ round counter
+rounds_s_mainloop:
+ ldr r12,=rkey_s
+ add r12,r12,r2,lsl#5        @ pointer to key shares for this round
+ add r12,r12,r2,lsl#3
+ push {r2}                   @ save round count
+ bl addrkey_s
+ bl map_sbox_s
+ bl shift_rows_s
+.if ST_VPERM
+ ldr r2,[r13]                @ peek at stack to get round count
+ cmp r2,#NUMREFSTATEVPERM
+ bcs 1f
+ bl gen_rand_lfsr_nonpres
+ ldr r1,=statevperm
+ bl addstatevperm            @ V shuffle of r4-r11
 1:
- SET_COUNT 129
-.if CT_BPERM
- add r12,r7,r8               @ add sequence offset
- and r12,r12,r4              @ get block pointer mod 2^k
- cmp r12,r2                  @ set C if beyond end of buffer
- sbcs r3,r3,r3               @ r3==0xffffffff in buffer, 0x00000000 past end
- uadd8 r3,r3,r3              @ set/clear all GE flags if in buffer/past end
- ldr r1,[r13,#4]             @ get buffer address from stack
- add r1,r1,r12,lsl#4         @ calculate address of block
- ldr r3,=ctr_scratch
- sel r1,r1,r3                @ if beyond end of buffer, just process scratch area
- ldr r0,[r13]                @ get IV address from stack
- push {r4-r8,r12}
-.else
- ldr r0,[r13]                @ get IV address from stack
- push {r12}
 .endif
- CHK_COUNT 129
-@ It is not clear if the following addition of the block number in r12 to the IV can usefully
-@ be done in terms of shares. Instead we do an addition and subtraction whose overall effect
-@ is the same, and which provides a small degree of masking. The IV is not a secret anyway.
- ldmia r0,{r4-r7}            @ load IV
- rev r7,r7                   @ prepare for byte-big-endian, bit-little-endian (!) addition
- rev r6,r6
- rev r5,r5
- rev r4,r4
- bl gen_rand
- bic r8,r0,#0x80000000       @ only 31 bits so we don't get any overflows in the following
- add r9,r8,r12               @ "masked" block number
- adds r7,r7,r9               @ 128-bit addition
- adcs r6,r6,#0
- adcs r5,r5,#0
- adcs r4,r4,#0
- subs r7,r7,r8               @ 128-bit subtraction, unmasking block number
- sbcs r6,r6,r8,asr#31
- sbcs r5,r5,r8,asr#31
- sbcs r4,r4,r8,asr#31
- rev r7,r7
- rev r6,r6
- rev r5,r5
- rev r4,r4
- CHK_COUNT 130
- bl remap                    @ shuffle the LUts
- CHK_COUNT 131
- bl ref_round_keys_s         @ refresh the round keys
- CHK_COUNT 132
- bl ns_to_s                  @ convert IV+x to shares
- CHK_COUNT 133
- bl rounds_s                 @ forward AES rounds on IV+x
- CHK_COUNT 134
- ldr r3,[r1]                 @ decrypt ciphertext
- eors r3,r3,r4
- eors r3,r3,r8
- str r3,[r1]
- ldr r3,[r1,#4]
+ pop {r2}
+ adds r2,r2,#1               @ increment round counter
+ cmp r2,#14
+ beq 2f                      @ break from loop? (last round has no mix_cols)
+ push {r2}
+ bl mix_cols_s
+ pop {r2}
+ b rounds_s_mainloop
+2:
+ CHK_COUNT 86,6
+ ldr r12,=rkey_s+14*40       @ final round key shares
+ bl addrkey_s
+ CHK_COUNT 87,6
+ bl conjshareC               @ Undo the effect of shareC from lut_a, lut_b
+ CHK_COUNT 88,6
+.if ST_VPERM
+@ Undo the effects of vperm rotation recorded in statevperm
+ ldr r1,=statevperm
+ ldr r2,[r1]
+ rsbs r0,r2,#0
+ bl addstatevperm
+.endif
+
+ pop {r1-r3,r12}
+ push {r3}
+@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter, r12=block to be deciphered
+
+decryption_start:
+@ Decrypt ciphertext using AES output in shares: r4-r11
+.if ST_SHAREC
+ ldr r0,=shareC
+ ldr r0,[r0]
+.else
+ movs r0,#0
+.endif
+ ldr r14,=chaff
+@ r0=shareC, r1=cipher/plaintext buffer, r2=number of blocks, r3=free, r4-r11=stateA/B, r12=block to be deciphered, r14=chaff
+ CHK_COUNT 89,6
+ add r1,r1,r12,lsl#4         @ Temporarily r1 points to block-to-be-deciphered
+ ldr r3,[r1]                 @ r3=ciphertext word
+ eors r3,r3,r4               @ r3=r3^shareA
+ ldr r4,[r14]                @ barrier load
+ eor r3,r3,r8,ror#16         @ r3=r3^shareB
+ eors r3,r3,r0               @ r3=r3^shareC
+ str r3,[r1]                 @ plaintext word=r3
+ ldr r3,[r1,#4]              @ and similarly for words 1,2,3 of block...
+ ldr r4,[r14,#4]
  eors r3,r3,r5
- eors r3,r3,r9
+ eor r3,r3,r9,ror#16
+ eors r3,r3,r0
  str r3,[r1,#4]
  ldr r3,[r1,#8]
+ ldr r4,[r14,#8]
  eors r3,r3,r6
- eors r3,r3,r10
+ eor r3,r3,r10,ror#16
+ eors r3,r3,r0
  str r3,[r1,#8]
  ldr r3,[r1,#12]
+ ldr r4,[r14,#12]
  eors r3,r3,r7
- eors r3,r3,r11
+ eor r3,r3,r11,ror#16
+ eors r3,r3,r0
  str r3,[r1,#12]
- CHK_COUNT 135
-.if CT_BPERM
- pop {r4-r8,r12}
- muls r7,r7,r5               @ LCG step: x<-ax+1
- adds r7,r7,#1
- subs r6,r6,#1
- CHK_COUNT 136
- bcs 1b
- pop {r0,r1,r3,r4-r11,r14}
-.else
- pop {r12}
- adds r1,r1,#16
- add r12,r12,#1
- cmp r12,r2
- CHK_COUNT 136
- bne 1b
- pop {r0,r3,r4-r11,r14}
-.endif
- CHK_COUNT 137
- CHK_CANARY r3,CTAG0
- bx r14
-.endif
 
-.ltorg
+ sub r1,r1,r12,lsl#4         @ Restore r1 to point to start of buffer
+ CHK_COUNT 90,6
 
-.thumb_func
-aes_end:
- nop
-
-@@@@@@@@@@@@@@@@@@@@@@@@@ test functions @@@@@@@@@@@@@@@@@@@@@@@@@
-
-@ .global test_v
-
-@ .section .text.test_v,"ax",%progbits
-@ .macro fn
-@  ldr.n r0,=0x12345678
-@  ldr.n r0,=0xedcba987
-@ .endm
-@ .macro tenfn
-@  fn
-@  fn
-@  fn
-@  fn
-@  fn
-@  fn
-@  fn
-@  fn
-@  fn
-@  fn
-@ .endm
-@ .macro hundredfn
-@  tenfn
-@  tenfn
-@  tenfn
-@  tenfn
-@  tenfn
-@  tenfn
-@  tenfn
-@  tenfn
-@  tenfn
-@  tenfn
-@ .endm
-@
-@ .thumb_func
-@ test_v:
-@ .balign 4
-@ 1:
-@  hundredfn
-@  b 1b
-@  bx r14
-@ .ltorg
-
-@ switch from shared to non-shared state
-@ s_to_ns:
-@  eor r4,r4,r8
-@  eor r5,r5,r9
-@  eor r6,r6,r10
-@  eor r7,r7,r11
-@  bx r14
-
-.section .text.debugging,"ax",%progbits
+ pop {r3}                    @ Restore block counter
+@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter
+decryption_end:
 
-.thumb_func
-delay:
-.if CHIPW
- subs r0,r0,#3     @ we are clocked approximately three times slower
-.else
- subs r0,r0,#1
-.endif
- bcs delay
- bx r14
-
-.thumb_func
-flush_reg:
-@ put known values into r0-r3,r12
- mov r0, #0x80808080
- mov r1, #0x81818181
- mov r2, #0x82828282
- mov r3, #0x83838383
- mov r12,#0x8c8c8c8c
- bx r14
+ adds r3,r3,#1
+ cmp r3,r2
+ CHK_COUNT 91,6
+ bne ctr_crypt_mainloop
 
-.thumb_func
-isr_systick:
- mov.w r2,#0xd0000000 @ set GPIO24
- mov.w r3,#0x01000000
- str r3,[r2,#24]
- ldr r0,=systick_data
+#if WIPE_MEMORY
+@ Wipe memory from workspace_start up to the stack pointer
+@ First fill everything (except the RNG state itself) with random numbers to avoid any possibly useful power signals
+ ldr r4,=workspace_start
+ ldr r5,=rstate_all_start
+1:
+ bl gen_rand_sha_nonpres
+ stmia r4!,{r0}
+ cmp r4,r5
+ bcc 1b
+ ldr r4,=rstate_all_end
+ mov r5,r13                  @ gcc arm assembler says cmp r4,r13 is deprecated, so use another register
+1:
+ bl gen_rand_sha_nonpres
+ stmia r4!,{r0}
+ cmp r4,r5
+ bcc 1b
+
+@ Then fill everything with zeros so as not to leave behind clues about the RNG state
+ ldr r4,=workspace_start
+ movs r0,#0
+ mov r5,r13
+1:
+ stmia r4!,{r0}
+ cmp r4,r5
+ bcc 1b
+#endif
 
- ldr r1,[r0]
- adds r1,r1,#1
- stmia r0!,{r1}
- ldr r1,[r13,#0] @ r0..r2
- ldr r2,[r13,#4]
- ldr r3,[r13,#8]
- stmia r0!,{r1-r3}
- ldr r1,[r13,#12] @ r3
- stmia r0!,{r1,r4-r11}
- ldr r1,[r13,#16] @ r12
- ldr r3,[r13,#28] @ RETPSR
- ubfx r2,r3,#9,#1 @ SPREALIGN
- add r2,r13,r2,lsl#2 @ add 4 to SP if SPREALIGN set in RETPSR
- add r2,r2,#0x68 @ r13
- stmia r0!,{r1-r2}
-
- ldr r1,[r13,#20] @ r14
- ldr r2,[r13,#24] @ ReturnAddress
-@ RETPSR still in r3
- stmia r0!,{r1-r3}
-
- ldr r0,=0xe000e010
- mov r1,#5
- str r1,[r0] @ write to CSR
- mov.w r2,#0xd0000000
- mov.w r3,#0x01000000
- str r3,[r2,#32] @ clear GPIO24
- bx r14
\ No newline at end of file
+.if GEN_RAND_SHA
+ SET_COUNT 23,6
+ bl reset_sha_trng           @ clear out the SHA hardware
+.endif
+ pop {r0-r12,r14}
+ CHK_CANARY r12,CTAG0,6
+ bx r14
diff --git a/bootloaders/encrypted/config.h b/bootloaders/encrypted/config.h
index 0a39cedf4..2c4ce0d03 100644
--- a/bootloaders/encrypted/config.h
+++ b/bootloaders/encrypted/config.h
@@ -1,127 +1,90 @@
 #pragma once
 
-#ifndef CM_PROFILE
-#define CM_PROFILE 0
-#endif
-
-#define DEBUG                0         // for use in debugging with serial output (timing not repeatable)
-#define CHIPW                0         // change clock to 48MHz for use with CW hardware
-#define SYSTICK_IMAP         0         // use SYSTICK to get a map of instruction execution (set DEBUG to 0 to get useful timings)
-#define INCLUDE_ENCRYPT_CBC  0         // include code to perform encryption in CBC mode?
-#define INCLUDE_DECRYPT_CBC  0         // include code to perform decryption in CBC mode?
-#define INCLUDE_CRYPT_CTR    1         // include code to perform de/encryption in CTR mode?
-#define ROUND_TRIP_TEST      0         // do the glitch detection test in CBC mode where we re-encrypt each block and compare against original ciphertext?
-#define SBOX_VIA_INV         1         // compute (inverse) S-box values via a table of field inverses rather than via a direct table?
-#define GEN_RAND_SHA         0         // use SHA256 hardware to generate random numbers (disable for Qemu testing)
-
-#if ROUND_TRIP_TEST && !SBOX_VIA_INV
-#error Sorry, if you want to do the round-trip test then SBOX_VIA_INV must also be set
-#endif
-
-#if CM_PROFILE==0
-
-#define RANDOMIZE            0         // new random seed on each reset?
-#define RC_CANARY            0         // use rcp_canary feature
-#define RC_JITTER            0         // use random-delay versions of RCP instructions
-#define RC_COUNT             0         // use rcp_count feature
-#define IK_SHUFREAD          0         // read key bytes in random order?
-#define IK_JUNK              0         // add some random distraction in init_key?
-#define IK_PERM              0         // permute bytes (and possibly distraction bytes) in round key generation?
-#define IK_REMAP             0         // remap S-box in round key generation?
-#define IK_JITTER            0         // jitter timing in init_key?
-#define RK_ROR               0         // store round keys with random RORs?
-#define ST_HPERM             0         // insert random horizontal permutations in state during de/encryption?
-#define ST_VPERM             0         // insert random vertical permutations in state during de/encryption?
-#define CT_BPERM             0         // process blocks in a random order in counter mode?
-
-#elif CM_PROFILE==1
-
-#define RANDOMIZE            1         // new random seed on each reset?
-#define RC_CANARY            0         // use rcp_canary feature
-#define RC_JITTER            0         // use random-delay versions of RCP instructions
-#define RC_COUNT             0         // use rcp_count feature
-#define IK_SHUFREAD          1         // read key bytes in random order?
-#define IK_JUNK              1         // add some random distraction in init_key?
-#define IK_PERM              1         // permute bytes (and possibly distraction bytes) in round key generation?
-#define IK_REMAP             1         // remap S-box in round key generation?
-#define IK_JITTER            0         // jitter timing in init_key?
-#define RK_ROR               1         // store round keys with random RORs?
-#define ST_HPERM             0         // insert random horizontal permutations in state during de/encryption?
-#define ST_VPERM             0         // insert random vertical permutations in state during de/encryption?
-#define CT_BPERM             0         // process blocks in a random order in counter mode?
-
-#elif CM_PROFILE==2
-
-#define RANDOMIZE            1         // new random seed on each reset?
-#define RC_CANARY            0         // use rcp_canary feature
-#define RC_JITTER            0         // use random-delay versions of RCP instructions
-#define RC_COUNT             0         // use rcp_count feature
-#define IK_SHUFREAD          0         // read key bytes in random order?
-#define IK_JUNK              0         // add some random distraction in init_key?
-#define IK_PERM              0         // permute bytes (and possibly distraction bytes) in round key generation?
-#define IK_REMAP             0         // remap S-box in round key generation?
-#define IK_JITTER            0         // jitter timing in init_key?
-#define RK_ROR               0         // store round keys with random RORs?
-#define ST_HPERM             1         // insert random horizontal permutations in state during de/encryption?
+// These options (up to long /////////////// line) should be enabled because the security risk of not using them is too high
+// or because the time cost is very low so you may as well have them.
+// They can be set to 0 for analysis or testing purposes.
+
+#ifndef GEN_RAND_SHA
+#define GEN_RAND_SHA         1         // use SHA256 hardware to generate some random numbers
+#endif
+                                       // Some RNG calls are hard coded to LFSR RNG, others to SHA RNG
+                                       // Setting GEN_RAND_SHA to 0 has the effect of redirecting the latter to LFSR RNG
+#ifndef ST_SHAREC
+#define ST_SHAREC            1         // This creates a partial extra share at almost no extra cost
+#endif
+#ifndef ST_VPERM
 #define ST_VPERM             1         // insert random vertical permutations in state during de/encryption?
-#define CT_BPERM             0         // process blocks in a random order in counter mode?
-
-#elif CM_PROFILE==3
-
-#define RANDOMIZE            1         // new random seed on each reset?
-#define RC_CANARY            0         // use rcp_canary feature
-#define RC_JITTER            0         // use random-delay versions of RCP instructions
-#define RC_COUNT             0         // use rcp_count feature
-#define IK_SHUFREAD          0         // read key bytes in random order?
-#define IK_JUNK              0         // add some random distraction in init_key?
-#define IK_PERM              0         // permute bytes (and possibly distraction bytes) in round key generation?
-#define IK_REMAP             0         // remap S-box in round key generation?
-#define IK_JITTER            0         // jitter timing in init_key?
-#define RK_ROR               0         // store round keys with random RORs?
-#define ST_HPERM             0         // insert random horizontal permutations in state during de/encryption?
-#define ST_VPERM             0         // insert random vertical permutations in state during de/encryption?
+#endif
+#ifndef CT_BPERM
 #define CT_BPERM             1         // process blocks in a random order in counter mode?
+#endif
+#ifndef RK_ROR
+#define RK_ROR               1         // store round key shares with random rotations within each word
+#endif
+
+#ifndef WIPE_MEMORY
+#define WIPE_MEMORY          1         // Wipe memory after decryption
+#endif
+
+// The following options should be enabled to increase resistance to glitching attacks.
 
-#elif CM_PROFILE==4
-
-#define RANDOMIZE            1         // new random seed on each reset?
-#define RC_CANARY            0         // use rcp_canary feature
-#define RC_JITTER            0         // use random-delay versions of RCP instructions
-#define RC_COUNT             0         // use rcp_count feature
-#define IK_SHUFREAD          0         // read key bytes in random order?
-#define IK_JUNK              0         // add some random distraction in init_key?
-#define IK_PERM              0         // permute bytes (and possibly distraction bytes) in round key generation?
-#define IK_REMAP             0         // remap S-box in round key generation?
-#define IK_JITTER            1         // jitter timing in init_key?
-#define RK_ROR               0         // store round keys with random RORs?
-#define ST_HPERM             0         // insert random horizontal permutations in state during de/encryption?
-#define ST_VPERM             0         // insert random vertical permutations in state during de/encryption?
-#define CT_BPERM             0         // process blocks in a random order in counter mode?
-
-#elif CM_PROFILE==5
-
-#define RANDOMIZE            1         // new random seed on each reset?
+#ifndef RC_CANARY
 #define RC_CANARY            1         // use rcp_canary feature
-#define RC_JITTER            1         // use random-delay versions of RCP instructions
+#endif
+#ifndef RC_COUNT
 #define RC_COUNT             1         // use rcp_count feature
-#define IK_SHUFREAD          1         // read key bytes in random order?
-#define IK_JUNK              1         // add some random distraction in init_key?
-#define IK_PERM              1         // permute bytes (and possibly distraction bytes) in round key generation?
-#define IK_REMAP             1         // remap S-box in round key generation?
-#define IK_JITTER            1         // jitter timing in init_key?
-#define RK_ROR               1         // store round keys with random RORs?
-#define ST_HPERM             1         // insert random horizontal permutations in state during de/encryption?
-#define ST_VPERM             1         // insert random vertical permutations in state during de/encryption?
-#define CT_BPERM             1         // process blocks in a random order in counter mode?
+#endif
+
+// Although jitter/timing-variation may be circumventable in theory, in practice
+// randomising the timing of operations can make side-channel attacks very much more
+// effort to carry out. These can be disabled for analysis or testing purposes.
+// It is advisable to use at least one form of jitter.
 
+// RC_JITTER is quite slow, and is probably the most predictable of the three, so it is disabled by default.
+// (Leaving it as an option because it's just possible that the large delays it produces are advantageous in defeating certain side-channel attacks.)
+#ifndef RC_JITTER
+#define RC_JITTER            0         // 0-7. Higher = more jitter. Governs use of random-delay versions of RCP instructions.
 #endif
 
-#if RC_COUNT && (INCLUDE_ENCRYPT_CBC || INCLUDE_DECRYPT_CBC)
-#error Sorry, RC_COUNT is only tested in CTR mode
+#ifndef SH_JITTER
+#define SH_JITTER            1         // Insert random delays, tagged onto SHA RNG
 #endif
 
-// derived values
-#define NEED_ROUNDS          (INCLUDE_ENCRYPT_CBC || (INCLUDE_DECRYPT_CBC && ROUND_TRIP_TEST) || INCLUDE_CRYPT_CTR)
-#define NEED_INV_ROUNDS      (INCLUDE_DECRYPT_CBC)
-#define NEED_HPERM           (IK_PERM || ST_HPERM)
-#define NEED_VPERM           (IK_PERM || ST_VPERM)
\ No newline at end of file
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// The following options can be adjusted, affecting the performance/security tradeoff
+
+// Period = X means that the operation in question occurs every X blocks, so higher = more performance and lower security.
+// No point in making them more than 16 or so, since the time taken by the subroutines would be negligible.
+// These must be a power of 2. Timings as of commit 82d31652
+// 
+//                                        Baseline time per 16-byte block = 14109 (with no jitter)         cycles
+#ifndef REFCHAFF_PERIOD
+#define REFCHAFF_PERIOD             1     // Extra cost per 16-byte block =   474/REFCHAFF_PERIOD          cycles
+#endif
+#ifndef REMAP_PERIOD
+#define REMAP_PERIOD                4     // Extra cost per 16-byte block =  4148/REMAP_PERIOD             cycles
+#endif
+#ifndef REFROUNDKEYSHARES_PERIOD
+#define REFROUNDKEYSHARES_PERIOD    1     // Extra cost per 16-byte block =  1304/REFROUNDKEYSHARES_PERIOD cycles
+#endif
+#ifndef REFROUNDKEYHVPERMS_PERIOD
+#define REFROUNDKEYHVPERMS_PERIOD   1     // Extra cost per 16-byte block =  1486/REFROUNDKEYHVPERMS_PERIOD cycles
+#endif
+
+// Setting NUMREFSTATEVPERM to X means that state vperm refreshing happens on the first X AES rounds only,
+// so lower = more performance and lower security.
+// The rationale for doing it this way is that later rounds should be protected by CT_BPERM.
+// NUMREFSTATEVPERM can be from 0 to 14.
+#ifndef NUMREFSTATEVPERM
+#define NUMREFSTATEVPERM            7     // Extra cost per 16-byte block =  61*NUMREFSTATEVPERM cycles
+#endif
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#define MAX_NUM_BLOCKS 32768
+
+#if SH_JITTER && !GEN_RAND_SHA
+#error GEN_RAND_SHA must be set if you want to use SH_JITTER
+#endif
diff --git a/bootloaders/encrypted/enc-pt.json b/bootloaders/encrypted/enc-pt.json
index 9b7a86d3b..e9a12b7dd 100644
--- a/bootloaders/encrypted/enc-pt.json
+++ b/bootloaders/encrypted/enc-pt.json
@@ -12,8 +12,8 @@
     {
       "name": "A",
       "id": 0,
-      "start": "64K",
-      "size": "448K",
+      "start": "40K",
+      "size": "480K",
       "families": ["rp2350-arm-s"],
       "permissions": {
         "secure": "rw",
@@ -24,7 +24,7 @@
     {
       "name": "B",
       "id": 1,
-      "size": "448K",
+      "size": "480K",
       "families": ["rp2350-arm-s"],
       "permissions": {
         "secure": "rw",
@@ -34,4 +34,4 @@
       "link": ["a", 0]
     }
   ]
-}
\ No newline at end of file
+}
diff --git a/bootloaders/encrypted/enc_bootloader.c b/bootloaders/encrypted/enc_bootloader.c
index 1df509101..54e89d2e5 100644
--- a/bootloaders/encrypted/enc_bootloader.c
+++ b/bootloaders/encrypted/enc_bootloader.c
@@ -16,53 +16,22 @@
 
 #include "config.h"
 
-extern void flush_reg();
-volatile uint32_t systick_data[18]; // count, R0-R15,RETPSR
-
-extern void remap();
-extern uint32_t gen_rand();
-extern void init_key(uint8_t *rk_s, uint8_t *key);
-extern void gen_lut_inverse();
-extern void gen_lut_sbox();
-extern void gen_lut_inv_sbox();
-extern int  ctr_crypt_s(uint8_t*iv,uint8_t*buf,int nblk);
-
-extern uint8_t rkey_s[480];
-extern uint8_t lut_a[256];
-extern uint8_t lut_b[256];
-extern uint32_t lut_a_map;
-extern uint32_t lut_b_map;
-extern uint32_t rstate[4];
-
-static void init_lut_map() {
-    int i;
-    for(i=0;i<256;i++) lut_b[i]=gen_rand()&0xff, lut_a[i]^=lut_b[i];
-    lut_a_map=0;
-    lut_b_map=0;
-    remap();
+#define OTP_KEY_PAGE 30
+
+extern void decrypt(uint8_t* key4way, uint8_t* IV_OTPsalt, uint8_t* IV_public, uint8_t(*buf)[16], int nblk);
+
+// The function lock_key() is called from decrypt() after key initialisation is complete and before decryption begins.
+// That is a suitable point to lock the OTP area where key information is stored.
+void lock_key() {
+    otp_hw->sw_lock[OTP_KEY_PAGE] = 0xf;
 }
 
+
 static __attribute__((aligned(4))) uint8_t workarea[4 * 1024];
 
 int main() {
     stdio_init_all();
 
-    #if RANDOMIZE
-        get_rand_128((rng_128_t*)rstate);   // fill rstate with 128 bits of random data
-    #else
-        rstate[0]=1223352428;
-        rstate[1]=1223352428;
-        rstate[2]=0x41414141;
-        rstate[3]=0x41414141;
-    #endif
-
-    // reset the RNG
-    reset_block(RESETS_RESET_SHA256_BITS);
-    unreset_block(RESETS_RESET_SHA256_BITS);
-    rstate[0]&=0xffffff00;    // bottom byte must be zero
-
-    printf("Rstate at address %x\n", rstate);
-
     printf("Entered bootloader code\n");
     int rc;
     rc = rom_load_partition_table(workarea, sizeof(workarea), false);
@@ -74,15 +43,13 @@ int main() {
     boot_info_t info;
     printf("Getting boot info\n");
     rc = rom_get_boot_info(&info);
-    uint32_t flash_update_base = 0;
     printf("Boot Type %x\n", info.boot_type);
 
     if (info.boot_type == BOOT_TYPE_FLASH_UPDATE) {
-        flash_update_base = info.reboot_params[0];
-        printf("Flash Update Base %x\n", flash_update_base);
+        printf("Flash Update Base %x\n", info.reboot_params[0]);
     }
 
-    rc = rom_pick_ab_partition(workarea, sizeof(workarea), 0, flash_update_base);
+    rc = rom_pick_ab_update_partition((uint32_t*)workarea, sizeof(workarea), 0);
     if (rc < 0) {
         printf("Partition Table A/B choice failed %d - resetting\n", rc);
         reset_usb_boot(0, 0);
@@ -92,31 +59,35 @@ int main() {
 
     rc = rom_get_partition_table_info((uint32_t*)workarea, 0x8, PT_INFO_PARTITION_LOCATION_AND_FLAGS | PT_INFO_SINGLE_PARTITION | (boot_partition << 24));
 
-    uint32_t data_start_addr;
-    uint32_t data_end_addr;
+    uint32_t data_start_addr = 0;
+    uint32_t data_end_addr = 0;
+    uint32_t data_max_size = 0;
     if (rc != 3) {
         printf("No boot partition - assuming bin at start of flash\n");
         data_start_addr = 0;
         data_end_addr = 0x70000; // must fit into 0x20000000 -> 0x20070000
+        data_max_size = data_end_addr - data_start_addr;
     } else {
         uint16_t first_sector_number = (((uint32_t*)workarea)[1] & PICOBIN_PARTITION_LOCATION_FIRST_SECTOR_BITS) >> PICOBIN_PARTITION_LOCATION_FIRST_SECTOR_LSB;
         uint16_t last_sector_number = (((uint32_t*)workarea)[1] & PICOBIN_PARTITION_LOCATION_LAST_SECTOR_BITS) >> PICOBIN_PARTITION_LOCATION_LAST_SECTOR_LSB;
         data_start_addr = first_sector_number * 0x1000;
         data_end_addr = (last_sector_number + 1) * 0x1000;
+        data_max_size = data_end_addr - data_start_addr;
 
-        printf("Partition Start %x, End %x\n", data_start_addr, data_end_addr);
+        printf("Partition Start %x, End %x, Max Size %x\n", data_start_addr, data_end_addr, data_max_size);
     }
 
     printf("Decrypting the chosen image\n");
     uint32_t first_mb_start = 0;
+    bool first_mb_start_found = false;
     uint32_t first_mb_end = 0;
     uint32_t last_mb_start = 0;
-    for (uint16_t i=0; i <= 0x1000; i += 4) {
+    for (uint16_t i=0; i < 0x1000; i += 4) {
         if (*(uint32_t*)(XIP_BASE + data_start_addr + i) == 0xffffded3) {
             printf("Found first block start\n");
             first_mb_start = i;
-        }
-        if (*(uint32_t*)(XIP_BASE + data_start_addr + i) == 0xab123579) {
+            first_mb_start_found = true;
+        } else if (first_mb_start_found && (*(uint32_t*)(XIP_BASE + data_start_addr + i) == 0xab123579)) {
             printf("Found first block end\n");
             first_mb_end = i + 4;
             last_mb_start = *(uint32_t*)(XIP_BASE + data_start_addr + i-4) + first_mb_start;
@@ -124,6 +95,12 @@ int main() {
         }
     }
 
+    if (last_mb_start > data_max_size) {
+        // todo - harden this check
+        printf("ERROR: Encrypted binary is too big for its partition - resetting\n");
+        reset_usb_boot(0, 0);
+    }
+
     if (*(uint32_t*)(XIP_BASE + data_start_addr + last_mb_start) == 0xffffded3) {
         printf("Found last block start where expected\n");
     } else {
@@ -171,20 +148,17 @@ int main() {
     for (int i=0; i < 4; i++)
         printf("%08x\n", *(uint32_t*)(SRAM_BASE + i*4));
 
-    flush_reg();
-    #if !SBOX_VIA_INV
-        gen_lut_sbox();
-    #else
-        gen_lut_inverse();
-    #endif
-    init_lut_map();
     // Read key directly from OTP - guarded reads will throw a bus fault if there are any errors
     uint16_t* otp_data = (uint16_t*)OTP_DATA_GUARDED_BASE;
-    init_key(rkey_s, (uint8_t*)&(otp_data[(OTP_CMD_ROW_BITS & 0x780)]));
-    otp_hw->sw_lock[30] = 0xf;
-    flush_reg();
-    ctr_crypt_s(iv, (void*)SRAM_BASE, data_size/16);
-    flush_reg();
+
+    decrypt(
+        (uint8_t*)&(otp_data[OTP_KEY_PAGE * 0x40]),
+        (uint8_t*)&(otp_data[(OTP_KEY_PAGE + 1) * 0x40]),
+        iv, (void*)SRAM_BASE, data_size/16
+    );
+
+    // Lock the IV salt
+    otp_hw->sw_lock[OTP_KEY_PAGE + 1] = 0xf;
 
     printf("Post decryption image begins with\n");
     for (int i=0; i < 4; i++)
diff --git a/bootloaders/encrypted/ivsalt.bin b/bootloaders/encrypted/ivsalt.bin
new file mode 100644
index 000000000..fb9ef50b8
--- /dev/null
+++ b/bootloaders/encrypted/ivsalt.bin
@@ -0,0 +1 @@
+���x��%�^��=T�Č
\ No newline at end of file
diff --git a/bootloaders/encrypted/otp.json b/bootloaders/encrypted/otp.json
deleted file mode 100644
index f86a9e019..000000000
--- a/bootloaders/encrypted/otp.json
+++ /dev/null
@@ -1,46 +0,0 @@
-{
-  "30:0" : 
-  {
-    "ecc" : true,
-    "value" : 
-    [
-      "0x00",
-      "0x01",
-      "0x02",
-      "0x03",
-      "0x04",
-      "0x05",
-      "0x06",
-      "0x07",
-      "0x08",
-      "0x09",
-      "0x0a",
-      "0x0b",
-      "0x0c",
-      "0x0d",
-      "0x0e",
-      "0x0f",
-      "0x00",
-      "0x10",
-      "0x20",
-      "0x30",
-      "0x40",
-      "0x50",
-      "0x60",
-      "0x70",
-      "0x80",
-      "0x90",
-      "0xa0",
-      "0xb0",
-      "0xc0",
-      "0xd0",
-      "0xe0",
-      "0xf0"
-    ]
-  },
-  "OTP_DATA_KEY1" : [ 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 ],
-  "OTP_DATA_KEY1_VALID" : "0x010101",
-  "OTP_DATA_KEY2" : [ 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0 ],
-  "OTP_DATA_KEY2_VALID" : "0x010101",
-  "PAGE30_LOCK0" : "0x4a4a4a"
-}
\ No newline at end of file
diff --git a/bootloaders/encrypted/privateaes.bin b/bootloaders/encrypted/privateaes.bin
index 0122f8a2c..21a47756d 100644
Binary files a/bootloaders/encrypted/privateaes.bin and b/bootloaders/encrypted/privateaes.bin differ
diff --git a/bootloaders/encrypted/update-key.cmake b/bootloaders/encrypted/update-key.cmake
deleted file mode 100644
index a14c90c7c..000000000
--- a/bootloaders/encrypted/update-key.cmake
+++ /dev/null
@@ -1,23 +0,0 @@
-if (CMAKE_VERSION VERSION_LESS 3.19)
-    # Check if keyfile is not the default, and print warning
-    file(READ ${CMAKE_CURRENT_LIST_DIR}/privateaes.bin key_file HEX)
-    if (NOT ${key_file} STREQUAL "000102030405060708090a0b0c0d0e0f00102030405060708090a0b0c0d0e0f0")
-        message(WARNING
-            "Encrypted bootloader AES key not updated in otp.json file, as CMake version is < 3.19"
-            " - you will need to change the key in otp.json manually and re-run the build"
-        )
-    endif()
-else()
-    # Read the JSON file.
-    file(READ ${CMAKE_CURRENT_LIST_DIR}/otp.json json_string)
-    # Read the key file
-    file(READ ${CMAKE_CURRENT_LIST_DIR}/privateaes.bin key_file HEX)
-
-    # adds '0x' prefix, comma suffix, and quotes for every byte
-    string(REGEX REPLACE "([0-9a-f][0-9a-f])" "\"0x\\1\", " key_file ${key_file})
-    set(key_file_json "[${key_file}]")
-
-    string(JSON json_string SET ${json_string} "30:0" "value" ${key_file_json})
-
-    file(WRITE ${CMAKE_CURRENT_LIST_DIR}/otp.json ${json_string})
-endif()
diff --git a/encrypted/CMakeLists.txt b/encrypted/CMakeLists.txt
new file mode 100644
index 000000000..c7346d4ea
--- /dev/null
+++ b/encrypted/CMakeLists.txt
@@ -0,0 +1,6 @@
+if (TARGET pico_mbedtls)
+    add_subdirectory_exclude_platforms(hello_encrypted host rp2040 rp2350-riscv)
+else()
+    # Assume picotool has no signing support, if no pico_mbedtls available
+    message("Skipping encrypted example as pico_mbedtls unavailable")
+endif ()
diff --git a/encrypted/hello_encrypted/CMakeLists.txt b/encrypted/hello_encrypted/CMakeLists.txt
new file mode 100644
index 000000000..8e400a646
--- /dev/null
+++ b/encrypted/hello_encrypted/CMakeLists.txt
@@ -0,0 +1,48 @@
+# Example encrypted binary
+add_executable(hello_encrypted
+        hello_encrypted.c
+        secret.S
+        )
+
+# include directory containing secret.txt
+target_include_directories(hello_encrypted PRIVATE ${CMAKE_CURRENT_LIST_DIR})
+
+# add dependency on secret.txt
+set_property(SOURCE secret.S APPEND PROPERTY OBJECT_DEPENDS ${CMAKE_CURRENT_LIST_DIR}/secret.txt)
+
+# pull in common dependencies
+target_link_libraries(hello_encrypted pico_stdlib)
+
+# enable stdio_usb and stdio_uart
+pico_enable_stdio_uart(hello_encrypted 1)
+pico_enable_stdio_usb(hello_encrypted 1)
+
+# set as no_flash binary
+pico_set_binary_type(hello_encrypted no_flash)
+
+# set version (optional)
+pico_set_binary_version(hello_encrypted MAJOR 7 MINOR 3)
+
+# set tbyb (optional)
+# target_compile_definitions(hello_encrypted PRIVATE PICO_CRT0_IMAGE_TYPE_TBYB=1)
+
+# configure otp output
+pico_set_otp_key_output_file(hello_encrypted ${CMAKE_CURRENT_BINARY_DIR}/otp.json)
+
+# sign, hash, and encrypt
+pico_sign_binary(hello_encrypted ${CMAKE_CURRENT_LIST_DIR}/private.pem)
+pico_hash_binary(hello_encrypted)
+pico_encrypt_binary(hello_encrypted
+    ${CMAKE_CURRENT_LIST_DIR}/privateaes.bin
+    ${CMAKE_CURRENT_LIST_DIR}/ivsalt.bin
+    EMBED
+    OTP_KEY_PAGE 29)
+
+# package uf2 in flash
+pico_package_uf2_output(hello_encrypted 0x10000000)
+
+# create map/bin/hex/uf2 file etc.
+pico_add_extra_outputs(hello_encrypted)
+
+# add url via pico_set_program_url
+example_auto_set_url(hello_encrypted)
diff --git a/encrypted/hello_encrypted/README.md b/encrypted/hello_encrypted/README.md
new file mode 100644
index 000000000..d65a2c50c
--- /dev/null
+++ b/encrypted/hello_encrypted/README.md
@@ -0,0 +1,28 @@
+For security you **must** replace private.pem and privateaes.bin with your own keys, and ivsalt.bin with your own per-device salt. Make sure you **don't lose your keys and salts**, else you may not be able to update the code on your device.
+
+Your signing key must be for the _secp256k1_ curve, in PEM format. You can create a .PEM file with:
+
+```bash
+openssl ecparam -name secp256k1 -genkey -out private.pem
+```
+
+The AES key is stored as a 4-way share in a 128-byte binary file - you can create one with:
+
+```bash
+dd if=/dev/urandom of=privateaes.bin bs=1 count=128
+```
+
+or in PowerShell 7
+```powershell
+[byte[]] $(Get-SecureRandom -Maximum 256 -Count 128) | Set-Content privateaes.bin -AsByteStream
+```
+
+The IV salt is just a 16-byte binary file - you can create it the same way, replacing `128` with `16` and `privateaes.bin` with `ivsalt.bin` in the commands above.
+
+You will need to program your OTP using the `otp.json` file generated by the build (found in your build folder).
+NOTE: This will enable secure boot on your device, so only correctly signed binaries can then run, and will also lock down the OTP pages the AES key and IV salt are stored in.
+```bash
+picotool otp load otp.json
+```
+
+> For more information on security see chapter 10 of the [RP2350 datasheet](https://datasheets.raspberrypi.com/rp2350/rp2350-datasheet.pdf), and for information on how to sign other binaries to run on a secure chip see section 5.10
diff --git a/encrypted/hello_encrypted/hello_encrypted.c b/encrypted/hello_encrypted/hello_encrypted.c
new file mode 100644
index 000000000..ff578bdcc
--- /dev/null
+++ b/encrypted/hello_encrypted/hello_encrypted.c
@@ -0,0 +1,52 @@
+/**
+ * Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "pico/stdlib.h"
+#include "pico/bootrom.h"
+#include "hardware/sync.h"
+
+int main() {
+    enable_interrupts();
+    stdio_init_all();
+
+#if PICO_CRT0_IMAGE_TYPE_TBYB
+    boot_info_t boot_info = {};
+    int ret = rom_get_boot_info(&boot_info);
+    if (ret) {
+        // BOOT_TBYB_AND_UPDATE_FLAG_BUY_PENDING will always be set, but check anyway
+        if (boot_info.tbyb_and_update_info & BOOT_TBYB_AND_UPDATE_FLAG_BUY_PENDING) {
+            // Need to check flash_update_base is set to see if this is a TBYB update
+            uint32_t flash_update_base = boot_info.reboot_params[0];
+            if (flash_update_base) {
+                printf("Perform self-check... ");
+                if (1 == 1) {   // replace this with your actual self-check function
+                    printf("passed\n");
+                } else {
+                    printf("failed - looping forever\n");
+                    while (true) sleep_ms(1000);
+                }
+            }
+            uint32_t buf_size = flash_update_base ? 4096 : 0;
+            uint8_t* buffer = flash_update_base ? malloc(buf_size) : NULL;
+            int ret = rom_explicit_buy(buffer, buf_size);
+            assert(ret == 0);
+            if (buffer) free(buffer);
+        }
+    }
+#endif
+    extern char secret_data[];
+
+    while (true) {
+        printf("Hello, world!\n");
+        printf("I'm a self-decrypting binary\n");
+        printf("My secret is...\n");
+        sleep_ms(1000);
+        printf(secret_data);
+        sleep_ms(10000);
+    }
+}
diff --git a/encrypted/hello_encrypted/ivsalt.bin b/encrypted/hello_encrypted/ivsalt.bin
new file mode 100644
index 000000000..fb9ef50b8
--- /dev/null
+++ b/encrypted/hello_encrypted/ivsalt.bin
@@ -0,0 +1 @@
+���x��%�^��=T�Č
\ No newline at end of file
diff --git a/encrypted/hello_encrypted/private.pem b/encrypted/hello_encrypted/private.pem
new file mode 100644
index 000000000..bf777d897
--- /dev/null
+++ b/encrypted/hello_encrypted/private.pem
@@ -0,0 +1,8 @@
+-----BEGIN EC PARAMETERS-----
+BgUrgQQACg==
+-----END EC PARAMETERS-----
+-----BEGIN EC PRIVATE KEY-----
+MHQCAQEEIAXAdiilH8wT07TESUzWPt+BY9+NcchvYU3xbnpK+CBNoAcGBSuBBAAK
+oUQDQgAEYYJtMQFGW4AB94tU3u/Qir5sRcYjBYMqCa+8gxsYd9OwMS3dqWKsnVBz
+dyy7bFWdJzXDMb9o20xRRd57Q9xSYw==
+-----END EC PRIVATE KEY-----
diff --git a/encrypted/hello_encrypted/privateaes.bin b/encrypted/hello_encrypted/privateaes.bin
new file mode 100644
index 000000000..21a47756d
Binary files /dev/null and b/encrypted/hello_encrypted/privateaes.bin differ
diff --git a/encrypted/hello_encrypted/secret.S b/encrypted/hello_encrypted/secret.S
new file mode 100644
index 000000000..0014c0d6e
--- /dev/null
+++ b/encrypted/hello_encrypted/secret.S
@@ -0,0 +1,5 @@
+.section .rodata
+.global secret_data
+secret_data:
+.incbin "secret.txt"
+.byte 0
\ No newline at end of file
diff --git a/encrypted/hello_encrypted/secret.txt b/encrypted/hello_encrypted/secret.txt
new file mode 100644
index 000000000..351db192a
--- /dev/null
+++ b/encrypted/hello_encrypted/secret.txt
@@ -0,0 +1 @@
+TODO: Put a funny secret here