Skip to content

Commit 85e5e33

Browse files
committed
internal/chacha20: refactor for readability and consistency
Separated the complex buffering logic from key stream generation more clearly, added plenty of comments and generally refactored the Go implementation for readability.

Made the interface with the generic/assembly cores smaller and more consistent, according to golang.org/wiki/TargetSpecific.

We will recover the lost performance on unaligned calls by caching 3/4 of the first round across XORKeyStream invocations, which we now have complexity budget for.

name                old speed     new speed     delta
ChaCha20/64-4       435MB/s ± 2%  429MB/s ± 2%  -1.47%  (p=0.013 n=10+9)
ChaCha20/256-4      496MB/s ± 1%  493MB/s ± 2%    ~     (p=0.280 n=10+10)
ChaCha20/10x25-4    283MB/s ± 1%  274MB/s ± 2%  -3.13%  (p=0.000 n=10+10)
ChaCha20/4096-4     494MB/s ± 1%  493MB/s ± 5%    ~     (p=0.631 n=10+10)
ChaCha20/100x40-4   421MB/s ± 3%  408MB/s ± 1%  -3.14%  (p=0.003 n=9+9)
ChaCha20/65536-4    515MB/s ± 1%  519MB/s ± 3%    ~     (p=0.161 n=7+10)
ChaCha20/1000x65-4  501MB/s ± 2%  501MB/s ± 3%    ~     (p=0.497 n=9+10)

Also applied a fix for a lingering bug in the ppc64le assembly written by Lynn Boger <[email protected]>.

Updates golang/go#24485

Change-Id: I10cf24a7f10359b1b4ae63c9bb1946735b98ac9b
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/185439
Reviewed-by: Michael Munday <[email protected]>
1 parent 2dbfe90 commit 85e5e33

File tree

10 files changed

+197
-253
lines changed

10 files changed

+197
-253
lines changed

internal/chacha20/chacha_arm64.go

Lines changed: 4 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,29 +3,15 @@
33
// license that can be found in the LICENSE file.
44

55
// +build go1.11
6-
// +build !gccgo
6+
// +build !gccgo,!appengine
77

88
package chacha20
99

10-
const (
11-
haveAsm = true
12-
bufSize = 256
13-
)
10+
const bufSize = 256
1411

1512
//go:noescape
1613
func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
1714

18-
func (c *Cipher) xorKeyStreamAsm(dst, src []byte) {
19-
20-
if len(src) >= bufSize {
21-
xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter)
22-
}
23-
24-
if len(src)%bufSize != 0 {
25-
i := len(src) - len(src)%bufSize
26-
c.buf = [bufSize]byte{}
27-
copy(c.buf[:], src[i:])
28-
xorKeyStreamVX(c.buf[:], c.buf[:], &c.key, &c.nonce, &c.counter)
29-
c.len = bufSize - copy(dst[i:], c.buf[:len(src)%bufSize])
30-
}
15+
func (c *Cipher) xorKeyStreamBlocks(dst, src []byte) {
16+
xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter)
3117
}
File renamed without changes.

internal/chacha20/chacha_generic.go

Lines changed: 124 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -2,57 +2,68 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5-
// Package ChaCha20 implements the core ChaCha20 function as specified
6-
// in https://tools.ietf.org/html/rfc7539#section-2.3.
5+
// Package chacha20 implements the ChaCha20 encryption algorithm
6+
// as specified in RFC 8439.
77
package chacha20
88

99
import (
1010
"crypto/cipher"
1111
"encoding/binary"
12+
"math/bits"
1213

1314
"golang.org/x/crypto/internal/subtle"
1415
)
1516

16-
// assert that *Cipher implements cipher.Stream
17-
var _ cipher.Stream = (*Cipher)(nil)
18-
1917
// Cipher is a stateful instance of ChaCha20 using a particular key
2018
// and nonce. A *Cipher implements the cipher.Stream interface.
2119
type Cipher struct {
20+
// The ChaCha20 state is 16 words: 4 constant, 8 of key, 1 of counter
21+
// (incremented after each block), and 3 of nonce.
2222
key [8]uint32
23-
counter uint32 // incremented after each block
23+
counter uint32
2424
nonce [3]uint32
25-
buf [bufSize]byte // buffer for unused keystream bytes
26-
len int // number of unused keystream bytes at end of buf
25+
26+
// The last len bytes of buf are leftover key stream bytes from the previous
27+
// XORKeyStream invocation. The size of buf depends on how many blocks are
28+
// computed at a time.
29+
buf [bufSize]byte
30+
len int
2731
}
2832

33+
var _ cipher.Stream = (*Cipher)(nil)
34+
2935
// New creates a new ChaCha20 stream cipher with the given key and nonce.
3036
// The initial counter value is set to 0.
3137
func New(key [8]uint32, nonce [3]uint32) *Cipher {
3238
return &Cipher{key: key, nonce: nonce}
3339
}
3440

35-
// ChaCha20 constants spelling "expand 32-byte k"
41+
// The constant first 4 words of the ChaCha20 state.
3642
const (
37-
j0 uint32 = 0x61707865
38-
j1 uint32 = 0x3320646e
39-
j2 uint32 = 0x79622d32
40-
j3 uint32 = 0x6b206574
43+
j0 uint32 = 0x61707865 // expa
44+
j1 uint32 = 0x3320646e // nd 3
45+
j2 uint32 = 0x79622d32 // 2-by
46+
j3 uint32 = 0x6b206574 // te k
4147
)
4248

49+
const blockSize = 64
50+
51+
// quarterRound is the core of ChaCha20. It shuffles the bits of 4 state words.
52+
// It's executed 4 times for each of the 20 ChaCha20 rounds, operating on all 16
53+
// words each round, in columnar or diagonal groups of 4 at a time.
4354
func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
4455
a += b
4556
d ^= a
46-
d = (d << 16) | (d >> 16)
57+
d = bits.RotateLeft32(d, 16)
4758
c += d
4859
b ^= c
49-
b = (b << 12) | (b >> 20)
60+
b = bits.RotateLeft32(b, 12)
5061
a += b
5162
d ^= a
52-
d = (d << 8) | (d >> 24)
63+
d = bits.RotateLeft32(d, 8)
5364
c += d
5465
b ^= c
55-
b = (b << 7) | (b >> 25)
66+
b = bits.RotateLeft32(b, 7)
5667
return a, b, c, d
5768
}
5869

@@ -67,116 +78,141 @@ func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
6778
// the src buffers was passed in a single run. That is, Cipher
6879
// maintains state and does not reset at each XORKeyStream call.
6980
func (s *Cipher) XORKeyStream(dst, src []byte) {
81+
if len(src) == 0 {
82+
return
83+
}
7084
if len(dst) < len(src) {
7185
panic("chacha20: output smaller than input")
7286
}
73-
if subtle.InexactOverlap(dst[:len(src)], src) {
87+
dst = dst[:len(src)]
88+
if subtle.InexactOverlap(dst, src) {
7489
panic("chacha20: invalid buffer overlap")
7590
}
7691

77-
// xor src with buffered keystream first
92+
// First, drain any remaining key stream from a previous XORKeyStream.
7893
if s.len != 0 {
79-
buf := s.buf[len(s.buf)-s.len:]
80-
if len(src) < len(buf) {
81-
buf = buf[:len(src)]
82-
}
83-
td, ts := dst[:len(buf)], src[:len(buf)] // BCE hint
84-
for i, b := range buf {
85-
td[i] = ts[i] ^ b
94+
keyStream := s.buf[bufSize-s.len:]
95+
if len(src) < len(keyStream) {
96+
keyStream = keyStream[:len(src)]
8697
}
87-
s.len -= len(buf)
88-
if s.len != 0 {
89-
return
98+
_ = src[len(keyStream)-1] // bounds check elimination hint
99+
for i, b := range keyStream {
100+
dst[i] = src[i] ^ b
90101
}
91-
s.buf = [len(s.buf)]byte{} // zero the empty buffer
92-
src = src[len(buf):]
93-
dst = dst[len(buf):]
102+
s.len -= len(keyStream)
103+
src = src[len(keyStream):]
104+
dst = dst[len(keyStream):]
94105
}
95106

96-
if len(src) == 0 {
97-
return
107+
const blocksPerBuf = bufSize / blockSize
108+
numBufs := (uint64(len(src)) + bufSize - 1) / bufSize
109+
if uint64(s.counter)+numBufs*blocksPerBuf >= 1<<32 {
110+
panic("chacha20: counter overflow")
98111
}
99-
if haveAsm {
100-
if uint64(len(src))+uint64(s.counter)*64 > (1<<38)-64 {
101-
panic("chacha20: counter overflow")
102-
}
103-
s.xorKeyStreamAsm(dst, src)
104-
return
112+
113+
// xorKeyStreamBlocks implementations expect input lengths that are a
114+
// multiple of bufSize. Platform-specific ones process multiple blocks at a
115+
// time, so have bufSizes that are a multiple of blockSize.
116+
117+
rem := len(src) % bufSize
118+
full := len(src) - rem
119+
120+
if full > 0 {
121+
s.xorKeyStreamBlocks(dst[:full], src[:full])
105122
}
106123

107-
// set up a 64-byte buffer to pad out the final block if needed
108-
// (hoisted out of the main loop to avoid spills)
109-
rem := len(src) % 64 // length of final block
110-
fin := len(src) - rem // index of final block
124+
// If we have a partial (multi-)block, pad it for xorKeyStreamBlocks, and
125+
// keep the leftover keystream for the next XORKeyStream invocation.
111126
if rem > 0 {
112-
copy(s.buf[len(s.buf)-64:], src[fin:])
127+
s.buf = [bufSize]byte{}
128+
copy(s.buf[:], src[full:])
129+
s.xorKeyStreamBlocks(s.buf[:], s.buf[:])
130+
s.len = bufSize - copy(dst[full:], s.buf[:])
131+
}
132+
}
133+
134+
func (s *Cipher) xorKeyStreamBlocksGeneric(dst, src []byte) {
135+
if len(dst) != len(src) || len(dst)%blockSize != 0 {
136+
panic("chacha20: internal error: wrong dst and/or src length")
113137
}
114138

115-
// pre-calculate most of the first round
116-
s1, s5, s9, s13 := quarterRound(j1, s.key[1], s.key[5], s.nonce[0])
117-
s2, s6, s10, s14 := quarterRound(j2, s.key[2], s.key[6], s.nonce[1])
118-
s3, s7, s11, s15 := quarterRound(j3, s.key[3], s.key[7], s.nonce[2])
139+
// To generate each block of key stream, the initial cipher state
140+
// (represented below) is passed through 20 rounds of shuffling,
141+
// alternately applying quarterRounds by columns (like 1, 5, 9, 13)
142+
// or by diagonals (like 1, 6, 11, 12).
143+
//
144+
// 0:cccccccc 1:cccccccc 2:cccccccc 3:cccccccc
145+
// 4:kkkkkkkk 5:kkkkkkkk 6:kkkkkkkk 7:kkkkkkkk
146+
// 8:kkkkkkkk 9:kkkkkkkk 10:kkkkkkkk 11:kkkkkkkk
147+
// 12:bbbbbbbb 13:nnnnnnnn 14:nnnnnnnn 15:nnnnnnnn
148+
//
149+
// c=constant k=key b=blockcount n=nonce
150+
var (
151+
c0, c1, c2, c3 = j0, j1, j2, j3
152+
c4, c5, c6, c7 = s.key[0], s.key[1], s.key[2], s.key[3]
153+
c8, c9, c10, c11 = s.key[4], s.key[5], s.key[6], s.key[7]
154+
_, c13, c14, c15 = s.counter, s.nonce[0], s.nonce[1], s.nonce[2]
155+
)
119156

120-
n := len(src)
121-
src, dst = src[:n:n], dst[:n:n] // BCE hint
122-
for i := 0; i < n; i += 64 {
123-
// calculate the remainder of the first round
124-
s0, s4, s8, s12 := quarterRound(j0, s.key[0], s.key[4], s.counter)
157+
// Three quarters of the first round don't depend on the counter, so we can
158+
// calculate them here, and reuse them for multiple blocks in the loop.
159+
// TODO(filippo): experiment with reusing across XORKeyStream calls.
160+
s1, s5, s9, s13 := quarterRound(c1, c5, c9, c13)
161+
s2, s6, s10, s14 := quarterRound(c2, c6, c10, c14)
162+
s3, s7, s11, s15 := quarterRound(c3, c7, c11, c15)
125163

126-
// execute the second round
164+
for i := 0; i < len(src); i += blockSize {
165+
// The remainder of the first column round.
166+
s0, s4, s8, s12 := quarterRound(c0, c4, c8, s.counter)
167+
168+
// The second diagonal round.
127169
x0, x5, x10, x15 := quarterRound(s0, s5, s10, s15)
128170
x1, x6, x11, x12 := quarterRound(s1, s6, s11, s12)
129171
x2, x7, x8, x13 := quarterRound(s2, s7, s8, s13)
130172
x3, x4, x9, x14 := quarterRound(s3, s4, s9, s14)
131173

132-
// execute the remaining 18 rounds
174+
// The remaining 18 rounds.
133175
for i := 0; i < 9; i++ {
176+
// Column round.
134177
x0, x4, x8, x12 = quarterRound(x0, x4, x8, x12)
135178
x1, x5, x9, x13 = quarterRound(x1, x5, x9, x13)
136179
x2, x6, x10, x14 = quarterRound(x2, x6, x10, x14)
137180
x3, x7, x11, x15 = quarterRound(x3, x7, x11, x15)
138181

182+
// Diagonal round.
139183
x0, x5, x10, x15 = quarterRound(x0, x5, x10, x15)
140184
x1, x6, x11, x12 = quarterRound(x1, x6, x11, x12)
141185
x2, x7, x8, x13 = quarterRound(x2, x7, x8, x13)
142186
x3, x4, x9, x14 = quarterRound(x3, x4, x9, x14)
143187
}
144188

145-
x0 += j0
146-
x1 += j1
147-
x2 += j2
148-
x3 += j3
149-
150-
x4 += s.key[0]
151-
x5 += s.key[1]
152-
x6 += s.key[2]
153-
x7 += s.key[3]
154-
x8 += s.key[4]
155-
x9 += s.key[5]
156-
x10 += s.key[6]
157-
x11 += s.key[7]
158-
189+
// Finally, add back the initial state to generate the key stream.
190+
x0 += c0
191+
x1 += c1
192+
x2 += c2
193+
x3 += c3
194+
x4 += c4
195+
x5 += c5
196+
x6 += c6
197+
x7 += c7
198+
x8 += c8
199+
x9 += c9
200+
x10 += c10
201+
x11 += c11
159202
x12 += s.counter
160-
x13 += s.nonce[0]
161-
x14 += s.nonce[1]
162-
x15 += s.nonce[2]
203+
x13 += c13
204+
x14 += c14
205+
x15 += c15
163206

164-
// increment the counter
165207
s.counter += 1
166208
if s.counter == 0 {
167-
panic("chacha20: counter overflow")
209+
panic("chacha20: internal error: counter overflow")
168210
}
169211

170-
// pad to 64 bytes if needed
171212
in, out := src[i:], dst[i:]
172-
if i == fin {
173-
// src[fin:] has already been copied into s.buf before
174-
// the main loop
175-
in, out = s.buf[len(s.buf)-64:], s.buf[len(s.buf)-64:]
176-
}
177-
in, out = in[:64], out[:64] // BCE hint
213+
in, out = in[:blockSize], out[:blockSize] // bounds check elimination hint
178214

179-
// XOR the key stream with the source and write out the result
215+
// XOR the key stream with the source and write out the result.
180216
xor(out[0:], in[0:], x0)
181217
xor(out[4:], in[4:], x1)
182218
xor(out[8:], in[8:], x2)
@@ -194,22 +230,13 @@ func (s *Cipher) XORKeyStream(dst, src []byte) {
194230
xor(out[56:], in[56:], x14)
195231
xor(out[60:], in[60:], x15)
196232
}
197-
// copy any trailing bytes out of the buffer and into dst
198-
if rem != 0 {
199-
s.len = 64 - rem
200-
copy(dst[fin:], s.buf[len(s.buf)-64:])
201-
}
202233
}
203234

204235
// Advance discards bytes in the key stream until the next 64 byte block
205-
// boundary is reached and updates the counter accordingly. If the key
206-
// stream is already at a block boundary no bytes will be discarded and
207-
// the counter will be unchanged.
236+
// boundary is reached. If the key stream is already at a block boundary no
237+
// bytes will be discarded.
208238
func (s *Cipher) Advance() {
209-
s.len -= s.len % 64
210-
if s.len == 0 {
211-
s.buf = [len(s.buf)]byte{}
212-
}
239+
s.len -= s.len % blockSize
213240
}
214241

215242
// XORKeyStream crypts bytes from in to out using the given key and counters.
@@ -246,11 +273,13 @@ func HChaCha20(key *[8]uint32, nonce *[4]uint32) [8]uint32 {
246273
x12, x13, x14, x15 := nonce[0], nonce[1], nonce[2], nonce[3]
247274

248275
for i := 0; i < 10; i++ {
276+
// Column round.
249277
x0, x4, x8, x12 = quarterRound(x0, x4, x8, x12)
250278
x1, x5, x9, x13 = quarterRound(x1, x5, x9, x13)
251279
x2, x6, x10, x14 = quarterRound(x2, x6, x10, x14)
252280
x3, x7, x11, x15 = quarterRound(x3, x7, x11, x15)
253281

282+
// Diagonal round.
254283
x0, x5, x10, x15 = quarterRound(x0, x5, x10, x15)
255284
x1, x6, x11, x12 = quarterRound(x1, x6, x11, x12)
256285
x2, x7, x8, x13 = quarterRound(x2, x7, x8, x13)

internal/chacha20/chacha_noasm.go

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,8 @@
66

77
package chacha20
88

9-
const (
10-
bufSize = 64
11-
haveAsm = false
12-
)
9+
const bufSize = blockSize
1310

14-
func (*Cipher) xorKeyStreamAsm(dst, src []byte) {
15-
panic("not implemented")
11+
func (s *Cipher) xorKeyStreamBlocks(dst, src []byte) {
12+
s.xorKeyStreamBlocksGeneric(dst, src)
1613
}

0 commit comments

Comments
 (0)