Skip to content

Commit 2d98d47

Browse files
authored
Merge pull request #311 from goccy/feature/optimize-encode-path
Optimize encoding path for escaped string
2 parents 5686ae0 + 1bb8b16 commit 2d98d47

File tree

2 files changed

+186
-89
lines changed

2 files changed

+186
-89
lines changed

internal/encoder/decode_rune.go

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
package encoder
2+
3+
import "unicode/utf8"
4+
5+
const (
	// The default lowest and highest continuation byte.
	locb = 128 // 0b10000000
	hicb = 191 // 0b10111111

	// The names of these constants are chosen to give nice alignment in the
	// table below. The first nibble selects the accepted range for the second
	// byte (see the switch in decodeRuneInString), or is F for the special
	// one-byte cases. The second nibble is the encoded length in bytes, or the
	// status for the special one-byte cases.
	xx = 0xF1 // invalid: size 1
	as = 0xF0 // ASCII: size 1
	s1 = 0x02 // accept 0, size 2
	s2 = 0x13 // accept 1, size 3
	s3 = 0x03 // accept 0, size 3
	s4 = 0x23 // accept 2, size 3
	s5 = 0x34 // accept 3, size 4
	s6 = 0x04 // accept 0, size 4
	s7 = 0x44 // accept 4, size 4
)

// first is information about the first byte in a UTF-8 sequence.
var first = [256]uint8{
	// 0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
	// 0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
	xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
	s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
	s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
	s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
}

// acceptRange gives the range of valid values for the second byte in a UTF-8
// sequence.
type acceptRange struct {
	lo uint8 // lowest value for second byte.
	hi uint8 // highest value for second byte.
}

// Third byte of the UTF-8 encodings of the two separator characters that get
// their own decode state. Both encodings start with the bytes 0xE2 0x80.
const (
	lineSep      = byte(168) // last byte of '\u2028' (0xA8)
	paragraphSep = byte(169) // last byte of '\u2029' (0xA9)
)

// decodeRuneState is the classification decodeRuneInString assigns to the
// byte sequence at the start of a string.
type decodeRuneState int

const (
	// validUTF8State: a well-formed sequence that is neither separator.
	validUTF8State decodeRuneState = iota
	// runeErrorState: the leading byte does not start a well-formed sequence.
	runeErrorState
	// lineSepState: the sequence is U+2028 LINE SEPARATOR.
	lineSepState
	// paragraphSepState: the sequence is U+2029 PARAGRAPH SEPARATOR.
	paragraphSepState
)

// decodeRuneInString classifies the UTF-8 sequence at the start of s and
// reports how many bytes it occupies. Unlike utf8.DecodeRuneInString it never
// assembles the rune value; it only validates the bytes and recognises
// U+2028 and U+2029.
//
// Returns:
//   - (validUTF8State, 1..4) for a well-formed sequence,
//   - (lineSepState, 3) or (paragraphSepState, 3) for U+2028 / U+2029,
//   - (runeErrorState, 1) for an invalid or truncated sequence,
//   - (runeErrorState, 0) for an empty string, mirroring the (RuneError, 0)
//     result of utf8.DecodeRuneInString instead of panicking on s[0].
func decodeRuneInString(s string) (decodeRuneState, int) {
	n := len(s)
	if n < 1 {
		return runeErrorState, 0
	}
	b0 := s[0]
	x := first[b0]
	if x >= as {
		// One-byte cases: x is either as (ASCII) or xx (invalid). The low bit
		// of x distinguishes them, so the shift pair yields an all-zeros mask
		// for ASCII and an all-ones mask for invalid bytes. The masked value
		// equals utf8.RuneError exactly when x == xx.
		mask := rune(x) << 31 >> 31
		if rune(b0)&^mask|utf8.RuneError&mask == utf8.RuneError {
			return runeErrorState, 1
		}
		return validUTF8State, 1
	}

	// Multi-byte lead: the low nibble holds the sequence length (2..4).
	sz := int(x & 7)
	if n < sz {
		// Truncated sequence.
		return runeErrorState, 1
	}

	// The high nibble selects the legal range of the second byte. The narrowed
	// ranges reject overlong encodings, surrogates and values above U+10FFFF.
	var accept acceptRange
	switch x >> 4 {
	case 0:
		accept = acceptRange{locb, hicb}
	case 1:
		accept = acceptRange{0xA0, hicb}
	case 2:
		accept = acceptRange{locb, 0x9F}
	case 3:
		accept = acceptRange{0x90, hicb}
	case 4:
		accept = acceptRange{locb, 0x8F}
	}

	b1 := s[1]
	if b1 < accept.lo || accept.hi < b1 {
		return runeErrorState, 1
	}
	if sz <= 2 {
		return validUTF8State, 2
	}

	b2 := s[2]
	if b2 < locb || hicb < b2 {
		return runeErrorState, 1
	}
	if sz <= 3 {
		// Separator character prefix: [2]byte{226, 128} (0xE2, 0x80).
		if b0 == 226 && b1 == 128 {
			switch b2 {
			case lineSep:
				return lineSepState, 3
			case paragraphSep:
				return paragraphSepState, 3
			}
		}
		return validUTF8State, 3
	}

	b3 := s[3]
	if b3 < locb || hicb < b3 {
		return runeErrorState, 1
	}
	return validUTF8State, 4
}

internal/encoder/string.go

Lines changed: 59 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ package encoder
33
import (
44
"math/bits"
55
"reflect"
6-
"unicode/utf8"
76
"unsafe"
87
)
98

@@ -349,53 +348,6 @@ var needEscape = [256]bool{
349348

350349
var hex = "0123456789abcdef"
351350

352-
// escapeIndex finds the index of the first char in `s` that requires escaping.
353-
// A char requires escaping if it's outside of the range of [0x20, 0x7F] or if
354-
// it includes a double quote or backslash.
355-
// If no chars in `s` require escaping, the return value is -1.
356-
func escapeIndex(s string) int {
357-
chunks := stringToUint64Slice(s)
358-
for _, n := range chunks {
359-
// combine masks before checking for the MSB of each byte. We include
360-
// `n` in the mask to check whether any of the *input* byte MSBs were
361-
// set (i.e. the byte was outside the ASCII range).
362-
mask := n | below(n, 0x20) | contains(n, '"') | contains(n, '\\')
363-
if (mask & msb) != 0 {
364-
return bits.TrailingZeros64(mask&msb) / 8
365-
}
366-
}
367-
368-
valLen := len(s)
369-
for i := len(chunks) * 8; i < valLen; i++ {
370-
if needEscape[s[i]] {
371-
return i
372-
}
373-
}
374-
375-
return -1
376-
}
377-
378-
// below return a mask that can be used to determine if any of the bytes
379-
// in `n` are below `b`. If a byte's MSB is set in the mask then that byte was
380-
// below `b`. The result is only valid if `b`, and each byte in `n`, is below
381-
// 0x80.
382-
func below(n uint64, b byte) uint64 {
383-
return n - expand(b)
384-
}
385-
386-
// contains returns a mask that can be used to determine if any of the
387-
// bytes in `n` are equal to `b`. If a byte's MSB is set in the mask then
388-
// that byte is equal to `b`. The result is only valid if `b`, and each
389-
// byte in `n`, is below 0x80.
390-
func contains(n uint64, b byte) uint64 {
391-
return (n ^ expand(b)) - lsb
392-
}
393-
394-
// expand puts the specified byte into each of the 8 bytes of a uint64.
395-
func expand(b byte) uint64 {
396-
return lsb * uint64(b)
397-
}
398-
399351
//nolint:govet
400352
func stringToUint64Slice(s string) []uint64 {
401353
return *(*[]uint64)(unsafe.Pointer(&reflect.SliceHeader{
@@ -489,10 +441,9 @@ ESCAPE_END:
489441
i = j + 1
490442
j = j + 1
491443
continue
492-
}
493444

494-
// This encodes bytes < 0x20 except for \t, \n and \r.
495-
if c < 0x20 {
445+
case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F
446+
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F
496447
buf = append(buf, s[i:j]...)
497448
buf = append(buf, `\u00`...)
498449
buf = append(buf, hex[c>>4], hex[c&0xF])
@@ -501,33 +452,34 @@ ESCAPE_END:
501452
continue
502453
}
503454

504-
r, size := utf8.DecodeRuneInString(s[j:])
505-
506-
if r == utf8.RuneError && size == 1 {
455+
state, size := decodeRuneInString(s[j:])
456+
switch state {
457+
case runeErrorState:
507458
buf = append(buf, s[i:j]...)
508459
buf = append(buf, `\ufffd`...)
509-
i = j + size
510-
j = j + size
460+
i = j + 1
461+
j = j + 1
511462
continue
512-
}
513-
514-
switch r {
515-
case '\u2028', '\u2029':
516463
// U+2028 is LINE SEPARATOR.
517464
// U+2029 is PARAGRAPH SEPARATOR.
518465
// They are both technically valid characters in JSON strings,
519466
// but don't work in JSONP, which has to be evaluated as JavaScript,
520467
// and can lead to security holes there. It is valid JSON to
521468
// escape them, so we do so unconditionally.
522469
// See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion.
470+
case lineSepState:
523471
buf = append(buf, s[i:j]...)
524-
buf = append(buf, `\u202`...)
525-
buf = append(buf, hex[r&0xF])
526-
i = j + size
527-
j = j + size
472+
buf = append(buf, `\u2028`...)
473+
i = j + 3
474+
j = j + 3
475+
continue
476+
case paragraphSepState:
477+
buf = append(buf, s[i:j]...)
478+
buf = append(buf, `\u2029`...)
479+
i = j + 3
480+
j = j + 3
528481
continue
529482
}
530-
531483
j += size
532484
}
533485

@@ -540,19 +492,37 @@ func appendString(buf []byte, s string) []byte {
540492
return append(buf, `""`...)
541493
}
542494
buf = append(buf, '"')
543-
var escapeIdx int
495+
var (
496+
i, j int
497+
)
544498
if valLen >= 8 {
545-
if escapeIdx = escapeIndex(s); escapeIdx < 0 {
546-
return append(append(buf, s...), '"')
499+
chunks := stringToUint64Slice(s)
500+
for _, n := range chunks {
501+
// combine masks before checking for the MSB of each byte. We include
502+
// `n` in the mask to check whether any of the *input* byte MSBs were
503+
// set (i.e. the byte was outside the ASCII range).
504+
mask := n | (n - (lsb * 0x20)) |
505+
((n ^ (lsb * '"')) - lsb) |
506+
((n ^ (lsb * '\\')) - lsb)
507+
if (mask & msb) != 0 {
508+
j = bits.TrailingZeros64(mask&msb) / 8
509+
goto ESCAPE_END
510+
}
547511
}
512+
valLen := len(s)
513+
for i := len(chunks) * 8; i < valLen; i++ {
514+
if needEscape[s[i]] {
515+
j = i
516+
goto ESCAPE_END
517+
}
518+
}
519+
return append(append(buf, s...), '"')
548520
}
549-
550-
i := 0
551-
j := escapeIdx
521+
ESCAPE_END:
552522
for j < valLen {
553523
c := s[j]
554524

555-
if c >= 0x20 && c <= 0x7f && c != '\\' && c != '"' {
525+
if !needEscape[c] {
556526
// fast path: most of the time, printable ascii characters are used
557527
j++
558528
continue
@@ -594,10 +564,9 @@ func appendString(buf []byte, s string) []byte {
594564
i = j + 1
595565
j = j + 1
596566
continue
597-
}
598567

599-
// This encodes bytes < 0x20 except for \t, \n and \r.
600-
if c < 0x20 {
568+
case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F
569+
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F
601570
buf = append(buf, s[i:j]...)
602571
buf = append(buf, `\u00`...)
603572
buf = append(buf, hex[c>>4], hex[c&0xF])
@@ -606,33 +575,34 @@ func appendString(buf []byte, s string) []byte {
606575
continue
607576
}
608577

609-
r, size := utf8.DecodeRuneInString(s[j:])
610-
611-
if r == utf8.RuneError && size == 1 {
578+
state, size := decodeRuneInString(s[j:])
579+
switch state {
580+
case runeErrorState:
612581
buf = append(buf, s[i:j]...)
613582
buf = append(buf, `\ufffd`...)
614-
i = j + size
615-
j = j + size
583+
i = j + 1
584+
j = j + 1
616585
continue
617-
}
618-
619-
switch r {
620-
case '\u2028', '\u2029':
621586
// U+2028 is LINE SEPARATOR.
622587
// U+2029 is PARAGRAPH SEPARATOR.
623588
// They are both technically valid characters in JSON strings,
624589
// but don't work in JSONP, which has to be evaluated as JavaScript,
625590
// and can lead to security holes there. It is valid JSON to
626591
// escape them, so we do so unconditionally.
627592
// See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion.
593+
case lineSepState:
628594
buf = append(buf, s[i:j]...)
629-
buf = append(buf, `\u202`...)
630-
buf = append(buf, hex[r&0xF])
631-
i = j + size
632-
j = j + size
595+
buf = append(buf, `\u2028`...)
596+
i = j + 3
597+
j = j + 3
598+
continue
599+
case paragraphSepState:
600+
buf = append(buf, s[i:j]...)
601+
buf = append(buf, `\u2029`...)
602+
i = j + 3
603+
j = j + 3
633604
continue
634605
}
635-
636606
j += size
637607
}
638608

0 commit comments

Comments
 (0)