Skip to content

Commit 36c164e

Browse files
committed
vendor: add golang.org/x/text/unicode/norm + x/text/width for IDNA support
Add golang.org/x/text/unicode/norm from x/text git rev a7c0236. Needed by net/http for IDNA normalization. Updates #13835 Change-Id: I8b024e179d573f2b093c209a4b9e4f71f7d4a1f2 Reviewed-on: https://go-review.googlesource.com/29859 Run-TryBot: Brad Fitzpatrick <[email protected]> TryBot-Result: Gobot Gobot <[email protected]> Reviewed-by: Marcel van Lohuizen <[email protected]>
1 parent 15b4d18 commit 36c164e

File tree

16 files changed

+12425
-0
lines changed

16 files changed

+12425
-0
lines changed

src/vendor/golang_org/x/text/transform/transform.go

Lines changed: 705 additions & 0 deletions
Large diffs are not rendered by default.

src/vendor/golang_org/x/text/unicode/norm/composition.go

Lines changed: 514 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 256 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,256 @@
1+
// Copyright 2011 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
package norm
6+
7+
// This file contains Form-specific logic and wrappers for data in tables.go.
8+
9+
// Rune info is stored in a separate trie per composing form. A composing form
10+
// and its corresponding decomposing form share the same trie. Each trie maps
11+
// a rune to a uint16. The values take two forms. For v >= 0x8000:
12+
// bits
13+
// 15: 1 (inverse of NFD_QC bit of qcInfo)
14+
// 13..8: qcInfo (see below). isYesD is always true (no decomposition).
15+
// 7..0: ccc (compressed CCC value).
16+
// For v < 0x8000, the respective rune has a decomposition and v is an index
17+
// into a byte array of UTF-8 decomposition sequences and additional info and
18+
// has the form:
19+
// <header> <decomp_byte>* [<tccc> [<lccc>]]
20+
// The header contains the number of bytes in the decomposition (excluding this
21+
// length byte). The two most significant bits of this length byte correspond
22+
// to bit 5 and 4 of qcInfo (see below). The byte sequence itself starts at v+1.
23+
// The byte sequence is followed by a trailing and leading CCC if the values
24+
// for these are not zero. The value of v determines which ccc are appended
25+
// to the sequences. For v < firstCCC, there are none, for v >= firstCCC,
26+
// the sequence is followed by a trailing ccc, and for v >= firstLeadingCCC
27+
// there is an additional leading ccc. The value of tccc itself is the
28+
// trailing CCC shifted left 2 bits. The two least-significant bits of tccc
29+
// are the number of trailing non-starters.
30+
31+
// Bit masks used when unpacking trie values and decomposition header bytes.
// The header byte splits into a 6-bit length (headerLenMask) and two flag
// bits (headerFlagsMask) that compInfo shifts into the qcInfo flags.
const (
32+
qcInfoMask = 0x3F // to clear all but the relevant bits in a qcInfo
33+
headerLenMask = 0x3F // extract the length value from the header byte
34+
headerFlagsMask = 0xC0 // extract the qcInfo bits from the header byte
35+
)
36+
37+
// Properties provides access to normalization properties of a rune.
38+
type Properties struct {
39+
pos uint8 // start position in reorderBuffer; used in composition.go
40+
size uint8 // length of UTF-8 encoding of this rune
41+
ccc uint8 // leading canonical combining class (ccc if not decomposition)
42+
tccc uint8 // trailing canonical combining class (ccc if not decomposition)
43+
nLead uint8 // number of leading non-starters.
44+
flags qcInfo // quick check flags
45+
index uint16 // offset of the decomposition header in decomps; 0 if there is no decomposition
46+
}
47+
48+
// functions dispatchable per form
49+
// lookupFunc returns the Properties of the rune that starts at offset i of b.
type lookupFunc func(b input, i int) Properties
50+
51+
// formInfo holds Form-specific functions and tables.
52+
type formInfo struct {
53+
form Form // the Form this entry belongs to
54+
composing, compatibility bool // form type
55+
info lookupFunc // init assigns lookupInfoNFKC for compatibility forms, lookupInfoNFC otherwise
56+
nextMain iterFunc // init assigns nextComposed for composing forms, nextDecomposed otherwise
57+
}
58+
59+
// formTable holds one formInfo per Form. init fills it with four entries,
// each stored at the index equal to its Form value.
var formTable []*formInfo
60+
61+
func init() {
62+
formTable = make([]*formInfo, 4)
63+
64+
for i := range formTable {
65+
f := &formInfo{}
66+
formTable[i] = f
67+
f.form = Form(i)
68+
if Form(i) == NFKD || Form(i) == NFKC {
69+
f.compatibility = true
70+
f.info = lookupInfoNFKC
71+
} else {
72+
f.info = lookupInfoNFC
73+
}
74+
f.nextMain = nextDecomposed
75+
if Form(i) == NFC || Form(i) == NFKC {
76+
f.nextMain = nextComposed
77+
f.composing = true
78+
}
79+
}
80+
}
81+
82+
// We do not distinguish between boundaries for NFC, NFD, etc. to avoid
83+
// unexpected behavior for the user. For example, in NFD, there is a boundary
84+
// after 'a'. However, 'a' might combine with modifiers, so from the application's
85+
// perspective it is not a good boundary. We will therefore always use the
86+
// boundaries for the combining variants.
87+
88+
// BoundaryBefore returns true if this rune starts a new segment and
89+
// cannot combine with any rune on the left.
90+
func (p Properties) BoundaryBefore() bool {
91+
if p.ccc == 0 && !p.combinesBackward() {
92+
return true
93+
}
94+
// We assume that the CCC of the first character in a decomposition
95+
// is always non-zero if different from info.ccc and that we can return
96+
// false at this point. This is verified by maketables.
97+
return false
98+
}
99+
100+
// BoundaryAfter returns true if runes cannot combine with or otherwise
101+
// interact with this or previous runes.
102+
func (p Properties) BoundaryAfter() bool {
103+
// TODO: loosen these conditions.
104+
return p.isInert()
105+
}
106+
107+
// We pack quick check data in 6 bits:
108+
// 5: Combines forward (0 == false, 1 == true)
109+
// 4..3: NFC_QC Yes(00), No (10), or Maybe (11)
110+
// 2: NFD_QC Yes (0) or No (1). No also means there is a decomposition.
111+
// 1..0: Number of trailing non-starters.
112+
//
113+
// When all 6 bits are zero, the character is inert, meaning it is never
114+
// influenced by normalization (isInert additionally requires ccc == 0).
115+
type qcInfo uint8
116+
117+
// isYesC reports whether the high NFC_QC bit (0x10) is clear, which the
// qcInfo encoding uses only for NFC_QC == Yes.
func (p Properties) isYesC() bool {
	const nfcNotYes = 0x10
	return p.flags&nfcNotYes == 0
}
118+
// isYesD reports whether the NFD_QC bit (0x4) is clear, i.e. NFD_QC == Yes.
func (p Properties) isYesD() bool {
	const nfdNo = 0x4
	return p.flags&nfdNo == 0
}
119+
120+
// combinesForward reports whether the "combines forward" bit (0x20) of the
// quick check flags is set.
func (p Properties) combinesForward() bool {
	const forwardBit = 0x20
	return p.flags&forwardBit != 0
}
121+
// combinesBackward reports whether the low NFC_QC bit (0x8) is set, which
// the qcInfo encoding uses only for NFC_QC == Maybe (== isMaybe).
func (p Properties) combinesBackward() bool {
	const maybeBit = 0x8
	return p.flags&maybeBit != 0
}
122+
// hasDecomposition reports whether the NFD_QC bit (0x4) is set; NFD_QC == No
// also means there is a decomposition (== isNoD).
func (p Properties) hasDecomposition() bool {
	const nfdNo = 0x4
	return p.flags&nfdNo != 0
}
123+
124+
func (p Properties) isInert() bool {
125+
return p.flags&qcInfoMask == 0 && p.ccc == 0
126+
}
127+
128+
func (p Properties) multiSegment() bool {
129+
return p.index >= firstMulti && p.index < endMulti
130+
}
131+
132+
func (p Properties) nLeadingNonStarters() uint8 {
133+
return p.nLead
134+
}
135+
136+
func (p Properties) nTrailingNonStarters() uint8 {
137+
return uint8(p.flags & 0x03)
138+
}
139+
140+
// Decomposition returns the decomposition for the underlying rune
141+
// or nil if there is none.
142+
func (p Properties) Decomposition() []byte {
143+
// TODO: create the decomposition for Hangul?
144+
if p.index == 0 {
145+
return nil
146+
}
147+
i := p.index
148+
n := decomps[i] & headerLenMask
149+
i++
150+
return decomps[i : i+uint16(n)]
151+
}
152+
153+
// Size returns the length of UTF-8 encoding of the rune.
154+
func (p Properties) Size() int {
155+
return int(p.size)
156+
}
157+
158+
// CCC returns the canonical combining class of the underlying rune.
159+
func (p Properties) CCC() uint8 {
160+
if p.index >= firstCCCZeroExcept {
161+
return 0
162+
}
163+
return ccc[p.ccc]
164+
}
165+
166+
// LeadCCC returns the CCC of the first rune in the decomposition.
167+
// If there is no decomposition, LeadCCC equals CCC.
168+
func (p Properties) LeadCCC() uint8 {
169+
return ccc[p.ccc]
170+
}
171+
172+
// TrailCCC returns the CCC of the last rune in the decomposition.
173+
// If there is no decomposition, TrailCCC equals CCC.
174+
func (p Properties) TrailCCC() uint8 {
175+
return ccc[p.tccc]
176+
}
177+
178+
// Recomposition
179+
// We use 32-bit keys instead of 64-bit for the two codepoint keys.
180+
// This clips off the bits of three entries, but we know this will not
181+
// result in a collision. In the unlikely event that changes to
182+
// UnicodeData.txt introduce collisions, the compiler will catch it.
183+
// Note that the recomposition map for NFC and NFKC are identical.
184+
185+
// combine returns the combined rune or 0 if it doesn't exist.
186+
func combine(a, b rune) rune {
187+
key := uint32(uint16(a))<<16 + uint32(uint16(b))
188+
return recompMap[key]
189+
}
190+
191+
func lookupInfoNFC(b input, i int) Properties {
192+
v, sz := b.charinfoNFC(i)
193+
return compInfo(v, sz)
194+
}
195+
196+
func lookupInfoNFKC(b input, i int) Properties {
197+
v, sz := b.charinfoNFKC(i)
198+
return compInfo(v, sz)
199+
}
200+
201+
// Properties returns properties for the first rune in s.
202+
func (f Form) Properties(s []byte) Properties {
203+
if f == NFC || f == NFD {
204+
return compInfo(nfcData.lookup(s))
205+
}
206+
return compInfo(nfkcData.lookup(s))
207+
}
208+
209+
// PropertiesString returns properties for the first rune in s.
210+
func (f Form) PropertiesString(s string) Properties {
211+
if f == NFC || f == NFD {
212+
return compInfo(nfcData.lookupString(s))
213+
}
214+
return compInfo(nfkcData.lookupString(s))
215+
}
216+
217+
// compInfo converts the information contained in v and sz
218+
// to a Properties. See the comment at the top of the file
219+
// for more information on the format.
220+
func compInfo(v uint16, sz int) Properties {
221+
if v == 0 {
222+
return Properties{size: uint8(sz)}
223+
} else if v >= 0x8000 {
224+
p := Properties{
225+
size: uint8(sz),
226+
ccc: uint8(v),
227+
tccc: uint8(v),
228+
flags: qcInfo(v >> 8),
229+
}
230+
if p.ccc > 0 || p.combinesBackward() {
231+
p.nLead = uint8(p.flags & 0x3)
232+
}
233+
return p
234+
}
235+
// has decomposition
236+
h := decomps[v]
237+
f := (qcInfo(h&headerFlagsMask) >> 2) | 0x4
238+
p := Properties{size: uint8(sz), flags: f, index: v}
239+
if v >= firstCCC {
240+
v += uint16(h&headerLenMask) + 1
241+
c := decomps[v]
242+
p.tccc = c >> 2
243+
p.flags |= qcInfo(c & 0x3)
244+
if v >= firstLeadingCCC {
245+
p.nLead = c & 0x3
246+
if v >= firstStarterWithNLead {
247+
// We were tricked. Remove the decomposition.
248+
p.flags &= 0x03
249+
p.index = 0
250+
return p
251+
}
252+
p.ccc = decomps[v+1]
253+
}
254+
}
255+
return p
256+
}
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
// Copyright 2011 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
package norm
6+
7+
import "unicode/utf8"
8+
9+
// input wraps either a string or a byte slice so that both can be scanned
// through one set of methods. The byte slice takes precedence: the string is
// consulted only while bytes is nil.
type input struct {
	str   string
	bytes []byte
}

// inputBytes returns an input that reads from the byte slice str.
func inputBytes(str []byte) input {
	return input{bytes: str}
}

// inputString returns an input that reads from the string str.
func inputString(str string) input {
	return input{str: str}
}

// setBytes makes in read from the byte slice str and clears the string source.
func (in *input) setBytes(str []byte) {
	in.str = ""
	in.bytes = str
}

// setString makes in read from the string str and clears the byte source.
func (in *input) setString(str string) {
	in.str = str
	in.bytes = nil
}

// _byte returns the byte at offset p of the active source.
func (in *input) _byte(p int) byte {
	if in.bytes == nil {
		return in.str[p]
	}
	return in.bytes[p]
}

// skipASCII advances p while p < max and the byte at p is ASCII
// (< utf8.RuneSelf) and returns the resulting offset.
func (in *input) skipASCII(p, max int) int {
	if in.bytes == nil {
		for ; p < max && in.str[p] < utf8.RuneSelf; p++ {
		}
	} else {
		for ; p < max && in.bytes[p] < utf8.RuneSelf; p++ {
		}
	}
	return p
}

// skipContinuationBytes advances p past bytes that cannot start a rune and
// returns the resulting offset, which is the source length if no rune start
// is found.
func (in *input) skipContinuationBytes(p int) int {
	if in.bytes == nil {
		for ; p < len(in.str) && !utf8.RuneStart(in.str[p]); p++ {
		}
	} else {
		for ; p < len(in.bytes) && !utf8.RuneStart(in.bytes[p]); p++ {
		}
	}
	return p
}

// appendSlice appends the bytes in the range [b, e) of the active source to
// buf and returns the extended slice.
func (in *input) appendSlice(buf []byte, b, e int) []byte {
	if in.bytes != nil {
		return append(buf, in.bytes[b:e]...)
	}
	// Append the whole string range in one call instead of one byte at a
	// time. The b < e guard keeps an empty or inverted range a no-op, as it
	// was with the byte-wise loop, rather than letting the slice expression
	// panic.
	if b < e {
		buf = append(buf, in.str[b:e]...)
	}
	return buf
}

// copySlice copies the bytes in the range [b, e) of the active source into
// buf and returns the number of bytes copied.
func (in *input) copySlice(buf []byte, b, e int) int {
	if in.bytes == nil {
		return copy(buf, in.str[b:e])
	}
	return copy(buf, in.bytes[b:e])
}
77+
78+
func (in *input) charinfoNFC(p int) (uint16, int) {
79+
if in.bytes == nil {
80+
return nfcData.lookupString(in.str[p:])
81+
}
82+
return nfcData.lookup(in.bytes[p:])
83+
}
84+
85+
func (in *input) charinfoNFKC(p int) (uint16, int) {
86+
if in.bytes == nil {
87+
return nfkcData.lookupString(in.str[p:])
88+
}
89+
return nfkcData.lookup(in.bytes[p:])
90+
}
91+
92+
func (in *input) hangul(p int) (r rune) {
93+
if in.bytes == nil {
94+
if !isHangulString(in.str[p:]) {
95+
return 0
96+
}
97+
r, _ = utf8.DecodeRuneInString(in.str[p:])
98+
} else {
99+
if !isHangul(in.bytes[p:]) {
100+
return 0
101+
}
102+
r, _ = utf8.DecodeRune(in.bytes[p:])
103+
}
104+
return r
105+
}

0 commit comments

Comments
 (0)