Skip to content

Commit 98aa496

Browse files
committed
exp/norm: exposed runeInfo type in API.
For completeness, we also expose the Canonical Combining Class of a rune. This does not increase the data size. R=r CC=golang-dev https://golang.org/cl/5931043
1 parent d8e9b04 commit 98aa496

File tree

6 files changed

+128
-69
lines changed

6 files changed

+128
-69
lines changed

src/pkg/exp/norm/composition.go

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,10 @@ const (
2222
// the UTF-8 characters in order. Only the rune array is maintained in sorted
2323
// order. flush writes the resulting segment to a byte array.
2424
type reorderBuffer struct {
25-
rune [maxBufferSize]runeInfo // Per character info.
26-
byte [maxByteBufferSize]byte // UTF-8 buffer. Referenced by runeInfo.pos.
27-
nrune int // Number of runeInfos.
28-
nbyte uint8 // Number or bytes.
25+
rune [maxBufferSize]Properties // Per character info.
26+
byte [maxByteBufferSize]byte // UTF-8 buffer. Referenced by Properties.pos.
27+
nrune int // Number of Properties entries in rune.
28+
nbyte uint8 // Number of bytes.
2929
f formInfo
3030

3131
src input
@@ -81,7 +81,7 @@ func (rb *reorderBuffer) flushCopy(buf []byte) int {
8181
// insertOrdered inserts a rune in the buffer, ordered by Canonical Combining Class.
8282
// It returns false if the buffer is not large enough to hold the rune.
8383
// It is used internally by insert and insertString only.
84-
func (rb *reorderBuffer) insertOrdered(info runeInfo) bool {
84+
func (rb *reorderBuffer) insertOrdered(info Properties) bool {
8585
n := rb.nrune
8686
if n >= maxCombiningChars+1 {
8787
return false
@@ -107,12 +107,12 @@ func (rb *reorderBuffer) insertOrdered(info runeInfo) bool {
107107

108108
// insert inserts the given rune in the buffer ordered by CCC.
109109
// It returns true if the buffer was large enough to hold the decomposed rune.
110-
func (rb *reorderBuffer) insert(src input, i int, info runeInfo) bool {
110+
func (rb *reorderBuffer) insert(src input, i int, info Properties) bool {
111111
if rune := src.hangul(i); rune != 0 {
112112
return rb.decomposeHangul(rune)
113113
}
114114
if info.hasDecomposition() {
115-
return rb.insertDecomposed(info.decomposition())
115+
return rb.insertDecomposed(info.Decomposition())
116116
}
117117
return rb.insertSingle(src, i, info)
118118
}
@@ -136,7 +136,7 @@ func (rb *reorderBuffer) insertDecomposed(dcomp []byte) bool {
136136

137137
// insertSingle inserts an entry in the reorderBuffer for the rune at
138138
// position i. info is the Properties for the rune at position i.
139-
func (rb *reorderBuffer) insertSingle(src input, i int, info runeInfo) bool {
139+
func (rb *reorderBuffer) insertSingle(src input, i int, info Properties) bool {
140140
// insertOrdered changes nbyte
141141
pos := rb.nbyte
142142
if !rb.insertOrdered(info) {
@@ -151,15 +151,15 @@ func (rb *reorderBuffer) appendRune(r rune) {
151151
bn := rb.nbyte
152152
sz := utf8.EncodeRune(rb.byte[bn:], rune(r))
153153
rb.nbyte += utf8.UTFMax
154-
rb.rune[rb.nrune] = runeInfo{pos: bn, size: uint8(sz)}
154+
rb.rune[rb.nrune] = Properties{pos: bn, size: uint8(sz)}
155155
rb.nrune++
156156
}
157157

158158
// assignRune sets a rune at position pos. It is used for Hangul and recomposition.
159159
func (rb *reorderBuffer) assignRune(pos int, r rune) {
160160
bn := rb.rune[pos].pos
161161
sz := utf8.EncodeRune(rb.byte[bn:], rune(r))
162-
rb.rune[pos] = runeInfo{pos: bn, size: uint8(sz)}
162+
rb.rune[pos] = Properties{pos: bn, size: uint8(sz)}
163163
}
164164

165165
// runeAt returns the rune at position n. It is used for Hangul and recomposition.

src/pkg/exp/norm/forminfo.go

Lines changed: 76 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,8 @@ const (
3232
headerFlagsMask = 0xC0 // extract the qcInfo bits from the header byte
3333
)
3434

35-
// runeInfo is a representation for the data stored in charinfoTrie.
36-
type runeInfo struct {
35+
// Properties provides access to normalization properties of a rune.
36+
type Properties struct {
3737
pos uint8 // start position in reorderBuffer; used in composition.go
3838
size uint8 // length of UTF-8 encoding of this rune
3939
ccc uint8 // leading canonical combining class (ccc if not decomposition)
@@ -43,7 +43,7 @@ type runeInfo struct {
4343
}
4444

4545
// functions dispatchable per form
46-
type lookupFunc func(b input, i int) runeInfo
46+
type lookupFunc func(b input, i int) Properties
4747

4848
// formInfo holds Form-specific functions and tables.
4949
type formInfo struct {
@@ -75,11 +75,14 @@ func init() {
7575

7676
// We do not distinguish between boundaries for NFC, NFD, etc. to avoid
7777
// unexpected behavior for the user. For example, in NFD, there is a boundary
78-
// after 'a'. However, a might combine with modifiers, so from the application's
78+
// after 'a'. However, 'a' might combine with modifiers, so from the application's
7979
// perspective it is not a good boundary. We will therefore always use the
8080
// boundaries for the combining variants.
81-
func (i runeInfo) boundaryBefore() bool {
82-
if i.ccc == 0 && !i.combinesBackward() {
81+
82+
// BoundaryBefore returns true if this rune starts a new segment and
83+
// cannot combine with any rune on the left.
84+
func (p Properties) BoundaryBefore() bool {
85+
if p.ccc == 0 && !p.combinesBackward() {
8386
return true
8487
}
8588
// We assume that the CCC of the first character in a decomposition
@@ -88,8 +91,10 @@ func (i runeInfo) boundaryBefore() bool {
8891
return false
8992
}
9093

91-
func (i runeInfo) boundaryAfter() bool {
92-
return i.isInert()
94+
// BoundaryAfter returns true if this rune cannot combine with runes to the right
95+
// and always denotes the end of a segment.
96+
func (p Properties) BoundaryAfter() bool {
97+
return p.isInert()
9398
}
9499

95100
// We pack quick check data in 4 bits:
@@ -101,25 +106,52 @@ func (i runeInfo) boundaryAfter() bool {
101106
// influenced by normalization.
102107
type qcInfo uint8
103108

104-
func (i runeInfo) isYesC() bool { return i.flags&0x4 == 0 }
105-
func (i runeInfo) isYesD() bool { return i.flags&0x1 == 0 }
109+
func (p Properties) isYesC() bool { return p.flags&0x4 == 0 }
110+
func (p Properties) isYesD() bool { return p.flags&0x1 == 0 }
106111

107-
func (i runeInfo) combinesForward() bool { return i.flags&0x8 != 0 }
108-
func (i runeInfo) combinesBackward() bool { return i.flags&0x2 != 0 } // == isMaybe
109-
func (i runeInfo) hasDecomposition() bool { return i.flags&0x1 != 0 } // == isNoD
112+
func (p Properties) combinesForward() bool { return p.flags&0x8 != 0 }
113+
func (p Properties) combinesBackward() bool { return p.flags&0x2 != 0 } // == isMaybe
114+
func (p Properties) hasDecomposition() bool { return p.flags&0x1 != 0 } // == isNoD
110115

111-
func (r runeInfo) isInert() bool {
112-
return r.flags&0xf == 0 && r.ccc == 0
116+
func (p Properties) isInert() bool {
117+
return p.flags&0xf == 0 && p.ccc == 0
113118
}
114119

115-
func (r runeInfo) decomposition() []byte {
116-
if r.index == 0 {
120+
// Decomposition returns the decomposition for the underlying rune
121+
// or nil if there is none.
122+
func (p Properties) Decomposition() []byte {
123+
if p.index == 0 {
117124
return nil
118125
}
119-
p := r.index
120-
n := decomps[p] & 0x3F
121-
p++
122-
return decomps[p : p+uint16(n)]
126+
i := p.index
127+
n := decomps[i] & headerLenMask
128+
i++
129+
return decomps[i : i+uint16(n)]
130+
}
131+
132+
// Size returns the length of the UTF-8 encoding of the rune.
133+
func (p Properties) Size() int {
134+
return int(p.size)
135+
}
136+
137+
// CCC returns the canonical combining class of the underlying rune.
138+
func (p Properties) CCC() uint8 {
139+
if p.index > firstCCCZeroExcept {
140+
return 0
141+
}
142+
return p.ccc
143+
}
144+
145+
// LeadCCC returns the CCC of the first rune in the decomposition.
146+
// If there is no decomposition, LeadCCC equals CCC.
147+
func (p Properties) LeadCCC() uint8 {
148+
return p.ccc
149+
}
150+
151+
// TrailCCC returns the CCC of the last rune in the decomposition.
152+
// If there is no decomposition, TrailCCC equals CCC.
153+
func (p Properties) TrailCCC() uint8 {
154+
return p.tccc
123155
}
124156

125157
// Recomposition
@@ -135,24 +167,40 @@ func combine(a, b rune) rune {
135167
return recompMap[key]
136168
}
137169

138-
func lookupInfoNFC(b input, i int) runeInfo {
170+
func lookupInfoNFC(b input, i int) Properties {
139171
v, sz := b.charinfoNFC(i)
140172
return compInfo(v, sz)
141173
}
142174

143-
func lookupInfoNFKC(b input, i int) runeInfo {
175+
func lookupInfoNFKC(b input, i int) Properties {
144176
v, sz := b.charinfoNFKC(i)
145177
return compInfo(v, sz)
146178
}
147179

180+
// Properties returns properties for the first rune in s.
181+
func (f Form) Properties(s []byte) Properties {
182+
if f == NFC || f == NFD {
183+
return compInfo(nfcTrie.lookup(s))
184+
}
185+
return compInfo(nfkcTrie.lookup(s))
186+
}
187+
188+
// PropertiesString returns properties for the first rune in s.
189+
func (f Form) PropertiesString(s string) Properties {
190+
if f == NFC || f == NFD {
191+
return compInfo(nfcTrie.lookupString(s))
192+
}
193+
return compInfo(nfkcTrie.lookupString(s))
194+
}
195+
148196
// compInfo converts the information contained in v and sz
149-
// to a runeInfo. See the comment at the top of the file
197+
// to a Properties. See the comment at the top of the file
150198
// for more information on the format.
151-
func compInfo(v uint16, sz int) runeInfo {
199+
func compInfo(v uint16, sz int) Properties {
152200
if v == 0 {
153-
return runeInfo{size: uint8(sz)}
201+
return Properties{size: uint8(sz)}
154202
} else if v >= 0x8000 {
155-
return runeInfo{
203+
return Properties{
156204
size: uint8(sz),
157205
ccc: uint8(v),
158206
tccc: uint8(v),
@@ -162,7 +210,7 @@ func compInfo(v uint16, sz int) runeInfo {
162210
// has decomposition
163211
h := decomps[v]
164212
f := (qcInfo(h&headerFlagsMask) >> 4) | 0x1
165-
ri := runeInfo{size: uint8(sz), flags: f, index: v}
213+
ri := Properties{size: uint8(sz), flags: f, index: v}
166214
if v >= firstCCC {
167215
v += uint16(h&headerLenMask) + 1
168216
ri.tccc = decomps[v]

src/pkg/exp/norm/iter.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@ const MaxSegmentSize = maxByteBufferSize
1010
// to a given Form.
1111
type Iter struct {
1212
rb reorderBuffer
13-
info runeInfo // first character saved from previous iteration
14-
next iterFunc // implementation of next depends on form
13+
info Properties // first character saved from previous iteration
14+
next iterFunc // implementation of next depends on form
1515

1616
p int // current position in input source
1717
outStart int // start of current segment in output buffer
@@ -124,7 +124,7 @@ doFast:
124124
break
125125
}
126126
}
127-
} else if d := i.info.decomposition(); d != nil {
127+
} else if d := i.info.Decomposition(); d != nil {
128128
i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p)
129129
p := outp + len(d)
130130
if p > i.maxseg && i.setStart(outp, i.p) {
@@ -245,7 +245,7 @@ doFast:
245245
if i.setStart(outp-1, i.p-1) {
246246
i.p--
247247
outp--
248-
i.info = runeInfo{size: 1}
248+
i.info = Properties{size: 1}
249249
break
250250
}
251251
}
@@ -274,7 +274,7 @@ doNorm:
274274
return outp
275275
}
276276
i.info = i.rb.f.info(i.rb.src, i.p)
277-
if i.info.boundaryBefore() {
277+
if i.info.BoundaryBefore() {
278278
break
279279
}
280280
}

src/pkg/exp/norm/maketables.go

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -605,6 +605,10 @@ func printCharInfoTables() int {
605605

606606
lccc := ccc(d[0])
607607
tccc := ccc(d[len(d)-1])
608+
cc := ccc(r)
609+
if cc != 0 && lccc == 0 && tccc == 0 {
610+
logger.Fatalf("%U: trailing and leading ccc are 0 for non-zero ccc %d", cc)
611+
}
608612
if tccc < lccc && lccc != 0 {
609613
const msg = "%U: lccc (%d) must be <= tcc (%d)"
610614
logger.Fatalf(msg, r, lccc, tccc)
@@ -615,7 +619,13 @@ func printCharInfoTables() int {
615619
index = 1
616620
if lccc > 0 {
617621
s += string([]byte{lccc})
618-
index |= 2
622+
index = 2
623+
}
624+
if cc != lccc {
625+
if cc != 0 {
626+
logger.Fatalf("%U: for lccc != ccc, expected ccc to be 0; was %d", cc)
627+
}
628+
index = 3
619629
}
620630
}
621631
return index, s
@@ -642,7 +652,7 @@ func printCharInfoTables() int {
642652
size := 0
643653
positionMap := make(map[string]uint16)
644654
decompositions.WriteString("\000")
645-
cname := []string{"firstCCC", "firstLeadingCCC", "", "lastDecomp"}
655+
cname := []string{"firstCCC", "firstLeadingCCC", "firstCCCZeroExcept", "lastDecomp"}
646656
fmt.Println("const (")
647657
for i, m := range decompSet {
648658
sa := []string{}

0 commit comments

Comments
 (0)