|
| 1 | +// Copyright 2011 The Go Authors. All rights reserved. |
| 2 | +// Use of this source code is governed by a BSD-style |
| 3 | +// license that can be found in the LICENSE file. |
| 4 | + |
| 5 | +package norm |
| 6 | + |
| 7 | +// This file contains Form-specific logic and wrappers for data in tables.go. |
| 8 | + |
| 9 | +// Rune info is stored in a separate trie per composing form. A composing form |
| 10 | +// and its corresponding decomposing form share the same trie. Each trie maps |
| 11 | +// a rune to a uint16. The values take two forms. For v >= 0x8000: |
| 12 | +// bits |
| 13 | +// 15: 1 (inverse of NFD_QC bit of qcInfo) |
| 14 | +// 13..7: qcInfo (see below). isYesD is always true (no decomposition). |
| 15 | +// 6..0: ccc (compressed CCC value). |
| 16 | +// For v < 0x8000, the respective rune has a decomposition and v is an index |
| 17 | +// into a byte array of UTF-8 decomposition sequences and additional info and |
| 18 | +// has the form: |
| 19 | +// <header> <decomp_byte>* [<tccc> [<lccc>]] |
| 20 | +// The header contains the number of bytes in the decomposition (excluding this |
| 21 | +// length byte). The two most significant bits of this length byte correspond |
| 22 | +// to bit 5 and 4 of qcInfo (see below). The byte sequence itself starts at v+1. |
| 23 | +// The byte sequence is followed by a trailing and leading CCC if the values |
| 24 | +// for these are not zero. The value of v determines which ccc are appended |
| 25 | +// to the sequences. For v < firstCCC, there are none, for v >= firstCCC, |
| 26 | +// the sequence is followed by a trailing ccc, and for v >= firstLeadingCCC |
| 27 | +// there is an additional leading ccc. The value of tccc itself is the |
| 28 | +// trailing CCC shifted left 2 bits. The two least-significant bits of tccc |
| 29 | +// are the number of trailing non-starters. |
| 30 | + |
// Bit masks applied to qcInfo values and to the header byte that starts
// each entry in the decomposition byte array.
const (
	qcInfoMask      = 0x3F // to clear all but the relevant bits in a qcInfo (the low six bits)
	headerLenMask   = 0x3F // extract the length value from the header byte (the low six bits)
	headerFlagsMask = 0xC0 // extract the qcInfo bits from the header byte (the top two bits)
)
| 36 | + |
// Properties provides access to normalization properties of a rune.
//
// The ccc and tccc fields are not raw combining classes: the accessor
// methods translate them through the ccc table before returning them.
type Properties struct {
	pos   uint8  // start position in reorderBuffer; used in composition.go
	size  uint8  // length of UTF-8 encoding of this rune
	ccc   uint8  // leading canonical combining class (ccc if not decomposition)
	tccc  uint8  // trailing canonical combining class (ccc if not decomposition)
	nLead uint8  // number of leading non-starters.
	flags qcInfo // quick check flags
	index uint16 // offset of the decomposition header in decomps; 0 means no decomposition
}
| 47 | + |
// lookupFunc is the type of the functions dispatchable per form: given an
// input b and a position i, it returns the Properties found there.
type lookupFunc func(b input, i int) Properties
| 50 | + |
// formInfo holds Form-specific functions and tables.
// One instance per Form is created in init and stored in formTable.
type formInfo struct {
	form                     Form       // the Form this entry describes
	composing, compatibility bool       // form type: composing is set for NFC/NFKC, compatibility for NFKD/NFKC
	info                     lookupFunc // lookupInfoNFKC for compatibility forms, lookupInfoNFC otherwise
	nextMain                 iterFunc   // nextComposed for composing forms, nextDecomposed otherwise
}
| 58 | + |
| 59 | +var formTable []*formInfo |
| 60 | + |
| 61 | +func init() { |
| 62 | + formTable = make([]*formInfo, 4) |
| 63 | + |
| 64 | + for i := range formTable { |
| 65 | + f := &formInfo{} |
| 66 | + formTable[i] = f |
| 67 | + f.form = Form(i) |
| 68 | + if Form(i) == NFKD || Form(i) == NFKC { |
| 69 | + f.compatibility = true |
| 70 | + f.info = lookupInfoNFKC |
| 71 | + } else { |
| 72 | + f.info = lookupInfoNFC |
| 73 | + } |
| 74 | + f.nextMain = nextDecomposed |
| 75 | + if Form(i) == NFC || Form(i) == NFKC { |
| 76 | + f.nextMain = nextComposed |
| 77 | + f.composing = true |
| 78 | + } |
| 79 | + } |
| 80 | +} |
| 81 | + |
| 82 | +// We do not distinguish between boundaries for NFC, NFD, etc. to avoid |
| 83 | +// unexpected behavior for the user. For example, in NFD, there is a boundary |
| 84 | +// after 'a'. However, 'a' might combine with modifiers, so from the application's |
| 85 | +// perspective it is not a good boundary. We will therefore always use the |
| 86 | +// boundaries for the combining variants. |
| 87 | + |
| 88 | +// BoundaryBefore returns true if this rune starts a new segment and |
| 89 | +// cannot combine with any rune on the left. |
| 90 | +func (p Properties) BoundaryBefore() bool { |
| 91 | + if p.ccc == 0 && !p.combinesBackward() { |
| 92 | + return true |
| 93 | + } |
| 94 | + // We assume that the CCC of the first character in a decomposition |
| 95 | + // is always non-zero if different from info.ccc and that we can return |
| 96 | + // false at this point. This is verified by maketables. |
| 97 | + return false |
| 98 | +} |
| 99 | + |
// BoundaryAfter returns true if runes cannot combine with or otherwise
// interact with this or previous runes.
//
// The current implementation reports exactly what isInert reports.
func (p Properties) BoundaryAfter() bool {
	// TODO: loosen these conditions.
	return p.isInert()
}
| 106 | + |
// We pack quick check data in 6 bits:
//   5:    Combines forward  (0 == false, 1 == true)
//   4..3: NFC_QC Yes(00), No (10), or Maybe (11)
//   2:    NFD_QC Yes (0) or No (1). No also means there is a decomposition.
//   1..0: Number of trailing non-starters.
//
// When all 6 bits are zero, the character is inert, meaning it is never
// influenced by normalization. (isInert additionally requires the ccc
// field of the Properties to be zero.)
type qcInfo uint8
| 116 | + |
// isYesC reports whether bit 4 of the flags is clear, i.e. NFC_QC is Yes.
func (p Properties) isYesC() bool { return p.flags&0x10 == 0 }

// isYesD reports whether bit 2 of the flags is clear, i.e. NFD_QC is Yes.
func (p Properties) isYesD() bool { return p.flags&0x4 == 0 }

// combinesForward reports whether bit 5 of the flags is set.
func (p Properties) combinesForward() bool { return p.flags&0x20 != 0 }

// combinesBackward reports whether bit 3 of the flags is set; within the
// NFC_QC encoding only Maybe (11) has this bit set.
func (p Properties) combinesBackward() bool { return p.flags&0x8 != 0 } // == isMaybe

// hasDecomposition reports whether bit 2 of the flags is set, i.e. NFD_QC
// is No. It is the negation of isYesD.
func (p Properties) hasDecomposition() bool { return p.flags&0x4 != 0 } // == isNoD
| 123 | + |
| 124 | +func (p Properties) isInert() bool { |
| 125 | + return p.flags&qcInfoMask == 0 && p.ccc == 0 |
| 126 | +} |
| 127 | + |
| 128 | +func (p Properties) multiSegment() bool { |
| 129 | + return p.index >= firstMulti && p.index < endMulti |
| 130 | +} |
| 131 | + |
// nLeadingNonStarters returns the number of leading non-starters, as
// stored in the nLead field.
func (p Properties) nLeadingNonStarters() uint8 {
	return p.nLead
}
| 135 | + |
// nTrailingNonStarters returns the number of trailing non-starters, which
// is kept in the two least-significant bits of the quick check flags.
func (p Properties) nTrailingNonStarters() uint8 {
	return uint8(p.flags & 0x03)
}
| 139 | + |
| 140 | +// Decomposition returns the decomposition for the underlying rune |
| 141 | +// or nil if there is none. |
| 142 | +func (p Properties) Decomposition() []byte { |
| 143 | + // TODO: create the decomposition for Hangul? |
| 144 | + if p.index == 0 { |
| 145 | + return nil |
| 146 | + } |
| 147 | + i := p.index |
| 148 | + n := decomps[i] & headerLenMask |
| 149 | + i++ |
| 150 | + return decomps[i : i+uint16(n)] |
| 151 | +} |
| 152 | + |
// Size returns the length of UTF-8 encoding of the rune.
// The value is the stored size field widened to int.
func (p Properties) Size() int {
	return int(p.size)
}
| 157 | + |
| 158 | +// CCC returns the canonical combining class of the underlying rune. |
| 159 | +func (p Properties) CCC() uint8 { |
| 160 | + if p.index >= firstCCCZeroExcept { |
| 161 | + return 0 |
| 162 | + } |
| 163 | + return ccc[p.ccc] |
| 164 | +} |
| 165 | + |
// LeadCCC returns the CCC of the first rune in the decomposition.
// If there is no decomposition, LeadCCC equals CCC.
//
// The stored ccc field is translated through the ccc table.
func (p Properties) LeadCCC() uint8 {
	return ccc[p.ccc]
}
| 171 | + |
// TrailCCC returns the CCC of the last rune in the decomposition.
// If there is no decomposition, TrailCCC equals CCC.
//
// The stored tccc field is translated through the ccc table.
func (p Properties) TrailCCC() uint8 {
	return ccc[p.tccc]
}
| 177 | + |
| 178 | +// Recomposition |
| 179 | +// We use 32-bit keys instead of 64-bit for the two codepoint keys. |
| 180 | +// This clips off the bits of three entries, but we know this will not |
| 181 | +// result in a collision. In the unlikely event that changes to |
| 182 | +// UnicodeData.txt introduce collisions, the compiler will catch it. |
| 183 | +// Note that the recomposition map for NFC and NFKC are identical. |
| 184 | + |
| 185 | +// combine returns the combined rune or 0 if it doesn't exist. |
| 186 | +func combine(a, b rune) rune { |
| 187 | + key := uint32(uint16(a))<<16 + uint32(uint16(b)) |
| 188 | + return recompMap[key] |
| 189 | +} |
| 190 | + |
| 191 | +func lookupInfoNFC(b input, i int) Properties { |
| 192 | + v, sz := b.charinfoNFC(i) |
| 193 | + return compInfo(v, sz) |
| 194 | +} |
| 195 | + |
| 196 | +func lookupInfoNFKC(b input, i int) Properties { |
| 197 | + v, sz := b.charinfoNFKC(i) |
| 198 | + return compInfo(v, sz) |
| 199 | +} |
| 200 | + |
| 201 | +// Properties returns properties for the first rune in s. |
| 202 | +func (f Form) Properties(s []byte) Properties { |
| 203 | + if f == NFC || f == NFD { |
| 204 | + return compInfo(nfcData.lookup(s)) |
| 205 | + } |
| 206 | + return compInfo(nfkcData.lookup(s)) |
| 207 | +} |
| 208 | + |
| 209 | +// PropertiesString returns properties for the first rune in s. |
| 210 | +func (f Form) PropertiesString(s string) Properties { |
| 211 | + if f == NFC || f == NFD { |
| 212 | + return compInfo(nfcData.lookupString(s)) |
| 213 | + } |
| 214 | + return compInfo(nfkcData.lookupString(s)) |
| 215 | +} |
| 216 | + |
// compInfo converts the information contained in v and sz
// to a Properties. See the comment at the top of the file
// for more information on the format.
//
// v is the value to decode and sz is stored, truncated to uint8, as the
// size of the result. Three cases are distinguished:
//   - v == 0: only size is set.
//   - v >= 0x8000: no decomposition; ccc, tccc and flags are taken
//     directly from v.
//   - otherwise: v is an offset into decomps, and flags and the CCC
//     values are read from the entry stored there.
func compInfo(v uint16, sz int) Properties {
	if v == 0 {
		// Nothing but the size is recorded; every other field stays zero.
		return Properties{size: uint8(sz)}
	} else if v >= 0x8000 {
		// The low byte of v serves as both leading and trailing ccc and the
		// high byte holds the quick check flags.
		p := Properties{
			size:  uint8(sz),
			ccc:   uint8(v),
			tccc:  uint8(v),
			flags: qcInfo(v >> 8),
		}
		// For a rune with a non-zero ccc or one that combines backward, the
		// two low flag bits are also used as the leading non-starter count.
		if p.ccc > 0 || p.combinesBackward() {
			p.nLead = uint8(p.flags & 0x3)
		}
		return p
	}
	// has decomposition
	// h is the header byte: length in the low six bits, flags in the top two.
	h := decomps[v]
	// Shift the two header flag bits down to bits 5 and 4 of qcInfo and set
	// bit 2 (0x4), the bit tested by hasDecomposition.
	f := (qcInfo(h&headerFlagsMask) >> 2) | 0x4
	p := Properties{size: uint8(sz), flags: f, index: v}
	if v >= firstCCC {
		// Skip the header and the decomposition bytes so that v addresses
		// the trailing CCC byte.
		v += uint16(h&headerLenMask) + 1
		c := decomps[v]
		// Upper six bits: trailing CCC. Lower two bits: number of trailing
		// non-starters, merged into the flags.
		p.tccc = c >> 2
		p.flags |= qcInfo(c & 0x3)
		// NOTE(review): v was advanced above, so the comparisons below use
		// the offset of the tccc byte rather than the original value of v;
		// presumably the thresholds account for this — confirm against
		// maketables.
		if v >= firstLeadingCCC {
			p.nLead = c & 0x3
			if v >= firstStarterWithNLead {
				// We were tricked. Remove the decomposition.
				// Only the two low flag bits are kept, and a zero index
				// makes Decomposition return nil.
				p.flags &= 0x03
				p.index = 0
				return p
			}
			// The leading CCC is stored in the byte after the tccc byte.
			p.ccc = decomps[v+1]
		}
	}
	return p
}
0 commit comments