@@ -72,8 +72,9 @@ def is_surrogate(n):
72
72
def load_unicode_data (f ):
73
73
fetch (f )
74
74
gencats = {}
75
- upperlower = {}
76
- lowerupper = {}
75
+ to_lower = {}
76
+ to_upper = {}
77
+ to_title = {}
77
78
combines = {}
78
79
canon_decomp = {}
79
80
compat_decomp = {}
@@ -103,12 +104,16 @@ def load_unicode_data(f):
103
104
104
105
# generate char to char direct common and simple conversions
105
106
# uppercase to lowercase
106
- if gencat == "Lu" and lowcase != "" and code_org != lowcase :
107
- upperlower [code ] = int (lowcase , 16 )
107
+ if lowcase != "" and code_org != lowcase :
108
+ to_lower [code ] = ( int (lowcase , 16 ), 0 , 0 )
108
109
109
110
# lowercase to uppercase
110
- if gencat == "Ll" and upcase != "" and code_org != upcase :
111
- lowerupper [code ] = int (upcase , 16 )
111
+ if upcase != "" and code_org != upcase :
112
+ to_upper [code ] = (int (upcase , 16 ), 0 , 0 )
113
+
114
+ # title case
115
+ if titlecase .strip () != "" and code_org != titlecase :
116
+ to_title [code ] = (int (titlecase , 16 ), 0 , 0 )
112
117
113
118
# store decomposition, if given
114
119
if decomp != "" :
@@ -144,7 +149,32 @@ def load_unicode_data(f):
144
149
gencats = group_cats (gencats )
145
150
combines = to_combines (group_cats (combines ))
146
151
147
- return (canon_decomp , compat_decomp , gencats , combines , lowerupper , upperlower )
152
+ return (canon_decomp , compat_decomp , gencats , combines , to_upper , to_lower , to_title )
153
+
154
def load_special_casing(f, to_upper, to_lower, to_title):
    """Merge the unconditional mappings of a SpecialCasing file into the case tables.

    Each data line is expected to look like
    ``code; lower; title; upper; [condition;] # comment`` where ``lower``,
    ``title`` and ``upper`` are space-separated hexadecimal code points.
    Lines with any other number of ``;``-separated fields (blank lines,
    comment-only lines) are ignored, as are lines with a non-empty
    condition field.

    Args:
        f: Name of the SpecialCasing data file. ``fetch(f)`` is called
            first; ``fetch`` is defined elsewhere in this file (presumably
            it makes the file available locally -- confirm there).
        to_upper: Dict updated in place: code point -> 3-tuple of code points.
        to_lower: Dict updated in place, same shape.
        to_title: Dict updated in place, same shape.

    Mappings shorter than three code points are padded with 0. Entries
    already present in the dicts are overwritten.

    Raises:
        ValueError: if a mapping expands to more than three code points
            or a field is not valid hexadecimal.
    """
    fetch(f)
    for line in fileinput.input(f):
        # Drop the trailing comment, then split into ';'-separated fields.
        fields = [field.strip() for field in line.split('#')[0].split(';')]
        if len(fields) == 5:
            code, lower, title, upper, _trailing = fields
        elif len(fields) == 6:
            code, lower, title, upper, condition, _trailing = fields
            if condition:  # Only keep unconditional mappings
                continue
        else:
            continue

        key = int(code, 16)
        for table, mapping in ((to_lower, lower), (to_upper, upper), (to_title, title)):
            points = [int(point, 16) for point in mapping.split()]
            # Compare numerically so hex spelling (letter case, leading
            # zeros) cannot turn an identity mapping into a table entry.
            if points == [key]:
                continue
            if len(points) > 3:
                # Raised explicitly rather than asserted so the check
                # survives `python -O`.
                raise ValueError(
                    "case mapping for U+%04X has %d code points; at most 3 are supported"
                    % (key, len(points)))
            # Pad to a fixed width of three; stored as a tuple to match the
            # (code, 0, 0) entries that load_unicode_data puts in these tables.
            table[key] = tuple(points + [0] * (3 - len(points)))
148
178
149
179
def group_cats (cats ):
150
180
cats_out = {}
@@ -279,7 +309,7 @@ def load_east_asian_width(want_widths, except_cats):
279
309
return widths
280
310
281
311
def escape_char(c):
    """Return Rust char-literal source text for the integer code point ``c``.

    Code point 0 is written as ``'\\0'``; every other value is written as
    ``'\\u{...}'`` with the code point in lowercase hexadecimal.
    """
    if c == 0:
        return "'\\0'"
    return "'\\u{%x}'" % c
283
313
284
314
def emit_bsearch_range_table (f ):
285
315
f .write ("""
@@ -319,7 +349,7 @@ def emit_property_module(f, mod, tbl, emit):
319
349
f .write (" }\n \n " )
320
350
f .write ("}\n \n " )
321
351
322
- def emit_conversions_module (f , lowerupper , upperlower ):
352
+ def emit_conversions_module (f , to_upper , to_lower , to_title ):
323
353
f .write ("pub mod conversions {" )
324
354
f .write ("""
325
355
use core::cmp::Ordering::{Equal, Less, Greater};
@@ -328,21 +358,28 @@ def emit_conversions_module(f, lowerupper, upperlower):
328
358
use core::option::Option::{Some, None};
329
359
use core::result::Result::{Ok, Err};
330
360
331
- pub fn to_lower(c: char) -> char {
332
- match bsearch_case_table(c, LuLl_table) {
333
- None => c,
334
- Some(index) => LuLl_table[index].1
361
+ pub fn to_lower(c: char) -> [char; 3] {
362
+ match bsearch_case_table(c, to_lowercase_table) {
363
+ None => [c, '\\ 0', '\\ 0'],
364
+ Some(index) => to_lowercase_table[index].1
365
+ }
366
+ }
367
+
368
+ pub fn to_upper(c: char) -> [char; 3] {
369
+ match bsearch_case_table(c, to_uppercase_table) {
370
+ None => [c, '\\ 0', '\\ 0'],
371
+ Some(index) => to_uppercase_table[index].1
335
372
}
336
373
}
337
374
338
- pub fn to_upper (c: char) -> char {
339
- match bsearch_case_table(c, LlLu_table ) {
340
- None => c ,
341
- Some(index) => LlLu_table [index].1
375
+ pub fn to_title (c: char) -> [ char; 3] {
376
+ match bsearch_case_table(c, to_titlecase_table ) {
377
+ None => [c, ' \\ 0', ' \\ 0'] ,
378
+ Some(index) => to_titlecase_table [index].1
342
379
}
343
380
}
344
381
345
- fn bsearch_case_table(c: char, table: &'static [(char, char)]) -> Option<usize> {
382
+ fn bsearch_case_table(c: char, table: &'static [(char, [ char; 3] )]) -> Option<usize> {
346
383
match table.binary_search_by(|&(key, _)| {
347
384
if c == key { Equal }
348
385
else if key < c { Less }
@@ -354,10 +391,18 @@ def emit_conversions_module(f, lowerupper, upperlower):
354
391
}
355
392
356
393
""" )
357
- emit_table (f , "LuLl_table" ,
358
- sorted (upperlower .iteritems (), key = operator .itemgetter (0 )), is_pub = False )
359
- emit_table (f , "LlLu_table" ,
360
- sorted (lowerupper .iteritems (), key = operator .itemgetter (0 )), is_pub = False )
394
+ t_type = "&'static [(char, [char; 3])]"
395
+ pfun = lambda x : "(%s,[%s,%s,%s])" % (
396
+ escape_char (x [0 ]), escape_char (x [1 ][0 ]), escape_char (x [1 ][1 ]), escape_char (x [1 ][2 ]))
397
+ emit_table (f , "to_lowercase_table" ,
398
+ sorted (to_lower .iteritems (), key = operator .itemgetter (0 )),
399
+ is_pub = False , t_type = t_type , pfun = pfun )
400
+ emit_table (f , "to_uppercase_table" ,
401
+ sorted (to_upper .iteritems (), key = operator .itemgetter (0 )),
402
+ is_pub = False , t_type = t_type , pfun = pfun )
403
+ emit_table (f , "to_titlecase_table" ,
404
+ sorted (to_title .iteritems (), key = operator .itemgetter (0 )),
405
+ is_pub = False , t_type = t_type , pfun = pfun )
361
406
f .write ("}\n \n " )
362
407
363
408
def emit_grapheme_module (f , grapheme_table , grapheme_cats ):
@@ -591,8 +636,10 @@ def optimize_width_table(wtable):
591
636
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
592
637
""" % unicode_version )
593
638
(canon_decomp , compat_decomp , gencats , combines ,
594
- lowerupper , upperlower ) = load_unicode_data ("UnicodeData.txt" )
595
- want_derived = ["XID_Start" , "XID_Continue" , "Alphabetic" , "Lowercase" , "Uppercase" ]
639
+ to_upper , to_lower , to_title ) = load_unicode_data ("UnicodeData.txt" )
640
+ load_special_casing ("SpecialCasing.txt" , to_upper , to_lower , to_title )
641
+ want_derived = ["XID_Start" , "XID_Continue" , "Alphabetic" , "Lowercase" , "Uppercase" ,
642
+ "Cased" , "Case_Ignorable" ]
596
643
derived = load_properties ("DerivedCoreProperties.txt" , want_derived )
597
644
scripts = load_properties ("Scripts.txt" , [])
598
645
props = load_properties ("PropList.txt" ,
@@ -611,7 +658,7 @@ def optimize_width_table(wtable):
611
658
612
659
# normalizations and conversions module
613
660
emit_norm_module (rf , canon_decomp , compat_decomp , combines , norm_props )
614
- emit_conversions_module (rf , lowerupper , upperlower )
661
+ emit_conversions_module (rf , to_upper , to_lower , to_title )
615
662
616
663
### character width module
617
664
width_table = []
0 commit comments