51
51
'Cc' : ['C' ], 'Cf' : ['C' ], 'Cs' : ['C' ], 'Co' : ['C' ], 'Cn' : ['C' ],
52
52
}
53
53
54
+
55
# Grapheme cluster data, transcribed from UAX #29
# (http://www.unicode.org/reports/tr29/).

# Code points removed from the Control category.
# NOTE: UAX #29 also excludes CR and LF from Control.  For convenience
# they stay in the Control group here and are checked by hand at the
# point of use; that is still compliant with the implementation
# requirements.
grapheme_control_exceptions = set([
    0x200c,
    0x200d,
])

# The Regional_Indicator category, as inclusive (lo, hi) ranges.
grapheme_regional_indicator = [
    (0x1f1e6, 0x1f1ff),
]

# "The following ... are specifically excluded" from the SpacingMark category
# (http://www.unicode.org/reports/tr29/#SpacingMark).
# Each entry is an inclusive (lo, hi) range.
grapheme_spacingmark_exceptions = [
    (0x102b, 0x102c),
    (0x1038, 0x1038),
    (0x1062, 0x1064),
    (0x1067, 0x106d),
    (0x1083, 0x1083),
    (0x1087, 0x108c),
    (0x108f, 0x108f),
    (0x109a, 0x109c),
    (0x19b0, 0x19b4),
    (0x19b8, 0x19b9),
    (0x19bb, 0x19c0),
    (0x19c8, 0x19c9),
    (0x1a61, 0x1a61),
    (0x1a63, 0x1a64),
    (0xaa7b, 0xaa7b),
    (0xaa7d, 0xaa7d),
]

# Individual code points that are additionally placed in SpacingMark.
grapheme_spacingmark_extra = set([
    0xe33,
    0xeb3,
])
77
+
54
78
def fetch (f ):
55
79
if not os .path .exists (f ):
56
80
os .system ("curl -O http://www.unicode.org/Public/UNIDATA/%s"
@@ -109,7 +133,7 @@ def load_unicode_data(f):
109
133
canon_decomp [code ] = seq
110
134
111
135
# place letter in categories as appropriate
112
- for cat in [gencat ] + expanded_categories .get (gencat , []):
136
+ for cat in [gencat , "Assigned" ] + expanded_categories .get (gencat , []):
113
137
if cat not in gencats :
114
138
gencats [cat ] = []
115
139
gencats [cat ].append (code )
@@ -120,6 +144,12 @@ def load_unicode_data(f):
120
144
combines [combine ] = []
121
145
combines [combine ].append (code )
122
146
147
+ # generate Not_Assigned from Assigned
148
+ gencats ["Cn" ] = gen_unassigned (gencats ["Assigned" ])
149
+ # Assigned is not a real category
150
+ del (gencats ["Assigned" ])
151
+ # Other contains Not_Assigned
152
+ gencats ["C" ].extend (gencats ["Cn" ])
123
153
gencats = group_cats (gencats )
124
154
combines = to_combines (group_cats (combines ))
125
155
@@ -155,6 +185,11 @@ def ungroup_cat(cat):
155
185
lo += 1
156
186
return cat_out
157
187
188
def gen_unassigned(assigned):
    """Return every Unicode scalar value that is absent from `assigned`.

    `assigned` is any iterable of integer code points.  The result is an
    ascending list covering 0..0xd7ff and 0xe000..0x10ffff; the surrogate
    range 0xd800..0xdfff is never reported, whether or not it is assigned.
    """
    known = set(assigned)
    missing = []
    # Walk the two scalar-value ranges on either side of the surrogates.
    for start, stop in ((0, 0xd800), (0xe000, 0x110000)):
        for code in range(start, stop):
            if code not in known:
                missing.append(code)
    return missing
192
+
158
193
def to_combines (combs ):
159
194
combs_out = []
160
195
for comb in combs :
@@ -350,6 +385,45 @@ def emit_conversions_module(f, lowerupper, upperlower):
350
385
sorted (lowerupper .iteritems (), key = operator .itemgetter (0 )), is_pub = False )
351
386
f .write ("}\n \n " )
352
387
388
# Write the Rust `grapheme` module to the file-like object `f`.
#   grapheme_table - entries indexed as x[0], x[1], x[2]; each one is formatted
#                    as "(<escape_char(x[0])>,<escape_char(x[1])>,GC_<x[2]>)"
#                    and handed to emit_table as `grapheme_cat_table`.
#   grapheme_cats  - category names; concatenated with ["Any"] below, so it
#                    has to be a list.
# Returns nothing; the only effect is the text written to `f`.
+ def emit_grapheme_module (f , grapheme_table , grapheme_cats ):
389
+ f .write ("""pub mod grapheme {
390
+ use core::option::{Some, None};
391
+ use core::slice::ImmutableVector;
392
+
393
+ #[allow(non_camel_case_types)]
394
+ #[deriving(Clone)]
395
+ pub enum GraphemeCat {
396
+ """ )
397
# One enum variant GC_<name> per category, followed by the GC_Any variant
# that the emitted lookup returns when no table range contains the char.
+ for cat in grapheme_cats + ["Any" ]:
398
+ f .write (" GC_" + cat + ",\n " )
399
# Close the enum, then emit the binary-search helper and the public
# grapheme_category() entry point that searches grapheme_cat_table.
+ f .write (""" }
400
+
401
+ fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> GraphemeCat {
402
+ use core::cmp::{Equal, Less, Greater};
403
+ match r.bsearch(|&(lo, hi, _)| {
404
+ if lo <= c && c <= hi { Equal }
405
+ else if hi < c { Less }
406
+ else { Greater }
407
+ }) {
408
+ Some(idx) => {
409
+ let (_, _, cat) = r[idx];
410
+ cat
411
+ }
412
+ None => GC_Any
413
+ }
414
+ }
415
+
416
+ pub fn grapheme_category(c: char) -> GraphemeCat {
417
+ bsearch_range_value_table(c, grapheme_cat_table)
418
+ }
419
+
420
+ """ )
421
+
422
# The table itself is emitted as a non-public item of the module.
+ emit_table (f , "grapheme_cat_table" , grapheme_table , "&'static [(char, char, GraphemeCat)]" ,
423
+ pfun = lambda x : "(%s,%s,GC_%s)" % (escape_char (x [0 ]), escape_char (x [1 ]), x [2 ]),
424
+ is_pub = False )
425
# Closing brace of `pub mod grapheme`.
+ f .write ("}\n " )
426
+
353
427
def emit_charwidth_module (f , width_table ):
354
428
f .write ("pub mod charwidth {\n " )
355
429
f .write (" use core::option::{Option, Some, None};\n " )
@@ -388,7 +462,7 @@ def emit_charwidth_module(f, width_table):
388
462
f .write (" // http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c\n " )
389
463
emit_table (f , "charwidth_table" , width_table , "&'static [(char, char, u8, u8)]" , is_pub = False ,
390
464
pfun = lambda x : "(%s,%s,%s,%s)" % (escape_char (x [0 ]), escape_char (x [1 ]), x [2 ], x [3 ]))
391
- f .write ("}\n " )
465
+ f .write ("}\n \n " )
392
466
393
467
def emit_norm_module (f , canon , compat , combine ):
394
468
canon_keys = canon .keys ()
@@ -473,6 +547,8 @@ def remove_from_wtable(wtable, val):
473
547
wtable_out .extend (wtable )
474
548
return wtable_out
475
549
550
+
551
+
476
552
def optimize_width_table (wtable ):
477
553
wtable_out = []
478
554
w_this = wtable .pop (0 )
@@ -487,7 +563,7 @@ def optimize_width_table(wtable):
487
563
return wtable_out
488
564
489
565
if __name__ == "__main__" :
490
- r = "unicode .rs"
566
+ r = "tables .rs"
491
567
if os .path .exists (r ):
492
568
os .remove (r )
493
569
with open (r , "w" ) as rf :
@@ -498,12 +574,18 @@ def optimize_width_table(wtable):
498
574
(canon_decomp , compat_decomp , gencats , combines ,
499
575
lowerupper , upperlower ) = load_unicode_data ("UnicodeData.txt" )
500
576
want_derived = ["XID_Start" , "XID_Continue" , "Alphabetic" , "Lowercase" , "Uppercase" ]
501
- other_derived = ["Default_Ignorable_Code_Point" ]
577
+ other_derived = ["Default_Ignorable_Code_Point" , "Grapheme_Extend" ]
502
578
derived = load_properties ("DerivedCoreProperties.txt" , want_derived + other_derived )
503
579
scripts = load_properties ("Scripts.txt" , [])
504
580
props = load_properties ("PropList.txt" ,
505
581
["White_Space" , "Join_Control" , "Noncharacter_Code_Point" ])
506
582
583
+ # grapheme cluster category from DerivedCoreProperties
584
+ # the rest are defined below
585
+ grapheme_cats = {}
586
+ grapheme_cats ["Extend" ] = derived ["Grapheme_Extend" ]
587
+ del (derived ["Grapheme_Extend" ])
588
+
507
589
# bsearch_range_table is used in all the property modules below
508
590
emit_bsearch_range_table (rf )
509
591
@@ -533,7 +615,7 @@ def optimize_width_table(wtable):
533
615
emit_norm_module (rf , canon_decomp , compat_decomp , combines )
534
616
emit_conversions_module (rf , lowerupper , upperlower )
535
617
536
- # character width module
618
+ ### character width module
537
619
width_table = []
538
620
for zwcat in ["Me" , "Mn" , "Cf" ]:
539
621
width_table .extend (map (lambda (lo , hi ): (lo , hi , 0 , 0 ), gencats [zwcat ]))
@@ -555,3 +637,40 @@ def optimize_width_table(wtable):
555
637
# optimize the width table by collapsing adjacent entities when possible
556
638
width_table = optimize_width_table (width_table )
557
639
emit_charwidth_module (rf , width_table )
640
+
641
+ ### grapheme cluster module
642
+ # from http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values
643
+ # Hangul syllable categories
644
+ want_hangul = ["L" , "V" , "T" , "LV" , "LVT" ]
645
+ grapheme_cats .update (load_properties ("HangulSyllableType.txt" , want_hangul ))
646
+
647
+ # Control
648
+ # This category also includes Cs (surrogate codepoints), but Rust's `char`s are
649
+ # Unicode Scalar Values only, and surrogates are thus invalid `char`s.
650
+ grapheme_cats ["Control" ] = set ()
651
+ for cat in ["Zl" , "Zp" , "Cc" , "Cf" ]:
652
+ grapheme_cats ["Control" ] |= set (ungroup_cat (gencats [cat ]))
653
+ grapheme_cats ["Control" ] = group_cat (list (
654
+ grapheme_cats ["Control" ]
655
+ - grapheme_control_exceptions
656
+ | (set (ungroup_cat (gencats ["Cn" ]))
657
+ & set (ungroup_cat (derived ["Default_Ignorable_Code_Point" ])))))
658
+
659
+ # Regional Indicator
660
+ grapheme_cats ["RegionalIndicator" ] = grapheme_regional_indicator
661
+
662
+ # Prepend - "Currently there are no characters with this value"
663
+ # (from UAX#29, Unicode 7.0)
664
+
665
# SpacingMark
# UAX #29: General_Category = Mc that is not Extend, plus the extra code
# points, minus the explicitly excluded ones.
# The union is parenthesised on purpose: `-` binds tighter than `|`, so
# without the grouping the exceptions would only be subtracted from
# grapheme_spacingmark_extra (which they do not intersect) and the
# excluded Mc code points would stay in the category.
grapheme_cats["SpacingMark"] = group_cat(list(
    (set(ungroup_cat(gencats["Mc"]))
     - set(ungroup_cat(grapheme_cats["Extend"]))
     | grapheme_spacingmark_extra)
    - set(ungroup_cat(grapheme_spacingmark_exceptions))))
671
+
672
+ grapheme_table = []
673
+ for cat in grapheme_cats :
674
+ grapheme_table .extend ([(x , y , cat ) for (x , y ) in grapheme_cats [cat ]])
675
+ grapheme_table .sort (key = lambda w : w [0 ])
676
+ emit_grapheme_module (rf , grapheme_table , grapheme_cats .keys ())
0 commit comments