Skip to content

Commit cf432b8

Browse files
committed
add Graphemes iterator; tidy unicode exports
- Graphemes and GraphemeIndices structs implement iterators over grapheme clusters, analogous to Chars and CharOffsets for the chars in a string. Iterator and DoubleEndedIterator are available for both.
- Tidied up the exports for libunicode. Crate root exports are now moved into more appropriate module locations:
  - UnicodeStrSlice, Words, Graphemes, and GraphemeIndices are in the str module
  - UnicodeChar is exported from char instead of the crate root
  - canonical_combining_class is exported from str rather than the crate root

Since libunicode's exports have changed, programs that previously relied on the old export locations will need to change their `use` statements to reflect the new ones. See above for more information on where the new exports live.

closes #7043

[breaking-change]
1 parent c066a1e commit cf432b8

File tree

9 files changed

+1599
-38
lines changed

9 files changed

+1599
-38
lines changed

src/etc/unicode.py

+124-5
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,30 @@
5151
'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
5252
}
5353

54+
55+
# Grapheme cluster data
56+
# taken from UAX29, http://www.unicode.org/reports/tr29/
57+
# these code points are excluded from the Control category
58+
# NOTE: CR and LF are also technically excluded, but for
59+
# the sake of convenience we leave them in the Control group
60+
# and manually check them in the appropriate place. This is
61+
# still compliant with the implementation requirements.
62+
grapheme_control_exceptions = set([0x200c, 0x200d])
63+
64+
# the Regional_Indicator category
65+
grapheme_regional_indicator = [(0x1f1e6, 0x1f1ff)]
66+
67+
# "The following ... are specifically excluded" from the SpacingMark category
68+
# http://www.unicode.org/reports/tr29/#SpacingMark
69+
grapheme_spacingmark_exceptions = [(0x102b, 0x102c), (0x1038, 0x1038),
70+
(0x1062, 0x1064), (0x1067, 0x106d), (0x1083, 0x1083), (0x1087, 0x108c),
71+
(0x108f, 0x108f), (0x109a, 0x109c), (0x19b0, 0x19b4), (0x19b8, 0x19b9),
72+
(0x19bb, 0x19c0), (0x19c8, 0x19c9), (0x1a61, 0x1a61), (0x1a63, 0x1a64),
73+
(0xaa7b, 0xaa7b), (0xaa7d, 0xaa7d)]
74+
75+
# these are included in the SpacingMark category
76+
grapheme_spacingmark_extra = set([0xe33, 0xeb3])
77+
5478
def fetch(f):
5579
if not os.path.exists(f):
5680
os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s"
@@ -109,7 +133,7 @@ def load_unicode_data(f):
109133
canon_decomp[code] = seq
110134

111135
# place letter in categories as appropriate
112-
for cat in [gencat] + expanded_categories.get(gencat, []):
136+
for cat in [gencat, "Assigned"] + expanded_categories.get(gencat, []):
113137
if cat not in gencats:
114138
gencats[cat] = []
115139
gencats[cat].append(code)
@@ -120,6 +144,12 @@ def load_unicode_data(f):
120144
combines[combine] = []
121145
combines[combine].append(code)
122146

147+
# generate Not_Assigned from Assigned
148+
gencats["Cn"] = gen_unassigned(gencats["Assigned"])
149+
# Assigned is not a real category
150+
del(gencats["Assigned"])
151+
# Other contains Not_Assigned
152+
gencats["C"].extend(gencats["Cn"])
123153
gencats = group_cats(gencats)
124154
combines = to_combines(group_cats(combines))
125155

@@ -155,6 +185,11 @@ def ungroup_cat(cat):
155185
lo += 1
156186
return cat_out
157187

188+
def gen_unassigned(assigned):
    """Return, in ascending order, every code point not present in `assigned`.

    Only the ranges 0x0-0xd7ff and 0xe000-0x10ffff are scanned, so values in
    0xd800-0xdfff never appear in the result, whether or not they are in
    `assigned`.
    """
    known = set(assigned)
    missing = []
    # walk the two ranges on either side of the skipped 0xd800-0xdfff block
    for lo, hi in ((0, 0xd800), (0xe000, 0x110000)):
        for cp in range(lo, hi):
            if cp not in known:
                missing.append(cp)
    return missing
192+
158193
def to_combines(combs):
159194
combs_out = []
160195
for comb in combs:
@@ -350,6 +385,45 @@ def emit_conversions_module(f, lowerupper, upperlower):
350385
sorted(lowerupper.iteritems(), key=operator.itemgetter(0)), is_pub=False)
351386
f.write("}\n\n")
352387

388+
# Write a Rust `pub mod grapheme { ... }` block to the file object `f`.
# The emitted module contains, in order: a `GraphemeCat` enum with one
# `GC_<name>` variant per entry of `grapheme_cats` plus a trailing `GC_Any`,
# a private binary-search helper that returns `GC_Any` when no range matches,
# a public `grapheme_category(c)` lookup, and the `grapheme_cat_table` data
# produced from `grapheme_table` via emit_table.
# `grapheme_cats` must be a list, because it is concatenated with ["Any"] below.
def emit_grapheme_module(f, grapheme_table, grapheme_cats):
389+
f.write("""pub mod grapheme {
390+
use core::option::{Some, None};
391+
use core::slice::ImmutableVector;
392+
393+
#[allow(non_camel_case_types)]
394+
#[deriving(Clone)]
395+
pub enum GraphemeCat {
396+
""")
397+
# one enum variant per category; "Any" is the fallback used for unmatched chars
for cat in grapheme_cats + ["Any"]:
398+
f.write(" GC_" + cat + ",\n")
399+
f.write(""" }
400+
401+
fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> GraphemeCat {
402+
use core::cmp::{Equal, Less, Greater};
403+
match r.bsearch(|&(lo, hi, _)| {
404+
if lo <= c && c <= hi { Equal }
405+
else if hi < c { Less }
406+
else { Greater }
407+
}) {
408+
Some(idx) => {
409+
let (_, _, cat) = r[idx];
410+
cat
411+
}
412+
None => GC_Any
413+
}
414+
}
415+
416+
pub fn grapheme_category(c: char) -> GraphemeCat {
417+
bsearch_range_value_table(c, grapheme_cat_table)
418+
}
419+
420+
""")
421+
422+
# each table row is rendered as (lo, hi, GC_<category>); the table is not public
emit_table(f, "grapheme_cat_table", grapheme_table, "&'static [(char, char, GraphemeCat)]",
423+
pfun=lambda x: "(%s,%s,GC_%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]),
424+
is_pub=False)
425+
f.write("}\n")
426+
353427
def emit_charwidth_module(f, width_table):
354428
f.write("pub mod charwidth {\n")
355429
f.write(" use core::option::{Option, Some, None};\n")
@@ -388,7 +462,7 @@ def emit_charwidth_module(f, width_table):
388462
f.write(" // http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c\n")
389463
emit_table(f, "charwidth_table", width_table, "&'static [(char, char, u8, u8)]", is_pub=False,
390464
pfun=lambda x: "(%s,%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2], x[3]))
391-
f.write("}\n")
465+
f.write("}\n\n")
392466

393467
def emit_norm_module(f, canon, compat, combine):
394468
canon_keys = canon.keys()
@@ -473,6 +547,8 @@ def remove_from_wtable(wtable, val):
473547
wtable_out.extend(wtable)
474548
return wtable_out
475549

550+
551+
476552
def optimize_width_table(wtable):
477553
wtable_out = []
478554
w_this = wtable.pop(0)
@@ -487,7 +563,7 @@ def optimize_width_table(wtable):
487563
return wtable_out
488564

489565
if __name__ == "__main__":
490-
r = "unicode.rs"
566+
r = "tables.rs"
491567
if os.path.exists(r):
492568
os.remove(r)
493569
with open(r, "w") as rf:
@@ -498,12 +574,18 @@ def optimize_width_table(wtable):
498574
(canon_decomp, compat_decomp, gencats, combines,
499575
lowerupper, upperlower) = load_unicode_data("UnicodeData.txt")
500576
want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"]
501-
other_derived = ["Default_Ignorable_Code_Point"]
577+
other_derived = ["Default_Ignorable_Code_Point", "Grapheme_Extend"]
502578
derived = load_properties("DerivedCoreProperties.txt", want_derived + other_derived)
503579
scripts = load_properties("Scripts.txt", [])
504580
props = load_properties("PropList.txt",
505581
["White_Space", "Join_Control", "Noncharacter_Code_Point"])
506582

583+
# grapheme cluster category from DerivedCoreProperties
584+
# the rest are defined below
585+
grapheme_cats = {}
586+
grapheme_cats["Extend"] = derived["Grapheme_Extend"]
587+
del(derived["Grapheme_Extend"])
588+
507589
# bsearch_range_table is used in all the property modules below
508590
emit_bsearch_range_table(rf)
509591

@@ -533,7 +615,7 @@ def optimize_width_table(wtable):
533615
emit_norm_module(rf, canon_decomp, compat_decomp, combines)
534616
emit_conversions_module(rf, lowerupper, upperlower)
535617

536-
# character width module
618+
### character width module
537619
width_table = []
538620
for zwcat in ["Me", "Mn", "Cf"]:
539621
width_table.extend(map(lambda (lo, hi): (lo, hi, 0, 0), gencats[zwcat]))
@@ -555,3 +637,40 @@ def optimize_width_table(wtable):
555637
# optimize the width table by collapsing adjacent entities when possible
556638
width_table = optimize_width_table(width_table)
557639
emit_charwidth_module(rf, width_table)
640+
641+
### grapheme cluster module
642+
# from http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values
643+
# Hangul syllable categories
644+
want_hangul = ["L", "V", "T", "LV", "LVT"]
645+
grapheme_cats.update(load_properties("HangulSyllableType.txt", want_hangul))
646+
647+
# Control
648+
# This category also includes Cs (surrogate codepoints), but Rust's `char`s are
649+
# Unicode Scalar Values only, and surrogates are thus invalid `char`s.
650+
grapheme_cats["Control"] = set()
651+
for cat in ["Zl", "Zp", "Cc", "Cf"]:
652+
grapheme_cats["Control"] |= set(ungroup_cat(gencats[cat]))
653+
grapheme_cats["Control"] = group_cat(list(
654+
grapheme_cats["Control"]
655+
- grapheme_control_exceptions
656+
| (set(ungroup_cat(gencats["Cn"]))
657+
& set(ungroup_cat(derived["Default_Ignorable_Code_Point"])))))
658+
659+
# Regional Indicator
660+
grapheme_cats["RegionalIndicator"] = grapheme_regional_indicator
661+
662+
# Prepend - "Currently there are no characters with this value"
663+
# (from UAX#29, Unicode 7.0)
664+
665+
# SpacingMark
666+
# SpacingMark = ((Mc - Extend) | extra) - exceptions.
# The union is parenthesized explicitly: in Python `-` binds tighter than `|`,
# so without the parentheses the exceptions would only be subtracted from
# grapheme_spacingmark_extra and would never be removed from the Mc-derived
# set, leaving the excluded code points (0x102b, 0x1038, ...) in SpacingMark.
grapheme_cats["SpacingMark"] = group_cat(list(
    ((set(ungroup_cat(gencats["Mc"]))
      - set(ungroup_cat(grapheme_cats["Extend"])))
     | grapheme_spacingmark_extra)
    - set(ungroup_cat(grapheme_spacingmark_exceptions))))
671+
672+
grapheme_table = []
673+
for cat in grapheme_cats:
674+
grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]])
675+
grapheme_table.sort(key=lambda w: w[0])
676+
emit_grapheme_module(rf, grapheme_table, grapheme_cats.keys())

0 commit comments

Comments
 (0)