Skip to content

Add complex case mapping and title case mapping. #26039

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Jun 9, 2015
97 changes: 72 additions & 25 deletions src/etc/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,9 @@ def is_surrogate(n):
def load_unicode_data(f):
fetch(f)
gencats = {}
upperlower = {}
lowerupper = {}
to_lower = {}
to_upper = {}
to_title = {}
combines = {}
canon_decomp = {}
compat_decomp = {}
Expand Down Expand Up @@ -103,12 +104,16 @@ def load_unicode_data(f):

# generate char to char direct common and simple conversions
# uppercase to lowercase
if gencat == "Lu" and lowcase != "" and code_org != lowcase:
upperlower[code] = int(lowcase, 16)
if lowcase != "" and code_org != lowcase:
to_lower[code] = (int(lowcase, 16), 0, 0)

# lowercase to uppercase
if gencat == "Ll" and upcase != "" and code_org != upcase:
lowerupper[code] = int(upcase, 16)
if upcase != "" and code_org != upcase:
to_upper[code] = (int(upcase, 16), 0, 0)

# title case
if titlecase.strip() != "" and code_org != titlecase:
to_title[code] = (int(titlecase, 16), 0, 0)

# store decomposition, if given
if decomp != "":
Expand Down Expand Up @@ -144,7 +149,32 @@ def load_unicode_data(f):
gencats = group_cats(gencats)
combines = to_combines(group_cats(combines))

return (canon_decomp, compat_decomp, gencats, combines, lowerupper, upperlower)
return (canon_decomp, compat_decomp, gencats, combines, to_upper, to_lower, to_title)

def load_special_casing(f, to_upper, to_lower, to_title):
    """Merge the unconditional mappings found in file `f` into the case tables.

    Each line is cut at the first '#' and split on ';'. Lines are expected to
    carry the fields `code; lower; title; upper;` with an optional condition
    field before the trailing remainder. Lines with any other field count
    are ignored, and so are lines whose condition field is non-empty.

    `to_upper`, `to_lower` and `to_title` are dicts updated in place: for
    every mapping that differs from the code point itself, the entry keyed
    by the integer code point is set to a list of exactly three integers
    (the mapped code points, padded with 0).
    """
    fetch(f)
    for raw in fileinput.input(f):
        fields = raw.split('#')[0].split(';')
        n_fields = len(fields)
        if n_fields not in (5, 6):
            continue
        # Only keep unconditional mappings: a sixth field means a condition
        # column is present, and a non-blank condition disqualifies the line.
        if n_fields == 6 and fields[4].strip():
            continue
        code, lower, title, upper = [field.strip() for field in fields[:4]]
        key = int(code, 16)
        for table, mapping in ((to_lower, lower), (to_upper, upper), (to_title, title)):
            if mapping == code:
                # Maps to itself; nothing to record.
                continue
            points = [int(cp, 16) for cp in mapping.split()]
            # Pad with zeros up to the fixed width of three code points.
            points.extend([0] * (3 - len(points)))
            assert len(points) == 3
            table[key] = points

def group_cats(cats):
cats_out = {}
Expand Down Expand Up @@ -279,7 +309,7 @@ def load_east_asian_width(want_widths, except_cats):
return widths

def escape_char(c):
return "'\\u{%x}'" % c
return "'\\u{%x}'" % c if c != 0 else "'\\0'"

def emit_bsearch_range_table(f):
f.write("""
Expand Down Expand Up @@ -319,7 +349,7 @@ def emit_property_module(f, mod, tbl, emit):
f.write(" }\n\n")
f.write("}\n\n")

def emit_conversions_module(f, lowerupper, upperlower):
def emit_conversions_module(f, to_upper, to_lower, to_title):
f.write("pub mod conversions {")
f.write("""
use core::cmp::Ordering::{Equal, Less, Greater};
Expand All @@ -328,21 +358,28 @@ def emit_conversions_module(f, lowerupper, upperlower):
use core::option::Option::{Some, None};
use core::result::Result::{Ok, Err};

pub fn to_lower(c: char) -> char {
match bsearch_case_table(c, LuLl_table) {
None => c,
Some(index) => LuLl_table[index].1
pub fn to_lower(c: char) -> [char; 3] {
match bsearch_case_table(c, to_lowercase_table) {
None => [c, '\\0', '\\0'],
Some(index) => to_lowercase_table[index].1
}
}

pub fn to_upper(c: char) -> [char; 3] {
match bsearch_case_table(c, to_uppercase_table) {
None => [c, '\\0', '\\0'],
Some(index) => to_uppercase_table[index].1
}
}

pub fn to_upper(c: char) -> char {
match bsearch_case_table(c, LlLu_table) {
None => c,
Some(index) => LlLu_table[index].1
pub fn to_title(c: char) -> [char; 3] {
match bsearch_case_table(c, to_titlecase_table) {
None => [c, '\\0', '\\0'],
Some(index) => to_titlecase_table[index].1
}
}

fn bsearch_case_table(c: char, table: &'static [(char, char)]) -> Option<usize> {
fn bsearch_case_table(c: char, table: &'static [(char, [char; 3])]) -> Option<usize> {
match table.binary_search_by(|&(key, _)| {
if c == key { Equal }
else if key < c { Less }
Expand All @@ -354,10 +391,18 @@ def emit_conversions_module(f, lowerupper, upperlower):
}

""")
emit_table(f, "LuLl_table",
sorted(upperlower.iteritems(), key=operator.itemgetter(0)), is_pub=False)
emit_table(f, "LlLu_table",
sorted(lowerupper.iteritems(), key=operator.itemgetter(0)), is_pub=False)
t_type = "&'static [(char, [char; 3])]"
pfun = lambda x: "(%s,[%s,%s,%s])" % (
escape_char(x[0]), escape_char(x[1][0]), escape_char(x[1][1]), escape_char(x[1][2]))
emit_table(f, "to_lowercase_table",
sorted(to_lower.iteritems(), key=operator.itemgetter(0)),
is_pub=False, t_type = t_type, pfun=pfun)
emit_table(f, "to_uppercase_table",
sorted(to_upper.iteritems(), key=operator.itemgetter(0)),
is_pub=False, t_type = t_type, pfun=pfun)
emit_table(f, "to_titlecase_table",
sorted(to_title.iteritems(), key=operator.itemgetter(0)),
is_pub=False, t_type = t_type, pfun=pfun)
f.write("}\n\n")

def emit_grapheme_module(f, grapheme_table, grapheme_cats):
Expand Down Expand Up @@ -591,8 +636,10 @@ def optimize_width_table(wtable):
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
""" % unicode_version)
(canon_decomp, compat_decomp, gencats, combines,
lowerupper, upperlower) = load_unicode_data("UnicodeData.txt")
want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"]
to_upper, to_lower, to_title) = load_unicode_data("UnicodeData.txt")
load_special_casing("SpecialCasing.txt", to_upper, to_lower, to_title)
want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase",
"Cased", "Case_Ignorable"]
derived = load_properties("DerivedCoreProperties.txt", want_derived)
scripts = load_properties("Scripts.txt", [])
props = load_properties("PropList.txt",
Expand All @@ -611,7 +658,7 @@ def optimize_width_table(wtable):

# normalizations and conversions module
emit_norm_module(rf, canon_decomp, compat_decomp, combines, norm_props)
emit_conversions_module(rf, lowerupper, upperlower)
emit_conversions_module(rf, to_upper, to_lower, to_title)

### character width module
width_table = []
Expand Down
35 changes: 32 additions & 3 deletions src/libcollections/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1851,11 +1851,40 @@ impl str {
/// let s = "HELLO";
/// assert_eq!(s.to_lowercase(), "hello");
/// ```
#[unstable(feature = "collections")]
#[stable(feature = "unicode_case_mapping", since = "1.2.0")]
pub fn to_lowercase(&self) -> String {
let mut s = String::with_capacity(self.len());
s.extend(self[..].chars().flat_map(|c| c.to_lowercase()));
for (i, c) in self[..].char_indices() {
if c == 'Σ' {
// Σ maps to σ, except at the end of a word where it maps to ς.
// This is the only conditional (contextual) but language-independent mapping
// in `SpecialCasing.txt`,
// so hard-code it rather than have a generic "condition" mechanism.
// See https://github.com/rust-lang/rust/issues/26035
map_uppercase_sigma(self, i, &mut s)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add a comment for why this is being specially cased here?

} else {
s.extend(c.to_lowercase());
}
}
return s;

fn map_uppercase_sigma(from: &str, i: usize, to: &mut String) {
    // Appends the lowercase form of the 'Σ' located at byte offset `i` of `from`:
    // "ς" when the sigma is word-final, "σ" otherwise.
    //
    // See http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G33992
    // for the definition of `Final_Sigma`.
    debug_assert!('Σ'.len_utf8() == 2);
    // Looking backwards from the sigma: is there a cased letter (after any
    // case-ignorable characters)?
    let preceded_by_cased = case_ignoreable_then_cased(from[..i].chars().rev());
    // The text after the sigma starts 2 bytes later (see the assertion above);
    // it is only inspected when the backwards check succeeded.
    let lowered = if preceded_by_cased && !case_ignoreable_then_cased(from[i + 2..].chars()) {
        "ς"
    } else {
        "σ"
    };
    to.push_str(lowered);
}

fn case_ignoreable_then_cased<I: Iterator<Item=char>>(iter: I) -> bool {
    // Returns whether the first character of `iter` that is not
    // `Case_Ignorable` is `Cased`; `false` when no such character exists.
    use rustc_unicode::derived_property::{Cased, Case_Ignorable};
    let mut chars = iter;
    chars.find(|&c| !Case_Ignorable(c)).map_or(false, |c| Cased(c))
}
}

/// Returns the uppercase equivalent of this string.
Expand All @@ -1868,7 +1897,7 @@ impl str {
/// let s = "hello";
/// assert_eq!(s.to_uppercase(), "HELLO");
/// ```
#[unstable(feature = "collections")]
#[stable(feature = "unicode_case_mapping", since = "1.2.0")]
pub fn to_uppercase(&self) -> String {
let mut s = String::with_capacity(self.len());
s.extend(self[..].chars().flat_map(|c| c.to_uppercase()));
Expand Down
39 changes: 39 additions & 0 deletions src/libcollectionstest/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1687,6 +1687,45 @@ fn trim_ws() {
"");
}

#[test]
fn to_lowercase() {
    // Each entry is (input, expected result of `to_lowercase`).
    let cases = [
        ("", ""),
        ("AÉDžaé ", "aédžaé "),

        // https://github.com/rust-lang/rust/issues/26035
        // Sigma directly at the end of the string.
        ("ΑΣ", "ας"),
        ("Α'Σ", "α'ς"),
        ("Α''Σ", "α''ς"),

        // Sigma at the end of a word followed by another word.
        ("ΑΣ Α", "ας α"),
        ("Α'Σ Α", "α'ς α"),
        ("Α''Σ Α", "α''ς α"),

        // Apostrophes between the sigma and the following space.
        ("ΑΣ' Α", "ας' α"),
        ("ΑΣ'' Α", "ας'' α"),

        // Apostrophes on both sides of the sigma.
        ("Α'Σ' Α", "α'ς' α"),
        ("Α''Σ'' Α", "α''ς'' α"),

        // Sigma at the start of a word.
        ("Α Σ", "α σ"),
        ("Α 'Σ", "α 'σ"),
        ("Α ''Σ", "α ''σ"),

        // Sigma with no letter before it at all.
        ("Σ", "σ"),
        ("'Σ", "'σ"),
        ("''Σ", "''σ"),

        // Sigma in the middle of a word.
        ("ΑΣΑ", "ασα"),
        ("ΑΣ'Α", "ασ'α"),
        ("ΑΣ''Α", "ασ''α"),
    ];
    for &(input, expected) in cases.iter() {
        assert_eq!(input.to_lowercase(), expected);
    }
}

#[test]
fn to_uppercase() {
    let empty = "";
    assert_eq!(empty.to_uppercase(), "");

    // The expected string is longer than the input: some characters
    // (e.g. 'ß') upper-case to more than one character.
    let mixed = "aéDžßfiᾀ";
    let expected = "AÉDŽSSFIἈΙ";
    assert_eq!(mixed.to_uppercase(), expected);
}

mod pattern {
use std::str::pattern::Pattern;
use std::str::pattern::{Searcher, ReverseSearcher};
Expand Down
63 changes: 45 additions & 18 deletions src/libcoretest/char.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ fn test_to_lowercase() {
fn lower(c: char) -> char {
let mut it = c.to_lowercase();
let c = it.next().unwrap();
// As of Unicode version 7.0.0, `SpecialCasing.txt` has no lower-case mapping
// to multiple code points.
assert!(it.next().is_none());
c
}
Expand All @@ -73,29 +75,54 @@ fn test_to_lowercase() {
assert_eq!(lower('Μ'), 'μ');
assert_eq!(lower('Α'), 'α');
assert_eq!(lower('Σ'), 'σ');
assert_eq!(lower('Dž'), 'dž');
assert_eq!(lower('fi'), 'fi');
}

#[test]
fn test_to_uppercase() {
fn upper(c: char) -> char {
let mut it = c.to_uppercase();
let c = it.next().unwrap();
assert!(it.next().is_none());
c
fn upper(c: char) -> Vec<char> {
c.to_uppercase().collect()
}
assert_eq!(upper('a'), 'A');
assert_eq!(upper('ö'), 'Ö');
assert_eq!(upper('ß'), 'ß'); // not ẞ: Latin capital letter sharp s
assert_eq!(upper('ü'), 'Ü');
assert_eq!(upper('💩'), '💩');

assert_eq!(upper('σ'), 'Σ');
assert_eq!(upper('τ'), 'Τ');
assert_eq!(upper('ι'), 'Ι');
assert_eq!(upper('γ'), 'Γ');
assert_eq!(upper('μ'), 'Μ');
assert_eq!(upper('α'), 'Α');
assert_eq!(upper('ς'), 'Σ');
assert_eq!(upper('a'), ['A']);
assert_eq!(upper('ö'), ['Ö']);
assert_eq!(upper('ß'), ['S', 'S']); // not ẞ: Latin capital letter sharp s
assert_eq!(upper('ü'), ['Ü']);
assert_eq!(upper('💩'), ['💩']);

assert_eq!(upper('σ'), ['Σ']);
assert_eq!(upper('τ'), ['Τ']);
assert_eq!(upper('ι'), ['Ι']);
assert_eq!(upper('γ'), ['Γ']);
assert_eq!(upper('μ'), ['Μ']);
assert_eq!(upper('α'), ['Α']);
assert_eq!(upper('ς'), ['Σ']);
assert_eq!(upper('Dž'), ['DŽ']);
assert_eq!(upper('fi'), ['F', 'I']);
assert_eq!(upper('ᾀ'), ['Ἀ', 'Ι']);
}

#[test]
fn test_to_titlecase() {
    // Each entry pairs an input char with the chars its title-case
    // mapping is expected to yield, in order.
    let expectations: &[(char, &[char])] = &[
        ('a', &['A']),
        ('ö', &['Ö']),
        ('ß', &['S', 's']), // not ẞ: Latin capital letter sharp s
        ('ü', &['Ü']),
        ('💩', &['💩']),

        ('σ', &['Σ']),
        ('τ', &['Τ']),
        ('ι', &['Ι']),
        ('γ', &['Γ']),
        ('μ', &['Μ']),
        ('α', &['Α']),
        ('ς', &['Σ']),
        ('DŽ', &['Dž']),
        ('fi', &['F', 'i']),
        ('ᾀ', &['ᾈ']),
    ];
    for &(input, expected) in expectations {
        let actual: Vec<char> = input.to_titlecase().collect();
        assert_eq!(actual, expected);
    }
}

#[test]
Expand Down
2 changes: 1 addition & 1 deletion src/librustc_lint/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@

#![feature(box_patterns)]
#![feature(box_syntax)]
#![feature(collections)]
#![cfg_attr(stage0, feature(collections))]
#![feature(core)]
#![feature(quote)]
#![feature(rustc_diagnostic_macros)]
Expand Down
Loading