Skip to content

Commit f06e026

Browse files
committed
Auto merge of #26039 - SimonSapin:case-mapping, r=alexcrichton
* Add “complex” mappings to `char::to_lowercase` and `char::to_uppercase`, making them sometimes yield more than one `char`: #25800. `str::to_lowercase` and `str::to_uppercase` are affected as well. * Add `char::to_titlecase`, since it’s the same algorithm (just different data). However this does **not** add `str::to_titlecase`, as that would require UAX#29 Unicode Text Segmentation which we decided not to include in `std`: rust-lang/rfcs#1054 I made `char::to_titlecase` immediately `#[stable]`, since it’s so similar to `char::to_uppercase` that’s already stable. Let me know if it should be `#[unstable]` for a while. * Add a special case for upper-case Sigma in word-final position in `str::to_lowercase`: #26035. This is the only language-independent conditional mapping currently in `SpecialCasing.txt`. * Stabilize `str::to_lowercase` and `str::to_uppercase`. The `&self -> String` signature on `str` seems straightforward enough, and the only relevant issue I’ve found is #24536 about naming. But `char` already has stable methods with the same name, and deprecating them for a rename doesn’t seem worth it. r? @alexcrichton
2 parents 8a3f5af + 6369dcb commit f06e026

File tree

8 files changed

+1975
-673
lines changed

8 files changed

+1975
-673
lines changed

src/etc/unicode.py

+72-25
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,9 @@ def is_surrogate(n):
7272
def load_unicode_data(f):
7373
fetch(f)
7474
gencats = {}
75-
upperlower = {}
76-
lowerupper = {}
75+
to_lower = {}
76+
to_upper = {}
77+
to_title = {}
7778
combines = {}
7879
canon_decomp = {}
7980
compat_decomp = {}
@@ -103,12 +104,16 @@ def load_unicode_data(f):
103104

104105
# generate char to char direct common and simple conversions
105106
# uppercase to lowercase
106-
if gencat == "Lu" and lowcase != "" and code_org != lowcase:
107-
upperlower[code] = int(lowcase, 16)
107+
if lowcase != "" and code_org != lowcase:
108+
to_lower[code] = (int(lowcase, 16), 0, 0)
108109

109110
# lowercase to uppercase
110-
if gencat == "Ll" and upcase != "" and code_org != upcase:
111-
lowerupper[code] = int(upcase, 16)
111+
if upcase != "" and code_org != upcase:
112+
to_upper[code] = (int(upcase, 16), 0, 0)
113+
114+
# title case
115+
if titlecase.strip() != "" and code_org != titlecase:
116+
to_title[code] = (int(titlecase, 16), 0, 0)
112117

113118
# store decomposition, if given
114119
if decomp != "":
@@ -144,7 +149,32 @@ def load_unicode_data(f):
144149
gencats = group_cats(gencats)
145150
combines = to_combines(group_cats(combines))
146151

147-
return (canon_decomp, compat_decomp, gencats, combines, lowerupper, upperlower)
152+
return (canon_decomp, compat_decomp, gencats, combines, to_upper, to_lower, to_title)
153+
154+
def load_special_casing(f, to_upper, to_lower, to_title):
155+
fetch(f)
156+
for line in fileinput.input(f):
157+
data = line.split('#')[0].split(';')
158+
if len(data) == 5:
159+
code, lower, title, upper, _comment = data
160+
elif len(data) == 6:
161+
code, lower, title, upper, condition, _comment = data
162+
if condition.strip(): # Only keep unconditional mappings
163+
continue
164+
else:
165+
continue
166+
code = code.strip()
167+
lower = lower.strip()
168+
title = title.strip()
169+
upper = upper.strip()
170+
key = int(code, 16)
171+
for (map_, values) in [(to_lower, lower), (to_upper, upper), (to_title, title)]:
172+
if values != code:
173+
values = [int(i, 16) for i in values.split()]
174+
for _ in range(len(values), 3):
175+
values.append(0)
176+
assert len(values) == 3
177+
map_[key] = values
148178

149179
def group_cats(cats):
150180
cats_out = {}
@@ -279,7 +309,7 @@ def load_east_asian_width(want_widths, except_cats):
279309
return widths
280310

281311
def escape_char(c):
282-
return "'\\u{%x}'" % c
312+
return "'\\u{%x}'" % c if c != 0 else "'\\0'"
283313

284314
def emit_bsearch_range_table(f):
285315
f.write("""
@@ -319,7 +349,7 @@ def emit_property_module(f, mod, tbl, emit):
319349
f.write(" }\n\n")
320350
f.write("}\n\n")
321351

322-
def emit_conversions_module(f, lowerupper, upperlower):
352+
def emit_conversions_module(f, to_upper, to_lower, to_title):
323353
f.write("pub mod conversions {")
324354
f.write("""
325355
use core::cmp::Ordering::{Equal, Less, Greater};
@@ -328,21 +358,28 @@ def emit_conversions_module(f, lowerupper, upperlower):
328358
use core::option::Option::{Some, None};
329359
use core::result::Result::{Ok, Err};
330360
331-
pub fn to_lower(c: char) -> char {
332-
match bsearch_case_table(c, LuLl_table) {
333-
None => c,
334-
Some(index) => LuLl_table[index].1
361+
pub fn to_lower(c: char) -> [char; 3] {
362+
match bsearch_case_table(c, to_lowercase_table) {
363+
None => [c, '\\0', '\\0'],
364+
Some(index) => to_lowercase_table[index].1
365+
}
366+
}
367+
368+
pub fn to_upper(c: char) -> [char; 3] {
369+
match bsearch_case_table(c, to_uppercase_table) {
370+
None => [c, '\\0', '\\0'],
371+
Some(index) => to_uppercase_table[index].1
335372
}
336373
}
337374
338-
pub fn to_upper(c: char) -> char {
339-
match bsearch_case_table(c, LlLu_table) {
340-
None => c,
341-
Some(index) => LlLu_table[index].1
375+
pub fn to_title(c: char) -> [char; 3] {
376+
match bsearch_case_table(c, to_titlecase_table) {
377+
None => [c, '\\0', '\\0'],
378+
Some(index) => to_titlecase_table[index].1
342379
}
343380
}
344381
345-
fn bsearch_case_table(c: char, table: &'static [(char, char)]) -> Option<usize> {
382+
fn bsearch_case_table(c: char, table: &'static [(char, [char; 3])]) -> Option<usize> {
346383
match table.binary_search_by(|&(key, _)| {
347384
if c == key { Equal }
348385
else if key < c { Less }
@@ -354,10 +391,18 @@ def emit_conversions_module(f, lowerupper, upperlower):
354391
}
355392
356393
""")
357-
emit_table(f, "LuLl_table",
358-
sorted(upperlower.iteritems(), key=operator.itemgetter(0)), is_pub=False)
359-
emit_table(f, "LlLu_table",
360-
sorted(lowerupper.iteritems(), key=operator.itemgetter(0)), is_pub=False)
394+
t_type = "&'static [(char, [char; 3])]"
395+
pfun = lambda x: "(%s,[%s,%s,%s])" % (
396+
escape_char(x[0]), escape_char(x[1][0]), escape_char(x[1][1]), escape_char(x[1][2]))
397+
emit_table(f, "to_lowercase_table",
398+
sorted(to_lower.iteritems(), key=operator.itemgetter(0)),
399+
is_pub=False, t_type = t_type, pfun=pfun)
400+
emit_table(f, "to_uppercase_table",
401+
sorted(to_upper.iteritems(), key=operator.itemgetter(0)),
402+
is_pub=False, t_type = t_type, pfun=pfun)
403+
emit_table(f, "to_titlecase_table",
404+
sorted(to_title.iteritems(), key=operator.itemgetter(0)),
405+
is_pub=False, t_type = t_type, pfun=pfun)
361406
f.write("}\n\n")
362407

363408
def emit_grapheme_module(f, grapheme_table, grapheme_cats):
@@ -591,8 +636,10 @@ def optimize_width_table(wtable):
591636
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
592637
""" % unicode_version)
593638
(canon_decomp, compat_decomp, gencats, combines,
594-
lowerupper, upperlower) = load_unicode_data("UnicodeData.txt")
595-
want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"]
639+
to_upper, to_lower, to_title) = load_unicode_data("UnicodeData.txt")
640+
load_special_casing("SpecialCasing.txt", to_upper, to_lower, to_title)
641+
want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase",
642+
"Cased", "Case_Ignorable"]
596643
derived = load_properties("DerivedCoreProperties.txt", want_derived)
597644
scripts = load_properties("Scripts.txt", [])
598645
props = load_properties("PropList.txt",
@@ -611,7 +658,7 @@ def optimize_width_table(wtable):
611658

612659
# normalizations and conversions module
613660
emit_norm_module(rf, canon_decomp, compat_decomp, combines, norm_props)
614-
emit_conversions_module(rf, lowerupper, upperlower)
661+
emit_conversions_module(rf, to_upper, to_lower, to_title)
615662

616663
### character width module
617664
width_table = []

src/libcollections/str.rs

+32-3
Original file line numberDiff line numberDiff line change
@@ -1816,11 +1816,40 @@ impl str {
18161816
/// let s = "HELLO";
18171817
/// assert_eq!(s.to_lowercase(), "hello");
18181818
/// ```
1819-
#[unstable(feature = "collections")]
1819+
#[stable(feature = "unicode_case_mapping", since = "1.2.0")]
18201820
pub fn to_lowercase(&self) -> String {
18211821
let mut s = String::with_capacity(self.len());
1822-
s.extend(self[..].chars().flat_map(|c| c.to_lowercase()));
1822+
for (i, c) in self[..].char_indices() {
1823+
if c == 'Σ' {
1824+
// Σ maps to σ, except at the end of a word where it maps to ς.
1825+
// This is the only conditional (contextual) but language-independent mapping
1826+
// in `SpecialCasing.txt`,
1827+
// so hard-code it rather than have a generic "condition" mechanism.
1828+
// See https://github.com/rust-lang/rust/issues/26035
1829+
map_uppercase_sigma(self, i, &mut s)
1830+
} else {
1831+
s.extend(c.to_lowercase());
1832+
}
1833+
}
18231834
return s;
1835+
1836+
fn map_uppercase_sigma(from: &str, i: usize, to: &mut String) {
1837+
// See http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G33992
1838+
// for the definition of `Final_Sigma`.
1839+
debug_assert!('Σ'.len_utf8() == 2);
1840+
let is_word_final =
1841+
case_ignoreable_then_cased(from[..i].chars().rev()) &&
1842+
!case_ignoreable_then_cased(from[i + 2..].chars());
1843+
to.push_str(if is_word_final { "ς" } else { "σ" });
1844+
}
1845+
1846+
fn case_ignoreable_then_cased<I: Iterator<Item=char>>(iter: I) -> bool {
1847+
use rustc_unicode::derived_property::{Cased, Case_Ignorable};
1848+
match iter.skip_while(|&c| Case_Ignorable(c)).next() {
1849+
Some(c) => Cased(c),
1850+
None => false,
1851+
}
1852+
}
18241853
}
18251854

18261855
/// Returns the uppercase equivalent of this string.
@@ -1833,7 +1862,7 @@ impl str {
18331862
/// let s = "hello";
18341863
/// assert_eq!(s.to_uppercase(), "HELLO");
18351864
/// ```
1836-
#[unstable(feature = "collections")]
1865+
#[stable(feature = "unicode_case_mapping", since = "1.2.0")]
18371866
pub fn to_uppercase(&self) -> String {
18381867
let mut s = String::with_capacity(self.len());
18391868
s.extend(self[..].chars().flat_map(|c| c.to_uppercase()));

src/libcollectionstest/str.rs

+39
Original file line numberDiff line numberDiff line change
@@ -1687,6 +1687,45 @@ fn trim_ws() {
16871687
"");
16881688
}
16891689

1690+
#[test]
1691+
fn to_lowercase() {
1692+
assert_eq!("".to_lowercase(), "");
1693+
assert_eq!("AÉDžaé ".to_lowercase(), "aédžaé ");
1694+
1695+
// https://github.com/rust-lang/rust/issues/26035
1696+
assert_eq!("ΑΣ".to_lowercase(), "ας");
1697+
assert_eq!("Α'Σ".to_lowercase(), "α'ς");
1698+
assert_eq!("Α''Σ".to_lowercase(), "α''ς");
1699+
1700+
assert_eq!("ΑΣ Α".to_lowercase(), "ας α");
1701+
assert_eq!("Α'Σ Α".to_lowercase(), "α'ς α");
1702+
assert_eq!("Α''Σ Α".to_lowercase(), "α''ς α");
1703+
1704+
assert_eq!("ΑΣ' Α".to_lowercase(), "ας' α");
1705+
assert_eq!("ΑΣ'' Α".to_lowercase(), "ας'' α");
1706+
1707+
assert_eq!("Α'Σ' Α".to_lowercase(), "α'ς' α");
1708+
assert_eq!("Α''Σ'' Α".to_lowercase(), "α''ς'' α");
1709+
1710+
assert_eq!("Α Σ".to_lowercase(), "α σ");
1711+
assert_eq!("Α 'Σ".to_lowercase(), "α 'σ");
1712+
assert_eq!("Α ''Σ".to_lowercase(), "α ''σ");
1713+
1714+
assert_eq!("Σ".to_lowercase(), "σ");
1715+
assert_eq!("'Σ".to_lowercase(), "'σ");
1716+
assert_eq!("''Σ".to_lowercase(), "''σ");
1717+
1718+
assert_eq!("ΑΣΑ".to_lowercase(), "ασα");
1719+
assert_eq!("ΑΣ'Α".to_lowercase(), "ασ'α");
1720+
assert_eq!("ΑΣ''Α".to_lowercase(), "ασ''α");
1721+
}
1722+
1723+
#[test]
1724+
fn to_uppercase() {
1725+
assert_eq!("".to_uppercase(), "");
1726+
assert_eq!("aéDžßfiᾀ".to_uppercase(), "AÉDŽSSFIἈΙ");
1727+
}
1728+
16901729
mod pattern {
16911730
use std::str::pattern::Pattern;
16921731
use std::str::pattern::{Searcher, ReverseSearcher};

src/libcoretest/char.rs

+45-18
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@ fn test_to_lowercase() {
5858
fn lower(c: char) -> char {
5959
let mut it = c.to_lowercase();
6060
let c = it.next().unwrap();
61+
// `SpecialCasing.txt` does contain a lower-case mapping to multiple code points
62+
// (U+0130 maps to U+0069 U+0307), so only use this helper with single-`char` mappings.
6163
assert!(it.next().is_none());
6264
c
6365
}
@@ -73,29 +75,54 @@ fn test_to_lowercase() {
7375
assert_eq!(lower('Μ'), 'μ');
7476
assert_eq!(lower('Α'), 'α');
7577
assert_eq!(lower('Σ'), 'σ');
78+
assert_eq!(lower('Dž'), 'dž');
79+
assert_eq!(lower('fi'), 'fi');
7680
}
7781

7882
#[test]
7983
fn test_to_uppercase() {
80-
fn upper(c: char) -> char {
81-
let mut it = c.to_uppercase();
82-
let c = it.next().unwrap();
83-
assert!(it.next().is_none());
84-
c
84+
fn upper(c: char) -> Vec<char> {
85+
c.to_uppercase().collect()
8586
}
86-
assert_eq!(upper('a'), 'A');
87-
assert_eq!(upper('ö'), 'Ö');
88-
assert_eq!(upper('ß'), 'ß'); // not ẞ: Latin capital letter sharp s
89-
assert_eq!(upper('ü'), 'Ü');
90-
assert_eq!(upper('💩'), '💩');
91-
92-
assert_eq!(upper('σ'), 'Σ');
93-
assert_eq!(upper('τ'), 'Τ');
94-
assert_eq!(upper('ι'), 'Ι');
95-
assert_eq!(upper('γ'), 'Γ');
96-
assert_eq!(upper('μ'), 'Μ');
97-
assert_eq!(upper('α'), 'Α');
98-
assert_eq!(upper('ς'), 'Σ');
87+
assert_eq!(upper('a'), ['A']);
88+
assert_eq!(upper('ö'), ['Ö']);
89+
assert_eq!(upper('ß'), ['S', 'S']); // not ẞ: Latin capital letter sharp s
90+
assert_eq!(upper('ü'), ['Ü']);
91+
assert_eq!(upper('💩'), ['💩']);
92+
93+
assert_eq!(upper('σ'), ['Σ']);
94+
assert_eq!(upper('τ'), ['Τ']);
95+
assert_eq!(upper('ι'), ['Ι']);
96+
assert_eq!(upper('γ'), ['Γ']);
97+
assert_eq!(upper('μ'), ['Μ']);
98+
assert_eq!(upper('α'), ['Α']);
99+
assert_eq!(upper('ς'), ['Σ']);
100+
assert_eq!(upper('Dž'), ['DŽ']);
101+
assert_eq!(upper('fi'), ['F', 'I']);
102+
assert_eq!(upper('ᾀ'), ['Ἀ', 'Ι']);
103+
}
104+
105+
#[test]
106+
fn test_to_titlecase() {
107+
fn title(c: char) -> Vec<char> {
108+
c.to_titlecase().collect()
109+
}
110+
assert_eq!(title('a'), ['A']);
111+
assert_eq!(title('ö'), ['Ö']);
112+
assert_eq!(title('ß'), ['S', 's']); // not ẞ: Latin capital letter sharp s
113+
assert_eq!(title('ü'), ['Ü']);
114+
assert_eq!(title('💩'), ['💩']);
115+
116+
assert_eq!(title('σ'), ['Σ']);
117+
assert_eq!(title('τ'), ['Τ']);
118+
assert_eq!(title('ι'), ['Ι']);
119+
assert_eq!(title('γ'), ['Γ']);
120+
assert_eq!(title('μ'), ['Μ']);
121+
assert_eq!(title('α'), ['Α']);
122+
assert_eq!(title('ς'), ['Σ']);
123+
assert_eq!(title('DŽ'), ['Dž']);
124+
assert_eq!(title('fi'), ['F', 'i']);
125+
assert_eq!(title('ᾀ'), ['ᾈ']);
99126
}
100127

101128
#[test]

src/librustc_lint/lib.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232

3333
#![feature(box_patterns)]
3434
#![feature(box_syntax)]
35-
#![feature(collections)]
35+
#![cfg_attr(stage0, feature(collections))]
3636
#![feature(core)]
3737
#![feature(quote)]
3838
#![feature(rustc_diagnostic_macros)]

0 commit comments

Comments
 (0)