Skip to content

Commit efe1f7e

Browse files
committed
auto merge of #15986 : Florob/rust/nfKc-new, r=alexcrichton
This adds a new `Recompositions` iterator, which performs canonical composition on the output of the `Decompositions` iterator (which itself performs either canonical or compatibility decomposition). In effect, this implements Unicode normalization forms C and KC.
2 parents 31590bd + 7ece0ab commit efe1f7e

File tree

5 files changed

+581
-37
lines changed

5 files changed

+581
-37
lines changed

src/etc/unicode.py

+33-2
Original file line numberDiff line numberDiff line change
@@ -464,13 +464,26 @@ def emit_charwidth_module(f, width_table):
464464
pfun=lambda x: "(%s,%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2], x[3]))
465465
f.write("}\n\n")
466466

467-
def emit_norm_module(f, canon, compat, combine):
467+
def emit_norm_module(f, canon, compat, combine, norm_props):
468468
canon_keys = canon.keys()
469469
canon_keys.sort()
470470

471471
compat_keys = compat.keys()
472472
compat_keys.sort()
473473

474+
canon_comp = {}
475+
comp_exclusions = norm_props["Full_Composition_Exclusion"]
476+
for char in canon_keys:
477+
if True in map(lambda (lo, hi): lo <= char <= hi, comp_exclusions):
478+
continue
479+
decomp = canon[char]
480+
if len(decomp) == 2:
481+
if not canon_comp.has_key(decomp[0]):
482+
canon_comp[decomp[0]] = []
483+
canon_comp[decomp[0]].append( (decomp[1], char) )
484+
canon_comp_keys = canon_comp.keys()
485+
canon_comp_keys.sort()
486+
474487
f.write("pub mod normalization {\n")
475488

476489
def mkdata_fun(table):
@@ -494,6 +507,22 @@ def f(char):
494507
emit_table(f, "compatibility_table", compat_keys, "&'static [(char, &'static [char])]",
495508
pfun=mkdata_fun(compat))
496509

510+
def comp_pfun(char):
511+
data = "(%s,&[" % escape_char(char)
512+
canon_comp[char].sort(lambda x, y: x[0] - y[0])
513+
first = True
514+
for pair in canon_comp[char]:
515+
if not first:
516+
data += ","
517+
first = False
518+
data += "(%s,%s)" % (escape_char(pair[0]), escape_char(pair[1]))
519+
data += "])"
520+
return data
521+
522+
f.write(" // Canonical compositions\n")
523+
emit_table(f, "composition_table", canon_comp_keys,
524+
"&'static [(char, &'static [(char, char)])]", pfun=comp_pfun)
525+
497526
f.write("""
498527
fn bsearch_range_value_table(c: char, r: &'static [(char, char, u8)]) -> u8 {
499528
use core::option::{Some, None};
@@ -579,6 +608,8 @@ def optimize_width_table(wtable):
579608
scripts = load_properties("Scripts.txt", [])
580609
props = load_properties("PropList.txt",
581610
["White_Space", "Join_Control", "Noncharacter_Code_Point"])
611+
norm_props = load_properties("DerivedNormalizationProps.txt",
612+
["Full_Composition_Exclusion"])
582613

583614
# grapheme cluster category from DerivedCoreProperties
584615
# the rest are defined below
@@ -612,7 +643,7 @@ def optimize_width_table(wtable):
612643
emit_regex_module(rf, allcats, perl_words)
613644

614645
# normalizations and conversions module
615-
emit_norm_module(rf, canon_decomp, compat_decomp, combines)
646+
emit_norm_module(rf, canon_decomp, compat_decomp, combines, norm_props)
616647
emit_conversions_module(rf, lowerupper, upperlower)
617648

618649
### character width module

src/libcollections/str.rs

+198-30
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,9 @@ use core::cmp;
7777
use core::iter::AdditiveIterator;
7878
use core::mem;
7979

80-
use {Collection, MutableSeq};
80+
use {Collection, Deque, MutableSeq};
8181
use hash;
82+
use ringbuf::RingBuf;
8283
use string::String;
8384
use unicode;
8485
use vec::Vec;
@@ -302,6 +303,106 @@ impl<'a> Iterator<char> for Decompositions<'a> {
302303
}
303304
}
304305

306+
#[deriving(Clone)]
307+
enum RecompositionState {
308+
Composing,
309+
Purging,
310+
Finished
311+
}
312+
313+
/// External iterator for a string's recomposition's characters.
314+
/// Use with the `std::iter` module.
315+
#[deriving(Clone)]
316+
pub struct Recompositions<'a> {
317+
iter: Decompositions<'a>,
318+
state: RecompositionState,
319+
buffer: RingBuf<char>,
320+
composee: Option<char>,
321+
last_ccc: Option<u8>
322+
}
323+
324+
impl<'a> Iterator<char> for Recompositions<'a> {
325+
#[inline]
326+
fn next(&mut self) -> Option<char> {
327+
loop {
328+
match self.state {
329+
Composing => {
330+
for ch in self.iter {
331+
let ch_class = unicode::char::canonical_combining_class(ch);
332+
if self.composee.is_none() {
333+
if ch_class != 0 {
334+
return Some(ch);
335+
}
336+
self.composee = Some(ch);
337+
continue;
338+
}
339+
let k = self.composee.clone().unwrap();
340+
341+
match self.last_ccc {
342+
None => {
343+
match unicode::char::compose(k, ch) {
344+
Some(r) => {
345+
self.composee = Some(r);
346+
continue;
347+
}
348+
None => {
349+
if ch_class == 0 {
350+
self.composee = Some(ch);
351+
return Some(k);
352+
}
353+
self.buffer.push(ch);
354+
self.last_ccc = Some(ch_class);
355+
}
356+
}
357+
}
358+
Some(l_class) => {
359+
if l_class >= ch_class {
360+
// `ch` is blocked from `composee`
361+
if ch_class == 0 {
362+
self.composee = Some(ch);
363+
self.last_ccc = None;
364+
self.state = Purging;
365+
return Some(k);
366+
}
367+
self.buffer.push(ch);
368+
self.last_ccc = Some(ch_class);
369+
continue;
370+
}
371+
match unicode::char::compose(k, ch) {
372+
Some(r) => {
373+
self.composee = Some(r);
374+
continue;
375+
}
376+
None => {
377+
self.buffer.push(ch);
378+
self.last_ccc = Some(ch_class);
379+
}
380+
}
381+
}
382+
}
383+
}
384+
self.state = Finished;
385+
if self.composee.is_some() {
386+
return self.composee.take();
387+
}
388+
}
389+
Purging => {
390+
match self.buffer.pop_front() {
391+
None => self.state = Composing,
392+
s => return s
393+
}
394+
}
395+
Finished => {
396+
match self.buffer.pop_front() {
397+
None => return self.composee.take(),
398+
s => return s
399+
}
400+
}
401+
}
402+
}
403+
}
404+
}
405+
305406
/// Replace all occurrences of one string with another
306407
///
307408
/// # Arguments
@@ -744,6 +845,32 @@ pub trait StrAllocating: Str {
744845
kind: Compatible
745846
}
746847
}
848+
849+
/// An Iterator over the string in Unicode Normalization Form C
850+
/// (canonical decomposition followed by canonical composition).
851+
#[inline]
852+
fn nfc_chars<'a>(&'a self) -> Recompositions<'a> {
853+
Recompositions {
854+
iter: self.nfd_chars(),
855+
state: Composing,
856+
buffer: RingBuf::new(),
857+
composee: None,
858+
last_ccc: None
859+
}
860+
}
861+
862+
/// An Iterator over the string in Unicode Normalization Form KC
863+
/// (compatibility decomposition followed by canonical composition).
864+
#[inline]
865+
fn nfkc_chars<'a>(&'a self) -> Recompositions<'a> {
866+
Recompositions {
867+
iter: self.nfkd_chars(),
868+
state: Composing,
869+
buffer: RingBuf::new(),
870+
composee: None,
871+
last_ccc: None
872+
}
873+
}
747874
}
748875

749876
impl<'a> StrAllocating for &'a str {
@@ -1754,39 +1881,80 @@ mod tests {
17541881

17551882
#[test]
17561883
fn test_nfd_chars() {
1757-
assert_eq!("abc".nfd_chars().collect::<String>(), String::from_str("abc"));
1758-
assert_eq!("\u1e0b\u01c4".nfd_chars().collect::<String>(),
1759-
String::from_str("d\u0307\u01c4"));
1760-
assert_eq!("\u2026".nfd_chars().collect::<String>(), String::from_str("\u2026"));
1761-
assert_eq!("\u2126".nfd_chars().collect::<String>(), String::from_str("\u03a9"));
1762-
assert_eq!("\u1e0b\u0323".nfd_chars().collect::<String>(),
1763-
String::from_str("d\u0323\u0307"));
1764-
assert_eq!("\u1e0d\u0307".nfd_chars().collect::<String>(),
1765-
String::from_str("d\u0323\u0307"));
1766-
assert_eq!("a\u0301".nfd_chars().collect::<String>(), String::from_str("a\u0301"));
1767-
assert_eq!("\u0301a".nfd_chars().collect::<String>(), String::from_str("\u0301a"));
1768-
assert_eq!("\ud4db".nfd_chars().collect::<String>(),
1769-
String::from_str("\u1111\u1171\u11b6"));
1770-
assert_eq!("\uac1c".nfd_chars().collect::<String>(), String::from_str("\u1100\u1162"));
1884+
macro_rules! t {
1885+
($input: expr, $expected: expr) => {
1886+
assert_eq!($input.nfd_chars().collect::<String>(), $expected.into_string());
1887+
}
1888+
}
1889+
t!("abc", "abc");
1890+
t!("\u1e0b\u01c4", "d\u0307\u01c4");
1891+
t!("\u2026", "\u2026");
1892+
t!("\u2126", "\u03a9");
1893+
t!("\u1e0b\u0323", "d\u0323\u0307");
1894+
t!("\u1e0d\u0307", "d\u0323\u0307");
1895+
t!("a\u0301", "a\u0301");
1896+
t!("\u0301a", "\u0301a");
1897+
t!("\ud4db", "\u1111\u1171\u11b6");
1898+
t!("\uac1c", "\u1100\u1162");
17711899
}
17721900

17731901
#[test]
17741902
fn test_nfkd_chars() {
1775-
assert_eq!("abc".nfkd_chars().collect::<String>(), String::from_str("abc"));
1776-
assert_eq!("\u1e0b\u01c4".nfkd_chars().collect::<String>(),
1777-
String::from_str("d\u0307DZ\u030c"));
1778-
assert_eq!("\u2026".nfkd_chars().collect::<String>(), String::from_str("..."));
1779-
assert_eq!("\u2126".nfkd_chars().collect::<String>(), String::from_str("\u03a9"));
1780-
assert_eq!("\u1e0b\u0323".nfkd_chars().collect::<String>(),
1781-
String::from_str("d\u0323\u0307"));
1782-
assert_eq!("\u1e0d\u0307".nfkd_chars().collect::<String>(),
1783-
String::from_str("d\u0323\u0307"));
1784-
assert_eq!("a\u0301".nfkd_chars().collect::<String>(), String::from_str("a\u0301"));
1785-
assert_eq!("\u0301a".nfkd_chars().collect::<String>(),
1786-
String::from_str("\u0301a"));
1787-
assert_eq!("\ud4db".nfkd_chars().collect::<String>(),
1788-
String::from_str("\u1111\u1171\u11b6"));
1789-
assert_eq!("\uac1c".nfkd_chars().collect::<String>(), String::from_str("\u1100\u1162"));
1903+
macro_rules! t {
1904+
($input: expr, $expected: expr) => {
1905+
assert_eq!($input.nfkd_chars().collect::<String>(), $expected.into_string());
1906+
}
1907+
}
1908+
t!("abc", "abc");
1909+
t!("\u1e0b\u01c4", "d\u0307DZ\u030c");
1910+
t!("\u2026", "...");
1911+
t!("\u2126", "\u03a9");
1912+
t!("\u1e0b\u0323", "d\u0323\u0307");
1913+
t!("\u1e0d\u0307", "d\u0323\u0307");
1914+
t!("a\u0301", "a\u0301");
1915+
t!("\u0301a", "\u0301a");
1916+
t!("\ud4db", "\u1111\u1171\u11b6");
1917+
t!("\uac1c", "\u1100\u1162");
1918+
}
1919+
1920+
#[test]
1921+
fn test_nfc_chars() {
1922+
macro_rules! t {
1923+
($input: expr, $expected: expr) => {
1924+
assert_eq!($input.nfc_chars().collect::<String>(), $expected.into_string());
1925+
}
1926+
}
1927+
t!("abc", "abc");
1928+
t!("\u1e0b\u01c4", "\u1e0b\u01c4");
1929+
t!("\u2026", "\u2026");
1930+
t!("\u2126", "\u03a9");
1931+
t!("\u1e0b\u0323", "\u1e0d\u0307");
1932+
t!("\u1e0d\u0307", "\u1e0d\u0307");
1933+
t!("a\u0301", "\xe1");
1934+
t!("\u0301a", "\u0301a");
1935+
t!("\ud4db", "\ud4db");
1936+
t!("\uac1c", "\uac1c");
1937+
t!("a\u0300\u0305\u0315\u05aeb", "\xe0\u05ae\u0305\u0315b");
1938+
}
1939+
1940+
#[test]
1941+
fn test_nfkc_chars() {
1942+
macro_rules! t {
1943+
($input: expr, $expected: expr) => {
1944+
assert_eq!($input.nfkc_chars().collect::<String>(), $expected.into_string());
1945+
}
1946+
}
1947+
t!("abc", "abc");
1948+
t!("\u1e0b\u01c4", "\u1e0bD\u017d");
1949+
t!("\u2026", "...");
1950+
t!("\u2126", "\u03a9");
1951+
t!("\u1e0b\u0323", "\u1e0d\u0307");
1952+
t!("\u1e0d\u0307", "\u1e0d\u0307");
1953+
t!("a\u0301", "\xe1");
1954+
t!("\u0301a", "\u0301a");
1955+
t!("\ud4db", "\ud4db");
1956+
t!("\uac1c", "\uac1c");
1957+
t!("a\u0300\u0305\u0315\u05aeb", "\xe0\u05ae\u0305\u0315b");
17901958
}
17911959

17921960
#[test]

src/libunicode/lib.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ extern crate core;
3535
// regex module
3636
pub use tables::regex;
3737

38-
mod decompose;
38+
mod normalize;
3939
mod tables;
4040
mod u_char;
4141
mod u_str;
@@ -61,7 +61,7 @@ pub mod char {
6161
pub use core::char::{from_digit, escape_unicode, escape_default};
6262
pub use core::char::{len_utf8_bytes, Char};
6363

64-
pub use decompose::{decompose_canonical, decompose_compatible};
64+
pub use normalize::{decompose_canonical, decompose_compatible, compose};
6565

6666
pub use tables::normalization::canonical_combining_class;
6767

0 commit comments

Comments
 (0)