@@ -77,8 +77,9 @@ use core::cmp;
77
77
use core:: iter:: AdditiveIterator ;
78
78
use core:: mem;
79
79
80
- use { Collection , MutableSeq } ;
80
+ use { Collection , Deque , MutableSeq } ;
81
81
use hash;
82
+ use ringbuf:: RingBuf ;
82
83
use string:: String ;
83
84
use unicode;
84
85
use vec:: Vec ;
@@ -302,6 +303,106 @@ impl<'a> Iterator<char> for Decompositions<'a> {
302
303
}
303
304
}
304
305
306
+ #[ deriving( Clone ) ]
307
+ enum RecompositionState { // Phase of the `Recompositions` iterator's state machine.
308
+ Composing , // Pulling chars from the source iterator and trying to compose them.
309
+ Purging , // Emitting buffered chars; switches back to `Composing` once the buffer is empty.
310
+ Finished // Source iterator is exhausted: emit what is left in the buffer, then any pending composee.
311
+ }
312
+
313
+ /// External iterator over the characters of a string after recomposition
314
+ /// (decomposed chars are composed again where possible). Use with the `std::iter` module.
315
+ #[ deriving( Clone ) ]
316
+ pub struct Recompositions < ' a > {
317
+ iter : Decompositions < ' a > , // source of decomposed chars
318
+ state : RecompositionState , // current phase of the state machine driven by `next`
319
+ buffer : RingBuf < char > , // chars that did not compose with `composee`; emitted front-first after it
320
+ composee : Option < char > , // pending combining-class-0 char (or composition result) that later chars are composed into
321
+ last_ccc : Option < u8 > // combining class of the char most recently pushed to `buffer`; `None` if nothing was buffered for the current composee
322
+ }
323
+
324
+ impl < ' a > Iterator < char > for Recompositions < ' a > {
325
+ #[ inline]
326
+ fn next ( & mut self ) -> Option < char > { // Yields the next recomposed char, or `None` when source, buffer and composee are all drained.
327
+ loop { // Each pass handles one state; a state change without a `return` loops around to the new state.
328
+ match self . state {
329
+ Composing => {
330
+ for ch in self . iter {
331
+ let ch_class = unicode:: char:: canonical_combining_class ( ch) ;
332
+ if self . composee . is_none ( ) { // No pending composee yet.
333
+ if ch_class != 0 { // A non-zero-class char with nothing to attach to is passed through unchanged.
334
+ return Some ( ch) ;
335
+ }
336
+ self . composee = Some ( ch) ; // A class-0 char becomes the new composee.
337
+ continue ;
338
+ }
339
+ let k = self . composee . clone ( ) . unwrap ( ) ; // Cannot fail: the `is_none` case was handled above.
340
+
341
+ match self . last_ccc {
342
+ None => { // Nothing buffered since the composee was set, so `ch` directly follows it.
343
+ match unicode:: char:: compose ( k, ch) {
344
+ Some ( r) => { // `k` and `ch` compose: keep the result as composee and keep going.
345
+ self . composee = Some ( r) ;
346
+ continue ;
347
+ }
348
+ None => {
349
+ if ch_class == 0 { // Class-0 char that does not compose: emit `k`, `ch` is the next composee.
350
+ self . composee = Some ( ch) ;
351
+ return Some ( k) ;
352
+ }
353
+ self . buffer . push ( ch) ; // Non-composing mark: hold it back until the composee is emitted.
354
+ self . last_ccc = Some ( ch_class) ;
355
+ }
356
+ }
357
+ }
358
+ Some ( l_class) => { // At least one char is buffered; `l_class` is the class of the latest one.
359
+ if l_class >= ch_class {
360
+ // `ch` is blocked from `composee`: the last buffered char's combining class is >= that of `ch`
361
+ if ch_class == 0 { // Class-0 char ends the run: emit `k`, flush the buffer via `Purging`, `ch` is the next composee.
362
+ self . composee = Some ( ch) ;
363
+ self . last_ccc = None ;
364
+ self . state = Purging ;
365
+ return Some ( k) ;
366
+ }
367
+ self . buffer . push ( ch) ; // Blocked mark: buffer it without attempting composition.
368
+ self . last_ccc = Some ( ch_class) ;
369
+ continue ;
370
+ }
371
+ match unicode:: char:: compose ( k, ch) { // Not blocked: `ch` may still compose with the composee.
372
+ Some ( r) => {
373
+ self . composee = Some ( r) ;
374
+ continue ;
375
+ }
376
+ None => {
377
+ self . buffer . push ( ch) ;
378
+ self . last_ccc = Some ( ch_class) ;
379
+ }
380
+ }
381
+ }
382
+ }
383
+ }
384
+ self . state = Finished ; // The `for` loop ended without returning: the source has no more chars.
385
+ if self . composee . is_some ( ) { // Emit the pending composee first; buffered chars follow in `Finished`.
386
+ return self . composee . take ( ) ;
387
+ }
388
+ }
389
+ Purging => {
390
+ match self . buffer . pop_front ( ) { // Emit buffered chars in the order they were pushed.
391
+ None => self . state = Composing , // Buffer drained: resume composing with the composee already set.
392
+ s => return s
393
+ }
394
+ }
395
+ Finished => {
396
+ match self . buffer . pop_front ( ) {
397
+ None => return self . composee . take ( ) , // Buffer empty: yield a composee if one is still pending, otherwise `None`.
398
+ s => return s
399
+ }
400
+ }
401
+ }
402
+ }
403
+ }
404
+ }
405
+
305
406
/// Replace all occurrences of one string with another
306
407
///
307
408
/// # Arguments
@@ -744,6 +845,32 @@ pub trait StrAllocating: Str {
744
845
kind : Compatible
745
846
}
746
847
}
848
+
849
+ /// Returns an iterator over the chars of the string in Unicode Normalization Form C
850
+ /// (canonical decomposition followed by canonical composition).
851
+ #[ inline]
852
+ fn nfc_chars < ' a > ( & ' a self ) -> Recompositions < ' a > {
853
+ Recompositions {
854
+ iter : self . nfd_chars ( ) , // decomposed chars (from `nfd_chars`) that feed the composition pass
855
+ state : Composing , // start in the composing phase
856
+ buffer : RingBuf :: new ( ) , // nothing buffered yet
857
+ composee : None , // no pending char yet
858
+ last_ccc : None
859
+ }
860
+ }
861
+
862
+ /// Returns an iterator over the chars of the string in Unicode Normalization Form KC
863
+ /// (compatibility decomposition followed by canonical composition).
864
+ #[ inline]
865
+ fn nfkc_chars < ' a > ( & ' a self ) -> Recompositions < ' a > {
866
+ Recompositions {
867
+ iter : self . nfkd_chars ( ) , // decomposed chars (from `nfkd_chars`) that feed the composition pass
868
+ state : Composing , // start in the composing phase
869
+ buffer : RingBuf :: new ( ) , // nothing buffered yet
870
+ composee : None , // no pending char yet
871
+ last_ccc : None
872
+ }
873
+ }
747
874
}
748
875
749
876
impl < ' a > StrAllocating for & ' a str {
@@ -1754,39 +1881,80 @@ mod tests {
1754
1881
1755
1882
#[ test]
1756
1883
fn test_nfd_chars ( ) { // Checks `nfd_chars` output against expected strings.
1757
- assert_eq ! ( "abc" . nfd_chars( ) . collect:: <String >( ) , String :: from_str( "abc" ) ) ;
1758
- assert_eq ! ( "\u1e0b \u01c4 " . nfd_chars( ) . collect:: <String >( ) ,
1759
- String :: from_str( "d\u0307 \u01c4 " ) ) ;
1760
- assert_eq ! ( "\u2026 " . nfd_chars( ) . collect:: <String >( ) , String :: from_str( "\u2026 " ) ) ;
1761
- assert_eq ! ( "\u2126 " . nfd_chars( ) . collect:: <String >( ) , String :: from_str( "\u03a9 " ) ) ;
1762
- assert_eq ! ( "\u1e0b \u0323 " . nfd_chars( ) . collect:: <String >( ) ,
1763
- String :: from_str( "d\u0323 \u0307 " ) ) ;
1764
- assert_eq ! ( "\u1e0d \u0307 " . nfd_chars( ) . collect:: <String >( ) ,
1765
- String :: from_str( "d\u0323 \u0307 " ) ) ;
1766
- assert_eq ! ( "a\u0301 " . nfd_chars( ) . collect:: <String >( ) , String :: from_str( "a\u0301 " ) ) ;
1767
- assert_eq ! ( "\u0301 a" . nfd_chars( ) . collect:: <String >( ) , String :: from_str( "\u0301 a" ) ) ;
1768
- assert_eq ! ( "\ud4db " . nfd_chars( ) . collect:: <String >( ) ,
1769
- String :: from_str( "\u1111 \u1171 \u11b6 " ) ) ;
1770
- assert_eq ! ( "\uac1c " . nfd_chars( ) . collect:: <String >( ) , String :: from_str( "\u1100 \u1162 " ) ) ;
1884
+ macro_rules! t { // t!(input, expected): asserts that input's `nfd_chars()` collected into a String equals expected.
1885
+ ( $input: expr, $expected: expr) => {
1886
+ assert_eq!( $input. nfd_chars( ) . collect:: <String >( ) , $expected. into_string( ) ) ;
1887
+ }
1888
+ }
1889
+ t ! ( "abc" , "abc" ) ;
1890
+ t ! ( "\u1e0b \u01c4 " , "d\u0307 \u01c4 " ) ;
1891
+ t ! ( "\u2026 " , "\u2026 " ) ;
1892
+ t ! ( "\u2126 " , "\u03a9 " ) ;
1893
+ t ! ( "\u1e0b \u0323 " , "d\u0323 \u0307 " ) ;
1894
+ t ! ( "\u1e0d \u0307 " , "d\u0323 \u0307 " ) ;
1895
+ t ! ( "a\u0301 " , "a\u0301 " ) ;
1896
+ t ! ( "\u0301 a" , "\u0301 a" ) ;
1897
+ t ! ( "\ud4db " , "\u1111 \u1171 \u11b6 " ) ;
1898
+ t ! ( "\uac1c " , "\u1100 \u1162 " ) ;
1771
1899
}
1772
1900
1773
1901
#[ test]
1774
1902
fn test_nfkd_chars ( ) { // Checks `nfkd_chars` output against expected strings.
1775
- assert_eq ! ( "abc" . nfkd_chars( ) . collect:: <String >( ) , String :: from_str( "abc" ) ) ;
1776
- assert_eq ! ( "\u1e0b \u01c4 " . nfkd_chars( ) . collect:: <String >( ) ,
1777
- String :: from_str( "d\u0307 DZ\u030c " ) ) ;
1778
- assert_eq ! ( "\u2026 " . nfkd_chars( ) . collect:: <String >( ) , String :: from_str( "..." ) ) ;
1779
- assert_eq ! ( "\u2126 " . nfkd_chars( ) . collect:: <String >( ) , String :: from_str( "\u03a9 " ) ) ;
1780
- assert_eq ! ( "\u1e0b \u0323 " . nfkd_chars( ) . collect:: <String >( ) ,
1781
- String :: from_str( "d\u0323 \u0307 " ) ) ;
1782
- assert_eq ! ( "\u1e0d \u0307 " . nfkd_chars( ) . collect:: <String >( ) ,
1783
- String :: from_str( "d\u0323 \u0307 " ) ) ;
1784
- assert_eq ! ( "a\u0301 " . nfkd_chars( ) . collect:: <String >( ) , String :: from_str( "a\u0301 " ) ) ;
1785
- assert_eq ! ( "\u0301 a" . nfkd_chars( ) . collect:: <String >( ) ,
1786
- String :: from_str( "\u0301 a" ) ) ;
1787
- assert_eq ! ( "\ud4db " . nfkd_chars( ) . collect:: <String >( ) ,
1788
- String :: from_str( "\u1111 \u1171 \u11b6 " ) ) ;
1789
- assert_eq ! ( "\uac1c " . nfkd_chars( ) . collect:: <String >( ) , String :: from_str( "\u1100 \u1162 " ) ) ;
1903
+ macro_rules! t { // t!(input, expected): asserts that input's `nfkd_chars()` collected into a String equals expected.
1904
+ ( $input: expr, $expected: expr) => {
1905
+ assert_eq!( $input. nfkd_chars( ) . collect:: <String >( ) , $expected. into_string( ) ) ;
1906
+ }
1907
+ }
1908
+ t ! ( "abc" , "abc" ) ;
1909
+ t ! ( "\u1e0b \u01c4 " , "d\u0307 DZ\u030c " ) ;
1910
+ t ! ( "\u2026 " , "..." ) ;
1911
+ t ! ( "\u2126 " , "\u03a9 " ) ;
1912
+ t ! ( "\u1e0b \u0323 " , "d\u0323 \u0307 " ) ;
1913
+ t ! ( "\u1e0d \u0307 " , "d\u0323 \u0307 " ) ;
1914
+ t ! ( "a\u0301 " , "a\u0301 " ) ;
1915
+ t ! ( "\u0301 a" , "\u0301 a" ) ;
1916
+ t ! ( "\ud4db " , "\u1111 \u1171 \u11b6 " ) ;
1917
+ t ! ( "\uac1c " , "\u1100 \u1162 " ) ;
1918
+ }
1919
+
1920
+ #[ test]
1921
+ fn test_nfc_chars ( ) { // Checks `nfc_chars` output against expected strings.
1922
+ macro_rules! t { // t!(input, expected): asserts that input's `nfc_chars()` collected into a String equals expected.
1923
+ ( $input: expr, $expected: expr) => {
1924
+ assert_eq!( $input. nfc_chars( ) . collect:: <String >( ) , $expected. into_string( ) ) ;
1925
+ }
1926
+ }
1927
+ t ! ( "abc" , "abc" ) ;
1928
+ t ! ( "\u1e0b \u01c4 " , "\u1e0b \u01c4 " ) ;
1929
+ t ! ( "\u2026 " , "\u2026 " ) ;
1930
+ t ! ( "\u2126 " , "\u03a9 " ) ;
1931
+ t ! ( "\u1e0b \u0323 " , "\u1e0d \u0307 " ) ;
1932
+ t ! ( "\u1e0d \u0307 " , "\u1e0d \u0307 " ) ;
1933
+ t ! ( "a\u0301 " , "\xe1 " ) ;
1934
+ t ! ( "\u0301 a" , "\u0301 a" ) ;
1935
+ t ! ( "\ud4db " , "\ud4db " ) ;
1936
+ t ! ( "\uac1c " , "\uac1c " ) ;
1937
+ t ! ( "a\u0300 \u0305 \u0315 \u05ae b" , "\xe0 \u05ae \u0305 \u0315 b" ) ; // several marks after one base char: the composed char comes first, the remaining marks follow it, then `b`
1938
+ }
1939
+
1940
+ #[ test]
1941
+ fn test_nfkc_chars ( ) { // Checks `nfkc_chars` output against expected strings.
1942
+ macro_rules! t { // t!(input, expected): asserts that input's `nfkc_chars()` collected into a String equals expected.
1943
+ ( $input: expr, $expected: expr) => {
1944
+ assert_eq!( $input. nfkc_chars( ) . collect:: <String >( ) , $expected. into_string( ) ) ;
1945
+ }
1946
+ }
1947
+ t ! ( "abc" , "abc" ) ;
1948
+ t ! ( "\u1e0b \u01c4 " , "\u1e0b D\u017d " ) ;
1949
+ t ! ( "\u2026 " , "..." ) ;
1950
+ t ! ( "\u2126 " , "\u03a9 " ) ;
1951
+ t ! ( "\u1e0b \u0323 " , "\u1e0d \u0307 " ) ;
1952
+ t ! ( "\u1e0d \u0307 " , "\u1e0d \u0307 " ) ;
1953
+ t ! ( "a\u0301 " , "\xe1 " ) ;
1954
+ t ! ( "\u0301 a" , "\u0301 a" ) ;
1955
+ t ! ( "\ud4db " , "\ud4db " ) ;
1956
+ t ! ( "\uac1c " , "\uac1c " ) ;
1957
+ t ! ( "a\u0300 \u0305 \u0315 \u05ae b" , "\xe0 \u05ae \u0305 \u0315 b" ) ; // several marks after one base char: the composed char comes first, the remaining marks follow it, then `b`
1790
1958
}
1791
1959
1792
1960
#[ test]
0 commit comments