10
10
11
11
//! Utilities for manipulating the char type
12
12
13
- #[ cfg( not( test) ) ]
14
- use cmp:: Ord ;
15
13
use option:: { None , Option , Some } ;
16
14
use str;
15
+ #[ cfg( stage0) ]
16
+ use str:: StrSlice ;
17
+ #[ cfg( not( stage0) ) ]
18
+ use str:: { StrSlice , OwnedStr } ;
17
19
use u32;
18
20
use uint;
19
21
use unicode:: { derived_property, general_category} ;
20
22
21
- #[ cfg( not( test) ) ] use cmp:: Eq ;
23
+ #[ cfg( not( test) ) ]
24
+ use cmp:: { Eq , Ord } ;
22
25
23
26
/*
24
- Lu Uppercase_Letter an uppercase letter
25
- Ll Lowercase_Letter a lowercase letter
26
- Lt Titlecase_Letter a digraphic character, with first part uppercase
27
- Lm Modifier_Letter a modifier letter
28
- Lo Other_Letter other letters, including syllables and ideographs
29
- Mn Nonspacing_Mark a nonspacing combining mark (zero advance width)
30
- Mc Spacing_Mark a spacing combining mark (positive advance width)
31
- Me Enclosing_Mark an enclosing combining mark
32
- Nd Decimal_Number a decimal digit
33
- Nl Letter_Number a letterlike numeric character
34
- No Other_Number a numeric character of other type
27
+ Lu Uppercase_Letter an uppercase letter
28
+ Ll Lowercase_Letter a lowercase letter
29
+ Lt Titlecase_Letter a digraphic character, with first part uppercase
30
+ Lm Modifier_Letter a modifier letter
31
+ Lo Other_Letter other letters, including syllables and ideographs
32
+ Mn Nonspacing_Mark a nonspacing combining mark (zero advance width)
33
+ Mc Spacing_Mark a spacing combining mark (positive advance width)
34
+ Me Enclosing_Mark an enclosing combining mark
35
+ Nd Decimal_Number a decimal digit
36
+ Nl Letter_Number a letterlike numeric character
37
+ No Other_Number a numeric character of other type
35
38
Pc Connector_Punctuation a connecting punctuation mark, like a tie
36
- Pd Dash_Punctuation a dash or hyphen punctuation mark
37
- Ps Open_Punctuation an opening punctuation mark (of a pair)
38
- Pe Close_Punctuation a closing punctuation mark (of a pair)
39
+ Pd Dash_Punctuation a dash or hyphen punctuation mark
40
+ Ps Open_Punctuation an opening punctuation mark (of a pair)
41
+ Pe Close_Punctuation a closing punctuation mark (of a pair)
39
42
Pi Initial_Punctuation an initial quotation mark
40
- Pf Final_Punctuation a final quotation mark
41
- Po Other_Punctuation a punctuation mark of other type
42
- Sm Math_Symbol a symbol of primarily mathematical use
43
- Sc Currency_Symbol a currency sign
44
- Sk Modifier_Symbol a non-letterlike modifier symbol
45
- So Other_Symbol a symbol of other type
46
- Zs Space_Separator a space character (of various non-zero widths)
47
- Zl Line_Separator U+2028 LINE SEPARATOR only
43
+ Pf Final_Punctuation a final quotation mark
44
+ Po Other_Punctuation a punctuation mark of other type
45
+ Sm Math_Symbol a symbol of primarily mathematical use
46
+ Sc Currency_Symbol a currency sign
47
+ Sk Modifier_Symbol a non-letterlike modifier symbol
48
+ So Other_Symbol a symbol of other type
49
+ Zs Space_Separator a space character (of various non-zero widths)
50
+ Zl Line_Separator U+2028 LINE SEPARATOR only
48
51
Zp Paragraph_Separator U+2029 PARAGRAPH SEPARATOR only
49
- Cc Control a C0 or C1 control code
50
- Cf Format a format control character
51
- Cs Surrogate a surrogate code point
52
- Co Private_Use a private-use character
53
- Cn Unassigned a reserved unassigned code point or a noncharacter
52
+ Cc Control a C0 or C1 control code
53
+ Cf Format a format control character
54
+ Cs Surrogate a surrogate code point
55
+ Co Private_Use a private-use character
56
+ Cn Unassigned a reserved unassigned code point or a noncharacter
54
57
*/
55
58
56
59
pub fn is_alphabetic ( c : char ) -> bool { derived_property:: Alphabetic ( c) }
@@ -62,18 +65,14 @@ pub fn is_XID_continue(c: char) -> bool { derived_property::XID_Continue(c) }
62
65
* in terms of the Unicode General Category 'Ll'
63
66
*/
64
67
#[ inline( always) ]
65
- pub fn is_lowercase ( c : char ) -> bool {
66
- return general_category:: Ll ( c) ;
67
- }
68
+ pub fn is_lowercase ( c : char ) -> bool { general_category:: Ll ( c) }
68
69
69
70
/**
70
71
* Indicates whether a character is in upper case, defined
71
72
* in terms of the Unicode General Category 'Lu'.
72
73
*/
73
74
#[ inline( always) ]
74
- pub fn is_uppercase ( c : char ) -> bool {
75
- return general_category:: Lu ( c) ;
76
- }
75
+ pub fn is_uppercase ( c : char ) -> bool { general_category:: Lu ( c) }
77
76
78
77
/**
79
78
* Indicates whether a character is whitespace. Whitespace is defined in
@@ -82,10 +81,10 @@ pub fn is_uppercase(c: char) -> bool {
82
81
*/
83
82
#[ inline( always) ]
84
83
pub fn is_whitespace ( c : char ) -> bool {
85
- return ( '\x09' <= c && c <= '\x0d' )
84
+ ( '\x09' <= c && c <= '\x0d' )
86
85
|| general_category:: Zs ( c)
87
86
|| general_category:: Zl ( c)
88
- || general_category:: Zp ( c) ;
87
+ || general_category:: Zp ( c)
89
88
}
90
89
91
90
/**
@@ -95,18 +94,18 @@ pub fn is_whitespace(c: char) -> bool {
95
94
*/
96
95
#[ inline( always) ]
97
96
pub fn is_alphanumeric ( c : char ) -> bool {
98
- return derived_property:: Alphabetic ( c) ||
99
- general_category:: Nd ( c) ||
100
- general_category:: Nl ( c) ||
101
- general_category:: No ( c) ;
97
+ derived_property:: Alphabetic ( c)
98
+ || general_category:: Nd ( c)
99
+ || general_category:: Nl ( c)
100
+ || general_category:: No ( c)
102
101
}
103
102
104
103
/// Indicates whether the character is numeric (Nd, Nl, or No)
105
104
#[ inline( always) ]
106
105
pub fn is_digit ( c : char ) -> bool {
107
- return general_category:: Nd ( c) ||
108
- general_category:: Nl ( c) ||
109
- general_category:: No ( c) ;
106
+ general_category:: Nd ( c)
107
+ || general_category:: Nl ( c)
108
+ || general_category:: No ( c)
110
109
}
111
110
112
111
/**
@@ -125,7 +124,7 @@ pub fn is_digit(c: char) -> bool {
125
124
pub fn is_digit_radix ( c : char , radix : uint ) -> bool {
126
125
match to_digit ( c, radix) {
127
126
Some ( _) => true ,
128
- None => false
127
+ None => false ,
129
128
}
130
129
}
131
130
@@ -151,7 +150,7 @@ pub fn to_digit(c: char, radix: uint) -> Option<uint> {
151
150
'0' .. '9' => c as uint - ( '0' as uint ) ,
152
151
'a' .. 'z' => c as uint + 10 u - ( 'a' as uint ) ,
153
152
'A' .. 'Z' => c as uint + 10 u - ( 'A' as uint ) ,
154
- _ => return None
153
+ _ => return None ,
155
154
} ;
156
155
if val < radix { Some ( val) }
157
156
else { None }
@@ -181,6 +180,21 @@ pub fn from_digit(num: uint, radix: uint) -> Option<char> {
181
180
}
182
181
}
183
182
183
+ #[ cfg( stage0) ]
184
+ pub fn escape_unicode ( c : char ) -> ~str {
185
+ let s = u32:: to_str_radix ( c as u32 , 16 u) ;
186
+ let ( c, pad) = ( if c <= '\xff' { ( 'x' , 2 u) }
187
+ else if c <= '\uffff' { ( 'u' , 4 u) }
188
+ else { ( 'U' , 8 u) } ) ;
189
+ assert ! ( str :: len( s) <= pad) ;
190
+ let mut out = ~"\\ ";
191
+ str:: push_str ( & mut out, str:: from_char ( c) ) ;
192
+ for uint:: range( str:: len( s) , pad) |_i|
193
+ { str:: push_str( & mut out, ~"0 ") ; }
194
+ str:: push_str ( & mut out, s) ;
195
+ out
196
+ }
197
+
184
198
/**
185
199
* Return the hexadecimal unicode escape of a char.
186
200
*
@@ -190,17 +204,21 @@ pub fn from_digit(num: uint, radix: uint) -> Option<char> {
190
204
* - chars in [0x100,0xffff] get 4-digit escapes: `\\uNNNN`
191
205
* - chars above 0x10000 get 8-digit escapes: `\\UNNNNNNNN`
192
206
*/
207
+ #[ cfg( not( stage0) ) ]
193
208
pub fn escape_unicode ( c : char ) -> ~str {
194
209
let s = u32:: to_str_radix ( c as u32 , 16 u) ;
195
- let ( c, pad) = ( if c <= '\xff' { ( 'x' , 2 u) }
196
- else if c <= '\uffff' { ( 'u' , 4 u) }
197
- else { ( 'U' , 8 u) } ) ;
198
- assert ! ( str :: len( s) <= pad) ;
210
+ let ( c, pad) = cond ! (
211
+ ( c <= '\xff' ) { ( 'x' , 2 u) }
212
+ ( c <= '\uffff' ) { ( 'u' , 4 u) }
213
+ _ { ( 'U' , 8 u) }
214
+ ) ;
215
+ assert ! ( s. len( ) <= pad) ;
199
216
let mut out = ~"\\ ";
200
- str:: push_str ( & mut out, str:: from_char ( c) ) ;
201
- for uint:: range( str:: len( s) , pad) |_i|
202
- { str:: push_str( & mut out, ~"0 ") ; }
203
- str:: push_str ( & mut out, s) ;
217
+ out. push_str ( str:: from_char ( c) ) ;
218
+ for uint:: range( s. len( ) , pad) |_| {
219
+ out. push_str ( "0" ) ;
220
+ }
221
+ out. push_str ( s) ;
204
222
out
205
223
}
206
224
@@ -218,18 +236,18 @@ pub fn escape_unicode(c: char) -> ~str {
218
236
*/
219
237
pub fn escape_default ( c : char ) -> ~str {
220
238
match c {
221
- '\t' => ~"\\ t",
222
- '\r' => ~"\\ r",
223
- '\n' => ~" \\ n",
224
- '\\' => ~"\\ \\ ",
225
- '\'' => ~"\\ ' ",
226
- '"' => ~"\\ \" ",
227
- '\x20' .. '\x7e' => str:: from_char ( c) ,
228
- _ => escape_unicode ( c )
239
+ '\t' => ~"\\ t",
240
+ '\r' => ~"\\ r",
241
+ '\n' => ~" \\ n",
242
+ '\\' => ~"\\ \\ ",
243
+ '\'' => ~"\\ ' ",
244
+ '"' => ~"\\ \" ",
245
+ '\x20' .. '\x7e' => str:: from_char ( c) ,
246
+ _ => c . escape_unicode ( ) ,
229
247
}
230
248
}
231
249
232
- /// Returns the amount of bytes this character would need if encoded in utf8
250
+ # [ cfg ( stage0 ) ]
233
251
pub fn len_utf8_bytes ( c : char ) -> uint {
234
252
static max_one_b: uint = 128 u;
235
253
static max_two_b: uint = 2048 u;
@@ -244,6 +262,24 @@ pub fn len_utf8_bytes(c: char) -> uint {
244
262
else { fail ! ( "invalid character!" ) }
245
263
}
246
264
265
+ /// Returns the amount of bytes this character would need if encoded in utf8
266
+ #[ cfg( not( stage0) ) ]
267
+ pub fn len_utf8_bytes ( c : char ) -> uint {
268
+ static MAX_ONE_B : uint = 128 u;
269
+ static MAX_TWO_B : uint = 2048 u;
270
+ static MAX_THREE_B : uint = 65536 u;
271
+ static MAX_FOUR_B : uint = 2097152 u;
272
+
273
+ let code = c as uint ;
274
+ cond ! (
275
+ ( code < MAX_ONE_B ) { 1 u }
276
+ ( code < MAX_TWO_B ) { 2 u }
277
+ ( code < MAX_THREE_B ) { 3 u }
278
+ ( code < MAX_FOUR_B ) { 4 u }
279
+ _ { fail!( "invalid character!" ) }
280
+ )
281
+ }
282
+
247
283
pub trait Char {
248
284
fn is_alphabetic ( & self ) -> bool ;
249
285
fn is_XID_start ( & self ) -> bool ;
0 commit comments