1
1
import 'dart:collection' ;
2
+ import 'dart:convert' show ascii, utf8;
2
3
3
4
import 'package:source_span/source_span.dart' ;
4
5
5
- import 'char_encodings.dart' ;
6
6
import 'constants.dart' ;
7
7
import 'encoding_parser.dart' ;
8
8
import 'utils.dart' ;
@@ -66,7 +66,7 @@ class HtmlInputStream {
66
66
this .sourceUrl])
67
67
: charEncodingName = codecName (encoding) {
68
68
if (source is String ) {
69
- _rawChars = toCodepoints ( source);
69
+ _rawChars = source.runes. toList ( );
70
70
charEncodingName = 'utf-8' ;
71
71
charEncodingCertain = true ;
72
72
} else if (source is List <int >) {
@@ -92,7 +92,7 @@ class HtmlInputStream {
92
92
_chars = < int > [];
93
93
94
94
if (_rawChars == null ) {
95
- _rawChars = decodeBytes (charEncodingName, _rawBytes);
95
+ _rawChars = _decodeBytes (charEncodingName, _rawBytes);
96
96
}
97
97
98
98
bool skipNewline = false ;
@@ -177,7 +177,7 @@ class HtmlInputStream {
177
177
/// encoding otherwise return null.
178
178
String detectBOM () {
179
179
// Try detecting the BOM using bytes from the string
180
- if (hasUtf8Bom (_rawBytes)) {
180
+ if (_hasUtf8Bom (_rawBytes)) {
181
181
return 'utf-8' ;
182
182
}
183
183
return null ;
@@ -292,3 +292,32 @@ String codecName(String encoding) {
292
292
var canonicalName = encoding.replaceAll (asciiPunctuation, '' ).toLowerCase ();
293
293
return encodings[canonicalName];
294
294
}
295
+
296
/// Whether [bytes] contains the three-byte UTF-8 signature `EF BB BF` at
/// [offset].
///
/// The inspected region starts at [offset] and spans [length] bytes; when
/// [length] is omitted the region runs to the end of [bytes]. Returns false
/// when the region holds fewer than three bytes.
///
/// UTF-8 has no byte order, so calling this signature a "byte order mark" is
/// somewhat of a misnomer.
bool _hasUtf8Bom(List<int> bytes, [int offset = 0, int length]) {
  final limit = length == null ? bytes.length : offset + length;

  // Not enough room for the three signature bytes.
  if (offset + 3 > limit) return false;

  const signature = [0xEF, 0xBB, 0xBF];
  for (var i = 0; i < signature.length; i++) {
    if (bytes[offset + i] != signature[i]) return false;
  }
  return true;
}
306
+
307
/// Decodes [bytes] using the named [encoding] and returns the resulting
/// Unicode code points.
///
/// Only the `'ascii'` and `'utf-8'` encoding names are supported; any other
/// value (including null) throws an [ArgumentError].
///
/// Bytes that are not valid in the chosen encoding are decoded as U+FFFD
/// (the Unicode replacement character) instead of throwing a
/// [FormatException], so arbitrary input can always be turned into code
/// points.
Iterable<int> _decodeBytes(String encoding, List<int> bytes) {
  switch (encoding) {
    case 'ascii':
      // Bytes above 0x7F are not ASCII; substitute U+FFFD rather than throw.
      return ascii.decode(bytes, allowInvalid: true).runes;

    case 'utf-8':
      // NOTE: `utf8.decode` drops a leading UTF-8 BOM by default, so the BOM
      // never shows up in the returned code points. Malformed sequences are
      // replaced with U+FFFD rather than aborting the decode.
      return utf8.decode(bytes, allowMalformed: true).runes;

    default:
      throw ArgumentError('Encoding $encoding not supported');
  }
}
0 commit comments