From e01856bc37a7b703307c6b4bc5da31e12c118515 Mon Sep 17 00:00:00 2001 From: Keegan McAllister Date: Sat, 21 Mar 2015 15:23:30 -0700 Subject: [PATCH 1/3] Read stdin in noop-tokenize --- examples/noop-tokenize.rs | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/examples/noop-tokenize.rs b/examples/noop-tokenize.rs index 34bde39b..0f5725f2 100644 --- a/examples/noop-tokenize.rs +++ b/examples/noop-tokenize.rs @@ -9,12 +9,12 @@ // Run a single benchmark once. For use with profiling tools. -#![feature(core, test)] +#![feature(test)] extern crate test; extern crate html5ever; -use std::{fs, env}; +use std::io; use std::io::prelude::*; use std::default::Default; @@ -34,15 +34,10 @@ impl TokenSink for Sink { } fn main() { - let mut path = env::current_exe().unwrap(); - path.push("../data/bench/"); - path.push(env::args().nth(1).unwrap().as_slice()); + let mut input = String::new(); + io::stdin().read_to_string(&mut input).unwrap(); - let mut file = fs::File::open(&path).unwrap(); - let mut file_input = String::new(); - file.read_to_string(&mut file_input).unwrap(); - - tokenize_to(Sink, one_input(file_input), TokenizerOpts { + tokenize_to(Sink, one_input(input), TokenizerOpts { profile: true, .. Default::default() }); From 4c865494b9345db0230f967d595cb1f4a91f2141 Mon Sep 17 00:00:00 2001 From: Keegan McAllister Date: Sat, 21 Mar 2015 17:39:51 -0700 Subject: [PATCH 2/3] Rework BeforeAttributeValue tokenizer state --- src/tokenizer/mod.rs | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 09e26182..285e37f1 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -546,6 +546,7 @@ macro_rules! shorthand ( ( $me:ident : create_tag $kind:ident $c:expr ) => ( $me.create_tag($kind, $c); ); ( $me:ident : push_tag $c:expr ) => ( $me.current_tag_name.push($c); ); ( $me:ident : discard_tag ) => ( $me.discard_tag(); ); + ( $me:ident : discard_char ) => ( $me.discard_char(); ); ( $me:ident : push_temp $c:expr ) => ( $me.temp_buf.push($c); ); ( $me:ident : emit_temp ) => ( $me.emit_temp_buf(); ); ( $me:ident : clear_temp ) => ( $me.clear_temp_buf(); ); @@ -630,6 +631,10 @@ macro_rules! get_char ( ($me:expr) => ( unwrap_or_return!($me.get_char(), false) )); +macro_rules! peek ( ($me:expr) => ( + unwrap_or_return!($me.peek(), false) +)); + macro_rules! pop_except_from ( ($me:expr, $set:expr) => ( unwrap_or_return!($me.pop_except_from($set), false) )); @@ -922,18 +927,16 @@ impl Tokenizer { }}, //§ before-attribute-value-state - states::BeforeAttributeValue => loop { match get_char!(self) { - '\t' | '\n' | '\x0C' | ' ' => (), - '"' => go!(self: to AttributeValue DoubleQuoted), - '&' => go!(self: reconsume AttributeValue Unquoted), - '\'' => go!(self: to AttributeValue SingleQuoted), - '\0' => go!(self: error; push_value '\u{fffd}'; to AttributeValue Unquoted), - '>' => go!(self: error; emit_tag Data), - c => { - go_match!(self: c, - '<' , '=' , '`' => error); - go!(self: push_value c; to AttributeValue Unquoted); - } + // Use peek so we can handle the first attr character along with the rest, + // hopefully in the same zero-copy buffer. 
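+            // Whitespace and the quote characters are consumed here with
+            // discard_char; for an unquoted value (and for '&') the first
+            // character is deliberately left in the buffer, so the
+            // AttributeValue Unquoted state can slice it out of the shared
+            // buffer along with the rest of the value.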
+ states::BeforeAttributeValue => loop { match peek!(self) { + '\t' | '\n' | '\r' | '\x0C' | ' ' => go!(self: discard_char), + '"' => go!(self: discard_char; to AttributeValue DoubleQuoted), + '&' => go!(self: to AttributeValue Unquoted), + '\'' => go!(self: discard_char; to AttributeValue SingleQuoted), + '\0' => go!(self: discard_char; error; push_value '\u{fffd}'; to AttributeValue Unquoted), + '>' => go!(self: discard_char; error; emit_tag Data), + _ => go!(self: to AttributeValue Unquoted), }}, //§ attribute-value-(double-quoted)-state From 9ca1de005fef3f65e3914f899c06d759c9283261 Mon Sep 17 00:00:00 2001 From: Keegan McAllister Date: Sat, 21 Mar 2015 20:02:43 -0700 Subject: [PATCH 3/3] Implement zero-copy parsing Based on #60 by cgaebel. --- Cargo.toml | 1 + Makefile.in | 1 - benches/tokenizer.rs | 3 +- ...-tokenize.rs => noop-tokenize-zerocopy.rs} | 12 +- examples/noop-tree-builder.rs | 6 +- examples/print-tree-actions.rs | 6 +- macros/src/lib.rs | 2 +- src/driver.rs | 59 +- src/for_c/common.rs | 8 + src/for_c/tokenizer.rs | 3 +- src/lib.rs | 6 +- src/sink/common.rs | 8 +- src/sink/owned_dom.rs | 6 +- src/sink/rcdom.rs | 6 +- src/tokenizer/buffer_queue.rs | 61 +- src/tokenizer/char_ref/mod.rs | 15 +- src/tokenizer/interface.rs | 14 +- src/tokenizer/mod.rs | 120 +-- src/tree_builder/actions.rs | 18 +- src/tree_builder/data.rs | 18 +- src/tree_builder/interface.rs | 9 +- src/tree_builder/mod.rs | 29 +- src/tree_builder/rules.rs | 4 +- src/tree_builder/types.rs | 9 +- src/util/smallcharset.rs | 10 +- src/util/str.rs | 51 +- src/util/tendril.rs | 922 ++++++++++++++++++ tests/tokenizer.rs | 67 +- 28 files changed, 1191 insertions(+), 283 deletions(-) rename examples/{noop-tokenize.rs => noop-tokenize-zerocopy.rs} (76%) create mode 100644 src/util/tendril.rs diff --git a/Cargo.toml b/Cargo.toml index 1b453468..b0dd68b4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,6 +9,7 @@ phf = "0" phf_macros = "0" time = "0" log = "0" +iobuf = "5" [dependencies.string_cache] git = "https://github.com/servo/string-cache" diff --git a/Makefile.in b/Makefile.in index 738a118a..364a9240 100644 --- a/Makefile.in +++ b/Makefile.in @@ -14,7 +14,6 @@ RUST_DIRS := -L $(VPATH)/target/debug -L $(VPATH)/target/debug/deps RUSTC_CMD := $(RUSTC) -D warnings -C rpath $(RUST_DIRS) \ --extern time=`find $(VPATH)/target/debug/deps -name 'libtime-*.rlib'` \ - --extern log=`find $(VPATH)/target/debug/deps -name 'liblog-*.rlib'` \ $(RUSTFLAGS) # We build the library itself using Cargo. diff --git a/benches/tokenizer.rs b/benches/tokenizer.rs index 2c5a6d84..ed5073b6 100644 --- a/benches/tokenizer.rs +++ b/benches/tokenizer.rs @@ -7,7 +7,7 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -#![feature(box_syntax, core, std_misc, start, test, io, path)] +#![feature(box_syntax, core, std_misc, start, test)] extern crate test; extern crate html5ever; @@ -21,6 +21,7 @@ use test::{black_box, Bencher, TestDesc, TestDescAndFn}; use test::{DynTestName, DynBenchFn, TDynBenchFn}; use test::ShouldPanic::No; +use html5ever::Tendril; use html5ever::tokenizer::{TokenSink, Token, Tokenizer, TokenizerOpts}; struct Sink; diff --git a/examples/noop-tokenize.rs b/examples/noop-tokenize-zerocopy.rs similarity index 76% rename from examples/noop-tokenize.rs rename to examples/noop-tokenize-zerocopy.rs index 0f5725f2..e495c135 100644 --- a/examples/noop-tokenize.rs +++ b/examples/noop-tokenize-zerocopy.rs @@ -7,21 +7,19 @@ // option. 
This file may not be copied, modified, or distributed // except according to those terms. -// Run a single benchmark once. For use with profiling tools. - #![feature(test)] extern crate test; extern crate html5ever; use std::io; -use std::io::prelude::*; use std::default::Default; use test::black_box; +use html5ever::TendrilReader; use html5ever::tokenizer::{TokenSink, Token, TokenizerOpts}; -use html5ever::driver::{tokenize_to, one_input}; +use html5ever::driver::tokenize_to; struct Sink; @@ -34,10 +32,10 @@ impl TokenSink for Sink { } fn main() { - let mut input = String::new(); - io::stdin().read_to_string(&mut input).unwrap(); + let reader = TendrilReader::from_utf8(16384, io::stdin()) + .map(|r| r.unwrap()); - tokenize_to(Sink, one_input(input), TokenizerOpts { + tokenize_to(Sink, reader, TokenizerOpts { profile: true, .. Default::default() }); diff --git a/examples/noop-tree-builder.rs b/examples/noop-tree-builder.rs index ce1306b8..491d401e 100644 --- a/examples/noop-tree-builder.rs +++ b/examples/noop-tree-builder.rs @@ -18,7 +18,7 @@ use std::collections::HashMap; use std::borrow::Cow; use string_cache::QualName; -use html5ever::{parse_to, one_input}; +use html5ever::{parse_to, one_input, Tendril}; use html5ever::tokenizer::Attribute; use html5ever::tree_builder::{TreeSink, QuirksMode, NodeOrText}; @@ -56,7 +56,7 @@ impl TreeSink for Sink { id } - fn create_comment(&mut self, _text: String) -> usize { + fn create_comment(&mut self, _text: Tendril) -> usize { self.get_id() } @@ -72,7 +72,7 @@ impl TreeSink for Sink { fn set_quirks_mode(&mut self, _mode: QuirksMode) { } fn append(&mut self, _parent: usize, _child: NodeOrText) { } - fn append_doctype_to_document(&mut self, _name: String, _public_id: String, _system_id: String) { } + fn append_doctype_to_document(&mut self, _name: Tendril, _public_id: Tendril, _system_id: Tendril) { } fn add_attrs_if_missing(&mut self, _target: usize, _attrs: Vec) { } fn remove_from_parent(&mut self, _target: usize) { } fn reparent_children(&mut self, _node: usize, _new_parent: usize) { } diff --git a/examples/print-tree-actions.rs b/examples/print-tree-actions.rs index 0da21099..ae102224 100644 --- a/examples/print-tree-actions.rs +++ b/examples/print-tree-actions.rs @@ -20,7 +20,7 @@ use std::collections::HashMap; use std::borrow::Cow; use string_cache::QualName; -use html5ever::{parse_to, one_input}; +use html5ever::{parse_to, one_input, Tendril}; use html5ever::tokenizer::Attribute; use html5ever::tree_builder::{TreeSink, QuirksMode, NodeOrText, AppendNode, AppendText}; @@ -67,7 +67,7 @@ impl TreeSink for Sink { id } - fn create_comment(&mut self, text: String) -> usize { + fn create_comment(&mut self, text: Tendril) -> usize { let id = self.get_id(); println!("Created comment \"{}\" as {}", text.escape_default(), id); id @@ -97,7 +97,7 @@ impl TreeSink for Sink { Ok(()) } - fn append_doctype_to_document(&mut self, name: String, public_id: String, system_id: String) { + fn append_doctype_to_document(&mut self, name: Tendril, public_id: Tendril, system_id: Tendril) { println!("Append doctype: {} {} {}", name, public_id, system_id); } diff --git a/macros/src/lib.rs b/macros/src/lib.rs index fd76e10e..ba41fd0d 100644 --- a/macros/src/lib.rs +++ b/macros/src/lib.rs @@ -11,7 +11,7 @@ #![crate_type="dylib"] #![feature(plugin_registrar, quote)] -#![feature(rustc_private, core, std_misc)] +#![feature(rustc_private, core, std_misc, str_char)] #![deny(warnings)] extern crate syntax; diff --git a/src/driver.rs b/src/driver.rs index d81e9897..1059115b 100644 
--- a/src/driver.rs
+++ b/src/driver.rs
@@ -11,17 +11,17 @@
 
 use core::prelude::*;
 
+use util::tendril::IntoTendril;
 use tokenizer::{TokenizerOpts, Tokenizer, TokenSink};
 use tree_builder::{TreeBuilderOpts, TreeBuilder, TreeSink};
 
 use core::default::Default;
 use core::option;
-use collections::string::String;
 
 use string_cache::{Atom, QualName};
 
-/// Convenience function to turn a single `String` into an iterator.
-pub fn one_input(x: String) -> option::IntoIter<String> {
+/// Convenience function to turn a single value into an iterator.
+pub fn one_input<T>(x: T) -> option::IntoIter<T> {
     Some(x).into_iter()
 }
 
@@ -33,14 +33,11 @@ pub fn one_input(x: String) -> option::IntoIter<String> {
 /// let mut sink = MySink;
 /// tokenize_to(&mut sink, one_input(my_str), Default::default());
 /// ```
-pub fn tokenize_to<
-    Sink: TokenSink,
-    It: Iterator<Item=String>
->(
-    sink: Sink,
-    input: It,
-    opts: TokenizerOpts) -> Sink {
-
+pub fn tokenize_to<Sink, T, It>(sink: Sink, input: It, opts: TokenizerOpts) -> Sink
+    where Sink: TokenSink,
+          T: IntoTendril,
+          It: Iterator<Item=T>,
+{
     let mut tok = Tokenizer::new(sink, opts);
     for s in input {
         tok.feed(s);
@@ -67,14 +64,11 @@ pub struct ParseOpts {
 /// let mut sink = MySink;
 /// parse_to(&mut sink, one_input(my_str), Default::default());
 /// ```
-pub fn parse_to<
-    Sink: TreeSink,
-    It: Iterator<Item=String>
->(
-    sink: Sink,
-    input: It,
-    opts: ParseOpts) -> Sink {
-
+pub fn parse_to<Sink, T, It>(sink: Sink, input: It, opts: ParseOpts) -> Sink
+    where Sink: TreeSink,
+          T: IntoTendril,
+          It: Iterator<Item=T>,
+{
     let tb = TreeBuilder::new(sink, opts.tree_builder);
     let mut tok = Tokenizer::new(tb, opts.tokenizer);
     for s in input {
@@ -92,15 +86,14 @@
 /// let mut sink = MySink;
 /// parse_fragment_to(&mut sink, one_input(my_str), context_token, Default::default());
 /// ```
-pub fn parse_fragment_to<
-    Sink: TreeSink,
-    It: Iterator<Item=String>
->(
-    sink: Sink,
-    input: It,
-    context: Atom,
-    opts: ParseOpts) -> Sink {
-
+pub fn parse_fragment_to<Sink, T, It>(sink: Sink,
+                                      input: It,
+                                      context: Atom,
+                                      opts: ParseOpts) -> Sink
+    where Sink: TreeSink,
+          T: IntoTendril,
+          It: Iterator<Item=T>
+{
     let mut sink = sink;
     let context_elem = sink.create_element(QualName::new(ns!(HTML), context), vec!());
     let tb = TreeBuilder::new_for_fragment(sink, context_elem, None, opts.tree_builder);
@@ -132,9 +125,10 @@ pub trait ParseResult {
 /// ```ignore
 /// let dom: RcDom = parse(one_input(my_str), Default::default());
 /// ```
-pub fn parse<Output, It>(input: It, opts: ParseOpts) -> Output
+pub fn parse<Output, T, It>(input: It, opts: ParseOpts) -> Output
     where Output: ParseResult,
-          It: Iterator<Item=String>,
+          T: IntoTendril,
+          It: Iterator<Item=T>,
 {
     let sink = parse_to(Default::default(), input, opts);
     ParseResult::get_result(sink)
@@ -147,9 +141,10 @@ pub fn parse<Output, It>(input: It, opts: ParseOpts) -> Output
 /// ```ignore
 /// let dom: RcDom = parse_fragment(one_input(my_str), context_token, Default::default());
 /// ```
-pub fn parse_fragment<Output, It>(input: It, context: Atom, opts: ParseOpts) -> Output
+pub fn parse_fragment<Output, T, It>(input: It, context: Atom, opts: ParseOpts) -> Output
     where Output: ParseResult,
-          It: Iterator<Item=String>,
+          T: IntoTendril,
+          It: Iterator<Item=T>,
 {
     let sink = parse_fragment_to(Default::default(), input, context, opts);
     ParseResult::get_result(sink)
diff --git a/src/for_c/common.rs b/src/for_c/common.rs
index adad27cc..601df041 100644
--- a/src/for_c/common.rs
+++ b/src/for_c/common.rs
@@ -20,6 +20,8 @@ use libc::{size_t, c_int, c_char, strlen};
 
 use string_cache::Atom;
 
+use util::tendril::Tendril;
+
 #[repr(C)]
 pub struct h5e_buf {
     data: *const u8,
@@ -82,6 +84,12 @@ impl AsLifetimeBuf for String {
     }
 }
 
+impl AsLifetimeBuf for
Tendril { + fn as_lifetime_buf<'a>(&'a self) -> LifetimeBuf<'a> { + LifetimeBuf::from_str(self.as_slice()) + } +} + impl AsLifetimeBuf for Atom { fn as_lifetime_buf<'a>(&'a self) -> LifetimeBuf<'a> { LifetimeBuf::from_str(self.as_slice()) diff --git a/src/for_c/tokenizer.rs b/src/for_c/tokenizer.rs index 8bea595e..0d4b9905 100644 --- a/src/for_c/tokenizer.rs +++ b/src/for_c/tokenizer.rs @@ -11,6 +11,7 @@ use core::prelude::*; +use util::tendril::Tendril; use for_c::common::{LifetimeBuf, AsLifetimeBuf, h5e_buf, c_bool}; use tokenizer::{TokenSink, Token, Doctype, Tag, ParseError, DoctypeToken}; @@ -64,7 +65,7 @@ impl TokenSink for *mut h5e_token_sink { ($name:ident) => (call!($name,)); // bleh } - fn opt_str_to_buf<'a>(s: &'a Option) -> LifetimeBuf<'a> { + fn opt_str_to_buf<'a>(s: &'a Option) -> LifetimeBuf<'a> { match *s { None => LifetimeBuf::null(), Some(ref s) => s.as_lifetime_buf(), diff --git a/src/lib.rs b/src/lib.rs index 63f3fd3d..01f5dd0a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -10,7 +10,7 @@ #![crate_name="html5ever"] #![crate_type="dylib"] -#![feature(plugin, box_syntax, no_std, core, collections, alloc)] +#![feature(plugin, box_syntax, no_std, core, collections, alloc, str_char)] #![deny(warnings)] #![allow(unused_parens)] @@ -49,6 +49,9 @@ extern crate phf; extern crate time; +extern crate iobuf; + +pub use util::tendril::{Tendril, TendrilReader, TendrilReaderError, IntoTendril}; pub use tokenizer::Attribute; pub use driver::{one_input, ParseOpts, parse_to, parse_fragment_to, parse, parse_fragment}; @@ -61,6 +64,7 @@ mod macros; #[macro_use] mod util { pub mod str; + pub mod tendril; #[macro_use] pub mod smallcharset; } diff --git a/src/sink/common.rs b/src/sink/common.rs index 54b1f7bd..9271e911 100644 --- a/src/sink/common.rs +++ b/src/sink/common.rs @@ -7,10 +7,10 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. +use util::tendril::Tendril; use tokenizer::Attribute; use collections::vec::Vec; -use collections::string::String; use string_cache::QualName; pub use self::NodeEnum::{Document, Doctype, Text, Comment, Element}; @@ -22,13 +22,13 @@ pub enum NodeEnum { Document, /// A `DOCTYPE` with name, public id, and system id. - Doctype(String, String, String), + Doctype(Tendril, Tendril, Tendril), /// A text node. - Text(String), + Text(Tendril), /// A comment. - Comment(String), + Comment(Tendril), /// An element with attributes. 
Element(QualName, Vec), diff --git a/src/sink/owned_dom.rs b/src/sink/owned_dom.rs index ce09d540..b24c4d22 100644 --- a/src/sink/owned_dom.rs +++ b/src/sink/owned_dom.rs @@ -23,6 +23,7 @@ use core::prelude::*; use sink::common::{NodeEnum, Document, Doctype, Text, Comment, Element}; +use util::tendril::Tendril; use tokenizer::Attribute; use tree_builder::{TreeSink, QuirksMode, NodeOrText, AppendNode, AppendText}; use tree_builder; @@ -38,7 +39,6 @@ use core::mem; use core::ptr; use alloc::boxed::Box; use collections::vec::Vec; -use collections::string::String; use std::borrow::Cow; use std::io::{self, Write}; use std::collections::HashSet; @@ -215,7 +215,7 @@ impl TreeSink for Sink { self.new_node(Element(name, attrs)) } - fn create_comment(&mut self, text: String) -> Handle { + fn create_comment(&mut self, text: Tendril) -> Handle { self.new_node(Comment(text)) } @@ -269,7 +269,7 @@ impl TreeSink for Sink { Ok(()) } - fn append_doctype_to_document(&mut self, name: String, public_id: String, system_id: String) { + fn append_doctype_to_document(&mut self, name: Tendril, public_id: Tendril, system_id: Tendril) { append(self.document, self.new_node(Doctype(name, public_id, system_id))); } diff --git a/src/sink/rcdom.rs b/src/sink/rcdom.rs index 6e4eeb77..f0e81716 100644 --- a/src/sink/rcdom.rs +++ b/src/sink/rcdom.rs @@ -16,6 +16,7 @@ use core::prelude::*; use sink::common::{NodeEnum, Document, Doctype, Text, Comment, Element}; +use util::tendril::Tendril; use tokenizer::Attribute; use tree_builder::{TreeSink, QuirksMode, NodeOrText, AppendNode, AppendText}; use tree_builder; @@ -28,7 +29,6 @@ use core::cell::RefCell; use core::default::Default; use alloc::rc::{Rc, Weak}; use collections::vec::Vec; -use collections::string::String; use std::borrow::Cow; use std::io::{self, Write}; use std::ops::DerefMut; @@ -156,7 +156,7 @@ impl TreeSink for RcDom { new_node(Element(name, attrs)) } - fn create_comment(&mut self, text: String) -> Handle { + fn create_comment(&mut self, text: Tendril) -> Handle { new_node(Comment(text)) } @@ -211,7 +211,7 @@ impl TreeSink for RcDom { Ok(()) } - fn append_doctype_to_document(&mut self, name: String, public_id: String, system_id: String) { + fn append_doctype_to_document(&mut self, name: Tendril, public_id: Tendril, system_id: Tendril) { append(&self.document, new_node(Doctype(name, public_id, system_id))); } diff --git a/src/tokenizer/buffer_queue.rs b/src/tokenizer/buffer_queue.rs index d73253a8..d798f85c 100644 --- a/src/tokenizer/buffer_queue.rs +++ b/src/tokenizer/buffer_queue.rs @@ -10,6 +10,7 @@ use core::prelude::*; use util::str::AsciiCast; +use util::tendril::Tendril; use util::smallcharset::SmallCharSet; use core::str::CharRange; @@ -20,16 +21,16 @@ pub use self::SetResult::{FromSet, NotFromSet}; struct Buffer { /// Byte position within the buffer. - pub pos: usize, + pub pos: u32, /// The buffer. - pub buf: String, + pub buf: Tendril, } /// Result from `pop_except_from`. #[derive(PartialEq, Eq, Debug)] pub enum SetResult { FromSet(char), - NotFromSet(String), + NotFromSet(Tendril), } /// A queue of owned string buffers, which supports incrementally @@ -43,12 +44,12 @@ impl BufferQueue { /// Create an empty BufferQueue. pub fn new() -> BufferQueue { BufferQueue { - buffers: VecDeque::with_capacity(3), + buffers: VecDeque::with_capacity(16), } } /// Add a buffer to the beginning of the queue. 
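+    ///
+    /// A usage sketch (hypothetical doc example; it mirrors the
+    /// `can_unconsume` test below):
+    ///
+    /// ```ignore
+    /// let mut bq = BufferQueue::new();
+    /// bq.push_back(Tendril::owned_copy("abc"), 0);
+    /// assert_eq!(bq.next(), Some('a'));
+    /// bq.push_front(Tendril::owned_copy("xy"));
+    /// assert_eq!(bq.next(), Some('x'));
+    /// assert_eq!(bq.next(), Some('y'));
+    /// assert_eq!(bq.next(), Some('b'));
+    /// ```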
- pub fn push_front(&mut self, buf: String) { + pub fn push_front(&mut self, buf: Tendril) { if buf.len() == 0 { return; } @@ -61,8 +62,8 @@ impl BufferQueue { /// Add a buffer to the end of the queue. /// 'pos' can be non-zero to remove that many bytes /// from the beginning. - pub fn push_back(&mut self, buf: String, pos: usize) { - if pos >= buf.len() { + pub fn push_back(&mut self, buf: Tendril, pos: u32) { + if pos as usize >= buf.len() { return; } self.buffers.push_back(Buffer { @@ -74,7 +75,7 @@ impl BufferQueue { /// Look at the next available character, if any. pub fn peek(&mut self) -> Option { match self.buffers.front() { - Some(&Buffer { pos, ref buf }) => Some(buf.as_slice().char_at(pos)), + Some(&Buffer { pos, ref buf }) => Some(buf.as_slice().char_at(pos as usize)), None => None, } } @@ -84,8 +85,8 @@ impl BufferQueue { let (result, now_empty) = match self.buffers.front_mut() { None => (None, false), Some(&mut Buffer { ref mut pos, ref buf }) => { - let CharRange { ch, next } = buf.as_slice().char_range_at(*pos); - *pos = next; + let CharRange { ch, next } = buf.as_slice().char_range_at(*pos as usize); + *pos = next as u32; (Some(ch), next >= buf.len()) } }; @@ -98,22 +99,24 @@ impl BufferQueue { } /// Pops and returns either a single character from the given set, or - /// a `String` of characters none of which are in the set. The set + /// a `Tendril` of characters none of which are in the set. The set /// is represented as a bitmask and so can only contain the first 64 /// ASCII characters. pub fn pop_except_from(&mut self, set: SmallCharSet) -> Option { let (result, now_empty) = match self.buffers.front_mut() { Some(&mut Buffer { ref mut pos, ref buf }) => { - let n = set.nonmember_prefix_len(&buf[*pos..]); + let n = set.nonmember_prefix_len(&buf[*pos as usize..]); if n > 0 { let new_pos = *pos + n; - let out = String::from_str(&buf[*pos..new_pos]); + let out = unsafe { + buf.subtendril(*pos, new_pos) + }; *pos = new_pos; - (Some(NotFromSet(out)), new_pos >= buf.len()) + (Some(NotFromSet(out)), new_pos as usize >= buf.len()) } else { - let CharRange { ch, next } = buf.as_slice().char_range_at(*pos); - *pos = next; - (Some(FromSet(ch)), next >= buf.len()) + let CharRange { ch, next } = buf.as_slice().char_range_at(*pos as usize); + *pos = next as u32; + (Some(FromSet(ch)), next as usize >= buf.len()) } } _ => (None, false), @@ -146,7 +149,7 @@ impl BufferQueue { } let ref buf = self.buffers[buffers_exhausted]; - let d = buf.buf.as_slice().char_at(consumed_from_last); + let d = buf.buf.as_slice().char_at(consumed_from_last as usize); match (c.to_ascii_opt(), d.to_ascii_opt()) { (Some(c), Some(d)) => if c.eq_ignore_case(d) { () } else { return Some(false) }, _ => return Some(false), @@ -154,7 +157,7 @@ impl BufferQueue { // d was an ASCII character; size must be 1 byte consumed_from_last += 1; - if consumed_from_last >= buf.buf.len() { + if consumed_from_last as usize >= buf.buf.len() { buffers_exhausted += 1; consumed_from_last = 0; } @@ -178,7 +181,7 @@ impl BufferQueue { #[allow(non_snake_case)] mod test { use core::prelude::*; - use collections::string::String; + use util::tendril::Tendril; use super::{BufferQueue, FromSet, NotFromSet}; #[test] @@ -187,7 +190,7 @@ mod test { assert_eq!(bq.peek(), None); assert_eq!(bq.next(), None); - bq.push_back(String::from_str("abc"), 0); + bq.push_back(Tendril::owned_copy("abc"), 0); assert_eq!(bq.peek(), Some('a')); assert_eq!(bq.next(), Some('a')); assert_eq!(bq.peek(), Some('b')); @@ -202,10 +205,10 @@ mod test { #[test] fn 
can_unconsume() { let mut bq = BufferQueue::new(); - bq.push_back(String::from_str("abc"), 0); + bq.push_back(Tendril::owned_copy("abc"), 0); assert_eq!(bq.next(), Some('a')); - bq.push_front(String::from_str("xy")); + bq.push_front(Tendril::owned_copy("xy")); assert_eq!(bq.next(), Some('x')); assert_eq!(bq.next(), Some('y')); assert_eq!(bq.next(), Some('b')); @@ -216,18 +219,18 @@ mod test { #[test] fn can_pop_except_set() { let mut bq = BufferQueue::new(); - bq.push_back(String::from_str("abc&def"), 0); + bq.push_back(Tendril::owned_copy("abc&def"), 0); let mut pop = || bq.pop_except_from(small_char_set!('&')); - assert_eq!(pop(), Some(NotFromSet(String::from_str("abc")))); + assert_eq!(pop(), Some(NotFromSet(Tendril::owned_copy("abc")))); assert_eq!(pop(), Some(FromSet('&'))); - assert_eq!(pop(), Some(NotFromSet(String::from_str("def")))); + assert_eq!(pop(), Some(NotFromSet(Tendril::owned_copy("def")))); assert_eq!(pop(), None); } #[test] fn can_push_truncated() { let mut bq = BufferQueue::new(); - bq.push_back(String::from_str("abc"), 1); + bq.push_back(Tendril::owned_copy("abc"), 1); assert_eq!(bq.next(), Some('b')); assert_eq!(bq.next(), Some('c')); assert_eq!(bq.next(), None); @@ -239,8 +242,8 @@ mod test { // integration tests for more thorough testing with many // different input buffer splits. let mut bq = BufferQueue::new(); - bq.push_back(String::from_str("a"), 0); - bq.push_back(String::from_str("bc"), 0); + bq.push_back(Tendril::owned_copy("a"), 0); + bq.push_back(Tendril::owned_copy("bc"), 0); assert_eq!(bq.eat("abcd"), None); assert_eq!(bq.eat("ax"), Some(false)); assert_eq!(bq.eat("ab"), Some(true)); diff --git a/src/tokenizer/char_ref/mod.rs b/src/tokenizer/char_ref/mod.rs index 619f3b79..ce788270 100644 --- a/src/tokenizer/char_ref/mod.rs +++ b/src/tokenizer/char_ref/mod.rs @@ -9,9 +9,10 @@ use core::prelude::*; -use super::{Tokenizer, TokenSink}; +use super::{Tokenizer, TokenSink, name_buffer}; -use util::str::{is_ascii_alnum, empty_str}; +use util::str::{is_ascii_alnum}; +use util::tendril::Tendril; use core::char::from_u32; use std::borrow::Cow::Borrowed; @@ -145,7 +146,7 @@ impl CharRefTokenizer { _ => { self.state = Named; - self.name_buf_opt = Some(empty_str()); + self.name_buf_opt = Some(name_buffer()); Progress } } @@ -204,7 +205,7 @@ impl CharRefTokenizer { } fn unconsume_numeric(&mut self, tokenizer: &mut Tokenizer) -> Status { - let mut unconsume = String::from_str("#"); + let mut unconsume = Tendril::from_char('#'); match self.hex_marker { Some(c) => unconsume.push(c), None => (), @@ -276,7 +277,7 @@ impl CharRefTokenizer { } fn unconsume_name(&mut self, tokenizer: &mut Tokenizer) { - tokenizer.unconsume(self.name_buf_opt.take().unwrap()); + tokenizer.unconsume(Tendril::owned(self.name_buf_opt.take().unwrap())); } fn finish_named(&mut self, @@ -350,7 +351,7 @@ impl CharRefTokenizer { self.unconsume_name(tokenizer); self.finish_none() } else { - tokenizer.unconsume(String::from_str(&self.name_buf()[name_len..])); + tokenizer.unconsume(Tendril::owned_copy(&self.name_buf()[name_len..])); self.result = Some(CharRef { chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()], num_chars: if c2 == 0 { 1 } else { 2 }, @@ -394,7 +395,7 @@ impl CharRefTokenizer { } Octothorpe => { - tokenizer.unconsume(String::from_str("#")); + tokenizer.unconsume(Tendril::owned_copy("#")); tokenizer.emit_error(Borrowed("EOF after '#' in character reference")); self.finish_none(); } diff --git a/src/tokenizer/interface.rs b/src/tokenizer/interface.rs index d9b86d9c..a3120dab 100644 --- 
a/src/tokenizer/interface.rs +++ b/src/tokenizer/interface.rs @@ -19,6 +19,8 @@ use std::marker::Send; use string_cache::{Atom, QualName}; +use util::tendril::Tendril; + pub use self::TagKind::{StartTag, EndTag}; pub use self::Token::{DoctypeToken, TagToken, CommentToken, CharacterTokens}; pub use self::Token::{NullCharacterToken, EOFToken, ParseError}; @@ -27,9 +29,9 @@ pub use self::Token::{NullCharacterToken, EOFToken, ParseError}; // FIXME: already exists in Servo DOM #[derive(PartialEq, Eq, Clone, Debug)] pub struct Doctype { - pub name: Option, - pub public_id: Option, - pub system_id: Option, + pub name: Option, + pub public_id: Option, + pub system_id: Option, pub force_quirks: bool, } @@ -53,7 +55,7 @@ impl Doctype { #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug)] pub struct Attribute { pub name: QualName, - pub value: String, + pub value: Tendril, } #[derive(PartialEq, Eq, Hash, Copy, Clone, Debug)] @@ -92,8 +94,8 @@ impl Tag { pub enum Token { DoctypeToken(Doctype), TagToken(Tag), - CommentToken(String), - CharacterTokens(String), + CommentToken(Tendril), + CharacterTokens(Tendril), NullCharacterToken, EOFToken, ParseError(Cow<'static, str>), diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 285e37f1..adfef19b 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -32,8 +32,9 @@ use self::char_ref::{CharRef, CharRefTokenizer}; use self::buffer_queue::{BufferQueue, SetResult, FromSet, NotFromSet}; -use util::str::{lower_ascii, lower_ascii_letter, empty_str}; +use util::str::{lower_ascii, lower_ascii_letter}; use util::smallcharset::SmallCharSet; +use util::tendril::{Tendril, IntoTendril}; use core::mem::replace; use core::default::Default; @@ -50,19 +51,16 @@ mod interface; mod char_ref; mod buffer_queue; -fn option_push(opt_str: &mut Option, c: char) { +fn option_push(opt_str: &mut Option, c: char) { match *opt_str { Some(ref mut s) => s.push(c), - None => *opt_str = Some(c.to_string()), + None => *opt_str = Some(Tendril::from_char(c)), } } -fn append_strings(lhs: &mut String, rhs: String) { - if lhs.is_empty() { - *lhs = rhs; - } else { - lhs.push_str(rhs.as_slice()); - } +/// Pre-allocate a string which will hold a tag or attribute name. +fn name_buffer() -> String { + String::with_capacity(12) } /// Tokenizer options, with an impl for `Default`. @@ -153,10 +151,10 @@ pub struct Tokenizer { current_attr_name: String, /// Current attribute value. - current_attr_value: String, + current_attr_value: Tendril, /// Current comment. - current_comment: String, + current_comment: Tendril, /// Current doctype token. current_doctype: Doctype, @@ -165,7 +163,7 @@ pub struct Tokenizer { last_start_tag_name: Option, /// The "temporary buffer" mentioned in the spec. - temp_buf: String, + temp_buf: Tendril, /// Record of how many ns we spent in each state, if profiling is enabled. 
state_profile: BTreeMap, @@ -197,15 +195,15 @@ impl Tokenizer { ignore_lf: false, discard_bom: discard_bom, current_tag_kind: StartTag, - current_tag_name: empty_str(), + current_tag_name: name_buffer(), current_tag_self_closing: false, current_tag_attrs: vec!(), - current_attr_name: empty_str(), - current_attr_value: empty_str(), - current_comment: empty_str(), + current_attr_name: name_buffer(), + current_attr_value: Tendril::new(), + current_comment: Tendril::new(), current_doctype: Doctype::new(), last_start_tag_name: start_tag_name, - temp_buf: empty_str(), + temp_buf: Tendril::new(), state_profile: BTreeMap::new(), time_in_sink: 0, } @@ -224,12 +222,15 @@ impl Tokenizer { } /// Feed an input string into the tokenizer. - pub fn feed(&mut self, input: String) { - if input.len() == 0 { + pub fn feed(&mut self, input: T) + where T: IntoTendril, + { + let input = input.into_tendril(); + if input.is_empty() { return; } - let pos = if self.discard_bom && input.as_slice().char_at(0) == '\u{feff}' { + let pos = if self.discard_bom && input.char_at(0) == '\u{feff}' { self.discard_bom = false; 3 // length of BOM in UTF-8 } else { @@ -372,20 +373,20 @@ impl Tokenizer { fn emit_char(&mut self, c: char) { self.process_token(match c { '\0' => NullCharacterToken, - _ => CharacterTokens(c.to_string()), + _ => CharacterTokens(Tendril::from_char(c)), }); } // The string must not contain '\0'! - fn emit_chars(&mut self, b: String) { + fn emit_chars(&mut self, b: Tendril) { self.process_token(CharacterTokens(b)); } fn emit_current_tag(&mut self) { self.finish_attribute(); - let name = replace(&mut self.current_tag_name, String::new()); - let name = Atom::from_slice(name.as_slice()); + let name = Atom::from_slice(&self.current_tag_name); + self.current_tag_name.truncate(0); match self.current_tag_kind { StartTag => { @@ -418,22 +419,22 @@ impl Tokenizer { fn emit_temp_buf(&mut self) { // FIXME: Make sure that clearing on emit is spec-compatible. - let buf = replace(&mut self.temp_buf, empty_str()); + let buf = replace(&mut self.temp_buf, Tendril::new()); self.emit_chars(buf); } fn clear_temp_buf(&mut self) { // Do this without a new allocation. - self.temp_buf.truncate(0); + self.temp_buf.clear(); } fn emit_current_comment(&mut self) { - let comment = replace(&mut self.current_comment, empty_str()); + let comment = replace(&mut self.current_comment, Tendril::new()); self.process_token(CommentToken(comment)); } fn discard_tag(&mut self) { - self.current_tag_name = String::new(); + self.current_tag_name.truncate(0); self.current_tag_self_closing = false; self.current_tag_attrs = vec!(); } @@ -468,21 +469,22 @@ impl Tokenizer { // FIXME: the spec says we should error as soon as the name is finished. // FIXME: linear time search, do we care? let dup = { - let name = self.current_attr_name.as_slice(); + let name = &*self.current_attr_name; self.current_tag_attrs.iter().any(|a| a.name.local.as_slice() == name) }; if dup { self.emit_error(Borrowed("Duplicate attribute")); self.current_attr_name.truncate(0); - self.current_attr_value.truncate(0); + self.current_attr_value.clear(); } else { - let name = replace(&mut self.current_attr_name, String::new()); + let name = Atom::from_slice(&self.current_attr_name); + self.current_attr_name.truncate(0); self.current_tag_attrs.push(Attribute { // The tree builder will adjust the namespace if necessary. // This only happens in foreign elements. 
- name: QualName::new(ns!(""), Atom::from_slice(name.as_slice())), - value: replace(&mut self.current_attr_value, empty_str()), + name: QualName::new(ns!(""), name), + value: replace(&mut self.current_attr_value, Tendril::new()), }); } } @@ -492,7 +494,7 @@ impl Tokenizer { self.process_token(DoctypeToken(doctype)); } - fn doctype_id<'a>(&'a mut self, kind: DoctypeIdKind) -> &'a mut Option { + fn doctype_id<'a>(&'a mut self, kind: DoctypeIdKind) -> &'a mut Option { match kind { Public => &mut self.current_doctype.public_id, System => &mut self.current_doctype.system_id, @@ -502,8 +504,8 @@ impl Tokenizer { fn clear_doctype_id(&mut self, kind: DoctypeIdKind) { let id = self.doctype_id(kind); match *id { - Some(ref mut s) => s.truncate(0), - None => *id = Some(empty_str()), + Some(ref mut s) => s.clear(), + None => *id = Some(Tendril::new()), } } @@ -530,7 +532,7 @@ impl Tokenizer { assert!(c.is_some()); } - fn unconsume(&mut self, buf: String) { + fn unconsume(&mut self, buf: Tendril) { self.input_buffers.push_front(buf); } @@ -553,11 +555,11 @@ macro_rules! shorthand ( ( $me:ident : create_attr $c:expr ) => ( $me.create_attribute($c); ); ( $me:ident : push_name $c:expr ) => ( $me.current_attr_name.push($c); ); ( $me:ident : push_value $c:expr ) => ( $me.current_attr_value.push($c); ); - ( $me:ident : append_value $c:expr ) => ( append_strings(&mut $me.current_attr_value, $c); ); + ( $me:ident : append_value $c:expr ) => ( $me.current_attr_value.push_tendril($c); ); ( $me:ident : push_comment $c:expr ) => ( $me.current_comment.push($c); ); ( $me:ident : append_comment $c:expr ) => ( $me.current_comment.push_str($c); ); ( $me:ident : emit_comment ) => ( $me.emit_current_comment(); ); - ( $me:ident : clear_comment ) => ( $me.current_comment.truncate(0); ); + ( $me:ident : clear_comment ) => ( $me.current_comment.clear(); ); ( $me:ident : create_doctype ) => ( $me.current_doctype = Doctype::new(); ); ( $me:ident : push_doctype_name $c:expr ) => ( option_push(&mut $me.current_doctype.name, $c); ); ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push($me.doctype_id($k), $c); ); @@ -815,7 +817,7 @@ impl Tokenizer { let c = get_char!(self); match c { '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => { - let esc = if self.temp_buf.as_slice() == "script" { DoubleEscaped } else { Escaped }; + let esc = if &*self.temp_buf == "script" { DoubleEscaped } else { Escaped }; go!(self: emit c; to RawData ScriptDataEscaped esc); } _ => match lower_ascii_letter(c) { @@ -865,7 +867,7 @@ impl Tokenizer { let c = get_char!(self); match c { '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => { - let esc = if self.temp_buf.as_slice() == "script" { Escaped } else { DoubleEscaped }; + let esc = if &*self.temp_buf == "script" { Escaped } else { DoubleEscaped }; go!(self: emit c; to RawData ScriptDataEscaped esc); } _ => match lower_ascii_letter(c) { @@ -1344,47 +1346,27 @@ impl Tokenizer { mod test { use core::prelude::*; use collections::vec::Vec; - use collections::string::String; - use super::{option_push, append_strings}; // private items + use util::tendril::Tendril; + use super::{option_push}; // private items #[test] fn push_to_None_gives_singleton() { - let mut s: Option = None; + let mut s: Option = None; option_push(&mut s, 'x'); - assert_eq!(s, Some(String::from_str("x"))); + assert_eq!(s, Some(Tendril::owned_copy("x"))); } #[test] fn push_to_empty_appends() { - let mut s: Option = Some(String::new()); + let mut s: Option = Some(Tendril::new()); option_push(&mut s, 'x'); - assert_eq!(s, 
Some(String::from_str("x"))); + assert_eq!(s, Some(Tendril::owned_copy("x"))); } #[test] fn push_to_nonempty_appends() { - let mut s: Option = Some(String::from_str("y")); + let mut s: Option = Some(Tendril::owned_copy("y")); option_push(&mut s, 'x'); - assert_eq!(s, Some(String::from_str("yx"))); - } - - #[test] - fn append_appends() { - let mut s = String::from_str("foo"); - append_strings(&mut s, String::from_str("bar")); - assert_eq!(s, String::from_str("foobar")); - } - - #[test] - fn append_to_empty_does_not_copy() { - let mut lhs: String = String::from_str(""); - let rhs: Vec = vec![b'f', b'o', b'o']; - let ptr_old = rhs[0] as *const u8; - - append_strings(&mut lhs, String::from_utf8(rhs).unwrap()); - assert_eq!(lhs, String::from_str("foo")); - - let ptr_new = lhs.into_bytes()[0] as *const u8; - assert_eq!(ptr_old, ptr_new); + assert_eq!(s, Some(Tendril::owned_copy("yx"))); } } diff --git a/src/tree_builder/actions.rs b/src/tree_builder/actions.rs index 66333147..1c841aeb 100644 --- a/src/tree_builder/actions.rs +++ b/src/tree_builder/actions.rs @@ -23,13 +23,13 @@ use tokenizer::{Attribute, Tag, EndTag}; use tokenizer::states::{RawData, RawKind}; use util::str::{AsciiExt, to_escaped_string}; +use util::tendril::Tendril; use core::mem::replace; use core::iter::{Rev, Enumerate}; use core::slice; use core::fmt::Debug; use collections::vec::Vec; -use collections::string::String; use std::borrow::Cow::Borrowed; use string_cache::{Atom, QualName}; @@ -66,10 +66,10 @@ pub trait TreeBuilderActions { fn assert_named(&mut self, node: Handle, name: Atom); fn clear_active_formatting_to_marker(&mut self); fn create_formatting_element_for(&mut self, tag: Tag) -> Handle; - fn append_text(&mut self, text: String) -> ProcessResult; - fn append_comment(&mut self, text: String) -> ProcessResult; - fn append_comment_to_doc(&mut self, text: String) -> ProcessResult; - fn append_comment_to_html(&mut self, text: String) -> ProcessResult; + fn append_text(&mut self, text: Tendril) -> ProcessResult; + fn append_comment(&mut self, text: Tendril) -> ProcessResult; + fn append_comment_to_doc(&mut self, text: Tendril) -> ProcessResult; + fn append_comment_to_html(&mut self, text: Tendril) -> ProcessResult; fn insert_appropriately(&mut self, child: NodeOrText, override_target: Option); fn insert_phantom(&mut self, name: Atom) -> Handle; fn insert_and_pop_element_for(&mut self, tag: Tag) -> Handle; @@ -707,25 +707,25 @@ impl TreeBuilderActions self.clear_active_formatting_to_marker(); } - fn append_text(&mut self, text: String) -> ProcessResult { + fn append_text(&mut self, text: Tendril) -> ProcessResult { self.insert_appropriately(AppendText(text), None); Done } - fn append_comment(&mut self, text: String) -> ProcessResult { + fn append_comment(&mut self, text: Tendril) -> ProcessResult { let comment = self.sink.create_comment(text); self.insert_appropriately(AppendNode(comment), None); Done } - fn append_comment_to_doc(&mut self, text: String) -> ProcessResult { + fn append_comment_to_doc(&mut self, text: Tendril) -> ProcessResult { let target = self.doc_handle.clone(); let comment = self.sink.create_comment(text); self.sink.append(target, AppendNode(comment)); Done } - fn append_comment_to_html(&mut self, text: String) -> ProcessResult { + fn append_comment_to_html(&mut self, text: Tendril) -> ProcessResult { let target = self.html_elem(); let comment = self.sink.create_comment(text); self.sink.append(target, AppendNode(comment)); diff --git a/src/tree_builder/data.rs b/src/tree_builder/data.rs index 
586f7371..25b9b18b 100644 --- a/src/tree_builder/data.rs +++ b/src/tree_builder/data.rs @@ -12,6 +12,7 @@ use core::prelude::*; use tokenizer::Doctype; use tree_builder::interface::{QuirksMode, Quirks, LimitedQuirks, NoQuirks}; use util::str::AsciiExt; +use util::tendril::Tendril; use collections::string::String; @@ -94,17 +95,24 @@ static HTML4_PUBLIC_PREFIXES: &'static [&'static str] = &[ ]; pub fn doctype_error_and_quirks(doctype: &Doctype, iframe_srcdoc: bool) -> (bool, QuirksMode) { - fn opt_as_slice<'t>(x: &'t Option) -> Option<&'t str> { + fn opt_string_as_slice<'t>(x: &'t Option) -> Option<&'t str> { x.as_ref().map(|y| y.as_slice()) } + fn opt_tendril_as_slice<'t>(x: &'t Option) -> Option<&'t str> { + match *x { + Some(ref t) => Some(t), + None => None, + } + } + fn opt_to_ascii_lower(x: Option<&str>) -> Option { x.map(|y| y.to_ascii_lower()) } - let name = opt_as_slice(&doctype.name); - let public = opt_as_slice(&doctype.public_id); - let system = opt_as_slice(&doctype.system_id); + let name = opt_tendril_as_slice(&doctype.name); + let public = opt_tendril_as_slice(&doctype.public_id); + let system = opt_tendril_as_slice(&doctype.system_id); let err = match (name, public, system) { (Some("html"), None, None) @@ -130,7 +138,7 @@ pub fn doctype_error_and_quirks(doctype: &Doctype, iframe_srcdoc: bool) -> (bool let public = opt_to_ascii_lower(public); let system = opt_to_ascii_lower(system); - let quirk = match (opt_as_slice(&public), opt_as_slice(&system)) { + let quirk = match (opt_string_as_slice(&public), opt_string_as_slice(&system)) { _ if doctype.force_quirks => Quirks, _ if name != Some("html") => Quirks, diff --git a/src/tree_builder/interface.rs b/src/tree_builder/interface.rs index 0a872b7d..52031818 100644 --- a/src/tree_builder/interface.rs +++ b/src/tree_builder/interface.rs @@ -15,11 +15,12 @@ use core::prelude::*; use tokenizer::Attribute; use collections::vec::Vec; -use collections::string::String; use std::borrow::Cow; use string_cache::QualName; +use util::tendril::Tendril; + pub use self::QuirksMode::{Quirks, LimitedQuirks, NoQuirks}; pub use self::NodeOrText::{AppendNode, AppendText}; @@ -37,7 +38,7 @@ pub enum QuirksMode { /// the sink may not want to allocate a `Handle` for each. pub enum NodeOrText { AppendNode(Handle), - AppendText(String), + AppendText(Tendril), } /// Types which can process tree modifications from the tree builder. @@ -69,7 +70,7 @@ pub trait TreeSink { fn create_element(&mut self, name: QualName, attrs: Vec) -> Self::Handle; /// Create a comment node. - fn create_comment(&mut self, text: String) -> Self::Handle; + fn create_comment(&mut self, text: Tendril) -> Self::Handle; /// Append a node as the last child of the given node. If this would /// produce adjacent sibling text nodes, it should concatenate the text @@ -92,7 +93,7 @@ pub trait TreeSink { new_node: NodeOrText) -> Result<(), NodeOrText>; /// Append a `DOCTYPE` element to the `Document` node. - fn append_doctype_to_document(&mut self, name: String, public_id: String, system_id: String); + fn append_doctype_to_document(&mut self, name: Tendril, public_id: Tendril, system_id: Tendril); /// Add each attribute to the given element, if no attribute /// with that name already exists. diff --git a/src/tree_builder/mod.rs b/src/tree_builder/mod.rs index c89c8fdf..33293aa3 100644 --- a/src/tree_builder/mod.rs +++ b/src/tree_builder/mod.rs @@ -7,6 +7,8 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. +#![allow(warnings)] + //! 
The HTML5 tree builder. use core::prelude::*; @@ -26,7 +28,8 @@ use tokenizer::{Doctype, Tag}; use tokenizer::TokenSink; use tokenizer::states as tok_state; -use util::str::{is_ascii_whitespace, char_run}; +use util::str::is_ascii_whitespace; +use util::tendril::Tendril; use core::default::Default; use core::mem::replace; @@ -104,7 +107,7 @@ pub struct TreeBuilder { template_modes: Vec, /// Pending table character tokens. - pending_table_text: Vec<(SplitStatus, String)>, + pending_table_text: Vec<(SplitStatus, Tendril)>, /// Quirks mode as set by the parser. /// FIXME: can scripts etc. change this? @@ -347,18 +350,16 @@ impl TreeBuilder token = t; } SplitWhitespace(buf) => { - let buf = buf.as_slice(); - - let (len, is_ws) = unwrap_or_return!( - char_run(is_ascii_whitespace, buf), ()); + let (len, is_ws) = unwrap_or_return!(buf.char_run(is_ascii_whitespace), ()); token = CharacterTokens( if is_ws { Whitespace } else { NotWhitespace }, - String::from_str(&buf[..len])); + unsafe { buf.subtendril(0, len) }); - if len < buf.len() { + if len < buf.len32() { more_tokens.push_back( - CharacterTokens(NotSplit, String::from_str(&buf[len..]))); + CharacterTokens(NotSplit, + unsafe { buf.subtendril(len, buf.len32()) })); } } } @@ -397,9 +398,9 @@ impl TokenSink let Doctype { name, public_id, system_id, force_quirks: _ } = dt; if !self.opts.drop_doctype { self.sink.append_doctype_to_document( - name.unwrap_or(String::new()), - public_id.unwrap_or(String::new()), - system_id.unwrap_or(String::new()) + name.unwrap_or(Tendril::new()), + public_id.unwrap_or(Tendril::new()), + system_id.unwrap_or(Tendril::new()) ); } self.set_quirks_mode(quirk); @@ -420,8 +421,8 @@ impl TokenSink tokenizer::EOFToken => EOFToken, tokenizer::CharacterTokens(mut x) => { - if ignore_lf && x.len() >= 1 && x.as_slice().char_at(0) == '\n' { - x.remove(0); + if ignore_lf { + x.pop_front_lf(); } if x.is_empty() { return; diff --git a/src/tree_builder/rules.rs b/src/tree_builder/rules.rs index 259f18af..80d58926 100644 --- a/src/tree_builder/rules.rs +++ b/src/tree_builder/rules.rs @@ -20,12 +20,12 @@ use tokenizer::{Tag, StartTag, EndTag}; use tokenizer::states::{Rcdata, Rawtext, ScriptData, Plaintext}; use util::str::is_ascii_whitespace; +use util::tendril::Tendril; use core::mem::replace; -use collections::string::String; use std::borrow::Cow::Borrowed; -fn any_not_whitespace(x: &String) -> bool { +fn any_not_whitespace(x: &Tendril) -> bool { // FIXME: this might be much faster as a byte scan x.as_slice().chars().any(|c| !is_ascii_whitespace(c)) } diff --git a/src/tree_builder/types.rs b/src/tree_builder/types.rs index cfa1df3c..2925c444 100644 --- a/src/tree_builder/types.rs +++ b/src/tree_builder/types.rs @@ -11,10 +11,9 @@ use core::prelude::*; +use util::tendril::Tendril; use tokenizer::Tag; -use collections::string::String; - pub use self::InsertionMode::*; pub use self::SplitStatus::*; pub use self::Token::*; @@ -60,8 +59,8 @@ pub enum SplitStatus { #[derive(PartialEq, Eq, Clone, Debug)] pub enum Token { TagToken(Tag), - CommentToken(String), - CharacterTokens(SplitStatus, String), + CommentToken(Tendril), + CharacterTokens(SplitStatus, Tendril), NullCharacterToken, EOFToken, } @@ -69,7 +68,7 @@ pub enum Token { pub enum ProcessResult { Done, DoneAckSelfClosing, - SplitWhitespace(String), + SplitWhitespace(Tendril), Reprocess(InsertionMode, Token), } diff --git a/src/util/smallcharset.rs b/src/util/smallcharset.rs index 52cb74a5..cc5f2d77 100644 --- a/src/util/smallcharset.rs +++ b/src/util/smallcharset.rs @@ -24,7 +24,7 @@ 
impl SmallCharSet { /// Count the number of bytes of characters at the beginning /// of `buf` which are not in the set. /// See `tokenizer::buffer_queue::pop_except_from`. - pub fn nonmember_prefix_len(&self, buf: &str) -> usize { + pub fn nonmember_prefix_len(&self, buf: &str) -> u32 { let mut n = 0; for b in buf.bytes() { if b >= 64 || !self.contains(b) { @@ -52,11 +52,11 @@ mod test { #[test] fn nonmember_prefix() { for &c in ['&', '\0'].iter() { - for x in 0 .. 48usize { - for y in 0 .. 48usize { - let mut s = repeat("x").take(x).collect::(); + for x in 0 .. 48u32 { + for y in 0 .. 48u32 { + let mut s = repeat("x").take(x as usize).collect::(); s.push(c); - s.push_str(repeat("x").take(y).collect::().as_slice()); + s.push_str(repeat("x").take(y as usize).collect::().as_slice()); let set = small_char_set!('&' '\0'); assert_eq!(x, set.nonmember_prefix_len(s.as_slice())); diff --git a/src/util/str.rs b/src/util/str.rs index 892d9707..194aed92 100644 --- a/src/util/str.rs +++ b/src/util/str.rs @@ -65,7 +65,7 @@ pub static ASCII_LOWER_MAP: [u8; 256] = [ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, ]; -#[derive(Clone, PartialEq, PartialOrd, Ord, Eq, Hash)] +#[derive(Copy, Clone, PartialEq, PartialOrd, Ord, Eq, Hash)] pub struct Ascii { chr: u8, } @@ -75,6 +75,10 @@ impl Ascii { self.chr as char } + pub fn to_u8(self) -> u8 { + self.chr + } + #[inline] pub fn is_alphabetic(&self) -> bool { (self.chr >= 0x41 && self.chr <= 0x5A) || (self.chr >= 0x61 && self.chr <= 0x7A) @@ -106,6 +110,7 @@ pub trait AsciiCast { } impl AsciiCast for char { + #[inline] fn to_ascii_opt(&self) -> Option { let n = *self as u32; if n < 0x80 { @@ -170,11 +175,6 @@ pub fn is_ascii_alnum(c: char) -> bool { c.to_ascii_opt().map_or(false, |a| a.is_alphanumeric()) } -/// Allocate an empty string with a small non-zero capacity. -pub fn empty_str() -> String { - String::with_capacity(4) -} - /// ASCII whitespace characters, as defined by /// tree construction modes that treat them specially. pub fn is_ascii_whitespace(c: char) -> bool { @@ -184,30 +184,11 @@ pub fn is_ascii_whitespace(c: char) -> bool { } } -/// Count how many bytes at the beginning of the string -/// either all match or all don't match the predicate, -/// and also return whether they match. -/// -/// Returns `None` on an empty string. -pub fn char_run(mut pred: Pred, buf: &str) -> Option<(usize, bool)> - where Pred: FnMut(char) -> bool, -{ - let (first, rest) = unwrap_or_return!(buf.slice_shift_char(), None); - let matches = pred(first); - - for (idx, ch) in rest.char_indices() { - if matches != pred(ch) { - return Some((idx + first.len_utf8(), matches)); - } - } - Some((buf.len(), matches)) -} - #[cfg(test)] #[allow(non_snake_case)] mod test { use core::prelude::*; - use super::{char_run, is_ascii_whitespace, is_ascii_alnum, lower_ascii, lower_ascii_letter}; + use super::{is_ascii_alnum, lower_ascii, lower_ascii_letter}; test_eq!(lower_letter_a_is_a, lower_ascii_letter('a'), Some('a')); test_eq!(lower_letter_A_is_a, lower_ascii_letter('A'), Some('a')); @@ -224,22 +205,4 @@ mod test { test_eq!(is_alnum_1, is_ascii_alnum('1'), true); test_eq!(is_not_alnum_symbol, is_ascii_alnum('!'), false); test_eq!(is_not_alnum_nonascii, is_ascii_alnum('\u{a66e}'), false); - - macro_rules! 
test_char_run ( ($name:ident, $input:expr, $expect:expr) => ( - test_eq!($name, char_run(is_ascii_whitespace, $input), $expect); - )); - - test_char_run!(run_empty, "", None); - test_char_run!(run_one_t, " ", Some((1, true))); - test_char_run!(run_one_f, "x", Some((1, false))); - test_char_run!(run_t, " \t \n", Some((6, true))); - test_char_run!(run_f, "xyzzy", Some((5, false))); - test_char_run!(run_tf, " xyzzy", Some((3, true))); - test_char_run!(run_ft, "xyzzy ", Some((5, false))); - test_char_run!(run_tft, " xyzzy ", Some((3, true))); - test_char_run!(run_ftf, "xyzzy hi", Some((5, false))); - test_char_run!(run_multibyte_0, "中 ", Some((3, false))); - test_char_run!(run_multibyte_1, " 中 ", Some((1, true))); - test_char_run!(run_multibyte_2, " 中 ", Some((2, true))); - test_char_run!(run_multibyte_3, " 中 ", Some((3, true))); } diff --git a/src/util/tendril.rs b/src/util/tendril.rs new file mode 100644 index 00000000..fae08663 --- /dev/null +++ b/src/util/tendril.rs @@ -0,0 +1,922 @@ +// Copyright 2015 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use std::prelude::v1::*; + +use std::{mem, fmt, io, str, slice}; +use std::raw::{self, Repr}; +use std::ops::Deref; +use std::cmp::Ordering; +use std::error::FromError; + +use iobuf::{Iobuf, ROIobuf, RWIobuf}; + +use util::str::AsciiCast; + +use self::Tendril_::{Shared, Owned, Ascii}; + +#[derive(Clone)] +enum Tendril_ { + Shared(ROIobuf<'static>), + Ascii(u8), + Owned(String), +} + +/// html5ever's abstraction of strings. +/// +/// A tendril either owns its content, or is a slice of a shared buffer. +/// These buffers are managed with non-atomic (thread-local) reference +/// counting, which is very fast. +/// +/// Like `String`, `Tendril` implements `Deref`. So you can +/// call string slice methods on `Tendril`, or pass `&Tendril` to a function +/// expecting `&str`. +/// +/// Accordingly, the content of a tendril is guaranteed to be valid UTF-8. +/// Take particular care of this when calling `unsafe` functions below! +/// +/// The maximum size of a tendril is 1 GB. The safe methods below will +/// `panic!` if a tendril grows beyond that size. 
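+///
+/// A small usage sketch (hypothetical doc example, using only methods
+/// defined below):
+///
+/// ```ignore
+/// let mut t = Tendril::owned_copy("foo");
+/// t.push_str("bar");
+/// assert_eq!(&*t, "foobar");
+/// assert_eq!(t.len32(), 6);
+/// ```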
+#[derive(Clone)] +pub struct Tendril(Tendril_); + +impl PartialEq for Tendril { + #[inline] + fn eq(&self, other: &Tendril) -> bool { + &**self == &**other + } + + #[inline] + fn ne(&self, other: &Tendril) -> bool { + &**self != &**other + } +} + +impl Eq for Tendril { } + +impl PartialOrd for Tendril { + #[inline] + fn partial_cmp(&self, other: &Tendril) -> Option { + (&**self).partial_cmp(other) + } +} + +impl Ord for Tendril { + #[inline] + fn cmp(&self, other: &Tendril) -> Ordering { + (&**self).cmp(other) + } +} + +impl fmt::Display for Tendril { + #[inline] + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + ::fmt(&*self, fmt) + } +} + +impl fmt::Debug for Tendril { + #[inline] + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + try!(write!(fmt, "Tendril[{}](", match self.0 { + Shared(_) => "shared", + Ascii(_) => "ascii", + Owned(_) => "owned", + })); + try!(::fmt(&*self, fmt)); + try!(write!(fmt, ")")); + Ok(()) + } +} + +impl Deref for Tendril { + type Target = str; + + #[inline] + fn deref<'a>(&'a self) -> &'a str { + match self.0 { + Shared(ref s) => unsafe { + mem::transmute(s.as_window_slice()) + }, + Owned(ref s) => s, + Ascii(ref b) => unsafe { + str::from_utf8_unchecked(slice::ref_slice(b)) + }, + } + } +} + +/// Interpret the slice as a single ASCII codepoint, if possible. +#[inline(always)] +fn as_single_ascii(x: &str) -> Option { + // &str is always valid UTF-8, so a one-byte &str must contain + // an ASCII character. + if x.len() == 1 { + Some(unsafe { *x.as_bytes().get_unchecked(0) }) + } else { + None + } +} + +/// The maximum size of a tendril is 1 GB. +pub const TENDRIL_MAX_LEN: u32 = 1 << 30; + +impl Tendril { + /// Create a new, empty tendril. + #[inline] + pub fn new() -> Tendril { + Tendril(Owned(String::new())) + } + + /// Create a tendril from any `IntoTendril` type. + #[inline] + pub fn from(x: T) -> Tendril + where T: IntoTendril, + { + x.into_tendril() + } + + /// Create a tendril from a character. + #[inline] + pub fn from_char(c: char) -> Tendril { + let n = c as usize; + if n < 0x80 { + Tendril(Ascii(n as u8)) + } else { + Tendril(Owned(c.to_string())) + } + } + + /// Create a tendril from a `String`, without copying. + #[inline] + pub fn owned(s: String) -> Tendril { + assert!(s.len() < (1 << 31)); + Tendril(Owned(s)) + } + + /// Copy a string to create a tendril which owns its content. + #[inline] + pub fn owned_copy(s: &str) -> Tendril { + if let Some(n) = as_single_ascii(s) { + Tendril(Ascii(n)) + } else { + Tendril(Owned(String::from_str(s))) + } + } + + /// Copy a string to create a shared buffer which multiple + /// tendrils can point into. + /// + /// See also `subtendril`. + #[inline] + pub fn shared_copy(s: &str) -> Tendril { + Tendril(Shared(ROIobuf::from_str_copy(s))) + } + + /// Does this tendril point into a shared buffer? + #[inline] + pub fn is_shared(&self) -> bool { + match self.0 { + Shared(_) => true, + _ => false, + } + } + + /// Get the length of the tendril. + #[inline] + pub fn len32(&self) -> u32 { + match self.0 { + Shared(ref b) => b.len(), + Owned(ref s) => s.len() as u32, + Ascii(_) => 1, + } + } + + /// Count how many bytes at the beginning of the tendril + /// either all match or all don't match the predicate, + /// and also return whether they match. + /// + /// Returns `None` on an empty string. 
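+    ///
+    /// A sketch of the expected behavior (hypothetical doc example;
+    /// `is_ascii_whitespace` is the predicate from `util::str`):
+    ///
+    /// ```ignore
+    /// let t = Tendril::owned_copy("  xy");
+    /// assert_eq!(t.char_run(is_ascii_whitespace), Some((2, true)));
+    /// assert_eq!(Tendril::new().char_run(is_ascii_whitespace), None);
+    /// ```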
+ pub fn char_run(&self, mut pred: Pred) -> Option<(u32, bool)> + where Pred: FnMut(char) -> bool, + { + let (first, rest) = unwrap_or_return!(self.slice_shift_char(), None); + let matches = pred(first); + + for (idx, ch) in rest.char_indices() { + if matches != pred(ch) { + return Some(((idx + first.len_utf8()) as u32, matches)); + } + } + Some((self.len32(), matches)) + } + + /// Promotes the tendril to owning its content, and get a + /// mutable reference. + /// + /// This is unsafe because the user must not exceed the 1 GB + /// size limit! + #[inline] + unsafe fn to_mut<'a>(&'a mut self) -> &'a mut String { + match self.0 { + Owned(ref mut s) => return s, + _ => (), + } + + self.0 = Owned(String::from_str(self)); + match self.0 { + Owned(ref mut s) => s, + _ => unreachable!(), + } + } + + #[inline(always)] + fn check_len(&self) { + if self.len() > TENDRIL_MAX_LEN as usize { + panic!("tendril exceeded 1 GB"); + } + } + + /// Push a character onto the end of the tendril. + #[inline] + pub fn push(&mut self, c: char) { + if self.is_empty() { + if let Some(a) = c.to_ascii_opt() { + self.0 = Ascii(a.to_u8()); + return; + } + } + unsafe { + self.to_mut().push(c); + } + self.check_len(); + } + + /// Push a string onto the end of the tendril. + #[inline] + pub fn push_str(&mut self, rhs: &str) { + match rhs.len() { + 0 => return, + 1 if self.is_empty() => { + if let Some(n) = as_single_ascii(rhs) { + self.0 = Ascii(n); + return; + } + } + n if n > TENDRIL_MAX_LEN as usize => { + panic!("attempted to extend tendril by more than 1 GB"); + } + + // Otherwise, 2 * TENDRIL_MAX_LEN does not overflow u32. + _ => (), + } + unsafe { + self.to_mut().push_str(rhs); + } + self.check_len(); + } + + /// Push another tendril onto the end of the tendril. + #[inline] + pub fn push_tendril(&mut self, rhs: Tendril) { + if rhs.is_empty() { + return; + } + + if self.is_empty() { + *self = rhs; + return; + } + + // Try to merge adjacent Iobufs. + if let (&mut Tendril(Shared(ref mut a)), &Tendril(Shared(ref b))) + = (&mut *self, &rhs) + { + if a.extend_with(b).is_ok() { + return; + } + } + + if rhs.len() > TENDRIL_MAX_LEN as usize{ + panic!("attempted to extend tendril by more than 1 GB"); + } + + // Slow path: copy on write. + unsafe { + self.to_mut().push_str(&rhs); + } + self.check_len(); + } + + /// Truncate the tendril to an empty tendril, without discarding allocations. + #[inline] + pub fn clear(&mut self) { + if let Owned(ref mut s) = self.0 { + s.truncate(0); + return; + } + self.0 = Owned(String::new()); + } + + /// Remove the front character, if it's `\n`. + #[inline] + pub fn pop_front_lf(&mut self) { + match self.0 { + Ascii(b'\n') => *self = Tendril::new(), + Ascii(_) => (), + Owned(ref mut s) => { + if s.starts_with("\n") { + s.remove(0); + } + } + Shared(ref mut b) => unsafe { + if b.unsafe_peek_le(0) == b'\n' { + b.unsafe_sub_window_from(1); + } + } + } + } + + /// Slice a tendril. + /// + /// The new tendril encompasses bytes in the index range `[from, to)`. + /// + /// If possible, the new and old tendrils point into the same shared + /// buffer. + /// + /// This method is `unsafe` because neither bounds checking nor UTF-8 + /// validity checking is guaranteed. If you violate these properties + /// then all bets are off! + /// + /// html5ever uses `subtendril` in certain fast paths, just after + /// finding a character boundary with a byte-wise scan. 
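+    ///
+    /// A usage sketch (hypothetical doc example; both indices fall on
+    /// character boundaries, as required):
+    ///
+    /// ```ignore
+    /// let t = Tendril::shared_copy("hello world");
+    /// let hello = unsafe { t.subtendril(0, 5) };
+    /// assert_eq!(&*hello, "hello");
+    /// assert!(hello.is_shared());
+    /// ```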
+ #[inline] + pub unsafe fn subtendril(&self, from: u32, to: u32) -> Tendril { + match *self { + Tendril(Shared(ref a)) => { + let mut b = a.clone(); + b.unsafe_sub_window(from, to - from); + Tendril(Shared(b)) + } + _ => { + let b = self.slice_unchecked(from as usize, to as usize); + Tendril::owned_copy(b) + } + } + } +} + +/// Types which can be converted into a `Tendril`. +/// +/// The `Tendril` and `String` instances avoid copying the string data. +/// The other instances copy into a new owned buffer. +pub trait IntoTendril { + fn into_tendril(self) -> Tendril; +} + +impl IntoTendril for Tendril { + #[inline(always)] + fn into_tendril(self) -> Tendril { + self + } +} + +impl IntoTendril for String { + #[inline(always)] + fn into_tendril(self) -> Tendril { + Tendril::owned(self) + } +} + +impl<'a> IntoTendril for &'a str { + #[inline(always)] + fn into_tendril(self) -> Tendril { + Tendril::owned_copy(self) + } +} + +impl IntoTendril for char { + #[inline(always)] + fn into_tendril(self) -> Tendril { + Tendril::from_char(self) + } +} + +// Be very careful about overflow if you plan to use these functions in another context! +#[inline(always)] +unsafe fn unsafe_slice<'a>(buf: &'a [u8], from: u32, to: u32) -> &'a [u8] { + let raw::Slice { data, len } = buf.repr(); + debug_assert!((from as usize) < len); + debug_assert!((to as usize) <= len); + slice::from_raw_parts(data.offset(from as isize), (to - from) as usize) +} + +#[inline(always)] +unsafe fn unsafe_slice_mut<'a>(buf: &'a mut [u8], from: u32, to: u32) -> &'a mut [u8] { + let raw::Slice { data, len } = buf.repr(); + debug_assert!((from as usize) < len); + debug_assert!((to as usize) <= len); + slice::from_raw_parts_mut(data.offset(from as isize) as *mut u8, (to - from) as usize) +} + +// Return the number of bytes at the end of the buffer that make up an incomplete +// but possibly valid UTF-8 character. +// +// This does *not* check UTF-8 validity. Rather it's used to defer +// validity checking for the last few bytes of a buffer, when appropriate. +// However, it's safe to call on arbitrary byte buffers. +#[inline(always)] +fn incomplete_trailing_utf8(buf: &[u8]) -> u32 { + let n = buf.len(); + if n < 1 { return 0; } + + // There are four patterns of valid-but-incomplete UTF-8: + // + // ... 110xxxxx + // ... 1110xxxx 10xxxxxx + // ... 11110xxx 10xxxxxx + // ... 11110xxx 10xxxxxx 10xxxxxx + + #[inline(always)] fn is_cont(v: u8) -> bool { v & 0b11_000000 == 0b10_000000 } + #[inline(always)] fn is_start(v: u8) -> bool { v & 0b11_000000 == 0b11_000000 } + #[inline(always)] fn is_start_3(v: u8) -> bool { v & 0b1111_0000 == 0b1110_0000 } + #[inline(always)] fn is_start_4(v: u8) -> bool { v & 0b11111_000 == 0b11110_000 } + + unsafe { + let c = *buf.get_unchecked(n-1); + if is_start(c) { return 1; } + + if is_cont(c) { + if n <= 1 { return 0; } + let b = *buf.get_unchecked(n-2); + if is_start_3(b) || is_start_4(b) { return 2; } + + if is_cont(b) { + if n <= 2 { return 0; } + let a = *buf.get_unchecked(n-3); + if is_start_4(a) { return 3; } + } + } + } + + 0 +} + +/// Iterator which produces tendrils by reading an input stream. +/// +/// The tendrils will be backed by shared buffers. They support +/// slicing via `.subtendril()` without a copy. +pub struct TendrilReader { + dead: bool, + chunk_size: u32, + leftover: (u32, [u8; 3]), + reader: R, +} + +impl TendrilReader + where R: io::Read, +{ + /// Read a UTF-8 input stream as a sequence of tendrils (or errors). + /// + /// Each read will attempt to fill a buffer of `chunk_size` bytes. 
+ /// + /// # Panics + /// + /// If `chunk_size` is less than 4 bytes or greater than 1 GB. + #[inline] + pub fn from_utf8(chunk_size: u32, reader: R) -> TendrilReader { + // A chunk must be big enough to hold any UTF-8 character. + // Also it must be small enough to fit in an Iobuf. + // 1GB is only halfway to the Iobuf limit, so we don't worry + // about going a few bytes over, e.g. when handling leftover + // UTF-8 bytes. + assert!(chunk_size >= 4); + assert!(chunk_size <= TENDRIL_MAX_LEN); + TendrilReader { + dead: false, + chunk_size: chunk_size, + reader: reader, + leftover: (0, [0; 3]), + } + } +} + +#[derive(Clone, Debug)] +pub enum TendrilReaderError { + IoError(io::Error), + Utf8Error(str::Utf8Error), +} + +impl FromError for TendrilReaderError { + #[inline] + fn from_error(err: io::Error) -> TendrilReaderError { + TendrilReaderError::IoError(err) + } +} + +impl FromError for TendrilReaderError { + #[inline] + fn from_error(err: str::Utf8Error) -> TendrilReaderError { + TendrilReaderError::Utf8Error(err) + } +} + +impl Iterator for TendrilReader + where R: io::Read, +{ + type Item = Result; + + fn next(&mut self) -> Option> { + if self.dead { + return None; + } + + let mut buf = RWIobuf::new(self.chunk_size as usize); + + // Copy some leftover bytes from a previous incomplete character, + // if any. + let mut size = match self.leftover { + (0, _) => 0, + (ref mut n, ref pfx) => { + debug_assert!(*n <= 3); + unsafe { + // chunk_size >= 4, which is checked in the + // TendrilReader constructor. + buf.unsafe_poke(0, unsafe_slice(pfx, 0, *n)); + } + mem::replace(n, 0) + } + }; + + unsafe { + if size < self.chunk_size { + let dest = unsafe_slice_mut(buf.as_mut_window_slice(), size, self.chunk_size); + match self.reader.read(dest) { + Err(e) => return Some(Err(TendrilReaderError::from_error(e))), + + Ok(0) => { + // EOF + self.dead = true; + return match size { + 0 => None, + _ => Some(Err(TendrilReaderError::from_error(str::Utf8Error::TooShort))), + }; + } + + Ok(n) => size += n as u32, + } + } + + // Trim the window to exclude uninitialized bytes, and set the + // limit to forbid un-doing this. + buf.unsafe_sub_to(size); + + // Defer validity checking for the bytes making up an incomplete + // UTF-8 character at the end, if any. + let tail_len = incomplete_trailing_utf8(buf.as_window_slice()); + if tail_len > 0 { + let rest = size - tail_len; + self.leftover.0 = tail_len; + buf.unsafe_peek(rest, unsafe_slice_mut(&mut self.leftover.1, 0, tail_len)); + buf.unsafe_sub_window_to(rest); + } + + // Check UTF-8 validity for the remaining buffer. 
+ match str::from_utf8(buf.as_window_slice()) { + Err(e) => Some(Err(TendrilReaderError::from_error(e))), + Ok(_) => Some(Ok(Tendril(Shared(buf.read_only())))), + } + } + } +} + +#[cfg(test)] +mod test { + use std::prelude::v1::*; + use std::{io, cmp}; + use std::slice::bytes; + + use util::str::is_ascii_whitespace; + + use super::{Tendril, Tendril_, TendrilReader, incomplete_trailing_utf8}; + + #[test] + fn tendril_create() { + assert_eq!("", &*Tendril::new()); + + for s in &["", "foo", "zzzzzzzzzzzzzzzzz", "fooő", "ꙮ"] { + assert_eq!(*s, &*Tendril::owned(String::from_str(s))); + assert_eq!(*s, &*Tendril::owned_copy(s)); + assert_eq!(*s, &*Tendril::shared_copy(s)); + } + } + + #[test] + fn tendril_from() { + assert_eq!("x", &*Tendril::from('x')); + assert_eq!("xyz", &*Tendril::from("xyz")); + assert_eq!("xyz", &*Tendril::from(String::from_str("xyz"))); + assert_eq!("xyz", &*Tendril::from(Tendril::from("xyz"))); + } + + #[test] + fn tendril_eq() { + assert_eq!(Tendril::owned_copy("foo"), Tendril::owned_copy("foo")); + assert_eq!(Tendril::owned_copy("foo"), Tendril::shared_copy("foo")); + assert_eq!(Tendril::shared_copy("foo"), Tendril::shared_copy("foo")); + assert!(Tendril::owned_copy("foo") != Tendril::owned_copy("bar")); + assert!(Tendril::owned_copy("foo") != Tendril::shared_copy("bar")); + assert!(Tendril::shared_copy("foo") != Tendril::shared_copy("bar")); + } + + #[test] + fn tendril_partial_ord() { + assert!(Tendril::owned_copy("foo") > Tendril::owned_copy("bar")); + assert!(Tendril::owned_copy("foo") > Tendril::shared_copy("bar")); + assert!(Tendril::shared_copy("foo") > Tendril::shared_copy("bar")); + assert!(Tendril::owned_copy("bar") < Tendril::owned_copy("foo")); + assert!(Tendril::owned_copy("bar") < Tendril::shared_copy("foo")); + assert!(Tendril::shared_copy("bar") < Tendril::shared_copy("foo")); + } + + macro_rules! 
test_char_run ( ($name:ident, $input:expr, $expect:expr) => ( + test_eq!($name, Tendril::owned_copy($input).char_run(is_ascii_whitespace), $expect); + )); + + test_char_run!(run_empty, "", None); + test_char_run!(run_one_t, " ", Some((1, true))); + test_char_run!(run_one_f, "x", Some((1, false))); + test_char_run!(run_t, " \t \n", Some((6, true))); + test_char_run!(run_f, "xyzzy", Some((5, false))); + test_char_run!(run_tf, " xyzzy", Some((3, true))); + test_char_run!(run_ft, "xyzzy ", Some((5, false))); + test_char_run!(run_tft, " xyzzy ", Some((3, true))); + test_char_run!(run_ftf, "xyzzy hi", Some((5, false))); + test_char_run!(run_multibyte_0, "中 ", Some((3, false))); + test_char_run!(run_multibyte_1, " 中 ", Some((1, true))); + test_char_run!(run_multibyte_2, " 中 ", Some((2, true))); + test_char_run!(run_multibyte_3, " 中 ", Some((3, true))); + + #[test] + fn push() { + let mut t = Tendril::owned_copy("foo"); + t.push('x'); + assert_eq!("foox", &*t); + t.push('y'); + assert_eq!("fooxy", &*t); + + let mut t = Tendril::shared_copy("foo"); + t.push('x'); + assert_eq!("foox", &*t); + t.push('y'); + assert_eq!("fooxy", &*t); + } + + #[test] + fn push_str() { + let mut t = Tendril::owned_copy("foo"); + t.push_str("xy"); + assert_eq!("fooxy", &*t); + t.push_str("ab"); + assert_eq!("fooxyab", &*t); + + let mut t = Tendril::shared_copy("foo"); + t.push_str("xy"); + assert_eq!("fooxy", &*t); + t.push_str("ab"); + assert_eq!("fooxyab", &*t); + } + + #[test] + fn push_tendril_simple() { + let mut t = Tendril::owned_copy("foo"); + t.push_tendril(Tendril::owned_copy("xy")); + assert_eq!("fooxy", &*t); + t.push_tendril(Tendril::owned_copy("ab")); + assert_eq!("fooxyab", &*t); + + let mut t = Tendril::owned_copy("foo"); + t.push_tendril(Tendril::shared_copy("xy")); + assert_eq!("fooxy", &*t); + t.push_tendril(Tendril::owned_copy("ab")); + assert_eq!("fooxyab", &*t); + + let mut t = Tendril::shared_copy("foo"); + t.push_tendril(Tendril::owned_copy("xy")); + assert_eq!("fooxy", &*t); + t.push_tendril(Tendril::shared_copy("ab")); + assert_eq!("fooxyab", &*t); + + let mut t = Tendril::shared_copy("foo"); + t.push_tendril(Tendril::shared_copy("xy")); + assert_eq!("fooxy", &*t); + t.push_tendril(Tendril::shared_copy("ab")); + assert_eq!("fooxyab", &*t); + } + + #[test] + fn push_tendril_share() { + let mut x = Tendril::new(); + x.push_tendril(Tendril::shared_copy("foo")); + assert!(x.is_shared()); + + let mut x = Tendril::owned_copy(""); + x.push_tendril(Tendril::shared_copy("foo")); + assert!(x.is_shared()); + + let mut x = Tendril::shared_copy("foo"); + x.push_str(""); + assert!(x.is_shared()); + + let mut x = Tendril::shared_copy("foo"); + x.push_tendril(Tendril::owned_copy("")); + assert!(x.is_shared()); + + let mut x = Tendril::shared_copy("foo"); + x.push_tendril(Tendril::shared_copy("")); + assert!(x.is_shared()); + } + + #[test] + fn pop_front_lf() { + let mut t = Tendril::new(); + t.pop_front_lf(); + assert_eq!("", &*t); + + let mut t = Tendril(Tendril_::Ascii(b'\n')); + t.pop_front_lf(); + assert_eq!("", &*t); + + let mut t = Tendril(Tendril_::Ascii(b'x')); + t.pop_front_lf(); + assert_eq!("x", &*t); + + let mut t = Tendril::owned_copy("\n"); + t.pop_front_lf(); + assert_eq!("", &*t); + + let mut t = Tendril::owned_copy("\nfoo"); + t.pop_front_lf(); + assert_eq!("foo", &*t); + + let mut t = Tendril::owned_copy("foo"); + t.pop_front_lf(); + assert_eq!("foo", &*t); + + let mut t = Tendril::shared_copy("\n"); + t.pop_front_lf(); + assert_eq!("", &*t); + + let mut t = Tendril::shared_copy("\nfoo"); + 
t.pop_front_lf(); + assert_eq!("foo", &*t); + + let mut t = Tendril::shared_copy("foo"); + t.pop_front_lf(); + assert_eq!("foo", &*t); + } + + // FIXME: Test the coalescing of adjacent shared tendrils. + + #[test] + fn clear() { + let mut x = Tendril::owned_copy("foo"); + x.clear(); + assert!(x.is_empty()); + + let mut x = Tendril::shared_copy("foo"); + x.clear(); + assert!(x.is_empty()); + } + + #[test] + fn subtendril() { + let x = Tendril::owned_copy("foo"); + let s = unsafe { x.subtendril(0, 1) }; + assert_eq!("f", &*s); + + let x = Tendril::shared_copy("foo"); + let s = unsafe { x.subtendril(0, 1) }; + assert_eq!("f", &*s); + assert!(s.is_shared()); + + let x = Tendril::shared_copy("\u{a66e}of"); + let s = unsafe { x.subtendril(0, 4) }; + assert_eq!("\u{a66e}o", &*s); + assert!(s.is_shared()); + } + + // FIXME: Test scenarios where a tendril grows past the size limit. + + #[test] + fn test_complete_trailing_utf8() { + fn test(x: &str) { + assert_eq!(0, incomplete_trailing_utf8(x.as_bytes())); + } + + test("foobar"); + test("fooő"); + test("foo\u{a66e}"); + test("foo\u{1f4a9}"); + } + + #[test] + fn test_incomplete_trailing_utf8() { + assert_eq!(1, incomplete_trailing_utf8(b"foo\xC5")); + assert_eq!(1, incomplete_trailing_utf8(b"foo\xEA")); + assert_eq!(2, incomplete_trailing_utf8(b"foo\xEA\x99")); + assert_eq!(1, incomplete_trailing_utf8(b"foo\xF0")); + assert_eq!(2, incomplete_trailing_utf8(b"foo\xF0\x9F")); + assert_eq!(3, incomplete_trailing_utf8(b"foo\xF0\x9F\x92")); + } + + struct SliceChunks<'a> { + slice: &'a [u8], + idx: usize, + chunk_size: usize, + } + + impl<'a> io::Read for SliceChunks<'a> { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + let len = cmp::min(cmp::min(self.chunk_size, buf.len()), + self.slice.len() - self.idx); + if len == 0 { return Ok(0); } + let src = &self.slice[self.idx..][..len]; + bytes::copy_memory(buf, src); + self.idx += len; + Ok(src.len()) + } + } + + fn test_tendril_reader(input: &str) { + let mut chunk_sizes = vec![1, 2, 3, 4, 5, 6, 8, 15, 16, 17, 63, 64, 65, 255, 256, 257]; + if input.len() >= 5 { + chunk_sizes.push(input.len() - 1); + chunk_sizes.push(input.len()); + chunk_sizes.push(input.len() + 1); + } + + for &source_chunk_size in &chunk_sizes { + for &tendril_buf_size in &chunk_sizes { + if tendril_buf_size < 4 { continue; } + + let reader = SliceChunks { + slice: input.as_bytes(), + idx: 0, + chunk_size: source_chunk_size, + }; + + let mut result = String::new(); + for tendril in TendrilReader::from_utf8(tendril_buf_size as u32, reader) { + let tendril = tendril.unwrap(); + result.push_str(&tendril); + } + + assert_eq!(input, &*result); + } + } + } + + macro_rules! test_tendril_reader { + ($( $n:ident => $e:expr, )*) => {$( + #[test] + fn $n() { + test_tendril_reader($e); + } + )*} + } + + test_tendril_reader! { + reader_smoke_test => "Hello, world!", + + udhr_en => "All human beings are born free and equal in dignity and rights. + They are endowed with reason and conscience and should act + towards one another in a spirit of brotherhood.", + + udhr_hu => "Minden emberi lény szabadon születik és egyenlő méltósága és + joga van. Az emberek, ésszel és lelkiismerettel bírván, + egymással szemben testvéri szellemben kell hogy viseltessenek.", + + udhr_th => "เราทุกคนเกิดมาอย่างอิสระ เราทุกคนมีความคิดและความเข้าใจเป็นของเราเอง + เราทุกคนควรได้รับการปฏิบัติในทางเดียวกัน.", + + udhr_kr => "모든 인간은 태어날 때부터 자유로우며 그 존엄과 권리에 있어 + 동등하다. 
인간은 천부적으로 이성과 양심을 부여받았으며 서로 + 형제애의 정신으로 행동하여야 한다.", + + udhr_jbo => "ro remna cu se jinzi co zifre je simdu'i be le ry. nilselsi'a + .e lei ry. selcru .i ry. se menli gi'e se sezmarde .i .ei + jeseki'ubo ry. simyzu'e ta'i le tunba", + + udhr_chr => "ᏂᎦᏓ ᎠᏂᏴᏫ ᏂᎨᎫᏓᎸᎾ ᎠᎴ ᎤᏂᏠᏱ ᎤᎾᏕᎿ ᏚᏳᎧᏛ ᎨᏒᎢ. ᎨᏥᏁᎳ ᎤᎾᏓᏅᏖᏗ ᎠᎴ ᎤᏃᏟᏍᏗ + ᎠᎴ ᏌᏊ ᎨᏒ ᏧᏂᎸᏫᏍᏓᏁᏗ ᎠᎾᏟᏅᏢ ᎠᏓᏅᏙ ᎬᏗ.", + } + + // FIXME: test TendrilReader error handling +} diff --git a/tests/tokenizer.rs b/tests/tokenizer.rs index c0c45636..b9726472 100644 --- a/tests/tokenizer.rs +++ b/tests/tokenizer.rs @@ -32,6 +32,7 @@ use rustc_serialize::json::Json; use std::collections::BTreeMap; use std::borrow::Cow::Borrowed; +use html5ever::Tendril; use html5ever::tokenizer::{Doctype, Attribute, StartTag, EndTag, Tag}; use html5ever::tokenizer::{Token, DoctypeToken, TagToken, CommentToken}; use html5ever::tokenizer::{CharacterTokens, NullCharacterToken, EOFToken, ParseError}; @@ -64,18 +65,25 @@ fn splits(s: &str, n: usize) -> Vec> { out } +#[derive(Copy, Clone, PartialEq, Eq)] +enum Mode { + Normal, + ExactErrors, + ZeroCopy, +} + struct TokenLogger { tokens: Vec, current_str: String, - exact_errors: bool, + mode: Mode, } impl TokenLogger { - fn new(exact_errors: bool) -> TokenLogger { + fn new(mode: Mode) -> TokenLogger { TokenLogger { tokens: vec!(), current_str: String::new(), - exact_errors: exact_errors, + mode: mode, } } @@ -88,7 +96,7 @@ impl TokenLogger { fn finish_str(&mut self) { if self.current_str.len() > 0 { let s = replace(&mut self.current_str, String::new()); - self.tokens.push(CharacterTokens(s)); + self.tokens.push(CharacterTokens(Tendril::owned(s))); } } @@ -109,7 +117,7 @@ impl TokenSink for TokenLogger { self.current_str.push('\0'); } - ParseError(_) => if self.exact_errors { + ParseError(_) => if self.mode == Mode::ExactErrors { self.push(ParseError(Borrowed(""))); }, @@ -134,11 +142,15 @@ impl TokenSink for TokenLogger { } } -fn tokenize(input: Vec, opts: TokenizerOpts) -> Vec { - let sink = TokenLogger::new(opts.exact_errors); +fn tokenize(input: Vec, mode: Mode, opts: TokenizerOpts) -> Vec { + let sink = TokenLogger::new(mode); let mut tok = Tokenizer::new(sink, opts); for chunk in input.into_iter() { - tok.feed(chunk); + tok.feed(if mode == Mode::ZeroCopy { + Tendril::shared_copy(&chunk) + } else { + Tendril::owned(chunk) + }); } tok.end(); tok.unwrap().get_tokens() @@ -147,6 +159,7 @@ fn tokenize(input: Vec, opts: TokenizerOpts) -> Vec { trait JsonExt { fn get_str(&self) -> String; fn get_nullable_str(&self) -> Option; + fn get_nullable_tendril(&self) -> Option; fn get_bool(&self) -> bool; fn get_obj<'t>(&'t self) -> &'t BTreeMap; fn get_list<'t>(&'t self) -> &'t Vec; @@ -169,6 +182,10 @@ impl JsonExt for Json { } } + fn get_nullable_tendril(&self) -> Option { + self.get_nullable_str().map(Tendril::owned) + } + fn get_bool(&self) -> bool { match *self { Json::Boolean(b) => b, @@ -202,9 +219,9 @@ fn json_to_token(js: &Json) -> Token { let args: Vec<&Json> = parts[1..].iter().collect(); match (parts[0].get_str().as_slice(), args.as_slice()) { ("DOCTYPE", [name, public_id, system_id, correct]) => DoctypeToken(Doctype { - name: name.get_nullable_str(), - public_id: public_id.get_nullable_str(), - system_id: system_id.get_nullable_str(), + name: name.get_nullable_tendril(), + public_id: public_id.get_nullable_tendril(), + system_id: system_id.get_nullable_tendril(), force_quirks: !correct.get_bool(), }), @@ -214,7 +231,7 @@ fn json_to_token(js: &Json) -> Token { attrs: attrs.get_obj().iter().map(|(k,v)| { Attribute { name: QualName::new(ns!(""), 
Atom::from_slice(k.as_slice())), - value: v.get_str() + value: Tendril::owned(v.get_str()), } }).collect(), self_closing: match rest { @@ -230,9 +247,9 @@ fn json_to_token(js: &Json) -> Token { self_closing: false }), - ("Comment", [txt]) => CommentToken(txt.get_str()), + ("Comment", [txt]) => CommentToken(Tendril::owned(txt.get_str())), - ("Character", [txt]) => CharacterTokens(txt.get_str()), + ("Character", [txt]) => CharacterTokens(Tendril::owned(txt.get_str())), // We don't need to produce NullCharacterToken because // the TokenLogger will convert them to CharacterTokens. @@ -242,10 +259,10 @@ fn json_to_token(js: &Json) -> Token { } // Parse the "output" field of the test case into a vector of tokens. -fn json_to_tokens(js: &Json, exact_errors: bool) -> Vec { +fn json_to_tokens(js: &Json, mode: Mode) -> Vec { // Use a TokenLogger so that we combine character tokens separated // by an ignored error. - let mut sink = TokenLogger::new(exact_errors); + let mut sink = TokenLogger::new(mode); for tok in js.get_list().iter() { match *tok { Json::String(ref s) @@ -301,7 +318,7 @@ fn unescape_json(js: &Json) -> Json { } } -fn mk_test(desc: String, input: String, expect: Vec, opts: TokenizerOpts) +fn mk_test(desc: String, input: String, expect: Vec, mode: Mode, opts: TokenizerOpts) -> TestDescAndFn { TestDescAndFn { desc: TestDesc { @@ -317,7 +334,7 @@ fn mk_test(desc: String, input: String, expect: Vec, opts: TokenizerOpts) // Also clone opts. If we don't, we get the wrong // result but the compiler doesn't catch it! // Possibly mozilla/rust#12223. - let output = tokenize(input.clone(), opts.clone()); + let output = tokenize(input.clone(), mode, opts.clone()); if output != expect { panic!("\ninput: {:?}\ngot: {:?}\nexpected: {:?}", input, output, expect); @@ -362,19 +379,21 @@ fn mk_tests(tests: &mut Vec, filename: &str, js: &Json) { // Build the tests. for state in state_overrides.into_iter() { - for &exact_errors in [false, true].iter() { + for &mode in &[Mode::Normal, Mode::ExactErrors, Mode::ZeroCopy] { let mut newdesc = desc.clone(); match state { Some(s) => newdesc = format!("{} (in state {:?})", newdesc, s), None => (), }; - if exact_errors { - newdesc = format!("{} (exact errors)", newdesc); + match mode { + Mode::Normal => (), + Mode::ExactErrors => newdesc = format!("{} (exact errors)", newdesc), + Mode::ZeroCopy => newdesc = format!("{} (zero copy)", newdesc), } - let expect_toks = json_to_tokens(&expect, exact_errors); - tests.push(mk_test(newdesc, input.clone(), expect_toks, TokenizerOpts { - exact_errors: exact_errors, + let expect_toks = json_to_tokens(&expect, mode); + tests.push(mk_test(newdesc, input.clone(), expect_toks, mode, TokenizerOpts { + exact_errors: mode == Mode::ExactErrors, initial_state: state, last_start_tag_name: start_tag.clone(),