Skip to content

Commit 3942a80

Browse files
committed
Implement zero-copy parsing
Based on servo#60 by cgaebel.
1 parent bf129b8 commit 3942a80

28 files changed

+1191
-283
lines changed

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ phf = "0"
99
phf_macros = "0"
1010
time = "0"
1111
log = "0"
12+
iobuf = "5"
1213

1314
[dependencies.string_cache]
1415
git = "https://github.com/servo/string-cache"

Makefile.in

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ RUST_DIRS := -L $(VPATH)/target/debug -L $(VPATH)/target/debug/deps
1414

1515
RUSTC_CMD := $(RUSTC) -D warnings -C rpath $(RUST_DIRS) \
1616
--extern time=`find $(VPATH)/target/debug/deps -name 'libtime-*.rlib'` \
17-
--extern log=`find $(VPATH)/target/debug/deps -name 'liblog-*.rlib'` \
1817
$(RUSTFLAGS)
1918

2019
# We build the library itself using Cargo.

benches/tokenizer.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
// option. This file may not be copied, modified, or distributed
88
// except according to those terms.
99

10-
#![feature(box_syntax, core, std_misc, start, test, io, path)]
10+
#![feature(box_syntax, core, std_misc, start, test)]
1111

1212
extern crate test;
1313
extern crate html5ever;
@@ -21,6 +21,7 @@ use test::{black_box, Bencher, TestDesc, TestDescAndFn};
2121
use test::{DynTestName, DynBenchFn, TDynBenchFn};
2222
use test::ShouldPanic::No;
2323

24+
use html5ever::Tendril;
2425
use html5ever::tokenizer::{TokenSink, Token, Tokenizer, TokenizerOpts};
2526

2627
struct Sink;

examples/noop-tokenize.rs renamed to examples/noop-tokenize-zerocopy.rs

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,21 +7,19 @@
77
// option. This file may not be copied, modified, or distributed
88
// except according to those terms.
99

10-
// Run a single benchmark once. For use with profiling tools.
11-
1210
#![feature(test)]
1311

1412
extern crate test;
1513
extern crate html5ever;
1614

1715
use std::io;
18-
use std::io::prelude::*;
1916
use std::default::Default;
2017

2118
use test::black_box;
2219

20+
use html5ever::TendrilReader;
2321
use html5ever::tokenizer::{TokenSink, Token, TokenizerOpts};
24-
use html5ever::driver::{tokenize_to, one_input};
22+
use html5ever::driver::tokenize_to;
2523

2624
struct Sink;
2725

@@ -34,10 +32,10 @@ impl TokenSink for Sink {
3432
}
3533

3634
fn main() {
37-
let mut input = String::new();
38-
io::stdin().read_to_string(&mut input).unwrap();
35+
let reader = TendrilReader::from_utf8(16384, io::stdin())
36+
.map(|r| r.unwrap());
3937

40-
tokenize_to(Sink, one_input(input), TokenizerOpts {
38+
tokenize_to(Sink, reader, TokenizerOpts {
4139
profile: true,
4240
.. Default::default()
4341
});

examples/noop-tree-builder.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ use std::collections::HashMap;
1818
use std::borrow::Cow;
1919
use string_cache::QualName;
2020

21-
use html5ever::{parse_to, one_input};
21+
use html5ever::{parse_to, one_input, Tendril};
2222
use html5ever::tokenizer::Attribute;
2323
use html5ever::tree_builder::{TreeSink, QuirksMode, NodeOrText};
2424

@@ -56,7 +56,7 @@ impl TreeSink for Sink {
5656
id
5757
}
5858

59-
fn create_comment(&mut self, _text: String) -> usize {
59+
fn create_comment(&mut self, _text: Tendril) -> usize {
6060
self.get_id()
6161
}
6262

@@ -72,7 +72,7 @@ impl TreeSink for Sink {
7272
fn set_quirks_mode(&mut self, _mode: QuirksMode) { }
7373
fn append(&mut self, _parent: usize, _child: NodeOrText<usize>) { }
7474

75-
fn append_doctype_to_document(&mut self, _name: String, _public_id: String, _system_id: String) { }
75+
fn append_doctype_to_document(&mut self, _name: Tendril, _public_id: Tendril, _system_id: Tendril) { }
7676
fn add_attrs_if_missing(&mut self, _target: usize, _attrs: Vec<Attribute>) { }
7777
fn remove_from_parent(&mut self, _target: usize) { }
7878
fn reparent_children(&mut self, _node: usize, _new_parent: usize) { }

examples/print-tree-actions.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ use std::collections::HashMap;
2020
use std::borrow::Cow;
2121
use string_cache::QualName;
2222

23-
use html5ever::{parse_to, one_input};
23+
use html5ever::{parse_to, one_input, Tendril};
2424
use html5ever::tokenizer::Attribute;
2525
use html5ever::tree_builder::{TreeSink, QuirksMode, NodeOrText, AppendNode, AppendText};
2626

@@ -67,7 +67,7 @@ impl TreeSink for Sink {
6767
id
6868
}
6969

70-
fn create_comment(&mut self, text: String) -> usize {
70+
fn create_comment(&mut self, text: Tendril) -> usize {
7171
let id = self.get_id();
7272
println!("Created comment \"{}\" as {}", text.escape_default(), id);
7373
id
@@ -97,7 +97,7 @@ impl TreeSink for Sink {
9797
Ok(())
9898
}
9999

100-
fn append_doctype_to_document(&mut self, name: String, public_id: String, system_id: String) {
100+
fn append_doctype_to_document(&mut self, name: Tendril, public_id: Tendril, system_id: Tendril) {
101101
println!("Append doctype: {} {} {}", name, public_id, system_id);
102102
}
103103

macros/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
#![crate_type="dylib"]
1212

1313
#![feature(plugin_registrar, quote)]
14-
#![feature(rustc_private, core, std_misc)]
14+
#![feature(rustc_private, core, std_misc, str_char)]
1515
#![deny(warnings)]
1616

1717
extern crate syntax;

src/driver.rs

Lines changed: 27 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,17 @@
1111
1212
use core::prelude::*;
1313

14+
use util::tendril::IntoTendril;
1415
use tokenizer::{TokenizerOpts, Tokenizer, TokenSink};
1516
use tree_builder::{TreeBuilderOpts, TreeBuilder, TreeSink};
1617

1718
use core::default::Default;
1819
use core::option;
19-
use collections::string::String;
2020

2121
use string_cache::{Atom, QualName};
2222

23-
/// Convenience function to turn a single `String` into an iterator.
24-
pub fn one_input(x: String) -> option::IntoIter<String> {
23+
/// Convenience function to turn a single value into an iterator.
24+
pub fn one_input<T>(x: T) -> option::IntoIter<T> {
2525
Some(x).into_iter()
2626
}
2727

@@ -33,14 +33,11 @@ pub fn one_input(x: String) -> option::IntoIter<String> {
3333
/// let mut sink = MySink;
3434
/// tokenize_to(&mut sink, one_input(my_str), Default::default());
3535
/// ```
36-
pub fn tokenize_to<
37-
Sink: TokenSink,
38-
It: Iterator<Item=String>
39-
>(
40-
sink: Sink,
41-
input: It,
42-
opts: TokenizerOpts) -> Sink {
43-
36+
pub fn tokenize_to<Sink, T, It>(sink: Sink, input: It, opts: TokenizerOpts) -> Sink
37+
where Sink: TokenSink,
38+
T: IntoTendril,
39+
It: Iterator<Item=T>,
40+
{
4441
let mut tok = Tokenizer::new(sink, opts);
4542
for s in input {
4643
tok.feed(s);
@@ -67,14 +64,11 @@ pub struct ParseOpts {
6764
/// let mut sink = MySink;
6865
/// parse_to(&mut sink, one_input(my_str), Default::default());
6966
/// ```
70-
pub fn parse_to<
71-
Sink: TreeSink,
72-
It: Iterator<Item=String>
73-
>(
74-
sink: Sink,
75-
input: It,
76-
opts: ParseOpts) -> Sink {
77-
67+
pub fn parse_to<Sink, T, It>(sink: Sink, input: It, opts: ParseOpts) -> Sink
68+
where Sink: TreeSink,
69+
T: IntoTendril,
70+
It: Iterator<Item=T>,
71+
{
7872
let tb = TreeBuilder::new(sink, opts.tree_builder);
7973
let mut tok = Tokenizer::new(tb, opts.tokenizer);
8074
for s in input {
@@ -92,15 +86,14 @@ pub fn parse_to<
9286
/// let mut sink = MySink;
9387
/// parse_fragment_to(&mut sink, one_input(my_str), context_token, Default::default());
9488
/// ```
95-
pub fn parse_fragment_to<
96-
Sink: TreeSink,
97-
It: Iterator<Item=String>
98-
>(
99-
sink: Sink,
100-
input: It,
101-
context: Atom,
102-
opts: ParseOpts) -> Sink {
103-
89+
pub fn parse_fragment_to<Sink, T, It>(sink: Sink,
90+
input: It,
91+
context: Atom,
92+
opts: ParseOpts) -> Sink
93+
where Sink: TreeSink,
94+
T: IntoTendril,
95+
It: Iterator<Item=T>
96+
{
10497
let mut sink = sink;
10598
let context_elem = sink.create_element(QualName::new(ns!(HTML), context), vec!());
10699
let tb = TreeBuilder::new_for_fragment(sink, context_elem, None, opts.tree_builder);
@@ -132,9 +125,10 @@ pub trait ParseResult {
132125
/// ```ignore
133126
/// let dom: RcDom = parse(one_input(my_str), Default::default());
134127
/// ```
135-
pub fn parse<Output, It>(input: It, opts: ParseOpts) -> Output
128+
pub fn parse<Output, T, It>(input: It, opts: ParseOpts) -> Output
136129
where Output: ParseResult,
137-
It: Iterator<Item=String>,
130+
T: IntoTendril,
131+
It: Iterator<Item=T>,
138132
{
139133
let sink = parse_to(Default::default(), input, opts);
140134
ParseResult::get_result(sink)
@@ -147,9 +141,10 @@ pub fn parse<Output, It>(input: It, opts: ParseOpts) -> Output
147141
/// ```ignore
148142
/// let dom: RcDom = parse_fragment(one_input(my_str), context_token, Default::default());
149143
/// ```
150-
pub fn parse_fragment<Output, It>(input: It, context: Atom, opts: ParseOpts) -> Output
144+
pub fn parse_fragment<Output, T, It>(input: It, context: Atom, opts: ParseOpts) -> Output
151145
where Output: ParseResult,
152-
It: Iterator<Item=String>,
146+
T: IntoTendril,
147+
It: Iterator<Item=T>,
153148
{
154149
let sink = parse_fragment_to(Default::default(), input, context, opts);
155150
ParseResult::get_result(sink)

src/for_c/common.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ use libc::{size_t, c_int, c_char, strlen};
2020

2121
use string_cache::Atom;
2222

23+
use util::tendril::Tendril;
24+
2325
#[repr(C)]
2426
pub struct h5e_buf {
2527
data: *const u8,
@@ -82,6 +84,12 @@ impl AsLifetimeBuf for String {
8284
}
8385
}
8486

87+
impl AsLifetimeBuf for Tendril {
88+
fn as_lifetime_buf<'a>(&'a self) -> LifetimeBuf<'a> {
89+
LifetimeBuf::from_str(self.as_slice())
90+
}
91+
}
92+
8593
impl AsLifetimeBuf for Atom {
8694
fn as_lifetime_buf<'a>(&'a self) -> LifetimeBuf<'a> {
8795
LifetimeBuf::from_str(self.as_slice())

src/for_c/tokenizer.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
use core::prelude::*;
1313

14+
use util::tendril::Tendril;
1415
use for_c::common::{LifetimeBuf, AsLifetimeBuf, h5e_buf, c_bool};
1516

1617
use tokenizer::{TokenSink, Token, Doctype, Tag, ParseError, DoctypeToken};
@@ -64,7 +65,7 @@ impl TokenSink for *mut h5e_token_sink {
6465
($name:ident) => (call!($name,)); // bleh
6566
}
6667

67-
fn opt_str_to_buf<'a>(s: &'a Option<String>) -> LifetimeBuf<'a> {
68+
fn opt_str_to_buf<'a>(s: &'a Option<Tendril>) -> LifetimeBuf<'a> {
6869
match *s {
6970
None => LifetimeBuf::null(),
7071
Some(ref s) => s.as_lifetime_buf(),

src/lib.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
#![crate_name="html5ever"]
1111
#![crate_type="dylib"]
1212

13-
#![feature(plugin, box_syntax, no_std, core, collections, alloc)]
13+
#![feature(plugin, box_syntax, no_std, core, collections, alloc, str_char)]
1414
#![deny(warnings)]
1515
#![allow(unused_parens)]
1616

@@ -49,6 +49,9 @@ extern crate phf;
4949

5050
extern crate time;
5151

52+
extern crate iobuf;
53+
54+
pub use util::tendril::{Tendril, TendrilReader, TendrilReaderError, IntoTendril};
5255
pub use tokenizer::Attribute;
5356
pub use driver::{one_input, ParseOpts, parse_to, parse_fragment_to, parse, parse_fragment};
5457

@@ -61,6 +64,7 @@ mod macros;
6164
#[macro_use]
6265
mod util {
6366
pub mod str;
67+
pub mod tendril;
6468
#[macro_use] pub mod smallcharset;
6569
}
6670

src/sink/common.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,10 @@
77
// option. This file may not be copied, modified, or distributed
88
// except according to those terms.
99

10+
use util::tendril::Tendril;
1011
use tokenizer::Attribute;
1112

1213
use collections::vec::Vec;
13-
use collections::string::String;
1414
use string_cache::QualName;
1515

1616
pub use self::NodeEnum::{Document, Doctype, Text, Comment, Element};
@@ -22,13 +22,13 @@ pub enum NodeEnum {
2222
Document,
2323

2424
/// A `DOCTYPE` with name, public id, and system id.
25-
Doctype(String, String, String),
25+
Doctype(Tendril, Tendril, Tendril),
2626

2727
/// A text node.
28-
Text(String),
28+
Text(Tendril),
2929

3030
/// A comment.
31-
Comment(String),
31+
Comment(Tendril),
3232

3333
/// An element with attributes.
3434
Element(QualName, Vec<Attribute>),

src/sink/owned_dom.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ use core::prelude::*;
2323

2424
use sink::common::{NodeEnum, Document, Doctype, Text, Comment, Element};
2525

26+
use util::tendril::Tendril;
2627
use tokenizer::Attribute;
2728
use tree_builder::{TreeSink, QuirksMode, NodeOrText, AppendNode, AppendText};
2829
use tree_builder;
@@ -38,7 +39,6 @@ use core::mem;
3839
use core::ptr;
3940
use alloc::boxed::Box;
4041
use collections::vec::Vec;
41-
use collections::string::String;
4242
use std::borrow::Cow;
4343
use std::io::{self, Write};
4444
use std::collections::HashSet;
@@ -215,7 +215,7 @@ impl TreeSink for Sink {
215215
self.new_node(Element(name, attrs))
216216
}
217217

218-
fn create_comment(&mut self, text: String) -> Handle {
218+
fn create_comment(&mut self, text: Tendril) -> Handle {
219219
self.new_node(Comment(text))
220220
}
221221

@@ -269,7 +269,7 @@ impl TreeSink for Sink {
269269
Ok(())
270270
}
271271

272-
fn append_doctype_to_document(&mut self, name: String, public_id: String, system_id: String) {
272+
fn append_doctype_to_document(&mut self, name: Tendril, public_id: Tendril, system_id: Tendril) {
273273
append(self.document, self.new_node(Doctype(name, public_id, system_id)));
274274
}
275275

0 commit comments

Comments
 (0)