Skip to content

Commit e2f1d18

Browse files
committed
Implement zero-copy parsing
Based on servo#60 by cgaebel.
1 parent bf129b8 commit e2f1d18

28 files changed

+1191
-281
lines changed

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ phf = "0"
99
phf_macros = "0"
1010
time = "0"
1111
log = "0"
12+
iobuf = "5"
1213

1314
[dependencies.string_cache]
1415
git = "https://github.com/servo/string-cache"

Makefile.in

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ RUST_DIRS := -L $(VPATH)/target/debug -L $(VPATH)/target/debug/deps
1414

1515
RUSTC_CMD := $(RUSTC) -D warnings -C rpath $(RUST_DIRS) \
1616
--extern time=`find $(VPATH)/target/debug/deps -name 'libtime-*.rlib'` \
17-
--extern log=`find $(VPATH)/target/debug/deps -name 'liblog-*.rlib'` \
1817
$(RUSTFLAGS)
1918

2019
# We build the library itself using Cargo.

benches/tokenizer.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
// option. This file may not be copied, modified, or distributed
88
// except according to those terms.
99

10-
#![feature(box_syntax, core, std_misc, start, test, io, path)]
10+
#![feature(box_syntax, core, std_misc, start, test)]
1111

1212
extern crate test;
1313
extern crate html5ever;
@@ -21,6 +21,7 @@ use test::{black_box, Bencher, TestDesc, TestDescAndFn};
2121
use test::{DynTestName, DynBenchFn, TDynBenchFn};
2222
use test::ShouldPanic::No;
2323

24+
use html5ever::Tendril;
2425
use html5ever::tokenizer::{TokenSink, Token, Tokenizer, TokenizerOpts};
2526

2627
struct Sink;

examples/noop-tokenize.rs renamed to examples/noop-tokenize-zerocopy.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,13 @@ extern crate test;
1515
extern crate html5ever;
1616

1717
use std::io;
18-
use std::io::prelude::*;
1918
use std::default::Default;
2019

2120
use test::black_box;
2221

22+
use html5ever::TendrilReader;
2323
use html5ever::tokenizer::{TokenSink, Token, TokenizerOpts};
24-
use html5ever::driver::{tokenize_to, one_input};
24+
use html5ever::driver::tokenize_to;
2525

2626
struct Sink;
2727

@@ -34,10 +34,10 @@ impl TokenSink for Sink {
3434
}
3535

3636
fn main() {
37-
let mut input = String::new();
38-
io::stdin().read_to_string(&mut input).unwrap();
37+
let reader = TendrilReader::from_utf8(16384, io::stdin())
38+
.map(|r| r.unwrap());
3939

40-
tokenize_to(Sink, one_input(input), TokenizerOpts {
40+
tokenize_to(Sink, reader, TokenizerOpts {
4141
profile: true,
4242
.. Default::default()
4343
});

examples/noop-tree-builder.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ use std::collections::HashMap;
1818
use std::borrow::Cow;
1919
use string_cache::QualName;
2020

21-
use html5ever::{parse_to, one_input};
21+
use html5ever::{parse_to, one_input, Tendril};
2222
use html5ever::tokenizer::Attribute;
2323
use html5ever::tree_builder::{TreeSink, QuirksMode, NodeOrText};
2424

@@ -56,7 +56,7 @@ impl TreeSink for Sink {
5656
id
5757
}
5858

59-
fn create_comment(&mut self, _text: String) -> usize {
59+
fn create_comment(&mut self, _text: Tendril) -> usize {
6060
self.get_id()
6161
}
6262

@@ -72,7 +72,7 @@ impl TreeSink for Sink {
7272
fn set_quirks_mode(&mut self, _mode: QuirksMode) { }
7373
fn append(&mut self, _parent: usize, _child: NodeOrText<usize>) { }
7474

75-
fn append_doctype_to_document(&mut self, _name: String, _public_id: String, _system_id: String) { }
75+
fn append_doctype_to_document(&mut self, _name: Tendril, _public_id: Tendril, _system_id: Tendril) { }
7676
fn add_attrs_if_missing(&mut self, _target: usize, _attrs: Vec<Attribute>) { }
7777
fn remove_from_parent(&mut self, _target: usize) { }
7878
fn reparent_children(&mut self, _node: usize, _new_parent: usize) { }

examples/print-tree-actions.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ use std::collections::HashMap;
2020
use std::borrow::Cow;
2121
use string_cache::QualName;
2222

23-
use html5ever::{parse_to, one_input};
23+
use html5ever::{parse_to, one_input, Tendril};
2424
use html5ever::tokenizer::Attribute;
2525
use html5ever::tree_builder::{TreeSink, QuirksMode, NodeOrText, AppendNode, AppendText};
2626

@@ -67,7 +67,7 @@ impl TreeSink for Sink {
6767
id
6868
}
6969

70-
fn create_comment(&mut self, text: String) -> usize {
70+
fn create_comment(&mut self, text: Tendril) -> usize {
7171
let id = self.get_id();
7272
println!("Created comment \"{}\" as {}", text.escape_default(), id);
7373
id
@@ -97,7 +97,7 @@ impl TreeSink for Sink {
9797
Ok(())
9898
}
9999

100-
fn append_doctype_to_document(&mut self, name: String, public_id: String, system_id: String) {
100+
fn append_doctype_to_document(&mut self, name: Tendril, public_id: Tendril, system_id: Tendril) {
101101
println!("Append doctype: {} {} {}", name, public_id, system_id);
102102
}
103103

macros/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
#![crate_type="dylib"]
1212

1313
#![feature(plugin_registrar, quote)]
14-
#![feature(rustc_private, core, std_misc)]
14+
#![feature(rustc_private, core, std_misc, str_char)]
1515
#![deny(warnings)]
1616

1717
extern crate syntax;

src/driver.rs

Lines changed: 27 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,17 @@
1111
1212
use core::prelude::*;
1313

14+
use util::tendril::IntoTendril;
1415
use tokenizer::{TokenizerOpts, Tokenizer, TokenSink};
1516
use tree_builder::{TreeBuilderOpts, TreeBuilder, TreeSink};
1617

1718
use core::default::Default;
1819
use core::option;
19-
use collections::string::String;
2020

2121
use string_cache::{Atom, QualName};
2222

23-
/// Convenience function to turn a single `String` into an iterator.
24-
pub fn one_input(x: String) -> option::IntoIter<String> {
23+
/// Convenience function to turn a single value into an iterator.
24+
pub fn one_input<T>(x: T) -> option::IntoIter<T> {
2525
Some(x).into_iter()
2626
}
2727

@@ -33,14 +33,11 @@ pub fn one_input(x: String) -> option::IntoIter<String> {
3333
/// let mut sink = MySink;
3434
/// tokenize_to(&mut sink, one_input(my_str), Default::default());
3535
/// ```
36-
pub fn tokenize_to<
37-
Sink: TokenSink,
38-
It: Iterator<Item=String>
39-
>(
40-
sink: Sink,
41-
input: It,
42-
opts: TokenizerOpts) -> Sink {
43-
36+
pub fn tokenize_to<Sink, T, It>(sink: Sink, input: It, opts: TokenizerOpts) -> Sink
37+
where Sink: TokenSink,
38+
T: IntoTendril,
39+
It: Iterator<Item=T>,
40+
{
4441
let mut tok = Tokenizer::new(sink, opts);
4542
for s in input {
4643
tok.feed(s);
@@ -67,14 +64,11 @@ pub struct ParseOpts {
6764
/// let mut sink = MySink;
6865
/// parse_to(&mut sink, one_input(my_str), Default::default());
6966
/// ```
70-
pub fn parse_to<
71-
Sink: TreeSink,
72-
It: Iterator<Item=String>
73-
>(
74-
sink: Sink,
75-
input: It,
76-
opts: ParseOpts) -> Sink {
77-
67+
pub fn parse_to<Sink, T, It>(sink: Sink, input: It, opts: ParseOpts) -> Sink
68+
where Sink: TreeSink,
69+
T: IntoTendril,
70+
It: Iterator<Item=T>,
71+
{
7872
let tb = TreeBuilder::new(sink, opts.tree_builder);
7973
let mut tok = Tokenizer::new(tb, opts.tokenizer);
8074
for s in input {
@@ -92,15 +86,14 @@ pub fn parse_to<
9286
/// let mut sink = MySink;
9387
/// parse_fragment_to(&mut sink, one_input(my_str), context_token, Default::default());
9488
/// ```
95-
pub fn parse_fragment_to<
96-
Sink: TreeSink,
97-
It: Iterator<Item=String>
98-
>(
99-
sink: Sink,
100-
input: It,
101-
context: Atom,
102-
opts: ParseOpts) -> Sink {
103-
89+
pub fn parse_fragment_to<Sink, T, It>(sink: Sink,
90+
input: It,
91+
context: Atom,
92+
opts: ParseOpts) -> Sink
93+
where Sink: TreeSink,
94+
T: IntoTendril,
95+
It: Iterator<Item=T>
96+
{
10497
let mut sink = sink;
10598
let context_elem = sink.create_element(QualName::new(ns!(HTML), context), vec!());
10699
let tb = TreeBuilder::new_for_fragment(sink, context_elem, None, opts.tree_builder);
@@ -132,9 +125,10 @@ pub trait ParseResult {
132125
/// ```ignore
133126
/// let dom: RcDom = parse(one_input(my_str), Default::default());
134127
/// ```
135-
pub fn parse<Output, It>(input: It, opts: ParseOpts) -> Output
128+
pub fn parse<Output, T, It>(input: It, opts: ParseOpts) -> Output
136129
where Output: ParseResult,
137-
It: Iterator<Item=String>,
130+
T: IntoTendril,
131+
It: Iterator<Item=T>,
138132
{
139133
let sink = parse_to(Default::default(), input, opts);
140134
ParseResult::get_result(sink)
@@ -147,9 +141,10 @@ pub fn parse<Output, It>(input: It, opts: ParseOpts) -> Output
147141
/// ```ignore
148142
/// let dom: RcDom = parse_fragment(one_input(my_str), context_token, Default::default());
149143
/// ```
150-
pub fn parse_fragment<Output, It>(input: It, context: Atom, opts: ParseOpts) -> Output
144+
pub fn parse_fragment<Output, T, It>(input: It, context: Atom, opts: ParseOpts) -> Output
151145
where Output: ParseResult,
152-
It: Iterator<Item=String>,
146+
T: IntoTendril,
147+
It: Iterator<Item=T>,
153148
{
154149
let sink = parse_fragment_to(Default::default(), input, context, opts);
155150
ParseResult::get_result(sink)

src/for_c/common.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ use libc::{size_t, c_int, c_char, strlen};
2020

2121
use string_cache::Atom;
2222

23+
use util::tendril::Tendril;
24+
2325
#[repr(C)]
2426
pub struct h5e_buf {
2527
data: *const u8,
@@ -82,6 +84,12 @@ impl AsLifetimeBuf for String {
8284
}
8385
}
8486

87+
impl AsLifetimeBuf for Tendril {
88+
fn as_lifetime_buf<'a>(&'a self) -> LifetimeBuf<'a> {
89+
LifetimeBuf::from_str(self.as_slice())
90+
}
91+
}
92+
8593
impl AsLifetimeBuf for Atom {
8694
fn as_lifetime_buf<'a>(&'a self) -> LifetimeBuf<'a> {
8795
LifetimeBuf::from_str(self.as_slice())

src/for_c/tokenizer.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
use core::prelude::*;
1313

14+
use util::tendril::Tendril;
1415
use for_c::common::{LifetimeBuf, AsLifetimeBuf, h5e_buf, c_bool};
1516

1617
use tokenizer::{TokenSink, Token, Doctype, Tag, ParseError, DoctypeToken};
@@ -64,7 +65,7 @@ impl TokenSink for *mut h5e_token_sink {
6465
($name:ident) => (call!($name,)); // bleh
6566
}
6667

67-
fn opt_str_to_buf<'a>(s: &'a Option<String>) -> LifetimeBuf<'a> {
68+
fn opt_str_to_buf<'a>(s: &'a Option<Tendril>) -> LifetimeBuf<'a> {
6869
match *s {
6970
None => LifetimeBuf::null(),
7071
Some(ref s) => s.as_lifetime_buf(),

0 commit comments

Comments
 (0)