Skip to content

Implement zero-copy parsing #114

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ phf = "0"
phf_macros = "0"
time = "0"
log = "0"
iobuf = "5"

[dependencies.string_cache]
git = "https://github.com/servo/string-cache"
Expand Down
1 change: 0 additions & 1 deletion Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ RUST_DIRS := -L $(VPATH)/target/debug -L $(VPATH)/target/debug/deps

RUSTC_CMD := $(RUSTC) -D warnings -C rpath $(RUST_DIRS) \
--extern time=`find $(VPATH)/target/debug/deps -name 'libtime-*.rlib'` \
--extern log=`find $(VPATH)/target/debug/deps -name 'liblog-*.rlib'` \
$(RUSTFLAGS)

# We build the library itself using Cargo.
Expand Down
3 changes: 2 additions & 1 deletion benches/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.

#![feature(box_syntax, core, std_misc, start, test, io, path)]
#![feature(box_syntax, core, std_misc, start, test)]

extern crate test;
extern crate html5ever;
Expand All @@ -21,6 +21,7 @@ use test::{black_box, Bencher, TestDesc, TestDescAndFn};
use test::{DynTestName, DynBenchFn, TDynBenchFn};
use test::ShouldPanic::No;

use html5ever::Tendril;
use html5ever::tokenizer::{TokenSink, Token, Tokenizer, TokenizerOpts};

struct Sink;
Expand Down
21 changes: 7 additions & 14 deletions examples/noop-tokenize.rs → examples/noop-tokenize-zerocopy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,19 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// Run a single benchmark once. For use with profiling tools.

#![feature(core, test)]
#![feature(test)]

extern crate test;
extern crate html5ever;

use std::{fs, env};
use std::io::prelude::*;
use std::io;
use std::default::Default;

use test::black_box;

use html5ever::TendrilReader;
use html5ever::tokenizer::{TokenSink, Token, TokenizerOpts};
use html5ever::driver::{tokenize_to, one_input};
use html5ever::driver::tokenize_to;

struct Sink;

Expand All @@ -34,15 +32,10 @@ impl TokenSink for Sink {
}

fn main() {
let mut path = env::current_exe().unwrap();
path.push("../data/bench/");
path.push(env::args().nth(1).unwrap().as_slice());

let mut file = fs::File::open(&path).unwrap();
let mut file_input = String::new();
file.read_to_string(&mut file_input).unwrap();
let reader = TendrilReader::from_utf8(16384, io::stdin())
.map(|r| r.unwrap());

tokenize_to(Sink, one_input(file_input), TokenizerOpts {
tokenize_to(Sink, reader, TokenizerOpts {
profile: true,
.. Default::default()
});
Expand Down
6 changes: 3 additions & 3 deletions examples/noop-tree-builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ use std::collections::HashMap;
use std::borrow::Cow;
use string_cache::QualName;

use html5ever::{parse_to, one_input};
use html5ever::{parse_to, one_input, Tendril};
use html5ever::tokenizer::Attribute;
use html5ever::tree_builder::{TreeSink, QuirksMode, NodeOrText};

Expand Down Expand Up @@ -56,7 +56,7 @@ impl TreeSink for Sink {
id
}

fn create_comment(&mut self, _text: String) -> usize {
fn create_comment(&mut self, _text: Tendril) -> usize {
self.get_id()
}

Expand All @@ -72,7 +72,7 @@ impl TreeSink for Sink {
fn set_quirks_mode(&mut self, _mode: QuirksMode) { }
fn append(&mut self, _parent: usize, _child: NodeOrText<usize>) { }

fn append_doctype_to_document(&mut self, _name: String, _public_id: String, _system_id: String) { }
fn append_doctype_to_document(&mut self, _name: Tendril, _public_id: Tendril, _system_id: Tendril) { }
fn add_attrs_if_missing(&mut self, _target: usize, _attrs: Vec<Attribute>) { }
fn remove_from_parent(&mut self, _target: usize) { }
fn reparent_children(&mut self, _node: usize, _new_parent: usize) { }
Expand Down
6 changes: 3 additions & 3 deletions examples/print-tree-actions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ use std::collections::HashMap;
use std::borrow::Cow;
use string_cache::QualName;

use html5ever::{parse_to, one_input};
use html5ever::{parse_to, one_input, Tendril};
use html5ever::tokenizer::Attribute;
use html5ever::tree_builder::{TreeSink, QuirksMode, NodeOrText, AppendNode, AppendText};

Expand Down Expand Up @@ -67,7 +67,7 @@ impl TreeSink for Sink {
id
}

fn create_comment(&mut self, text: String) -> usize {
fn create_comment(&mut self, text: Tendril) -> usize {
let id = self.get_id();
println!("Created comment \"{}\" as {}", text.escape_default(), id);
id
Expand Down Expand Up @@ -97,7 +97,7 @@ impl TreeSink for Sink {
Ok(())
}

fn append_doctype_to_document(&mut self, name: String, public_id: String, system_id: String) {
fn append_doctype_to_document(&mut self, name: Tendril, public_id: Tendril, system_id: Tendril) {
println!("Append doctype: {} {} {}", name, public_id, system_id);
}

Expand Down
2 changes: 1 addition & 1 deletion macros/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#![crate_type="dylib"]

#![feature(plugin_registrar, quote)]
#![feature(rustc_private, core, std_misc)]
#![feature(rustc_private, core, std_misc, str_char)]
#![deny(warnings)]

extern crate syntax;
Expand Down
59 changes: 27 additions & 32 deletions src/driver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,17 @@

use core::prelude::*;

use util::tendril::IntoTendril;
use tokenizer::{TokenizerOpts, Tokenizer, TokenSink};
use tree_builder::{TreeBuilderOpts, TreeBuilder, TreeSink};

use core::default::Default;
use core::option;
use collections::string::String;

use string_cache::{Atom, QualName};

/// Convenience function to turn a single `String` into an iterator.
pub fn one_input(x: String) -> option::IntoIter<String> {
/// Convenience function to turn a single value into an iterator.
pub fn one_input<T>(x: T) -> option::IntoIter<T> {
Some(x).into_iter()
}

Expand All @@ -33,14 +33,11 @@ pub fn one_input(x: String) -> option::IntoIter<String> {
/// let mut sink = MySink;
/// tokenize_to(&mut sink, one_input(my_str), Default::default());
/// ```
pub fn tokenize_to<
Sink: TokenSink,
It: Iterator<Item=String>
>(
sink: Sink,
input: It,
opts: TokenizerOpts) -> Sink {

pub fn tokenize_to<Sink, T, It>(sink: Sink, input: It, opts: TokenizerOpts) -> Sink
where Sink: TokenSink,
T: IntoTendril,
It: Iterator<Item=T>,
{
let mut tok = Tokenizer::new(sink, opts);
for s in input {
tok.feed(s);
Expand All @@ -67,14 +64,11 @@ pub struct ParseOpts {
/// let mut sink = MySink;
/// parse_to(&mut sink, one_input(my_str), Default::default());
/// ```
pub fn parse_to<
Sink: TreeSink,
It: Iterator<Item=String>
>(
sink: Sink,
input: It,
opts: ParseOpts) -> Sink {

pub fn parse_to<Sink, T, It>(sink: Sink, input: It, opts: ParseOpts) -> Sink
where Sink: TreeSink,
T: IntoTendril,
It: Iterator<Item=T>,
{
let tb = TreeBuilder::new(sink, opts.tree_builder);
let mut tok = Tokenizer::new(tb, opts.tokenizer);
for s in input {
Expand All @@ -92,15 +86,14 @@ pub fn parse_to<
/// let mut sink = MySink;
/// parse_fragment_to(&mut sink, one_input(my_str), context_token, Default::default());
/// ```
pub fn parse_fragment_to<
Sink: TreeSink,
It: Iterator<Item=String>
>(
sink: Sink,
input: It,
context: Atom,
opts: ParseOpts) -> Sink {

pub fn parse_fragment_to<Sink, T, It>(sink: Sink,
input: It,
context: Atom,
opts: ParseOpts) -> Sink
where Sink: TreeSink,
T: IntoTendril,
It: Iterator<Item=T>
{
let mut sink = sink;
let context_elem = sink.create_element(QualName::new(ns!(HTML), context), vec!());
let tb = TreeBuilder::new_for_fragment(sink, context_elem, None, opts.tree_builder);
Expand Down Expand Up @@ -132,9 +125,10 @@ pub trait ParseResult {
/// ```ignore
/// let dom: RcDom = parse(one_input(my_str), Default::default());
/// ```
pub fn parse<Output, It>(input: It, opts: ParseOpts) -> Output
pub fn parse<Output, T, It>(input: It, opts: ParseOpts) -> Output
where Output: ParseResult,
It: Iterator<Item=String>,
T: IntoTendril,
It: Iterator<Item=T>,
{
let sink = parse_to(Default::default(), input, opts);
ParseResult::get_result(sink)
Expand All @@ -147,9 +141,10 @@ pub fn parse<Output, It>(input: It, opts: ParseOpts) -> Output
/// ```ignore
/// let dom: RcDom = parse_fragment(one_input(my_str), context_token, Default::default());
/// ```
pub fn parse_fragment<Output, It>(input: It, context: Atom, opts: ParseOpts) -> Output
pub fn parse_fragment<Output, T, It>(input: It, context: Atom, opts: ParseOpts) -> Output
where Output: ParseResult,
It: Iterator<Item=String>,
T: IntoTendril,
It: Iterator<Item=T>,
{
let sink = parse_fragment_to(Default::default(), input, context, opts);
ParseResult::get_result(sink)
Expand Down
8 changes: 8 additions & 0 deletions src/for_c/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ use libc::{size_t, c_int, c_char, strlen};

use string_cache::Atom;

use util::tendril::Tendril;

#[repr(C)]
pub struct h5e_buf {
data: *const u8,
Expand Down Expand Up @@ -82,6 +84,12 @@ impl AsLifetimeBuf for String {
}
}

impl AsLifetimeBuf for Tendril {
fn as_lifetime_buf<'a>(&'a self) -> LifetimeBuf<'a> {
LifetimeBuf::from_str(self.as_slice())
}
}

impl AsLifetimeBuf for Atom {
fn as_lifetime_buf<'a>(&'a self) -> LifetimeBuf<'a> {
LifetimeBuf::from_str(self.as_slice())
Expand Down
3 changes: 2 additions & 1 deletion src/for_c/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

use core::prelude::*;

use util::tendril::Tendril;
use for_c::common::{LifetimeBuf, AsLifetimeBuf, h5e_buf, c_bool};

use tokenizer::{TokenSink, Token, Doctype, Tag, ParseError, DoctypeToken};
Expand Down Expand Up @@ -64,7 +65,7 @@ impl TokenSink for *mut h5e_token_sink {
($name:ident) => (call!($name,)); // bleh
}

fn opt_str_to_buf<'a>(s: &'a Option<String>) -> LifetimeBuf<'a> {
fn opt_str_to_buf<'a>(s: &'a Option<Tendril>) -> LifetimeBuf<'a> {
match *s {
None => LifetimeBuf::null(),
Some(ref s) => s.as_lifetime_buf(),
Expand Down
6 changes: 5 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#![crate_name="html5ever"]
#![crate_type="dylib"]

#![feature(plugin, box_syntax, no_std, core, collections, alloc)]
#![feature(plugin, box_syntax, no_std, core, collections, alloc, str_char)]
#![deny(warnings)]
#![allow(unused_parens)]

Expand Down Expand Up @@ -49,6 +49,9 @@ extern crate phf;

extern crate time;

extern crate iobuf;

pub use util::tendril::{Tendril, TendrilReader, TendrilReaderError, IntoTendril};
pub use tokenizer::Attribute;
pub use driver::{one_input, ParseOpts, parse_to, parse_fragment_to, parse, parse_fragment};

Expand All @@ -61,6 +64,7 @@ mod macros;
#[macro_use]
mod util {
pub mod str;
pub mod tendril;
#[macro_use] pub mod smallcharset;
}

Expand Down
8 changes: 4 additions & 4 deletions src/sink/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use util::tendril::Tendril;
use tokenizer::Attribute;

use collections::vec::Vec;
use collections::string::String;
use string_cache::QualName;

pub use self::NodeEnum::{Document, Doctype, Text, Comment, Element};
Expand All @@ -22,13 +22,13 @@ pub enum NodeEnum {
Document,

/// A `DOCTYPE` with name, public id, and system id.
Doctype(String, String, String),
Doctype(Tendril, Tendril, Tendril),

/// A text node.
Text(String),
Text(Tendril),

/// A comment.
Comment(String),
Comment(Tendril),

/// An element with attributes.
Element(QualName, Vec<Attribute>),
Expand Down
6 changes: 3 additions & 3 deletions src/sink/owned_dom.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ use core::prelude::*;

use sink::common::{NodeEnum, Document, Doctype, Text, Comment, Element};

use util::tendril::Tendril;
use tokenizer::Attribute;
use tree_builder::{TreeSink, QuirksMode, NodeOrText, AppendNode, AppendText};
use tree_builder;
Expand All @@ -38,7 +39,6 @@ use core::mem;
use core::ptr;
use alloc::boxed::Box;
use collections::vec::Vec;
use collections::string::String;
use std::borrow::Cow;
use std::io::{self, Write};
use std::collections::HashSet;
Expand Down Expand Up @@ -215,7 +215,7 @@ impl TreeSink for Sink {
self.new_node(Element(name, attrs))
}

fn create_comment(&mut self, text: String) -> Handle {
fn create_comment(&mut self, text: Tendril) -> Handle {
self.new_node(Comment(text))
}

Expand Down Expand Up @@ -269,7 +269,7 @@ impl TreeSink for Sink {
Ok(())
}

fn append_doctype_to_document(&mut self, name: String, public_id: String, system_id: String) {
fn append_doctype_to_document(&mut self, name: Tendril, public_id: Tendril, system_id: Tendril) {
append(self.document, self.new_node(Doctype(name, public_id, system_id)));
}

Expand Down
Loading