
Commit e77cf98

Merge pull request #19 from SimonSapin/tendrilsink
Upgrade html5ever to a TendrilSink based API.
2 parents 13a914d + 04f846d commit e77cf98

File tree: 6 files changed (+96, -102 lines)

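The practical effect of this commit for downstream users is that the `Html` builder type is replaced by a TendrilSink-based parser returned from `kuchiki::parse_html()`. A minimal before/after sketch, using only calls that appear in the diffs below (the HTML string is illustrative):

    extern crate kuchiki;

    use kuchiki::traits::*; // brings html5ever's TendrilSink and the other helper traits into scope

    fn main() {
        let html = "<title>Hello</title><p class=foo>World</p>";

        // Before this commit: let document = kuchiki::Html::from_string(html).parse();
        let document = kuchiki::parse_html().one(html);

        // Before: kuchiki::Html::from_file(path).unwrap().parse()
        // After:  kuchiki::parse_html().from_utf8().from_file(path).unwrap()

        println!("{}", document.to_string());
    }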

Cargo.toml

Lines changed: 3 additions & 7 deletions
@@ -1,6 +1,6 @@
 [package]
 name = "kuchiki"
-version = "0.2.1"
+version = "0.3.0"
 authors = ["Simon Sapin <[email protected]>"]
 license = "MIT"
 description = "(朽木) HTML/XML tree manipulation library"
@@ -13,22 +13,18 @@ doctest = false
 [features]
 unstable = [
     "string_cache/unstable",
-    "tendril/unstable",
     "rc/unstable",
     "html5ever/unstable",
     "selectors/unstable",
 ]
-with-hyper = ["hyper"]
 
 [dependencies]
 matches = "0.1.2"
-html5ever = "0.2.11"
+html5ever = "0.4"
+hyper = {version = "0.7", optional = true}
 string_cache = "0.2"
-tendril = "0.1.1"
 selectors = "0.2.0"
 rc = "0.1.0"
-hyper = {version = "0.7", optional = true}
 
 [dev-dependencies]
 tempdir = "0.3"
-hyper = "0.7"

examples/find_matches.rs

Lines changed: 3 additions & 1 deletion
@@ -1,5 +1,7 @@
 extern crate kuchiki;
 
+use kuchiki::traits::*;
+
 fn main() {
     let html = r"
         <DOCTYPE html>
@@ -14,7 +16,7 @@ fn main() {
     ";
     let css_selector = ".foo";
 
-    let document = kuchiki::Html::from_string(html).parse();
+    let document = kuchiki::parse_html().one(html);
 
     for css_match in document.select(css_selector).unwrap() {
         // css_match is a NodeDataRef, but most of the interesting methods are
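
For context, `css_match` here is a `NodeDataRef<ElementData>`. A short usage sketch with the updated API, reusing `as_node()` and `text_contents()` as they also appear in the test diff further down (the document string is illustrative):

    extern crate kuchiki;

    use kuchiki::traits::*;

    fn main() {
        let document = kuchiki::parse_html().one("<p class=foo>Hello, world!</p>");

        for css_match in document.select(".foo").unwrap() {
            // as_node() returns the underlying NodeRef; text_contents() collects its text.
            let node = css_match.as_node();
            println!("{}", css_match.text_contents()); // Hello, world!
            println!("{}", node.to_string());          // the serialized element
        }
    }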

examples/hyper.rs

Lines changed: 12 additions & 13 deletions
@@ -1,23 +1,22 @@
-extern crate hyper;
 extern crate kuchiki;
 
-use hyper::Client;
-
-use kuchiki::Html;
+#[cfg(not(feature = "hyper"))]
+fn main() {
+    // Intentionally trigger an unused_import warning,
+    // with a message on the same line that will be visible in compiler output:
+    use kuchiki::traits::*;  // This file requires the `hyper` feature to be enabled
+}
 
+#[cfg(feature = "hyper")]
 fn main() {
-    // Create a client.
-    let client = Client::new();
+    use kuchiki::traits::*;
+
     let url = "https://www.mozilla.org/en-US/";
     println!("{} - {} ", "Calling site ", url);
 
-    // Get response
-    let mut response = client.get(url).send().unwrap();
-
-    // Parse the html page
-    if let Ok(html) = Html::from_stream(&mut response) {
+    // Fetch and parse the html page
+    if let Ok(doc) = kuchiki::parse_html().from_http(url) {
         println!("{}", "Finding Easter egg");
-        let doc = html.parse();
 
         // Manually navigate to hidden comment
         let x = doc.children().nth(1).unwrap()
@@ -29,6 +28,6 @@ fn main() {
 
         println!("{}", *comment);
     } else {
-        println!("{}", "The page couldn't be parsed");
+        println!("{}", "The page couldn't be fetched");
     }
 }

src/lib.rs

Lines changed: 15 additions & 4 deletions
@@ -8,13 +8,12 @@ Kuchiki (朽木), a HTML/XML tree manipulation library for Rust.
 #![deny(missing_docs)]
 
 extern crate html5ever;
-#[cfg(feature = "with-hyper")] extern crate hyper;
+#[cfg(feature = "hyper")] extern crate hyper;
 #[macro_use] extern crate matches;
 extern crate selectors;
 extern crate rc;
 #[macro_use] extern crate string_cache;
 #[cfg(test)] extern crate tempdir;
-extern crate tendril;
 
 mod attributes;
 pub mod iter;
@@ -27,8 +26,20 @@ mod serializer;
 mod tree;
 
 pub use attributes::Attributes;
-pub use iter::{NodeIterator, ElementIterator};
 pub use node_data_ref::NodeDataRef;
-pub use parser::{Html, ParseOpts};
+pub use parser::{parse_html, ParseOpts};
 pub use select::Selectors;
 pub use tree::{NodeRef, Node, NodeData, ElementData, Doctype, DocumentData};
+
+/// This module re-exports a number of traits that are useful when using Kuchiki.
+/// It can be used with:
+///
+/// ```rust
+/// use kuchiki::traits::*;
+/// ```
+pub mod traits {
+    pub use html5ever::tendril::TendrilSink;
+    pub use iter::{NodeIterator, ElementIterator};
+    pub use parser::ParserExt;
+}
+
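
For reference, the glob import that this new `traits` module is designed for expands to roughly the following; the comments note which methods each re-export enables, as a sketch based on the code elsewhere in this commit:

    #![allow(unused_imports)]
    extern crate kuchiki;

    use kuchiki::traits::TendrilSink;                     // enables .one(...), .from_file(...), .read_from(...) on the parser
    use kuchiki::traits::{NodeIterator, ElementIterator}; // iterator adapters, previously re-exported at the crate root
    // kuchiki::traits::ParserExt adds .from_http(...), behind the `hyper` feature.

    fn main() {
        // With TendrilSink in scope, the value returned by parse_html() gains .one():
        let document = kuchiki::parse_html().one("<p>traits demo</p>");
        println!("{}", document.to_string());
    }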

src/parser.rs

Lines changed: 54 additions & 68 deletions
@@ -1,76 +1,12 @@
 use std::borrow::Cow;
-use std::fs::File;
-use std::io::{Error, ErrorKind, Read};
-use std::option;
-use std::path::Path;
 use html5ever::{self, Attribute};
+use html5ever::tendril::StrTendril;
 use html5ever::tree_builder::{TreeSink, NodeOrText, QuirksMode};
-#[cfg(feature = "with-hyper")] use hyper::client::IntoUrl;
+#[cfg(feature = "hyper")] use hyper::client::IntoUrl;
 use string_cache::QualName;
-use tendril::{StrTendril, ReadExt, Tendril};
 
 use tree::NodeRef;
 
-/// The HTML parser.
-pub struct Html {
-    opts: ParseOpts,
-    data: option::IntoIter<StrTendril>,
-}
-
-impl Html {
-    /// Parse from a single string in memory.
-    #[inline]
-    pub fn from_string<S: Into<StrTendril>>(string: S) -> Html {
-        Html {
-            opts: ParseOpts::default(),
-            data: Some(string.into()).into_iter(),
-        }
-    }
-
-    /// Parse from reading a file.
-    #[inline]
-    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Html, Error> {
-        Html::from_stream(&mut try!(File::open(&path)))
-    }
-
-    /// Fetch an HTTP or HTTPS URL with Hyper and parse.
-    #[cfg(feature = "with-hyper")]
-    pub fn from_http<U: IntoUrl>(url: U) -> Result<Html, ::hyper::Error> {
-        let mut response = try!(::hyper::Client::new().get(url).send());
-        Ok(try!(Html::from_stream(&mut response)))
-    }
-
-    /// Parse from reading a stream of bytes.
-    #[inline]
-    pub fn from_stream<S: Read>(stream: &mut S) -> Result<Html, Error> {
-        let mut buf = Tendril::new();
-        try!(stream.read_to_tendril(&mut buf));
-        Ok(Html {
-            opts: ParseOpts::default(),
-            // FIXME: Make UTF-8 decoding lossy, but try to minimize copying.
-            data: Some(try!(buf.try_reinterpret().map_err(|_| {
-                Error::new(ErrorKind::Other, "Invalid UTF-8.")
-            }))).into_iter(),
-        })
-    }
-
-    /// Run the parser and return a reference to the document node, the root of the tree.
-    #[inline]
-    pub fn parse(self) -> NodeRef {
-        let parser = Parser {
-            document_node: NodeRef::new_document(),
-            on_parse_error: self.opts.on_parse_error,
-        };
-        let html5opts = html5ever::ParseOpts {
-            tokenizer: self.opts.tokenizer,
-            tree_builder: self.opts.tree_builder,
-        };
-        let parser = html5ever::parse_to(parser, self.data, html5opts);
-        parser.document_node
-    }
-
-}
-
 /// Options for the HTML parser.
 #[derive(Default)]
 pub struct ParseOpts {
@@ -84,14 +20,64 @@ pub struct ParseOpts {
     pub on_parse_error: Option<Box<FnMut(Cow<'static, str>)>>,
 }
 
+/// Parse an HTML document with html5ever and the default configuration.
+pub fn parse_html() -> html5ever::Parser<Sink> {
+    parse_html_with_options(ParseOpts::default())
+}
+
+/// Parse an HTML document with html5ever.
+pub fn parse_html_with_options(opts: ParseOpts) -> html5ever::Parser<Sink> {
+    let sink = Sink {
+        document_node: NodeRef::new_document(),
+        on_parse_error: opts.on_parse_error,
+    };
+    let html5opts = html5ever::ParseOpts {
+        tokenizer: opts.tokenizer,
+        tree_builder: opts.tree_builder,
+    };
+    html5ever::parse_document(sink, html5opts)
+}
 
-struct Parser {
+/// Additional methods for html5ever::Parser
+pub trait ParserExt {
+    /// Fetch an HTTP or HTTPS URL with Hyper and parse,
+    /// giving the `charset` parameter of a `Content-Type` response header, if any,
+    /// as a character encoding hint to html5ever.
+    #[cfg(feature = "hyper")]
+    fn from_http<U: IntoUrl>(self, url: U) -> Result<NodeRef, ::hyper::Error>;
+}
+
+impl ParserExt for html5ever::Parser<Sink> {
+    #[cfg(feature = "hyper")]
+    fn from_http<U: IntoUrl>(self, url: U) -> Result<NodeRef, ::hyper::Error> {
+        use html5ever::encoding::label::encoding_from_whatwg_label;
+        use html5ever::tendril::TendrilSink;
+        use hyper::Client;
+        use hyper::header::ContentType;
+        use hyper::mime::Attr::Charset;
+        use html5ever::driver::BytesOpts;
+
+        let mut response = try!(Client::new().get(url).send());
+        let opts = BytesOpts {
+            transport_layer_encoding: response.headers.get::<ContentType>()
+                .and_then(|content_type| content_type.get_param(Charset))
+                .and_then(|charset| encoding_from_whatwg_label(charset))
+        };
+        Ok(try!(self.from_bytes(opts).read_from(&mut response)))
+    }
+}
+
+
+pub struct Sink {
     document_node: NodeRef,
     on_parse_error: Option<Box<FnMut(Cow<'static, str>)>>,
 }
 
+impl TreeSink for Sink {
+    type Output = NodeRef;
+
+    fn finish(self) -> NodeRef { self.document_node }
 
-impl TreeSink for Parser {
     type Handle = NodeRef;
 
     #[inline]
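
The removed `Html::from_stream` (with its FIXME about lossy UTF-8 decoding) maps onto the new API roughly as `parse_html().from_utf8().read_from(...)`: `from_utf8()` puts a lossy UTF-8 decoder in front of the sink and `read_from` drains any `std::io::Read`. A sketch under those assumptions; the in-memory reader only stands in for a real byte source:

    extern crate kuchiki;

    use std::io::Cursor;
    use kuchiki::traits::*;

    fn main() {
        // Any std::io::Read works here; Cursor stands in for a file or an HTTP response body.
        let mut reader = Cursor::new(b"<title>From a byte stream</title>".to_vec());

        let document = kuchiki::parse_html()
            .from_utf8()               // decode the byte stream as UTF-8 before parsing
            .read_from(&mut reader)
            .unwrap();

        println!("{}", document.to_string());
    }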

src/tests.rs

Lines changed: 9 additions & 9 deletions
@@ -3,16 +3,16 @@ use std::path::Path;
 
 use tempdir::TempDir;
 
-use Html;
-use iter::NodeIterator;
+use parser::parse_html;
+use traits::*;
 
 #[test]
 fn text_nodes() {
     let html = r"
         <!doctype html>
         <title>Test case</title>
         <p>Content contains <b>Important</b> data</p>";
-    let document = Html::from_string(html).parse();
+    let document = parse_html().one(html);
     let paragraph = document.select("p").unwrap().collect::<Vec<_>>();
     assert_eq!(paragraph.len(), 1);
     assert_eq!(paragraph[0].text_contents(), "Content contains Important data");
@@ -35,7 +35,7 @@ fn parse_and_serialize() {
         <!doctype html>
         <title>Test case</title>
         <p>Content";
-    let document = Html::from_string(html).parse();
+    let document = parse_html().one(html);
     assert_eq!(document.as_document().unwrap().quirks_mode(), QuirksMode::NoQuirks);
     assert_eq!(document.to_string(), r"<!DOCTYPE html>
 <html><head><title>Test case</title>
@@ -57,7 +57,7 @@ fn parse_file() {
 
 
 </body></html>";
-    let document = Html::from_file(&path).unwrap().parse();
+    let document = parse_html().from_utf8().from_file(&path).unwrap();
     assert_eq!(document.to_string(), html);
 }
 
@@ -68,10 +68,10 @@ fn serialize_and_read_file() {
     path.push("temp.html");
 
     let html = r"<!DOCTYPE html><html><head><title>Title</title></head><body>Body</body></html>";
-    let document = Html::from_string(html).parse();
+    let document = parse_html().one(html);
     let _ = document.serialize_to_file(path.clone());
 
-    let document2 = Html::from_file(&path).unwrap().parse();
+    let document2 = parse_html().from_utf8().from_file(&path).unwrap();
     assert_eq!(document.to_string(), document2.to_string());
 }
 
@@ -84,7 +84,7 @@ fn select() {
         <p class=foo>Foo
     ";
 
-    let document = Html::from_string(html).parse();
+    let document = parse_html().one(html);
     let matching = document.select("p.foo").unwrap().collect::<Vec<_>>();
     assert_eq!(matching.len(), 2);
     let child = matching[0].as_node().first_child().unwrap();
@@ -105,6 +105,6 @@ fn to_string() {
     </body>
 </html>";
 
-    let document = Html::from_string(html).parse();
+    let document = parse_html().one(html);
     assert_eq!(document.inclusive_descendants().nth(11).unwrap().to_string(), "<p class=\"foo\">Foo\n \n</p>");
 }
