From 8961d44dbbbcdc9323837dae811ba4ad7829e213 Mon Sep 17 00:00:00 2001 From: Andrew Shu Date: Wed, 4 Apr 2018 19:33:56 -0700 Subject: [PATCH 1/3] encoding: feature `query_encoding_2` to use `encoding_rs` crate --- Cargo.toml | 4 +- src/encoding.rs | 86 +++++++++++++++++++++++++++++++++++++++--- src/form_urlencoded.rs | 42 ++++++++++++++++++++- src/lib.rs | 13 +++++++ 4 files changed, 136 insertions(+), 9 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index f39f35e0e..26c6fd70b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,11 +36,13 @@ rustc-serialize = "0.3" serde_json = ">=0.6.1, <0.9" [features] +query_encoding_2 = ["encoding_rs"] query_encoding = ["encoding"] heap_size = ["heapsize"] [dependencies] encoding = {version = "0.2", optional = true} +encoding_rs = {version = "0.7", optional = true} heapsize = {version = ">=0.4.1, <0.5", optional = true} idna = { version = "0.1.0", path = "./idna" } matches = "0.1" @@ -49,4 +51,4 @@ rustc-serialize = {version = "0.3", optional = true} serde = {version = ">=0.6.1, <0.9", optional = true} [package.metadata.docs.rs] -features = ["query_encoding"] +features = ["query_encoding_2", "query_encoding"] diff --git a/src/encoding.rs b/src/encoding.rs index 920b30e11..d7972dc7b 100644 --- a/src/encoding.rs +++ b/src/encoding.rs @@ -7,18 +7,92 @@ // except according to those terms. -//! Abstraction that conditionally compiles either to rust-encoding, -//! or to only support UTF-8. +//! Abstraction that conditionally compiles either to encoding_rs, +//! or rust-encoding (legacy), or to only support UTF-8. 
+#[cfg(feature = "query_encoding_2")] extern crate encoding_rs; #[cfg(feature = "query_encoding")] extern crate encoding; use std::borrow::Cow; -#[cfg(feature = "query_encoding")] use std::fmt::{self, Debug, Formatter}; +#[cfg(any(feature = "query_encoding", feature = "query_encoding_2"))] use std::fmt::{self, Debug, Formatter}; + +#[cfg(feature = "query_encoding_2")] pub use self::encoding_rs::Encoding; #[cfg(feature = "query_encoding")] use self::encoding::types::{DecoderTrap, EncoderTrap}; #[cfg(feature = "query_encoding")] use self::encoding::label::encoding_from_whatwg_label; #[cfg(feature = "query_encoding")] pub use self::encoding::types::EncodingRef; + + +#[cfg(feature = "query_encoding_2")] +#[derive(Copy, Clone)] +pub struct EncodingOverride { + /// `None` means UTF-8. + encoding: Option<&'static Encoding> +} + +#[cfg(feature = "query_encoding_2")] +impl EncodingOverride { + pub fn from_opt_encoding(encoding: Option<&'static Encoding>) -> Self { + encoding.map(Self::from_encoding).unwrap_or_else(Self::utf8) + } + + pub fn from_encoding(encoding: &'static Encoding) -> Self { + EncodingOverride { + encoding: if encoding.name() == "UTF-8" { None } else { Some(encoding) } + } + } + + #[inline] + pub fn utf8() -> Self { + EncodingOverride { encoding: None } + } + + pub fn lookup(label: &[u8]) -> Option<Self> { + // Don't use String::from_utf8_lossy since no encoding label contains U+FFFD + // https://encoding.spec.whatwg.org/#names-and-labels + Encoding::for_label(label) + .map(Self::from_encoding) + } + + /// https://encoding.spec.whatwg.org/#get-an-output-encoding + pub fn to_output_encoding(self) -> Self { + if let Some(encoding) = self.encoding { + if matches!(encoding.name(), "UTF-16LE" | "UTF-16BE") { + return Self::utf8() + } + } + self + } + + pub fn is_utf8(&self) -> bool { + self.encoding.is_none() + } + + pub fn name(&self) -> &'static str { + match self.encoding { + Some(encoding) => encoding.name(), + None => "UTF-8", + } + } + + pub fn 
decode<'a>(&self, input: Cow<'a, [u8]>) -> Cow<'a, str> { + match self.encoding { + // encoding_rs returns a short-lived Cow, so create an owned Cow + Some(encoding) => Cow::from(encoding.decode(&input).0.into_owned()), + None => decode_utf8_lossy(input.into()), + } + } + + pub fn encode<'a>(&self, input: Cow<'a, str>) -> Cow<'a, [u8]> { + match self.encoding { + // encoding_rs returns a short-lived Cow, so create an owned Cow + Some(encoding) => Cow::from(encoding.encode(&input).0.into_owned()), + None => encode_utf8(input) + } + } +} + #[cfg(feature = "query_encoding")] #[derive(Copy, Clone)] pub struct EncodingOverride { @@ -90,7 +164,7 @@ impl EncodingOverride { } } -#[cfg(feature = "query_encoding")] +#[cfg(any(feature = "query_encoding", feature = "query_encoding_2"))] impl Debug for EncodingOverride { fn fmt(&self, f: &mut Formatter) -> fmt::Result { write!(f, "EncodingOverride {{ encoding: ")?; @@ -101,11 +175,11 @@ impl Debug for EncodingOverride { } } -#[cfg(not(feature = "query_encoding"))] +#[cfg(all(not(feature = "query_encoding"), not(feature = "query_encoding_2")))] #[derive(Copy, Clone, Debug)] pub struct EncodingOverride; -#[cfg(not(feature = "query_encoding"))] +#[cfg(all(not(feature = "query_encoding"), not(feature = "query_encoding_2")))] impl EncodingOverride { #[inline] pub fn utf8() -> Self { diff --git a/src/form_urlencoded.rs b/src/form_urlencoded.rs index 7ba8b4a30..8744fb580 100644 --- a/src/form_urlencoded.rs +++ b/src/form_urlencoded.rs @@ -36,6 +36,29 @@ pub fn parse(input: &[u8]) -> Parse { } +/// Convert a byte string in the `application/x-www-form-urlencoded` syntax +/// into an iterator of (name, value) pairs. +/// +/// Use `parse(input.as_bytes())` to parse a `&str` string. +/// +/// This function is only available if the `query_encoding_2` +/// [feature](http://doc.crates.io/manifest.html#the-features-section) is enabled. 
+/// +/// Arguments: +/// +/// * `encoding_override`: The character encoding each name and value is decoded as +/// after percent-decoding. Defaults to UTF-8. +/// `Encoding` is defined in [encoding_rs](https://github.com/hsivonen/encoding_rs). +/// * `use_charset`: The *use _charset_ flag*. If in doubt, set to `false`. +#[cfg(feature = "query_encoding_2")] +pub fn parse_with_encoding<'a>(input: &'a [u8], + encoding_override: Option<&'static ::encoding::Encoding>, + use_charset: bool) + -> Result<Parse<'a>, ()> { + let encoding = EncodingOverride::from_opt_encoding(encoding_override); + do_parse_with_encoding(input, encoding, use_charset) +} + /// Convert a byte string in the `application/x-www-form-urlencoded` syntax /// into a iterator of (name, value) pairs. /// @@ -55,9 +78,17 @@ pub fn parse_with_encoding<'a>(input: &'a [u8], encoding_override: Option<::encoding::EncodingRef>, use_charset: bool) -> Result<Parse<'a>, ()> { + let encoding = EncodingOverride::from_opt_encoding(encoding_override); + do_parse_with_encoding(input, encoding, use_charset) +} + +#[cfg(any(feature = "query_encoding", feature = "query_encoding_2"))] +fn do_parse_with_encoding<'a>(input: &'a [u8], + mut encoding: EncodingOverride, + use_charset: bool) + -> Result<Parse<'a>, ()> { use std::ascii::AsciiExt; - let mut encoding = EncodingOverride::from_opt_encoding(encoding_override); if !(encoding.is_utf8() || input.is_ascii()) { return Err(()) } @@ -294,6 +325,13 @@ impl Serializer { self } + /// Set the character encoding to be used for names and values before percent-encoding. + #[cfg(feature = "query_encoding_2")] + pub fn encoding_override(&mut self, new: Option<&'static ::encoding::Encoding>) -> &mut Self { + self.encoding = EncodingOverride::from_opt_encoding(new).to_output_encoding(); + self + } + /// Set the character encoding to be used for names and values before percent-encoding. 
#[cfg(feature = "query_encoding")] pub fn encoding_override(&mut self, new: Option<::encoding::EncodingRef>) -> &mut Self { @@ -343,7 +381,7 @@ impl Serializer { /// (See the `encoding_override()` method.) /// /// Panics if called after `.finish()`. - #[cfg(feature = "query_encoding")] + #[cfg(any(feature = "query_encoding", feature = "query_encoding_2"))] pub fn append_charset(&mut self) -> &mut Self { assert!(self.custom_encoding.is_none(), "Cannot use both custom_encoding_override() and append_charset()"); diff --git a/src/lib.rs b/src/lib.rs index f24285fe1..63e9c7bf1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -196,6 +196,19 @@ impl<'a> ParseOptions<'a> { self } + /// Override the character encoding of query strings. + /// This is a legacy concept only relevant for HTML. + /// + /// `Encoding` is defined in [encoding_rs](https://github.com/hsivonen/encoding_rs). + /// + /// This method is only available if the `query_encoding_2` + /// [feature](http://doc.crates.io/manifest.html#the-features-section) is enabled. + #[cfg(feature = "query_encoding_2")] + pub fn encoding_override(mut self, new: Option<&'static encoding::Encoding>) -> Self { + self.encoding_override = EncodingOverride::from_opt_encoding(new).to_output_encoding(); + self + } + /// Override the character encoding of query strings. /// This is a legacy concept only relevant for HTML. /// From dd69a7050efb8eb79c8c3733a7b6ee2fa56b1e0f Mon Sep 17 00:00:00 2001 From: Andrew Shu Date: Wed, 4 Apr 2018 19:39:45 -0700 Subject: [PATCH 2/3] travis: separate query_encoding_2 and query_encoding features --- .travis.yml | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index cbd876905..a8d54c15d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,16 +9,24 @@ jobs: # getopts is only used in tests. 
Its versions 0.2.16+ don’t build on 1.17.0 - cargo update -p getopts --precise 0.2.15 # data-url uses pub(crate) which is unstable in 1.17 - script: cargo test --all-features -p url -p idna -p percent-encoding -p url_serde + script: + - cargo test --features "heap_size query_encoding_2" -p url -p idna -p percent-encoding -p url_serde + - cargo test --features "heap_size query_encoding" -p url -p idna -p percent-encoding -p url_serde - rust: stable - script: cargo test --all-features --all + script: + - cargo test --features "heap_size query_encoding_2" --all + - cargo test --features "heap_size query_encoding" --all - rust: beta - script: cargo test --all-features --all + script: + - cargo test --features "heap_size query_encoding_2" --all + - cargo test --features "heap_size query_encoding" --all - rust: nightly - script: cargo test --all-features --all + script: + - cargo test --features "heap_size query_encoding_2" --all + - cargo test --features "heap_size query_encoding" --all - rust: nightly env: TARGET=WASM32 # For job list UI From 797cb36d074f186b8fab920b96a4d9400bb37de9 Mon Sep 17 00:00:00 2001 From: Andrew Shu Date: Thu, 5 Apr 2018 01:40:07 -0700 Subject: [PATCH 3/3] encoding: reuse buffers when possible Applies when using `encoding_rs` via the `query_encoding_2` feature. 
Code originally by @hsivonen in https://github.com/servo/rust-url/pull/262 --- src/encoding.rs | 52 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/src/encoding.rs b/src/encoding.rs index d7972dc7b..5941370a9 100644 --- a/src/encoding.rs +++ b/src/encoding.rs @@ -78,17 +78,57 @@ impl EncodingOverride { pub fn decode<'a>(&self, input: Cow<'a, [u8]>) -> Cow<'a, str> { match self.encoding { - // encoding_rs returns a short-lived Cow, so create an owned Cow - Some(encoding) => Cow::from(encoding.decode(&input).0.into_owned()), - None => decode_utf8_lossy(input.into()), + Some(encoding) => { + match input { + Cow::Borrowed(b) => { + let (cow, _) = encoding.decode_without_bom_handling(b); + cow + }, + Cow::Owned(v) => { + { + let (cow, _) = encoding.decode_without_bom_handling(&v[..]); + match cow { + Cow::Owned(s) => { + // Free old heap buffer and return a new one. + return Cow::Owned(s); + } + Cow::Borrowed(_) => {} + } + } + // Reuse the old heap buffer. + Cow::Owned(unsafe { String::from_utf8_unchecked(v) }) + }, + } + }, + None => decode_utf8_lossy(input), } } pub fn encode<'a>(&self, input: Cow<'a, str>) -> Cow<'a, [u8]> { match self.encoding { - // encoding_rs returns a short-lived Cow, so create an owned Cow - Some(encoding) => Cow::from(encoding.encode(&input).0.into_owned()), - None => encode_utf8(input) + Some(encoding) => { + match input { + Cow::Borrowed(s) => { + let (cow, _, _) = encoding.encode(s); + cow + }, + Cow::Owned(s) => { + { + let (cow, _, _) = encoding.encode(&s[..]); + match cow { + Cow::Owned(v) => { + // Free old heap buffer and return a new one. + return Cow::Owned(v); + }, + Cow::Borrowed(_) => {}, + } + } + // Reuse the old heap buffer. + Cow::Owned(s.into_bytes()) + }, + } + }, + None => encode_utf8(input), } } }