From c602e9ff7e0c6016e09fbe356040db3158a0f379 Mon Sep 17 00:00:00 2001 From: Andrew Shu Date: Thu, 5 Apr 2018 17:05:59 -0700 Subject: [PATCH 1/5] encoding: convert EncodingOverride to trait --- src/encoding.rs | 146 ----------------------------------- src/encoding/fallback.rs | 53 +++++++++++++ src/encoding/legacy.rs | 94 ++++++++++++++++++++++ src/encoding/mod.rs | 65 ++++++++++++++++ src/encoding/utf8_helpers.rs | 36 +++++++++ src/form_urlencoded.rs | 33 ++++---- src/lib.rs | 13 ++-- src/parser.rs | 10 ++- 8 files changed, 280 insertions(+), 170 deletions(-) delete mode 100644 src/encoding.rs create mode 100644 src/encoding/fallback.rs create mode 100644 src/encoding/legacy.rs create mode 100644 src/encoding/mod.rs create mode 100644 src/encoding/utf8_helpers.rs diff --git a/src/encoding.rs b/src/encoding.rs deleted file mode 100644 index 920b30e11..000000000 --- a/src/encoding.rs +++ /dev/null @@ -1,146 +0,0 @@ -// Copyright 2013-2014 The rust-url developers. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - - -//! Abstraction that conditionally compiles either to rust-encoding, -//! or to only support UTF-8. - -#[cfg(feature = "query_encoding")] extern crate encoding; - -use std::borrow::Cow; -#[cfg(feature = "query_encoding")] use std::fmt::{self, Debug, Formatter}; - -#[cfg(feature = "query_encoding")] use self::encoding::types::{DecoderTrap, EncoderTrap}; -#[cfg(feature = "query_encoding")] use self::encoding::label::encoding_from_whatwg_label; -#[cfg(feature = "query_encoding")] pub use self::encoding::types::EncodingRef; - -#[cfg(feature = "query_encoding")] -#[derive(Copy, Clone)] -pub struct EncodingOverride { - /// `None` means UTF-8. 
- encoding: Option -} - -#[cfg(feature = "query_encoding")] -impl EncodingOverride { - pub fn from_opt_encoding(encoding: Option) -> Self { - encoding.map(Self::from_encoding).unwrap_or_else(Self::utf8) - } - - pub fn from_encoding(encoding: EncodingRef) -> Self { - EncodingOverride { - encoding: if encoding.name() == "utf-8" { None } else { Some(encoding) } - } - } - - #[inline] - pub fn utf8() -> Self { - EncodingOverride { encoding: None } - } - - pub fn lookup(label: &[u8]) -> Option { - // Don't use String::from_utf8_lossy since no encoding label contains U+FFFD - // https://encoding.spec.whatwg.org/#names-and-labels - ::std::str::from_utf8(label) - .ok() - .and_then(encoding_from_whatwg_label) - .map(Self::from_encoding) - } - - /// https://encoding.spec.whatwg.org/#get-an-output-encoding - pub fn to_output_encoding(self) -> Self { - if let Some(encoding) = self.encoding { - if matches!(encoding.name(), "utf-16le" | "utf-16be") { - return Self::utf8() - } - } - self - } - - pub fn is_utf8(&self) -> bool { - self.encoding.is_none() - } - - pub fn name(&self) -> &'static str { - match self.encoding { - Some(encoding) => encoding.name(), - None => "utf-8", - } - } - - pub fn decode<'a>(&self, input: Cow<'a, [u8]>) -> Cow<'a, str> { - match self.encoding { - // `encoding.decode` never returns `Err` when called with `DecoderTrap::Replace` - Some(encoding) => encoding.decode(&input, DecoderTrap::Replace).unwrap().into(), - None => decode_utf8_lossy(input), - } - } - - pub fn encode<'a>(&self, input: Cow<'a, str>) -> Cow<'a, [u8]> { - match self.encoding { - // `encoding.encode` never returns `Err` when called with `EncoderTrap::NcrEscape` - Some(encoding) => Cow::Owned(encoding.encode(&input, EncoderTrap::NcrEscape).unwrap()), - None => encode_utf8(input) - } - } -} - -#[cfg(feature = "query_encoding")] -impl Debug for EncodingOverride { - fn fmt(&self, f: &mut Formatter) -> fmt::Result { - write!(f, "EncodingOverride {{ encoding: ")?; - match self.encoding { - 
Some(e) => write!(f, "{} }}", e.name()), - None => write!(f, "None }}") - } - } -} - -#[cfg(not(feature = "query_encoding"))] -#[derive(Copy, Clone, Debug)] -pub struct EncodingOverride; - -#[cfg(not(feature = "query_encoding"))] -impl EncodingOverride { - #[inline] - pub fn utf8() -> Self { - EncodingOverride - } - - pub fn decode<'a>(&self, input: Cow<'a, [u8]>) -> Cow<'a, str> { - decode_utf8_lossy(input) - } - - pub fn encode<'a>(&self, input: Cow<'a, str>) -> Cow<'a, [u8]> { - encode_utf8(input) - } -} - -pub fn decode_utf8_lossy(input: Cow<[u8]>) -> Cow { - match input { - Cow::Borrowed(bytes) => String::from_utf8_lossy(bytes), - Cow::Owned(bytes) => { - let raw_utf8: *const [u8]; - match String::from_utf8_lossy(&bytes) { - Cow::Borrowed(utf8) => raw_utf8 = utf8.as_bytes(), - Cow::Owned(s) => return s.into(), - } - // from_utf8_lossy returned a borrow of `bytes` unchanged. - debug_assert!(raw_utf8 == &*bytes as *const [u8]); - // Reuse the existing `Vec` allocation. - unsafe { String::from_utf8_unchecked(bytes) }.into() - } - } -} - -pub fn encode_utf8(input: Cow) -> Cow<[u8]> { - match input { - Cow::Borrowed(s) => Cow::Borrowed(s.as_bytes()), - Cow::Owned(s) => Cow::Owned(s.into_bytes()) - } -} diff --git a/src/encoding/fallback.rs b/src/encoding/fallback.rs new file mode 100644 index 000000000..2a98763a1 --- /dev/null +++ b/src/encoding/fallback.rs @@ -0,0 +1,53 @@ +// Copyright 2013-2018 The rust-url developers. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + + +//! Implementation using UTF-8 only. +//! Used when building without any query encoding feature flags. 
+ +use std::borrow::Cow; + +use encoding::EncodingOverride; +use encoding::utf8_helpers::{decode_utf8_lossy, encode_utf8}; + +#[derive(Copy, Clone, Debug)] +pub struct EncodingOverrideFallback; + +impl EncodingOverrideFallback { + #[inline] + pub fn utf8() -> Self { + EncodingOverrideFallback + } +} + +impl EncodingOverride for EncodingOverrideFallback { + fn utf8() -> Self { + Self {} + } + + fn lookup(_label: &[u8]) -> Option { + // always return `None` which means UTF-8 + None + } + + fn is_utf8(&self) -> bool { + true + } + + fn name(&self) -> &'static str { + "utf-8" + } + + fn decode<'a>(&self, input: Cow<'a, [u8]>) -> Cow<'a, str> { + decode_utf8_lossy(input) + } + + fn encode<'a>(&self, input: Cow<'a, str>) -> Cow<'a, [u8]> { + encode_utf8(input) + } +} diff --git a/src/encoding/legacy.rs b/src/encoding/legacy.rs new file mode 100644 index 000000000..10d5cefa9 --- /dev/null +++ b/src/encoding/legacy.rs @@ -0,0 +1,94 @@ +// Copyright 2013-2018 The rust-url developers. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + + +//! Implementation using rust-encoding (legacy). +//! Only built with feature flag `query_encoding`. + +extern crate encoding; + +use encoding::EncodingOverride; +use encoding::utf8_helpers::{decode_utf8_lossy, encode_utf8}; + +use std::borrow::Cow; +use std::fmt::{self, Debug, Formatter}; + +use self::encoding::types::{DecoderTrap, EncoderTrap}; +use self::encoding::label::encoding_from_whatwg_label; +pub use self::encoding::types::EncodingRef; + +#[derive(Copy, Clone)] +pub struct EncodingOverrideLegacy { + /// `None` means UTF-8. 
+ encoding: Option +} + +impl EncodingOverrideLegacy { + pub fn from_opt_encoding(encoding: Option) -> Self { + encoding.map(Self::from_encoding).unwrap_or_else(Self::utf8) + } + + pub fn from_encoding(encoding: EncodingRef) -> Self { + Self { + encoding: if encoding.name() == "utf-8" { None } else { Some(encoding) } + } + } +} + +impl EncodingOverride for EncodingOverrideLegacy { + #[inline] + fn utf8() -> Self { + Self { encoding: None } + } + + fn lookup(label: &[u8]) -> Option { + // Don't use String::from_utf8_lossy since no encoding label contains U+FFFD + // https://encoding.spec.whatwg.org/#names-and-labels + ::std::str::from_utf8(label) + .ok() + .and_then(encoding_from_whatwg_label) + .map(Self::from_encoding) + } + + fn is_utf8(&self) -> bool { + self.encoding.is_none() + } + + fn name(&self) -> &'static str { + match self.encoding { + Some(encoding) => encoding.name(), + None => "utf-8", + } + } + + fn decode<'a>(&self, input: Cow<'a, [u8]>) -> Cow<'a, str> { + match self.encoding { + // `encoding.decode` never returns `Err` when called with `DecoderTrap::Replace` + Some(encoding) => encoding.decode(&input, DecoderTrap::Replace).unwrap().into(), + None => decode_utf8_lossy(input), + } + } + + fn encode<'a>(&self, input: Cow<'a, str>) -> Cow<'a, [u8]> { + match self.encoding { + // `encoding.encode` never returns `Err` when called with `EncoderTrap::NcrEscape` + Some(encoding) => Cow::Owned(encoding.encode(&input, EncoderTrap::NcrEscape).unwrap()), + None => encode_utf8(input) + } + } +} + +impl Debug for EncodingOverrideLegacy { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "EncodingOverride {{ encoding: ")?; + match self.encoding { + Some(e) => write!(f, "{} }}", e.name()), + None => write!(f, "None }}") + } + } +} diff --git a/src/encoding/mod.rs b/src/encoding/mod.rs new file mode 100644 index 000000000..f43290d4a --- /dev/null +++ b/src/encoding/mod.rs @@ -0,0 +1,65 @@ +// Copyright 2013-2018 The rust-url developers. 
+// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + + +//! Abstraction that conditionally compiles either to encoding_rs, +//! or rust-encoding (legacy), or to only support UTF-8. + +mod utf8_helpers; + +use std::borrow::Cow; +use std::fmt::Debug; + +#[cfg(feature = "query_encoding")] mod legacy; +#[cfg(feature = "query_encoding")] pub use self::legacy::{EncodingOverrideLegacy, EncodingRef}; + +#[cfg(not(feature = "query_encoding"))] mod fallback; +#[cfg(not(feature = "query_encoding"))] use self::fallback::EncodingOverrideFallback; + +pub trait EncodingOverride : Debug { + /// Get an Encoding representing UTF-8. + fn utf8() -> Self where Self: Sized; + + /// Look up an Encoding using the WHATWG label, + /// listed at https://encoding.spec.whatwg.org/#names-and-labels + fn lookup(label: &[u8]) -> Option where Self: Sized; + + /// Whether this Encoding represents UTF-8. + fn is_utf8(&self) -> bool; + + /// Get the name of this Encoding, which when ASCII lowercased, may be used as a + /// lookup label. https://encoding.spec.whatwg.org/#names-and-labels + fn name(&self) -> &'static str; + + /// https://encoding.spec.whatwg.org/#get-an-output-encoding + fn to_output_encoding(self) -> Self where Self: Sized { + if !self.is_utf8() { + let lowercased = self.name().to_lowercase(); + if lowercased == "utf-16le" || lowercased == "utf-16be" { + return Self::utf8() + } + } + self + } + + /// Decode the specified bytes in the current encoding, to UTF-8. + fn decode<'a>(&self, input: Cow<'a, [u8]>) -> Cow<'a, str>; + + /// Encode the UTF-8 string to the current encoding. 
+ fn encode<'a>(&self, input: Cow<'a, str>) -> Cow<'a, [u8]>; +} + +#[cfg(feature = "query_encoding")] +pub fn default_encoding_override() -> EncodingOverrideLegacy { + EncodingOverrideLegacy::utf8() +} + +#[cfg(not(feature = "query_encoding"))] +pub fn default_encoding_override() -> EncodingOverrideFallback { + EncodingOverrideFallback::utf8() +} diff --git a/src/encoding/utf8_helpers.rs b/src/encoding/utf8_helpers.rs new file mode 100644 index 000000000..0923a196d --- /dev/null +++ b/src/encoding/utf8_helpers.rs @@ -0,0 +1,36 @@ +// Copyright 2013-2018 The rust-url developers. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + + +//! UTF-8 encode and decode methods. + +use std::borrow::Cow; + +pub fn decode_utf8_lossy(input: Cow<[u8]>) -> Cow { + match input { + Cow::Borrowed(bytes) => String::from_utf8_lossy(bytes), + Cow::Owned(bytes) => { + let raw_utf8: *const [u8]; + match String::from_utf8_lossy(&bytes) { + Cow::Borrowed(utf8) => raw_utf8 = utf8.as_bytes(), + Cow::Owned(s) => return s.into(), + } + // from_utf8_lossy returned a borrow of `bytes` unchanged. + debug_assert!(raw_utf8 == &*bytes as *const [u8]); + // Reuse the existing `Vec` allocation. + unsafe { String::from_utf8_unchecked(bytes) }.into() + } + } +} + +pub fn encode_utf8(input: Cow) -> Cow<[u8]> { + match input { + Cow::Borrowed(s) => Cow::Borrowed(s.as_bytes()), + Cow::Owned(s) => Cow::Owned(s.into_bytes()) + } +} diff --git a/src/form_urlencoded.rs b/src/form_urlencoded.rs index 7ba8b4a30..4631ef592 100644 --- a/src/form_urlencoded.rs +++ b/src/form_urlencoded.rs @@ -14,9 +14,12 @@ //! and a sequence of (name, value) pairs. 
use encoding::EncodingOverride; +#[cfg(feature = "query_encoding")] use encoding::EncodingOverrideLegacy; +use encoding::default_encoding_override; use percent_encoding::{percent_encode_byte, percent_decode}; use std::borrow::{Borrow, Cow}; use std::fmt; +use std::rc::Rc; use std::str; @@ -31,7 +34,7 @@ use std::str; pub fn parse(input: &[u8]) -> Parse { Parse { input: input, - encoding: EncodingOverride::utf8(), + encoding: Rc::new(default_encoding_override()), } } @@ -57,7 +60,7 @@ pub fn parse_with_encoding<'a>(input: &'a [u8], -> Result, ()> { use std::ascii::AsciiExt; - let mut encoding = EncodingOverride::from_opt_encoding(encoding_override); + let mut encoding = EncodingOverrideLegacy::from_opt_encoding(encoding_override); if !(encoding.is_utf8() || input.is_ascii()) { return Err(()) } @@ -77,15 +80,15 @@ pub fn parse_with_encoding<'a>(input: &'a [u8], } Ok(Parse { input: input, - encoding: encoding, + encoding: Rc::new(encoding), }) } /// The return type of `parse()`. -#[derive(Copy, Clone, Debug)] +#[derive(Clone, Debug)] pub struct Parse<'a> { input: &'a [u8], - encoding: EncodingOverride, + encoding: Rc, } impl<'a> Iterator for Parse<'a> { @@ -106,14 +109,14 @@ impl<'a> Iterator for Parse<'a> { let name = split2.next().unwrap(); let value = split2.next().unwrap_or(&[][..]); return Some(( - decode(name, self.encoding), - decode(value, self.encoding), + decode(name, &*self.encoding), + decode(value, &*self.encoding), )) } } } -fn decode(input: &[u8], encoding: EncodingOverride) -> Cow { +fn decode<'i>(input: &'i [u8], encoding: &EncodingOverride) -> Cow<'i, str> { let replaced = replace_plus(input); encoding.decode(match percent_decode(&replaced).if_any() { Some(vec) => Cow::Owned(vec), @@ -216,7 +219,7 @@ impl<'a> Iterator for ByteSerialize<'a> { pub struct Serializer { target: Option, start_position: usize, - encoding: EncodingOverride, + encoding: Rc, custom_encoding: Option Cow<[u8]>>>>, } @@ -281,7 +284,7 @@ impl Serializer { Serializer { target: 
Some(target), start_position: start_position, - encoding: EncodingOverride::utf8(), + encoding: Rc::new(default_encoding_override()), custom_encoding: None, } } @@ -297,7 +300,7 @@ impl Serializer { /// Set the character encoding to be used for names and values before percent-encoding. #[cfg(feature = "query_encoding")] pub fn encoding_override(&mut self, new: Option<::encoding::EncodingRef>) -> &mut Self { - self.encoding = EncodingOverride::from_opt_encoding(new).to_output_encoding(); + self.encoding = Rc::new(EncodingOverrideLegacy::from_opt_encoding(new).to_output_encoding()); self } @@ -313,7 +316,7 @@ impl Serializer { /// /// Panics if called after `.finish()`. pub fn append_pair(&mut self, name: &str, value: &str) -> &mut Self { - append_pair(string(&mut self.target), self.start_position, self.encoding, + append_pair(string(&mut self.target), self.start_position, &*self.encoding, &mut self.custom_encoding, name, value); self } @@ -331,7 +334,7 @@ impl Serializer { let string = string(&mut self.target); for pair in iter { let &(ref k, ref v) = pair.borrow(); - append_pair(string, self.start_position, self.encoding, + append_pair(string, self.start_position, &*self.encoding, &mut self.custom_encoding, k.as_ref(), v.as_ref()); } } @@ -383,7 +386,7 @@ fn string(target: &mut Option) -> &mut String { target.as_mut().expect("url::form_urlencoded::Serializer finished").as_mut_string() } -fn append_pair(string: &mut String, start_position: usize, encoding: EncodingOverride, +fn append_pair(string: &mut String, start_position: usize, encoding: &EncodingOverride, custom_encoding: &mut Option Cow<[u8]>>>>, name: &str, value: &str) { append_separator_if_needed(string, start_position); @@ -392,7 +395,7 @@ fn append_pair(string: &mut String, start_position: usize, encoding: EncodingOve append_encoded(value, string, encoding, custom_encoding); } -fn append_encoded(s: &str, string: &mut String, encoding: EncodingOverride, +fn append_encoded(s: &str, string: &mut String, 
encoding: &EncodingOverride, custom_encoding: &mut Option Cow<[u8]>>>>) { let bytes = if let Some(SilentDebug(ref mut custom)) = *custom_encoding { custom(s) diff --git a/src/lib.rs b/src/lib.rs index f24285fe1..329dcca6a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -115,6 +115,8 @@ pub extern crate idna; pub extern crate percent_encoding; use encoding::EncodingOverride; +use encoding::default_encoding_override; +#[cfg(feature = "query_encoding")] use encoding::EncodingOverrideLegacy; #[cfg(feature = "heapsize")] use heapsize::HeapSizeOf; use host::HostInternal; use parser::{Parser, Context, SchemeType, to_u32, ViolationFn}; @@ -130,6 +132,7 @@ use std::mem; use std::net::{ToSocketAddrs, IpAddr}; use std::ops::{Range, RangeFrom, RangeTo}; use std::path::{Path, PathBuf}; +use std::rc::Rc; use std::str; pub use origin::{Origin, OpaqueOrigin}; @@ -182,10 +185,10 @@ impl HeapSizeOf for Url { } /// Full configuration for the URL parser. -#[derive(Copy, Clone)] +#[derive(Clone)] pub struct ParseOptions<'a> { base_url: Option<&'a Url>, - encoding_override: encoding::EncodingOverride, + encoding_override: Rc, violation_fn: ViolationFn<'a>, } @@ -205,7 +208,7 @@ impl<'a> ParseOptions<'a> { /// [feature](http://doc.crates.io/manifest.html#the-features-section]) is enabled. 
#[cfg(feature = "query_encoding")] pub fn encoding_override(mut self, new: Option) -> Self { - self.encoding_override = EncodingOverride::from_opt_encoding(new).to_output_encoding(); + self.encoding_override = Rc::new(EncodingOverrideLegacy::from_opt_encoding(new).to_output_encoding()); self } @@ -258,7 +261,7 @@ impl<'a> ParseOptions<'a> { Parser { serialization: String::with_capacity(input.len()), base_url: self.base_url, - query_encoding_override: self.encoding_override, + query_encoding_override: self.encoding_override.clone(), violation_fn: self.violation_fn, context: Context::UrlParser, }.parse_url(input) @@ -401,7 +404,7 @@ impl Url { pub fn options<'a>() -> ParseOptions<'a> { ParseOptions { base_url: None, - encoding_override: EncodingOverride::utf8(), + encoding_override: Rc::new(default_encoding_override()), violation_fn: ViolationFn::NoOp, } } diff --git a/src/parser.rs b/src/parser.rs index 92b97afdd..33c3b02ff 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -9,10 +9,12 @@ use std::ascii::AsciiExt; use std::error::Error; use std::fmt::{self, Formatter, Write}; +use std::rc::Rc; use std::str; use Url; use encoding::EncodingOverride; +use encoding::default_encoding_override; use host::{Host, HostInternal}; use percent_encoding::{ utf8_percent_encode, percent_encode, @@ -315,7 +317,7 @@ impl<'a> fmt::Debug for ViolationFn<'a> { pub struct Parser<'a> { pub serialization: String, pub base_url: Option<&'a Url>, - pub query_encoding_override: EncodingOverride, + pub query_encoding_override: Rc, pub violation_fn: ViolationFn<'a>, pub context: Context, } @@ -332,7 +334,7 @@ impl<'a> Parser<'a> { Parser { serialization: serialization, base_url: None, - query_encoding_override: EncodingOverride::utf8(), + query_encoding_override: Rc::new(default_encoding_override()), violation_fn: ViolationFn::NoOp, context: Context::Setter, } @@ -1151,8 +1153,8 @@ impl<'a> Parser<'a> { } let encoding = match &self.serialization[..scheme_end as usize] { - "http" | "https" | 
"file" | "ftp" | "gopher" => self.query_encoding_override, - _ => EncodingOverride::utf8(), + "http" | "https" | "file" | "ftp" | "gopher" => self.query_encoding_override.clone(), + _ => Rc::new(default_encoding_override()), }; let query_bytes = encoding.encode(query.into()); self.serialization.extend(percent_encode(&query_bytes, QUERY_ENCODE_SET)); From 42f9c577df0b38f000dac47b889c185c990c4388 Mon Sep 17 00:00:00 2001 From: Andrew Shu Date: Fri, 6 Apr 2018 10:58:38 -0700 Subject: [PATCH 2/5] encoding: `query_encoding_2` feature, encoding_rs enable feature to use encoding_rs implementation for EncodingOverride includes Cow optimizations by @hsivonen. --- Cargo.toml | 4 +- src/encoding/encoding_rs.rs | 125 ++++++++++++++++++++++++++++++++++++ src/encoding/legacy.rs | 6 +- src/encoding/mod.rs | 19 ++++-- 4 files changed, 147 insertions(+), 7 deletions(-) create mode 100644 src/encoding/encoding_rs.rs diff --git a/Cargo.toml b/Cargo.toml index f39f35e0e..26c6fd70b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,11 +36,13 @@ rustc-serialize = "0.3" serde_json = ">=0.6.1, <0.9" [features] +query_encoding_2 = ["encoding_rs"] query_encoding = ["encoding"] heap_size = ["heapsize"] [dependencies] encoding = {version = "0.2", optional = true} +encoding_rs = {version = "0.7", optional = true} heapsize = {version = ">=0.4.1, <0.5", optional = true} idna = { version = "0.1.0", path = "./idna" } matches = "0.1" @@ -49,4 +51,4 @@ rustc-serialize = {version = "0.3", optional = true} serde = {version = ">=0.6.1, <0.9", optional = true} [package.metadata.docs.rs] -features = ["query_encoding"] +features = ["query_encoding_2", "query_encoding"] diff --git a/src/encoding/encoding_rs.rs b/src/encoding/encoding_rs.rs new file mode 100644 index 000000000..7b5014f1d --- /dev/null +++ b/src/encoding/encoding_rs.rs @@ -0,0 +1,125 @@ +// Copyright 2013-2018 The rust-url developers. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. 
This file may not be copied, modified, or distributed +// except according to those terms. + + +//! Implementation using [encoding_rs](https://github.com/hsivonen/encoding_rs). +//! Only built with feature flag `query_encoding_2`. + +extern crate encoding_rs; + +use encoding::EncodingOverride; +use encoding::utf8_helpers::{decode_utf8_lossy, encode_utf8}; + +use std::borrow::Cow; +use std::fmt::{self, Debug, Formatter}; + +use self::encoding_rs::Encoding; + +pub struct EncodingOverrideRs { + /// `None` means UTF-8. + encoding: Option<&'static Encoding> +} + +impl EncodingOverrideRs { + fn from_encoding(encoding: &'static Encoding) -> Self { + Self { + encoding: if encoding.name() == "UTF-8" { None } else { Some(encoding) } + } + } +} + +impl EncodingOverride for EncodingOverrideRs { + #[inline] + fn utf8() -> Self { + Self { encoding: None } + } + + fn lookup(label: &[u8]) -> Option { + // Don't use String::from_utf8_lossy since no encoding label contains U+FFFD + // https://encoding.spec.whatwg.org/#names-and-labels + Encoding::for_label(label) + .map(Self::from_encoding) + } + + fn is_utf8(&self) -> bool { + self.encoding.is_none() + } + + fn name(&self) -> &'static str { + match self.encoding { + Some(encoding) => encoding.name(), + None => encoding_rs::UTF_8.name(), + } + } + + fn decode<'a>(&self, input: Cow<'a, [u8]>) -> Cow<'a, str> { + match self.encoding { + Some(encoding) => { + match input { + Cow::Borrowed(b) => { + let (cow, _) = encoding.decode_without_bom_handling(b); + cow + }, + Cow::Owned(v) => { + { + let (cow, _) = encoding.decode_without_bom_handling(&v[..]); + match cow { + Cow::Owned(s) => { + // Free old heap buffer and return a new one. + return Cow::Owned(s); + }, + Cow::Borrowed(_) => {}, + } + } + // Reuse the old heap buffer. 
+ Cow::Owned(unsafe { String::from_utf8_unchecked(v) }) + }, + } + }, + None => decode_utf8_lossy(input), + } + } + + fn encode<'a>(&self, input: Cow<'a, str>) -> Cow<'a, [u8]> { + match self.encoding { + Some(encoding) => { + match input { + Cow::Borrowed(s) => { + let (cow, _, _) = encoding.encode(s); + cow + }, + Cow::Owned(s) => { + { + let (cow, _, _) = encoding.encode(&s[..]); + match cow { + Cow::Owned(v) => { + // Free old heap buffer and return a new one. + return Cow::Owned(v); + }, + Cow::Borrowed(_) => {}, + } + } + // Reuse the old heap buffer. + Cow::Owned(s.into_bytes()) + }, + } + }, + None => encode_utf8(input), + } + } +} + +impl Debug for EncodingOverrideRs { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "EncodingOverride {{ encoding: ")?; + match self.encoding { + Some(e) => write!(f, "{} }}", e.name()), + None => write!(f, "None }}") + } + } +} diff --git a/src/encoding/legacy.rs b/src/encoding/legacy.rs index 10d5cefa9..13c6b40e9 100644 --- a/src/encoding/legacy.rs +++ b/src/encoding/legacy.rs @@ -7,8 +7,10 @@ // except according to those terms. -//! Implementation using rust-encoding (legacy). -//! Only built with feature flag `query_encoding`. +//! Legacy implementation using +//! [rust-encoding](https://github.com/lifthrasiir/rust-encoding). +//! Only built when setting feature flag `query_encoding`. +//! Use feature flag `query_encoding_2` for the new `encoding_rs` implementation. 
extern crate encoding; diff --git a/src/encoding/mod.rs b/src/encoding/mod.rs index f43290d4a..82667f518 100644 --- a/src/encoding/mod.rs +++ b/src/encoding/mod.rs @@ -15,11 +15,17 @@ mod utf8_helpers; use std::borrow::Cow; use std::fmt::Debug; +#[cfg(feature = "query_encoding_2")] mod encoding_rs; +#[cfg(feature = "query_encoding_2")] use self::encoding_rs::EncodingOverrideRs; + #[cfg(feature = "query_encoding")] mod legacy; #[cfg(feature = "query_encoding")] pub use self::legacy::{EncodingOverrideLegacy, EncodingRef}; -#[cfg(not(feature = "query_encoding"))] mod fallback; -#[cfg(not(feature = "query_encoding"))] use self::fallback::EncodingOverrideFallback; +#[cfg(not(any(feature = "query_encoding", feature = "query_encoding_2")))] +mod fallback; +#[cfg(not(any(feature = "query_encoding", feature = "query_encoding_2")))] +use self::fallback::EncodingOverrideFallback; + pub trait EncodingOverride : Debug { /// Get an Encoding representing UTF-8. @@ -54,12 +60,17 @@ pub trait EncodingOverride : Debug { fn encode<'a>(&self, input: Cow<'a, str>) -> Cow<'a, [u8]>; } -#[cfg(feature = "query_encoding")] +#[cfg(feature = "query_encoding_2")] +pub fn default_encoding_override() -> EncodingOverrideRs { + EncodingOverrideRs::utf8() +} + +#[cfg(all(feature = "query_encoding", not(feature = "query_encoding_2")))] pub fn default_encoding_override() -> EncodingOverrideLegacy { EncodingOverrideLegacy::utf8() } -#[cfg(not(feature = "query_encoding"))] +#[cfg(not(any(feature = "query_encoding", feature = "query_encoding_2")))] pub fn default_encoding_override() -> EncodingOverrideFallback { EncodingOverrideFallback::utf8() } From eefd35b2049c033ed95c06c95c5a60e4d13c66aa Mon Sep 17 00:00:00 2001 From: Andrew Shu Date: Fri, 6 Apr 2018 12:21:30 -0700 Subject: [PATCH 3/5] encoding: export API taking &[u8] label hides upstream types from downstream consumers. also deprecate `query_encoding` feature since it exposes upstream types. 
--- src/encoding/mod.rs | 20 ++++++++++++++++++ src/form_urlencoded.rs | 46 ++++++++++++++++++++++++++++++++++++++++-- src/lib.rs | 15 ++++++++++++++ 3 files changed, 79 insertions(+), 2 deletions(-) diff --git a/src/encoding/mod.rs b/src/encoding/mod.rs index 82667f518..20788eba5 100644 --- a/src/encoding/mod.rs +++ b/src/encoding/mod.rs @@ -65,11 +65,31 @@ pub fn default_encoding_override() -> EncodingOverrideRs { EncodingOverrideRs::utf8() } +#[cfg(feature = "query_encoding_2")] +pub fn encoding_override_for_label(label: Option<&[u8]>) -> EncodingOverrideRs { + if let Some(label) = label { + if let Some(encoding) = EncodingOverrideRs::lookup(label) { + return encoding; + } + } + EncodingOverrideRs::utf8() +} + #[cfg(all(feature = "query_encoding", not(feature = "query_encoding_2")))] pub fn default_encoding_override() -> EncodingOverrideLegacy { EncodingOverrideLegacy::utf8() } +#[cfg(all(feature = "query_encoding", not(feature = "query_encoding_2")))] +pub fn encoding_override_for_label(label: Option<&[u8]>) -> EncodingOverrideLegacy { + if let Some(label) = label { + if let Some(encoding) = EncodingOverrideLegacy::lookup(label) { + return encoding; + } + } + EncodingOverrideLegacy::utf8() +} + #[cfg(not(any(feature = "query_encoding", feature = "query_encoding_2")))] pub fn default_encoding_override() -> EncodingOverrideFallback { EncodingOverrideFallback::utf8() diff --git a/src/form_urlencoded.rs b/src/form_urlencoded.rs index 4631ef592..08bd473a7 100644 --- a/src/form_urlencoded.rs +++ b/src/form_urlencoded.rs @@ -16,6 +16,7 @@ use encoding::EncodingOverride; #[cfg(feature = "query_encoding")] use encoding::EncodingOverrideLegacy; use encoding::default_encoding_override; +#[cfg(any(feature = "query_encoding", feature = "query_encoding_2"))] use encoding::encoding_override_for_label; use percent_encoding::{percent_encode_byte, percent_decode}; use std::borrow::{Borrow, Cow}; use std::fmt; @@ -39,6 +40,29 @@ pub fn parse(input: &[u8]) -> Parse { } +/// 
Convert a byte string in the `application/x-www-form-urlencoded` syntax +/// into a iterator of (name, value) pairs. +/// +/// Use `parse(input.as_bytes())` to parse a `&str` string. +/// +/// This function is only available if the `query_encoding_2` or `query_encoding` +/// [feature](http://doc.crates.io/manifest.html#the-features-section]) is enabled. +/// +/// Arguments: +/// +/// * `encoding_label`: The character encoding each name and values is decoded as +/// after percent-decoding. Defaults to UTF-8. +/// Labels are listed at https://encoding.spec.whatwg.org/#names-and-labels +/// * `use_charset`: The *use _charset_ flag*. If in doubt, set to `false`. +#[cfg(any(feature = "query_encoding", feature = "query_encoding_2"))] +pub fn parse_with_encoding_label<'a>(input: &'a [u8], + encoding_label: Option<&[u8]>, + use_charset: bool) + -> Result, ()> { + let encoding = encoding_override_for_label(encoding_label); + parse_with_encoding_override(input, encoding, use_charset) +} + /// Convert a byte string in the `application/x-www-form-urlencoded` syntax /// into a iterator of (name, value) pairs. /// @@ -54,13 +78,23 @@ pub fn parse(input: &[u8]) -> Parse { /// `EncodingRef` is defined in [rust-encoding](https://github.com/lifthrasiir/rust-encoding). /// * `use_charset`: The *use _charset_ flag*. If in doubt, set to `false`. 
#[cfg(feature = "query_encoding")] +#[deprecated(note="Build with `query_encoding_2` instead")] pub fn parse_with_encoding<'a>(input: &'a [u8], encoding_override: Option<::encoding::EncodingRef>, use_charset: bool) -> Result, ()> { + let encoding = EncodingOverrideLegacy::from_opt_encoding(encoding_override); + parse_with_encoding_override(input, encoding, use_charset) +} + +#[cfg(any(feature = "query_encoding", feature = "query_encoding_2"))] +fn parse_with_encoding_override<'a, E>(input: &'a [u8], + mut encoding: E, + use_charset: bool) + -> Result, ()> + where E: 'static + EncodingOverride { use std::ascii::AsciiExt; - let mut encoding = EncodingOverrideLegacy::from_opt_encoding(encoding_override); if !(encoding.is_utf8() || input.is_ascii()) { return Err(()) } @@ -297,8 +331,16 @@ impl Serializer { self } + /// Set the character encoding to be used for names and values before percent-encoding. + #[cfg(any(feature = "query_encoding", feature = "query_encoding_2"))] + pub fn encoding_override_for_label(&mut self, label: Option<&[u8]>) -> &mut Self { + self.encoding = Rc::new(encoding_override_for_label(label).to_output_encoding()); + self + } + /// Set the character encoding to be used for names and values before percent-encoding. #[cfg(feature = "query_encoding")] + #[deprecated(note="Build with `query_encoding_2` instead")] pub fn encoding_override(&mut self, new: Option<::encoding::EncodingRef>) -> &mut Self { self.encoding = Rc::new(EncodingOverrideLegacy::from_opt_encoding(new).to_output_encoding()); self @@ -346,7 +388,7 @@ impl Serializer { /// (See the `encoding_override()` method.) /// /// Panics if called after `.finish()`. 
- #[cfg(feature = "query_encoding")] + #[cfg(any(feature = "query_encoding", feature = "query_encoding_2"))] pub fn append_charset(&mut self) -> &mut Self { assert!(self.custom_encoding.is_none(), "Cannot use both custom_encoding_override() and append_charset()"); diff --git a/src/lib.rs b/src/lib.rs index 329dcca6a..d7516946e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -116,6 +116,7 @@ pub extern crate percent_encoding; use encoding::EncodingOverride; use encoding::default_encoding_override; +#[cfg(any(feature = "query_encoding", feature = "query_encoding_2"))] use encoding::encoding_override_for_label; #[cfg(feature = "query_encoding")] use encoding::EncodingOverrideLegacy; #[cfg(feature = "heapsize")] use heapsize::HeapSizeOf; use host::HostInternal; @@ -199,6 +200,19 @@ impl<'a> ParseOptions<'a> { self } + /// Override the character encoding of query strings. + /// This is a legacy concept only relevant for HTML. + /// + /// Labels are listed at https://encoding.spec.whatwg.org/#names-and-labels + /// + /// This method is only available if the `query_encoding_2` or `query_encoding` + /// [feature](http://doc.crates.io/manifest.html#the-features-section) is enabled. + #[cfg(any(feature = "query_encoding", feature = "query_encoding_2"))] + pub fn encoding_override_for_label(mut self, label: Option<&[u8]>) -> Self { + self.encoding_override = Rc::new(encoding_override_for_label(label).to_output_encoding()); + self + } + /// Override the character encoding of query strings. /// This is a legacy concept only relevant for HTML. /// @@ -207,6 +221,7 @@ impl<'a> ParseOptions<'a> { /// This method is only available if the `query_encoding` /// [feature](http://doc.crates.io/manifest.html#the-features-section]) is enabled. 
#[cfg(feature = "query_encoding")] + #[deprecated(note="Build with `query_encoding_2` instead")] pub fn encoding_override(mut self, new: Option) -> Self { self.encoding_override = Rc::new(EncodingOverrideLegacy::from_opt_encoding(new).to_output_encoding()); self From 99086f2c7cb6dc55f1d201df5844dabd922b74ae Mon Sep 17 00:00:00 2001 From: Andrew Shu Date: Fri, 6 Apr 2018 12:37:40 -0700 Subject: [PATCH 4/5] travis: test building potentially conflicting features --- .travis.yml | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index cbd876905..e6e385841 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,17 +8,37 @@ jobs: - cargo update # getopts is only used in tests. Its versions 0.2.16+ don’t build on 1.17.0 - cargo update -p getopts --precise 0.2.15 - # data-url uses pub(crate) which is unstable in 1.17 - script: cargo test --all-features -p url -p idna -p percent-encoding -p url_serde + script: + # test building potentially conflicting features + - cargo build + - cargo build --features query_encoding + - cargo build --features query_encoding_2 + # data-url uses pub(crate) which is unstable in 1.17 + - cargo test --all-features -p url -p idna -p percent-encoding -p url_serde - rust: stable - script: cargo test --all-features --all + script: + # test building potentially conflicting features + - cargo build + - cargo build --features query_encoding + - cargo build --features query_encoding_2 + - cargo test --all-features --all - rust: beta - script: cargo test --all-features --all + script: + # test building potentially conflicting features + - cargo build + - cargo build --features query_encoding + - cargo build --features query_encoding_2 + - cargo test --all-features --all - rust: nightly - script: cargo test --all-features --all + script: + # test building potentially conflicting features + - cargo build + - cargo build --features query_encoding + - cargo build --features query_encoding_2 + 
- cargo test --all-features --all - rust: nightly env: TARGET=WASM32 # For job list UI From fb41c2b474e03cf0bcb20e6a4a24d9967a2b0d8e Mon Sep 17 00:00:00 2001 From: Andrew Shu Date: Fri, 6 Apr 2018 12:50:45 -0700 Subject: [PATCH 5/5] tests: fix for structs losing Copy trait due to EncodingOverride trait --- src/lib.rs | 2 +- tests/unit.rs | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index d7516946e..c926afa03 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1161,7 +1161,7 @@ impl Url { /// let url = Url::parse("https://example.com/products?page=2&sort=desc")?; /// let mut pairs = url.query_pairs(); /// - /// assert_eq!(pairs.count(), 2); + /// assert_eq!(pairs.clone().count(), 2); /// /// assert_eq!(pairs.next(), Some((Cow::Borrowed("page"), Cow::Borrowed("2")))); /// assert_eq!(pairs.next(), Some((Cow::Borrowed("sort"), Cow::Borrowed("desc")))); diff --git a/tests/unit.rs b/tests/unit.rs index 10bb86a9d..9a9ce356f 100644 --- a/tests/unit.rs +++ b/tests/unit.rs @@ -528,16 +528,16 @@ fn test_syntax_violation_callback_lifetimes() { } #[test] -fn test_options_reuse() { +fn test_options_clone() { use url::SyntaxViolation::*; let violations = RefCell::new(Vec::new()); let vfn = |v| violations.borrow_mut().push(v); let options = Url::options() .syntax_violation_callback(Some(&vfn)); - let url = options.parse("http:////mozilla.org").unwrap(); + let url = options.clone().parse("http:////mozilla.org").unwrap(); - let options = options.base_url(Some(&url)); + let options = options.clone().base_url(Some(&url)); let url = options.parse("/sub\\path").unwrap(); assert_eq!(url.as_str(), "http://mozilla.org/sub/path"); assert_eq!(*violations.borrow(),