From 18adff60da6dc2e6d41f2072bf2a4fdf61ce71cc Mon Sep 17 00:00:00 2001 From: Konstantin Vdovkin Date: Wed, 8 May 2019 01:04:55 +0500 Subject: [PATCH] bidi rules --- idna/src/uts46.rs | 49 +++++++++++++++++++++++++++++++++++++-------- idna/tests/unit.rs | 3 ++- idna/tests/uts46.rs | 10 +++++++++ tests/unit.rs | 3 +++ 4 files changed, 56 insertions(+), 9 deletions(-) diff --git a/idna/src/uts46.rs b/idna/src/uts46.rs index ac348d1fa..5e964ad1e 100644 --- a/idna/src/uts46.rs +++ b/idna/src/uts46.rs @@ -115,7 +115,7 @@ fn map_char(codepoint: char, flags: Flags, output: &mut String, errors: &mut Vec } // http://tools.ietf.org/html/rfc5893#section-2 -fn passes_bidi(label: &str, is_bidi_domain: bool) -> bool { +fn passes_bidi(label: &str, is_bidi_domain: bool, after_rtl_label: &mut bool) -> bool { // Rule 0: Bidi Rules apply to Bidi Domain Names: a name with at least one RTL label. A label // is RTL if it contains at least one character of bidi class R, AL or AN. if !is_bidi_domain { @@ -123,7 +123,8 @@ fn passes_bidi(label: &str, is_bidi_domain: bool) -> bool { } let mut chars = label.chars(); - let first_char_class = match chars.next() { + let first_char = chars.next(); + let first_char_class = match first_char { Some(c) => bidi_class(c), None => return true, // empty string }; @@ -174,6 +175,7 @@ fn passes_bidi(label: &str, is_bidi_domain: bool) -> bool { BidiClass::R | BidiClass::AL => { let mut found_en = false; let mut found_an = false; + *after_rtl_label = true; // Rule 2 loop { @@ -223,6 +225,36 @@ fn passes_bidi(label: &str, is_bidi_domain: bool) -> bool { } } + BidiClass::EN => { + // https://github.com/servo/rust-url/issues/489 + // LDH labels that start with a digit are allowed when they don't come after RTL label + if *after_rtl_label { + return false; + } + match first_char { + Some(c) if c.is_ascii() => {}, + _ => { return false; } + }; + // check that label is LDH + // https://tools.ietf.org/html/rfc5890#section-2.3.1 + let mut last = chars.next(); + loop { + match last { + Some(c) => { + if !c.is_ascii() { + return false; + } + last = chars.next(); + } + _ => { break; } + } + } + if last == Some('-') { + return false; + } + return true; + } + // Rule 1: Should start with L or R/AL _ => { return false; @@ -233,16 +265,16 @@ fn passes_bidi(label: &str, is_bidi_domain: bool) -> bool { } /// http://www.unicode.org/reports/tr46/#Validity_Criteria -fn validate_full(label: &str, is_bidi_domain: bool, flags: Flags, errors: &mut Vec) { +fn validate_full(label: &str, is_bidi_domain: bool, after_rtl_label: &mut bool, flags: Flags, errors: &mut Vec) { // V1: Must be in NFC form. if label.nfc().ne(label.chars()) { errors.push(Error::ValidityCriteria); } else { - validate(label, is_bidi_domain, flags, errors); + validate(label, is_bidi_domain, after_rtl_label, flags, errors); } } -fn validate(label: &str, is_bidi_domain: bool, flags: Flags, errors: &mut Vec) { +fn validate(label: &str, is_bidi_domain: bool, after_rtl_label: &mut bool, flags: Flags, errors: &mut Vec) { let first_char = label.chars().next(); if first_char == None { // Empty string, pass @@ -287,7 +319,7 @@ fn validate(label: &str, is_bidi_domain: bool, flags: Flags, errors: &mut Vec) -> String { let mut validated = String::new(); let mut first = true; + let mut after_rtl_label = false; for label in normalized.split('.') { if !first { validated.push('.'); @@ -339,14 +372,14 @@ fn processing(domain: &str, flags: Flags, errors: &mut Vec) -> String { match punycode::decode_to_string(&label[PUNYCODE_PREFIX.len()..]) { Some(decoded_label) => { let flags = Flags { transitional_processing: false, ..flags }; - validate_full(&decoded_label, is_bidi_domain, flags, errors); + validate_full(&decoded_label, is_bidi_domain, &mut after_rtl_label, flags, errors); validated.push_str(&decoded_label) } None => errors.push(Error::PunycodeError) } } else { // `normalized` is already `NFC` so we can skip that check - validate(label, is_bidi_domain, flags, errors); + validate(label, is_bidi_domain, &mut after_rtl_label, flags, errors); validated.push_str(label) } } diff --git a/idna/tests/unit.rs b/idna/tests/unit.rs index a7d158d5c..b620c0bf8 100644 --- a/idna/tests/unit.rs +++ b/idna/tests/unit.rs @@ -32,7 +32,8 @@ fn test_v8_bidi_rules() { assert_eq!(_to_ascii("אבּג.ابج").unwrap(), "xn--kdb3bdf.xn--mgbcm"); // Bidi domain names cannot start with digits - assert!(_to_ascii("0a.\u{05D0}").is_err()); + assert!(_to_ascii("0a.\u{05D0}").is_ok()); + assert!(_to_ascii("\u{05D0}.0a").is_err()); assert!(_to_ascii("0à.\u{05D0}").is_err()); // Bidi chars may be punycode-encoded diff --git a/idna/tests/uts46.rs b/idna/tests/uts46.rs index 59ec1cd76..f801ea448 100644 --- a/idna/tests/uts46.rs +++ b/idna/tests/uts46.rs @@ -10,6 +10,9 @@ use std::char; use idna::uts46; use test::TestFn; +const SKIP_TEST: &'static [&'static str] = &["0A.\\u05D0", "0a.\\u05D0", "0a.xn--4db" + ,"1.걾6.𐱁\\u06D0", "1.걾6.𐱁\\u06D0","1.xn--6-945e.xn--glb1794k"]; + pub fn collect_tests(add_test: &mut F) { // http://www.unicode.org/Public/idna/latest/IdnaTest.txt for (i, line) in include_str!("IdnaTest.txt").lines().enumerate() { @@ -37,6 +40,13 @@ pub fn collect_tests(add_test: &mut F) { let to_ascii = pieces.remove(0); let nv8 = if pieces.len() > 0 { pieces.remove(0) } else { "" }; + for skip in SKIP_TEST { + if original == *skip { + expected_failure = true; + break; + } + } + if expected_failure { continue; } diff --git a/tests/unit.rs b/tests/unit.rs index 62401c943..1a751dc9c 100644 --- a/tests/unit.rs +++ b/tests/unit.rs @@ -226,6 +226,9 @@ fn test_idna() { assert!("http://goșu.ro".parse::().is_ok()); assert_eq!(Url::parse("http://☃.net/").unwrap().host(), Some(Host::Domain("xn--n3h.net"))); assert!("https://r2---sn-huoa-cvhl.googlevideo.com/crossdomain.xml".parse::().is_ok()); + // https://github.com/servo/rust-url/issues/489 + assert!("http://mail.163.com.xn----9mcjf9b4dbm09f.com/iloystgnjfrgthteawvo/indexx.php".parse::().is_ok()); + assert!("http://mail.com.xn----9mcjf9b4dbm09f.163.com/iloystgnjfrgthteawvo/indexx.php".parse::().is_err()); } #[test]