Merge branch 'main' into locid

robertbastian · robertbastian · commit dd9f2a1b3ea9 · 2022-03-16T09:27:10.000+01:00
diff --git a/components/locale_canonicalizer/src/locale_canonicalizer.rs b/components/locale_canonicalizer/src/locale_canonicalizer.rs
@@ -165,8 +165,8 @@ fn uts35_check_language_rules(
     locale: &mut Locale,
     alias_data: &DataPayload<AliasesV1Marker>,
 ) -> CanonicalizationResult {
-    let maybe_lang: Option<TinyAsciiStr<3>> = locale.id.language.into();
-    if let Some(lang) = maybe_lang {
+    if !locale.id.language.is_empty() {
+        let lang: TinyAsciiStr<3> = locale.id.language.into();
         let replacement = if lang.len() == 2 {
             alias_data
                 .get()
@@ -557,14 +557,24 @@ impl LocaleCanonicalizer {
             return CanonicalizationResult::Unmodified;
         }
 
-        if let Some(language) = langid.language.into() {
+        if !langid.language.is_empty() {
             if let Some(region) = langid.region {
-                maximize_locale!(langid, data.language_region, language, region.into());
+                maximize_locale!(
+                    langid,
+                    data.language_region,
+                    langid.language.into(),
+                    region.into()
+                );
             }
             if let Some(script) = langid.script {
-                maximize_locale!(langid, data.language_script, language, script.into());
+                maximize_locale!(
+                    langid,
+                    data.language_script,
+                    langid.language.into(),
+                    script.into()
+                );
             }
-            maximize_locale!(langid, data.language, language);
+            maximize_locale!(langid, data.language, langid.language.into());
         }
         if let Some(script) = langid.script {
             if let Some(region) = langid.region {
diff --git a/components/locid/src/subtags/language.rs b/components/locid/src/subtags/language.rs
@@ -6,7 +6,7 @@ use crate::parser::errors::ParserError;
 use core::fmt;
 use core::ops::RangeInclusive;
 use core::str::FromStr;
-use tinystr::{tinystr, TinyAsciiStr};
+use tinystr::TinyAsciiStr;
 
 /// A language subtag (examples: `"en"`, `"csb"`, `"zh"`, `"und"`, etc.)
 ///
@@ -38,11 +38,13 @@ use tinystr::{tinystr, TinyAsciiStr};
 /// but that form has not been used and ICU4X does not support it right now.
 ///
 /// [`unicode_language_id`]: https://unicode.org/reports/tr35/#unicode_language_id
-#[derive(Default, Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord, Copy)]
-pub struct Language(Option<TinyAsciiStr<{ *LANGUAGE_LENGTH.end() }>>);
+#[derive(Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord, Copy)]
+pub struct Language(TinyAsciiStr<{ *LANGUAGE_LENGTH.end() }>);
 
 const LANGUAGE_LENGTH: RangeInclusive<usize> = 2..=3;
-const UND_VALUE: TinyAsciiStr<3> = tinystr!(3, "und");
+// TODO(#348): Change this to invoke a const function.
+// SAFETY: "und" is exactly three lowercase ASCII letters, so it is a valid language subtag.
+const UND: Language = Language(unsafe { TinyAsciiStr::from_bytes_unchecked(*b"und") });
 
 impl Language {
     /// A constructor which takes a utf8 slice, parses it and
@@ -75,16 +77,8 @@ impl Language {
             return Err(ParserError::InvalidLanguage);
         }
 
-        if slen == 3
-            && v[start] == UND_VALUE.all_bytes()[0]
-            && v[start + 1] == UND_VALUE.all_bytes()[1]
-            && v[start + 2] == UND_VALUE.all_bytes()[2]
-        {
-            return Ok(Self(None));
-        }
-
         match TinyAsciiStr::from_bytes_manual_slice(v, start, end) {
-            Ok(s) if s.is_ascii_alphabetic() => Ok(Self(Some(s.to_ascii_lowercase()))),
+            Ok(s) if s.is_ascii_alphabetic() => Ok(Self(s.to_ascii_lowercase())),
             _ => Err(ParserError::InvalidLanguage),
         }
     }
@@ -104,8 +98,8 @@ impl Language {
     /// let lang = unsafe { Language::from_raw_unchecked(raw) };
     /// assert_eq!(lang, "en");
     /// ```
-    pub fn into_raw(self) -> Option<[u8; 3]> {
-        self.0.as_ref().map(TinyAsciiStr::all_bytes).copied()
+    pub fn into_raw(self) -> [u8; 3] {
+        *self.0.all_bytes()
     }
 
     /// Constructor which takes a raw value returned by
@@ -128,11 +122,8 @@ impl Language {
     ///
     /// This function accepts a [`[u8; 3]`] that is expected to be a valid [`TinyAsciiStr<3>`]
     /// representing a [`Language`] subtag in canonical syntax.
-    pub const unsafe fn from_raw_unchecked(v: Option<[u8; 3]>) -> Self {
-        Self(match v {
-            Some(v) => Some(TinyAsciiStr::from_bytes_unchecked(v)),
-            None => None,
-        })
+    pub const unsafe fn from_raw_unchecked(v: [u8; 3]) -> Self {
+        Self(TinyAsciiStr::from_bytes_unchecked(v))
     }
 
     /// Returns the default undefined language "und". Same as [`default()`](Default::default()), but is `const`.
@@ -148,7 +139,7 @@ impl Language {
     /// ```
     #[inline]
     pub const fn und() -> Self {
-        Self(None)
+        UND
     }
 
     /// A helper function for displaying
@@ -167,11 +158,12 @@ impl Language {
     ///
     /// `Notice`: For many use cases, such as comparison,
     /// [`Language`] implements [`PartialEq`]`<&`[`str`]`>` which allows for direct comparisons.
+    #[inline]
     pub fn as_str(&self) -> &str {
-        self.0.as_deref().unwrap_or("und")
+        self.0.as_str()
     }
 
-    /// Resets the [`Language`] subtag to an empty one.
+    /// Resets the [`Language`] subtag to an empty one (equal to `"und"`).
     ///
     /// # Examples
     ///
@@ -187,11 +179,12 @@ impl Language {
     ///
     /// assert_eq!(lang.as_str(), "und");
     /// ```
+    #[inline]
     pub fn clear(&mut self) {
-        self.0.take();
+        *self = UND
     }
 
-    /// Tests if the [`Language`] subtag is empty.
+    /// Tests if the [`Language`] subtag is empty (equal to `"und"`).
     ///
     /// # Examples
     ///
@@ -207,8 +200,9 @@ impl Language {
     ///
     /// assert_eq!(lang.is_empty(), true);
     /// ```
+    #[inline]
     pub fn is_empty(self) -> bool {
-        self.0.is_none()
+        self == UND
     }
 }
 
@@ -233,7 +227,7 @@ impl writeable::Writeable for Language {
 
     #[inline]
     fn write_len(&self) -> writeable::LengthHint {
-        writeable::LengthHint::exact(self.0.map_or(3, |t| t.len()))
+        writeable::LengthHint::exact(self.0.len())
     }
 }
 
@@ -256,8 +250,14 @@ impl<'l> From<&'l Language> for &'l str {
     }
 }
 
-impl From<Language> for Option<TinyAsciiStr<3>> {
+impl From<Language> for TinyAsciiStr<3> {
     fn from(input: Language) -> Self {
-        input.0.map(Into::into)
+        input.0
+    }
+}
+
+impl Default for Language {
+    fn default() -> Language {
+        Language::und()
     }
 }
diff --git a/provider/cldr/src/transform/locale_canonicalizer/aliases.rs b/provider/cldr/src/transform/locale_canonicalizer/aliases.rs
@@ -138,8 +138,8 @@ impl From<&cldr_serde::aliases::Resource> for AliasesV1 {
                         continue;
                     }
 
-                    let maybe_lang: Option<TinyAsciiStr<3>> = langid.language.into();
-                    if let Some(lang) = maybe_lang {
+                    if !langid.language.is_empty() {
+                        let lang: TinyAsciiStr<3> = langid.language.into();
                         if langid.region.is_none() && langid.variants.is_empty() {
                             // Relatively few aliases exist for two character language identifiers,
                             // so we store them separately to not slow down canonicalization of
@@ -275,9 +275,9 @@ fn test_rules_cmp() {
     assert_eq!(union_size(&rules[3]), 2);
 
     rules.sort_unstable_by(rules_cmp);
-    assert_eq!(rules[0], "und-hepburn-heploc");
-    assert_eq!(rules[1], "en-GB");
-    assert_eq!(rules[2], "fr-CA");
+    assert_eq!(rules[0], "en-GB");
+    assert_eq!(rules[1], "fr-CA");
+    assert_eq!(rules[2], "und-hepburn-heploc");
     assert_eq!(rules[3], "CA");
 }
 
diff --git a/provider/cldr/src/transform/locale_canonicalizer/likely_subtags.rs b/provider/cldr/src/transform/locale_canonicalizer/likely_subtags.rs
@@ -108,7 +108,8 @@ impl From<&cldr_serde::likely_subtags::Resource> for LikelySubtagsV1 {
             };
 
         for entry in other.supplemental.likely_subtags.iter() {
-            if let Some(lang) = entry.0.language.into() {
+            if !entry.0.language.is_empty() {
+                let lang = entry.0.language.into();
                 if let Some(script) = entry.0.script {
                     language_script.insert((lang, script.into()), extract_result(entry));
                 } else if let Some(region) = entry.0.region {
diff --git a/provider/testdata/data/json/locale_canonicalizer/aliases@1.json b/provider/testdata/data/json/locale_canonicalizer/aliases@1.json
@@ -1,10 +1,6 @@
 {
   "language": [],
   "language_variants": [
-    [
-      "und-hepburn-heploc",
-      "und-alalc97"
-    ],
     [
       "aa-saaho",
       "ssy"
@@ -29,6 +25,10 @@
       "no-nynorsk",
       "nn"
     ],
+    [
+      "und-hepburn-heploc",
+      "und-alalc97"
+    ],
     [
       "zh-guoyu",
       "zh"
diff --git a/provider/testdata/data/testdata.postcard b/provider/testdata/data/testdata.postcard