Skip to content

Commit dd9f2a1

Browse files
committed
Merge branch 'main' into locid
2 parents b20b35a + 0fd8cde commit dd9f2a1

File tree

6 files changed

+56
-45
lines changed

6 files changed

+56
-45
lines changed

components/locale_canonicalizer/src/locale_canonicalizer.rs

Lines changed: 16 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -165,8 +165,8 @@ fn uts35_check_language_rules(
165165
locale: &mut Locale,
166166
alias_data: &DataPayload<AliasesV1Marker>,
167167
) -> CanonicalizationResult {
168-
let maybe_lang: Option<TinyAsciiStr<3>> = locale.id.language.into();
169-
if let Some(lang) = maybe_lang {
168+
if !locale.id.language.is_empty() {
169+
let lang: TinyAsciiStr<3> = locale.id.language.into();
170170
let replacement = if lang.len() == 2 {
171171
alias_data
172172
.get()
@@ -557,14 +557,24 @@ impl LocaleCanonicalizer {
557557
return CanonicalizationResult::Unmodified;
558558
}
559559

560-
if let Some(language) = langid.language.into() {
560+
if !langid.language.is_empty() {
561561
if let Some(region) = langid.region {
562-
maximize_locale!(langid, data.language_region, language, region.into());
562+
maximize_locale!(
563+
langid,
564+
data.language_region,
565+
langid.language.into(),
566+
region.into()
567+
);
563568
}
564569
if let Some(script) = langid.script {
565-
maximize_locale!(langid, data.language_script, language, script.into());
570+
maximize_locale!(
571+
langid,
572+
data.language_script,
573+
langid.language.into(),
574+
script.into()
575+
);
566576
}
567-
maximize_locale!(langid, data.language, language);
577+
maximize_locale!(langid, data.language, langid.language.into());
568578
}
569579
if let Some(script) = langid.script {
570580
if let Some(region) = langid.region {

components/locid/src/subtags/language.rs

Lines changed: 29 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@ use crate::parser::errors::ParserError;
66
use core::fmt;
77
use core::ops::RangeInclusive;
88
use core::str::FromStr;
9-
use tinystr::{tinystr, TinyAsciiStr};
9+
use tinystr::TinyAsciiStr;
1010

1111
/// A language subtag (examples: `"en"`, `"csb"`, `"zh"`, `"und"`, etc.)
1212
///
@@ -38,11 +38,13 @@ use tinystr::{tinystr, TinyAsciiStr};
3838
/// but that form has not been used and ICU4X does not support it right now.
3939
///
4040
/// [`unicode_language_id`]: https://unicode.org/reports/tr35/#unicode_language_id
41-
#[derive(Default, Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord, Copy)]
42-
pub struct Language(Option<TinyAsciiStr<{ *LANGUAGE_LENGTH.end() }>>);
41+
#[derive(Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord, Copy)]
42+
pub struct Language(TinyAsciiStr<{ *LANGUAGE_LENGTH.end() }>);
4343

4444
const LANGUAGE_LENGTH: RangeInclusive<usize> = 2..=3;
45-
const UND_VALUE: TinyAsciiStr<3> = tinystr!(3, "und");
45+
// TODO(#348): Change this to invoke a const function.
46+
// Safe because "und" is a valid language subtag
47+
const UND: Language = Language(unsafe { TinyAsciiStr::from_bytes_unchecked(*b"und") });
4648

4749
impl Language {
4850
/// A constructor which takes a utf8 slice, parses it and
@@ -75,16 +77,8 @@ impl Language {
7577
return Err(ParserError::InvalidLanguage);
7678
}
7779

78-
if slen == 3
79-
&& v[start] == UND_VALUE.all_bytes()[0]
80-
&& v[start + 1] == UND_VALUE.all_bytes()[1]
81-
&& v[start + 2] == UND_VALUE.all_bytes()[2]
82-
{
83-
return Ok(Self(None));
84-
}
85-
8680
match TinyAsciiStr::from_bytes_manual_slice(v, start, end) {
87-
Ok(s) if s.is_ascii_alphabetic() => Ok(Self(Some(s.to_ascii_lowercase()))),
81+
Ok(s) if s.is_ascii_alphabetic() => Ok(Self(s.to_ascii_lowercase())),
8882
_ => Err(ParserError::InvalidLanguage),
8983
}
9084
}
@@ -104,8 +98,8 @@ impl Language {
10498
/// let lang = unsafe { Language::from_raw_unchecked(raw) };
10599
/// assert_eq!(lang, "en");
106100
/// ```
107-
pub fn into_raw(self) -> Option<[u8; 3]> {
108-
self.0.as_ref().map(TinyAsciiStr::all_bytes).copied()
101+
pub fn into_raw(self) -> [u8; 3] {
102+
*self.0.all_bytes()
109103
}
110104

111105
/// Constructor which takes a raw value returned by
@@ -128,11 +122,8 @@ impl Language {
128122
///
129123
/// This function accepts a [`[u8; 3]`] that is expected to be a valid [`TinyAsciiStr<3>`]
130124
/// representing a [`Language`] subtag in canonical syntax.
131-
pub const unsafe fn from_raw_unchecked(v: Option<[u8; 3]>) -> Self {
132-
Self(match v {
133-
Some(v) => Some(TinyAsciiStr::from_bytes_unchecked(v)),
134-
None => None,
135-
})
125+
pub const unsafe fn from_raw_unchecked(v: [u8; 3]) -> Self {
126+
Self(TinyAsciiStr::from_bytes_unchecked(v))
136127
}
137128

138129
/// Returns the default undefined language "und". Same as [`default()`](Default::default()), but is `const`.
@@ -148,7 +139,7 @@ impl Language {
148139
/// ```
149140
#[inline]
150141
pub const fn und() -> Self {
151-
Self(None)
142+
UND
152143
}
153144

154145
/// A helper function for displaying
@@ -167,11 +158,12 @@ impl Language {
167158
///
168159
/// `Notice`: For many use cases, such as comparison,
169160
/// [`Language`] implements [`PartialEq`]`<&`[`str`]`>` which allows for direct comparisons.
161+
#[inline]
170162
pub fn as_str(&self) -> &str {
171-
self.0.as_deref().unwrap_or("und")
163+
self.0.as_str()
172164
}
173165

174-
/// Resets the [`Language`] subtag to an empty one.
166+
/// Resets the [`Language`] subtag to an empty one (equal to `"und"`).
175167
///
176168
/// # Examples
177169
///
@@ -187,11 +179,12 @@ impl Language {
187179
///
188180
/// assert_eq!(lang.as_str(), "und");
189181
/// ```
182+
#[inline]
190183
pub fn clear(&mut self) {
191-
self.0.take();
184+
*self = UND
192185
}
193186

194-
/// Tests if the [`Language`] subtag is empty.
187+
/// Tests if the [`Language`] subtag is empty (equal to `"und"`).
195188
///
196189
/// # Examples
197190
///
@@ -207,8 +200,9 @@ impl Language {
207200
///
208201
/// assert_eq!(lang.is_empty(), true);
209202
/// ```
203+
#[inline]
210204
pub fn is_empty(self) -> bool {
211-
self.0.is_none()
205+
self == UND
212206
}
213207
}
214208

@@ -233,7 +227,7 @@ impl writeable::Writeable for Language {
233227

234228
#[inline]
235229
fn write_len(&self) -> writeable::LengthHint {
236-
writeable::LengthHint::exact(self.0.map_or(3, |t| t.len()))
230+
writeable::LengthHint::exact(self.0.len())
237231
}
238232
}
239233

@@ -256,8 +250,14 @@ impl<'l> From<&'l Language> for &'l str {
256250
}
257251
}
258252

259-
impl From<Language> for Option<TinyAsciiStr<3>> {
253+
impl From<Language> for TinyAsciiStr<3> {
260254
fn from(input: Language) -> Self {
261-
input.0.map(Into::into)
255+
input.0
256+
}
257+
}
258+
259+
impl Default for Language {
260+
fn default() -> Language {
261+
Language::und()
262262
}
263263
}

provider/cldr/src/transform/locale_canonicalizer/aliases.rs

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -138,8 +138,8 @@ impl From<&cldr_serde::aliases::Resource> for AliasesV1 {
138138
continue;
139139
}
140140

141-
let maybe_lang: Option<TinyAsciiStr<3>> = langid.language.into();
142-
if let Some(lang) = maybe_lang {
141+
if !langid.language.is_empty() {
142+
let lang: TinyAsciiStr<3> = langid.language.into();
143143
if langid.region.is_none() && langid.variants.is_empty() {
144144
// Relatively few aliases exist for two character language identifiers,
145145
// so we store them separately to not slow down canonicalization of
@@ -275,9 +275,9 @@ fn test_rules_cmp() {
275275
assert_eq!(union_size(&rules[3]), 2);
276276

277277
rules.sort_unstable_by(rules_cmp);
278-
assert_eq!(rules[0], "und-hepburn-heploc");
279-
assert_eq!(rules[1], "en-GB");
280-
assert_eq!(rules[2], "fr-CA");
278+
assert_eq!(rules[0], "en-GB");
279+
assert_eq!(rules[1], "fr-CA");
280+
assert_eq!(rules[2], "und-hepburn-heploc");
281281
assert_eq!(rules[3], "CA");
282282
}
283283

provider/cldr/src/transform/locale_canonicalizer/likely_subtags.rs

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -108,7 +108,8 @@ impl From<&cldr_serde::likely_subtags::Resource> for LikelySubtagsV1 {
108108
};
109109

110110
for entry in other.supplemental.likely_subtags.iter() {
111-
if let Some(lang) = entry.0.language.into() {
111+
if !entry.0.language.is_empty() {
112+
let lang = entry.0.language.into();
112113
if let Some(script) = entry.0.script {
113114
language_script.insert((lang, script.into()), extract_result(entry));
114115
} else if let Some(region) = entry.0.region {

provider/testdata/data/json/locale_canonicalizer/aliases@1/und.json

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,6 @@
11
{
22
"language": [],
33
"language_variants": [
4-
[
5-
"und-hepburn-heploc",
6-
"und-alalc97"
7-
],
84
[
95
"aa-saaho",
106
"ssy"
@@ -29,6 +25,10 @@
2925
"no-nynorsk",
3026
"nn"
3127
],
28+
[
29+
"und-hepburn-heploc",
30+
"und-alalc97"
31+
],
3232
[
3333
"zh-guoyu",
3434
"zh"
0 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)