Skip to content

Commit a430f66

Browse files
iainireland and sffc authored
Adds UnicodePropertyMapV1 data struct for enumerated properties (#1161)
* Rename TrieTypeEnum to TrieType

  TrieType no longer exists, so we don't need an awkward name for TrieTypeEnum.
* Implement Yokeable/ZeroCopyFrom for CodePointTrie and data struct
* Cargo fmt + minor fixes
* Rebase on yoke-generics
* Add doc comments
* Address feedback
* Add additional derives
* Update comment on DATA_GET_ERROR_VALUE

  Co-authored-by: Shane F. Carr <[email protected]>
* Cargo fmt

  Co-authored-by: Shane F. Carr <[email protected]>
1 parent 42a820b commit a430f66

File tree

8 files changed

+116
-69
lines changed

8 files changed

+116
-69
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

utils/codepointtrie/Cargo.toml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,10 @@ denylist = ["bench"]
3232
all-features = true
3333

3434
[dependencies]
35+
icu_provider = { version = "0.3", path = "../../provider/core", features = ["macros"] }
3536
serde = { version = "1.0", default-features = false, features = ["derive", "alloc"], optional = true }
3637
thiserror = "1.0"
37-
zerovec = { version = "0.3", path = "../../utils/zerovec", features = ["serde"] }
38+
zerovec = { version = "0.3", path = "../../utils/zerovec", features = ["serde", "yoke"] }
3839

3940
[dev-dependencies]
4041
postcard = { version = "0.7", features = ["alloc"] }
@@ -45,3 +46,7 @@ zerovec = { version = "0.3", path = "../../utils/zerovec", features = ["serde"]
4546
[lib]
4647
bench = false # This option is required for Benchmark CI
4748
path = "src/lib.rs"
49+
50+
[features]
51+
default = ["provider_serde"]
52+
provider_serde = ["serde"]

utils/codepointtrie/src/codepointtrie.rs

Lines changed: 48 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -6,28 +6,17 @@ use crate::error::Error;
66
use crate::impl_const::*;
77

88
use core::convert::TryFrom;
9+
use icu_provider::yoke::{self, Yokeable, ZeroCopyFrom};
910
#[cfg(feature = "serde")]
1011
use serde::{Deserialize, Serialize};
1112
use zerovec::ZeroVec;
1213

13-
// Enums
14-
15-
/// The width of the elements in the data array of a [`CodePointTrie`].
16-
/// See [`UCPTrieValueWidth`](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/ucptrie_8h.html) in ICU4C.
17-
#[derive(Clone, Copy, PartialEq)]
18-
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
19-
pub enum ValueWidthEnum {
20-
Bits16 = 0,
21-
Bits32 = 1,
22-
Bits8 = 2,
23-
}
24-
2514
/// The type of trie represents whether the trie has an optimization that
2615
/// would make it small or fast.
2716
/// See [`UCPTrieType`](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/ucptrie_8h.html) in ICU4C.
28-
#[derive(Clone, Copy, PartialEq)]
17+
#[derive(Clone, Copy, PartialEq, Debug, Eq)]
2918
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
30-
pub enum TrieTypeEnum {
19+
pub enum TrieType {
3120
/// Represents the "fast" type code point tries for the
3221
/// [`TrieType`] enum. The "fast max" limit is set to `0xffff`.
3322
Fast = 0,
@@ -36,51 +25,29 @@ pub enum TrieTypeEnum {
3625
Small = 1,
3726
}
3827

39-
// ValueWidth trait
28+
// TrieValue trait
4029

4130
// AsULE is AsUnalignedLittleEndian, i.e. "allowed in a zerovec"
4231

43-
/// A trait representing the width of the values stored in the data array of a
44-
/// [`CodePointTrie`]. This trait is used as a type parameter in constructing
45-
/// a `CodePointTrie`.
46-
pub trait ValueWidth: Copy + zerovec::ule::AsULE + 'static {
47-
/// This enum variant represents the specific instance of `ValueWidth` such
48-
/// that the enum discriminant values matches ICU4C's enum integer value.
49-
const ENUM_VALUE: ValueWidthEnum;
50-
/// This value is used to indicate an error in the Rust code in accessing
51-
/// a position in the trie's `data` array. In normal cases, the position in
52-
/// the `data` array will return either the correct value, or in case of a
53-
/// logical error in the trie's computation, the trie's own error value
54-
/// which is stored that in the `data` array.
32+
/// A trait representing the values stored in the data array of a [`CodePointTrie`].
33+
/// This trait is used as a type parameter in constructing a `CodePointTrie`.
34+
pub trait TrieValue: Copy + Eq + PartialEq + zerovec::ule::AsULE + 'static {
35+
/// Last-resort fallback value to return if we cannot read data from the trie.
36+
///
37+
/// In most cases, the error value is read from the last element of the `data` array.
5538
const DATA_GET_ERROR_VALUE: Self;
56-
fn cast_to_widest(self) -> u32;
5739
}
5840

59-
impl ValueWidth for u8 {
60-
const ENUM_VALUE: ValueWidthEnum = ValueWidthEnum::Bits8;
41+
impl TrieValue for u8 {
6142
const DATA_GET_ERROR_VALUE: u8 = u8::MAX;
62-
63-
fn cast_to_widest(self) -> u32 {
64-
self as u32
65-
}
6643
}
6744

68-
impl ValueWidth for u16 {
69-
const ENUM_VALUE: ValueWidthEnum = ValueWidthEnum::Bits16;
45+
impl TrieValue for u16 {
7046
const DATA_GET_ERROR_VALUE: u16 = u16::MAX;
71-
72-
fn cast_to_widest(self) -> u32 {
73-
self as u32
74-
}
7547
}
7648

77-
impl ValueWidth for u32 {
78-
const ENUM_VALUE: ValueWidthEnum = ValueWidthEnum::Bits32;
49+
impl TrieValue for u32 {
7950
const DATA_GET_ERROR_VALUE: u32 = u32::MAX;
80-
81-
fn cast_to_widest(self) -> u32 {
82-
self
83-
}
8451
}
8552

8653
/// This struct represents a de-serialized CodePointTrie that was exported from
@@ -90,16 +57,18 @@ impl ValueWidth for u32 {
9057
/// - [ICU Site design doc](http://site.icu-project.org/design/struct/utrie)
9158
/// - [ICU User Guide section on Properties lookup](https://unicode-org.github.io/icu/userguide/strings/properties.html#lookup)
9259
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
93-
pub struct CodePointTrie<'trie, W: ValueWidth> {
60+
#[derive(Debug, Eq, PartialEq, Yokeable, ZeroCopyFrom)]
61+
pub struct CodePointTrie<'trie, T: TrieValue> {
9462
header: CodePointTrieHeader,
9563
#[cfg_attr(feature = "serde", serde(borrow))]
9664
index: ZeroVec<'trie, u16>,
9765
#[cfg_attr(feature = "serde", serde(borrow))]
98-
data: ZeroVec<'trie, W>,
66+
data: ZeroVec<'trie, T>,
9967
}
10068

10169
/// This struct contains the fixed-length header fields of a [`CodePointTrie`].
10270
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
71+
#[derive(Copy, Clone, Debug, Eq, PartialEq, Yokeable, ZeroCopyFrom)]
10372
pub struct CodePointTrieHeader {
10473
/// The code point of the start of the last range of the trie. A
10574
/// range is defined as a partition of the code point space such that the
@@ -130,31 +99,31 @@ pub struct CodePointTrieHeader {
13099
pub null_value: u32,
131100
/// The enum value representing the type of trie, where trie type is as it
132101
/// is defined in ICU (ex: Fast, Small).
133-
pub trie_type: TrieTypeEnum,
102+
pub trie_type: TrieType,
134103
}
135104

136-
impl TryFrom<u8> for TrieTypeEnum {
105+
impl TryFrom<u8> for TrieType {
137106
type Error = crate::error::Error;
138107

139-
fn try_from(trie_type_int: u8) -> Result<TrieTypeEnum, crate::error::Error> {
108+
fn try_from(trie_type_int: u8) -> Result<TrieType, crate::error::Error> {
140109
match trie_type_int {
141-
0 => Ok(TrieTypeEnum::Fast),
142-
1 => Ok(TrieTypeEnum::Small),
110+
0 => Ok(TrieType::Fast),
111+
1 => Ok(TrieType::Small),
143112
_ => Err(crate::error::Error::FromDeserialized {
144113
reason: "Cannot parse value for trie_type",
145114
}),
146115
}
147116
}
148117
}
149118

150-
impl<'trie, W: ValueWidth> CodePointTrie<'trie, W> {
119+
impl<'trie, T: TrieValue> CodePointTrie<'trie, T> {
151120
/// Returns a new [`CodePointTrie`] backed by borrowed data for the `index`
152121
/// array and `data` array, whose data values have type `T`.
153122
pub fn try_new(
154123
header: CodePointTrieHeader,
155124
index: ZeroVec<'trie, u16>,
156-
data: ZeroVec<'trie, W>,
157-
) -> Result<CodePointTrie<'trie, W>, Error> {
125+
data: ZeroVec<'trie, T>,
126+
) -> Result<CodePointTrie<'trie, T>, Error> {
158127
// Validation invariants are not needed here when constructing a new
159128
// `CodePointTrie` because:
160129
//
@@ -167,7 +136,7 @@ impl<'trie, W: ValueWidth> CodePointTrie<'trie, W> {
167136
// - The `ZeroVec` serializer stores the length of the array along with the
168137
// ZeroVec data, meaning that a deserializer would also see that length info.
169138

170-
let trie: CodePointTrie<'trie, W> = CodePointTrie {
139+
let trie: CodePointTrie<'trie, T> = CodePointTrie {
171140
header,
172141
index,
173142
data,
@@ -183,7 +152,7 @@ impl<'trie, W: ValueWidth> CodePointTrie<'trie, W> {
183152

184153
fn internal_small_index(&self, code_point: u32) -> u32 {
185154
let mut index1_pos: u32 = code_point >> SHIFT_1;
186-
if self.header.trie_type == TrieTypeEnum::Fast {
155+
if self.header.trie_type == TrieType::Fast {
187156
debug_assert!(
188157
FAST_TYPE_FAST_INDEXING_MAX < code_point && code_point < self.header.high_start
189158
);
@@ -290,14 +259,14 @@ impl<'trie, W: ValueWidth> CodePointTrie<'trie, W> {
290259
/// assert_eq!(0, trie.get(0x13E0)); // 'Ꮰ' as u32
291260
/// assert_eq!(1, trie.get(0x10044)); // '𐁄' as u32
292261
/// ```
293-
pub fn get(&self, code_point: u32) -> W {
262+
pub fn get(&self, code_point: u32) -> T {
294263
// All code points up to the fast max limit are represented
295264
// individually in the `index` array to hold their `data` array position, and
296265
// thus only need 2 lookups for a [CodePointTrie::get()](`crate::codepointtrie::CodePointTrie::get`).
297266
// Code points above the "fast max" limit require 4 lookups.
298267
let fast_max = match self.header.trie_type {
299-
TrieTypeEnum::Fast => FAST_TYPE_FAST_INDEXING_MAX,
300-
TrieTypeEnum::Small => SMALL_TYPE_FAST_INDEXING_MAX,
268+
TrieType::Fast => FAST_TYPE_FAST_INDEXING_MAX,
269+
TrieType::Small => SMALL_TYPE_FAST_INDEXING_MAX,
301270
};
302271
let data_pos: u32 = if code_point <= fast_max {
303272
Self::fast_index(self, code_point)
@@ -308,12 +277,14 @@ impl<'trie, W: ValueWidth> CodePointTrie<'trie, W> {
308277
};
309278
// Returns the trie value (or trie's error value).
310279
// If we cannot read from the data array, then return the associated constant
311-
// DATA_GET_ERROR_VALUE for the instance type for W: ValueWidth.
280+
// DATA_GET_ERROR_VALUE for the instance type for T: TrieValue.
312281
self.data
313282
.get(data_pos as usize)
314-
.unwrap_or(W::DATA_GET_ERROR_VALUE)
283+
.unwrap_or(T::DATA_GET_ERROR_VALUE)
315284
}
285+
}
316286

287+
impl<'trie, T: TrieValue + Into<u32>> CodePointTrie<'trie, T> {
317288
/// Returns the value that is associated with `code_point` for this [`CodePointTrie`]
318289
/// as a `u32`.
319290
///
@@ -333,7 +304,20 @@ impl<'trie, W: ValueWidth> CodePointTrie<'trie, W> {
333304
// Note: This API method maintains consistency with the corresponding
334305
// original ICU APIs.
335306
pub fn get_u32(&self, code_point: u32) -> u32 {
336-
self.get(code_point).cast_to_widest()
307+
self.get(code_point).into()
308+
}
309+
}
310+
311+
impl<'trie, T: TrieValue> Clone for CodePointTrie<'trie, T>
312+
where
313+
<T as zerovec::ule::AsULE>::ULE: Clone,
314+
{
315+
fn clone(&self) -> Self {
316+
CodePointTrie {
317+
header: self.header,
318+
index: self.index.clone(),
319+
data: self.data.clone(),
320+
}
337321
}
338322
}
339323

utils/codepointtrie/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,3 +39,4 @@ pub mod codepointtrie;
3939
pub mod error;
4040
mod impl_const;
4141
pub mod planes;
42+
pub mod provider;

utils/codepointtrie/src/planes.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ pub fn get_planes_trie() -> CodePointTrie<'static, u8> {
176176
let index3_null_offset = 0x2;
177177
let data_null_offset = 0x0;
178178
let null_value = 0x0;
179-
let trie_type = TrieTypeEnum::Small;
179+
let trie_type = TrieType::Small;
180180

181181
let trie_header = CodePointTrieHeader {
182182
high_start,

utils/codepointtrie/src/provider.rs

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
// This file is part of ICU4X. For terms of use, please see the file
2+
// called LICENSE at the top level of the ICU4X source tree
3+
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4+
5+
//! Data provider struct definitions for this ICU4X component.
6+
//!
7+
//! Read more about data providers: [`icu_provider`]
8+
9+
use crate::codepointtrie::{CodePointTrie, TrieValue};
10+
use icu_provider::yoke::{self, Yokeable, ZeroCopyFrom};
11+
12+
/// A map efficiently storing data about individual characters.
13+
#[derive(Debug, Eq, PartialEq, Yokeable, ZeroCopyFrom)]
14+
#[cfg_attr(
15+
feature = "provider_serde",
16+
derive(serde::Serialize, serde::Deserialize)
17+
)]
18+
pub struct UnicodePropertyMapV1<'data, T: TrieValue> {
19+
/// A codepoint trie storing the data
20+
#[cfg_attr(feature = "provider_serde", serde(borrow))]
21+
pub codepoint_trie: CodePointTrie<'data, T>,
22+
}
23+
24+
impl<'data, T: TrieValue> Clone for UnicodePropertyMapV1<'data, T>
25+
where
26+
<T as zerovec::ule::AsULE>::ULE: Clone,
27+
{
28+
fn clone(&self) -> Self {
29+
UnicodePropertyMapV1 {
30+
codepoint_trie: self.codepoint_trie.clone(),
31+
}
32+
}
33+
}
34+
35+
/// Marker type for UnicodePropertyMapV1.
36+
/// This is generated by hand because icu_provider::data_struct doesn't support generics yet.
37+
pub struct UnicodePropertyMapV1Marker<T: TrieValue> {
38+
_phantom: core::marker::PhantomData<T>,
39+
}
40+
41+
impl<'data, T: TrieValue> icu_provider::DataMarker<'data> for UnicodePropertyMapV1Marker<T> {
42+
type Yokeable = UnicodePropertyMapV1<'static, T>;
43+
type Cart = UnicodePropertyMapV1<'data, T>;
44+
}

utils/codepointtrie/tests/planes_test.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ fn planes_trie_deserialize_check_test() {
4141

4242
let code_point_trie_struct = planes_enum_prop.code_point_trie.trie_struct;
4343

44-
let trie_type_enum = match TrieTypeEnum::try_from(code_point_trie_struct.trie_type_enum_val) {
44+
let trie_type_enum = match TrieType::try_from(code_point_trie_struct.trie_type_enum_val) {
4545
Ok(enum_val) => enum_val,
4646
_ => {
4747
panic!(

utils/codepointtrie/tests/test_util.rs

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,24 @@ use icu_codepointtrie::codepointtrie::*;
66
use icu_codepointtrie::error::Error;
77

88
use core::convert::TryFrom;
9+
#[cfg(feature = "serde")]
10+
use serde::{Deserialize, Serialize};
911
use std::fs::File;
1012
use std::io::Read;
1113
use std::path::Path;
1214
use zerovec::ZeroVec;
1315

14-
pub fn check_trie<W: ValueWidth>(trie: &CodePointTrie<W>, check_ranges: &[u32]) {
16+
/// The width of the elements in the data array of a [`CodePointTrie`].
17+
/// See [`UCPTrieValueWidth`](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/ucptrie_8h.html) in ICU4C.
18+
#[derive(Clone, Copy, PartialEq)]
19+
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
20+
pub enum ValueWidthEnum {
21+
Bits16 = 0,
22+
Bits32 = 1,
23+
Bits8 = 2,
24+
}
25+
26+
pub fn check_trie<T: TrieValue + Into<u32>>(trie: &CodePointTrie<T>, check_ranges: &[u32]) {
1527
assert_eq!(
1628
0,
1729
check_ranges.len() % 2,
@@ -152,7 +164,7 @@ pub fn run_deserialize_test_from_test_data(test_file_path: &str) {
152164
test_struct.name
153165
);
154166

155-
let trie_type_enum = match TrieTypeEnum::try_from(test_struct.trie_type_enum_val) {
167+
let trie_type_enum = match TrieType::try_from(test_struct.trie_type_enum_val) {
156168
Ok(enum_val) => enum_val,
157169
_ => {
158170
panic!(

0 commit comments

Comments
 (0)