
Commit 366417d

Merge #218

218: Add UniDic implementation r=ManyTheFish a=mosuka

# Pull Request

## What does this PR do?

- Add a UniDic implementation to allow consistent tokenization for searching and indexing.
- Please see the [discussion comment](https://github.com/meilisearch/product/discussions/532#discussioncomment-5895057).

## PR checklist

Please check if your PR fulfills the following requirements:

- [ ] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [x] Have you read the contributing guidelines?
- [x] Have you made sure that the title is accurate and descriptive of the changes?

Thank you so much for contributing to Meilisearch!

Co-authored-by: Minoru Osuka <[email protected]>
2 parents 91e368a + 2b10d0c commit 366417d

2 files changed, +105 -39 lines changed

charabia/Cargo.toml

Lines changed: 6 additions & 4 deletions
```diff
@@ -23,9 +23,9 @@ serde = "1.0"
 slice-group-by = "0.3.0"
 unicode-segmentation = "1.10.1"
 whatlang = "0.16.2"
-lindera-core = "=0.24.0"
-lindera-dictionary = "=0.24.0"
-lindera-tokenizer = { version = "=0.24.0", default-features = false, optional = true }
+lindera-core = "=0.25.0"
+lindera-dictionary = "=0.25.0"
+lindera-tokenizer = { version = "=0.25.0", default-features = false, optional = true }
 pinyin = { version = "0.9", default-features = false, features = [
     "with_tone",
 ], optional = true }
@@ -43,7 +43,9 @@ chinese = ["dep:pinyin", "dep:jieba-rs"]
 hebrew = []
 
 # allow japanese specialized tokenization
-japanese = ["lindera-tokenizer/ipadic", "lindera-tokenizer/ipadic-compress"]
+japanese = ["japanese-segmentation-unidic"]
+japanese-segmentation-ipadic = ["lindera-tokenizer/ipadic", "lindera-tokenizer/ipadic-compress"]
+japanese-segmentation-unidic = ["lindera-tokenizer/unidic", "lindera-tokenizer/unidic-compress"]
 japanese-transliteration = ["dep:wana_kana"]
 
 # allow korean specialized tokenization
```

charabia/src/segmenter/japanese.rs

Lines changed: 99 additions & 35 deletions
```diff
@@ -1,4 +1,6 @@
-use lindera_core::mode::{Mode, Penalty};
+use lindera_core::mode::Mode;
+#[cfg(feature = "japanese-segmentation-ipadic")]
+use lindera_core::mode::Penalty;
 use lindera_dictionary::{DictionaryConfig, DictionaryKind};
 use lindera_tokenizer::tokenizer::{Tokenizer, TokenizerConfig};
 use once_cell::sync::Lazy;
@@ -11,11 +13,21 @@ use crate::segmenter::Segmenter;
 pub struct JapaneseSegmenter;
 
 static LINDERA: Lazy<Tokenizer> = Lazy::new(|| {
+    #[cfg(all(feature = "japanese-segmentation-ipadic", feature = "japanese-segmentation-unidic"))]
+    compile_error!("Feature japanese-segmentation-ipadic and japanese-segmentation-unidic are mutually exclusive and cannot be enabled together");
+
+    #[cfg(feature = "japanese-segmentation-ipadic")]
     let config = TokenizerConfig {
         dictionary: DictionaryConfig { kind: Some(DictionaryKind::IPADIC), path: None },
         mode: Mode::Decompose(Penalty::default()),
         ..TokenizerConfig::default()
     };
+    #[cfg(feature = "japanese-segmentation-unidic")]
+    let config = TokenizerConfig {
+        dictionary: DictionaryConfig { kind: Some(DictionaryKind::UniDic), path: None },
+        mode: Mode::Normal,
+        ..TokenizerConfig::default()
+    };
     Tokenizer::from_config(config).unwrap()
 });
 
```
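The `compile_error!` guard is the key move here: Cargo features are additive, so nothing stops a downstream crate from enabling both dictionary features at once, and the guard turns that misconfiguration into a build failure instead of a silently ambiguous `config`. The `#[cfg]` attributes then leave exactly one `let config = ...` binding in the expanded source. A minimal sketch of the same pattern (not charabia's code), using hypothetical feature names `dict-a` and `dict-b` in place of the real ones:

```rust
// Minimal sketch of mutually exclusive Cargo features. `dict-a` and
// `dict-b` are hypothetical stand-ins for `japanese-segmentation-ipadic`
// and `japanese-segmentation-unidic`.

// Enabling both features aborts the build with a clear message.
#[cfg(all(feature = "dict-a", feature = "dict-b"))]
compile_error!("features `dict-a` and `dict-b` cannot be enabled together");

// cfg-stripping keeps at most one of these definitions, so `DICTIONARY`
// is defined exactly once in any build that passes the guard.
#[cfg(feature = "dict-a")]
const DICTIONARY: &str = "dict-a";
#[cfg(feature = "dict-b")]
const DICTIONARY: &str = "dict-b";
#[cfg(not(any(feature = "dict-a", feature = "dict-b")))]
const DICTIONARY: &str = "(no dictionary feature enabled)";

fn main() {
    println!("active dictionary: {DICTIONARY}");
}
```

The remaining hunk updates the tests so each dictionary has its own expected segmentation: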

```diff
@@ -32,41 +44,93 @@ mod test {
 
     const TEXT: &str = "関西国際空港限定トートバッグ すもももももももものうち";
 
-    const SEGMENTED: &[&str] = &[
-        "関西",
-        "国際",
-        "空港",
-        "限定",
-        "トートバッグ",
-        " ",
-        "すもも",
-        "も",
-        "もも",
-        "も",
-        "もも",
-        "の",
-        "うち",
-    ];
+    const SEGMENTED: &[&str] = if cfg!(feature = "japanese-segmentation-ipadic") {
+        &[
+            "関西",
+            "国際",
+            "空港",
+            "限定",
+            "トートバッグ",
+            " ",
+            "すもも",
+            "も",
+            "もも",
+            "も",
+            "もも",
+            "の",
+            "うち",
+        ]
+    } else if cfg!(feature = "japanese-segmentation-unidic") {
+        &[
+            "関西",
+            "国際",
+            "空港",
+            "限定",
+            "トート",
+            "バッグ",
+            " ",
+            "すもも",
+            "も",
+            "もも",
+            "も",
+            "もも",
+            "の",
+            "うち",
+        ]
+    } else {
+        &[]
+    };
+
+    const TOKENIZED: &[&str] = if cfg!(feature = "japanese-segmentation-ipadic") {
+        &[
+            "関西",
+            "国際",
+            "空港",
+            "限定",
+            // Use "とうとばっぐ" instead when feature "japanese-transliteration" is enabled or become default
+            #[cfg(feature = "japanese-transliteration")]
+            "とうとは\u{3099}っく\u{3099}",
+            #[cfg(not(feature = "japanese-transliteration"))]
+            "トートハ\u{3099}ック\u{3099}",
+            " ",
+            "すもも",
+            "も",
+            "もも",
+            "も",
+            "もも",
+            "の",
+            "うち",
+        ]
+    } else if cfg!(feature = "japanese-segmentation-unidic") {
+        &[
+            "関西",
+            "国際",
+            "空港",
+            "限定",
+            // Use "とうとばっぐ" instead when feature "japanese-transliteration" is enabled or become default
+            #[cfg(feature = "japanese-transliteration")]
+            "とうと",
+            #[cfg(not(feature = "japanese-transliteration"))]
+            "トート",
+            #[cfg(feature = "japanese-transliteration")]
+            "は\u{3099}っく\u{3099}",
+            #[cfg(not(feature = "japanese-transliteration"))]
+            "ハ\u{3099}ック\u{3099}",
+            " ",
+            "すもも",
+            "も",
+            "もも",
+            "も",
+            "もも",
+            "の",
+            "うち",
+        ]
+    } else {
+        &[]
+    };
 
-    const TOKENIZED: &[&str] = &[
-        "関西",
-        "国際",
-        "空港",
-        "限定",
-        // Use "とうとばっぐ" instead when feature "japanese-transliteration" is enabled or become default
-        #[cfg(feature = "japanese-transliteration")]
-        "とうとは\u{3099}っく\u{3099}",
-        #[cfg(not(feature = "japanese-transliteration"))]
-        "トートハ\u{3099}ック\u{3099}",
-        " ",
-        "すもも",
-        "も",
-        "もも",
-        "も",
-        "もも",
-        "の",
-        "うち",
-    ];
+    #[cfg(all(feature = "japanese-segmentation-ipadic", feature = "japanese-segmentation-unidic"))]
+    compile_error!("Feature japanese-segmentation-ipadic and japanese-segmentation-unidic are mutually exclusive and cannot be enabled together");
 
     // Macro that run several tests on the Segmenter.
     test_segmenter!(JapaneseSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Cj, Language::Jpn);
```
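Two details in this hunk are easy to miss. First, the expected-output constants branch on `if cfg!(feature = ...)` inside a `const` initializer; this works because `cfg!` expands to a literal `true` or `false` at compile time, and `if` on a constant condition is allowed in const evaluation. A minimal sketch, with a hypothetical feature name `demo`:

```rust
// `cfg!` expands to a boolean literal, so it can select a const value.
// The feature name `demo` is hypothetical, for illustration only.
const GREETING: &str =
    if cfg!(feature = "demo") { "demo build" } else { "regular build" };

fn main() {
    println!("{GREETING}");
}
```

Second, the arrays encode the user-visible effect of the dictionary switch: IPADIC keeps the loanword compound トートバッグ ("tote bag") as a single segment, while UniDic's shorter dictionary units split it into トート and バッグ. A hedged sketch of how that difference would surface through charabia's `Segment` trait, assuming the crate is built with the `japanese-segmentation-unidic` feature:

```rust
use charabia::Segment;

fn main() {
    let text = "関西国際空港限定トートバッグ";
    // segment_str yields the raw segments, without normalization.
    let segments: Vec<&str> = text.segment_str().collect();
    // Per the test above, UniDic ends with "トート", "バッグ";
    // IPADIC would keep them as the single segment "トートバッグ".
    println!("{segments:?}");
}
```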
