Skip to content

Commit 4cadf24

Browse files
Merge #295
295: Add null byte as hard context separator r=ManyTheFish a=LukasKalbertodt # Pull Request ## Related issue Fixes https://github.com/orgs/meilisearch/discussions/744 ## What does this PR do? Adds `\0` as context separator. This allows one to use `\0` as an artificial separator, for example when concatenating lots of small strings into a large string. See this discussion for context: https://github.com/orgs/meilisearch/discussions/744 ## PR checklist Please check if your PR fulfills the following requirements: - [ ] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)? - [ ] Have you read the contributing guidelines? - [ ] Have you made sure that the title is accurate and descriptive of the changes? Thank you so much for contributing to Meilisearch! Co-authored-by: Lukas Kalbertodt <[email protected]>
2 parents d5e1b43 + 9a1cba6 commit 4cadf24

File tree

1 file changed

+3
-2
lines changed

1 file changed

+3
-2
lines changed

charabia/src/separators.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@
1111
/// - Zl Line Separator
1212
/// - Zp Paragraph Separator
1313
/// - Zs Space Separator
14-
/// plus ". ", ", " and ។ល។" (៘ decomposition) to categorize them as hard separators
14+
/// plus "\0", ". ", ", " and "។ល។" (៘ decomposition) to categorize them as hard separators
1515
/// and "`" to understand markdown formatted text
1616
#[rustfmt::skip]
1717
pub const DEFAULT_SEPARATORS: &[&str] = &[
18-
". ", ", ", "_", "‿", "⁀", "⁔", "︳", "︴", "﹍", "﹎", "﹏", "_", "-", "֊", "־", "᐀", "᠆", "‐", "‒", "–",
18+
"\0", ". ", ", ", "_", "‿", "⁀", "⁔", "︳", "︴", "﹍", "﹎", "﹏", "_", "-", "֊", "־", "᐀", "᠆", "‐", "‒", "–",
1919
"—", "―", "⸗", "⸚", "⸺", "⸻", "⹀", "〜", "〰", "゠", "︱", "︲", "﹘", "﹣", "-", "𐺭", ")",
2020
"]", "}", "༻", "༽", "᚜", "⁆", "⁾", "₎", "⌉", "⌋", "〉", "❩", "❫", "❭", "❯", "❱", "❳", "❵", "⟆",
2121
"⟧", "⟩", "⟫", "⟭", "⟯", "⦄", "⦆", "⦈", "⦊", "⦌", "⦎", "⦐", "⦒", "⦔", "⦖", "⦘", "⧙", "⧛", "⧽",
@@ -64,6 +64,7 @@ pub const DEFAULT_SEPARATORS: &[&str] = &[
6464

6565
#[rustfmt::skip]
6666
pub const CONTEXT_SEPARATORS: &[&str] = &[
67+
"\0", // Null byte, can be used as artificial separator
6768
"᠆", // Mongolian Todo Soft Hyphen, mark the end of a paragraph.
6869
"᚛", "᚜", // Oghams, mark start and end of text
6970
"!", ". ", ", ", ";", "?", "¡", "§", "¶", "¿", ";", // Latin

0 commit comments

Comments
 (0)