Skip to content

Commit 9a1cba6

Browse files
Add null byte as hard context separator
This allows one to use \0 as an artificial separator, for example when concatenating lots of small strings into one large string. See this discussion for context: https://github.com/orgs/meilisearch/discussions/744
1 parent c983b9f commit 9a1cba6

File tree

1 file changed

+3
-2
lines changed

1 file changed

+3
-2
lines changed

charabia/src/separators.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@
1111
/// - Zl Line Separator
1212
/// - Zp Paragraph Separator
1313
/// - Zs Space Separator
14-
/// plus ". ", ", " and ។ល។" (៘ decomposition) to categorize them as hard separators
14+
/// plus "\0", ". ", ", " and "។ល។" (៘ decomposition) to categorize them as hard separators
1515
/// and "`" to understand markdown formatted text
1616
#[rustfmt::skip]
1717
pub const DEFAULT_SEPARATORS: &[&str] = &[
18-
". ", ", ", "_", "‿", "⁀", "⁔", "︳", "︴", "﹍", "﹎", "﹏", "_", "-", "֊", "־", "᐀", "᠆", "‐", "‒", "–",
18+
"\0", ". ", ", ", "_", "‿", "⁀", "⁔", "︳", "︴", "﹍", "﹎", "﹏", "_", "-", "֊", "־", "᐀", "᠆", "‐", "‒", "–",
1919
"—", "―", "⸗", "⸚", "⸺", "⸻", "⹀", "〜", "〰", "゠", "︱", "︲", "﹘", "﹣", "-", "𐺭", ")",
2020
"]", "}", "༻", "༽", "᚜", "⁆", "⁾", "₎", "⌉", "⌋", "〉", "❩", "❫", "❭", "❯", "❱", "❳", "❵", "⟆",
2121
"⟧", "⟩", "⟫", "⟭", "⟯", "⦄", "⦆", "⦈", "⦊", "⦌", "⦎", "⦐", "⦒", "⦔", "⦖", "⦘", "⧙", "⧛", "⧽",
@@ -64,6 +64,7 @@ pub const DEFAULT_SEPARATORS: &[&str] = &[
6464

6565
#[rustfmt::skip]
6666
pub const CONTEXT_SEPARATORS: &[&str] = &[
67+
"\0", // Null byte, can be used as artificial separator
6768
"᠆", // Mongolian Todo Soft Hyphen, mark the end of a paragraph.
6869
"᚛", "᚜", // Oghams, mark start and end of text
6970
"!", ". ", ", ", ";", "?", "¡", "§", "¶", "¿", ";", // Latin

0 commit comments

Comments
 (0)