Skip to content

Commit 98a7337

Browse files
committed
syntax/unicode: lightly refactor Perl Unicode class handling
This nominally moves the logic for acquiring Unicode-aware Perl character classes into the `unicode` module, and also makes the calling code robust with respect to failures. This commit is prep work for making the availability of Unicode-aware Perl classes optional.
1 parent 5204ee4 commit 98a7337

File tree

2 files changed

+86
-96
lines changed

2 files changed

+86
-96
lines changed

regex-syntax/src/hir/translate.rs

+60-95
Original file line numberDiff line numberDiff line change
@@ -297,7 +297,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
297297
}
298298
Ast::Class(ast::Class::Perl(ref x)) => {
299299
if self.flags().unicode() {
300-
let cls = self.hir_perl_unicode_class(x);
300+
let cls = self.hir_perl_unicode_class(x)?;
301301
let hcls = hir::Class::Unicode(cls);
302302
self.push(HirFrame::Expr(Hir::class(hcls)));
303303
} else {
@@ -450,7 +450,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
450450
}
451451
ast::ClassSetItem::Perl(ref x) => {
452452
if self.flags().unicode() {
453-
let xcls = self.hir_perl_unicode_class(x);
453+
let xcls = self.hir_perl_unicode_class(x)?;
454454
let mut cls = self.pop().unwrap().unwrap_class_unicode();
455455
cls.union(&xcls);
456456
self.push(HirFrame::ClassUnicode(cls));
@@ -800,47 +800,36 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
800800
property_value: value,
801801
},
802802
};
803-
match unicode::class(query) {
804-
Ok(mut class) => {
805-
self.unicode_fold_and_negate(ast_class.negated, &mut class);
806-
Ok(class)
807-
}
808-
Err(unicode::Error::PropertyNotFound) => {
809-
Err(self
810-
.error(ast_class.span, ErrorKind::UnicodePropertyNotFound))
811-
}
812-
Err(unicode::Error::PropertyValueNotFound) => Err(self.error(
813-
ast_class.span,
814-
ErrorKind::UnicodePropertyValueNotFound,
815-
)),
803+
let mut result = self.convert_unicode_class_error(
804+
&ast_class.span,
805+
unicode::class(query),
806+
);
807+
if let Ok(ref mut class) = result {
808+
self.unicode_fold_and_negate(ast_class.negated, class);
816809
}
810+
result
817811
}
818812

819813
fn hir_perl_unicode_class(
820814
&self,
821815
ast_class: &ast::ClassPerl,
822-
) -> hir::ClassUnicode {
816+
) -> Result<hir::ClassUnicode> {
823817
use ast::ClassPerlKind::*;
824-
use unicode_tables::perl_word::PERL_WORD;
825818

826819
assert!(self.flags().unicode());
827-
let mut class = match ast_class.kind {
828-
Digit => {
829-
let query = ClassQuery::Binary("Decimal_Number");
830-
unicode::class(query).unwrap()
831-
}
832-
Space => {
833-
let query = ClassQuery::Binary("Whitespace");
834-
unicode::class(query).unwrap()
835-
}
836-
Word => unicode::hir_class(PERL_WORD),
820+
let result = match ast_class.kind {
821+
Digit => unicode::perl_digit(),
822+
Space => unicode::perl_space(),
823+
Word => unicode::perl_word(),
837824
};
825+
let mut class =
826+
self.convert_unicode_class_error(&ast_class.span, result)?;
838827
// We needn't apply case folding here because the Perl Unicode classes
839828
// are already closed under Unicode simple case folding.
840829
if ast_class.negated {
841830
class.negate();
842831
}
843-
class
832+
Ok(class)
844833
}
845834

846835
fn hir_perl_byte_class(
@@ -863,6 +852,28 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
863852
class
864853
}
865854

855+
/// Converts the error in the given Unicode-specific result to an HIR translation error.
856+
///
857+
/// The span given should approximate the position at which an error would
858+
/// occur.
859+
fn convert_unicode_class_error(
860+
&self,
861+
span: &Span,
862+
result: unicode::Result<hir::ClassUnicode>,
863+
) -> Result<hir::ClassUnicode> {
864+
result.map_err(|err| {
865+
let sp = span.clone();
866+
match err {
867+
unicode::Error::PropertyNotFound => {
868+
self.error(sp, ErrorKind::UnicodePropertyNotFound)
869+
}
870+
unicode::Error::PropertyValueNotFound => {
871+
self.error(sp, ErrorKind::UnicodePropertyValueNotFound)
872+
}
873+
}
874+
})
875+
}
876+
866877
fn unicode_fold_and_negate(
867878
&self,
868879
negated: bool,
@@ -1017,74 +1028,28 @@ fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes {
10171028

10181029
fn ascii_class(kind: &ast::ClassAsciiKind) -> &'static [(char, char)] {
10191030
use ast::ClassAsciiKind::*;
1020-
1021-
// The contortions below with `const` appear necessary for older versions
1022-
// of Rust.
1023-
type T = &'static [(char, char)];
10241031
match *kind {
1025-
Alnum => {
1026-
const X: T = &[('0', '9'), ('A', 'Z'), ('a', 'z')];
1027-
X
1028-
}
1029-
Alpha => {
1030-
const X: T = &[('A', 'Z'), ('a', 'z')];
1031-
X
1032-
}
1033-
Ascii => {
1034-
const X: T = &[('\x00', '\x7F')];
1035-
X
1036-
}
1037-
Blank => {
1038-
const X: T = &[('\t', '\t'), (' ', ' ')];
1039-
X
1040-
}
1041-
Cntrl => {
1042-
const X: T = &[('\x00', '\x1F'), ('\x7F', '\x7F')];
1043-
X
1044-
}
1045-
Digit => {
1046-
const X: T = &[('0', '9')];
1047-
X
1048-
}
1049-
Graph => {
1050-
const X: T = &[('!', '~')];
1051-
X
1052-
}
1053-
Lower => {
1054-
const X: T = &[('a', 'z')];
1055-
X
1056-
}
1057-
Print => {
1058-
const X: T = &[(' ', '~')];
1059-
X
1060-
}
1061-
Punct => {
1062-
const X: T = &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')];
1063-
X
1064-
}
1065-
Space => {
1066-
const X: T = &[
1067-
('\t', '\t'),
1068-
('\n', '\n'),
1069-
('\x0B', '\x0B'),
1070-
('\x0C', '\x0C'),
1071-
('\r', '\r'),
1072-
(' ', ' '),
1073-
];
1074-
X
1075-
}
1076-
Upper => {
1077-
const X: T = &[('A', 'Z')];
1078-
X
1079-
}
1080-
Word => {
1081-
const X: T = &[('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z')];
1082-
X
1083-
}
1084-
Xdigit => {
1085-
const X: T = &[('0', '9'), ('A', 'F'), ('a', 'f')];
1086-
X
1087-
}
1032+
Alnum => &[('0', '9'), ('A', 'Z'), ('a', 'z')],
1033+
Alpha => &[('A', 'Z'), ('a', 'z')],
1034+
Ascii => &[('\x00', '\x7F')],
1035+
Blank => &[('\t', '\t'), (' ', ' ')],
1036+
Cntrl => &[('\x00', '\x1F'), ('\x7F', '\x7F')],
1037+
Digit => &[('0', '9')],
1038+
Graph => &[('!', '~')],
1039+
Lower => &[('a', 'z')],
1040+
Print => &[(' ', '~')],
1041+
Punct => &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')],
1042+
Space => &[
1043+
('\t', '\t'),
1044+
('\n', '\n'),
1045+
('\x0B', '\x0B'),
1046+
('\x0C', '\x0C'),
1047+
('\r', '\r'),
1048+
(' ', ' '),
1049+
],
1050+
Upper => &[('A', 'Z')],
1051+
Word => &[('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z')],
1052+
Xdigit => &[('0', '9'), ('A', 'F'), ('a', 'f')],
10881053
}
10891054
}
10901055

regex-syntax/src/unicode.rs

+26-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ use unicode_tables::script_extension;
1414
use unicode_tables::sentence_break;
1515
use unicode_tables::word_break;
1616

17-
type Result<T> = result::Result<T, Error>;
17+
/// A type alias for results with errors specific to Unicode class handling.
18+
pub type Result<T> = result::Result<T, Error>;
1819

1920
/// An error that occurs when dealing with Unicode.
2021
///
@@ -265,6 +266,30 @@ pub fn class<'a>(query: ClassQuery<'a>) -> Result<hir::ClassUnicode> {
265266
}
266267
}
267268

269+
/// Returns a Unicode-aware class for \w.
270+
///
271+
/// This returns an error if the data is not available for \w.
272+
pub fn perl_word() -> Result<hir::ClassUnicode> {
273+
use unicode_tables::perl_word::PERL_WORD;
274+
Ok(hir_class(PERL_WORD))
275+
}
276+
277+
/// Returns a Unicode-aware class for \s.
278+
///
279+
/// This returns an error if the data is not available for \s.
280+
pub fn perl_space() -> Result<hir::ClassUnicode> {
281+
let query = ClassQuery::Binary("Whitespace");
282+
class(query)
283+
}
284+
285+
/// Returns a Unicode-aware class for \d.
286+
///
287+
/// This returns an error if the data is not available for \d.
288+
pub fn perl_digit() -> Result<hir::ClassUnicode> {
289+
let query = ClassQuery::Binary("Decimal_Number");
290+
class(query)
291+
}
292+
268293
/// Build a Unicode HIR class from a sequence of Unicode scalar value ranges.
269294
pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode {
270295
let hir_ranges: Vec<hir::ClassUnicodeRange> = ranges

0 commit comments

Comments
 (0)