Skip to content

Commit 1e82d44

Browse files
committed
syntax: remove all uses of 'as'
It turns out that all uses of 'as' in the regex-syntax crate can be replaced with either explicitly infallible routines (like 'u32::from(char)'), or with routines that will panic on failure. These panics are strictly better than truncating casts that might otherwise lead to subtle bugs in the context of this crate. (Namely, we never really care about the perf effects here, since regex parsing is just never a bottleneck.)
1 parent 58dd511 commit 1e82d44

File tree

9 files changed

+181
-156
lines changed

9 files changed

+181
-156
lines changed

regex-syntax/src/ast/mod.rs

+6-5
Original file line numberDiff line numberDiff line change
@@ -615,11 +615,12 @@ impl Literal {
615615
/// If this literal was written as a `\x` hex escape, then this returns
616616
/// the corresponding byte value. Otherwise, this returns `None`.
617617
pub fn byte(&self) -> Option<u8> {
618-
let short_hex = LiteralKind::HexFixed(HexLiteralKind::X);
619-
if self.c as u32 <= 255 && self.kind == short_hex {
620-
Some(self.c as u8)
621-
} else {
622-
None
618+
match self.kind {
619+
LiteralKind::HexFixed(HexLiteralKind::X) => {
620+
// MSRV(1.59): Use 'u8::try_from(self.c)' instead.
621+
u8::try_from(u32::from(self.c)).ok()
622+
}
623+
_ => None,
623624
}
624625
}
625626
}

regex-syntax/src/ast/print.rs

+7-7
Original file line numberDiff line numberDiff line change
@@ -213,24 +213,24 @@ impl<W: fmt::Write> Writer<W> {
213213
match ast.kind {
214214
Verbatim => self.wtr.write_char(ast.c),
215215
Punctuation => write!(self.wtr, r"\{}", ast.c),
216-
Octal => write!(self.wtr, r"\{:o}", ast.c as u32),
216+
Octal => write!(self.wtr, r"\{:o}", u32::from(ast.c)),
217217
HexFixed(ast::HexLiteralKind::X) => {
218-
write!(self.wtr, r"\x{:02X}", ast.c as u32)
218+
write!(self.wtr, r"\x{:02X}", u32::from(ast.c))
219219
}
220220
HexFixed(ast::HexLiteralKind::UnicodeShort) => {
221-
write!(self.wtr, r"\u{:04X}", ast.c as u32)
221+
write!(self.wtr, r"\u{:04X}", u32::from(ast.c))
222222
}
223223
HexFixed(ast::HexLiteralKind::UnicodeLong) => {
224-
write!(self.wtr, r"\U{:08X}", ast.c as u32)
224+
write!(self.wtr, r"\U{:08X}", u32::from(ast.c))
225225
}
226226
HexBrace(ast::HexLiteralKind::X) => {
227-
write!(self.wtr, r"\x{{{:X}}}", ast.c as u32)
227+
write!(self.wtr, r"\x{{{:X}}}", u32::from(ast.c))
228228
}
229229
HexBrace(ast::HexLiteralKind::UnicodeShort) => {
230-
write!(self.wtr, r"\u{{{:X}}}", ast.c as u32)
230+
write!(self.wtr, r"\u{{{:X}}}", u32::from(ast.c))
231231
}
232232
HexBrace(ast::HexLiteralKind::UnicodeLong) => {
233-
write!(self.wtr, r"\U{{{:X}}}", ast.c as u32)
233+
write!(self.wtr, r"\U{{{:X}}}", u32::from(ast.c))
234234
}
235235
Special(ast::SpecialLiteralKind::Bell) => {
236236
self.wtr.write_str(r"\a")

regex-syntax/src/hir/interval.rs

+4-4
Original file line numberDiff line numberDiff line change
@@ -481,7 +481,7 @@ impl Bound for u8 {
481481
u8::MAX
482482
}
483483
fn as_u32(self) -> u32 {
484-
self as u32
484+
u32::from(self)
485485
}
486486
fn increment(self) -> Self {
487487
self.checked_add(1).unwrap()
@@ -499,20 +499,20 @@ impl Bound for char {
499499
'\u{10FFFF}'
500500
}
501501
fn as_u32(self) -> u32 {
502-
self as u32
502+
u32::from(self)
503503
}
504504

505505
fn increment(self) -> Self {
506506
match self {
507507
'\u{D7FF}' => '\u{E000}',
508-
c => char::from_u32((c as u32).checked_add(1).unwrap()).unwrap(),
508+
c => char::from_u32(u32::from(c).checked_add(1).unwrap()).unwrap(),
509509
}
510510
}
511511

512512
fn decrement(self) -> Self {
513513
match self {
514514
'\u{E000}' => '\u{D7FF}',
515-
c => char::from_u32((c as u32).checked_sub(1).unwrap()).unwrap(),
515+
c => char::from_u32(u32::from(c).checked_sub(1).unwrap()).unwrap(),
516516
}
517517
}
518518
}

regex-syntax/src/hir/literal/mod.rs

+18-17
Original file line numberDiff line numberDiff line change
@@ -475,8 +475,8 @@ impl Literals {
475475
base = vec![Literal::empty()];
476476
}
477477
for r in cls.iter() {
478-
let (s, e) = (r.start as u32, r.end as u32 + 1);
479-
for c in (s..e).filter_map(char::from_u32) {
478+
let (s, e) = (u32::from(r.start), u32::from(r.end));
479+
for c in (s..=e).filter_map(char::from_u32) {
480480
for mut lit in base.clone() {
481481
let mut bytes = c.to_string().into_bytes();
482482
if reverse {
@@ -502,8 +502,7 @@ impl Literals {
502502
base = vec![Literal::empty()];
503503
}
504504
for r in cls.iter() {
505-
let (s, e) = (r.start as u32, r.end as u32 + 1);
506-
for b in (s..e).map(|b| b as u8) {
505+
for b in r.start..=r.end {
507506
for mut lit in base.clone() {
508507
lit.push(b);
509508
self.lits.push(lit);
@@ -784,7 +783,10 @@ fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
784783
lits: &mut Literals,
785784
mut f: F,
786785
) {
787-
if min == 0 {
786+
// If 'min' somehow overflows usize, then we just treat it as 0, which is
787+
// the most conservative thing we can do.
788+
let umin = usize::try_from(min).unwrap_or(0);
789+
if umin == 0 {
788790
// This is a bit conservative. If `max` is set, then we could
789791
// treat this as a finite set of alternations. For now, we
790792
// just treat it as `e*`.
@@ -797,11 +799,11 @@ fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
797799
lits,
798800
);
799801
} else {
800-
if min > 0 {
801-
let n = cmp::min(lits.limit_size, min as usize);
802+
if umin > 0 {
803+
let n = cmp::min(lits.limit_size, umin);
802804
let es = iter::repeat(e.clone()).take(n).collect();
803805
f(&Hir::concat(es), lits);
804-
if n < min as usize || lits.contains_empty() {
806+
if n < umin || lits.contains_empty() {
805807
lits.cut();
806808
}
807809
}
@@ -928,12 +930,13 @@ fn escape_unicode(bytes: &[u8]) -> String {
928930
let mut space_escaped = String::new();
929931
for c in show.chars() {
930932
if c.is_whitespace() {
931-
let escaped = if c as u32 <= 0x7F {
932-
escape_byte(c as u8)
933-
} else if c as u32 <= 0xFFFF {
934-
format!(r"\u{{{:04x}}}", c as u32)
933+
let cp = u32::from(c);
934+
let escaped = if cp <= 0x7F {
935+
escape_byte(u8::try_from(cp).unwrap())
936+
} else if cp <= 0xFFFF {
937+
format!(r"\u{{{:04x}}}", cp)
935938
} else {
936-
format!(r"\U{{{:08x}}}", c as u32)
939+
format!(r"\U{{{:08x}}}", cp)
937940
};
938941
space_escaped.push_str(&escaped);
939942
} else {
@@ -959,13 +962,11 @@ fn escape_byte(byte: u8) -> String {
959962
}
960963

961964
fn cls_char_count(cls: &hir::ClassUnicode) -> usize {
962-
cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>()
963-
as usize
965+
cls.iter().map(|&r| r.len()).sum()
964966
}
965967

966968
fn cls_byte_count(cls: &hir::ClassBytes) -> usize {
967-
cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>()
968-
as usize
969+
cls.iter().map(|&r| r.len()).sum()
969970
}
970971

971972
#[cfg(test)]

regex-syntax/src/hir/mod.rs

+34-7
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,13 @@ pub enum ErrorKind {
9090
__Nonexhaustive,
9191
}
9292

93+
// BREADCRUMBS:
94+
//
95+
// Remove EmptyClassNotAllowed
96+
// Make errors non_exhaustive
97+
// Simplify repetitions (get rid of ZeroOrOne, OneOrMore etc)
98+
// Get rid of deprecated things
99+
93100
impl ErrorKind {
94101
// TODO: Remove this method entirely on the next breaking semver release.
95102
#[allow(deprecated)]
@@ -1013,12 +1020,12 @@ impl fmt::Debug for ClassUnicodeRange {
10131020
{
10141021
self.start.to_string()
10151022
} else {
1016-
format!("0x{:X}", self.start as u32)
1023+
format!("0x{:X}", u32::from(self.start))
10171024
};
10181025
let end = if !self.end.is_whitespace() && !self.end.is_control() {
10191026
self.end.to_string()
10201027
} else {
1021-
format!("0x{:X}", self.end as u32)
1028+
format!("0x{:X}", u32::from(self.end))
10221029
};
10231030
f.debug_struct("ClassUnicodeRange")
10241031
.field("start", &start)
@@ -1058,10 +1065,9 @@ impl Interval for ClassUnicodeRange {
10581065
if !unicode::contains_simple_case_mapping(self.start, self.end)? {
10591066
return Ok(());
10601067
}
1061-
let start = self.start as u32;
1062-
let end = (self.end as u32).saturating_add(1);
1068+
let (start, end) = (u32::from(self.start), u32::from(self.end));
10631069
let mut next_simple_cp = None;
1064-
for cp in (start..end).filter_map(char::from_u32) {
1070+
for cp in (start..=end).filter_map(char::from_u32) {
10651071
if next_simple_cp.map_or(false, |next| cp < next) {
10661072
continue;
10671073
}
@@ -1104,6 +1110,18 @@ impl ClassUnicodeRange {
11041110
pub fn end(&self) -> char {
11051111
self.end
11061112
}
1113+
1114+
/// Returns the number of codepoints in this range.
1115+
pub fn len(&self) -> usize {
1116+
let diff = 1 + u32::from(self.end) - u32::from(self.start);
1117+
// This is likely to panic in 16-bit targets since a usize can only fit
1118+
// 2^16. It's not clear what to do here, other than to return an error
1119+
// when building a Unicode class that contains a range whose length
1120+
// overflows usize. (Which, to be honest, is probably quite common on
1121+
// 16-bit targets. For example, this would imply that '.' and '\p{any}'
1122+
// would be impossible to build.)
1123+
usize::try_from(diff).expect("char class len fits in usize")
1124+
}
11071125
}
11081126

11091127
/// A set of characters represented by arbitrary bytes (where one byte
@@ -1291,18 +1309,27 @@ impl ClassBytesRange {
12911309
pub fn end(&self) -> u8 {
12921310
self.end
12931311
}
1312+
1313+
/// Returns the number of bytes in this range.
1314+
pub fn len(&self) -> usize {
1315+
usize::from(self.end.checked_sub(self.start).unwrap())
1316+
.checked_add(1)
1317+
.unwrap()
1318+
}
12941319
}
12951320

12961321
impl fmt::Debug for ClassBytesRange {
12971322
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
12981323
let mut debug = f.debug_struct("ClassBytesRange");
12991324
if self.start <= 0x7F {
1300-
debug.field("start", &(self.start as char));
1325+
let ch = char::try_from(self.start).unwrap();
1326+
debug.field("start", &ch);
13011327
} else {
13021328
debug.field("start", &self.start);
13031329
}
13041330
if self.end <= 0x7F {
1305-
debug.field("end", &(self.end as char));
1331+
let ch = char::try_from(self.start).unwrap();
1332+
debug.field("end", &ch);
13061333
} else {
13071334
debug.field("end", &self.end);
13081335
}

regex-syntax/src/hir/print.rs

+4-6
Original file line numberDiff line numberDiff line change
@@ -217,18 +217,16 @@ impl<W: fmt::Write> Writer<W> {
217217
}
218218

219219
fn write_literal_byte(&mut self, b: u8) -> fmt::Result {
220-
let c = b as char;
221-
if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() {
222-
self.write_literal_char(c)
220+
if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() {
221+
self.write_literal_char(char::try_from(b).unwrap())
223222
} else {
224223
write!(self.wtr, "(?-u:\\x{:02X})", b)
225224
}
226225
}
227226

228227
fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result {
229-
let c = b as char;
230-
if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() {
231-
self.write_literal_char(c)
228+
if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() {
229+
self.write_literal_char(char::try_from(b).unwrap())
232230
} else {
233231
write!(self.wtr, "\\x{:02X}", b)
234232
}

0 commit comments

Comments
 (0)