From b180d67c2c79b6362c037603f27bfebb86922d0a Mon Sep 17 00:00:00 2001
From: Michael Ilseman
Date: Tue, 22 Apr 2025 12:37:07 -0600
Subject: [PATCH 1/2] Have the parser reject quant bounds over UInt16.max

---
 .../Regex/Parse/LexicalAnalysis.swift         | 34 ++++++++++++++++---
 Tests/RegexTests/LexTests.swift               | 19 +++++++++++
 Tests/RegexTests/MatchTests.swift             |  6 ++++
 3 files changed, 54 insertions(+), 5 deletions(-)

diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
index b38a07e12..80a5dcb0e 100644
--- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
+++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
@@ -331,7 +331,9 @@ extension Parser {
   ///
   /// Diagnoses on overflow
   ///
-  mutating func lexNumber(_ kind: RadixKind = .decimal) -> AST.Atom.Number? {
+  mutating func lexNumber(
+    _ kind: RadixKind = .decimal
+  ) -> AST.Atom.Number? {
     guard let str = tryEatPrefix(kind.characterFilter) else {
       return nil
     }
@@ -342,6 +344,26 @@ extension Parser {
     return .init(i, at: str.location)
   }
 
+  /// Try to eat a quantification bound, such as appears in `/x{3,12}`
+  ///
+  /// Returns: `nil` if there's no number, otherwise the number
+  ///
+  /// Diagnoses on overflow. Currenlty, we will diagnose for any values over `UInt16.max`
+  ///
+  mutating func lexQuantBound() -> AST.Atom.Number? {
+    let kind = RadixKind.decimal
+    guard let str = tryEatPrefix(kind.characterFilter) else {
+      return nil
+    }
+    guard let i = UInt16(str.value, radix: kind.radix) else {
+      error(.numberOverflow(str.value), at: str.location)
+      return .init(nil, at: str.location)
+    }
+
+    return .init(Int(i), at: str.location)
+  }
+
+
   /// Expect a number of a given `kind`, diagnosing if a number cannot be
   /// parsed.
   mutating func expectNumber(_ kind: RadixKind = .decimal) -> AST.Atom.Number {
@@ -492,7 +514,7 @@ extension Parser {
 
       return p.tryEating { p in
         guard p.tryEat("{"),
-              let range = p.lexRange(trivia: &trivia),
+              let range = p.lexQuantRange(trivia: &trivia),
               p.tryEat("}")
         else { return nil }
         return range.value
@@ -519,12 +541,14 @@ extension Parser {
   ///                  | ExpRange
   ///     ExpRange    -> '..<' | '...'
   ///                  | '..<' | '...' ?
-  mutating func lexRange(trivia: inout [AST.Trivia]) -> Located? {
+  mutating func lexQuantRange(
+    trivia: inout [AST.Trivia]
+  ) -> Located? {
     recordLoc { p in
       p.tryEating { p in
         if let t = p.lexWhitespace() { trivia.append(t) }
 
-        let lowerOpt = p.lexNumber()
+        let lowerOpt = p.lexQuantBound()
 
         if let t = p.lexWhitespace() { trivia.append(t) }
 
@@ -546,7 +570,7 @@ extension Parser {
 
         if let t = p.lexWhitespace() { trivia.append(t) }
 
-        var upperOpt = p.lexNumber()
+        var upperOpt = p.lexQuantBound()
         if closedRange == false {
           // If we have an open range, the upper bound should be adjusted down.
           upperOpt?.value? -= 1
diff --git a/Tests/RegexTests/LexTests.swift b/Tests/RegexTests/LexTests.swift
index 53775e66e..ccfd18eb8 100644
--- a/Tests/RegexTests/LexTests.swift
+++ b/Tests/RegexTests/LexTests.swift
@@ -63,6 +63,25 @@ extension RegexTests {
       _ = p.lexNumber()
     }
 
+    let invalidQuantBounds: Array = [
+      "65536", // UInt16.max + 1
+      "2147483646", // Int32.max - 1
+      "9223372036854775806", // Int64.max - 1
+    ]
+
+    for invalidNum in invalidQuantBounds {
+      let regexes: Array = [
+        "x{\(invalidNum)}",
+        "x{1,\(invalidNum)}",
+        "x{\(invalidNum),1}",
+      ]
+      for regex in regexes {
+        diagnose(regex, expecting: .numberOverflow(invalidNum)) { p in
+          _ = p.parse()
+        }
+      }
+    }
+
     // TODO: want to dummy print out source ranges, etc, test that.
   }
 
diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
index c52560d66..017005e5b 100644
--- a/Tests/RegexTests/MatchTests.swift
+++ b/Tests/RegexTests/MatchTests.swift
@@ -751,6 +751,12 @@ extension RegexTests {
     firstMatchTest("(?U)a??", input: "a", match: "a")
     firstMatchTest("(?U)a??a", input: "aaa", match: "aa")
 
+    // Quantification syntax is somewhat dependent on the contents.
+    // In JS, PCRE2, Python, and some others, /x{-1}/ will be literally "x{-1}"
+    // Note that Java8 and Rust throw an (unhelpful) error
+    firstMatchTest("x{-1}", input: "x{-1}", match: "x{-1}")
+    firstMatchTest("x{-1}", input: "xax{-2}bx{-1}c", match: "x{-1}")
+
     // TODO: After captures, easier to test these
   }
 

From 4458dc5cc035c6b6aaf0f6bb003872030f18773b Mon Sep 17 00:00:00 2001
From: Michael Ilseman
Date: Tue, 22 Apr 2025 12:39:52 -0600
Subject: [PATCH 2/2] Update Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

---
 Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
index 80a5dcb0e..b9693d97a 100644
--- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
+++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
@@ -348,7 +348,7 @@ extension Parser {
   ///
   /// Returns: `nil` if there's no number, otherwise the number
   ///
-  /// Diagnoses on overflow. Currenlty, we will diagnose for any values over `UInt16.max`
+  /// Diagnoses on overflow. Currently, we will diagnose for any values over `UInt16.max`
   ///
   mutating func lexQuantBound() -> AST.Atom.Number? {
     let kind = RadixKind.decimal