Skip to content

Commit 570ffe1

Browse files
committed
Parser.String.regex
1 parent f52be32 commit 570ffe1

File tree

6 files changed

+108
-2
lines changed

6 files changed

+108
-2
lines changed

.github/workflows/ci.yml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,15 @@ jobs:
2626
.spago
2727
output
2828
29-
- name: Install dev dependencies
29+
- name: Install dependencies
3030
run: spago install
3131

3232
- name: Build source
3333
run: spago build --no-install --purs-args '--censor-lib --strict --censor-codes='UserDefinedWarning''
3434

35+
- name: Install dev dependencies
36+
run: spago -x spago-dev.dhall install
37+
3538
- name: Run tests
3639
run: spago -x spago-dev.dhall test --no-install
3740

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ Breaking changes:
88

99
New features:
1010

11+
Add `regex` parser.
12+
1113
Bugfixes:
1214

1315
Other improvements:

bower.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
"purescript-newtype": "^v4.0.0",
2727
"purescript-numbers": "^v8.0.0",
2828
"purescript-prelude": "^v5.0.1",
29+
"purescript-record": "^v3.0.0",
2930
"purescript-strings": "^v5.0.0",
3031
"purescript-tailrec": "^v5.0.1",
3132
"purescript-transformers": "^v5.1.0",

spago.dhall

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
, "newtype"
1515
, "numbers"
1616
, "prelude"
17+
, "record"
1718
, "strings"
1819
, "tailrec"
1920
, "transformers"

src/Text/Parsing/Parser/String.purs

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,19 +36,27 @@ module Text.Parsing.Parser.String
3636
, noneOf
3737
, noneOfCodePoints
3838
, match
39+
, regex
40+
, RegexFlagsRow
3941
) where
4042

4143
import Prelude hiding (between)
4244

4345
import Control.Monad.State (get, put)
4446
import Data.Array (notElem)
47+
import Data.Array.NonEmpty as NonEmptyArray
4548
import Data.Char (fromCharCode)
4649
import Data.CodePoint.Unicode (isSpace)
50+
import Data.Either (Either(..))
4751
import Data.Foldable (elem)
4852
import Data.Maybe (Maybe(..))
4953
import Data.String (CodePoint, Pattern(..), length, null, singleton, splitAt, stripPrefix, uncons)
5054
import Data.String.CodeUnits as SCU
55+
import Data.String.Regex as Regex
56+
import Data.String.Regex.Flags (RegexFlags(..), RegexFlagsRec)
5157
import Data.Tuple (Tuple(..), fst)
58+
import Prim.Row (class Nub, class Union)
59+
import Record (merge)
5260
import Text.Parsing.Parser (ParseState(..), ParserT, consume, fail)
5361
import Text.Parsing.Parser.Combinators (skipMany, tryRethrow, (<?>), (<~?>))
5462
import Text.Parsing.Parser.Pos (Position(..))
@@ -208,3 +216,82 @@ match p = do
208216
-- | to something other than `newtype CodePoint = CodePoint Int`.
209217
unCodePoint :: CodePoint -> Int
210218
unCodePoint = unsafeCoerce
219+
220+
-- | Parser which uses the `Data.String.Regex` module to match the regular
221+
-- | expression pattern passed as the `String`
222+
-- | argument to the parser.
223+
-- |
224+
-- | This parser will try to match the regular expression pattern starting
225+
-- | at the current parser position. On success, it will return the matched
226+
-- | substring.
227+
-- |
228+
-- | This parser may be useful for quickly consuming a large section of the
229+
-- | input `String`, because in a JavaScript runtime environment the `RegExp`
230+
-- | runtime is a lot faster than primitive parsers.
231+
-- |
232+
-- | [*MDN Regular Expressions Cheatsheet*](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Cheatsheet)
233+
-- |
234+
-- | The `Record flags` argument to the parser is for `Regex` flags. Here are
235+
-- | the default flags.
236+
-- |
237+
-- | ```purescript
238+
-- | { dotAll: true
239+
-- | ignoreCase: false
240+
-- | unicode: true
241+
-- | }
242+
-- | ```
243+
-- |
244+
-- | If you want to use the defaults then pass
245+
-- | `{}` as the flags argument. For case-insensitive pattern matching, pass
246+
-- | `{ignoreCase: true}` as the flags argument.
247+
-- | The other `Data.String.Regex.Flags.RegexFlagsRec` fields are mostly
248+
-- | nonsense in the context of parsing
249+
-- | and use of the other flags may cause strange behavior in the parser.
250+
-- |
251+
-- | [*MDN Advanced searching with flags*](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions#advanced_searching_with_flags)
252+
-- |
253+
-- | If the `Regex` pattern string fails to compile then this parser will fail.
254+
-- | (Note: It’s not possible to use a precompiled `Regex` because this parser
255+
-- | must set flags and make adjustments to the `Regex` pattern string.)
256+
regex
257+
:: forall m flags f_
258+
. Monad m
259+
=> Union flags RegexFlagsRow f_
260+
=> Nub f_ RegexFlagsRow
261+
=> Record flags
262+
-> String
263+
-> ParserT String m String
264+
regex flags pattern =
265+
-- Prefix a ^ to ensure the pattern only matches the current position in the parse
266+
case Regex.regex ("^(" <> pattern <> ")") flags' of
267+
Left paterr ->
268+
fail $ "Regex pattern error " <> paterr
269+
Right regexobj -> do
270+
ParseState input position _ <- get
271+
case NonEmptyArray.head <$> Regex.match regexobj input of
272+
Just (Just matched) -> do
273+
let remainder = SCU.drop (SCU.length matched) input
274+
put $ ParseState remainder (updatePosString position matched) true
275+
pure matched
276+
_ -> fail $ "No Regex pattern match"
277+
where
278+
flags' = RegexFlags
279+
( merge flags
280+
{ dotAll: true
281+
, global: false
282+
, ignoreCase: false
283+
, multiline: false
284+
, sticky: false
285+
, unicode: true
286+
} :: RegexFlagsRec
287+
)
288+
289+
-- | The fields from `Data.String.Regex.Flags.RegexFlagsRec`.
290+
type RegexFlagsRow =
291+
( dotAll :: Boolean
292+
, global :: Boolean
293+
, ignoreCase :: Boolean
294+
, multiline :: Boolean
295+
, sticky :: Boolean
296+
, unicode :: Boolean
297+
)

test/Main.purs

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ import Text.Parsing.Parser.Combinators (between, chainl, chainl1Rec, chainlRec,
2525
import Text.Parsing.Parser.Expr (Assoc(..), Operator(..), buildExprParser)
2626
import Text.Parsing.Parser.Language (haskellDef, haskellStyle, javaStyle)
2727
import Text.Parsing.Parser.Pos (Position(..), initialPos)
28-
import Text.Parsing.Parser.String (anyChar, anyCodePoint, char, eof, noneOfCodePoints, oneOfCodePoints, rest, satisfy, string, takeN, whiteSpace)
28+
import Text.Parsing.Parser.String (anyChar, anyCodePoint, char, eof, noneOfCodePoints, oneOfCodePoints, regex, rest, satisfy, string, takeN, whiteSpace)
2929
import Text.Parsing.Parser.String.Basic (intDecimal, number, letter)
3030
import Text.Parsing.Parser.Token (TokenParser, makeTokenParser, match, token, when)
3131
import Text.Parsing.Parser.Token as Parser.Token
@@ -683,6 +683,18 @@ main = do
683683

684684
parseTest "-300" (-300) intDecimal
685685

686+
parseTest "regex-" "regex" (regex {} "regex" <* char '-' <* eof)
687+
parseTest "-regex" "regex" (char '-' *> regex {} "regex" <* eof)
688+
parseTest "regexregex" "regexregex" (regex {} "(regex)*")
689+
parseTest "regexregex" "regex" (regex {} "(^regex)*")
690+
parseTest "ReGeX" "ReGeX" (regex { ignoreCase: true } "regex")
691+
692+
-- Maybe it is nonsense to allow multiline regex.
693+
-- Because an end-of-line regex pattern `$` will match but then the
694+
-- newline character will not be consumed.
695+
-- Also why does this test fail? I think it should succeed.
696+
-- parseTest "regex\nregex\n" "regex\nregex\n" (regex {dotAll: false, multiline: true} "(^regex$)+")
697+
686698
stackSafeLoopsTest
687699

688700
tokenParserIdentifierTest

0 commit comments

Comments
 (0)