diff --git a/src/Text/Parsing/StringParser/CodePoints.purs b/src/Text/Parsing/StringParser/CodePoints.purs deleted file mode 100644 index 54dde1d..0000000 --- a/src/Text/Parsing/StringParser/CodePoints.purs +++ /dev/null @@ -1,155 +0,0 @@ --- | Primitive parsers for strings, parsing based on code points. --- | --- | These functions will be much slower than the `CodeUnits` alternatives, but --- | will behave correctly in the presence of Unicode characters made up of --- | multiple code units. -module Text.Parsing.StringParser.CodePoints - ( eof - , anyChar - , anyDigit - , string - , satisfy - , char - , whiteSpace - , skipSpaces - , oneOf - , noneOf - , lowerCaseChar - , upperCaseChar - , anyLetter - , alphaNum - , regex - ) where - -import Prelude - -import Control.Alt ((<|>)) -import Data.Array ((..)) -import Data.Array.NonEmpty as NEA -import Data.Char (fromCharCode, toCharCode) -import Data.Either (Either(..)) -import Data.Enum (fromEnum) -import Data.Foldable (class Foldable, foldMap, elem, notElem) -import Data.Maybe (Maybe(..)) -import Data.String.CodePoints (codePointAt, drop, indexOf', length, stripPrefix) -import Data.String.CodeUnits (singleton) -import Data.String.Pattern (Pattern(..)) -import Data.String.Regex as Regex -import Data.String.Regex.Flags (noFlags) -import Text.Parsing.StringParser (Parser(..), ParseError(..), try, fail) -import Text.Parsing.StringParser.Combinators (many, ()) - --- | Match the end of the file. -eof :: Parser Unit -eof = Parser \s -> - case s of - { str, pos } | pos < length str -> Left { pos, error: ParseError "Expected EOF" } - _ -> Right { result: unit, suffix: s } - --- | Match any character. -anyChar :: Parser Char -anyChar = Parser \{ str, pos } -> - case codePointAt pos str of - Just cp -> case toChar cp of - Just chr -> Right { result: chr, suffix: { str, pos: pos + 1 } } - Nothing -> Left { pos, error: ParseError $ "CodePoint " <> show cp <> " is not a character" } - Nothing -> Left { pos, error: ParseError "Unexpected EOF" } - where - toChar = fromCharCode <<< fromEnum - --- | Match any digit. -anyDigit :: Parser Char -anyDigit = try do - c <- anyChar - if c >= '0' && c <= '9' - then pure c - else fail $ "Character " <> show c <> " is not a digit" - --- | Match the specified string. -string :: String -> Parser String -string nt = Parser \s -> - case s of - { str, pos } | indexOf' (Pattern nt) pos str == Just pos -> Right { result: nt, suffix: { str, pos: pos + length nt } } - { pos } -> Left { pos, error: ParseError ("Expected '" <> nt <> "'.") } - --- | Match a character satisfying the given predicate. -satisfy :: (Char -> Boolean) -> Parser Char -satisfy f = try do - c <- anyChar - if f c - then pure c - else fail $ "Character " <> show c <> " did not satisfy predicate" - --- | Match the specified character. -char :: Char -> Parser Char -char c = satisfy (_ == c) "Could not match character " <> show c - --- | Match many whitespace characters. -whiteSpace :: Parser String -whiteSpace = do - cs <- many (satisfy \ c -> c == '\n' || c == '\r' || c == ' ' || c == '\t') - pure (foldMap singleton cs) - --- | Skip many whitespace characters. -skipSpaces :: Parser Unit -skipSpaces = void whiteSpace - --- | Match one of the characters in the foldable structure. -oneOf :: forall f. Foldable f => f Char -> Parser Char -oneOf = satisfy <<< flip elem - --- | Match any character not in the foldable structure. -noneOf :: forall f. Foldable f => f Char -> Parser Char -noneOf = satisfy <<< flip notElem - --- | Match any lower case character. -lowerCaseChar :: Parser Char -lowerCaseChar = try do - c <- anyChar - if toCharCode c `elem` (97 .. 122) - then pure c - else fail $ "Expected a lower case character but found " <> show c - --- | Match any upper case character. -upperCaseChar :: Parser Char -upperCaseChar = try do - c <- anyChar - if toCharCode c `elem` (65 .. 90) - then pure c - else fail $ "Expected an upper case character but found " <> show c - --- | Match any letter. -anyLetter :: Parser Char -anyLetter = lowerCaseChar <|> upperCaseChar "Expected a letter" - --- | Match a letter or a number. -alphaNum :: Parser Char -alphaNum = anyLetter <|> anyDigit "Expected a letter or a number" - --- | match the regular expression -regex :: String -> Parser String -regex pat = - case Regex.regex pattern noFlags of - Left _ -> - fail $ "Text.Parsing.StringParser.String.regex': illegal regex " <> pat - Right r -> - matchRegex r - where - -- ensure the pattern only matches the current position in the parse - pattern = - case stripPrefix (Pattern "^") pat of - Nothing -> - "^" <> pat - _ -> - pat - matchRegex :: Regex.Regex -> Parser String - matchRegex r = - Parser \{ str, pos } -> - let - remainder = drop pos str - in - case NEA.head <$> Regex.match r remainder of - Just (Just matched) -> - Right { result: matched, suffix: { str, pos: pos + length matched } } - _ -> - Left { pos, error: ParseError "no match" } diff --git a/src/Text/Parsing/StringParser/CodeUnits.purs b/src/Text/Parsing/StringParser/CodeUnits.purs index 7100480..7150be3 100644 --- a/src/Text/Parsing/StringParser/CodeUnits.purs +++ b/src/Text/Parsing/StringParser/CodeUnits.purs @@ -6,6 +6,7 @@ module Text.Parsing.StringParser.CodeUnits ( eof , anyChar + , anyCodePoint , anyDigit , string , satisfy @@ -30,6 +31,7 @@ import Data.Char (toCharCode) import Data.Either (Either(..)) import Data.Foldable (class Foldable, foldMap, elem, notElem) import Data.Maybe (Maybe(..)) +import Data.String as SCP import Data.String.CodeUnits (charAt, singleton) import Data.String.CodeUnits as SCU import Data.String.Pattern (Pattern(..)) @@ -45,13 +47,22 @@ eof = Parser \s -> { str, pos } | pos < SCU.length str -> Left { pos, error: ParseError "Expected EOF" } _ -> Right { result: unit, suffix: s } --- | Match any character. +-- | Match any character. This is limited by `Char` to any code points +-- | that are below `0xFFFF`. If you need to use higher code points +-- | (e.g. emoji), see `anyCodePoint` and `string`. anyChar :: Parser Char anyChar = Parser \{ str, pos } -> case charAt pos str of Just chr -> Right { result: chr, suffix: { str, pos: pos + 1 } } Nothing -> Left { pos, error: ParseError "Unexpected EOF" } +-- | Match any code point, including those above `0xFFFF` +anyCodePoint :: Parser SCP.CodePoint +anyCodePoint = Parser \rec@{ str, pos } -> + case SCP.codePointAt 0 (SCU.drop pos str) of + Just cp -> Right { result: cp, suffix: { str, pos: pos + SCU.length (SCP.singleton cp) } } + Nothing -> Left { pos, error: ParseError "Unexpected EOF" } + -- | Match any digit. anyDigit :: Parser Char anyDigit = try do diff --git a/test/CodePoints.purs b/test/CodePoints.purs deleted file mode 100644 index 76c52d6..0000000 --- a/test/CodePoints.purs +++ /dev/null @@ -1,102 +0,0 @@ -module Test.CodePoints where - -import Prelude hiding (between) - -import Control.Alt ((<|>)) -import Data.Either (isLeft, isRight, Either(..)) -import Data.Foldable (fold) -import Data.List (List(Nil), (:)) -import Data.List.Lazy (take, repeat) -import Data.List.NonEmpty (NonEmptyList(..)) -import Data.NonEmpty ((:|)) -import Data.String.CodeUnits (singleton) -import Data.String.Common as SC -import Data.Unfoldable (replicate) -import Effect (Effect) -import Test.Assert (assert', assert) -import Text.Parsing.StringParser (Parser, runParser, try) -import Text.Parsing.StringParser.Combinators (many1, endBy1, sepBy1, optionMaybe, many, manyTill, many1Till, chainl, fix, between) -import Text.Parsing.StringParser.Expr (Assoc(..), Operator(..), buildExprParser) -import Text.Parsing.StringParser.CodePoints (anyDigit, char, eof, string, anyChar, regex) - -parens :: forall a. Parser a -> Parser a -parens = between (string "(") (string ")") - -nested :: Parser Int -nested = fix $ \p -> (do - _ <- string "a" - pure 0) <|> ((+) 1) <$> parens p - -opTest :: Parser String -opTest = chainl (singleton <$> anyChar) (string "+" $> append) "" - -digit :: Parser Int -digit = string "0" $> 0 - <|> string "1" $> 1 - <|> string "2" $> 2 - <|> string "3" $> 3 - <|> string "4" $> 4 - <|> string "5" $> 5 - <|> string "6" $> 6 - <|> string "7" $> 7 - <|> string "8" $> 8 - <|> string "9" $> 9 - -exprTest :: Parser Int -exprTest = buildExprParser [ [Infix (string "/" >>= \_ -> pure div) AssocRight] - , [Infix (string "*" >>= \_ -> pure mul) AssocRight] - , [Infix (string "-" >>= \_ -> pure sub) AssocRight] - , [Infix (string "+" >>= \_ -> pure add) AssocRight] - ] digit - -tryTest :: Parser String - -- reduce the possible array of matches to 0 or 1 elements to aid Array pattern matching -tryTest = - try (string "aa" <> string "bb") <|> - (string "aa" <> string "cc") - -canParse :: forall a. Parser a -> String -> Boolean -canParse p input = isRight $ runParser p input - -parseFail :: forall a. Parser a -> String -> Boolean -parseFail p input = isLeft $ runParser p input - -expectResult :: forall a. Eq a => a -> Parser a -> String -> Boolean -expectResult res p input = runParser p input == Right res - -testCodePoints :: Effect Unit -testCodePoints = do - assert' "many should not blow the stack" $ canParse (many (string "a")) (SC.joinWith "" $ replicate 100000 "a") - assert' "many failing after" $ parseFail (do - as <- many (string "a") - eof - pure as) (SC.joinWith "" (replicate 100000 "a") <> "b" ) - - assert $ expectResult 3 nested "(((a)))" - assert $ expectResult ("a":"a":"a":Nil) (many (string "a")) "aaa" - assert $ parseFail (many1 (string "a")) "" - assert $ canParse (parens (do - _ <- string "a" - optionMaybe $ string "b")) "(ab)" - assert $ expectResult (NonEmptyList ("a" :| "a":"a":Nil)) (string "a" `sepBy1` string ",") "a,a,a" - assert $ canParse (do - as <- string "a" `endBy1` string "," - eof - pure as) "a,a,a," - assert' "opTest" $ expectResult "abc" opTest "a+b+c" - assert' "exprTest" $ expectResult (-3) exprTest "1*2+3/4-5" - assert' "tryTest "$ canParse tryTest "aacc" - assert $ expectResult (NonEmptyList ('0' :| '1':'2':'3':'4':Nil)) (many1 anyDigit) "01234/" - assert $ expectResult (NonEmptyList ('5' :| '6':'7':'8':'9':Nil)) (many1 anyDigit) "56789:" - assert $ expectResult "aaaa" (regex "a+") "aaaab" - assert $ expectResult ("a":"a":"a":Nil) (manyTill (string "a") (string "b")) "aaab" - assert $ expectResult Nil (manyTill (string "a") (string "b")) "b" - assert $ expectResult (NonEmptyList ("a" :| "a":"a":Nil)) (many1Till (string "a") (string "b")) "aaab" - assert $ parseFail (many1Till (string "a") (string "b")) "b" - -- check against overflow - assert $ canParse (many1Till (string "a") (string "and")) $ (fold <<< take 10000 $ repeat "a") <> "and" - -- check correct order - assert $ expectResult (NonEmptyList ('a' :| 'b':'c':Nil)) (many1Till anyChar (string "d")) "abcd" - assert $ expectResult "\x458CA" (string "\x458CA" <* char ']' <* eof ) "\x458CA]" - assert $ expectResult "\x458CA" (string "\x458CA" <* string ")" <* eof ) "\x458CA)" - assert $ expectResult '\xEEE2' (char '\xEEE2' <* eof ) "\xEEE2" diff --git a/test/CodeUnits.purs b/test/CodeUnits.purs index ff81cec..7221895 100644 --- a/test/CodeUnits.purs +++ b/test/CodeUnits.purs @@ -9,15 +9,16 @@ import Data.List (List(Nil), (:)) import Data.List.Lazy (take, repeat) import Data.List.NonEmpty (NonEmptyList(..)) import Data.NonEmpty ((:|)) +import Data.String.CodePoints as SCP import Data.String.CodeUnits (singleton) import Data.String.Common as SC import Data.Unfoldable (replicate) import Effect (Effect) import Test.Assert (assert', assert) import Text.Parsing.StringParser (Parser, runParser, try) +import Text.Parsing.StringParser.CodeUnits (anyChar, anyCodePoint, anyDigit, eof, regex, string) import Text.Parsing.StringParser.Combinators (many1, endBy1, sepBy1, optionMaybe, many, manyTill, many1Till, chainl, fix, between) import Text.Parsing.StringParser.Expr (Assoc(..), Operator(..), buildExprParser) -import Text.Parsing.StringParser.CodeUnits (anyDigit, eof, string, anyChar, regex) parens :: forall a. Parser a -> Parser a parens = between (string "(") (string ")") @@ -97,3 +98,22 @@ testCodeUnits = do assert $ canParse (many1Till (string "a") (string "and")) $ (fold <<< take 10000 $ repeat "a") <> "and" -- check correct order assert $ expectResult (NonEmptyList ('a' :| 'b':'c':Nil)) (many1Till anyChar (string "d")) "abcd" + -- check anyCodePoint + let anyCodePointStr = map SCP.singleton anyCodePoint + let anyCharStr = map singleton anyChar + assert $ expectResult (NonEmptyList ("πŸ”" :| "🍺":Nil)) (many1 $ anyCodePointStr) "πŸ”πŸΊ" + assert $ expectResult "πŸ”" (anyChar *> anyCodePointStr <* anyChar) "aπŸ”a" + assert $ expectResult ({a: "πŸ”", b: "🍺"}) ({a:_, b:_} <$> (anyCodePointStr <* void anyChar) <*> anyCodePointStr) "πŸ”a🍺" + assert $ expectResult ({a: "a", b: "b", c:"c"}) ({a:_, b:_, c:_} <$> anyCodePointStr <*> anyCodePointStr <*> anyCodePointStr) "abc" + -- check string + assert $ expectResult "πŸ”πŸΊ" (string "πŸ”πŸΊ") "πŸ”πŸΊ" + assert $ expectResult (NonEmptyList ("πŸ”πŸΊ" :| "πŸ”πŸΊ":"πŸ”πŸΊ":Nil)) (many1 $ string "πŸ”πŸΊ") "πŸ”πŸΊπŸ”πŸΊπŸ”πŸΊ" + assert $ expectResult (NonEmptyList ("aπŸ”πŸΊ":|"aπŸ”πŸΊ":"aπŸ”πŸΊ":Nil)) (many1 $ string "aπŸ”πŸΊ") "aπŸ”πŸΊaπŸ”πŸΊaπŸ”πŸΊ" + assert $ expectResult (NonEmptyList ("πŸ”a🍺":|"πŸ”a🍺":"πŸ”a🍺":Nil)) (many1 $ string "πŸ”a🍺") "πŸ”aπŸΊπŸ”aπŸΊπŸ”a🍺" + assert $ expectResult (NonEmptyList ("πŸ”πŸΊa" :| "πŸ”πŸΊa":"πŸ”πŸΊa":Nil)) (many1 $ string "πŸ”πŸΊa") "πŸ”πŸΊaπŸ”πŸΊaπŸ”πŸΊa" + assert $ expectResult (NonEmptyList ("a" :| "a":"a":Nil)) (many1 $ string "a") "aaa" + assert $ expectResult (NonEmptyList ("abc" :| "abc":"abc":Nil)) (many1 $ string "abc") "abcabcabc" + assert $ expectResult (NonEmptyList ("abc" :| "abc":"abc":Nil)) (many1 $ string "abc") "abcabcabc" + assert $ expectResult (NonEmptyList ("abcοΏ½def" :| Nil)) (many1 $ string "abcοΏ½def") "abcοΏ½def" + + assert $ expectResult "πŸ”\xd83c" (string "πŸ”\xd83c") "πŸ”πŸΊ" diff --git a/test/Examples.purs b/test/Examples.purs index e28da30..aa406ac 100644 --- a/test/Examples.purs +++ b/test/Examples.purs @@ -9,7 +9,7 @@ import Data.List.Types (NonEmptyList) import Effect (Effect) import Effect.Console (log, logShow) import Text.Parsing.StringParser (Parser, fail, runParser, unParser) -import Text.Parsing.StringParser.CodePoints (anyChar, char, eof, regex, skipSpaces, string) +import Text.Parsing.StringParser.CodeUnits (anyChar, char, eof, regex, skipSpaces, string) import Text.Parsing.StringParser.Combinators (between, endBy1, lookAhead, many, many1, sepBy1, ()) -- Serves only to make this file runnable diff --git a/test/Main.purs b/test/Main.purs index c1f178f..8625ad1 100644 --- a/test/Main.purs +++ b/test/Main.purs @@ -4,13 +4,9 @@ import Prelude import Effect (Effect) import Effect.Console (log) -import Test.CodePoints (testCodePoints) import Test.CodeUnits (testCodeUnits) main :: Effect Unit main = do - log "Testing CodePoint parsing\n" - testCodePoints - log "\n\nTesting CodeUnit parsing\n" testCodeUnits