takeWhile

jamesdbrock · jamesdbrock · commit c4579cad3f0b · 2022-11-14T22:45:49.000+09:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,8 @@ Breaking changes:
 
 New features:
 
+- Add `Parsing.String.Basic.takeWhile`, `takeWhile1` (#218 by @jamesdbrock)
+
 Other improvements:
 
 ## [v10.1.0](https://github.com/purescript-contrib/purescript-parsing/releases/tag/v10.1.0) - 2022-11-10
diff --git a/src/Parsing.purs b/src/Parsing.purs
@@ -430,8 +430,7 @@ failWithPosition message pos = throwError (ParseError message pos)
 -- |
 -- | lmap (parseErrorHuman input 30) $ runParser input do
 -- |   inContext ("Megacity list: " <> _) do
--- |     cityname <- inContext ("city name: " <> _) do
--- |       fst <$> match (skipMany letter)
+-- |     cityname <- inContext ("city name: " <> _) (takeWhile isLetter)
 -- |     skipSpaces
 -- |     population <- inContext ("population: " <> _) intDecimal
 -- |     pure $ Tuple cityname population
diff --git a/src/Parsing/Combinators.purs b/src/Parsing/Combinators.purs
@@ -113,7 +113,7 @@ infixl 4 withErrorMessage as <?>
 -- |
 -- |```purescript
 -- |parseBang :: Parser Char
--- |parseBang = char '!' <~?> \_ -> "Expected a bang"
+-- |parseBang = char '!' <~?> \_ -> "a bang"
 -- |```
 withLazyErrorMessage :: forall m s a. ParserT s m a -> (Unit -> String) -> ParserT s m a
 withLazyErrorMessage p msg = p <|> defer \_ -> fail ("Expected " <> msg unit)
diff --git a/src/Parsing/String.purs b/src/Parsing/String.purs
@@ -3,8 +3,8 @@
 -- |
 -- | All of these primitive parsers will consume when they succeed.
 -- |
--- | All of these primitive parsers will not consume when they
--- | fail.
+-- | All of these primitive parsers will not consume and will automatically
+-- | backtrack when they fail.
 -- |
 -- | The behavior of these primitive parsers is based on the behavior of the
 -- | `Data.String` module in the __strings__ package.
@@ -181,22 +181,6 @@ updatePosSingle (Position { index, line, column }) cp after = case fromEnum cp o
 
 -- | Combinator which returns both the result of a parse and the slice of
 -- | the input that was consumed while it was being parsed.
--- |
--- | Because `String`s are not `Char` arrays in PureScript, `many` and `some`
--- | on `Char` parsers need to
--- | be used with `Data.String.CodeUnits.fromCharArray` to
--- | construct a `String`.
--- |
--- | ```
--- | fromCharArray <$> Data.Array.many (char 'x')
--- | ```
--- |
--- | It’s more efficient to achieve the same result by using this `match` combinator
--- | instead of `fromCharArray`.
--- |
--- | ```
--- | fst <$> match (Combinators.skipMany (char 'x'))
--- | ```
 match :: forall m a. ParserT String m a -> ParserT String m (Tuple String a)
 match p = do
   ParseState input1 _ _ <- getParserT
diff --git a/src/Parsing/String/Basic.purs b/src/Parsing/String/Basic.purs
@@ -2,7 +2,9 @@
 -- |
 -- | #### unicode dependency
 -- |
--- | Some of the parsers in this module depend on the __unicode__ package.
+-- | Some of the parsers in this module depend on the
+-- | [__unicode__](https://pursuit.purescript.org/packages/purescript-unicode)
+-- | package.
 -- | The __unicode__ package is large; about half a megabyte unminified.
 -- | If code which depends on __parsing__ is “tree-shaken”
 -- | “dead-code-eliminated,” then
@@ -24,6 +26,8 @@ module Parsing.String.Basic
   , alphaNum
   , intDecimal
   , number
+  , takeWhile
+  , takeWhile1
   , whiteSpace
   , skipSpaces
   , oneOf
@@ -41,7 +45,8 @@ import Data.Int as Data.Int
 import Data.Maybe (Maybe(..))
 import Data.Number (infinity, nan)
 import Data.Number as Data.Number
-import Data.String (CodePoint, singleton, takeWhile)
+import Data.String (CodePoint, singleton)
+import Data.String as String
 import Data.String.CodePoints (codePointFromChar)
 import Data.String.CodeUnits as SCU
 import Data.Tuple (fst)
@@ -112,7 +117,7 @@ number =
         section <- numberRegex
         -- https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/parseFloat
         case Data.Number.fromString section of
-          Nothing -> fail $ "Number.fromString failed"
+          Nothing -> fail "Expected Number"
           Just x -> pure x
     ] <|> fail "Expected Number"
 
@@ -134,7 +139,7 @@ intDecimal :: forall m. ParserT String m Int
 intDecimal = tryRethrow do
   section <- intDecimalRegex <|> fail "Expected Int"
   case Data.Int.fromString section of
-    Nothing -> fail $ "Int.fromString failed"
+    Nothing -> fail "Expected Int"
     Just x -> pure x
 
 -- Non-exported regex is compiled at startup time.
@@ -153,17 +158,14 @@ satisfyCP p = satisfy (p <<< codePointFromChar)
 -- | Always succeeds. Will consume only when matched whitespace string
 -- | is non-empty.
 whiteSpace :: forall m. ParserT String m String
-whiteSpace = fst <$> match skipSpaces
+whiteSpace = takeWhile isSpace
 
 -- | Skip whitespace characters satisfying `Data.CodePoint.Unicode.isSpace`
 -- | and throw them away.
 -- |
 -- | Always succeeds. Will only consume when some characters are skipped.
 skipSpaces :: forall m. ParserT String m Unit
-skipSpaces = consumeWith \input -> do
-  let consumed = takeWhile isSpace input
-  let remainder = SCU.drop (SCU.length consumed) input
-  Right { value: unit, consumed, remainder }
+skipSpaces = void whiteSpace
 
 -- | Match one of the BMP `Char`s in the array.
 oneOf :: forall m. Array Char -> ParserT String m Char
@@ -180,3 +182,68 @@ oneOfCodePoints ss = satisfyCodePoint (flip elem ss) <~?> \_ -> "one of " <> sho
 -- | Match any Unicode character not in the array.
 noneOfCodePoints :: forall m. Array CodePoint -> ParserT String m CodePoint
 noneOfCodePoints ss = satisfyCodePoint (flip notElem ss) <~?> \_ -> "none of " <> show (singleton <$> ss)
+
+-- | Take the longest `String` for which the characters satisfy the
+-- | predicate.
+-- |
+-- | Always succeeds. The result is the empty `String` when the first
+-- | character of the input does not satisfy the predicate.
+-- |
+-- | See [__`Data.CodePoint.Unicode`__](https://pursuit.purescript.org/packages/purescript-unicode/docs/Data.CodePoint.Unicode)
+-- | for useful predicates.
+-- |
+-- | Example:
+-- |
+-- | ```
+-- | runParser "Tackling the Awkward Squad" do
+-- |   takeWhile Data.CodePoint.Unicode.isLetter
+-- | ```
+-- | ---
+-- | ```
+-- | Right "Tackling"
+-- | ```
+-- |
+-- | You should prefer `takeWhile isLetter` to
+-- | `fromCharArray <$> Data.Array.many letter`.
+takeWhile :: forall m. (CodePoint -> Boolean) -> ParserT String m String
+takeWhile predicate = consumeWith \input ->
+  let
+    matched = String.takeWhile predicate input
+    -- Drop by code units so the remainder lines up with `matched`.
+    leftover = SCU.drop (SCU.length matched) input
+  in
+    Right { value: matched, consumed: matched, remainder: leftover }
+
+
+-- | Take the longest `String` for which the characters satisfy the
+-- | predicate. Require at least 1 character.
+-- |
+-- | Fails when the predicate fails on the first character. You should
+-- | supply an expectation description with `<?>` for that error message.
+-- |
+-- | See [__`Data.CodePoint.Unicode`__](https://pursuit.purescript.org/packages/purescript-unicode/docs/Data.CodePoint.Unicode)
+-- | for useful predicates.
+-- |
+-- | Example:
+-- |
+-- | ```
+-- | runParser "Tackling the Awkward Squad" do
+-- |   takeWhile1 Data.CodePoint.Unicode.isLetter <?> "letter"
+-- | ```
+-- | ---
+-- | ```
+-- | Right "Tackling"
+-- | ```
+takeWhile1 :: forall m. (CodePoint -> Boolean) -> ParserT String m String
+takeWhile1 predicate =
+  consumeWith \s ->
+    let
+      value = String.takeWhile predicate s
+      -- Code-unit length, because the remainder is cut with `SCU.drop`.
+      len = SCU.length value
+    in
+      if len > 0 then
+        Right { consumed: value, remainder: SCU.drop len s, value }
+      else
+        -- Use the same "Expected ..." wording as the other parsers here.
+        Left "Expected character satisfying predicate"
diff --git a/test/Main.purs b/test/Main.purs
@@ -13,6 +13,7 @@ import Control.Monad.State (State, lift, modify, runState)
 import Data.Array (some, toUnfoldable)
 import Data.Array as Array
 import Data.Bifunctor (lmap, rmap)
+import Data.CodePoint.Unicode as CodePoint.Unicode
 import Data.Either (Either(..), either, fromLeft, hush)
 import Data.Foldable (oneOf)
 import Data.List (List(..), fromFoldable, (:))
@@ -41,7 +42,7 @@ import Parsing.Combinators.Array as Combinators.Array
 import Parsing.Expr (Assoc(..), Operator(..), buildExprParser)
 import Parsing.Language (haskellDef, haskellStyle, javaStyle)
 import Parsing.String (anyChar, anyCodePoint, anyTill, char, eof, match, parseErrorHuman, regex, rest, satisfy, string, takeN)
-import Parsing.String.Basic (intDecimal, letter, noneOfCodePoints, number, oneOfCodePoints, skipSpaces, whiteSpace)
+import Parsing.String.Basic (intDecimal, letter, noneOfCodePoints, number, oneOfCodePoints, skipSpaces, takeWhile, takeWhile1, whiteSpace)
 import Parsing.String.Basic as String.Basic
 import Parsing.String.Replace (breakCap, replace, replaceT, splitCap, splitCapT)
 import Parsing.Token (TokenParser, makeTokenParser, token, when)
@@ -712,8 +713,7 @@ main = do
     assertEqual' "region 1"
       { actual: runParser input do
           inContext ("Megacity list: " <> _) do
-            cityname <- inContext ("city name: " <> _) do
-              fst <$> match (Combinators.skipMany letter)
+            cityname <- inContext ("city name: " <> _) (takeWhile CodePoint.Unicode.isLetter)
             skipSpaces
             population <- inContext ("population: " <> _) intDecimal
             pure $ Tuple cityname population
@@ -725,6 +725,18 @@ main = do
     , expected: Left $ ParseError "Expected 'c'" (Position { index: 1, column: 2, line: 1 })
     }
 
+  assertEqual' "takeWhile 1"
+    { actual: runParser "Tackling the Awkward" do
+        takeWhile CodePoint.Unicode.isLetter <* string " the Awkward"
+    , expected: Right "Tackling"
+    }
+
+  assertEqual' "takeWhile1 1"
+    { actual: runParser "3ackling the Awkward" do
+        takeWhile1 CodePoint.Unicode.isLetter <* string " the Awkward" <?> "letter"
+    , expected: Left $ ParseError "Expected letter" (Position {index:0,line:1,column:1})
+    }
+
   log "\nTESTS number\n"
 
   -- assert' "Number.fromString" $ Just infinity == Data.Number.fromString "Infinity"