From 1ffa3f86e794e56bf8736fe79f4c0dbd9f52068c Mon Sep 17 00:00:00 2001
From: James Brock <jamesbrock@gmail.com>
Date: Mon, 14 Nov 2022 21:39:32 +0900
Subject: [PATCH] takeWhile

---
 CHANGELOG.md                  |  2 +
 src/Parsing.purs              |  3 +-
 src/Parsing/Combinators.purs  |  2 +-
 src/Parsing/String.purs       | 20 +-------
 src/Parsing/String/Basic.purs | 86 ++++++++++++++++++++++++++++++-----
 test/Main.purs                | 19 ++++++--
 6 files changed, 96 insertions(+), 36 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index cf343be..5f5998c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,8 @@ Breaking changes:
 
 New features:
 
+- Add `Parsing.String.Basic.takeWhile`, `takeWhile1` (#218 by @jamesdbrock)
+
 Other improvements:
 
 ## [v10.1.0](https://github.com/purescript-contrib/purescript-parsing/releases/tag/v10.1.0) - 2022-11-10
diff --git a/src/Parsing.purs b/src/Parsing.purs
index a1f9fda..ffff1b1 100644
--- a/src/Parsing.purs
+++ b/src/Parsing.purs
@@ -430,8 +430,7 @@ failWithPosition message pos = throwError (ParseError message pos)
 -- |
 -- | lmap (parseErrorHuman input 30) $ runParser input do
 -- |   inContext ("Megacity list: " <> _) do
--- |     cityname <- inContext ("city name: " <> _) do
--- |       fst <$> match (skipMany letter)
+-- |     cityname <- inContext ("city name: " <> _) (takeWhile isLetter)
 -- |     skipSpaces
 -- |     population <- inContext ("population: " <> _) intDecimal
 -- |     pure $ Tuple cityname population
diff --git a/src/Parsing/Combinators.purs b/src/Parsing/Combinators.purs
index 63d1b5f..6592f11 100644
--- a/src/Parsing/Combinators.purs
+++ b/src/Parsing/Combinators.purs
@@ -113,7 +113,7 @@ infixl 4 withErrorMessage as <?>
 -- |
 -- |```purescript
 -- |parseBang :: Parser Char
--- |parseBang = char '!' <~?> \_ -> "Expected a bang"
+-- |parseBang = char '!' <~?> \_ -> "a bang"
 -- |```
 withLazyErrorMessage :: forall m s a. ParserT s m a -> (Unit -> String) -> ParserT s m a
 withLazyErrorMessage p msg = p <|> defer \_ -> fail ("Expected " <> msg unit)
diff --git a/src/Parsing/String.purs b/src/Parsing/String.purs
index d8869f9..29c7652 100644
--- a/src/Parsing/String.purs
+++ b/src/Parsing/String.purs
@@ -3,8 +3,8 @@
 -- |
 -- | All of these primitive parsers will consume when they succeed.
 -- |
--- | All of these primitive parsers will not consume when they
--- | fail.
+-- | All of these primitive parsers will not consume and will automatically
+-- | backtrack when they fail.
 -- |
 -- | The behavior of these primitive parsers is based on the behavior of the
 -- | `Data.String` module in the __strings__ package.
@@ -181,22 +181,6 @@ updatePosSingle (Position { index, line, column }) cp after = case fromEnum cp o
 
 -- | Combinator which returns both the result of a parse and the slice of
 -- | the input that was consumed while it was being parsed.
--- |
--- | Because `String`s are not `Char` arrays in PureScript, `many` and `some`
--- | on `Char` parsers need to
--- | be used with `Data.String.CodeUnits.fromCharArray` to
--- | construct a `String`.
--- |
--- | ```
--- | fromCharArray <$> Data.Array.many (char 'x')
--- | ```
--- |
--- | It’s more efficient to achieve the same result by using this `match` combinator
--- | instead of `fromCharArray`.
--- |
--- | ```
--- | fst <$> match (Combinators.skipMany (char 'x'))
--- | ```
 match :: forall m a. ParserT String m a -> ParserT String m (Tuple String a)
 match p = do
   ParseState input1 _ _ <- getParserT
diff --git a/src/Parsing/String/Basic.purs b/src/Parsing/String/Basic.purs
index bf8121c..2b05b26 100644
--- a/src/Parsing/String/Basic.purs
+++ b/src/Parsing/String/Basic.purs
@@ -2,7 +2,9 @@
 -- |
 -- | #### unicode dependency
 -- |
--- | Some of the parsers in this module depend on the __unicode__ package.
+-- | Some of the parsers in this module depend on the
+-- | [__unicode__](https://pursuit.purescript.org/packages/purescript-unicode)
+-- | package.
 -- | The __unicode__ package is large; about half a megabyte unminified.
 -- | If code which depends on __parsing__ is “tree-shaken”
 -- | “dead-code-eliminated,” then
@@ -24,6 +26,8 @@ module Parsing.String.Basic
   , alphaNum
   , intDecimal
   , number
+  , takeWhile
+  , takeWhile1
   , whiteSpace
   , skipSpaces
   , oneOf
@@ -41,13 +45,13 @@ import Data.Int as Data.Int
 import Data.Maybe (Maybe(..))
 import Data.Number (infinity, nan)
 import Data.Number as Data.Number
-import Data.String (CodePoint, singleton, takeWhile)
+import Data.String (CodePoint, singleton)
+import Data.String as String
 import Data.String.CodePoints (codePointFromChar)
 import Data.String.CodeUnits as SCU
-import Data.Tuple (fst)
 import Parsing (ParserT, fail)
 import Parsing.Combinators (choice, tryRethrow, (<?>), (<|>), (<~?>))
-import Parsing.String (consumeWith, match, regex, satisfy, satisfyCodePoint, string)
+import Parsing.String (consumeWith, regex, satisfy, satisfyCodePoint, string)
 import Partial.Unsafe (unsafeCrashWith)
 
 -- | Parse a digit.  Matches any char that satisfies `Data.CodePoint.Unicode.isDecDigit`.
@@ -112,7 +116,7 @@ number =
         section <- numberRegex
         -- https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/parseFloat
         case Data.Number.fromString section of
-          Nothing -> fail $ "Number.fromString failed"
+          Nothing -> fail "Expected Number"
           Just x -> pure x
     ] <|> fail "Expected Number"
 
@@ -134,7 +138,7 @@ intDecimal :: forall m. ParserT String m Int
 intDecimal = tryRethrow do
   section <- intDecimalRegex <|> fail "Expected Int"
   case Data.Int.fromString section of
-    Nothing -> fail $ "Int.fromString failed"
+    Nothing -> fail "Expected Int"
     Just x -> pure x
 
 -- Non-exported regex is compiled at startup time.
@@ -153,17 +157,14 @@ satisfyCP p = satisfy (p <<< codePointFromChar)
 -- | Always succeeds. Will consume only when matched whitespace string
 -- | is non-empty.
 whiteSpace :: forall m. ParserT String m String
-whiteSpace = fst <$> match skipSpaces
+whiteSpace = takeWhile isSpace
 
 -- | Skip whitespace characters satisfying `Data.CodePoint.Unicode.isSpace`
 -- | and throw them away.
 -- |
 -- | Always succeeds. Will only consume when some characters are skipped.
 skipSpaces :: forall m. ParserT String m Unit
-skipSpaces = consumeWith \input -> do
-  let consumed = takeWhile isSpace input
-  let remainder = SCU.drop (SCU.length consumed) input
-  Right { value: unit, consumed, remainder }
+skipSpaces = void whiteSpace
 
 -- | Match one of the BMP `Char`s in the array.
 oneOf :: forall m. Array Char -> ParserT String m Char
@@ -180,3 +181,66 @@ oneOfCodePoints ss = satisfyCodePoint (flip elem ss) <~?> \_ -> "one of " <> sho
 -- | Match any Unicode character not in the array.
 noneOfCodePoints :: forall m. Array CodePoint -> ParserT String m CodePoint
 noneOfCodePoints ss = satisfyCodePoint (flip notElem ss) <~?> \_ -> "none of " <> show (singleton <$> ss)
+
+-- | Take the longest `String` for which the characters satisfy the
+-- | predicate.
+-- |
+-- | See [__`Data.CodePoint.Unicode`__](https://pursuit.purescript.org/packages/purescript-unicode/docs/Data.CodePoint.Unicode)
+-- | for useful predicates.
+-- |
+-- | Example:
+-- |
+-- | ```
+-- | runParser "Tackling the Awkward Squad" do
+-- |   takeWhile Data.CodePoint.Unicode.isLetter
+-- | ```
+-- | ---
+-- | ```
+-- | Right "Tackling"
+-- | ```
+-- |
+-- | You should prefer `takeWhile isLetter` to
+-- | `fromCharArray <$> Data.Array.many letter`.
+takeWhile :: forall m. (CodePoint -> Boolean) -> ParserT String m String
+takeWhile predicate =
+  consumeWith \s ->
+    let
+      value = String.takeWhile predicate s
+    in
+      Right
+        { consumed: value
+        , remainder: SCU.drop (SCU.length value) s
+        , value
+        }
+
+-- | Take the longest `String` for which the characters satisfy the
+-- | predicate. Require at least 1 character. You should supply an
+-- | expectation description for the error
+-- | message for when the predicate fails on the first character.
+-- |
+-- | See [__`Data.CodePoint.Unicode`__](https://pursuit.purescript.org/packages/purescript-unicode/docs/Data.CodePoint.Unicode)
+-- | for useful predicates.
+-- |
+-- | Example:
+-- |
+-- | ```
+-- | runParser "Tackling the Awkward Squad" do
+-- |   takeWhile1 Data.CodePoint.Unicode.isLetter <?> "a letter"
+-- | ```
+-- | ---
+-- | ```
+-- | Right "Tackling"
+-- | ```
+takeWhile1 :: forall m. (CodePoint -> Boolean) -> ParserT String m String
+takeWhile1 predicate =
+  consumeWith \s ->
+    let
+      value = String.takeWhile predicate s
+      len = SCU.length value
+    in
+      if len > 0 then Right
+        { consumed: value
+        , remainder: SCU.drop (SCU.length value) s
+        , value
+        }
+      else Left "Expected character satisfying predicate"
diff --git a/test/Main.purs b/test/Main.purs
index ce07d9d..f9b6abf 100644
--- a/test/Main.purs
+++ b/test/Main.purs
@@ -13,6 +13,7 @@ import Control.Monad.State (State, lift, modify, runState)
 import Data.Array (some, toUnfoldable)
 import Data.Array as Array
 import Data.Bifunctor (lmap, rmap)
+import Data.CodePoint.Unicode as CodePoint.Unicode
 import Data.Either (Either(..), either, fromLeft, hush)
 import Data.Foldable (oneOf)
 import Data.List (List(..), fromFoldable, (:))
@@ -36,12 +37,11 @@ import Effect.Unsafe (unsafePerformEffect)
 import Node.Process (lookupEnv)
 import Parsing (ParseError(..), ParseState(..), Parser, ParserT, Position(..), consume, fail, getParserT, initialPos, parseErrorMessage, parseErrorPosition, position, region, runParser)
 import Parsing.Combinators (advance, between, chainl, chainl1, chainr, chainr1, choice, empty, endBy, endBy1, lookAhead, many, many1, many1Till, many1Till_, manyIndex, manyTill, manyTill_, notFollowedBy, optionMaybe, replicateA, sepBy, sepBy1, sepEndBy, sepEndBy1, skipMany, skipMany1, try, tryRethrow, (<?>), (<??>), (<~?>))
-import Parsing.Combinators as Combinators
 import Parsing.Combinators.Array as Combinators.Array
 import Parsing.Expr (Assoc(..), Operator(..), buildExprParser)
 import Parsing.Language (haskellDef, haskellStyle, javaStyle)
 import Parsing.String (anyChar, anyCodePoint, anyTill, char, eof, match, parseErrorHuman, regex, rest, satisfy, string, takeN)
-import Parsing.String.Basic (intDecimal, letter, noneOfCodePoints, number, oneOfCodePoints, skipSpaces, whiteSpace)
+import Parsing.String.Basic (intDecimal, letter, noneOfCodePoints, number, oneOfCodePoints, skipSpaces, takeWhile, takeWhile1, whiteSpace)
 import Parsing.String.Basic as String.Basic
 import Parsing.String.Replace (breakCap, replace, replaceT, splitCap, splitCapT)
 import Parsing.Token (TokenParser, makeTokenParser, token, when)
@@ -712,8 +712,7 @@ main = do
     assertEqual' "region 1"
       { actual: runParser input do
           inContext ("Megacity list: " <> _) do
-            cityname <- inContext ("city name: " <> _) do
-              fst <$> match (Combinators.skipMany letter)
+            cityname <- inContext ("city name: " <> _) (takeWhile CodePoint.Unicode.isLetter)
             skipSpaces
             population <- inContext ("population: " <> _) intDecimal
             pure $ Tuple cityname population
@@ -725,6 +724,18 @@ main = do
     , expected: Left $ ParseError "Expected 'c'" (Position { index: 1, column: 2, line: 1 })
     }
 
+  assertEqual' "takeWhile 1"
+    { actual: runParser "Tackling the Awkward" do
+        takeWhile CodePoint.Unicode.isLetter <* string " the Awkward"
+    , expected: Right "Tackling"
+    }
+
+  assertEqual' "takeWhile1 1"
+    { actual: runParser "3ackling the Awkward" do
+        takeWhile1 CodePoint.Unicode.isLetter <* string " the Awkward" <?> "letter"
+    , expected: Left $ ParseError "Expected letter" (Position { index: 0, line: 1, column: 1 })
+    }
+
   log "\nTESTS number\n"
 
   -- assert' "Number.fromString" $ Just infinity == Data.Number.fromString "Infinity"