From a1413b1607b8cc4aee57e99bc6818f955f533e85 Mon Sep 17 00:00:00 2001
From: James Brock <jamesbrock@gmail.com>
Date: Fri, 24 Sep 2021 16:31:35 +0900
Subject: [PATCH] Unicode correctness
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Correctly handle UTF-16 surrogate pairs in `String`s.
We keep all of the API, but we change the primitive parsers so that instead
of succeeding and incorrectly returning half of a surrogate pair, they will fail.

All prior tests pass with no modifications. Add a few new tests.

Non-breaking changes
====================

Add primitive parsers `anyCodePoint` and `satisfyCodePoint` for parsing
`CodePoint`s.

Add the `match` combinator.

Move `updatePosString` to the `Text.Parsing.Parser.String` module and don't
export it.

Split dev dependencies into spago-dev.dhall.

Add benchmark suite.

Add astral UTF-16 test.

Breaking changes
================

Change the definition of `whiteSpace` and `skipSpaces` to
`Data.CodePoint.Unicode.isSpace`.

To make this library handle Unicode correctly, it is necessary to
either alter the `StringLike` class or delete it.
We decided to delete it. The `String` module will now operate only
on inputs of the concrete `String` type.
`StringLike` has no laws, and during the five years of its life,
no-one on Github has ever written another instance of `StringLike`.
https://github.com/search?l=&q=StringLike+language%3APureScript&type=code
The last time someone tried to alter `StringLike`, this is what
happened:
https://github.com/purescript-contrib/purescript-parsing/pull/62

Breaking changes which won’t be caught by the compiler
======================================================

Fundamentally, we change the way we consume the next input character from
`Data.String.CodeUnits.uncons` to `Data.String.CodePoints.uncons`.

`anyChar` will no longer always succeed. It will only succeed on a Basic
Multilingual Plane character. The new parser `anyCodePoint` will always succeed.

We are not quite “making the default `CodePoint`”, as was discussed in
https://github.com/purescript-contrib/purescript-parsing/pull/76#issuecomment-398403864 .
Rather we are keeping most of the current API and making it work
properly with astral Unicode.

We keep the `Char` parsers for backward compatibility.
We also keep the `Char` parsers for ergonomic reasons. For example
the parser `char :: forall s m. Monad m => Char -> ParserT s m Char`.
This parser is usually called with a literal like `char 'a'`. It would
be annoying to call this parser with `char (codePointFromChar 'a')`.

Benchmarks
==========

For Unicode correctness, we're now consuming characters with
`Data.String.CodePoints.uncons` instead of
`Data.String.CodeUnits.uncons`. If that were going to effect
performance, then the effect would show up in the `runParser parse23`
benchmark, but it doesn’t.

Before
------

```
runParser parse23
mean   = 43.36 ms
stddev = 6.75 ms
min    = 41.12 ms
max    = 124.65 ms

runParser parseSkidoo
mean   = 22.53 ms
stddev = 3.86 ms
min    = 21.40 ms
max    = 61.76 ms
```

After
-----

```
runParser parse23
mean   = 42.90 ms
stddev = 6.01 ms
min    = 40.97 ms
max    = 115.74 ms

runParser parseSkidoo
mean   = 22.03 ms
stddev = 2.79 ms
min    = 20.78 ms
max    = 53.34 ms
```
---
 .github/workflows/ci.yml              |   6 +-
 CHANGELOG.md                          |  22 ++-
 CONTRIBUTING.md                       |  13 ++
 bench/Main.purs                       | 134 +++++++++++++++++++
 spago-dev.dhall                       |  22 +++
 spago.dhall                           |   7 +-
 src/Text/Parsing/Parser/Language.purs |   2 +-
 src/Text/Parsing/Parser/Pos.purs      |  14 +-
 src/Text/Parsing/Parser/String.purs   | 184 ++++++++++++++++++--------
 src/Text/Parsing/Parser/Token.purs    |  16 +--
 test/Main.purs                        |  28 +++-
 11 files changed, 354 insertions(+), 94 deletions(-)
 create mode 100644 bench/Main.purs
 create mode 100644 spago-dev.dhall

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index cd71fe8..b8d697e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -25,10 +25,10 @@ jobs:
             output
 
       - name: Install dependencies
-        run: spago install
+        run: spago -x spago-dev.dhall install
 
       - name: Build source
-        run: spago build --no-install --purs-args '--censor-lib --strict --censor-codes='UserDefinedWarning''
+        run: spago -x spago-dev.dhall build --no-install --purs-args '--censor-lib --strict --censor-codes='UserDefinedWarning''
 
       - name: Run tests
-        run: spago test --no-install
+        run: spago -x spago-dev.dhall test --no-install
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 709b3ed..5ecc1ca 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,10 +6,28 @@ Notable changes to this project are documented in this file. The format is based
 
 Breaking changes:
 
+- `anyChar` will no longer always succeed. It will only succeed on a Basic
+  Multilingual Plane character. The new parser `anyCodePoint` will always
+  succeed. (#119 by @jamesdbrock)
+- Delete the `StringLike` typeclass. Users must delete all `StringLike`
+  constraints. (#119 by @jamesdbrock)
+- Move `updatePosString` to the `String` module and don’t
+  export it. (#119 by @jamesdbrock)
+- Change the definition of `whiteSpace` and `skipSpaces` to
+  `Data.CodePoint.Unicode.isSpace`. (#119 by @jamesdbrock)
+
 New features:
 
+- Add primitive parsers `anyCodePoint` and `satisfyCodePoint` for parsing
+  `CodePoint`s. (#119 by @jamesdbrock)
+- Add `match` combinator (#119 by @jamesdbrock)
+- Add benchmark suite (#119 by @jamesdbrock)
+- Split the dev dependencies out into `spago-dev.dhall`.
+
 Bugfixes:
 
+- Unicode correctness (#119 by @jamesdbrock)
+
 Other improvements:
 
 ## [v6.0.2](https://github.com/purescript-contrib/purescript-parsing/releases/tag/v6.0.2) - 2021-05-09
@@ -26,12 +44,12 @@ Other improvements:
 ## [v6.0.0](https://github.com/purescript-contrib/purescript-parsing/releases/tag/v6.0.0) - 2021-02-26
 
 Breaking changes:
-- Improved performance of `string` and update `StringLike` to have `stripPrefix` as a class member instead of `indexOf` (#93) 
+- Improved performance of `string` and update `StringLike` to have `stripPrefix` as a class member instead of `indexOf` (#93)
 - Non-empty combinators now return `NonEmptyList` (#102)
 - Added support for PureScript 0.14 and dropped support for all previous versions (#101)
 
 New features:
-- Derived `Generic` instance of Position (#87) 
+- Derived `Generic` instance of Position (#87)
 
 Bugfixes:
 
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index f9c4c59..768c507 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -3,3 +3,16 @@
 Thanks for your interest in contributing to `parsing`! We welcome new contributions regardless of your level of experience or familiarity with PureScript.
 
 Every library in the Contributors organization shares a simple handbook that helps new contributors get started. With that in mind, please [read the short contributing guide on purescript-contrib/governance](https://github.com/purescript-contrib/governance/blob/main/contributing.md) before contributing to this library.
+
+# Development
+
+This package includes a `spago-dev.dhall` which provides the dependencies
+for development and testing.
+
+## Testing
+
+To run the test suite:
+
+```
+spago -x spago-dev.dhall test
+```
\ No newline at end of file
diff --git a/bench/Main.purs b/bench/Main.purs
new file mode 100644
index 0000000..d5d5bf2
--- /dev/null
+++ b/bench/Main.purs
@@ -0,0 +1,134 @@
+-- | # Benchmarking
+-- |
+-- |     spago -x spago-dev.dhall run --main Bench.Main
+-- |
+-- | This benchmark suite is intended to guide changes to this package so that
+-- | we can compare the benchmarks of different commits.
+-- |
+-- | This benchmark suite also compares parsers to equivalent Regex. This
+-- | provides an answer to the common question “How much slower is this package
+-- | than Regex?” Answer: approximately 100×. The Regex benchmarks also give
+-- | us a rough way to calibrate benchmarks run on different platforms.
+-- |
+-- | # Profiling
+-- |
+-- | https://nodejs.org/en/docs/guides/simple-profiling/
+-- | https://nodesource.com/blog/diagnostics-in-NodeJS-2
+-- |
+-- |     spago -x spago-dev.dhall build --source-maps
+-- |     purs bundle output/**/*.js --source-maps --output ./index.bundle.js
+-- |
+-- |
+-- |     spago -x spago-dev.dhall build --source-maps --purs-args '--codegen corefn,sourcemaps'
+-- |     zephyr Bench.Main.main --codegen sourcemaps,js
+-- |     purs bundle dce-output/**/*.js --source-maps --module Bench.Main --main Bench.Main --output ./index.dce.bundle.js
+-- |     node index.dce.bundle.js
+-- |
+-- |     spago -x spago-dev.dhall build --source-maps --purs-args '--codegen corefn,sourcemaps'
+-- |     purs bundle output/**/*.js --source-maps --module Bench.Main --main Bench.Main --output ./index.bundle.js
+-- |     node index.bundle.js
+-- |     node --prof --enable-source-maps ./index.bundle.js
+-- |     node --prof-process --source-map ./index.bundle.js.map isolate--.log > prof.txt
+-- |
+-- |     node --prof --enable-source-maps -e 'require("./output/Bench.Main/index.js").main()'
+-- |     node --prof-process isolate--.log
+-- |
+-- |     spago -x spago-dev.dhall build
+-- |     node --prof -e 'require("./output/Bench.Main/index.js").main()'
+-- |     node --prof-process isolate--.log > prof.txt
+
+module Bench.Main where
+
+import Prelude
+
+import Data.Array (fold, replicate)
+import Data.Either (either)
+import Data.List (manyRec)
+import Data.List.Types (List)
+import Data.String.Regex (Regex, regex)
+import Data.String.Regex as Regex
+import Data.String.Regex.Flags (RegexFlags(..))
+import Effect (Effect)
+import Effect.Console (log)
+import Effect.Exception (throw)
+import Effect.Unsafe (unsafePerformEffect)
+import Performance.Minibench (benchWith)
+import Text.Parsing.Parser (Parser, runParser)
+import Text.Parsing.Parser.String (string)
+import Text.Parsing.Parser.Token (digit)
+import Text.Parsing.StringParser as StringParser
+import Text.Parsing.StringParser.CodePoints as StringParser.CodePoints
+import Text.Parsing.StringParser.CodeUnits as StringParser.CodeUnits
+
+string23 :: String
+string23 = "23"
+string23_2 :: String
+string23_2 = fold $ replicate 2 string23
+string23_10000 :: String
+string23_10000 = fold $ replicate 10000 string23
+
+stringSkidoo :: String
+stringSkidoo = "skidoo"
+stringSkidoo_2 :: String
+stringSkidoo_2 = fold $ replicate 2 stringSkidoo
+stringSkidoo_10000 :: String
+stringSkidoo_10000 = fold $ replicate 10000 stringSkidoo
+
+parse23 :: Parser String (List Char)
+parse23 = manyRec digit
+
+parse23Points :: StringParser.Parser (List Char)
+parse23Points = manyRec StringParser.CodePoints.anyDigit
+
+parse23Units :: StringParser.Parser (List Char)
+parse23Units = manyRec StringParser.CodeUnits.anyDigit
+
+pattern23 :: Regex
+pattern23 = either (unsafePerformEffect <<< throw) identity $
+  regex "\\d" $ RegexFlags
+    { dotAll: true
+    , global: true
+    , ignoreCase: false
+    , multiline: true
+    , sticky: false
+    , unicode: true
+    }
+
+parseSkidoo :: Parser String (List String)
+parseSkidoo = manyRec $ string "skidoo"
+
+patternSkidoo :: Regex
+patternSkidoo = either (unsafePerformEffect <<< throw) identity $
+  regex "skidoo" $ RegexFlags
+    { dotAll: true
+    , global: true
+    , ignoreCase: false
+    , multiline: true
+    , sticky: false
+    , unicode: true
+    }
+
+main :: Effect Unit
+main = do
+  -- log $ show $ runParser string23_2 parse23
+  -- log $ show $ Regex.match pattern23 string23_2
+  -- log $ show $ runParser stringSkidoo_2 parseSkidoo
+  -- log $ show $ Regex.match patternSkidoo stringSkidoo_2
+  log "runParser parse23"
+  benchWith 200
+    $ \_ -> runParser string23_10000 parse23
+  log "StringParser.runParser parse23Points"
+  benchWith 20
+    $ \_ -> StringParser.runParser parse23Points string23_10000
+  log "StringParser.runParser parse23Units"
+  benchWith 200
+    $ \_ -> StringParser.runParser parse23Units string23_10000
+  log "Regex.match pattern23"
+  benchWith 200
+    $ \_ -> Regex.match pattern23 string23_10000
+  log "runParser parseSkidoo"
+  benchWith 200
+    $ \_ -> runParser stringSkidoo_10000 parseSkidoo
+  log "Regex.match patternSkidoo"
+  benchWith 200
+    $ \_ -> Regex.match patternSkidoo stringSkidoo_10000
diff --git a/spago-dev.dhall b/spago-dev.dhall
new file mode 100644
index 0000000..9bbec6e
--- /dev/null
+++ b/spago-dev.dhall
@@ -0,0 +1,22 @@
+-- Spago configuration for testing, benchmarking, development.
+--
+-- See:
+-- * ./CONTRIBUTING.md
+-- * https://github.com/purescript/spago#devdependencies-testdependencies-or-in-general-a-situation-with-many-configurations
+--
+
+let conf = ./spago.dhall
+
+in conf //
+{ sources = [ "src/**/*.purs", "test/**/*.purs", "bench/**/*.purs" ]
+, dependencies = conf.dependencies #
+  [ "assert"
+  , "console"
+  , "effect"
+  , "psci-support"
+  , "minibench"
+  , "exceptions"
+  , "string-parsers"
+  ]
+, packages = ./packages.dhall
+}
diff --git a/spago.dhall b/spago.dhall
index 139bc69..fe22dd2 100644
--- a/spago.dhall
+++ b/spago.dhall
@@ -1,10 +1,7 @@
 { name = "parsing"
 , dependencies =
   [ "arrays"
-  , "assert"
-  , "console"
   , "control"
-  , "effect"
   , "either"
   , "foldable-traversable"
   , "identity"
@@ -14,13 +11,13 @@
   , "maybe"
   , "newtype"
   , "prelude"
-  , "psci-support"
   , "strings"
   , "tailrec"
   , "transformers"
   , "tuples"
   , "unicode"
+  , "unsafe-coerce"
   ]
 , packages = ./packages.dhall
-, sources = [ "src/**/*.purs", "test/**/*.purs" ]
+, sources = [ "src/**/*.purs" ]
 }
diff --git a/src/Text/Parsing/Parser/Language.purs b/src/Text/Parsing/Parser/Language.purs
index c6d71e2..cc59758 100644
--- a/src/Text/Parsing/Parser/Language.purs
+++ b/src/Text/Parsing/Parser/Language.purs
@@ -13,7 +13,7 @@ import Prelude
 import Control.Alt ((<|>))
 import Text.Parsing.Parser (ParserT)
 import Text.Parsing.Parser.String (char, oneOf)
-import Text.Parsing.Parser.Token (LanguageDef, TokenParser, GenLanguageDef(..), unGenLanguageDef, makeTokenParser, alphaNum, letter)
+import Text.Parsing.Parser.Token (GenLanguageDef(..), LanguageDef, TokenParser, alphaNum, letter, makeTokenParser, unGenLanguageDef)
 
 -----------------------------------------------------------
 -- Styles: haskellStyle, javaStyle
diff --git a/src/Text/Parsing/Parser/Pos.purs b/src/Text/Parsing/Parser/Pos.purs
index 6a20dff..0421773 100644
--- a/src/Text/Parsing/Parser/Pos.purs
+++ b/src/Text/Parsing/Parser/Pos.purs
@@ -1,10 +1,8 @@
 module Text.Parsing.Parser.Pos where
 
 import Prelude
+
 import Data.Generic.Rep (class Generic)
-import Data.Foldable (foldl)
-import Data.Newtype (wrap)
-import Data.String (split)
 
 -- | `Position` represents the position of the parser in the input.
 -- |
@@ -27,13 +25,3 @@ derive instance ordPosition :: Ord Position
 -- | The `Position` before any input has been parsed.
 initialPos :: Position
 initialPos = Position { line: 1, column: 1 }
-
--- | Updates a `Position` by adding the columns and lines in `String`.
-updatePosString :: Position -> String -> Position
-updatePosString pos' str = foldl updatePosChar pos' (split (wrap "") str)
-  where
-  updatePosChar (Position pos) c = case c of
-    "\n" -> Position { line: pos.line + 1, column: 1 }
-    "\r" -> Position { line: pos.line + 1, column: 1 }
-    "\t" -> Position { line: pos.line,     column: pos.column + 8 - ((pos.column - 1) `mod` 8) }
-    _    -> Position { line: pos.line,     column: pos.column + 1 }
diff --git a/src/Text/Parsing/Parser/String.purs b/src/Text/Parsing/Parser/String.purs
index a5deb01..dd303f2 100644
--- a/src/Text/Parsing/Parser/String.purs
+++ b/src/Text/Parsing/Parser/String.purs
@@ -1,92 +1,160 @@
 -- | Primitive parsers for working with an input stream of type `String`.
-
-module Text.Parsing.Parser.String where
+-- |
+-- | The behavior of these primitive parsers is based on the behavior of the
+-- | `Data.String` module in the __strings__ package.
+-- | In most JavaScript runtime environments, the `String`
+-- | is little-endian [UTF-16](https://en.wikipedia.org/wiki/UTF-16).
+-- |
+-- | The primitive parsers which return `Char` will only succeed when the character
+-- | being parsed is a code point in the
+-- | [Basic Multilingual Plane](https://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane)
+-- | (the “BMP”). These parsers can be convenient because of the good support
+-- | that PureScript has for writing `Char` literals like `'あ'`, `'β'`, `'C'`.
+-- |
+-- | The other primitive parsers, which return `CodePoint` and `String` types,
+-- | can parse the full Unicode character set. All of the primitive parsers
+-- | in this module can be used together.
+module Text.Parsing.Parser.String
+( string
+, eof
+, anyChar
+, anyCodePoint
+, satisfy
+, satisfyCodePoint
+, char
+, whiteSpace
+, skipSpaces
+, oneOf
+, noneOf
+, match
+)
+where
 
 import Prelude hiding (between)
 
-import Control.Monad.State (gets, modify_)
-import Data.Array (many)
-import Data.Foldable (elem, notElem)
+import Control.Monad.State (get, put)
+import Data.Array (notElem)
+import Data.Char (fromCharCode)
+import Data.CodePoint.Unicode (isSpace)
+import Data.Foldable (elem)
 import Data.Maybe (Maybe(..))
-import Data.Newtype (wrap)
-import Data.String (Pattern)
-import Data.String as S
+import Data.String (CodePoint, Pattern(..), null, stripPrefix, uncons)
 import Data.String.CodeUnits as SCU
+import Data.Tuple (Tuple(..), fst)
 import Text.Parsing.Parser (ParseState(..), ParserT, fail)
-import Text.Parsing.Parser.Combinators (tryRethrow, (<?>))
-import Text.Parsing.Parser.Pos (updatePosString)
-
--- | This class exists to abstract over streams which support the string-like
--- | operations which this modules needs.
-class StringLike s where
-  drop :: Int -> s -> s
-  stripPrefix :: Pattern -> s -> Maybe s
-  null :: s -> Boolean
-  uncons :: s -> Maybe { head :: Char, tail :: s }
-
-instance stringLikeString :: StringLike String where
-  uncons = SCU.uncons
-  drop = S.drop
-  stripPrefix = S.stripPrefix
-  null = S.null
+import Text.Parsing.Parser.Combinators (skipMany, tryRethrow, (<?>))
+import Text.Parsing.Parser.Pos (Position(..))
+import Unsafe.Coerce (unsafeCoerce)
 
 -- | Match end-of-file.
-eof :: forall s m. StringLike s => Monad m => ParserT s m Unit
+eof :: forall m. Monad m => ParserT String m Unit
 eof = do
-  input <- gets \(ParseState input _ _) -> input
+  ParseState input _ _ <- get
   unless (null input) (fail "Expected EOF")
 
 -- | Match the specified string.
-string :: forall s m. StringLike s => Monad m => String -> ParserT s m String
+string :: forall m. Monad m => String -> ParserT String m String
 string str = do
-  input <- gets \(ParseState input _ _) -> input
-  case stripPrefix (wrap str) input of
+  ParseState input position _ <- get
+  case stripPrefix (Pattern str) input of
     Just remainder -> do
-      modify_ \(ParseState _ position _) ->
-        ParseState remainder
-                   (updatePosString position str)
-                   true
+      put $ ParseState remainder (updatePosString position str) true
       pure str
     _ -> fail ("Expected " <> show str)
 
--- | Match any character.
-anyChar :: forall s m. StringLike s => Monad m => ParserT s m Char
-anyChar = do
-  input <- gets \(ParseState input _ _) -> input
+-- | Match any BMP `Char`.
+-- | Parser will fail if the character is not in the Basic Multilingual Plane.
+anyChar :: forall m. Monad m => ParserT String m Char
+anyChar = tryRethrow do
+  cp :: Int <- unCodePoint <$> anyCodePoint
+  -- the `fromCharCode` function doesn't check if this is beyond the
+  -- BMP, so we check that ourselves.
+  -- https://github.com/purescript/purescript-strings/issues/153
+  if cp > 65535 -- BMP
+    then fail "Not a Char"
+  else case fromCharCode cp of
+    Nothing -> fail "Not a Char"
+    Just c -> pure c
+
+-- | Match any Unicode character.
+-- | Always succeeds.
+anyCodePoint :: forall m. Monad m => ParserT String m CodePoint
+anyCodePoint = do
+  ParseState input position _ <- get
   case uncons input of
     Nothing -> fail "Unexpected EOF"
     Just { head, tail } -> do
-      modify_ \(ParseState _ position _) ->
-        ParseState tail
-                   (updatePosString position (SCU.singleton head))
-                   true
+      put $ ParseState tail (updatePosSingle position head) true
       pure head
 
--- | Match a character satisfying the specified predicate.
-satisfy :: forall s m. StringLike s => Monad m => (Char -> Boolean) -> ParserT s m Char
+-- | Match a BMP `Char` satisfying the predicate.
+satisfy :: forall m. Monad m => (Char -> Boolean) -> ParserT String m Char
 satisfy f = tryRethrow do
   c <- anyChar
-  if f c then pure c
-         else fail $ "Character '" <> SCU.singleton c <> "' did not satisfy predicate"
+  if f c
+    then pure c
+    else fail "Predicate unsatisfied"
 
--- | Match the specified character
-char :: forall s m. StringLike s => Monad m => Char -> ParserT s m Char
+-- | Match a Unicode character satisfying the predicate.
+satisfyCodePoint :: forall m. Monad m => (CodePoint -> Boolean) -> ParserT String m CodePoint
+satisfyCodePoint f = tryRethrow do
+  c <- anyCodePoint
+  if f c
+    then pure c
+    else fail "Predicate unsatisfied"
+
+-- | Match the specified BMP `Char`.
+char :: forall m. Monad m => Char -> ParserT String m Char
 char c = satisfy (_ == c) <?> show c
 
--- | Match zero or more whitespace characters.
-whiteSpace :: forall s m. StringLike s => Monad m => ParserT s m String
-whiteSpace = do
-  cs <- many $ satisfy \c -> c == '\n' || c == '\r' || c == ' ' || c == '\t'
-  pure $ SCU.fromCharArray cs
+-- | Match zero or more whitespace characters satisfying
+-- | `Data.CodePoint.Unicode.isSpace`.
+whiteSpace :: forall m. Monad m => ParserT String m String
+whiteSpace = fst <$> match skipSpaces
 
 -- | Skip whitespace characters.
-skipSpaces :: forall s m. StringLike s => Monad m => ParserT s m Unit
-skipSpaces = void whiteSpace
+skipSpaces :: forall m. Monad m => ParserT String m Unit
+skipSpaces = skipMany (satisfyCodePoint isSpace)
 
--- | Match one of the characters in the array.
-oneOf :: forall s m. StringLike s => Monad m => Array Char -> ParserT s m Char
+-- | Match one of the BMP `Char`s in the array.
+oneOf :: forall m. Monad m => Array Char -> ParserT String m Char
 oneOf ss = satisfy (flip elem ss) <?> ("one of " <> show ss)
 
--- | Match any character not in the array.
-noneOf :: forall s m. StringLike s => Monad m => Array Char -> ParserT s m Char
+-- | Match any BMP `Char` not in the array.
+noneOf :: forall m. Monad m => Array Char -> ParserT String m Char
 noneOf ss = satisfy (flip notElem ss) <?> ("none of " <> show ss)
+
+-- | Updates a `Position` by adding the columns and lines in `String`.
+updatePosString :: Position -> String -> Position
+updatePosString pos str = case uncons str of
+  Nothing -> pos
+  Just {head,tail} -> updatePosString (updatePosSingle pos head) tail -- tail recursive
+
+-- | Updates a `Position` by adding the columns and lines in a
+-- | single `CodePoint`.
+updatePosSingle :: Position -> CodePoint -> Position
+updatePosSingle (Position {line,column}) cp = case unCodePoint cp of
+  10 -> Position { line: line + 1, column: 1 } -- "\n"
+  13 -> Position { line: line + 1, column: 1 } -- "\r"
+  9  -> Position { line, column: column + 8 - ((column - 1) `mod` 8) } -- "\t" Who says that one tab is 8 columns?
+  _  -> Position { line, column: column + 1 }
+
+-- | Combinator which returns both the result of a parse and the portion of
+-- | the input that was consumed while it was being parsed.
+match :: forall m a. Monad m => ParserT String m a -> ParserT String m (Tuple String a)
+match p = do
+  ParseState input1 _ _ <- get
+  x <- p
+  ParseState input2 _ _ <- get
+  -- We use the `SCU.length`, which is in units of “code units”
+  -- instead of `Data.String.length`. which is in units of “code points”.
+  -- This is more efficient, and it will be correct as long as we can assume
+  -- the invariant that the `ParseState input` always begins on a code point
+  -- boundary.
+  pure $ Tuple (SCU.take (SCU.length input1 - SCU.length input2) input1) x
+
+-- | The CodePoint newtype constructor is not exported, so here's a helper.
+-- | This will break at runtime if the definition of CodePoint ever changes
+-- | to something other than `newtype CodePoint = CodePoint Int`.
+unCodePoint :: CodePoint -> Int
+unCodePoint = unsafeCoerce
\ No newline at end of file
diff --git a/src/Text/Parsing/Parser/Token.purs b/src/Text/Parsing/Parser/Token.purs
index 970af21..0e852fc 100644
--- a/src/Text/Parsing/Parser/Token.purs
+++ b/src/Text/Parsing/Parser/Token.purs
@@ -21,17 +21,14 @@ module Text.Parsing.Parser.Token
   )
     where
 
-import Prelude hiding (when,between)
+import Prelude hiding (when, between)
 
 import Control.Lazy (fix)
 import Control.Monad.State (gets, modify_)
 import Control.MonadPlus (guard, (<|>))
 import Data.Array as Array
-import Data.String.CodeUnits (toChar, singleton) as CodeUnits
-import Data.String.CodePoints (CodePoint, codePointFromChar)
 import Data.Char (fromCharCode, toCharCode)
-import Data.CodePoint.Unicode (isAlpha, isAlphaNum, isDecDigit, isHexDigit, isOctDigit, isSpace, isUpper, hexDigitToInt)
-import Data.String.Unicode as Unicode
+import Data.CodePoint.Unicode (hexDigitToInt, isAlpha, isAlphaNum, isDecDigit, isHexDigit, isOctDigit, isSpace, isUpper)
 import Data.Either (Either(..))
 import Data.Foldable (foldl, foldr)
 import Data.Identity (Identity)
@@ -40,14 +37,17 @@ import Data.List (List(..))
 import Data.List as List
 import Data.List.NonEmpty (NonEmptyList)
 import Data.Maybe (Maybe(..), maybe)
-import Data.String (null, toLower)
+import Data.String (CodePoint, null, toLower)
+import Data.String.CodePoints (codePointFromChar)
+import Data.String.CodeUnits (toChar, singleton) as CodeUnits
 import Data.String.CodeUnits as SCU
+import Data.String.Unicode as Unicode
 import Data.Tuple (Tuple(..))
 import Math (pow)
 import Text.Parsing.Parser (ParseState(..), ParserT, fail)
 import Text.Parsing.Parser.Combinators (skipMany1, try, tryRethrow, skipMany, notFollowedBy, option, choice, between, sepBy1, sepBy, (<?>), (<??>))
 import Text.Parsing.Parser.Pos (Position)
-import Text.Parsing.Parser.String (satisfy, oneOf, noneOf, string, char)
+import Text.Parsing.Parser.String (char, noneOf, oneOf, satisfy, satisfyCodePoint, string)
 
 -- | Create a parser which Returns the first token in the stream.
 token :: forall m a. Monad m => (a -> Position) -> ParserT (List a) m a
@@ -746,7 +746,7 @@ whiteSpace' langDef@(LanguageDef languageDef)
         skipMany (simpleSpace <|> oneLineComment langDef <|> multiLineComment langDef <?> "")
 
 simpleSpace :: forall m . Monad m => ParserT String m Unit
-simpleSpace = skipMany1 (satisfyCP isSpace)
+simpleSpace = skipMany1 (satisfyCodePoint isSpace)
 
 oneLineComment :: forall m . Monad m => GenLanguageDef String m -> ParserT String m Unit
 oneLineComment (LanguageDef languageDef) =
diff --git a/test/Main.purs b/test/Main.purs
index 7c3ed2d..a581f3e 100644
--- a/test/Main.purs
+++ b/test/Main.purs
@@ -1,6 +1,6 @@
 module Test.Main where
 
-import Prelude hiding (between,when)
+import Prelude hiding (between, when)
 
 import Control.Alt ((<|>))
 import Control.Lazy (fix)
@@ -9,7 +9,9 @@ import Data.Either (Either(..))
 import Data.List (List(..), fromFoldable, many)
 import Data.List.NonEmpty (cons, cons')
 import Data.Maybe (Maybe(..))
+import Data.String.CodePoints as SCP
 import Data.String.CodeUnits (fromCharArray, singleton)
+import Data.String.CodeUnits as SCU
 import Data.Tuple (Tuple(..))
 import Effect (Effect)
 import Effect.Console (logShow)
@@ -19,8 +21,8 @@ import Text.Parsing.Parser.Combinators (endBy1, sepBy1, optionMaybe, try, chainl
 import Text.Parsing.Parser.Expr (Assoc(..), Operator(..), buildExprParser)
 import Text.Parsing.Parser.Language (javaStyle, haskellStyle, haskellDef)
 import Text.Parsing.Parser.Pos (Position(..), initialPos)
-import Text.Parsing.Parser.String (eof, string, char, satisfy, anyChar)
-import Text.Parsing.Parser.Token (TokenParser, match, when, token, makeTokenParser)
+import Text.Parsing.Parser.String (anyChar, anyCodePoint, char, eof, satisfy, string, whiteSpace)
+import Text.Parsing.Parser.Token (TokenParser, letter, makeTokenParser, match, token, when)
 
 parens :: forall m a. Monad m => ParserT String m a -> ParserT String m a
 parens = between (string "(") (string ")")
@@ -39,7 +41,7 @@ parseTest input expected p = case runParser input p of
 
 parseErrorTestPosition :: forall s a. Show a => Parser s a -> s -> Position -> Effect Unit
 parseErrorTestPosition p input expected = case runParser input p of
-  Right _ -> assert' "error: ParseError expected!" false
+  Right x -> assert' ("ParseError expected at " <> show expected <> " but parsed " <> show x) false
   Left err -> do
     let pos = parseErrorPosition err
     assert' ("expected: " <> show expected <> ", pos: " <> show pos) (expected == pos)
@@ -448,6 +450,24 @@ main = do
   parseTest "1*2+3/4-5" (-3) exprTest
   parseTest "ab?" "ab" manySatisfyTest
 
+  parseErrorTestPosition
+    anyChar
+    "𝅘𝅥𝅯"
+    (Position {column:1,line:1})
+
+  parseTest "𝅘𝅥𝅘𝅥𝅮x𝅘𝅥𝅯" ["𝅘𝅥", "𝅘𝅥𝅮", "x", "𝅘𝅥𝅯"] do
+    quarter <- anyCodePoint
+    eighth <- (singleton <$> char 'x') <|> string "𝅘𝅥𝅮"
+    letterx <- string "𝅘𝅥𝅯" <|> string "x"
+    sixteenth <- string "𝅘𝅥𝅯" <|> (singleton <$> char 'x')
+    pure $ [SCP.singleton quarter, eighth, letterx, sixteenth]
+
+  parseTest "aa  bb" ["aa", "  ", "bb"] do
+    aa <- SCU.fromCharArray <$> some letter
+    w <- whiteSpace
+    bb <- SCU.fromCharArray <$> some letter
+    pure [aa, w, bb]
+
   let tokpos = const initialPos
   parseTest (fromFoldable [A, B]) A (token tokpos)
   parseTest (fromFoldable [B, A]) B (token tokpos)