haskell · 23Skidoo · Dec 4, 2017 · Dec 3, 2017 · Dec 3, 2017
diff --git a/Cabal/Cabal.cabal b/Cabal/Cabal.cabal
@@ -389,13 +389,15 @@ test-suite unit-tests
     UnitTests.Distribution.Simple.Program.Internal
     UnitTests.Distribution.Simple.Utils
     UnitTests.Distribution.System
+    UnitTests.Distribution.Utils.Generic
     UnitTests.Distribution.Utils.NubList
     UnitTests.Distribution.Utils.ShortText
     UnitTests.Distribution.Version
   main-is: UnitTests.hs
   build-depends:
     array,
     base,
+    bytestring,
     containers,
     directory,
     filepath,
@@ -404,6 +406,7 @@ test-suite unit-tests
     tasty-hunit,
     tasty-quickcheck,
     tagged,
+    text,
     pretty,
     QuickCheck >= 2.7 && < 2.11,
     Cabal

diff --git a/Cabal/Distribution/Utils/Generic.hs b/Cabal/Distribution/Utils/Generic.hs
@@ -163,15 +163,29 @@ writeFileAtomic targetPath content = do
 -- * Unicode stuff
 -- ------------------------------------------------------------
 
+-- | Decode 'String' from UTF8-encoded 'BS.ByteString'
+--
+-- Invalid data in the UTF8 stream (this includes code-points @U+D800@
+-- through @U+DFFF@) will be decoded as the replacement character (@U+FFFD@).
+--
 fromUTF8BS :: SBS.ByteString -> String
 fromUTF8BS = decodeStringUtf8 . SBS.unpack
 
+-- | Variant of 'fromUTF8BS' for lazy 'BS.ByteString's
+--
 fromUTF8LBS :: BS.ByteString -> String
 fromUTF8LBS = decodeStringUtf8 . BS.unpack
 
+-- | Encode 'String' to to UTF8-encoded 'SBS.ByteString'
+--
+-- Code-points in the @U+D800@-@U+DFFF@ range will be encoded
+-- as the replacement character (i.e. @U+FFFD@).
+--
 toUTF8BS :: String -> SBS.ByteString
 toUTF8BS = SBS.pack . encodeStringUtf8
 
+-- | Variant of 'toUTF8BS' for lazy 'BS.ByteString's
+--
 toUTF8LBS :: String -> BS.ByteString
 toUTF8LBS = BS.pack . encodeStringUtf8
 

diff --git a/Cabal/Distribution/Utils/String.hs b/Cabal/Distribution/Utils/String.hs
@@ -10,7 +10,8 @@ import Data.Char (chr,ord)
 
 -- | Decode 'String' from UTF8-encoded octets.
 --
--- Invalid data will be decoded as the replacement character (@U+FFFD@)
+-- Invalid data in the UTF8 stream (this includes code-points @U+D800@
+-- through @U+DFFF@) will be decoded as the replacement character (@U+FFFD@).
 --
 -- See also 'encodeStringUtf8'
 decodeStringUtf8 :: [Word8] -> String
@@ -40,9 +41,7 @@ decodeStringUtf8 = go
 
     moreBytes :: Int -> Int -> [Word8] -> Int -> [Char]
     moreBytes 1 overlong cs' acc
-      | overlong <= acc && acc <= 0x10FFFF
-     && (acc < 0xD800 || 0xDFFF < acc)
-     && (acc < 0xFFFE || 0xFFFF < acc)
+      | overlong <= acc, acc <= 0x10FFFF, (acc < 0xD800 || 0xDFFF < acc)
       = chr acc : go cs'
 
       | otherwise
@@ -61,6 +60,9 @@ decodeStringUtf8 = go
 
 -- | Encode 'String' to a list of UTF8-encoded octets
 --
+-- Code-points in the @U+D800@-@U+DFFF@ range will be encoded
+-- as the replacement character (i.e. @U+FFFD@).
+--
 -- See also 'decodeUtf8'
 encodeStringUtf8 :: String -> [Word8]
 encodeStringUtf8 []        = []
@@ -70,6 +72,12 @@ encodeStringUtf8 (c:cs)
   | c <= '\x7FF' = (0xC0 .|.  w8ShiftR  6          )
                  : (0x80 .|. (w8          .&. 0x3F))
                  : encodeStringUtf8 cs
+  | c <= '\xD7FF'= (0xE0 .|.  w8ShiftR 12          )
+                 : (0x80 .|. (w8ShiftR  6 .&. 0x3F))
+                 : (0x80 .|. (w8          .&. 0x3F))
+                 : encodeStringUtf8 cs
+  | c <= '\xDFFF'= 0xEF : 0xBF : 0xBD -- U+FFFD
+                 : encodeStringUtf8 cs
   | c <= '\xFFFF'= (0xE0 .|.  w8ShiftR 12          )
                  : (0x80 .|. (w8ShiftR  6 .&. 0x3F))
                  : (0x80 .|. (w8          .&. 0x3F))

diff --git a/Cabal/tests/UnitTests.hs b/Cabal/tests/UnitTests.hs
@@ -20,6 +20,7 @@ import qualified UnitTests.Distribution.Compat.Graph
 import qualified UnitTests.Distribution.Simple.Program.Internal
 import qualified UnitTests.Distribution.Simple.Utils
 import qualified UnitTests.Distribution.System
+import qualified UnitTests.Distribution.Utils.Generic
 import qualified UnitTests.Distribution.Utils.NubList
 import qualified UnitTests.Distribution.Utils.ShortText
 import qualified UnitTests.Distribution.Version (versionTests)
@@ -44,6 +45,8 @@ tests mtimeChangeCalibrated =
         UnitTests.Distribution.Simple.Program.Internal.tests
     , testGroup "Distribution.Simple.Utils"
         UnitTests.Distribution.Simple.Utils.tests
+    , testGroup "Distribution.Utils.Generic"
+        UnitTests.Distribution.Utils.Generic.tests
     , testGroup "Distribution.Utils.NubList"
         UnitTests.Distribution.Utils.NubList.tests
     , testGroup "Distribution.Utils.ShortText"

diff --git a/Cabal/tests/UnitTests/Distribution/Utils/Generic.hs b/Cabal/tests/UnitTests/Distribution/Utils/Generic.hs
@@ -0,0 +1,66 @@
+{-# LANGUAGE OverloadedStrings #-}
+
+-- to suppress WARNING in "Distribution.Compat.Prelude.Internal"
+{-# OPTIONS_GHC -fno-warn-deprecations #-}
+
+module UnitTests.Distribution.Utils.Generic ( tests ) where
+
+import Prelude ()
+import Distribution.Compat.Prelude.Internal
+
+import Distribution.Utils.Generic
+
+import qualified Data.ByteString.Char8 as BS
+import qualified Data.Text as T
+import qualified Data.Text.Encoding as T
+
+import Test.Tasty
+import Test.Tasty.HUnit
+import Test.Tasty.QuickCheck
+
+tests :: [TestTree]
+tests =
+    [ -- fromUTF8BS / toUTF8BS
+      testCase "fromUTF8BS mempty" testFromUTF8BSEmpty
+    , testCase "toUTF8BS mempty" testToUTF8BSEmpty
+    , testCase "toUTF8BS [U+D800..U+DFFF]" testToUTF8BSSurr
+    , testCase "toUTF8BS [U+0000..U+7F]"   testToUTF8BSAscii
+    , testCase "toUTF8BS [U+0000..U+10FFFF]" testToUTF8BSText
+    , testCase "fromUTF8BS.toUTF8BS [U+0000..U+10FFFF]" testToFromUTF8BS
+
+    , testProperty "fromUTF8BS.toUTF8BS == id" prop_toFromUTF8BS
+    , testProperty "toUTF8BS == encodeUtf8" prop_toUTF8BS
+    ]
+
+testFromUTF8BSEmpty :: Assertion
+testFromUTF8BSEmpty = mempty @=? fromUTF8BS mempty
+
+testToUTF8BSEmpty :: Assertion
+testToUTF8BSEmpty = mempty @=? toUTF8BS mempty
+
+testToUTF8BSSurr :: Assertion
+testToUTF8BSSurr = BS.concat (replicate 2048 u_fffd) @=? toUTF8BS surrogates
+  where
+    surrogates = ['\xD800'..'\xDFFF']
+    u_fffd = "\xEF\xBF\xBD"
+
+testToUTF8BSText :: Assertion
+testToUTF8BSText = T.encodeUtf8 (T.pack txt) @=? toUTF8BS txt
+  where
+    txt = ['\x00'..'\x10FFFF']
+
+testToUTF8BSAscii :: Assertion
+testToUTF8BSAscii = BS.pack txt @=? toUTF8BS txt
+  where
+    txt = ['\x00'..'\x7F']
+
+testToFromUTF8BS :: Assertion
+testToFromUTF8BS = txt @=? (fromUTF8BS . toUTF8BS) txt
+  where
+    txt = ['\x0000'..'\xD7FF'] ++ ['\xE000'..'\x10FFFF']
+
+prop_toFromUTF8BS :: [Char] -> Property
+prop_toFromUTF8BS txt = txt === (fromUTF8BS . toUTF8BS) txt
+
+prop_toUTF8BS :: [Char] -> Property
+prop_toUTF8BS txt = T.encodeUtf8 (T.pack txt) === toUTF8BS txt