diff --git a/Cabal/Cabal.cabal b/Cabal/Cabal.cabal index 362bd6819f6..17b2b7478e7 100644 --- a/Cabal/Cabal.cabal +++ b/Cabal/Cabal.cabal @@ -389,6 +389,7 @@ test-suite unit-tests UnitTests.Distribution.Simple.Program.Internal UnitTests.Distribution.Simple.Utils UnitTests.Distribution.System + UnitTests.Distribution.Utils.Generic UnitTests.Distribution.Utils.NubList UnitTests.Distribution.Utils.ShortText UnitTests.Distribution.Version @@ -396,6 +397,7 @@ test-suite unit-tests build-depends: array, base, + bytestring, containers, directory, filepath, @@ -404,6 +406,7 @@ test-suite unit-tests tasty-hunit, tasty-quickcheck, tagged, + text, pretty, QuickCheck >= 2.7 && < 2.11, Cabal diff --git a/Cabal/Distribution/Utils/Generic.hs b/Cabal/Distribution/Utils/Generic.hs index 322d528fec8..f77e2bb892f 100644 --- a/Cabal/Distribution/Utils/Generic.hs +++ b/Cabal/Distribution/Utils/Generic.hs @@ -163,15 +163,29 @@ writeFileAtomic targetPath content = do -- * Unicode stuff -- ------------------------------------------------------------ +-- | Decode 'String' from UTF8-encoded 'BS.ByteString' +-- +-- Invalid data in the UTF8 stream (this includes code-points @U+D800@ +-- through @U+DFFF@) will be decoded as the replacement character (@U+FFFD@). +-- fromUTF8BS :: SBS.ByteString -> String fromUTF8BS = decodeStringUtf8 . SBS.unpack +-- | Variant of 'fromUTF8BS' for lazy 'BS.ByteString's +-- fromUTF8LBS :: BS.ByteString -> String fromUTF8LBS = decodeStringUtf8 . BS.unpack +-- | Encode 'String' to to UTF8-encoded 'SBS.ByteString' +-- +-- Code-points in the @U+D800@-@U+DFFF@ range will be encoded +-- as the replacement character (i.e. @U+FFFD@). +-- toUTF8BS :: String -> SBS.ByteString toUTF8BS = SBS.pack . encodeStringUtf8 +-- | Variant of 'toUTF8BS' for lazy 'BS.ByteString's +-- toUTF8LBS :: String -> BS.ByteString toUTF8LBS = BS.pack . encodeStringUtf8 diff --git a/Cabal/Distribution/Utils/String.hs b/Cabal/Distribution/Utils/String.hs index 46637bd6d50..d02e769f6cf 100644 --- a/Cabal/Distribution/Utils/String.hs +++ b/Cabal/Distribution/Utils/String.hs @@ -10,7 +10,8 @@ import Data.Char (chr,ord) -- | Decode 'String' from UTF8-encoded octets. -- --- Invalid data will be decoded as the replacement character (@U+FFFD@) +-- Invalid data in the UTF8 stream (this includes code-points @U+D800@ +-- through @U+DFFF@) will be decoded as the replacement character (@U+FFFD@). -- -- See also 'encodeStringUtf8' decodeStringUtf8 :: [Word8] -> String @@ -40,9 +41,7 @@ decodeStringUtf8 = go moreBytes :: Int -> Int -> [Word8] -> Int -> [Char] moreBytes 1 overlong cs' acc - | overlong <= acc && acc <= 0x10FFFF - && (acc < 0xD800 || 0xDFFF < acc) - && (acc < 0xFFFE || 0xFFFF < acc) + | overlong <= acc, acc <= 0x10FFFF, (acc < 0xD800 || 0xDFFF < acc) = chr acc : go cs' | otherwise @@ -61,6 +60,9 @@ decodeStringUtf8 = go -- | Encode 'String' to a list of UTF8-encoded octets -- +-- Code-points in the @U+D800@-@U+DFFF@ range will be encoded +-- as the replacement character (i.e. @U+FFFD@). +-- -- See also 'decodeUtf8' encodeStringUtf8 :: String -> [Word8] encodeStringUtf8 [] = [] @@ -70,6 +72,12 @@ encodeStringUtf8 (c:cs) | c <= '\x7FF' = (0xC0 .|. w8ShiftR 6 ) : (0x80 .|. (w8 .&. 0x3F)) : encodeStringUtf8 cs + | c <= '\xD7FF'= (0xE0 .|. w8ShiftR 12 ) + : (0x80 .|. (w8ShiftR 6 .&. 0x3F)) + : (0x80 .|. (w8 .&. 0x3F)) + : encodeStringUtf8 cs + | c <= '\xDFFF'= 0xEF : 0xBF : 0xBD -- U+FFFD + : encodeStringUtf8 cs | c <= '\xFFFF'= (0xE0 .|. w8ShiftR 12 ) : (0x80 .|. (w8ShiftR 6 .&. 0x3F)) : (0x80 .|. (w8 .&. 0x3F)) diff --git a/Cabal/tests/UnitTests.hs b/Cabal/tests/UnitTests.hs index 08cba874ec4..48bf6825dd0 100644 --- a/Cabal/tests/UnitTests.hs +++ b/Cabal/tests/UnitTests.hs @@ -20,6 +20,7 @@ import qualified UnitTests.Distribution.Compat.Graph import qualified UnitTests.Distribution.Simple.Program.Internal import qualified UnitTests.Distribution.Simple.Utils import qualified UnitTests.Distribution.System +import qualified UnitTests.Distribution.Utils.Generic import qualified UnitTests.Distribution.Utils.NubList import qualified UnitTests.Distribution.Utils.ShortText import qualified UnitTests.Distribution.Version (versionTests) @@ -44,6 +45,8 @@ tests mtimeChangeCalibrated = UnitTests.Distribution.Simple.Program.Internal.tests , testGroup "Distribution.Simple.Utils" UnitTests.Distribution.Simple.Utils.tests + , testGroup "Distribution.Utils.Generic" + UnitTests.Distribution.Utils.Generic.tests , testGroup "Distribution.Utils.NubList" UnitTests.Distribution.Utils.NubList.tests , testGroup "Distribution.Utils.ShortText" diff --git a/Cabal/tests/UnitTests/Distribution/Utils/Generic.hs b/Cabal/tests/UnitTests/Distribution/Utils/Generic.hs new file mode 100644 index 00000000000..a4a32149518 --- /dev/null +++ b/Cabal/tests/UnitTests/Distribution/Utils/Generic.hs @@ -0,0 +1,66 @@ +{-# LANGUAGE OverloadedStrings #-} + +-- to suppress WARNING in "Distribution.Compat.Prelude.Internal" +{-# OPTIONS_GHC -fno-warn-deprecations #-} + +module UnitTests.Distribution.Utils.Generic ( tests ) where + +import Prelude () +import Distribution.Compat.Prelude.Internal + +import Distribution.Utils.Generic + +import qualified Data.ByteString.Char8 as BS +import qualified Data.Text as T +import qualified Data.Text.Encoding as T + +import Test.Tasty +import Test.Tasty.HUnit +import Test.Tasty.QuickCheck + +tests :: [TestTree] +tests = + [ -- fromUTF8BS / toUTF8BS + testCase "fromUTF8BS mempty" testFromUTF8BSEmpty + , testCase "toUTF8BS mempty" testToUTF8BSEmpty + , testCase "toUTF8BS [U+D800..U+DFFF]" testToUTF8BSSurr + , testCase "toUTF8BS [U+0000..U+7F]" testToUTF8BSAscii + , testCase "toUTF8BS [U+0000..U+10FFFF]" testToUTF8BSText + , testCase "fromUTF8BS.toUTF8BS [U+0000..U+10FFFF]" testToFromUTF8BS + + , testProperty "fromUTF8BS.toUTF8BS == id" prop_toFromUTF8BS + , testProperty "toUTF8BS == encodeUtf8" prop_toUTF8BS + ] + +testFromUTF8BSEmpty :: Assertion +testFromUTF8BSEmpty = mempty @=? fromUTF8BS mempty + +testToUTF8BSEmpty :: Assertion +testToUTF8BSEmpty = mempty @=? toUTF8BS mempty + +testToUTF8BSSurr :: Assertion +testToUTF8BSSurr = BS.concat (replicate 2048 u_fffd) @=? toUTF8BS surrogates + where + surrogates = ['\xD800'..'\xDFFF'] + u_fffd = "\xEF\xBF\xBD" + +testToUTF8BSText :: Assertion +testToUTF8BSText = T.encodeUtf8 (T.pack txt) @=? toUTF8BS txt + where + txt = ['\x00'..'\x10FFFF'] + +testToUTF8BSAscii :: Assertion +testToUTF8BSAscii = BS.pack txt @=? toUTF8BS txt + where + txt = ['\x00'..'\x7F'] + +testToFromUTF8BS :: Assertion +testToFromUTF8BS = txt @=? (fromUTF8BS . toUTF8BS) txt + where + txt = ['\x0000'..'\xD7FF'] ++ ['\xE000'..'\x10FFFF'] + +prop_toFromUTF8BS :: [Char] -> Property +prop_toFromUTF8BS txt = txt === (fromUTF8BS . toUTF8BS) txt + +prop_toUTF8BS :: [Char] -> Property +prop_toUTF8BS txt = T.encodeUtf8 (T.pack txt) === toUTF8BS txt