Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Cabal/Cabal.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -389,13 +389,15 @@ test-suite unit-tests
UnitTests.Distribution.Simple.Program.Internal
UnitTests.Distribution.Simple.Utils
UnitTests.Distribution.System
UnitTests.Distribution.Utils.Generic
UnitTests.Distribution.Utils.NubList
UnitTests.Distribution.Utils.ShortText
UnitTests.Distribution.Version
main-is: UnitTests.hs
build-depends:
array,
base,
bytestring,
containers,
directory,
filepath,
Expand All @@ -404,6 +406,7 @@ test-suite unit-tests
tasty-hunit,
tasty-quickcheck,
tagged,
text,
pretty,
QuickCheck >= 2.7 && < 2.11,
Cabal
Expand Down
14 changes: 14 additions & 0 deletions Cabal/Distribution/Utils/Generic.hs
Original file line number Diff line number Diff line change
Expand Up @@ -163,15 +163,29 @@ writeFileAtomic targetPath content = do
-- * Unicode stuff
-- ------------------------------------------------------------

-- | Decode 'String' from UTF8-encoded 'BS.ByteString'
--
-- Invalid data in the UTF8 stream (this includes code-points @U+D800@
-- through @U+DFFF@) will be decoded as the replacement character (@U+FFFD@).
--
fromUTF8BS :: SBS.ByteString -> String
fromUTF8BS = decodeStringUtf8 . SBS.unpack

-- | Variant of 'fromUTF8BS' for lazy 'BS.ByteString's
--
fromUTF8LBS :: BS.ByteString -> String
fromUTF8LBS = decodeStringUtf8 . BS.unpack

-- | Encode 'String' to to UTF8-encoded 'SBS.ByteString'
--
-- Code-points in the @U+D800@-@U+DFFF@ range will be encoded
-- as the replacement character (i.e. @U+FFFD@).
--
toUTF8BS :: String -> SBS.ByteString
toUTF8BS = SBS.pack . encodeStringUtf8

-- | Variant of 'toUTF8BS' for lazy 'BS.ByteString's
--
toUTF8LBS :: String -> BS.ByteString
toUTF8LBS = BS.pack . encodeStringUtf8

Expand Down
16 changes: 12 additions & 4 deletions Cabal/Distribution/Utils/String.hs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ import Data.Char (chr,ord)

-- | Decode 'String' from UTF8-encoded octets.
--
-- Invalid data will be decoded as the replacement character (@U+FFFD@)
-- Invalid data in the UTF8 stream (this includes code-points @U+D800@
-- through @U+DFFF@) will be decoded as the replacement character (@U+FFFD@).
--
-- See also 'encodeStringUtf8'
decodeStringUtf8 :: [Word8] -> String
Expand Down Expand Up @@ -40,9 +41,7 @@ decodeStringUtf8 = go

moreBytes :: Int -> Int -> [Word8] -> Int -> [Char]
moreBytes 1 overlong cs' acc
| overlong <= acc && acc <= 0x10FFFF
&& (acc < 0xD800 || 0xDFFF < acc)
&& (acc < 0xFFFE || 0xFFFF < acc)
| overlong <= acc, acc <= 0x10FFFF, (acc < 0xD800 || 0xDFFF < acc)
= chr acc : go cs'

| otherwise
Expand All @@ -61,6 +60,9 @@ decodeStringUtf8 = go

-- | Encode 'String' to a list of UTF8-encoded octets
--
-- Code-points in the @U+D800@-@U+DFFF@ range will be encoded
-- as the replacement character (i.e. @U+FFFD@).
--
-- See also 'decodeUtf8'
encodeStringUtf8 :: String -> [Word8]
encodeStringUtf8 [] = []
Expand All @@ -70,6 +72,12 @@ encodeStringUtf8 (c:cs)
| c <= '\x7FF' = (0xC0 .|. w8ShiftR 6 )
: (0x80 .|. (w8 .&. 0x3F))
: encodeStringUtf8 cs
| c <= '\xD7FF'= (0xE0 .|. w8ShiftR 12 )
: (0x80 .|. (w8ShiftR 6 .&. 0x3F))
: (0x80 .|. (w8 .&. 0x3F))
: encodeStringUtf8 cs
| c <= '\xDFFF'= 0xEF : 0xBF : 0xBD -- U+FFFD
: encodeStringUtf8 cs
| c <= '\xFFFF'= (0xE0 .|. w8ShiftR 12 )
: (0x80 .|. (w8ShiftR 6 .&. 0x3F))
: (0x80 .|. (w8 .&. 0x3F))
Expand Down
3 changes: 3 additions & 0 deletions Cabal/tests/UnitTests.hs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import qualified UnitTests.Distribution.Compat.Graph
import qualified UnitTests.Distribution.Simple.Program.Internal
import qualified UnitTests.Distribution.Simple.Utils
import qualified UnitTests.Distribution.System
import qualified UnitTests.Distribution.Utils.Generic
import qualified UnitTests.Distribution.Utils.NubList
import qualified UnitTests.Distribution.Utils.ShortText
import qualified UnitTests.Distribution.Version (versionTests)
Expand All @@ -44,6 +45,8 @@ tests mtimeChangeCalibrated =
UnitTests.Distribution.Simple.Program.Internal.tests
, testGroup "Distribution.Simple.Utils"
UnitTests.Distribution.Simple.Utils.tests
, testGroup "Distribution.Utils.Generic"
UnitTests.Distribution.Utils.Generic.tests
, testGroup "Distribution.Utils.NubList"
UnitTests.Distribution.Utils.NubList.tests
, testGroup "Distribution.Utils.ShortText"
Expand Down
66 changes: 66 additions & 0 deletions Cabal/tests/UnitTests/Distribution/Utils/Generic.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
{-# LANGUAGE OverloadedStrings #-}

-- to suppress WARNING in "Distribution.Compat.Prelude.Internal"
{-# OPTIONS_GHC -fno-warn-deprecations #-}

module UnitTests.Distribution.Utils.Generic ( tests ) where

import Prelude ()
import Distribution.Compat.Prelude.Internal

import Distribution.Utils.Generic

import qualified Data.ByteString.Char8 as BS
import qualified Data.Text as T
import qualified Data.Text.Encoding as T

import Test.Tasty
import Test.Tasty.HUnit
import Test.Tasty.QuickCheck

tests :: [TestTree]
tests =
[ -- fromUTF8BS / toUTF8BS
testCase "fromUTF8BS mempty" testFromUTF8BSEmpty
, testCase "toUTF8BS mempty" testToUTF8BSEmpty
, testCase "toUTF8BS [U+D800..U+DFFF]" testToUTF8BSSurr
, testCase "toUTF8BS [U+0000..U+7F]" testToUTF8BSAscii
, testCase "toUTF8BS [U+0000..U+10FFFF]" testToUTF8BSText
, testCase "fromUTF8BS.toUTF8BS [U+0000..U+10FFFF]" testToFromUTF8BS

, testProperty "fromUTF8BS.toUTF8BS == id" prop_toFromUTF8BS
, testProperty "toUTF8BS == encodeUtf8" prop_toUTF8BS
]

testFromUTF8BSEmpty :: Assertion
testFromUTF8BSEmpty = mempty @=? fromUTF8BS mempty

testToUTF8BSEmpty :: Assertion
testToUTF8BSEmpty = mempty @=? toUTF8BS mempty

testToUTF8BSSurr :: Assertion
testToUTF8BSSurr = BS.concat (replicate 2048 u_fffd) @=? toUTF8BS surrogates
where
surrogates = ['\xD800'..'\xDFFF']
u_fffd = "\xEF\xBF\xBD"

testToUTF8BSText :: Assertion
testToUTF8BSText = T.encodeUtf8 (T.pack txt) @=? toUTF8BS txt
where
txt = ['\x00'..'\x10FFFF']

testToUTF8BSAscii :: Assertion
testToUTF8BSAscii = BS.pack txt @=? toUTF8BS txt
where
txt = ['\x00'..'\x7F']

testToFromUTF8BS :: Assertion
testToFromUTF8BS = txt @=? (fromUTF8BS . toUTF8BS) txt
where
txt = ['\x0000'..'\xD7FF'] ++ ['\xE000'..'\x10FFFF']

prop_toFromUTF8BS :: [Char] -> Property
prop_toFromUTF8BS txt = txt === (fromUTF8BS . toUTF8BS) txt

prop_toUTF8BS :: [Char] -> Property
prop_toUTF8BS txt = T.encodeUtf8 (T.pack txt) === toUTF8BS txt