Skip to content

Commit 3daf6ed

Browse files
committed
Use text package to decode UTF-8
1 parent f8e6106 commit 3daf6ed

File tree

3 files changed

+15
-113
lines changed

3 files changed

+15
-113
lines changed

Cabal-syntax/src/Distribution/Utils/Generic.hs

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,6 @@ import Distribution.Compat.Prelude
8686
import Prelude ()
8787

8888
import Data.Char (isAsciiLower, isAsciiUpper)
89-
import Distribution.Utils.String
9089

9190
import Data.Bits (shiftL, (.&.), (.|.))
9291
import qualified Data.ByteString as SBS
@@ -95,6 +94,11 @@ import Data.List
9594
( isInfixOf
9695
)
9796
import qualified Data.Set as Set
97+
import qualified Data.Text as T
98+
import qualified Data.Text.Encoding as T
99+
import qualified Data.Text.Encoding.Error as T
100+
import qualified Data.Text.Lazy as TL
101+
import qualified Data.Text.Lazy.Encoding as TL
98102

99103
import qualified Control.Exception as Exception
100104
import System.Directory
@@ -212,22 +216,22 @@ writeFileAtomic targetPath content = do
212216
-- Invalid data in the UTF8 stream (this includes code-points @U+D800@
213217
-- through @U+DFFF@) will be decoded as the replacement character (@U+FFFD@).
214218
fromUTF8BS :: SBS.ByteString -> String
215-
fromUTF8BS = decodeStringUtf8 . SBS.unpack
219+
fromUTF8BS = T.unpack . T.decodeUtf8With T.lenientDecode
216220

217221
-- | Variant of 'fromUTF8BS' for lazy 'BS.ByteString's
218222
fromUTF8LBS :: LBS.ByteString -> String
219-
fromUTF8LBS = decodeStringUtf8 . LBS.unpack
223+
fromUTF8LBS = TL.unpack . TL.decodeUtf8With T.lenientDecode
220224

221225
-- | Encode 'String' to UTF8-encoded 'SBS.ByteString'
222226
--
223227
-- Code-points in the @U+D800@-@U+DFFF@ range will be encoded
224228
-- as the replacement character (i.e. @U+FFFD@).
225229
toUTF8BS :: String -> SBS.ByteString
226-
toUTF8BS = SBS.pack . encodeStringUtf8
230+
toUTF8BS = T.encodeUtf8 . T.pack
227231

228232
-- | Variant of 'toUTF8BS' for lazy 'BS.ByteString's
229233
toUTF8LBS :: String -> LBS.ByteString
230-
toUTF8LBS = LBS.pack . encodeStringUtf8
234+
toUTF8LBS = TL.encodeUtf8 . TL.pack
231235

232236
-- | Check that strict 'ByteString' is valid UTF8. Returns 'Just offset' if it's not.
233237
validateUTF8 :: SBS.ByteString -> Maybe Int

Cabal-syntax/src/Distribution/Utils/ShortText.hs

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,20 +20,18 @@ module Distribution.Utils.ShortText
2020
-- * Operations
2121
, null
2222
, length
23-
24-
-- * internal utilities
25-
, decodeStringUtf8
26-
, encodeStringUtf8
2723
) where
2824

2925
import Distribution.Compat.Prelude hiding (length, null)
3026
import Prelude ()
3127

32-
import Distribution.Utils.String (decodeStringUtf8, encodeStringUtf8)
3328
import Distribution.Utils.Structured (Structured (..), nominalStructure)
3429

3530
import qualified Data.ByteString as BS
3631
import qualified Data.List as List
32+
import qualified Data.Text as T
33+
import qualified Data.Text.Encoding as T
34+
import qualified Data.Text.Encoding.Error as T
3735

3836
import qualified Data.ByteString.Short as BS.Short
3937

@@ -69,9 +67,9 @@ instance Binary ShortText where
6967
put = put . unST
7068
get = fmap ST get
7169

72-
toShortText = ST . BS.Short.pack . encodeStringUtf8
70+
toShortText = ST . BS.Short.toShort . T.encodeUtf8 . T.pack
7371

74-
fromShortText = decodeStringUtf8 . BS.Short.unpack . unST
72+
fromShortText = T.unpack . T.decodeUtf8With T.lenientDecode . BS.Short.fromShort . unST
7573

7674
unsafeFromUTF8BS = ST . BS.Short.toShort
7775

Lines changed: 1 addition & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -1,110 +1,10 @@
11
module Distribution.Utils.String
2-
( -- * Encode to/from UTF8
3-
decodeStringUtf8
4-
, encodeStringUtf8
5-
, trim
2+
( trim
63
) where
74

8-
import Data.Bits
9-
import Data.Char (chr, ord)
105
import Data.List (dropWhileEnd)
11-
import Data.Word
126
import GHC.Unicode (isSpace)
137

14-
-- | Decode 'String' from UTF8-encoded octets.
15-
--
16-
-- Invalid data in the UTF8 stream (this includes code-points @U+D800@
17-
-- through @U+DFFF@) will be decoded as the replacement character (@U+FFFD@).
18-
--
19-
-- See also 'encodeStringUtf8'
20-
decodeStringUtf8 :: [Word8] -> String
21-
decodeStringUtf8 = go
22-
where
23-
go :: [Word8] -> String
24-
go [] = []
25-
go (c : cs)
26-
| c <= 0x7F = chr (fromIntegral c) : go cs
27-
| c <= 0xBF = replacementChar : go cs
28-
| c <= 0xDF = twoBytes c cs
29-
| c <= 0xEF = moreBytes 3 0x800 cs (fromIntegral $ c .&. 0xF)
30-
| c <= 0xF7 = moreBytes 4 0x10000 cs (fromIntegral $ c .&. 0x7)
31-
| c <= 0xFB = moreBytes 5 0x200000 cs (fromIntegral $ c .&. 0x3)
32-
| c <= 0xFD = moreBytes 6 0x4000000 cs (fromIntegral $ c .&. 0x1)
33-
| otherwise = replacementChar : go cs
34-
35-
twoBytes :: Word8 -> [Word8] -> String
36-
twoBytes c0 (c1 : cs')
37-
| c1 .&. 0xC0 == 0x80 =
38-
let d =
39-
(fromIntegral (c0 .&. 0x1F) `shiftL` 6)
40-
.|. fromIntegral (c1 .&. 0x3F)
41-
in if d >= 0x80
42-
then chr d : go cs'
43-
else replacementChar : go cs'
44-
twoBytes _ cs' = replacementChar : go cs'
45-
46-
moreBytes :: Int -> Int -> [Word8] -> Int -> [Char]
47-
moreBytes 1 overlong cs' acc
48-
| overlong <= acc
49-
, acc <= 0x10FFFF
50-
, acc < 0xD800 || 0xDFFF < acc =
51-
chr acc : go cs'
52-
| otherwise =
53-
replacementChar : go cs'
54-
moreBytes byteCount overlong (cn : cs') acc
55-
| cn .&. 0xC0 == 0x80 =
56-
moreBytes
57-
(byteCount - 1)
58-
overlong
59-
cs'
60-
((acc `shiftL` 6) .|. fromIntegral cn .&. 0x3F)
61-
moreBytes _ _ cs' _ =
62-
replacementChar : go cs'
63-
64-
replacementChar = '\xfffd'
65-
66-
-- | Encode 'String' to a list of UTF8-encoded octets
67-
--
68-
-- Code-points in the @U+D800@-@U+DFFF@ range will be encoded
69-
-- as the replacement character (i.e. @U+FFFD@).
70-
--
71-
-- See also 'decodeUtf8'
72-
encodeStringUtf8 :: String -> [Word8]
73-
encodeStringUtf8 [] = []
74-
encodeStringUtf8 (c : cs)
75-
| c <= '\x07F' =
76-
w8
77-
: encodeStringUtf8 cs
78-
| c <= '\x7FF' =
79-
(0xC0 .|. w8ShiftR 6)
80-
: (0x80 .|. (w8 .&. 0x3F))
81-
: encodeStringUtf8 cs
82-
| c <= '\xD7FF' =
83-
(0xE0 .|. w8ShiftR 12)
84-
: (0x80 .|. (w8ShiftR 6 .&. 0x3F))
85-
: (0x80 .|. (w8 .&. 0x3F))
86-
: encodeStringUtf8 cs
87-
| c <= '\xDFFF' =
88-
0xEF
89-
: 0xBF
90-
: 0xBD -- U+FFFD
91-
: encodeStringUtf8 cs
92-
| c <= '\xFFFF' =
93-
(0xE0 .|. w8ShiftR 12)
94-
: (0x80 .|. (w8ShiftR 6 .&. 0x3F))
95-
: (0x80 .|. (w8 .&. 0x3F))
96-
: encodeStringUtf8 cs
97-
| otherwise =
98-
(0xf0 .|. w8ShiftR 18)
99-
: (0x80 .|. (w8ShiftR 12 .&. 0x3F))
100-
: (0x80 .|. (w8ShiftR 6 .&. 0x3F))
101-
: (0x80 .|. (w8 .&. 0x3F))
102-
: encodeStringUtf8 cs
103-
where
104-
w8 = fromIntegral (ord c) :: Word8
105-
w8ShiftR :: Int -> Word8
106-
w8ShiftR = fromIntegral . shiftR (ord c)
107-
1088
-- @since 3.8.0.0
1099
trim :: String -> String
11010
trim = dropWhile isSpace . dropWhileEnd isSpace

0 commit comments

Comments
 (0)