Use text package to encode and decode UTF-8

Bodigrim · Bodigrim · commit 3daf6edcdf73 · 2026-01-24T22:14:29.000Z
diff --git a/Cabal-syntax/src/Distribution/Utils/Generic.hs b/Cabal-syntax/src/Distribution/Utils/Generic.hs
@@ -86,7 +86,6 @@ import Distribution.Compat.Prelude
 import Prelude ()
 
 import Data.Char (isAsciiLower, isAsciiUpper)
-import Distribution.Utils.String
 
 import Data.Bits (shiftL, (.&.), (.|.))
 import qualified Data.ByteString as SBS
@@ -95,6 +94,11 @@ import Data.List
   ( isInfixOf
   )
 import qualified Data.Set as Set
+import qualified Data.Text as T
+import qualified Data.Text.Encoding as T
+import qualified Data.Text.Encoding.Error as T
+import qualified Data.Text.Lazy as TL
+import qualified Data.Text.Lazy.Encoding as TL
 
 import qualified Control.Exception as Exception
 import System.Directory
@@ -212,22 +216,22 @@ writeFileAtomic targetPath content = do
 -- Invalid data in the UTF8 stream (this includes code-points @U+D800@
 -- through @U+DFFF@) will be decoded as the replacement character (@U+FFFD@).
 fromUTF8BS :: SBS.ByteString -> String
-fromUTF8BS = decodeStringUtf8 . SBS.unpack
+fromUTF8BS bytes = T.unpack (T.decodeUtf8With T.lenientDecode bytes)
 
 -- | Variant of 'fromUTF8BS' for lazy 'BS.ByteString's
 fromUTF8LBS :: LBS.ByteString -> String
-fromUTF8LBS = decodeStringUtf8 . LBS.unpack
+fromUTF8LBS bytes = TL.unpack (TL.decodeUtf8With T.lenientDecode bytes)
 
 -- | Encode 'String' to UTF8-encoded 'SBS.ByteString'
 --
 -- Code-points in the @U+D800@-@U+DFFF@ range will be encoded
 -- as the replacement character (i.e. @U+FFFD@).
 toUTF8BS :: String -> SBS.ByteString
-toUTF8BS = SBS.pack . encodeStringUtf8
+toUTF8BS str = T.encodeUtf8 (T.pack str)
 
 -- | Variant of 'toUTF8BS' for lazy 'BS.ByteString's
 toUTF8LBS :: String -> LBS.ByteString
-toUTF8LBS = LBS.pack . encodeStringUtf8
+toUTF8LBS str = TL.encodeUtf8 (TL.pack str)
 
 -- | Check that strict 'ByteString' is valid UTF8. Returns 'Just offset' if it's not.
 validateUTF8 :: SBS.ByteString -> Maybe Int
diff --git a/Cabal-syntax/src/Distribution/Utils/ShortText.hs b/Cabal-syntax/src/Distribution/Utils/ShortText.hs
@@ -20,20 +20,18 @@ module Distribution.Utils.ShortText
     -- * Operations
   , null
   , length
-
-    -- * internal utilities
-  , decodeStringUtf8
-  , encodeStringUtf8
   ) where
 
 import Distribution.Compat.Prelude hiding (length, null)
 import Prelude ()
 
-import Distribution.Utils.String (decodeStringUtf8, encodeStringUtf8)
 import Distribution.Utils.Structured (Structured (..), nominalStructure)
 
 import qualified Data.ByteString as BS
 import qualified Data.List as List
+import qualified Data.Text as T
+import qualified Data.Text.Encoding as T
+import qualified Data.Text.Encoding.Error as T
 
 import qualified Data.ByteString.Short as BS.Short
 
@@ -69,9 +67,9 @@ instance Binary ShortText where
   put = put . unST
   get = fmap ST get
 
-toShortText = ST . BS.Short.pack . encodeStringUtf8
+toShortText str = ST (BS.Short.toShort (T.encodeUtf8 (T.pack str)))
 
-fromShortText = decodeStringUtf8 . BS.Short.unpack . unST
+fromShortText st = T.unpack . T.decodeUtf8With T.lenientDecode . BS.Short.fromShort $ unST st
 
 unsafeFromUTF8BS = ST . BS.Short.toShort
 
diff --git a/Cabal-syntax/src/Distribution/Utils/String.hs b/Cabal-syntax/src/Distribution/Utils/String.hs
@@ -1,110 +1,10 @@
 module Distribution.Utils.String
-  ( -- * Encode to/from UTF8
-    decodeStringUtf8
-  , encodeStringUtf8
-  , trim
+  ( trim
   ) where
 
-import Data.Bits
-import Data.Char (chr, ord)
 import Data.List (dropWhileEnd)
-import Data.Word
 import GHC.Unicode (isSpace)
 
--- | Decode 'String' from UTF8-encoded octets.
---
--- Invalid data in the UTF8 stream (this includes code-points @U+D800@
--- through @U+DFFF@) will be decoded as the replacement character (@U+FFFD@).
---
--- See also 'encodeStringUtf8'
-decodeStringUtf8 :: [Word8] -> String
-decodeStringUtf8 = go
-  where
-    go :: [Word8] -> String
-    go [] = []
-    go (c : cs)
-      | c <= 0x7F = chr (fromIntegral c) : go cs
-      | c <= 0xBF = replacementChar : go cs
-      | c <= 0xDF = twoBytes c cs
-      | c <= 0xEF = moreBytes 3 0x800 cs (fromIntegral $ c .&. 0xF)
-      | c <= 0xF7 = moreBytes 4 0x10000 cs (fromIntegral $ c .&. 0x7)
-      | c <= 0xFB = moreBytes 5 0x200000 cs (fromIntegral $ c .&. 0x3)
-      | c <= 0xFD = moreBytes 6 0x4000000 cs (fromIntegral $ c .&. 0x1)
-      | otherwise = replacementChar : go cs
-
-    twoBytes :: Word8 -> [Word8] -> String
-    twoBytes c0 (c1 : cs')
-      | c1 .&. 0xC0 == 0x80 =
-          let d =
-                (fromIntegral (c0 .&. 0x1F) `shiftL` 6)
-                  .|. fromIntegral (c1 .&. 0x3F)
-           in if d >= 0x80
-                then chr d : go cs'
-                else replacementChar : go cs'
-    twoBytes _ cs' = replacementChar : go cs'
-
-    moreBytes :: Int -> Int -> [Word8] -> Int -> [Char]
-    moreBytes 1 overlong cs' acc
-      | overlong <= acc
-      , acc <= 0x10FFFF
-      , acc < 0xD800 || 0xDFFF < acc =
-          chr acc : go cs'
-      | otherwise =
-          replacementChar : go cs'
-    moreBytes byteCount overlong (cn : cs') acc
-      | cn .&. 0xC0 == 0x80 =
-          moreBytes
-            (byteCount - 1)
-            overlong
-            cs'
-            ((acc `shiftL` 6) .|. fromIntegral cn .&. 0x3F)
-    moreBytes _ _ cs' _ =
-      replacementChar : go cs'
-
-    replacementChar = '\xfffd'
-
--- | Encode 'String' to a list of UTF8-encoded octets
---
--- Code-points in the @U+D800@-@U+DFFF@ range will be encoded
--- as the replacement character (i.e. @U+FFFD@).
---
--- See also 'decodeUtf8'
-encodeStringUtf8 :: String -> [Word8]
-encodeStringUtf8 [] = []
-encodeStringUtf8 (c : cs)
-  | c <= '\x07F' =
-      w8
-        : encodeStringUtf8 cs
-  | c <= '\x7FF' =
-      (0xC0 .|. w8ShiftR 6)
-        : (0x80 .|. (w8 .&. 0x3F))
-        : encodeStringUtf8 cs
-  | c <= '\xD7FF' =
-      (0xE0 .|. w8ShiftR 12)
-        : (0x80 .|. (w8ShiftR 6 .&. 0x3F))
-        : (0x80 .|. (w8 .&. 0x3F))
-        : encodeStringUtf8 cs
-  | c <= '\xDFFF' =
-      0xEF
-        : 0xBF
-        : 0xBD -- U+FFFD
-        : encodeStringUtf8 cs
-  | c <= '\xFFFF' =
-      (0xE0 .|. w8ShiftR 12)
-        : (0x80 .|. (w8ShiftR 6 .&. 0x3F))
-        : (0x80 .|. (w8 .&. 0x3F))
-        : encodeStringUtf8 cs
-  | otherwise =
-      (0xf0 .|. w8ShiftR 18)
-        : (0x80 .|. (w8ShiftR 12 .&. 0x3F))
-        : (0x80 .|. (w8ShiftR 6 .&. 0x3F))
-        : (0x80 .|. (w8 .&. 0x3F))
-        : encodeStringUtf8 cs
-  where
-    w8 = fromIntegral (ord c) :: Word8
-    w8ShiftR :: Int -> Word8
-    w8ShiftR = fromIntegral . shiftR (ord c)
-
 -- @since 3.8.0.0
 trim :: String -> String
 trim = dropWhile isSpace . dropWhileEnd isSpace