haskell · Bodigrim · Jan 17, 2026 · Jan 24, 2026 · Jan 25, 2026 · geekosaur
diff --git a/Cabal-syntax/Cabal-syntax.cabal b/Cabal-syntax/Cabal-syntax.cabal
@@ -39,7 +39,7 @@ library
     , mtl        >= 2.1      && < 2.4
     , parsec     >= 3.1.13.0 && < 3.2
     , pretty     >= 1.1.1    && < 1.2
-    , text       (>= 1.2.3.0 && < 1.3) || (>= 2.0 && < 2.2)
+    , text       >= 2.0.2    && < 2.2
     , time       >= 1.4.0.1  && < 1.16
     -- transformers-0.4.0.0 doesn't have record syntax e.g. for Identity
     -- See also https://github.com/ekmett/transformers-compat/issues/35

diff --git a/Cabal-syntax/src/Distribution/Parsec/FieldLineStream.hs b/Cabal-syntax/src/Distribution/Parsec/FieldLineStream.hs
@@ -1,6 +1,6 @@
+{-# LANGUAGE CPP #-}
 {-# LANGUAGE FlexibleInstances #-}
 {-# LANGUAGE MultiParamTypeClasses #-}
-{-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE ScopedTypeVariables #-}
 {-# OPTIONS_GHC -Wall -Werror #-}
 
@@ -11,13 +11,13 @@ module Distribution.Parsec.FieldLineStream
   , fieldLineStreamEnd
   ) where
 
-import Data.Bits
 import Data.ByteString (ByteString)
 import Distribution.Compat.Prelude
 import Distribution.Utils.Generic (toUTF8BS)
 import Prelude ()
 
 import qualified Data.ByteString as BS
+import Data.Text.Internal.Encoding.Utf8
 import qualified Text.Parsec as Parsec
 
 -- | This is essentially a lazy bytestring, but chunks are glued with newline @\'\\n\'@.
@@ -47,47 +47,15 @@ instance Monad m => Parsec.Stream FieldLineStream m Char where
     Nothing -> Just ('\n', s)
     Just (c, bs') -> Just (unconsChar c bs' (\bs'' -> FLSCons bs'' s) s)
 
--- Based on implementation 'decodeStringUtf8'
 unconsChar :: forall a. Word8 -> ByteString -> (ByteString -> a) -> a -> (Char, a)
-unconsChar c0 bs0 f next
-  | c0 <= 0x7F = (chr (fromIntegral c0), f bs0)
-  | c0 <= 0xBF = (replacementChar, f bs0)
-  | c0 <= 0xDF = twoBytes
-  | c0 <= 0xEF = moreBytes 3 0x800 bs0 (fromIntegral $ c0 .&. 0xF)
-  | c0 <= 0xF7 = moreBytes 4 0x10000 bs0 (fromIntegral $ c0 .&. 0x7)
-  | c0 <= 0xFB = moreBytes 5 0x200000 bs0 (fromIntegral $ c0 .&. 0x3)
-  | c0 <= 0xFD = moreBytes 6 0x4000000 bs0 (fromIntegral $ c0 .&. 0x1)
-  | otherwise = error $ "not implemented " ++ show c0
+unconsChar c0 bs0 f next = go (utf8DecodeStart c0) bs0
   where
-    twoBytes = case BS.uncons bs0 of
-      Nothing -> (replacementChar, next)
-      Just (c1, bs1)
-        | c1 .&. 0xC0 == 0x80 ->
-            if d >= 0x80
-              then (chr d, f bs1)
-              else (replacementChar, f bs1)
-        | otherwise -> (replacementChar, f bs1)
-        where
-          d = (fromIntegral (c0 .&. 0x1F) `shiftL` 6) .|. fromIntegral (c1 .&. 0x3F)
-
-    moreBytes :: Int -> Int -> ByteString -> Int -> (Char, a)
-    moreBytes 1 overlong bs' acc
-      | overlong <= acc
-      , acc <= 0x10FFFF
-      , acc < 0xD800 || 0xDFFF < acc =
-          (chr acc, f bs')
-      | otherwise =
-          (replacementChar, f bs')
-    moreBytes byteCount overlong bs' acc = case BS.uncons bs' of
-      Nothing -> (replacementChar, f bs')
-      Just (cn, bs1)
-        | cn .&. 0xC0 == 0x80 ->
-            moreBytes
-              (byteCount - 1)
-              overlong
-              bs1
-              ((acc `shiftL` 6) .|. fromIntegral cn .&. 0x3F)
-        | otherwise -> (replacementChar, f bs1)
+    go decoderResult bs = case decoderResult of
+      Accept ch -> (ch, f bs)
+      Reject -> (replacementChar, f bs)
+      Incomplete state codePoint -> case BS.uncons bs of
+        Nothing -> (replacementChar, next)
+        Just (w, bs') -> go (utf8DecodeContinue w state codePoint) bs'
 
 replacementChar :: Char
 replacementChar = '\xfffd'
diff --git a/Cabal-syntax/src/Distribution/Utils/Generic.hs b/Cabal-syntax/src/Distribution/Utils/Generic.hs
@@ -86,7 +86,6 @@ import Distribution.Compat.Prelude
 import Prelude ()
 
 import Data.Char (isAsciiLower, isAsciiUpper)
-import Distribution.Utils.String
 
 import Data.Bits (shiftL, (.&.), (.|.))
 import qualified Data.ByteString as SBS
@@ -95,6 +94,11 @@ import Data.List
   ( isInfixOf
   )
 import qualified Data.Set as Set
+import qualified Data.Text as T
+import qualified Data.Text.Encoding as T
+import qualified Data.Text.Encoding.Error as T
+import qualified Data.Text.Lazy as TL
+import qualified Data.Text.Lazy.Encoding as TL
 
 import qualified Control.Exception as Exception
 import System.Directory
@@ -212,22 +216,22 @@ writeFileAtomic targetPath content = do
 -- Invalid data in the UTF8 stream (this includes code-points @U+D800@
 -- through @U+DFFF@) will be decoded as the replacement character (@U+FFFD@).
 fromUTF8BS :: SBS.ByteString -> String
-fromUTF8BS = decodeStringUtf8 . SBS.unpack
+fromUTF8BS = T.unpack . T.decodeUtf8With T.lenientDecode
 
 -- | Variant of 'fromUTF8BS' for lazy 'BS.ByteString's
 fromUTF8LBS :: LBS.ByteString -> String
-fromUTF8LBS = decodeStringUtf8 . LBS.unpack
+fromUTF8LBS = TL.unpack . TL.decodeUtf8With T.lenientDecode
 
 -- | Encode 'String' to UTF8-encoded 'SBS.ByteString'
 --
 -- Code-points in the @U+D800@-@U+DFFF@ range will be encoded
 -- as the replacement character (i.e. @U+FFFD@).
 toUTF8BS :: String -> SBS.ByteString
-toUTF8BS = SBS.pack . encodeStringUtf8
+toUTF8BS = T.encodeUtf8 . T.pack
 
 -- | Variant of 'toUTF8BS' for lazy 'BS.ByteString's
 toUTF8LBS :: String -> LBS.ByteString
-toUTF8LBS = LBS.pack . encodeStringUtf8
+toUTF8LBS = TL.encodeUtf8 . TL.pack
 
 -- | Check that strict 'ByteString' is valid UTF8. Returns 'Just offset' if it's not.
 validateUTF8 :: SBS.ByteString -> Maybe Int

diff --git a/Cabal-syntax/src/Distribution/Utils/ShortText.hs b/Cabal-syntax/src/Distribution/Utils/ShortText.hs
@@ -20,20 +20,18 @@ module Distribution.Utils.ShortText
     -- * Operations
   , null
   , length
-
-    -- * internal utilities
-  , decodeStringUtf8
-  , encodeStringUtf8
   ) where
 
 import Distribution.Compat.Prelude hiding (length, null)
 import Prelude ()
 
-import Distribution.Utils.String (decodeStringUtf8, encodeStringUtf8)
 import Distribution.Utils.Structured (Structured (..), nominalStructure)
 
 import qualified Data.ByteString as BS
 import qualified Data.List as List
+import qualified Data.Text as T
+import qualified Data.Text.Encoding as T
+import qualified Data.Text.Encoding.Error as T
 
 import qualified Data.ByteString.Short as BS.Short
 
@@ -69,9 +67,9 @@ instance Binary ShortText where
   put = put . unST
   get = fmap ST get
 
-toShortText = ST . BS.Short.pack . encodeStringUtf8
+toShortText = ST . BS.Short.toShort . T.encodeUtf8 . T.pack
 
-fromShortText = decodeStringUtf8 . BS.Short.unpack . unST
+fromShortText = T.unpack . T.decodeUtf8With T.lenientDecode . BS.Short.fromShort . unST
 
 unsafeFromUTF8BS = ST . BS.Short.toShort
 

diff --git a/Cabal-syntax/src/Distribution/Utils/String.hs b/Cabal-syntax/src/Distribution/Utils/String.hs
@@ -1,110 +1,10 @@
 module Distribution.Utils.String
-  ( -- * Encode to/from UTF8
-    decodeStringUtf8
-  , encodeStringUtf8
-  , trim
+  ( trim
   ) where
 
-import Data.Bits
-import Data.Char (chr, ord)
 import Data.List (dropWhileEnd)
-import Data.Word
 import GHC.Unicode (isSpace)
 
--- | Decode 'String' from UTF8-encoded octets.
---
--- Invalid data in the UTF8 stream (this includes code-points @U+D800@
--- through @U+DFFF@) will be decoded as the replacement character (@U+FFFD@).
---
--- See also 'encodeStringUtf8'
-decodeStringUtf8 :: [Word8] -> String
-decodeStringUtf8 = go
-  where
-    go :: [Word8] -> String
-    go [] = []
-    go (c : cs)
-      | c <= 0x7F = chr (fromIntegral c) : go cs
-      | c <= 0xBF = replacementChar : go cs
-      | c <= 0xDF = twoBytes c cs
-      | c <= 0xEF = moreBytes 3 0x800 cs (fromIntegral $ c .&. 0xF)
-      | c <= 0xF7 = moreBytes 4 0x10000 cs (fromIntegral $ c .&. 0x7)
-      | c <= 0xFB = moreBytes 5 0x200000 cs (fromIntegral $ c .&. 0x3)
-      | c <= 0xFD = moreBytes 6 0x4000000 cs (fromIntegral $ c .&. 0x1)
-      | otherwise = replacementChar : go cs
-
-    twoBytes :: Word8 -> [Word8] -> String
-    twoBytes c0 (c1 : cs')
-      | c1 .&. 0xC0 == 0x80 =
-          let d =
-                (fromIntegral (c0 .&. 0x1F) `shiftL` 6)
-                  .|. fromIntegral (c1 .&. 0x3F)
-           in if d >= 0x80
-                then chr d : go cs'
-                else replacementChar : go cs'
-    twoBytes _ cs' = replacementChar : go cs'
-
-    moreBytes :: Int -> Int -> [Word8] -> Int -> [Char]
-    moreBytes 1 overlong cs' acc
-      | overlong <= acc
-      , acc <= 0x10FFFF
-      , acc < 0xD800 || 0xDFFF < acc =
-          chr acc : go cs'
-      | otherwise =
-          replacementChar : go cs'
-    moreBytes byteCount overlong (cn : cs') acc
-      | cn .&. 0xC0 == 0x80 =
-          moreBytes
-            (byteCount - 1)
-            overlong
-            cs'
-            ((acc `shiftL` 6) .|. fromIntegral cn .&. 0x3F)
-    moreBytes _ _ cs' _ =
-      replacementChar : go cs'
-
-    replacementChar = '\xfffd'
-
--- | Encode 'String' to a list of UTF8-encoded octets
---
--- Code-points in the @U+D800@-@U+DFFF@ range will be encoded
--- as the replacement character (i.e. @U+FFFD@).
---
--- See also 'decodeUtf8'
-encodeStringUtf8 :: String -> [Word8]
-encodeStringUtf8 [] = []
-encodeStringUtf8 (c : cs)
-  | c <= '\x07F' =
-      w8
-        : encodeStringUtf8 cs
-  | c <= '\x7FF' =
-      (0xC0 .|. w8ShiftR 6)
-        : (0x80 .|. (w8 .&. 0x3F))
-        : encodeStringUtf8 cs
-  | c <= '\xD7FF' =
-      (0xE0 .|. w8ShiftR 12)
-        : (0x80 .|. (w8ShiftR 6 .&. 0x3F))
-        : (0x80 .|. (w8 .&. 0x3F))
-        : encodeStringUtf8 cs
-  | c <= '\xDFFF' =
-      0xEF
-        : 0xBF
-        : 0xBD -- U+FFFD
-        : encodeStringUtf8 cs
-  | c <= '\xFFFF' =
-      (0xE0 .|. w8ShiftR 12)
-        : (0x80 .|. (w8ShiftR 6 .&. 0x3F))
-        : (0x80 .|. (w8 .&. 0x3F))
-        : encodeStringUtf8 cs
-  | otherwise =
-      (0xf0 .|. w8ShiftR 18)
-        : (0x80 .|. (w8ShiftR 12 .&. 0x3F))
-        : (0x80 .|. (w8ShiftR 6 .&. 0x3F))
-        : (0x80 .|. (w8 .&. 0x3F))
-        : encodeStringUtf8 cs
-  where
-    w8 = fromIntegral (ord c) :: Word8
-    w8ShiftR :: Int -> Word8
-    w8ShiftR = fromIntegral . shiftR (ord c)
-
 -- @since 3.8.0.0
 trim :: String -> String
 trim = dropWhile isSpace . dropWhileEnd isSpace