Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cabal-syntax/Cabal-syntax.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ library
, mtl >= 2.1 && < 2.4
, parsec >= 3.1.13.0 && < 3.2
, pretty >= 1.1.1 && < 1.2
, text (>= 1.2.3.0 && < 1.3) || (>= 2.0 && < 2.2)
, text >= 2.0.2 && < 2.2
, time >= 1.4.0.1 && < 1.16
-- transformers-0.4.0.0 doesn't have record syntax e.g. for Identity
-- See also https://github.com/ekmett/transformers-compat/issues/35
Expand Down
50 changes: 9 additions & 41 deletions Cabal-syntax/src/Distribution/Parsec/FieldLineStream.hs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{-# LANGUAGE CPP #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# OPTIONS_GHC -Wall -Werror #-}

Expand All @@ -11,13 +11,13 @@ module Distribution.Parsec.FieldLineStream
, fieldLineStreamEnd
) where

import Data.Bits
import Data.ByteString (ByteString)
import Distribution.Compat.Prelude
import Distribution.Utils.Generic (toUTF8BS)
import Prelude ()

import qualified Data.ByteString as BS
import Data.Text.Internal.Encoding.Utf8
import qualified Text.Parsec as Parsec

-- | This is essentially a lazy bytestring, but chunks are glued with newline @\'\\n\'@.
Expand Down Expand Up @@ -47,47 +47,15 @@ instance Monad m => Parsec.Stream FieldLineStream m Char where
Nothing -> Just ('\n', s)
Just (c, bs') -> Just (unconsChar c bs' (\bs'' -> FLSCons bs'' s) s)

-- Based on implementation 'decodeStringUtf8'
unconsChar :: forall a. Word8 -> ByteString -> (ByteString -> a) -> a -> (Char, a)
unconsChar c0 bs0 f next
| c0 <= 0x7F = (chr (fromIntegral c0), f bs0)
| c0 <= 0xBF = (replacementChar, f bs0)
| c0 <= 0xDF = twoBytes
| c0 <= 0xEF = moreBytes 3 0x800 bs0 (fromIntegral $ c0 .&. 0xF)
| c0 <= 0xF7 = moreBytes 4 0x10000 bs0 (fromIntegral $ c0 .&. 0x7)
| c0 <= 0xFB = moreBytes 5 0x200000 bs0 (fromIntegral $ c0 .&. 0x3)
| c0 <= 0xFD = moreBytes 6 0x4000000 bs0 (fromIntegral $ c0 .&. 0x1)
| otherwise = error $ "not implemented " ++ show c0
unconsChar c0 bs0 f next = go (utf8DecodeStart c0) bs0
where
twoBytes = case BS.uncons bs0 of
Nothing -> (replacementChar, next)
Just (c1, bs1)
| c1 .&. 0xC0 == 0x80 ->
if d >= 0x80
then (chr d, f bs1)
else (replacementChar, f bs1)
| otherwise -> (replacementChar, f bs1)
where
d = (fromIntegral (c0 .&. 0x1F) `shiftL` 6) .|. fromIntegral (c1 .&. 0x3F)

moreBytes :: Int -> Int -> ByteString -> Int -> (Char, a)
moreBytes 1 overlong bs' acc
| overlong <= acc
, acc <= 0x10FFFF
, acc < 0xD800 || 0xDFFF < acc =
(chr acc, f bs')
| otherwise =
(replacementChar, f bs')
moreBytes byteCount overlong bs' acc = case BS.uncons bs' of
Nothing -> (replacementChar, f bs')
Just (cn, bs1)
| cn .&. 0xC0 == 0x80 ->
moreBytes
(byteCount - 1)
overlong
bs1
((acc `shiftL` 6) .|. fromIntegral cn .&. 0x3F)
| otherwise -> (replacementChar, f bs1)
go decoderResult bs = case decoderResult of
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given that we're using the same replacementChar, couldn't we just use Data.Text.Encoding.decodeUtf8Lenient? (It's also in 2.0, if you're worried about that.)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is all a bit awkward for historical reasons. In the modern Haskell ecosystem Cabal-syntax should not be parsing ByteString but with UTF-8 semantics, this is obviously stupid. And all FieldLineStream and FieldLine should wrap Text, not ByteString. But fixing it properly is a significant breaking change and I don't want to disrupt ongoing work on Cabal formatters.

Accept ch -> (ch, f bs)
Reject -> (replacementChar, f bs)
Incomplete state codePoint -> case BS.uncons bs of
Nothing -> (replacementChar, next)
Just (w, bs') -> go (utf8DecodeContinue w state codePoint) bs'

replacementChar :: Char
replacementChar = '\xfffd'
14 changes: 9 additions & 5 deletions Cabal-syntax/src/Distribution/Utils/Generic.hs
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,6 @@ import Distribution.Compat.Prelude
import Prelude ()

import Data.Char (isAsciiLower, isAsciiUpper)
import Distribution.Utils.String

import Data.Bits (shiftL, (.&.), (.|.))
import qualified Data.ByteString as SBS
Expand All @@ -95,6 +94,11 @@ import Data.List
( isInfixOf
)
import qualified Data.Set as Set
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.Text.Encoding.Error as T
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Encoding as TL

import qualified Control.Exception as Exception
import System.Directory
Expand Down Expand Up @@ -212,22 +216,22 @@ writeFileAtomic targetPath content = do
-- Invalid data in the UTF8 stream (this includes code-points @U+D800@
-- through @U+DFFF@) will be decoded as the replacement character (@U+FFFD@).
fromUTF8BS :: SBS.ByteString -> String
fromUTF8BS = decodeStringUtf8 . SBS.unpack
fromUTF8BS = T.unpack . T.decodeUtf8With T.lenientDecode

-- | Variant of 'fromUTF8BS' for lazy 'BS.ByteString's
fromUTF8LBS :: LBS.ByteString -> String
fromUTF8LBS = decodeStringUtf8 . LBS.unpack
fromUTF8LBS = TL.unpack . TL.decodeUtf8With T.lenientDecode

-- | Encode 'String' to UTF8-encoded 'SBS.ByteString'
--
-- Code-points in the @U+D800@-@U+DFFF@ range will be encoded
-- as the replacement character (i.e. @U+FFFD@).
toUTF8BS :: String -> SBS.ByteString
toUTF8BS = SBS.pack . encodeStringUtf8
toUTF8BS = T.encodeUtf8 . T.pack

-- | Variant of 'toUTF8BS' for lazy 'BS.ByteString's
toUTF8LBS :: String -> LBS.ByteString
toUTF8LBS = LBS.pack . encodeStringUtf8
toUTF8LBS = TL.encodeUtf8 . TL.pack

-- | Check that strict 'ByteString' is valid UTF8. Returns 'Just offset' if it's not.
validateUTF8 :: SBS.ByteString -> Maybe Int
Expand Down
12 changes: 5 additions & 7 deletions Cabal-syntax/src/Distribution/Utils/ShortText.hs
Original file line number Diff line number Diff line change
Expand Up @@ -20,20 +20,18 @@ module Distribution.Utils.ShortText
-- * Operations
, null
, length

-- * internal utilities
, decodeStringUtf8
, encodeStringUtf8
) where

import Distribution.Compat.Prelude hiding (length, null)
import Prelude ()

import Distribution.Utils.String (decodeStringUtf8, encodeStringUtf8)
import Distribution.Utils.Structured (Structured (..), nominalStructure)

import qualified Data.ByteString as BS
import qualified Data.List as List
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.Text.Encoding.Error as T

import qualified Data.ByteString.Short as BS.Short

Expand Down Expand Up @@ -69,9 +67,9 @@ instance Binary ShortText where
put = put . unST
get = fmap ST get

toShortText = ST . BS.Short.pack . encodeStringUtf8
toShortText = ST . BS.Short.toShort . T.encodeUtf8 . T.pack

fromShortText = decodeStringUtf8 . BS.Short.unpack . unST
fromShortText = T.unpack . T.decodeUtf8With T.lenientDecode . BS.Short.fromShort . unST

unsafeFromUTF8BS = ST . BS.Short.toShort

Expand Down
102 changes: 1 addition & 101 deletions Cabal-syntax/src/Distribution/Utils/String.hs
Original file line number Diff line number Diff line change
@@ -1,110 +1,10 @@
module Distribution.Utils.String
( -- * Encode to/from UTF8
decodeStringUtf8
, encodeStringUtf8
, trim
( trim
) where

import Data.Bits
import Data.Char (chr, ord)
import Data.List (dropWhileEnd)
import Data.Word
import GHC.Unicode (isSpace)

-- | Decode 'String' from UTF8-encoded octets.
--
-- Invalid data in the UTF8 stream (this includes code-points @U+D800@
-- through @U+DFFF@) will be decoded as the replacement character (@U+FFFD@).
--
-- See also 'encodeStringUtf8'
decodeStringUtf8 :: [Word8] -> String
decodeStringUtf8 = go
where
go :: [Word8] -> String
go [] = []
go (c : cs)
| c <= 0x7F = chr (fromIntegral c) : go cs
| c <= 0xBF = replacementChar : go cs
| c <= 0xDF = twoBytes c cs
| c <= 0xEF = moreBytes 3 0x800 cs (fromIntegral $ c .&. 0xF)
| c <= 0xF7 = moreBytes 4 0x10000 cs (fromIntegral $ c .&. 0x7)
| c <= 0xFB = moreBytes 5 0x200000 cs (fromIntegral $ c .&. 0x3)
| c <= 0xFD = moreBytes 6 0x4000000 cs (fromIntegral $ c .&. 0x1)
| otherwise = replacementChar : go cs

twoBytes :: Word8 -> [Word8] -> String
twoBytes c0 (c1 : cs')
| c1 .&. 0xC0 == 0x80 =
let d =
(fromIntegral (c0 .&. 0x1F) `shiftL` 6)
.|. fromIntegral (c1 .&. 0x3F)
in if d >= 0x80
then chr d : go cs'
else replacementChar : go cs'
twoBytes _ cs' = replacementChar : go cs'

moreBytes :: Int -> Int -> [Word8] -> Int -> [Char]
moreBytes 1 overlong cs' acc
| overlong <= acc
, acc <= 0x10FFFF
, acc < 0xD800 || 0xDFFF < acc =
chr acc : go cs'
| otherwise =
replacementChar : go cs'
moreBytes byteCount overlong (cn : cs') acc
| cn .&. 0xC0 == 0x80 =
moreBytes
(byteCount - 1)
overlong
cs'
((acc `shiftL` 6) .|. fromIntegral cn .&. 0x3F)
moreBytes _ _ cs' _ =
replacementChar : go cs'

replacementChar = '\xfffd'

-- | Encode 'String' to a list of UTF8-encoded octets
--
-- Code-points in the @U+D800@-@U+DFFF@ range will be encoded
-- as the replacement character (i.e. @U+FFFD@).
--
-- See also 'decodeUtf8'
encodeStringUtf8 :: String -> [Word8]
encodeStringUtf8 [] = []
encodeStringUtf8 (c : cs)
| c <= '\x07F' =
w8
: encodeStringUtf8 cs
| c <= '\x7FF' =
(0xC0 .|. w8ShiftR 6)
: (0x80 .|. (w8 .&. 0x3F))
: encodeStringUtf8 cs
| c <= '\xD7FF' =
(0xE0 .|. w8ShiftR 12)
: (0x80 .|. (w8ShiftR 6 .&. 0x3F))
: (0x80 .|. (w8 .&. 0x3F))
: encodeStringUtf8 cs
| c <= '\xDFFF' =
0xEF
: 0xBF
: 0xBD -- U+FFFD
: encodeStringUtf8 cs
| c <= '\xFFFF' =
(0xE0 .|. w8ShiftR 12)
: (0x80 .|. (w8ShiftR 6 .&. 0x3F))
: (0x80 .|. (w8 .&. 0x3F))
: encodeStringUtf8 cs
| otherwise =
(0xf0 .|. w8ShiftR 18)
: (0x80 .|. (w8ShiftR 12 .&. 0x3F))
: (0x80 .|. (w8ShiftR 6 .&. 0x3F))
: (0x80 .|. (w8 .&. 0x3F))
: encodeStringUtf8 cs
where
w8 = fromIntegral (ord c) :: Word8
w8ShiftR :: Int -> Word8
w8ShiftR = fromIntegral . shiftR (ord c)

-- @since 3.8.0.0
trim :: String -> String
trim = dropWhile isSpace . dropWhileEnd isSpace
Loading
Loading