|
1 | 1 | module Distribution.Utils.String |
2 | | - ( -- * Encode to/from UTF8 |
3 | | - decodeStringUtf8 |
4 | | - , encodeStringUtf8 |
5 | | - , trim |
| 2 | + ( trim |
6 | 3 | ) where |
7 | 4 |
|
8 | | -import Data.Bits |
9 | | -import Data.Char (chr, ord) |
10 | 5 | import Data.List (dropWhileEnd) |
11 | | -import Data.Word |
12 | 6 | import GHC.Unicode (isSpace) |
13 | 7 |
|
14 | | --- | Decode 'String' from UTF8-encoded octets. |
15 | | --- |
16 | | --- Invalid data in the UTF8 stream (this includes code-points @U+D800@ |
17 | | --- through @U+DFFF@) will be decoded as the replacement character (@U+FFFD@). |
18 | | --- |
19 | | --- See also 'encodeStringUtf8' |
20 | | -decodeStringUtf8 :: [Word8] -> String |
21 | | -decodeStringUtf8 = go |
22 | | - where |
23 | | - go :: [Word8] -> String |
24 | | - go [] = [] |
25 | | - go (c : cs) |
26 | | - | c <= 0x7F = chr (fromIntegral c) : go cs |
27 | | - | c <= 0xBF = replacementChar : go cs |
28 | | - | c <= 0xDF = twoBytes c cs |
29 | | - | c <= 0xEF = moreBytes 3 0x800 cs (fromIntegral $ c .&. 0xF) |
30 | | - | c <= 0xF7 = moreBytes 4 0x10000 cs (fromIntegral $ c .&. 0x7) |
31 | | - | c <= 0xFB = moreBytes 5 0x200000 cs (fromIntegral $ c .&. 0x3) |
32 | | - | c <= 0xFD = moreBytes 6 0x4000000 cs (fromIntegral $ c .&. 0x1) |
33 | | - | otherwise = replacementChar : go cs |
34 | | - |
35 | | - twoBytes :: Word8 -> [Word8] -> String |
36 | | - twoBytes c0 (c1 : cs') |
37 | | - | c1 .&. 0xC0 == 0x80 = |
38 | | - let d = |
39 | | - (fromIntegral (c0 .&. 0x1F) `shiftL` 6) |
40 | | - .|. fromIntegral (c1 .&. 0x3F) |
41 | | - in if d >= 0x80 |
42 | | - then chr d : go cs' |
43 | | - else replacementChar : go cs' |
44 | | - twoBytes _ cs' = replacementChar : go cs' |
45 | | - |
46 | | - moreBytes :: Int -> Int -> [Word8] -> Int -> [Char] |
47 | | - moreBytes 1 overlong cs' acc |
48 | | - | overlong <= acc |
49 | | - , acc <= 0x10FFFF |
50 | | - , acc < 0xD800 || 0xDFFF < acc = |
51 | | - chr acc : go cs' |
52 | | - | otherwise = |
53 | | - replacementChar : go cs' |
54 | | - moreBytes byteCount overlong (cn : cs') acc |
55 | | - | cn .&. 0xC0 == 0x80 = |
56 | | - moreBytes |
57 | | - (byteCount - 1) |
58 | | - overlong |
59 | | - cs' |
60 | | - ((acc `shiftL` 6) .|. fromIntegral cn .&. 0x3F) |
61 | | - moreBytes _ _ cs' _ = |
62 | | - replacementChar : go cs' |
63 | | - |
64 | | - replacementChar = '\xfffd' |
65 | | - |
66 | | --- | Encode 'String' to a list of UTF8-encoded octets |
67 | | --- |
68 | | --- Code-points in the @U+D800@-@U+DFFF@ range will be encoded |
69 | | --- as the replacement character (i.e. @U+FFFD@). |
70 | | --- |
71 | | --- See also 'decodeUtf8' |
72 | | -encodeStringUtf8 :: String -> [Word8] |
73 | | -encodeStringUtf8 [] = [] |
74 | | -encodeStringUtf8 (c : cs) |
75 | | - | c <= '\x07F' = |
76 | | - w8 |
77 | | - : encodeStringUtf8 cs |
78 | | - | c <= '\x7FF' = |
79 | | - (0xC0 .|. w8ShiftR 6) |
80 | | - : (0x80 .|. (w8 .&. 0x3F)) |
81 | | - : encodeStringUtf8 cs |
82 | | - | c <= '\xD7FF' = |
83 | | - (0xE0 .|. w8ShiftR 12) |
84 | | - : (0x80 .|. (w8ShiftR 6 .&. 0x3F)) |
85 | | - : (0x80 .|. (w8 .&. 0x3F)) |
86 | | - : encodeStringUtf8 cs |
87 | | - | c <= '\xDFFF' = |
88 | | - 0xEF |
89 | | - : 0xBF |
90 | | - : 0xBD -- U+FFFD |
91 | | - : encodeStringUtf8 cs |
92 | | - | c <= '\xFFFF' = |
93 | | - (0xE0 .|. w8ShiftR 12) |
94 | | - : (0x80 .|. (w8ShiftR 6 .&. 0x3F)) |
95 | | - : (0x80 .|. (w8 .&. 0x3F)) |
96 | | - : encodeStringUtf8 cs |
97 | | - | otherwise = |
98 | | - (0xf0 .|. w8ShiftR 18) |
99 | | - : (0x80 .|. (w8ShiftR 12 .&. 0x3F)) |
100 | | - : (0x80 .|. (w8ShiftR 6 .&. 0x3F)) |
101 | | - : (0x80 .|. (w8 .&. 0x3F)) |
102 | | - : encodeStringUtf8 cs |
103 | | - where |
104 | | - w8 = fromIntegral (ord c) :: Word8 |
105 | | - w8ShiftR :: Int -> Word8 |
106 | | - w8ShiftR = fromIntegral . shiftR (ord c) |
107 | | - |
108 | 8 | -- @since 3.8.0.0 |
109 | 9 | trim :: String -> String |
110 | 10 | trim = dropWhile isSpace . dropWhileEnd isSpace |
0 commit comments