Skip to content

Commit 093d3a5

Browse files
mikegunterjudah
authored andcommitted
Fix Issues #20, #40, and #45 by using the same escaping as Protobuf d… (#48)
* Fix Issues #20, #40, and #45 by using the same escaping as Protobuf distribution The Unicode escaping that the Protocol Buffer distribution supports is not implemented for now. Add tests.
1 parent a83de6d commit 093d3a5

File tree

8 files changed

+160
-18
lines changed

8 files changed

+160
-18
lines changed

proto-lens-tests/proto-lens-tests.cabal

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,13 +127,15 @@ Test-Suite text_format_test
127127
hs-source-dirs: tests
128128
build-depends: HUnit
129129
, base
130+
, bytestring
130131
, lens-family
131132
, pretty
132133
, proto-lens
133134
, proto-lens-protoc
134135
, proto-lens-tests
135136
, test-framework
136137
, test-framework-hunit
138+
, text
137139

138140
Test-Suite enum_test
139141
default-language: Haskell2010

proto-lens-tests/tests/canonical.proto

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,8 @@ message Test3 {
1919
message Test4 {
2020
repeated int32 d = 4 [packed=true];
2121
}
22+
23+
message Test5 {
24+
required bytes e = 1;
25+
}
26+

proto-lens-tests/tests/proto3_test.hs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ main = testMain
5555
$ tagged 3 $ Fixed32 0x40d55555
5656
, serializeTo "bytes"
5757
(def & d .~ "a\0b" :: Foo)
58-
"d: \"a\\NULb\""
58+
"d: \"a\\000b\""
5959
$ tagged 4 $ Lengthy "a\0b"
6060
-- Scalar "oneof" fields should have a "maybe" selector.
6161
, testCase "maybe" $ do

proto-lens-tests/tests/raw_fields_test.hs

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ main = testMain
5656
, testDouble
5757
, testBool
5858
, testString
59+
, testUnicode
5960
, testBytes
6061
, testFailedDecoding
6162
]
@@ -220,17 +221,26 @@ testString = testRawValues "string" h
220221
, ("longer", "abcde")
221222
-- stress-test the encoding of the length
222223
, ("very long", Text.replicate 12345 "x")
223-
, ("unicode-char", "α")
224-
, ("unicode-string", "aαbβcαβ")
225224
] :: [(String, Text)])
226225

226+
testUnicode = testGroup "unicode"
227+
[ test "unicode-char" "α" "h: \"\\316\\261\""
228+
, test "unicode-string" "aαbβcαβ"
229+
"h: \"a\\316\\261b\\316\\262c\\316\\261\\316\\262\""
230+
]
231+
where
232+
test name value text =
233+
serializeTo name ((def :: Raw) & h .~ value) text
234+
((tagged 8 . Lengthy . byteString . encodeUtf8) value)
235+
236+
227237
testBytes = testRawValues "bytes" i
228238
(keyed "i")
229239
(tagged 9 . Lengthy . byteString)
230240
(fmap (second B.pack)
231241
[ ("empty", [])
232-
, ("small", [42])
233-
, ("longer", [1..10])
242+
, ("small", [42]) -- Chosen to be ASCII.
243+
, ("longer", [42..52]) -- Chosen to be ASCII.
234244
-- stress-test the encoding of the length
235245
, ("very long", replicate 12345 42)
236246
])

proto-lens-tests/tests/text_format_test.hs

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@
77
{-# LANGUAGE OverloadedStrings #-}
88
module Main where
99

10+
import qualified Data.ByteString
11+
import Data.Char (ord)
12+
import Data.Monoid ((<>))
13+
import qualified Data.Text.Lazy
14+
import Data.Word (Word8)
1015
import Data.ProtoLens (
1116
def, Message, showMessage, showMessageShort, pprintMessage)
1217
import Lens.Family2 ((&), (.~))
@@ -29,6 +34,9 @@ def3 = def
2934
def4 :: Test4
3035
def4 = def
3136

37+
def5 :: Test5
38+
def5 = def
39+
3240
failed1 :: Maybe Test1
3341
failed1 = Nothing
3442

@@ -57,8 +65,46 @@ main = testMain
5765
, testCase "Render multiple lines" $
5866
"d: 1\nd: 2\nd: 3" @=?
5967
showMessageWithLineLength 3 (def4 & d .~ [1, 2, 3])
68+
, readFrom
69+
("Parse string with numeric escape sequences"
70+
++ " (including ones we do not emit)")
71+
-- '\o172' == '\x7a' == 'z'
72+
(Just $ def2 & b .~ "\o1\o12\o123\x2\o172z3z3")
73+
(Data.Text.Lazy.pack "b: \"\\001\\012\\123\\002\\172\\x7a3\\1723\"")
74+
, readFrom
75+
("Parse string with non-numeric escape sequences"
76+
++ " (including ones we do not emit)")
77+
(Just $ def2 & b .~ "\a\b\f\n\r\t\v\\\'\"")
78+
(Data.Text.Lazy.pack "b: \"\a\b\f\n\r\t\v\\\\\\\'\\\"\"")
79+
, testCase "Render string with escape sequences" $
80+
escapeRendered @=? showMessageShort escapeMessage
81+
, readFrom "Parse rendered string with escape sequences"
82+
(Just escapeMessage) (Data.Text.Lazy.pack escapeRendered)
83+
, testCase "Render bytes" $
84+
invalidUTF8BytesRendered @=? showMessage invalidUTF8BytesMessage
85+
, readFrom "Parse single-quote-delimited string"
86+
(Just $ def2 & b .~ "ab\o2") "b: \'ab\2\'"
87+
, readFrom "Non-UTF8 bytes"
88+
(Just invalidUTF8BytesMessage)
89+
(Data.Text.Lazy.pack invalidUTF8BytesRendered)
6090
, let kNums = [0..99] -- The default line limit is 100 so we exceed it.
6191
kExpected = unwords $ map (("d: " ++) . show) kNums
6292
in testCase "Render single line for debugString" $
6393
kExpected @=? showMessageShort (def4 & d .~ kNums)
6494
]
95+
where
96+
escapeMessage = def2 & b
97+
.~ ("a\r\n\t\"\'\\" <> "bc\o030" <> "1" <> "\o109" <> "¢" <> "\o1")
98+
escapeRendered =
99+
-- 'a' followed by all the non-numeric escapes we emit:
100+
"b: \"a\\r\\n\\t\\\"\\\'\\\\"
101+
++ "bc\\0301" -- The last digit is a separate character, not part
102+
-- of the escape.
103+
++ "\\010" ++ "9" -- Note that the 9 is a separate character
104+
++ "\\302\\242" -- UTF-8 for the cent symbol, '¢'.
105+
++ "\\001" -- Works fine at EOL.
106+
++ "\""
107+
invalidUTF8BytesMessage =
108+
def5 & e .~ Data.ByteString.pack (map (fromIntegral . ord) "abc"
109+
++ [0xC0, 0xC0, 0x0]) -- Invalid UTF8.
110+
invalidUTF8BytesRendered = "e: \"abc\\300\\300\\000\""

proto-lens/src/Data/ProtoLens/Encoding.hs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,4 +223,3 @@ integralFieldWireType w = simpleFieldWireType w fromIntegral fromIntegral
223223
stringizeError :: Either UnicodeException a -> Either String a
224224
stringizeError (Left e) = Left (show e)
225225
stringizeError (Right a) = Right a
226-

proto-lens/src/Data/ProtoLens/TextFormat.hs

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,15 @@ module Data.ProtoLens.TextFormat(
2020
import Lens.Family2 ((&),(^.),(.~), set, over)
2121
import Control.Applicative ((<$>))
2222
import Control.Arrow (left)
23-
import qualified Data.ByteString.Char8 as B
23+
import qualified Data.ByteString
24+
import Data.Char (isPrint, isAscii, chr)
2425
import Data.Foldable (foldlM, foldl')
2526
import Data.Maybe (catMaybes)
2627
import qualified Data.Map as Map
2728
import qualified Data.Set as Set
28-
import qualified Data.Text as Text
29+
import qualified Data.Text.Encoding as Text
2930
import qualified Data.Text.Lazy as Lazy
31+
import Numeric (showOct)
3032
import Text.Parsec (parse)
3133
import Text.PrettyPrint
3234

@@ -102,11 +104,35 @@ pprintFieldValue name SFixed64Field x = primField name x
102104
pprintFieldValue name FloatField x = primField name x
103105
pprintFieldValue name DoubleField x = primField name x
104106
pprintFieldValue name BoolField x = text name <> colon <+> boolValue x
105-
pprintFieldValue name StringField x = primField name x
106-
pprintFieldValue name BytesField x = primField name x
107+
pprintFieldValue name StringField x = pprintByteString name (Text.encodeUtf8 x)
108+
pprintFieldValue name BytesField x = pprintByteString name x
107109
pprintFieldValue name GroupField m
108110
= text name <+> lbrace $$ nest 2 (pprintMessage m) $$ rbrace
109111

112+
-- | Formats a string in a way that mostly matches the C-compatible escaping
113+
-- used by the Protocol Buffer distribution. We depart a bit by escaping all
114+
-- non-ASCII characters, which depending on the locale, the distribution might
115+
-- not do.
116+
--
117+
-- This uses three-digit octal escapes, e.g. "\011" plus \n, \r,, \t, \', \",
118+
-- and \\ only. Note that Haskell string-literal syntax calls for "\011" to be
119+
-- interpreted as decimal 11, rather than the decimal 9 it actually represent,
120+
-- so you can't use Prelude.read to parse the strings created here.
121+
pprintByteString :: String -> Data.ByteString.ByteString -> Doc
122+
pprintByteString name x = text name <> colon <+> char '\"'
123+
<> text (concatMap escape $ Data.ByteString.unpack x) <> char '\"'
124+
where escape w8 | ch == '\n' = "\\n"
125+
| ch == '\r' = "\\r"
126+
| ch == '\t' = "\\t"
127+
| ch == '\"' = "\\\""
128+
| ch == '\'' = "\\\'"
129+
| ch == '\\' = "\\\\"
130+
| isPrint ch && isAscii ch = ch : ""
131+
| otherwise = "\\" ++ pad (showOct w8 "")
132+
where
133+
ch = chr $ fromIntegral w8
134+
pad str = replicate (3 - length str) '0' ++ str
135+
110136
primField :: Show value => String -> value -> Doc
111137
primField name x = text name <> colon <+> text (show x)
112138

@@ -202,8 +228,8 @@ makeValue BoolField (Parser.EnumValue x)
202228
| x == "true" = Right True
203229
| x == "false" = Right False
204230
| otherwise = Left $ "Unrecognized bool value " ++ show x
205-
makeValue StringField (Parser.StringValue x) = Right (Text.pack x)
206-
makeValue BytesField (Parser.StringValue x) = Right (B.pack x)
231+
makeValue StringField (Parser.ByteStringValue x) = Right (Text.decodeUtf8 x)
232+
makeValue BytesField (Parser.ByteStringValue x) = Right x
207233
makeValue EnumField (Parser.IntValue x) =
208234
maybe (Left $ "Unrecognized enum value " ++ show x) Right
209235
(maybeToEnum $ fromInteger x)

proto-lens/src/Data/ProtoLens/TextFormat/Parser.hs

Lines changed: 60 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,19 @@ module Data.ProtoLens.TextFormat.Parser
1414
, parser
1515
) where
1616

17-
import Data.List (intercalate)
17+
import Data.ByteString (ByteString, pack)
18+
import Data.Char (ord)
1819
import Data.Functor.Identity (Identity)
20+
import Data.List (intercalate)
21+
import Data.Maybe (catMaybes)
1922
import Data.Text.Lazy (Text)
20-
import Text.Parsec.Char (alphaNum, char, letter, oneOf)
23+
import Data.Word (Word8)
24+
import Numeric (readOct, readHex)
25+
import Text.Parsec.Char
26+
(alphaNum, char, hexDigit, letter, octDigit, oneOf, satisfy)
2127
import Text.Parsec.Text.Lazy (Parser)
22-
import Text.Parsec.Combinator (eof, sepBy1, many1, choice)
23-
import Text.Parsec.Token
28+
import Text.Parsec.Combinator (choice, eof, many1, optionMaybe, sepBy1)
29+
import Text.Parsec.Token hiding (octal)
2430
import Control.Applicative ((<*), (<|>), (*>), many)
2531
import Control.Monad (liftM, liftM2, mzero)
2632

@@ -60,7 +66,7 @@ data Key = Key String -- ^ A standard key that is just a string.
6066

6167
data Value = IntValue Integer -- ^ An integer
6268
| DoubleValue Double -- ^ Any floating point number
63-
| StringValue String -- ^ A string literal
69+
| ByteStringValue ByteString -- ^ A string or bytes literal
6470
| MessageValue Message -- ^ A sub message
6571
| EnumValue String -- ^ Any undelimited string (including false & true)
6672
deriving (Show,Ord,Eq)
@@ -91,7 +97,8 @@ parser = whiteSpace ptp *> parseMessage <* eof
9197
negative <- (symbol ptp "-" >> return True) <|> return False
9298
value <- naturalOrFloat ptp
9399
return $ makeNumberValue negative value
94-
parseString = liftM (StringValue . concat) . many1 $ stringLiteral ptp
100+
parseString = liftM (ByteStringValue . mconcat)
101+
$ many1 $ lexeme ptp $ protoStringLiteral
95102
parseEnumValue = liftM EnumValue (identifier ptp)
96103
parseMessageValue = liftM MessageValue
97104
(braces ptp parseMessage <|> angles ptp parseMessage)
@@ -101,3 +108,50 @@ parser = whiteSpace ptp *> parseMessage <* eof
101108
makeNumberValue False (Left intValue) = IntValue intValue
102109
makeNumberValue True (Right doubleValue) = DoubleValue (negate doubleValue)
103110
makeNumberValue False (Right doubleValue) = DoubleValue doubleValue
111+
112+
-- | Reads a literal string the way the Protocol Buffer distribution's
113+
-- tokenizer.cc does. This differs from Haskell string literals in treating,
114+
-- e.g. "\11" as octal instead of decimal, so reading as 9 instead of 11. Also,
115+
-- like tokenizer.cc we assume octal and hex escapes can have at most three and
116+
-- two digits, respectively.
117+
--
118+
-- TODO: implement reading of Unicode escapes.
119+
protoStringLiteral :: Parser ByteString
120+
protoStringLiteral = do
121+
initialQuoteChar <- char '\'' <|> char '\"'
122+
word8s <- many stringChar
123+
_ <- char initialQuoteChar
124+
return $ pack word8s
125+
where
126+
stringChar :: Parser Word8
127+
stringChar = nonEscape <|> stringEscape
128+
nonEscape = fmap (fromIntegral . ord)
129+
$ satisfy (\c -> c `notElem` "\\\'\"" && ord c < 256)
130+
stringEscape = char '\\' >> (octal <|> hex <|> unicode <|> simple)
131+
octal = do d0 <- octDigit
132+
d1 <- optionMaybe octDigit
133+
d2 <- optionMaybe octDigit
134+
readMaybeDigits readOct [Just d0, d1, d2]
135+
readMaybeDigits :: ReadS Word8 -> [Maybe Char] -> Parser Word8
136+
readMaybeDigits reader
137+
= return . (\str -> let [(v, "")] = reader str in v) . catMaybes
138+
hex = do _ <- oneOf "xX"
139+
d0 <- hexDigit
140+
d1 <- optionMaybe hexDigit
141+
readMaybeDigits readHex [Just d0, d1]
142+
unicode = oneOf "uU" >> fail "Unicode in string literals not yet supported"
143+
simple = choice $ map charRet [ ('a', '\a')
144+
, ('b', '\b')
145+
, ('f', '\f')
146+
, ('n', '\n')
147+
, ('r', '\r')
148+
, ('t', '\t')
149+
, ('v', '\v')
150+
, ('\\', '\\')
151+
, ('\'', '\'')
152+
, ('\"', '\"')
153+
]
154+
where
155+
charRet :: (Char, Char) -> Parser Word8
156+
charRet (escapeCh, ch) = do _ <- char escapeCh
157+
return $ fromIntegral $ ord ch

0 commit comments

Comments
 (0)