Fix Issues #20, #40, and #45 by using the same escaping as Protobuf d… (#48)

mikegunter · judah · commit 093d3a57d034 · 2016-09-15T11:50:55.000-07:00
* Fix Issues #20, #40, and #45 by using the same escaping as the Protobuf distribution. The Unicode escaping that the Protocol Buffer distribution supports is not implemented for now. Add tests.
diff --git a/proto-lens-tests/proto-lens-tests.cabal b/proto-lens-tests/proto-lens-tests.cabal
@@ -127,13 +127,15 @@ Test-Suite text_format_test
   hs-source-dirs: tests
   build-depends: HUnit
                , base
+               , bytestring
                , lens-family
                , pretty
                , proto-lens
                , proto-lens-protoc
                , proto-lens-tests
                , test-framework
                , test-framework-hunit
+               , text
 
 Test-Suite enum_test
   default-language: Haskell2010
diff --git a/proto-lens-tests/tests/canonical.proto b/proto-lens-tests/tests/canonical.proto
@@ -19,3 +19,8 @@ message Test3 {
 message Test4 {
     repeated int32 d = 4 [packed=true];
 }
+
+message Test5 {
+    required bytes e = 1;
+}
+
diff --git a/proto-lens-tests/tests/proto3_test.hs b/proto-lens-tests/tests/proto3_test.hs
@@ -55,7 +55,7 @@ main = testMain
             $ tagged 3 $ Fixed32 0x40d55555
         , serializeTo "bytes"
             (def & d .~ "a\0b" :: Foo)
-            "d: \"a\\NULb\""
+            "d: \"a\\000b\""
             $ tagged 4 $ Lengthy "a\0b"
         -- Scalar "oneof" fields should have a "maybe" selector.
         , testCase "maybe" $ do
diff --git a/proto-lens-tests/tests/raw_fields_test.hs b/proto-lens-tests/tests/raw_fields_test.hs
@@ -56,6 +56,7 @@ main = testMain
     , testDouble
     , testBool
     , testString
+    , testUnicode
     , testBytes
     , testFailedDecoding
     ]
@@ -220,17 +221,26 @@ testString = testRawValues "string" h
      , ("longer", "abcde")
      -- stress-test the encoding of the length
      , ("very long", Text.replicate 12345 "x")
-     , ("unicode-char", "α")
-     , ("unicode-string", "aαbβcαβ")
      ] :: [(String, Text)])
 
+-- Non-ASCII text in the string field @h@: the expected text-format rendering
+-- spells each UTF-8 byte as a three-digit octal escape (e.g. "α" is the bytes
+-- 0xCE 0xB1, written "\316\261"), while the expected encoded form is built
+-- from the UTF-8 bytes of the same value.
+testUnicode = testGroup "unicode"
+    [ test "unicode-char"   "α"       "h: \"\\316\\261\""
+    , test "unicode-string" "aαbβcαβ"
+           "h: \"a\\316\\261b\\316\\262c\\316\\261\\316\\262\""
+    ]
+  where
+     -- One case: set @h@ on a default 'Raw' message and compare against the
+     -- given text rendering and the tagged (field 8), length-delimited UTF-8
+     -- payload.
+     test name value text =
+         serializeTo name ((def :: Raw) & h .~ value) text
+                     ((tagged 8 . Lengthy . byteString . encodeUtf8) value)
+
+
 testBytes = testRawValues "bytes" i
     (keyed "i")
     (tagged 9 . Lengthy . byteString)
     (fmap (second B.pack)
         [ ("empty", [])
-        , ("small", [42])
-        , ("longer", [1..10])
+        , ("small", [42])       -- Chosen to be ASCII.
+        , ("longer", [42..52])  -- Chosen to be ASCII.
         -- stress-test the encoding of the length
         , ("very long", replicate 12345 42)
         ])
diff --git a/proto-lens-tests/tests/text_format_test.hs b/proto-lens-tests/tests/text_format_test.hs
@@ -7,6 +7,11 @@
 {-# LANGUAGE OverloadedStrings #-}
 module Main where
 
+import qualified Data.ByteString
+import Data.Char (ord)
+import Data.Monoid ((<>))
+import qualified Data.Text.Lazy
+import Data.Word (Word8)
 import Data.ProtoLens (
     def, Message, showMessage, showMessageShort, pprintMessage)
 import Lens.Family2 ((&), (.~))
@@ -29,6 +34,9 @@ def3 = def
 def4 :: Test4
 def4 = def
 
+def5 :: Test5
+def5 = def
+
 failed1 :: Maybe Test1
 failed1 = Nothing
 
@@ -57,8 +65,46 @@ main = testMain
     , testCase "Render multiple lines" $
         "d: 1\nd: 2\nd: 3" @=?
             showMessageWithLineLength 3 (def4 & d .~ [1, 2, 3])
+    , readFrom
+         ("Parse string with numeric escape sequences"
+             ++ " (including ones we do not emit)")
+          -- '\o172' == '\x7a' == 'z'
+         (Just $ def2 & b .~ "\o1\o12\o123\x2\o172z3z3")
+         (Data.Text.Lazy.pack "b: \"\\001\\012\\123\\002\\172\\x7a3\\1723\"")
+    , readFrom
+         ("Parse string with non-numeric escape sequences"
+             ++ " (including ones we do not emit)")
+         (Just $ def2 & b .~ "\a\b\f\n\r\t\v\\\'\"")
+         (Data.Text.Lazy.pack "b: \"\a\b\f\n\r\t\v\\\\\\\'\\\"\"")
+    , testCase "Render string with escape sequences" $
+        escapeRendered @=? showMessageShort escapeMessage
+    , readFrom "Parse rendered string with escape sequences"
+               (Just escapeMessage) (Data.Text.Lazy.pack escapeRendered)
+    , testCase "Render bytes" $
+         invalidUTF8BytesRendered @=? showMessage invalidUTF8BytesMessage
+    , readFrom "Parse single-quote-delimited string"
+         (Just $ def2 & b .~ "ab\o2") "b: \'ab\2\'"
+    , readFrom "Non-UTF8 bytes"
+         (Just invalidUTF8BytesMessage)
+         (Data.Text.Lazy.pack invalidUTF8BytesRendered)
     , let kNums = [0..99]  -- The default line limit is 100 so we exceed it.
           kExpected = unwords $ map (("d: " ++) . show) kNums
       in testCase "Render single line for debugString" $
           kExpected @=? showMessageShort (def4 & d .~ kNums)
     ]
+  where
+    escapeMessage  = def2 & b
+        .~ ("a\r\n\t\"\'\\" <> "bc\o030" <> "1" <> "\o109" <> "¢" <> "\o1")
+    escapeRendered =
+        -- 'a' followed by all the non-numeric escapes we emit:
+        "b: \"a\\r\\n\\t\\\"\\\'\\\\"
+        ++ "bc\\0301"      -- The last digit is a separate character, not part
+                           -- of the escape.
+        ++ "\\010" ++ "9"  -- Note that the 9 is a separate character
+        ++ "\\302\\242"    -- UTF-8 for the cent symbol, '¢'.
+        ++ "\\001"         -- Works fine at EOL.
+        ++ "\""
+    invalidUTF8BytesMessage =
+        def5 & e .~ Data.ByteString.pack (map (fromIntegral . ord) "abc"
+            ++ [0xC0, 0xC0, 0x0])  -- Invalid UTF8.
+    invalidUTF8BytesRendered = "e: \"abc\\300\\300\\000\""
diff --git a/proto-lens/src/Data/ProtoLens/Encoding.hs b/proto-lens/src/Data/ProtoLens/Encoding.hs
@@ -223,4 +223,3 @@ integralFieldWireType w = simpleFieldWireType w fromIntegral fromIntegral
 stringizeError :: Either UnicodeException a -> Either String a
 stringizeError (Left e) = Left (show e)
 stringizeError (Right a) = Right a
-
diff --git a/proto-lens/src/Data/ProtoLens/TextFormat.hs b/proto-lens/src/Data/ProtoLens/TextFormat.hs
@@ -20,13 +20,15 @@ module Data.ProtoLens.TextFormat(
 import Lens.Family2 ((&),(^.),(.~), set, over)
 import Control.Applicative ((<$>))
 import Control.Arrow (left)
-import qualified Data.ByteString.Char8 as B
+import qualified Data.ByteString
+import Data.Char (isPrint, isAscii, chr)
 import Data.Foldable (foldlM, foldl')
 import Data.Maybe (catMaybes)
 import qualified Data.Map as Map
 import qualified Data.Set as Set
-import qualified Data.Text as Text
+import qualified Data.Text.Encoding as Text
 import qualified Data.Text.Lazy as Lazy
+import Numeric (showOct)
 import Text.Parsec (parse)
 import Text.PrettyPrint
 
@@ -102,11 +104,35 @@ pprintFieldValue name SFixed64Field x = primField name x
 pprintFieldValue name FloatField x = primField name x
 pprintFieldValue name DoubleField x = primField name x
 pprintFieldValue name BoolField x = text name <> colon <+> boolValue x
-pprintFieldValue name StringField x = primField name x
-pprintFieldValue name BytesField x = primField name x
+pprintFieldValue name StringField x = pprintByteString name (Text.encodeUtf8 x)
+pprintFieldValue name BytesField x = pprintByteString name x
 pprintFieldValue name GroupField m
     = text name <+> lbrace $$ nest 2 (pprintMessage m) $$ rbrace
 
+-- | Renders a field whose payload is a sequence of bytes as
+-- @name: "escaped"@, using escaping that mostly matches the C-compatible
+-- escaping of the Protocol Buffer distribution.  Unlike the distribution,
+-- which may leave non-ASCII characters alone depending on the locale, every
+-- non-ASCII byte is escaped here.
+--
+-- The only escapes emitted are \n, \r, \t, \', \", \\ and three-digit octal
+-- escapes such as "\011".  Note that Haskell string-literal syntax reads
+-- "\011" as decimal 11 rather than the decimal 9 it represents here, so
+-- Prelude.read cannot be used to parse the strings created by this function.
+pprintByteString :: String -> Data.ByteString.ByteString -> Doc
+pprintByteString name x =
+    text name <> colon <+> char '\"' <> text escaped <> char '\"'
+  where
+    escaped = concatMap escapeByte (Data.ByteString.unpack x)
+    -- Named escapes first, then printable ASCII verbatim, then octal.
+    escapeByte w8 = case chr (fromIntegral w8) of
+        '\n' -> "\\n"
+        '\r' -> "\\r"
+        '\t' -> "\\t"
+        '\"' -> "\\\""
+        '\'' -> "\\\'"
+        '\\' -> "\\\\"
+        ch | isPrint ch && isAscii ch -> [ch]
+           | otherwise                -> '\\' : zeroPad (showOct w8 "")
+    -- Left-pad the octal digits with zeros to exactly three characters.
+    zeroPad digits = replicate (3 - length digits) '0' ++ digits
+
+
 primField :: Show value => String -> value -> Doc
 primField name x = text name <> colon <+> text (show x)
 
@@ -202,8 +228,8 @@ makeValue BoolField (Parser.EnumValue x)
     | x == "true" = Right True
     | x == "false" = Right False
     | otherwise = Left $ "Unrecognized bool value " ++ show x
-makeValue StringField (Parser.StringValue x) = Right (Text.pack x)
-makeValue BytesField (Parser.StringValue x) = Right (B.pack x)
+-- Decode with 'Text.decodeUtf8'' so that bytes which are not valid UTF-8
+-- (for example those produced by an octal escape such as "\300") are reported
+-- as a 'Left' instead of raising an exception from pure code.
+makeValue StringField (Parser.ByteStringValue x) =
+    left show (Text.decodeUtf8' x)
+makeValue BytesField (Parser.ByteStringValue x) = Right x
 makeValue EnumField (Parser.IntValue x) =
     maybe (Left $ "Unrecognized enum value " ++ show x) Right
         (maybeToEnum $ fromInteger x)
diff --git a/proto-lens/src/Data/ProtoLens/TextFormat/Parser.hs b/proto-lens/src/Data/ProtoLens/TextFormat/Parser.hs
@@ -14,13 +14,19 @@ module Data.ProtoLens.TextFormat.Parser
     , parser
     ) where
 
-import Data.List (intercalate)
+import Data.ByteString (ByteString, pack)
+import Data.Char (ord)
 import Data.Functor.Identity (Identity)
+import Data.List (intercalate)
+import Data.Maybe (catMaybes)
 import Data.Text.Lazy (Text)
-import Text.Parsec.Char (alphaNum, char, letter, oneOf)
+import Data.Word (Word8)
+import Numeric (readOct, readHex)
+import Text.Parsec.Char
+  (alphaNum, char, hexDigit, letter, octDigit, oneOf, satisfy)
 import Text.Parsec.Text.Lazy (Parser)
-import Text.Parsec.Combinator (eof, sepBy1, many1, choice)
-import Text.Parsec.Token
+import Text.Parsec.Combinator (choice, eof, many1, optionMaybe, sepBy1)
+import Text.Parsec.Token hiding (octal)
 import Control.Applicative ((<*), (<|>), (*>), many)
 import Control.Monad (liftM, liftM2, mzero)
 
@@ -60,7 +66,7 @@ data Key = Key String  -- ^ A standard key that is just a string.
 
 data Value = IntValue Integer  -- ^ An integer
   | DoubleValue Double  -- ^ Any floating point number
-  | StringValue String  -- ^ A string literal
+  | ByteStringValue ByteString    -- ^ A string or bytes literal
   | MessageValue Message  -- ^ A sub message
   | EnumValue String  -- ^ Any undelimited string (including false & true)
   deriving (Show,Ord,Eq)
@@ -91,7 +97,8 @@ parser = whiteSpace ptp *> parseMessage <* eof
         negative <- (symbol ptp "-" >> return True) <|> return False
         value <- naturalOrFloat ptp
         return $ makeNumberValue negative value
-    parseString = liftM (StringValue . concat) . many1 $ stringLiteral ptp
+    parseString = liftM (ByteStringValue . mconcat)
+        $ many1 $ lexeme ptp $ protoStringLiteral
     parseEnumValue = liftM EnumValue (identifier ptp)
     parseMessageValue = liftM MessageValue
         (braces ptp parseMessage <|> angles ptp parseMessage)
@@ -101,3 +108,50 @@ parser = whiteSpace ptp *> parseMessage <* eof
     makeNumberValue False (Left intValue) = IntValue intValue
     makeNumberValue True (Right doubleValue) = DoubleValue (negate doubleValue)
     makeNumberValue False (Right doubleValue) = DoubleValue doubleValue
+
+-- | Reads a literal string the way the Protocol Buffer distribution's
+-- tokenizer.cc does.  This differs from Haskell string literals in treating,
+-- e.g. "\11" as octal instead of decimal, so reading as 9 instead of 11.  Also,
+-- like tokenizer.cc we assume octal and hex escapes can have at most three and
+-- two digits, respectively.
+--
+-- The literal may be delimited by single or double quotes; the closing quote
+-- must match the opening one.  Characters written without an escape are stored
+-- as their UTF-8 encoding (the input is already-decoded text), while every
+-- escape sequence denotes exactly one raw byte.
+--
+-- TODO: implement reading of Unicode escapes.
+protoStringLiteral :: Parser ByteString
+protoStringLiteral = do
+    initialQuoteChar <- char '\'' <|> char '\"'
+    chunks <- many stringChar
+    _ <- char initialQuoteChar
+    return $ pack $ concat chunks
+  where
+    -- One source-level character or escape, as the bytes it stands for.
+    stringChar :: Parser [Word8]
+    stringChar = nonEscape <|> fmap (: []) stringEscape
+    nonEscape :: Parser [Word8]
+    nonEscape  = fmap (utf8 . ord)
+        $ satisfy (\c -> c `notElem` "\\\'\"")
+    -- UTF-8 encoding of a code point, using arithmetic only.
+    utf8 :: Int -> [Word8]
+    utf8 n
+        | n < 0x80    = [fromIntegral n]
+        | n < 0x800   = [lead 0xC0 6, cont 0]
+        | n < 0x10000 = [lead 0xE0 12, cont 6, cont 0]
+        | otherwise   = [lead 0xF0 18, cont 12, cont 6, cont 0]
+      where
+        -- Leading byte: the length tag plus the bits above the given shift.
+        lead :: Int -> Int -> Word8
+        lead tag shift = fromIntegral (tag + (n `div` (2 ^ shift)))
+        -- Continuation byte: 0b10xxxxxx carrying six bits at the given shift.
+        cont :: Int -> Word8
+        cont shift = fromIntegral (0x80 + ((n `div` (2 ^ shift)) `mod` 0x40))
+    stringEscape :: Parser Word8
+    stringEscape = char '\\' >> (octal <|> hex <|> unicode <|> simple)
+    octal = do d0 <- octDigit
+               d1 <- optionMaybe octDigit
+               d2 <- optionMaybe octDigit
+               readMaybeDigits readOct [Just d0, d1, d2]
+    -- Converts the digits that were present into a byte.  The digits have
+    -- already been validated by the parser, so the fallback is only a
+    -- safeguard that turns an unexpected read result into a parse error
+    -- rather than a pattern-match failure.
+    readMaybeDigits :: ReadS Word8 -> [Maybe Char] -> Parser Word8
+    readMaybeDigits reader digits =
+        case reader (catMaybes digits) of
+            [(v, "")] -> return v
+            _         -> fail "Malformed numeric escape in string literal"
+    hex = do _ <- oneOf "xX"
+             d0 <- hexDigit
+             d1 <- optionMaybe hexDigit
+             readMaybeDigits readHex [Just d0, d1]
+    unicode = oneOf "uU" >> fail "Unicode in string literals not yet supported"
+    simple = choice $ map charRet [ ('a', '\a')
+                                  , ('b', '\b')
+                                  , ('f', '\f')
+                                  , ('n', '\n')
+                                  , ('r', '\r')
+                                  , ('t', '\t')
+                                  , ('v', '\v')
+                                  , ('\\', '\\')
+                                  , ('\'', '\'')
+                                  , ('\"', '\"')
+                                  ]
+      where
+        -- Matches the escape letter and yields the byte it denotes.
+        charRet :: (Char, Char) -> Parser Word8
+        charRet (escapeCh, ch) = do _ <- char escapeCh
+                                    return $ fromIntegral $ ord ch

Original file line number	Diff line number	Diff line change
`@@ -19,3 +19,8 @@ message Test3 {`
`19`	`19`	`message Test4 {`
`20`	`20`	`repeated int32 d = 4 [packed=true];`
`21`	`21`	`}`
	`22`	`+`
	`23`	`+message Test5 {`
	`24`	`+ required bytes e = 1;`
	`25`	`+}`
	`26`	`+`