rust-lang · bors · Jul 22, 2014 · Jul 14, 2014 · Jul 14, 2014 · Jul 14, 2014
diff --git a/Makefile.in b/Makefile.in
@@ -216,6 +216,7 @@ ifneq ($(strip $(findstring check,$(MAKECMDGOALS)) \
                $(findstring tidy,$(MAKECMDGOALS))),)
   CFG_INFO := $(info cfg: including test rules)
   include $(CFG_SRC_DIR)mk/tests.mk
+  include $(CFG_SRC_DIR)mk/grammar.mk
 endif
 
 # Performance and benchmarking

diff --git a/configure b/configure
@@ -493,6 +493,9 @@ probe CFG_VALGRIND         valgrind
 probe CFG_PERF             perf
 probe CFG_ISCC             iscc
 probe CFG_LLNEXTGEN        LLnextgen
+probe CFG_JAVAC            javac
+probe CFG_ANTLR4           antlr4
+probe CFG_GRUN             grun
 probe CFG_PANDOC           pandoc
 probe CFG_PDFLATEX         pdflatex
 probe CFG_XELATEX          xelatex

diff --git a/mk/grammar.mk b/mk/grammar.mk
@@ -0,0 +1,55 @@
+# Copyright 2014 The Rust Project Developers. See the COPYRIGHT
+# file at the top-level directory of this distribution and at
+# http://rust-lang.org/COPYRIGHT.
+#
+# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+# option. This file may not be copied, modified, or distributed
+# except according to those terms.
+
+BG = $(CFG_BUILD_DIR)/grammar/
+SG = $(S)src/grammar/
+B = $(CFG_BUILD_DIR)/$(CFG_BUILD)/stage2/
+L = $(B)lib/rustlib/$(CFG_BUILD)/lib
+LD = $(CFG_BUILD)/stage2/lib/rustlib/$(CFG_BUILD)/lib/
+RUSTC = $(B)bin/rustc
+
+# Run the reference lexer against libsyntax and compare the tokens and spans.
+# If "// ignore-lexer-test" is present in the file, it will be ignored.
+#
+# $(1) is the file to test.
+define LEXER_TEST
+grep "// ignore-lexer-test" $(1) ; \
+  if [ $$? -eq 1 ]; then \
+   CLASSPATH=$(B)grammar $(CFG_GRUN) RustLexer tokens -tokens < $(1) \
+   | $(B)grammar/verify $(1) ; \
+  fi
+endef
+
+$(BG):
+	$(Q)mkdir -p $(BG)
+
+$(BG)RustLexer.class: $(SG)RustLexer.g4
+	$(Q)$(CFG_ANTLR4) -o $(B)grammar $(SG)RustLexer.g4
+	$(Q)$(CFG_JAVAC) -d $(BG) $(BG)RustLexer.java
+
+$(BG)verify: $(SG)verify.rs rustc-stage2-H-$(CFG_BUILD) $(LD)stamp.regex_macros $(LD)stamp.rustc
+	$(Q)$(RUSTC) -O --out-dir $(BG) -L $(L) $(SG)verify.rs
+
+check-lexer: $(BG) $(BG)RustLexer.class $(BG)verify
+ifdef CFG_JAVAC
+ifdef CFG_ANTLR4
+ifdef CFG_GRUN
+	$(info Verifying libsyntax against the reference lexer ...)
+	$(Q)$(SG)check.sh $(S) "$(BG)" \
+		"$(CFG_GRUN)" "$(BG)verify" "$(BG)RustLexer.tokens"
+else
+$(info grun not available, skipping lexer test...)
+endif
+else
+$(info antlr4 not available, skipping lexer test...)
+endif
+else
+$(info javac not available, skipping lexer test...)
+endif
diff --git a/mk/tests.mk b/mk/tests.mk
@@ -192,6 +192,8 @@ check-docs: cleantestlibs cleantmptestlogs check-stage2-docs
 # NOTE: Remove after reprogramming windows bots
 check-fast: check-lite
 
+check-syntax: check-lexer
+
 .PHONY: cleantmptestlogs cleantestlibs
 
 cleantmptestlogs:

diff --git a/src/grammar/.gitignore b/src/grammar/.gitignore
@@ -0,0 +1,4 @@
+verify
+*.class
+*.java
+*.tokens
diff --git a/src/grammar/README.md b/src/grammar/README.md
@@ -0,0 +1,20 @@
+Reference grammar.
+
+Uses [antlr4](http://www.antlr.org/) and a custom Rust tool to compare
+ASTs/token streams generated. You can use the `check-syntax` make target to
+run all of the available tests.
+
+To use manually:
+
+```
+antlr4 RustLexer.g4
+javac *.java
+rustc -O verify.rs
+for file in ../*/**.rs; do
+    echo $file;
+    grun RustLexer tokens -tokens < $file | ./verify $file || break
+done
+```
+
+Note That the `../*/**.rs` glob will match every `*.rs` file in the above
+directory and all of its recursive children. This is a zsh extension.
diff --git a/src/grammar/RustLexer.g4 b/src/grammar/RustLexer.g4
@@ -0,0 +1,170 @@
+lexer grammar RustLexer;
+
+tokens {
+    EQ, LT, LE, EQEQ, NE, GE, GT, ANDAND, OROR, NOT, TILDE, PLUT,
+    MINUS, STAR, SLASH, PERCENT, CARET, AND, OR, SHL, SHR, BINOP,
+    BINOPEQ, AT, DOT, DOTDOT, DOTDOTDOT, COMMA, SEMI, COLON,
+    MOD_SEP, RARROW, FAT_ARROW, LPAREN, RPAREN, LBRACKET, RBRACKET,
+    LBRACE, RBRACE, POUND, DOLLAR, UNDERSCORE, LIT_CHAR,
+    LIT_INTEGER, LIT_FLOAT, LIT_STR, LIT_STR_RAW, LIT_BINARY,
+    LIT_BINARY_RAW, IDENT, LIFETIME, WHITESPACE, DOC_COMMENT,
+    COMMENT
+}
+
+/* Note: due to antlr limitations, we can't represent XID_start and
+ * XID_continue properly. ASCII-only substitute. */
+
+fragment XID_start : [_a-zA-Z] ;
+fragment XID_continue : [_a-zA-Z0-9] ;
+
+
+/* Expression-operator symbols */
+
+EQ      : '=' ;
+LT      : '<' ;
+LE      : '<=' ;
+EQEQ    : '==' ;
+NE      : '!=' ;
+GE      : '>=' ;
+GT      : '>' ;
+ANDAND  : '&&' ;
+OROR    : '||' ;
+NOT     : '!' ;
+TILDE   : '~' ;
+PLUS    : '+' ;
+MINUS   : '-' ;
+STAR    : '*' ;
+SLASH   : '/' ;
+PERCENT : '%' ;
+CARET   : '^' ;
+AND     : '&' ;
+OR      : '|' ;
+SHL     : '<<' ;
+SHR     : '>>' ;
+
+BINOP
+    : PLUS
+    | SLASH
+    | MINUS
+    | STAR
+    | PERCENT
+    | CARET
+    | AND
+    | OR
+    | SHL
+    | SHR
+    ;
+
+BINOPEQ : BINOP EQ ;
+
+/* "Structural symbols" */
+
+AT         : '@' ;
+DOT        : '.' ;
+DOTDOT     : '..' ;
+DOTDOTDOT  : '...' ;
+COMMA      : ',' ;
+SEMI       : ';' ;
+COLON      : ':' ;
+MOD_SEP    : '::' ;
+RARROW     : '->' ;
+FAT_ARROW  : '=>' ;
+LPAREN     : '(' ;
+RPAREN     : ')' ;
+LBRACKET   : '[' ;
+RBRACKET   : ']' ;
+LBRACE     : '{' ;
+RBRACE     : '}' ;
+POUND      : '#';
+DOLLAR     : '$' ;
+UNDERSCORE : '_' ;
+
+// Literals
+
+fragment HEXIT
+  : [0-9a-fA-F]
+  ;
+
+fragment CHAR_ESCAPE
+  : [nrt\\'"0]
+  | [xX] HEXIT HEXIT
+  | 'u' HEXIT HEXIT HEXIT HEXIT
+  | 'U' HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT
+  ;
+
+LIT_CHAR
+  : '\'' ( '\\' CHAR_ESCAPE | ~[\\'\n\t\r] ) '\''
+  ;
+
+LIT_BYTE
+  : 'b\'' ( '\\' ( [xX] HEXIT HEXIT | [nrt\\'"0] ) | ~[\\'\n\t\r] ) '\''
+  ;
+
+fragment INT_SUFFIX
+  : 'i'
+  | 'i8'
+  | 'i16'
+  | 'i32'
+  | 'i64'
+  | 'u'
+  | 'u8'
+  | 'u16'
+  | 'u32'
+  | 'u64'
+  ;
+
+LIT_INTEGER
+  : [0-9][0-9_]* INT_SUFFIX?
+  | '0b' [01][01_]* INT_SUFFIX?
+  | '0o' [0-7][0-7_]* INT_SUFFIX?
+  | '0x' [0-9a-fA-F][0-9a-fA-F_]* INT_SUFFIX?
+  ;
+
+FLOAT_SUFFIX
+  : 'f32'
+  | 'f64'
+  | 'f128'
+  ;
+
+LIT_FLOAT
+  : [0-9][0-9_]* ('.' | ('.' [0-9][0-9_]*)? ([eE] [-+]? [0-9][0-9_]*)? FLOAT_SUFFIX?)
+  ;
+
+LIT_STR
+  : '"' ('\\\n' | '\\\r\n' | '\\' CHAR_ESCAPE | .)*? '"'
+  ;
+
+LIT_BINARY : 'b' LIT_STR ;
+LIT_BINARY_RAW : 'rb' LIT_STR_RAW ;
+
+/* this is a bit messy */
+
+fragment LIT_STR_RAW_INNER
+  : '"' .*? '"'
+  | LIT_STR_RAW_INNER2
+  ;
+
+fragment LIT_STR_RAW_INNER2
+  : POUND LIT_STR_RAW_INNER POUND
+  ;
+
+LIT_STR_RAW
+  : 'r' LIT_STR_RAW_INNER
+  ;
+
+IDENT : XID_start XID_continue* ;
+
+LIFETIME : '\'' IDENT ;
+
+WHITESPACE : [ \r\n\t]+ ;
+
+UNDOC_COMMENT     : '////' ~[\r\n]* -> type(COMMENT) ;
+YESDOC_COMMENT    : '///' ~[\r\n]* -> type(DOC_COMMENT) ;
+OUTER_DOC_COMMENT : '//!' ~[\r\n]* -> type(DOC_COMMENT) ;
+LINE_COMMENT      : '//' ~[\r\n]* -> type(COMMENT) ;
+
+DOC_BLOCK_COMMENT
+  : ('/**' ~[*] | '/*!') (DOC_BLOCK_COMMENT | .)*? '*/' -> type(DOC_COMMENT)
+  ;
+
+BLOCK_COMMENT : '/*' (BLOCK_COMMENT | .)*? '*/' -> type(COMMENT) ;
diff --git a/src/grammar/check.sh b/src/grammar/check.sh
@@ -0,0 +1,34 @@
+#!/bin/sh
+
+# Run the reference lexer against libsyntax and compare the tokens and spans.
+# If "// ignore-lexer-test" is present in the file, it will be ignored.
+
+
+# Argument $1 is the file to check, $2 is the classpath to use, $3 is the path
+# to the grun binary, $4 is the path to the verify binary, $5 is the path to
+# RustLexer.tokens
+if [ "${VERBOSE}" == "1" ]; then
+    set -x
+fi
+
+check() {
+    grep --silent "// ignore-lexer-test" $1;
+
+    # if it's *not* found...
+    if [ $? -eq 1 ]; then
+        cd $2 # This `cd` is so java will pick up RustLexer.class. I couldn't
+        # figure out how to wrangle the CLASSPATH, just adding build/grammr didn't
+        # seem to have anny effect.
+        if $3 RustLexer tokens -tokens < $1 | $4 $1 $5; then
+            echo "pass: $1"
+        else
+            echo "fail: $1"
+        fi
+    else
+        echo "skip: $1"
+    fi
+}
+
+for file in $(find $1 -iname '*.rs' ! -path '*/test/compile-fail*'); do
+    check $file $2 $3 $4 $5
+done
diff --git a/src/grammar/raw-string-literal-ambiguity.md b/src/grammar/raw-string-literal-ambiguity.md
@@ -0,0 +1,29 @@
+Rust's lexical grammar is not context-free. Raw string literals are the source
+of the problem. Informally, a raw string literal is an `r`, followed by `N`
+hashes (where N can be zero), a quote, any characters, then a quote followed
+by `N` hashes. This grammar describes this as best possible:
+
+    R -> 'r' S
+    S -> '"' B '"'
+    S -> '#' S '#'
+    B -> . B
+    B -> ε
+
+Where `.` represents any character, and `ε` the empty string. Consider the
+string `r#""#"#`. This string is not a valid raw string literal, but can be
+accepted as one by the above grammar, using the derivation:
+
+    R : #""#"#
+    S : ""#"
+    S : "#
+    B : #
+    B : ε
+
+(Where `T : U` means the rule `T` is applied, and `U` is the remainder of the
+string.) The difficulty arises from the fact that it is fundamentally
+context-sensitive. In particular, the context needed is the number of hashes.
+I know of no way to resolve this, but also have not come up with a proof that
+it is not context sensitive. Such a proof would probably use the pumping lemma
+for context-free languages, but I (cmr) could not come up with a proof after
+spending a few hours on it, and decided my time best spent elsewhere. Pull
+request welcome!