diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs
index 9039f346edb52..add9a4cb9f3bd 100644
--- a/src/libsyntax/parse/lexer/mod.rs
+++ b/src/libsyntax/parse/lexer/mod.rs
@@ -70,10 +70,10 @@ impl<'a> Reader for StringReader<'a> {
         ret_val
     }
     fn fatal(&self, m: &str) -> ! {
-        self.span_diagnostic.span_fatal(self.peek_span, m)
+        self.fatal_span(self.peek_span, m)
     }
     fn err(&self, m: &str) {
-        self.span_diagnostic.span_err(self.peek_span, m)
+        self.err_span(self.peek_span, m)
    }
    fn peek(&self) -> TokenAndSpan {
        // FIXME(pcwalton): Bad copy!
@@ -137,43 +137,52 @@ impl<'a> StringReader<'a> {
         self.curr == Some(c)
     }
 
-    /// Report a lexical error spanning [`from_pos`, `to_pos`)
-    fn fatal_span(&mut self, from_pos: BytePos, to_pos: BytePos, m: &str) -> ! {
-        self.peek_span = codemap::mk_sp(from_pos, to_pos);
-        self.fatal(m);
+    /// Report a fatal lexical error with a given span.
+    pub fn fatal_span(&self, sp: Span, m: &str) -> ! {
+        self.span_diagnostic.span_fatal(sp, m)
     }
 
-    fn err_span(&mut self, from_pos: BytePos, to_pos: BytePos, m: &str) {
-        self.peek_span = codemap::mk_sp(from_pos, to_pos);
-        self.err(m);
+    /// Report a lexical error with a given span.
+    pub fn err_span(&self, sp: Span, m: &str) {
+        self.span_diagnostic.span_err(sp, m)
+    }
+
+    /// Report a fatal error spanning [`from_pos`, `to_pos`).
+    fn fatal_span_(&self, from_pos: BytePos, to_pos: BytePos, m: &str) -> ! {
+        self.fatal_span(codemap::mk_sp(from_pos, to_pos), m)
+    }
+
+    /// Report a lexical error spanning [`from_pos`, `to_pos`).
+    fn err_span_(&self, from_pos: BytePos, to_pos: BytePos, m: &str) {
+        self.err_span(codemap::mk_sp(from_pos, to_pos), m)
     }
 
     /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an
     /// escaped character to the error message
-    fn fatal_span_char(&mut self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) -> ! {
+    fn fatal_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) -> ! {
         let mut m = m.to_string();
         m.push_str(": ");
         char::escape_default(c, |c| m.push_char(c));
-        self.fatal_span(from_pos, to_pos, m.as_slice());
+        self.fatal_span_(from_pos, to_pos, m.as_slice());
     }
 
     /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an
     /// escaped character to the error message
-    fn err_span_char(&mut self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) {
+    fn err_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) {
         let mut m = m.to_string();
         m.push_str(": ");
         char::escape_default(c, |c| m.push_char(c));
-        self.err_span(from_pos, to_pos, m.as_slice());
+        self.err_span_(from_pos, to_pos, m.as_slice());
     }
 
     /// Report a lexical error spanning [`from_pos`, `to_pos`), appending the
     /// offending string to the error message
-    fn fatal_span_verbose(&mut self, from_pos: BytePos, to_pos: BytePos, mut m: String) -> ! {
+    fn fatal_span_verbose(&self, from_pos: BytePos, to_pos: BytePos, mut m: String) -> ! {
         m.push_str(": ");
         let from = self.byte_offset(from_pos).to_uint();
         let to = self.byte_offset(to_pos).to_uint();
         m.push_str(self.filemap.src.as_slice().slice(from, to));
-        self.fatal_span(from_pos, to_pos, m.as_slice());
+        self.fatal_span_(from_pos, to_pos, m.as_slice());
     }
 
     /// Advance peek_tok and peek_span to refer to the next token, and
@@ -216,6 +225,47 @@ impl<'a> StringReader<'a> {
             self.byte_offset(end).to_uint()))
     }
 
+    /// Converts CRLF to LF in the given string, raising an error on bare CR.
+    fn translate_crlf<'a>(&self, start: BytePos,
+                          s: &'a str, errmsg: &'a str) -> str::MaybeOwned<'a> {
+        let mut i = 0u;
+        while i < s.len() {
+            let str::CharRange { ch, next } = s.char_range_at(i);
+            if ch == '\r' {
+                if next < s.len() && s.char_at(next) == '\n' {
+                    return translate_crlf_(self, start, s, errmsg, i).into_maybe_owned();
+                }
+                let pos = start + BytePos(i as u32);
+                let end_pos = start + BytePos(next as u32);
+                self.err_span_(pos, end_pos, errmsg);
+            }
+            i = next;
+        }
+        return s.into_maybe_owned();
+
+        fn translate_crlf_(rdr: &StringReader, start: BytePos,
+                           s: &str, errmsg: &str, mut i: uint) -> String {
+            let mut buf = String::with_capacity(s.len());
+            let mut j = 0;
+            while i < s.len() {
+                let str::CharRange { ch, next } = s.char_range_at(i);
+                if ch == '\r' {
+                    if j < i { buf.push_str(s.slice(j, i)); }
+                    j = next;
+                    if next >= s.len() || s.char_at(next) != '\n' {
+                        let pos = start + BytePos(i as u32);
+                        let end_pos = start + BytePos(next as u32);
+                        rdr.err_span_(pos, end_pos, errmsg);
+                    }
+                }
+                i = next;
+            }
+            if j < s.len() { buf.push_str(s.slice_from(j)); }
+            buf
+        }
+    }
+
+
     /// Advance the StringReader by one character. If a newline is
     /// discovered, add it to the FileMap's list of line start offsets.
     pub fn bump(&mut self) {
@@ -296,7 +346,20 @@ impl<'a> StringReader<'a> {
         // line comments starting with "///" or "//!" are doc-comments
         if self.curr_is('/') || self.curr_is('!') {
             let start_bpos = self.pos - BytePos(3);
-            while !self.curr_is('\n') && !self.is_eof() {
+            while !self.is_eof() {
+                match self.curr.unwrap() {
+                    '\n' => break,
+                    '\r' => {
+                        if self.nextch_is('\n') {
+                            // CRLF
+                            break
+                        } else {
+                            self.err_span_(self.last_pos, self.pos,
+                                           "bare CR not allowed in doc-comment");
+                        }
+                    }
+                    _ => ()
+                }
                 self.bump();
             }
             let ret = self.with_str_from(start_bpos, |string| {
@@ -304,7 +367,7 @@ impl<'a> StringReader<'a> {
                 if !is_line_non_doc_comment(string) {
                     Some(TokenAndSpan{
                         tok: token::DOC_COMMENT(str_to_ident(string)),
-                        sp: codemap::mk_sp(start_bpos, self.pos)
+                        sp: codemap::mk_sp(start_bpos, self.last_pos)
                     })
                 } else {
                     None
@@ -358,9 +421,10 @@ impl<'a> StringReader<'a> {
     fn consume_block_comment(&mut self) -> Option<TokenAndSpan> {
         // block comments starting with "/**" or "/*!" are doc-comments
         let is_doc_comment = self.curr_is('*') || self.curr_is('!');
-        let start_bpos = self.pos - BytePos(if is_doc_comment {3} else {2});
+        let start_bpos = self.last_pos - BytePos(2);
 
         let mut level: int = 1;
+        let mut has_cr = false;
         while level > 0 {
             if self.is_eof() {
                 let msg = if is_doc_comment {
@@ -369,27 +433,37 @@ impl<'a> StringReader<'a> {
                     "unterminated block comment"
                 };
                 let last_bpos = self.last_pos;
-                self.fatal_span(start_bpos, last_bpos, msg);
-            } else if self.curr_is('/') && self.nextch_is('*') {
-                level += 1;
-                self.bump();
-                self.bump();
-            } else if self.curr_is('*') && self.nextch_is('/') {
-                level -= 1;
-                self.bump();
-                self.bump();
-            } else {
-                self.bump();
+                self.fatal_span_(start_bpos, last_bpos, msg);
+            }
+            let n = self.curr.unwrap();
+            match n {
+                '/' if self.nextch_is('*') => {
+                    level += 1;
+                    self.bump();
+                }
+                '*' if self.nextch_is('/') => {
+                    level -= 1;
+                    self.bump();
+                }
+                '\r' => {
+                    has_cr = true;
+                }
+                _ => ()
             }
+            self.bump();
         }
 
         let res = if is_doc_comment {
             self.with_str_from(start_bpos, |string| {
                 // but comments with only "*"s between two "/"s are not
                 if !is_block_non_doc_comment(string) {
+                    let string = if has_cr {
+                        self.translate_crlf(start_bpos, string,
+                                            "bare CR not allowed in block doc-comment")
+                    } else { string.into_maybe_owned() };
                     Some(TokenAndSpan{
-                        tok: token::DOC_COMMENT(str_to_ident(string)),
-                        sp: codemap::mk_sp(start_bpos, self.pos)
+                        tok: token::DOC_COMMENT(str_to_ident(string.as_slice())),
+                        sp: codemap::mk_sp(start_bpos, self.last_pos)
                     })
                 } else {
                     None
@@ -421,7 +495,7 @@ impl<'a> StringReader<'a> {
             return Some(rslt);
         } else {
             let last_bpos = self.last_pos;
-            self.err_span(start_bpos, last_bpos, "scan_exponent: bad fp literal");
+            self.err_span_(start_bpos, last_bpos, "scan_exponent: bad fp literal");
             rslt.push_str("1"); // arbitrary placeholder exponent
             return Some(rslt);
         }
@@ -447,9 +521,10 @@ impl<'a> StringReader<'a> {
 
     fn check_float_base(&mut self, start_bpos: BytePos, last_bpos: BytePos, base: uint) {
         match base {
-            16u => self.err_span(start_bpos, last_bpos, "hexadecimal float literal is not supported"),
-            8u => self.err_span(start_bpos, last_bpos, "octal float literal is not supported"),
-            2u => self.err_span(start_bpos, last_bpos, "binary float literal is not supported"),
+            16u => self.err_span_(start_bpos, last_bpos,
+                                  "hexadecimal float literal is not supported"),
+            8u => self.err_span_(start_bpos, last_bpos, "octal float literal is not supported"),
+            2u => self.err_span_(start_bpos, last_bpos, "binary float literal is not supported"),
             _ => ()
         }
     }
@@ -509,7 +584,7 @@ impl<'a> StringReader<'a> {
         }
         if num_str.len() == 0u {
             let last_bpos = self.last_pos;
-            self.err_span(start_bpos, last_bpos, "no valid digits found for number");
+            self.err_span_(start_bpos, last_bpos, "no valid digits found for number");
             num_str = "1".to_string();
         }
         let parsed = match from_str_radix::<u64>(num_str.as_slice(),
@@ -517,7 +592,7 @@
             Some(p) => p,
             None => {
                 let last_bpos = self.last_pos;
-                self.err_span(start_bpos, last_bpos, "int literal is too large");
+                self.err_span_(start_bpos, last_bpos, "int literal is too large");
                 1
             }
         };
@@ -573,7 +648,7 @@ impl<'a> StringReader<'a> {
                 return token::LIT_FLOAT(str_to_ident(num_str.as_slice()), ast::TyF128);
             }
             let last_bpos = self.last_pos;
-            self.err_span(start_bpos, last_bpos, "expected `f32`, `f64` or `f128` suffix");
+            self.err_span_(start_bpos, last_bpos, "expected `f32`, `f64` or `f128` suffix");
         }
         if is_float {
             let last_bpos = self.last_pos;
@@ -583,7 +658,7 @@ impl<'a> StringReader<'a> {
         } else {
             if num_str.len() == 0u {
                 let last_bpos = self.last_pos;
-                self.err_span(start_bpos, last_bpos, "no valid digits found for number");
+                self.err_span_(start_bpos, last_bpos, "no valid digits found for number");
                 num_str = "1".to_string();
             }
             let parsed = match from_str_radix::<u64>(num_str.as_slice(),
@@ -591,7 +666,7 @@
                 Some(p) => p,
                 None => {
                     let last_bpos = self.last_pos;
-                    self.err_span(start_bpos, last_bpos, "int literal is too large");
+                    self.err_span_(start_bpos, last_bpos, "int literal is too large");
                     1
                 }
             };
@@ -609,11 +684,11 @@ impl<'a> StringReader<'a> {
         for _ in range(0, n_hex_digits) {
             if self.is_eof() {
                 let last_bpos = self.last_pos;
-                self.fatal_span(start_bpos, last_bpos, "unterminated numeric character escape");
+                self.fatal_span_(start_bpos, last_bpos, "unterminated numeric character escape");
             }
             if self.curr_is(delim) {
                 let last_bpos = self.last_pos;
-                self.err_span(start_bpos, last_bpos, "numeric character escape is too short");
+                self.err_span_(start_bpos, last_bpos, "numeric character escape is too short");
                 break;
             }
             let c = self.curr.unwrap_or('\x00');
@@ -630,7 +705,7 @@ impl<'a> StringReader<'a> {
             Some(x) => x,
             None => {
                 let last_bpos = self.last_pos;
-                self.err_span(start_bpos, last_bpos, "illegal numeric character escape");
+                self.err_span_(start_bpos, last_bpos, "illegal numeric character escape");
                 '?'
             }
         }
@@ -665,6 +740,10 @@ impl<'a> StringReader<'a> {
                     self.consume_whitespace();
                     return None
                 },
+                '\r' if delim == '"' && self.curr_is('\n') => {
+                    self.consume_whitespace();
+                    return None
+                }
                 c => {
                     let last_pos = self.last_pos;
                     self.err_span_char(
@@ -686,6 +765,15 @@ impl<'a> StringReader<'a> {
                       else { "character constant must be escaped" },
                       first_source_char);
             }
+            '\r' => {
+                if self.curr_is('\n') {
+                    self.bump();
+                    return Some('\n');
+                } else {
+                    self.err_span_(start, self.last_pos,
+                                   "bare CR not allowed in string, use \\r instead");
+                }
+            }
             _ => if ascii_only && first_source_char > '\x7F' {
                 let last_pos = self.last_pos;
                 self.err_span_char(
@@ -856,16 +944,16 @@ impl<'a> StringReader<'a> {
                 let last_bpos = self.last_pos;
                 if token::is_keyword(token::keywords::Self, keyword_checking_token) {
-                    self.err_span(start,
-                                  last_bpos,
-                                  "invalid lifetime name: 'self \
-                                   is no longer a special lifetime");
+                    self.err_span_(start,
+                                   last_bpos,
+                                   "invalid lifetime name: 'self \
+                                    is no longer a special lifetime");
                 } else if token::is_any_keyword(keyword_checking_token) &&
                     !token::is_keyword(token::keywords::Static, keyword_checking_token) {
-                    self.err_span(start,
-                                  last_bpos,
-                                  "invalid lifetime name");
+                    self.err_span_(start,
+                                   last_bpos,
+                                   "invalid lifetime name");
                 }
                 return token::LIFETIME(ident);
             }
@@ -922,8 +1010,8 @@ impl<'a> StringReader<'a> {
                 while !self_.curr_is('"') {
                     if self_.is_eof() {
                         let last_pos = self_.last_pos;
-                        self_.fatal_span(start, last_pos,
-                                         "unterminated double quote byte string");
+                        self_.fatal_span_(start, last_pos,
+                                          "unterminated double quote byte string");
                     }
 
                     let ch_start = self_.last_pos;
@@ -947,7 +1035,7 @@ impl<'a> StringReader<'a> {
                 if self_.is_eof() {
                     let last_pos = self_.last_pos;
-                    self_.fatal_span(start_bpos, last_pos, "unterminated raw string");
+                    self_.fatal_span_(start_bpos, last_pos, "unterminated raw string");
                 } else if !self_.curr_is('"') {
                     let last_pos = self_.last_pos;
                     let ch = self_.curr.unwrap();
@@ -963,7 +1051,7 @@ impl<'a> StringReader<'a> {
                     match self_.curr {
                         None => {
                             let last_pos = self_.last_pos;
-                            self_.fatal_span(start_bpos, last_pos, "unterminated raw string")
+                            self_.fatal_span_(start_bpos, last_pos, "unterminated raw string")
                         },
                         Some('"') => {
                             content_end_bpos = self_.last_pos;
@@ -997,7 +1085,7 @@ impl<'a> StringReader<'a> {
                 while !self.curr_is('"') {
                     if self.is_eof() {
                         let last_bpos = self.last_pos;
-                        self.fatal_span(start_bpos, last_bpos, "unterminated double quote string");
+                        self.fatal_span_(start_bpos, last_bpos, "unterminated double quote string");
                     }
 
                     let ch_start = self.last_pos;
@@ -1020,7 +1108,7 @@ impl<'a> StringReader<'a> {
                 if self.is_eof() {
                     let last_bpos = self.last_pos;
-                    self.fatal_span(start_bpos, last_bpos, "unterminated raw string");
+                    self.fatal_span_(start_bpos, last_bpos, "unterminated raw string");
                 } else if !self.curr_is('"') {
                     let last_bpos = self.last_pos;
                     let curr_char = self.curr.unwrap();
@@ -1032,28 +1120,45 @@ impl<'a> StringReader<'a> {
                 self.bump();
                 let content_start_bpos = self.last_pos;
                 let mut content_end_bpos;
+                let mut has_cr = false;
                 'outer: loop {
                     if self.is_eof() {
                         let last_bpos = self.last_pos;
-                        self.fatal_span(start_bpos, last_bpos, "unterminated raw string");
+                        self.fatal_span_(start_bpos, last_bpos, "unterminated raw string");
                     }
-                    if self.curr_is('"') {
-                        content_end_bpos = self.last_pos;
-                        for _ in range(0, hash_count) {
-                            self.bump();
-                            if !self.curr_is('#') {
-                                continue 'outer;
+                    //if self.curr_is('"') {
+                    //content_end_bpos = self.last_pos;
+                    //for _ in range(0, hash_count) {
+                    //self.bump();
+                    //if !self.curr_is('#') {
+                    //continue 'outer;
+                    let c = self.curr.unwrap();
+                    match c {
+                        '"' => {
+                            content_end_bpos = self.last_pos;
+                            for _ in range(0, hash_count) {
+                                self.bump();
+                                if !self.curr_is('#') {
+                                    continue 'outer;
+                                }
                             }
+                            break;
+                        }
+                        '\r' => {
+                            has_cr = true;
                         }
-                        break;
+                        _ => ()
                     }
                     self.bump();
                 }
                 self.bump();
-                let str_content = self.with_str_from_to(
-                    content_start_bpos,
-                    content_end_bpos,
-                    str_to_ident);
+                let str_content = self.with_str_from_to(content_start_bpos, content_end_bpos, |string| {
+                    let string = if has_cr {
+                        self.translate_crlf(content_start_bpos, string,
+                                            "bare CR not allowed in raw string")
+                    } else { string.into_maybe_owned() };
+                    str_to_ident(string.as_slice())
+                });
                 return token::LIT_STR_RAW(str_content, hash_count);
             }
             '-' => {
diff --git a/src/libsyntax/parse/mod.rs b/src/libsyntax/parse/mod.rs
index eb0c6f2555aee..331a49c83beac 100644
--- a/src/libsyntax/parse/mod.rs
+++ b/src/libsyntax/parse/mod.rs
@@ -288,6 +288,8 @@ mod test {
     use owned_slice::OwnedSlice;
     use ast;
     use abi;
+    use attr;
+    use attr::AttrMetaMethods;
     use parse::parser::Parser;
     use parse::token::{str_to_ident};
     use util::parser_testing::{string_to_tts, string_to_parser};
@@ -726,4 +728,24 @@ mod test {
         }".to_string());
     }
 
+    #[test] fn crlf_doc_comments() {
+        let sess = new_parse_sess();
+
+        let name = "<source>".to_string();
+        let source = "/// doc comment\r\nfn foo() {}".to_string();
+        let item = parse_item_from_source_str(name.clone(), source, Vec::new(), &sess).unwrap();
+        let doc = attr::first_attr_value_str_by_name(item.attrs.as_slice(), "doc").unwrap();
+        assert_eq!(doc.get(), "/// doc comment");
+
+        let source = "/// doc comment\r\n/// line 2\r\nfn foo() {}".to_string();
+        let item = parse_item_from_source_str(name.clone(), source, Vec::new(), &sess).unwrap();
+        let docs = item.attrs.iter().filter(|a| a.name().get() == "doc")
+                       .map(|a| a.value_str().unwrap().get().to_string()).collect::<Vec<String>>();
+        assert_eq!(docs.as_slice(), &["/// doc comment".to_string(), "/// line 2".to_string()]);
+
+        let source = "/** doc comment\r\n * with CRLF */\r\nfn foo() {}".to_string();
+        let item = parse_item_from_source_str(name, source, Vec::new(), &sess).unwrap();
+        let doc = attr::first_attr_value_str_by_name(item.attrs.as_slice(), "doc").unwrap();
+        assert_eq!(doc.get(), "/** doc comment\n * with CRLF */");
+    }
 }
diff --git a/src/test/compile-fail/lex-bare-cr-string-literal-doc-comment.rs b/src/test/compile-fail/lex-bare-cr-string-literal-doc-comment.rs
new file mode 100644
index 0000000000000..c1e5121d6dd4e
--- /dev/null
+++ b/src/test/compile-fail/lex-bare-cr-string-literal-doc-comment.rs
@@ -0,0 +1,30 @@
+// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// ignore-tidy-cr
+
+/// doc comment with bare CR: ' '
+pub fn foo() {}
+//~^^ ERROR: bare CR not allowed in doc-comment
+
+/** block doc comment with bare CR: ' ' */
+pub fn bar() {}
+//~^^ ERROR: bare CR not allowed in block doc-comment
+
+fn main() {
+    // the following string literal has a bare CR in it
+    let _s = "foo bar"; //~ ERROR: bare CR not allowed in string
+
+    // the following string literal has a bare CR in it
+    let _s = r"bar foo"; //~ ERROR: bare CR not allowed in raw string
+
+    // the following string literal has a bare CR in it
+    let _s = "foo\ bar"; //~ ERROR: unknown character escape: \r
+}
diff --git a/src/test/run-pass/.gitattributes b/src/test/run-pass/.gitattributes
new file mode 100644
index 0000000000000..c6a6f23074de0
--- /dev/null
+++ b/src/test/run-pass/.gitattributes
@@ -0,0 +1 @@
+lexer-crlf-line-endings-string-literal-doc-comment.rs -text
diff --git a/src/test/run-pass/lexer-crlf-line-endings-string-literal-doc-comment.rs b/src/test/run-pass/lexer-crlf-line-endings-string-literal-doc-comment.rs
new file mode 100644
index 0000000000000..5c8db524cc2ed
--- /dev/null
+++ b/src/test/run-pass/lexer-crlf-line-endings-string-literal-doc-comment.rs
@@ -0,0 +1,44 @@
+// ignore-tidy-cr ignore-license
+// ignore-tidy-cr (repeated again because of tidy bug)
+// license is ignored because tidy can't handle the CRLF here properly.
+
+// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// NB: this file needs CRLF line endings. The .gitattributes file in
+// this directory should enforce it.
+
+// ignore-pretty
+
+/// Doc comment that ends in CRLF
+pub fn foo() {}
+
+/** Block doc comment that
+ * contains CRLF characters
+ */
+pub fn bar() {}
+
+fn main() {
+    let s = "string
+literal";
+    assert_eq!(s, "string\nliteral");
+
+    let s = "literal with \
+             escaped newline";
+    assert_eq!(s, "literal with escaped newline");
+
+    let s = r"string
+literal";
+    assert_eq!(s, "string\nliteral");
+
+    // validate that our source file has CRLF endings
+    let source = include_str!("lexer-crlf-line-endings-string-literal-doc-comment.rs");
+    assert!(source.contains("string\r\nliteral"));
+}