Skip to content

Commit 13747a1

Browse files
committed
Parse Haskell source paths more reliably
Haskell source paths, as GHC understands them, are remarkably permissive: they must end with one of the source extensions (now more accurately listed here, with references to the upstream GHC code), but can otherwise contain quirks up to and including multiple extensions, whitespace, and newlines. GHCi is actually even more lenient than this in what it accepts; it'll automatically append `.hs` and `.lhs` to paths you give it and check if those exist, but fortunately they get printed out in `:show targets` and diagnostics as the resolved source paths: ```text ghci> :add src/MyLib [1 of 1] Compiling MyLib ( src/MyLib.hs, interpreted ) ghci> :show targets src/MyLib.hs ghci> :add src/Foo target ‘src/Foo’ is not a module name or a source file ghci> :add src/MyLib.lhs File src/MyLib.lhs not found ghci> :add "src/ Foo.hs" File src/ Foo.hs not found ghci> :add "src\n/Foo.hs" File src /Foo.hs not found ```
1 parent da5279f commit 13747a1

File tree

4 files changed

+172
-12
lines changed

4 files changed

+172
-12
lines changed

src/ghci/parse/ghc_message/module_import_cycle_diagnostic.rs

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@ use winnow::ascii::space1;
44
use winnow::combinator::alt;
55
use winnow::combinator::opt;
66
use winnow::combinator::repeat;
7-
use winnow::token::take_until;
87
use winnow::PResult;
98
use winnow::Parser;
109

1110
use crate::ghci::parse::haskell_grammar::module_name;
11+
use crate::ghci::parse::haskell_source_file;
1212
use crate::ghci::parse::lines::line_ending_or_eof;
1313
use crate::ghci::parse::lines::rest_of_line;
1414
use crate::ghci::parse::Severity;
@@ -44,11 +44,10 @@ pub fn module_import_cycle_diagnostic(input: &mut &str) -> PResult<Vec<GhcMessag
4444
let _ = single_quote.parse_next(input)?;
4545
let _ = space1.parse_next(input)?;
4646
let _ = "(".parse_next(input)?;
47-
let path = take_until(1.., ")").parse_next(input)?;
48-
let _ = ")".parse_next(input)?;
47+
let (path, _) = haskell_source_file(')').parse_next(input)?;
4948
let _ = rest_of_line.parse_next(input)?;
5049

51-
Ok(Utf8PathBuf::from(path))
50+
Ok(path)
5251
}
5352

5453
fn inner(input: &mut &str) -> PResult<Vec<Utf8PathBuf>> {
Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
use camino::Utf8PathBuf;
2+
use winnow::combinator::alt;
3+
use winnow::combinator::repeat_till;
4+
use winnow::error::ParserError;
5+
use winnow::stream::Accumulate;
6+
use winnow::stream::AsChar;
7+
use winnow::stream::Compare;
8+
use winnow::stream::Stream;
9+
use winnow::stream::StreamIsPartial;
10+
use winnow::token::take_till;
11+
use winnow::Parser;
12+
13+
use crate::haskell_source_file::HASKELL_SOURCE_EXTENSIONS;
14+
15+
/// Parse a Haskell source file name and an ending delimiter.
16+
///
17+
/// The returned path will end with a dot and one of the [`HASKELL_SOURCE_EXTENSIONS`], but may
18+
/// otherwise contain quirks up to and including multiple extensions, whitespace, and newlines.
19+
///
20+
/// GHCi is actually even more lenient than this in what it accepts; it'll automatically append
21+
/// `.hs` and `.lhs` to paths you give it and check if those exist, but fortunately they get
22+
/// printed out in `:show targets` and diagnostics as the resolved source paths:
23+
///
24+
/// ```text
25+
/// ghci> :add src/MyLib
26+
/// [1 of 1] Compiling MyLib ( src/MyLib.hs, interpreted )
27+
///
28+
/// ghci> :show targets
29+
/// src/MyLib.hs
30+
///
31+
/// ghci> :add src/Foo
32+
/// target ‘src/Foo’ is not a module name or a source file
33+
///
34+
/// ghci> :add src/MyLib.lhs
35+
/// File src/MyLib.lhs not found
36+
///
37+
/// ghci> :add "src/ Foo.hs"
38+
/// File src/ Foo.hs not found
39+
///
40+
/// ghci> :add "src\n/Foo.hs"
41+
/// File src
42+
/// /Foo.hs not found
43+
/// ```
44+
pub fn haskell_source_file<I, O, E>(
45+
end: impl Parser<I, O, E>,
46+
) -> impl Parser<I, (Utf8PathBuf, O), E>
47+
where
48+
I: Stream + StreamIsPartial + for<'a> Compare<&'a str>,
49+
E: ParserError<I>,
50+
<I as Stream>::Token: AsChar,
51+
char: Parser<I, <I as Stream>::Token, E>,
52+
String: Accumulate<<I as Stream>::Slice>,
53+
{
54+
repeat_till(1.., path_chunk(), end)
55+
.map(|(path, end): (String, O)| (Utf8PathBuf::from(path), end))
56+
}
57+
58+
fn path_chunk<I, E>() -> impl Parser<I, <I as Stream>::Slice, E>
59+
where
60+
I: Stream + StreamIsPartial + for<'a> Compare<&'a str>,
61+
E: ParserError<I>,
62+
<I as Stream>::Token: AsChar,
63+
char: Parser<I, <I as Stream>::Token, E>,
64+
{
65+
repeat_till::<_, _, (), _, _, _, _>(
66+
1..,
67+
(take_till(0.., '.'), '.'),
68+
alt(HASKELL_SOURCE_EXTENSIONS),
69+
)
70+
.recognize()
71+
}
72+
73+
#[cfg(test)]
74+
mod tests {
75+
use pretty_assertions::assert_eq;
76+
use winnow::error::ContextError;
77+
use winnow::error::ParseError;
78+
79+
use super::*;
80+
81+
fn parse_haskell_source_file<'a, O>(
82+
input: &'a str,
83+
end: impl Parser<&'a str, O, ContextError>,
84+
) -> Result<(Utf8PathBuf, O), ParseError<&'a str, ContextError>> {
85+
haskell_source_file::<&str, _, ContextError>(end).parse(input)
86+
}
87+
88+
#[test]
89+
fn test_parse_haskell_source_file() {
90+
// No end delimiter.
91+
assert!(parse_haskell_source_file("src/Puppy.hs", ' ').is_err());
92+
93+
// No source file.
94+
assert!(parse_haskell_source_file(" ", ' ').is_err());
95+
96+
// Simple source file.
97+
assert_eq!(
98+
parse_haskell_source_file("src/Puppy.hs ", ' ').unwrap(),
99+
(Utf8PathBuf::from("src/Puppy.hs"), ' ')
100+
);
101+
102+
// Weirder path, non-standard extension.
103+
assert_eq!(
104+
parse_haskell_source_file("src/../Puppy/Doggy.lhs ", ' ').unwrap(),
105+
(Utf8PathBuf::from("src/../Puppy/Doggy.lhs"), ' ')
106+
);
107+
108+
// Multiple extensions!
109+
assert_eq!(
110+
parse_haskell_source_file("src/Puppy.hs.lhs ", ' ').unwrap(),
111+
(Utf8PathBuf::from("src/Puppy.hs.lhs"), ' ')
112+
);
113+
114+
// More filename after extension.
115+
assert_eq!(
116+
parse_haskell_source_file("src/Puppy.hs.Doggy.lhs ", ' ').unwrap(),
117+
(Utf8PathBuf::from("src/Puppy.hs.Doggy.lhs"), ' ')
118+
);
119+
120+
// More filename after extension, no dot after extension.
121+
assert_eq!(
122+
parse_haskell_source_file("src/Puppy.hsDoggy.lhs ", ' ').unwrap(),
123+
(Utf8PathBuf::from("src/Puppy.hsDoggy.lhs"), ' ')
124+
);
125+
126+
// Space in middle.
127+
assert_eq!(
128+
parse_haskell_source_file("src/Pu ppy.hs ", ' ').unwrap(),
129+
(Utf8PathBuf::from("src/Pu ppy.hs"), ' ')
130+
);
131+
132+
// Space and extension in middle.
133+
assert_eq!(
134+
parse_haskell_source_file("src/Puppy.hsD oggy.hs ", ' ').unwrap(),
135+
(Utf8PathBuf::from("src/Puppy.hsD oggy.hs"), ' ')
136+
);
137+
138+
// Do you know that GHCi will happily read paths that contain newlines??
139+
assert_eq!(
140+
parse_haskell_source_file("src/\nPuppy.hs ", ' ').unwrap(),
141+
(Utf8PathBuf::from("src/\nPuppy.hs"), ' ')
142+
);
143+
144+
// If you do this and it breaks it's your own fault:
145+
assert_eq!(
146+
parse_haskell_source_file("src/Puppy.hs.hs", ".hs").unwrap(),
147+
(Utf8PathBuf::from("src/Puppy.hs"), ".hs")
148+
);
149+
150+
// This is dubious for the same reason:
151+
assert_eq!(
152+
parse_haskell_source_file("src/Puppy.hs.", '.').unwrap(),
153+
(Utf8PathBuf::from("src/Puppy.hs"), '.')
154+
);
155+
}
156+
}

src/ghci/parse/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
mod eval;
44
mod ghc_message;
55
mod haskell_grammar;
6+
mod haskell_source_file;
67
mod lines;
78
mod module_and_files;
89
mod module_set;
@@ -11,6 +12,7 @@ mod show_targets;
1112
mod target_kind;
1213

1314
use haskell_grammar::module_name;
15+
use haskell_source_file::haskell_source_file;
1416
use lines::rest_of_line;
1517
use module_and_files::module_and_files;
1618

src/haskell_source_file.rs

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,20 @@
33
use camino::Utf8Path;
44

55
/// File extensions for Haskell source code.
6-
pub const HASKELL_SOURCE_EXTENSIONS: [&str; 9] = [
6+
///
7+
/// See: <https://downloads.haskell.org/ghc/latest/docs/users_guide/using.html#meaningful-file-suffixes>
8+
///
9+
/// See: <https://gitlab.haskell.org/ghc/ghc/-/blob/077cb2e11fa81076e8c9c5f8dd3bdfa99c8aaf8d/compiler/GHC/Driver/Phases.hs#L236-L252>
10+
pub const HASKELL_SOURCE_EXTENSIONS: [&str; 8] = [
711
// NOTE: This should start with `hs` so that iterators try the most common extension first.
8-
"hs", // Haskell
9-
"lhs", // Literate Haskell
12+
"hs", // Haskell
13+
"lhs", // Literate Haskell
1014
"hs-boot", // See: https://downloads.haskell.org/ghc/latest/docs/users_guide/separate_compilation.html#how-to-compile-mutually-recursive-modules
15+
"lhs-boot", // Literate `hs-boot`.
1116
"hsig", // Backpack module signatures: https://ghc.gitlab.haskell.org/ghc/doc/users_guide/separate_compilation.html#module-signatures
12-
"hsc", // `hsc2hs` C bindings: https://downloads.haskell.org/ghc/latest/docs/users_guide/utils.html?highlight=interfaces#writing-haskell-interfaces-to-c-code-hsc2hs
13-
"x", // `alex` (lexer generator): https://hackage.haskell.org/package/alex
14-
"y", // `happy` (parser generator): https://hackage.haskell.org/package/happy
15-
"c2hs", // `c2hs` C bindings: https://hackage.haskell.org/package/c2hs
16-
"gc", // `greencard` C bindings: https://hackage.haskell.org/package/greencard
17+
"lhsig", // Literate backpack module signatures.
18+
"hspp", // "A file created by the preprocessor".
19+
"hscpp", // Haskell C-preprocessor files.
1720
];
1821

1922
/// Determine if a given path represents a Haskell source file.

0 commit comments

Comments
 (0)