Skip to content

Commit 5398873

Browse files
committed
Parse single-quoted GHC output more reliably
GHC output contains quoted fragments:

    Module graph contains a cycle:
            module ‘C’ (./C.hs)
            imports module ‘A’ (A.hs)
      which imports module ‘B’ (./B.hs)
      which imports module ‘C’ (./C.hs)

When Unicode output is not available, the Unicode quotes are replaced with GNU-style ASCII quotes:

    module `C' (./C.hs)

However, when the quoted text starts or ends with a single quote, the ASCII quotes are omitted. This leads to ambiguous output:

    A   → `A'
    A'  → A'
    `A' → `A'
    'A  → 'A
    'A' → 'A'

Correctly parsing this is challenging. This probably increases the amount of backtracking and lookahead required for these parsers. Not sure if that's significant or relevant.
1 parent c712560 commit 5398873

File tree

4 files changed

+178
-45
lines changed

4 files changed

+178
-45
lines changed

src/ghci/parse/ghc_message/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ pub use position::PositionRange;
1414
mod severity;
1515
pub use severity::Severity;
1616

17-
mod single_quote;
17+
mod single_quoted;
1818

1919
mod path_colon;
2020
use path_colon::path_colon;

src/ghci/parse/ghc_message/module_import_cycle_diagnostic.rs

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ use crate::ghci::parse::haskell_grammar::module_name;
1313
use crate::ghci::parse::lines::rest_of_line;
1414
use crate::ghci::parse::Severity;
1515

16-
use super::single_quote::single_quote;
16+
use super::single_quoted::single_quoted;
1717
use super::GhcDiagnostic;
1818
use super::GhcMessage;
1919

@@ -39,10 +39,7 @@ pub fn module_import_cycle_diagnostic(input: &mut &str) -> PResult<Vec<GhcMessag
3939
let _ = opt("which ").parse_next(input)?;
4040
let _ = opt("imports ").parse_next(input)?;
4141
let _ = "module ".parse_next(input)?;
42-
let _ = single_quote.parse_next(input)?;
43-
let _name = module_name.parse_next(input)?;
44-
let _ = single_quote.parse_next(input)?;
45-
let _ = space1.parse_next(input)?;
42+
let (_name, _) = single_quoted(module_name, space1).parse_next(input)?;
4643
let _ = "(".parse_next(input)?;
4744
let path = take_until(1.., ")").parse_next(input)?;
4845
let _ = ")".parse_next(input)?;

src/ghci/parse/ghc_message/single_quote.rs

Lines changed: 0 additions & 39 deletions
This file was deleted.
Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
use winnow::combinator::alt;
2+
use winnow::combinator::preceded;
3+
use winnow::error::ParserError;
4+
use winnow::stream::AsChar;
5+
use winnow::stream::Stream;
6+
use winnow::token::any;
7+
use winnow::token::take_till;
8+
use winnow::Parser;
9+
10+
use crate::ghci::parse::transform_till;
11+
12+
/// Parse a single-quoted portion of GHC output.
13+
///
14+
/// If Unicode is supported and `GHC_NO_UNICODE` is unset, the output will be surrounded with
15+
/// Unicode single quotes:
16+
///
17+
/// ```text
18+
/// ‘puppy’
19+
/// ```
20+
///
21+
/// Otherwise, the output will be surrounded with "GNU-style" quotes:
22+
///
23+
/// ```text
24+
/// `puppy'
25+
/// ```
26+
///
27+
/// However, if the quoted string starts or ends with an ASCII single quote (`'`) and Unicode
28+
/// output is disabled, the quotes will be omitted entirely:
29+
///
30+
/// ```text
31+
/// puppy -> `puppy'
32+
/// puppy' -> puppy'
33+
/// 'puppy -> 'puppy
34+
/// 'puppy' -> 'puppy'
35+
/// `puppy' -> `puppy'
36+
/// ```
37+
///
38+
/// Note that the quoted output for the first and last examples is the same, so the output is
39+
/// ambiguous in this case.
40+
///
41+
/// See: <https://gitlab.haskell.org/ghc/ghc/-/blob/077cb2e11fa81076e8c9c5f8dd3bdfa99c8aaf8d/compiler/GHC/Utils/Outputable.hs#L744-L756>
42+
///
43+
/// See: <https://gitlab.haskell.org/ghc/ghc/-/blob/077cb2e11fa81076e8c9c5f8dd3bdfa99c8aaf8d/compiler/GHC/Utils/Ppr.hs#L468>
44+
pub fn single_quoted<'i, O1, O2, E>(
45+
mut inner: impl Parser<&'i str, O1, E>,
46+
mut end: impl Parser<&'i str, O2, E>,
47+
) -> impl Parser<&'i str, (O1, O2), E>
48+
where
49+
E: ParserError<&'i str>,
50+
{
51+
move |input: &mut &'i str| {
52+
let start = input.checkpoint();
53+
54+
let initial = any.parse_next(input)?.as_char();
55+
match initial {
56+
'‘' => transform_till(
57+
alt((preceded('’', take_till(0.., '’')), take_till(1.., '’'))),
58+
inner.by_ref(),
59+
preceded('’', end.by_ref()),
60+
)
61+
.parse_next(input),
62+
'`' => {
63+
// If the output starts with a backtick, it must end with a single quote.
64+
// * Either the output is quoted normally (in which case it ends with a single quote), or
65+
// the quotes are skipped.
66+
// * If the quotes are skipped, then the output either starts or ends with a single quote.
67+
// * The output starts with a backtick, so we know it doesn't start with a single quote.
68+
// * Therefore, it must end with a single quote.
69+
transform_till(
70+
alt((preceded('\'', take_till(0.., '\'')), take_till(1.., '\''))),
71+
inner.by_ref(),
72+
preceded('\'', end.by_ref()),
73+
)
74+
.parse_next(input)
75+
}
76+
// If the output starts with anything else, the quoting must be skipped.
77+
_ => {
78+
input.reset(start);
79+
// Potentially this will have to consume the entire input before backtracking. Sad!
80+
transform_till(any, inner.by_ref(), end.by_ref()).parse_next(input)
81+
}
82+
}
83+
}
84+
}
85+
86+
#[cfg(test)]
mod tests {
    use crate::ghci::parse::haskell_grammar::module_name;

    use super::*;

    use pretty_assertions::assert_eq;

    /// Exercises `single_quoted` against Unicode-quoted, GNU-style-quoted and unquoted
    /// inputs, always using a single space as the terminator.
    #[test]
    fn test_parse_single_quoted() {
        // (input, expected name) pairs parsed with a plain `module_name` inner parser.
        let accepted = [
            // Unicode.
            ("‘Puppy’ ", "Puppy"),
            ("‘Puppy'’ ", "Puppy'"),
            ("‘Puppy''’ ", "Puppy''"),
            // ASCII.
            ("`Puppy' ", "Puppy"),
            // Internal quotes.
            ("`Pupp'y' ", "Pupp'y"),
            ("`Pupp''y' ", "Pupp''y"),
            ("`Pupp'''y' ", "Pupp'''y"),
            ("`Pupp''''y' ", "Pupp''''y"),
            // Ends with a single quote, so the surrounding quotes are omitted.
            ("Puppy' ", "Puppy'"),
            ("Puppy'' ", "Puppy''"),
        ];
        for (input, name) in accepted {
            assert_eq!(
                single_quoted(module_name, ' ').parse(input).unwrap(),
                (name, ' ')
            );
        }

        // Starts with a single quote: here the inner parser consumes the leading `'`
        // itself before the module name.
        let leading_quote = [("'Puppy ", "Puppy"), ("'Puppy' ", "Puppy'")];
        for (input, name) in leading_quote {
            assert_eq!(
                single_quoted(preceded('\'', module_name), ' ')
                    .parse(input)
                    .unwrap(),
                (name, ' ')
            );
        }

        // Negative cases.
        let rejected = [
            // No valid ending.
            "‘Puppy’x",
            // Modules can't start with numbers.
            "`0' ",
            "0 ",
            // Delimiters have to match.
            "‘Puppy' ",
            "`Puppy’ ",
        ];
        for input in rejected {
            assert!(single_quoted(module_name, ' ').parse(input).is_err());
        }
    }
}

0 commit comments

Comments
 (0)