Skip to content

Commit cee7514

Browse files
committed
Add raw string interpolation support for cpp2
Raw-string literals that start with $ (dollar sign) will interpolate. That means that the following code: ```cpp rs := $R"(m["one"] + m["two"] = (m["one"] + m["two"])$)"; ``` will generate the following cpp1 code: ```cpp auto rs { R"(m["one"] + m["two"] = )" + cpp2::to_string(cpp2::assert_in_bounds(m, "one") + cpp2::assert_in_bounds(m, "two")) }; ``` It handles raw strings on a single line and across multiple lines. It processes the source line by line and stores the parts of a multiline raw string in a separate buffer (multiline_raw_strings).
1 parent f84679d commit cee7514

File tree

2 files changed

+200
-23
lines changed

2 files changed

+200
-23
lines changed

source/common.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,13 @@ struct raw_string
216216
std::string text;
217217
std::string opening_seq;
218218
std::string closing_seq;
219+
bool should_interpolate = false;
220+
};
221+
222+
struct multiline_raw_string
223+
{
224+
std::string text;
225+
source_position end = {0, 0};
219226
};
220227

221228
//-----------------------------------------------------------------------

source/lex.h

Lines changed: 193 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -436,6 +436,71 @@ auto expand_string_literal(
436436
return parts.generate();
437437
}
438438

439+
auto expand_raw_string_literal(
440+
const std::string& opening_seq,
441+
const std::string& closing_seq,
442+
string_parts::adds_sequences closing_strategy,
443+
std::string_view text,
444+
std::vector<error>& errors,
445+
source_position src_pos) -> string_parts
446+
{
447+
auto const length = std::ssize(text);
448+
auto pos = 0;
449+
auto first_quote_pos = pos;
450+
auto current_start = pos; // the current offset before which the string has been added to ret
451+
string_parts parts{opening_seq, closing_seq, closing_strategy};
452+
453+
// Now we're on the first character of the string itself
454+
for ( ; pos < length; ++pos )
455+
{
456+
// Find the next )$
457+
if (text[pos] == '$' && text[pos-1] == ')')
458+
{
459+
// Scan back to find the matching (
460+
auto paren_depth = 1;
461+
auto open = pos - 2;
462+
463+
for( ; open > current_start; --open)
464+
{
465+
if (text[open] == ')') {
466+
++paren_depth;
467+
}
468+
else if (text[open] == '(') {
469+
--paren_depth;
470+
if (paren_depth == 0) {
471+
break;
472+
}
473+
}
474+
}
475+
if (text[open] != '(')
476+
{
477+
errors.emplace_back(
478+
source_position( src_pos.lineno, src_pos.colno + pos ),
479+
"no matching ( for string interpolation ending in )$"
480+
);
481+
return parts;
482+
}
483+
484+
// 'open' is now at the matching (
485+
486+
// Put the next non-empty non-interpolated chunk straight into ret
487+
if (open != current_start) {
488+
parts.add_string(text.substr(current_start, open - current_start));
489+
}
490+
// Then put interpolated chunk into ret
491+
parts.add_code("cpp2::to_string" + std::string{text.substr(open, pos - open)});
492+
493+
current_start = pos+1;
494+
}
495+
}
496+
497+
// Put the final non-interpolated chunk straight into ret
498+
if (current_start < std::ssize(text)) {
499+
parts.add_string(text.substr(current_start));
500+
}
501+
502+
return parts;
503+
}
439504

440505
//-----------------------------------------------------------------------
441506
// lex: Tokenize a single line while maintaining inter-line state
@@ -455,6 +520,8 @@ auto expand_string_literal(
455520
// -- this isn't about tokens generated later, that's tokens::generated_tokens
456521
static auto generated_text = std::deque<std::string>{};
457522

523+
static auto multiline_raw_strings = std::deque<multiline_raw_string>{};
524+
458525
auto lex_line(
459526
std::string& mutable_line,
460527
int const lineno,
@@ -889,6 +956,49 @@ auto lex_line(
889956
return do_is_keyword(multi_keys);
890957
};
891958

959+
auto reset_processing_of_the_line = [&]() {
960+
// Redo processing of this whole line now that the string is expanded,
961+
// which may have moved it in memory... move i back to the line start
962+
// and discard any tokens we already tokenized for this line
963+
i = colno_t{-1};
964+
while (
965+
!tokens.empty()
966+
&& tokens.back().position().lineno == lineno
967+
)
968+
{
969+
tokens.pop_back();
970+
}
971+
};
972+
973+
auto interpolate_raw_string = [&](
974+
const std::string& opening_seq,
975+
const std::string& closing_seq,
976+
string_parts::adds_sequences closing_strategy,
977+
std::string_view part,
978+
int pos_to_replace,
979+
int size_to_replace
980+
) -> bool {
981+
auto parts = expand_raw_string_literal(opening_seq, closing_seq, closing_strategy, part, errors, source_position(lineno, pos_to_replace + 1));
982+
auto new_part = parts.generate();
983+
mutable_line.replace( pos_to_replace, size_to_replace, new_part );
984+
i += std::ssize(new_part)-1;
985+
986+
if (parts.is_expanded()) {
987+
// raw string was expanded and we need to repeat the processing of this line
988+
reset_processing_of_the_line();
989+
990+
// but skipping end of potential multiline raw string that ends on this line
991+
if (!multiline_raw_strings.empty() && multiline_raw_strings.back().end.lineno == lineno) {
992+
i = multiline_raw_strings.back().end.colno;
993+
raw_string_multiline.reset();
994+
} else if (raw_string_multiline && raw_string_multiline->start.lineno == lineno) {
995+
raw_string_multiline.reset();
996+
}
997+
return true;
998+
}
999+
return false;
1000+
};
1001+
8921002
//
8931003
//-----------------------------------------------------
8941004

@@ -918,6 +1028,7 @@ auto lex_line(
9181028
else if (peek1 == 'R' && peek2 == next) { return 3; } // LR"
9191029
}
9201030
else if (line[i] == 'R' && peek1 == next) { return 2; } // R"
1031+
else if (line[i] == '$' && peek1 == 'R' && peek2 == next) { return 3; } // $R"
9211032
return 0;
9221033
};
9231034

@@ -947,22 +1058,36 @@ auto lex_line(
9471058
auto end_pos = line.find(raw_string_multiline.value().closing_seq, i);
9481059
auto part = line.substr(i, end_pos-i);
9491060

1061+
if (const auto& rsm = raw_string_multiline.value(); rsm.should_interpolate) {
1062+
1063+
auto closing_strategy = end_pos == line.npos ? string_parts::no_ends : string_parts::on_the_end;
1064+
auto size_to_replace = end_pos == line.npos ? std::ssize(line) - i : end_pos - i + std::ssize(rsm.closing_seq);
1065+
1066+
if (interpolate_raw_string(rsm.opening_seq, rsm.closing_seq, closing_strategy, part, i, size_to_replace ) ) {
1067+
continue;
1068+
}
1069+
}
1070+
// raw string was not expanded
1071+
9501072
raw_string_multiline.value().text += part;
9511073
if (end_pos == std::string::npos) {
9521074
raw_string_multiline.value().text += '\n';
9531075
break;
9541076
}
9551077

956-
// here we know that we are dealing with multiline raw string literal
957-
// token needs to use generated_text to store string that exists in multiple lines
958-
i = end_pos+std::ssize(raw_string_multiline.value().closing_seq)-1;
1078+
// here we know that we are dealing with finalized multiline raw string literal
1079+
// token needs to use multiline_raw_strings to store string that exists in multiple lines
9591080
raw_string_multiline.value().text += raw_string_multiline.value().closing_seq;
9601081

961-
generated_text.push_back(raw_string_multiline.value().text);
1082+
// and position where multiline_raw_string ends (needed for resetting line parsing)
1083+
i = end_pos+std::ssize(raw_string_multiline.value().closing_seq)-1;
1084+
1085+
const auto& text = raw_string_multiline.value().should_interpolate ? raw_string_multiline.value().text.substr(1) : raw_string_multiline.value().text;
1086+
multiline_raw_strings.emplace_back(multiline_raw_string{ text, {lineno, i} });
9621087

9631088
tokens.push_back({
964-
&generated_text.back()[0],
965-
std::ssize(generated_text.back()),
1089+
&multiline_raw_strings.back().text[0],
1090+
std::ssize(multiline_raw_strings.back().text),
9661091
raw_string_multiline.value().start,
9671092
lexeme::StringLiteral
9681093
});
@@ -1153,7 +1278,62 @@ auto lex_line(
11531278
store(1, lexeme::QuestionMark);
11541279

11551280
break;case '$':
1156-
store(1, lexeme::Dollar);
1281+
if (auto j = is_encoding_prefix_and('\"'); peek(j-2) == 'R') {
1282+
// if peek(j-2) is 'R' it means that we deal with raw-string literal
1283+
auto R_pos = i + j - 2;
1284+
auto seq_pos = i + j;
1285+
1286+
if (auto paren_pos = line.find("(", seq_pos); paren_pos != std::string::npos) {
1287+
auto opening_seq = line.substr(i, paren_pos - i + 1);
1288+
auto closing_seq = ")"+line.substr(seq_pos, paren_pos-seq_pos)+"\"";
1289+
1290+
if (auto closing_pos = line.find(closing_seq, paren_pos+1); closing_pos != line.npos) {
1291+
if (interpolate_raw_string(
1292+
opening_seq,
1293+
closing_seq,
1294+
string_parts::on_both_ends,
1295+
std::string_view(&line[paren_pos+1], closing_pos-paren_pos-1), i, closing_pos-i+std::ssize(closing_seq))
1296+
) {
1297+
continue;
1298+
}
1299+
1300+
tokens.push_back({
1301+
&line[R_pos],
1302+
i - R_pos + 1,
1303+
source_position(lineno, R_pos + 1),
1304+
lexeme::StringLiteral
1305+
});
1306+
} else {
1307+
raw_string_multiline.emplace(raw_string{source_position{lineno, i}, opening_seq, opening_seq, closing_seq, true });
1308+
1309+
if (interpolate_raw_string(
1310+
opening_seq,
1311+
closing_seq,
1312+
string_parts::on_the_begining,
1313+
std::string_view(&line[paren_pos+1], std::ssize(line)-(paren_pos+1)), i, std::ssize(line)-i)
1314+
) {
1315+
continue;
1316+
}
1317+
// skip entire raw string opening sequence R"
1318+
i = paren_pos;
1319+
1320+
// if we are on the end of the line we need to add new line char
1321+
if (i+1 == std::ssize(line)) {
1322+
raw_string_multiline.value().text += '\n';
1323+
}
1324+
}
1325+
continue;
1326+
}
1327+
else {
1328+
errors.emplace_back(
1329+
source_position(lineno, i + j - 2),
1330+
"invalid new-line in raw string delimiter \"" + std::string(&line[i],j)
1331+
+ "\" - stray 'R' in program \""
1332+
);
1333+
}
1334+
} else {
1335+
store(1, lexeme::Dollar);
1336+
}
11571337

11581338
//G
11591339
//G literal:
@@ -1313,13 +1493,13 @@ auto lex_line(
13131493
auto seq_pos = i + j;
13141494

13151495
if (auto paren_pos = line.find("(", seq_pos); paren_pos != std::string::npos) {
1316-
auto raw_string_opening_seq = line.substr(i, paren_pos - i + 1);
1317-
auto raw_string_closing_seq = ")"+line.substr(seq_pos, paren_pos-seq_pos)+"\"";
1496+
auto opening_seq = line.substr(i, paren_pos - i + 1);
1497+
auto closing_seq = ")"+line.substr(seq_pos, paren_pos-seq_pos)+"\"";
13181498

1319-
if (auto closing_pos = line.find(raw_string_closing_seq, paren_pos+1); closing_pos != line.npos) {
1320-
store(closing_pos+std::ssize(raw_string_closing_seq)-i, lexeme::StringLiteral);
1499+
if (auto closing_pos = line.find(closing_seq, paren_pos+1); closing_pos != line.npos) {
1500+
store(closing_pos+std::ssize(closing_seq)-i, lexeme::StringLiteral);
13211501
} else {
1322-
raw_string_multiline.emplace(raw_string{source_position{lineno, i}, raw_string_opening_seq, raw_string_opening_seq, raw_string_closing_seq });
1502+
raw_string_multiline.emplace(raw_string{source_position{lineno, i}, opening_seq, opening_seq, closing_seq });
13231503
// skip entire raw string opening sequence R"
13241504
i = paren_pos;
13251505

@@ -1368,17 +1548,7 @@ auto lex_line(
13681548
}
13691549
mutable_line.replace( i, j+1, s );
13701550

1371-
// Redo processing of this whole line now that the string is expanded,
1372-
// which may have moved it in memory... move i back to the line start
1373-
// and discard any tokens we already tokenized for this line
1374-
i = colno_t{-1};
1375-
while (
1376-
!tokens.empty()
1377-
&& tokens.back().position().lineno == lineno
1378-
)
1379-
{
1380-
tokens.pop_back();
1381-
}
1551+
reset_processing_of_the_line();
13821552
}
13831553
}
13841554
}

0 commit comments

Comments
 (0)