Skip to content

Commit abd5143

Browse files
authored
[Parser] Do not eagerly lex numbers (#6544)
Lex integers and floats on demand to avoid wasted work. Remove `Token` completely now that all kinds of tokens are lexed on demand.
1 parent df68786 commit abd5143

File tree

2 files changed

+141
-293
lines changed

2 files changed

+141
-293
lines changed

src/parser/lexer.cpp

Lines changed: 132 additions & 189 deletions
Original file line numberDiff line numberDiff line change
@@ -123,10 +123,25 @@ std::optional<int> getHexDigit(char c) {
123123
return {};
124124
}
125125

126+
// How the sign of a lexed integer was written.
enum Sign {
  // No sign character precedes the digits.
  NoSign,
  // Written with a leading "+".
  Pos,
  // Written with a leading "-".
  Neg
};
127+
126128
// The result of lexing an integer token fragment.
127129
struct LexIntResult : LexResult {
128130
uint64_t n;
129131
Sign sign;
132+
133+
template<typename T> bool isUnsigned() {
134+
static_assert(std::is_integral_v<T> && std::is_unsigned_v<T>);
135+
return sign == NoSign && n <= std::numeric_limits<T>::max();
136+
}
137+
138+
template<typename T> bool isSigned() {
139+
static_assert(std::is_integral_v<T> && std::is_signed_v<T>);
140+
if (sign == Neg) {
141+
return uint64_t(std::numeric_limits<T>::min()) <= n || n == 0;
142+
}
143+
return n <= uint64_t(std::numeric_limits<T>::max());
144+
}
130145
};
131146

132147
// Lexing context that accumulates lexed input to produce an integer token
@@ -887,123 +902,6 @@ std::optional<LexResult> keyword(std::string_view in) {
887902

888903
} // anonymous namespace
889904

890-
template<typename T> std::optional<T> Token::getU() const {
891-
static_assert(std::is_integral_v<T> && std::is_unsigned_v<T>);
892-
if (auto* tok = std::get_if<IntTok>(&data)) {
893-
if (tok->sign == NoSign && tok->n <= std::numeric_limits<T>::max()) {
894-
return T(tok->n);
895-
}
896-
// TODO: Add error production for unsigned overflow.
897-
}
898-
return {};
899-
}
900-
901-
template<typename T> std::optional<T> Token::getS() const {
902-
static_assert(std::is_integral_v<T> && std::is_signed_v<T>);
903-
if (auto* tok = std::get_if<IntTok>(&data)) {
904-
if (tok->sign == Neg) {
905-
if (uint64_t(std::numeric_limits<T>::min()) <= tok->n || tok->n == 0) {
906-
return T(tok->n);
907-
}
908-
} else {
909-
if (tok->n <= uint64_t(std::numeric_limits<T>::max())) {
910-
return T(tok->n);
911-
}
912-
}
913-
}
914-
return {};
915-
}
916-
917-
template<typename T> std::optional<T> Token::getI() const {
918-
static_assert(std::is_integral_v<T> && std::is_unsigned_v<T>);
919-
if (auto n = getU<T>()) {
920-
return *n;
921-
}
922-
if (auto n = getS<std::make_signed_t<T>>()) {
923-
return T(*n);
924-
}
925-
return {};
926-
}
927-
928-
template std::optional<uint64_t> Token::getU<uint64_t>() const;
929-
template std::optional<int64_t> Token::getS<int64_t>() const;
930-
template std::optional<uint64_t> Token::getI<uint64_t>() const;
931-
template std::optional<uint32_t> Token::getU<uint32_t>() const;
932-
template std::optional<int32_t> Token::getS<int32_t>() const;
933-
template std::optional<uint32_t> Token::getI<uint32_t>() const;
934-
template std::optional<uint16_t> Token::getU<uint16_t>() const;
935-
template std::optional<int16_t> Token::getS<int16_t>() const;
936-
template std::optional<uint16_t> Token::getI<uint16_t>() const;
937-
template std::optional<uint8_t> Token::getU<uint8_t>() const;
938-
template std::optional<int8_t> Token::getS<int8_t>() const;
939-
template std::optional<uint8_t> Token::getI<uint8_t>() const;
940-
941-
std::optional<double> Token::getF64() const {
942-
constexpr int signif = 52;
943-
constexpr uint64_t payloadMask = (1ull << signif) - 1;
944-
constexpr uint64_t nanDefault = 1ull << (signif - 1);
945-
if (auto* tok = std::get_if<FloatTok>(&data)) {
946-
double d = tok->d;
947-
if (std::isnan(d)) {
948-
// Inject payload.
949-
uint64_t payload = tok->nanPayload ? *tok->nanPayload : nanDefault;
950-
if (payload == 0 || payload > payloadMask) {
951-
// TODO: Add error production for out-of-bounds payload.
952-
return {};
953-
}
954-
uint64_t bits;
955-
static_assert(sizeof(bits) == sizeof(d));
956-
memcpy(&bits, &d, sizeof(bits));
957-
bits = (bits & ~payloadMask) | payload;
958-
memcpy(&d, &bits, sizeof(bits));
959-
}
960-
return d;
961-
}
962-
if (auto* tok = std::get_if<IntTok>(&data)) {
963-
if (tok->sign == Neg) {
964-
if (tok->n == 0) {
965-
return -0.0;
966-
}
967-
return double(int64_t(tok->n));
968-
}
969-
return double(tok->n);
970-
}
971-
return {};
972-
}
973-
974-
std::optional<float> Token::getF32() const {
975-
constexpr int signif = 23;
976-
constexpr uint32_t payloadMask = (1u << signif) - 1;
977-
constexpr uint64_t nanDefault = 1ull << (signif - 1);
978-
if (auto* tok = std::get_if<FloatTok>(&data)) {
979-
float f = tok->d;
980-
if (std::isnan(f)) {
981-
// Validate and inject payload.
982-
uint64_t payload = tok->nanPayload ? *tok->nanPayload : nanDefault;
983-
if (payload == 0 || payload > payloadMask) {
984-
// TODO: Add error production for out-of-bounds payload.
985-
return {};
986-
}
987-
uint32_t bits;
988-
static_assert(sizeof(bits) == sizeof(f));
989-
memcpy(&bits, &f, sizeof(bits));
990-
bits = (bits & ~payloadMask) | payload;
991-
memcpy(&f, &bits, sizeof(bits));
992-
}
993-
return f;
994-
}
995-
if (auto* tok = std::get_if<IntTok>(&data)) {
996-
if (tok->sign == Neg) {
997-
if (tok->n == 0) {
998-
return -0.0f;
999-
}
1000-
return float(int64_t(tok->n));
1001-
}
1002-
return float(tok->n);
1003-
}
1004-
return {};
1005-
}
1006-
1007905
void Lexer::skipSpace() {
1008906
while (true) {
1009907
if (auto ctx = annotation(next())) {
@@ -1020,9 +918,6 @@ void Lexer::skipSpace() {
1020918
}
1021919

1022920
bool Lexer::takeLParen() {
1023-
if (curr) {
1024-
return false;
1025-
}
1026921
if (LexCtx(next()).startsWith("("sv)) {
1027922
++index;
1028923
advance();
@@ -1032,9 +927,6 @@ bool Lexer::takeLParen() {
1032927
}
1033928

1034929
bool Lexer::takeRParen() {
1035-
if (curr) {
1036-
return false;
1037-
}
1038930
if (LexCtx(next()).startsWith(")"sv)) {
1039931
++index;
1040932
advance();
@@ -1044,9 +936,6 @@ bool Lexer::takeRParen() {
1044936
}
1045937

1046938
std::optional<std::string> Lexer::takeString() {
1047-
if (curr) {
1048-
return std::nullopt;
1049-
}
1050939
if (auto result = str(next())) {
1051940
index += result->span.size();
1052941
advance();
@@ -1060,9 +949,6 @@ std::optional<std::string> Lexer::takeString() {
1060949
}
1061950

1062951
std::optional<Name> Lexer::takeID() {
1063-
if (curr) {
1064-
return std::nullopt;
1065-
}
1066952
if (auto result = ident(next())) {
1067953
index += result->span.size();
1068954
advance();
@@ -1080,9 +966,6 @@ std::optional<Name> Lexer::takeID() {
1080966
}
1081967

1082968
std::optional<std::string_view> Lexer::takeKeyword() {
1083-
if (curr) {
1084-
return std::nullopt;
1085-
}
1086969
if (auto result = keyword(next())) {
1087970
index += result->span.size();
1088971
advance();
@@ -1130,20 +1013,124 @@ std::optional<uint32_t> Lexer::takeAlign() {
11301013
return std::nullopt;
11311014
}
11321015

1133-
void Lexer::lexToken() {
1134-
// TODO: Ensure we're getting the longest possible match.
1135-
Token tok;
1136-
if (auto t = integer(next())) {
1137-
tok = Token{t->span, IntTok{t->n, t->sign}};
1138-
} else if (auto t = float_(next())) {
1139-
tok = Token{t->span, FloatTok{t->nanPayload, t->d}};
1140-
} else {
1141-
// TODO: Do something about lexing errors.
1142-
curr = std::nullopt;
1143-
return;
1016+
template<typename T> std::optional<T> Lexer::takeU() {
1017+
static_assert(std::is_integral_v<T> && std::is_unsigned_v<T>);
1018+
if (auto result = integer(next()); result && result->isUnsigned<T>()) {
1019+
index += result->span.size();
1020+
advance();
1021+
return T(result->n);
1022+
}
1023+
// TODO: Add error production for unsigned overflow.
1024+
return std::nullopt;
1025+
}
1026+
1027+
template<typename T> std::optional<T> Lexer::takeS() {
1028+
static_assert(std::is_integral_v<T> && std::is_signed_v<T>);
1029+
if (auto result = integer(next()); result && result->isSigned<T>()) {
1030+
index += result->span.size();
1031+
advance();
1032+
return T(result->n);
1033+
}
1034+
return std::nullopt;
1035+
}
1036+
1037+
template<typename T> std::optional<T> Lexer::takeI() {
1038+
static_assert(std::is_integral_v<T> && std::is_unsigned_v<T>);
1039+
if (auto result = integer(next())) {
1040+
if (result->isUnsigned<T>() || result->isSigned<std::make_signed_t<T>>()) {
1041+
index += result->span.size();
1042+
advance();
1043+
return T(result->n);
1044+
}
11441045
}
1145-
index += tok.span.size();
1146-
curr = {tok};
1046+
return std::nullopt;
1047+
}
1048+
1049+
template std::optional<uint64_t> Lexer::takeU<uint64_t>();
1050+
template std::optional<int64_t> Lexer::takeS<int64_t>();
1051+
template std::optional<uint64_t> Lexer::takeI<uint64_t>();
1052+
template std::optional<uint32_t> Lexer::takeU<uint32_t>();
1053+
template std::optional<int32_t> Lexer::takeS<int32_t>();
1054+
template std::optional<uint32_t> Lexer::takeI<uint32_t>();
1055+
template std::optional<uint16_t> Lexer::takeU<uint16_t>();
1056+
template std::optional<int16_t> Lexer::takeS<int16_t>();
1057+
template std::optional<uint16_t> Lexer::takeI<uint16_t>();
1058+
template std::optional<uint8_t> Lexer::takeU<uint8_t>();
1059+
template std::optional<int8_t> Lexer::takeS<int8_t>();
1060+
template std::optional<uint8_t> Lexer::takeI<uint8_t>();
1061+
1062+
std::optional<double> Lexer::takeF64() {
1063+
constexpr int signif = 52;
1064+
constexpr uint64_t payloadMask = (1ull << signif) - 1;
1065+
constexpr uint64_t nanDefault = 1ull << (signif - 1);
1066+
if (auto result = float_(next())) {
1067+
double d = result->d;
1068+
if (std::isnan(d)) {
1069+
// Inject payload.
1070+
uint64_t payload = result->nanPayload ? *result->nanPayload : nanDefault;
1071+
if (payload == 0 || payload > payloadMask) {
1072+
// TODO: Add error production for out-of-bounds payload.
1073+
return std::nullopt;
1074+
}
1075+
uint64_t bits;
1076+
static_assert(sizeof(bits) == sizeof(d));
1077+
memcpy(&bits, &d, sizeof(bits));
1078+
bits = (bits & ~payloadMask) | payload;
1079+
memcpy(&d, &bits, sizeof(bits));
1080+
}
1081+
index += result->span.size();
1082+
advance();
1083+
return d;
1084+
}
1085+
if (auto result = integer(next())) {
1086+
index += result->span.size();
1087+
advance();
1088+
if (result->sign == Neg) {
1089+
if (result->n == 0) {
1090+
return -0.0;
1091+
}
1092+
return double(int64_t(result->n));
1093+
}
1094+
return double(result->n);
1095+
}
1096+
return std::nullopt;
1097+
}
1098+
1099+
std::optional<float> Lexer::takeF32() {
1100+
constexpr int signif = 23;
1101+
constexpr uint32_t payloadMask = (1u << signif) - 1;
1102+
constexpr uint64_t nanDefault = 1ull << (signif - 1);
1103+
if (auto result = float_(next())) {
1104+
float f = result->d;
1105+
if (std::isnan(f)) {
1106+
// Validate and inject payload.
1107+
uint64_t payload = result->nanPayload ? *result->nanPayload : nanDefault;
1108+
if (payload == 0 || payload > payloadMask) {
1109+
// TODO: Add error production for out-of-bounds payload.
1110+
return std::nullopt;
1111+
}
1112+
uint32_t bits;
1113+
static_assert(sizeof(bits) == sizeof(f));
1114+
memcpy(&bits, &f, sizeof(bits));
1115+
bits = (bits & ~payloadMask) | payload;
1116+
memcpy(&f, &bits, sizeof(bits));
1117+
}
1118+
index += result->span.size();
1119+
advance();
1120+
return f;
1121+
}
1122+
if (auto result = integer(next())) {
1123+
index += result->span.size();
1124+
advance();
1125+
if (result->sign == Neg) {
1126+
if (result->n == 0) {
1127+
return -0.0f;
1128+
}
1129+
return float(int64_t(result->n));
1130+
}
1131+
return float(result->n);
1132+
}
1133+
return std::nullopt;
11471134
}
11481135

11491136
TextPos Lexer::position(const char* c) const {
@@ -1164,52 +1151,8 @@ bool TextPos::operator==(const TextPos& other) const {
11641151
return line == other.line && col == other.col;
11651152
}
11661153

1167-
bool IntTok::operator==(const IntTok& other) const {
1168-
return n == other.n && sign == other.sign;
1169-
}
1170-
1171-
bool FloatTok::operator==(const FloatTok& other) const {
1172-
return std::signbit(d) == std::signbit(other.d) &&
1173-
(d == other.d || (std::isnan(d) && std::isnan(other.d) &&
1174-
nanPayload == other.nanPayload));
1175-
}
1176-
1177-
bool Token::operator==(const Token& other) const {
1178-
return span == other.span &&
1179-
std::visit(
1180-
[](auto& t1, auto& t2) {
1181-
if constexpr (std::is_same_v<decltype(t1), decltype(t2)>) {
1182-
return t1 == t2;
1183-
} else {
1184-
return false;
1185-
}
1186-
},
1187-
data,
1188-
other.data);
1189-
}
1190-
11911154
std::ostream& operator<<(std::ostream& os, const TextPos& pos) {
11921155
return os << pos.line << ":" << pos.col;
11931156
}
11941157

1195-
std::ostream& operator<<(std::ostream& os, const IntTok& tok) {
1196-
return os << (tok.sign == Pos ? "+" : tok.sign == Neg ? "-" : "") << tok.n;
1197-
}
1198-
1199-
std::ostream& operator<<(std::ostream& os, const FloatTok& tok) {
1200-
if (std::isnan(tok.d)) {
1201-
os << (std::signbit(tok.d) ? "+" : "-");
1202-
if (tok.nanPayload) {
1203-
return os << "nan:0x" << std::hex << *tok.nanPayload << std::dec;
1204-
}
1205-
return os << "nan";
1206-
}
1207-
return os << tok.d;
1208-
}
1209-
1210-
std::ostream& operator<<(std::ostream& os, const Token& tok) {
1211-
std::visit([&](const auto& t) { os << t; }, tok.data);
1212-
return os << " \"" << tok.span << "\"";
1213-
}
1214-
12151158
} // namespace wasm::WATParser

0 commit comments

Comments
 (0)