Skip to content

Commit 5fa57f8

Browse files
author
Mikhail Arkhipov
authored
Handle numbers formatted with underscores in tokenizer (#1819)
* Undo changes * Test fixes * Increase timeout * Remove double event listening * Remove test * Revert "Remove test" This reverts commit e240c3f. * Revert "Remove double event listening" This reverts commit af573be. * #1096 The if statement is automatically formatted incorrectly * Merge fix * Add more tests * More tests * Typo * Test * Also better handle multiline arguments * Add a couple missing periods [skip ci] * Undo changes * Test fixes * Increase timeout * Remove double event listening * Remove test * Revert "Remove test" This reverts commit e240c3f. * Revert "Remove double event listening" This reverts commit af573be. * Merge fix * #1257 On type formatting errors for args and kwargs * Handle f-strings * Stop importing from test code * #1308 Single line statements leading to an indentation on the next line * #726 editing python after inline if statement invalid indent * Undo change * Move constant * Harden LS startup error checks * #1364 Intellisense doesn't work after specific const string * Telemetry for the analysis engine * PR feedback * Fix typo * Test baseline update * Jedi 0.12 * Priority to goto_definition * News * Replace unzip * Linux flavors + test * Grammar check * Grammar test * Test baselines * Add news * Pin dependency [skip ci] * Specify markdown as preferable format * Improve function argument detection * Specify markdown * Pythia setting * Baseline updates * Baseline update * Improve startup * Handle missing interpreter better * Handle interpreter change * Delete old file * Fix LS startup time reporting * Remove Async suffix from IFileSystem * Remove Pythia * Remove pre-packaged MSIL * Exe name on Unix * Plain linux * Fix casing * Fix message * Update PTVS engine activation steps * Type formatter eats space in from . 
* Fix casing * Remove flag * Don't wait for LS * Small test fixes * Update hover baselines * Rename the engine * Formatting 1 * Add support for 'rf' strings * Add two spaces before comment per PEP * Fix @ operator spacing * Handle module and unary ops * Type hints * Fix typo * Trailing comma * Require space after if * underscore numbers * Update list of keywords * PR feedback * News * Use a bit more Markdown in the news entry
1 parent 51224e7 commit 5fa57f8

File tree

4 files changed

+84
-33
lines changed

4 files changed

+84
-33
lines changed

news/2 Fixes/1779.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
`editor.formatOnType` no longer breaks numbers formatted with underscores.

src/client/language/characters.ts

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -83,18 +83,22 @@ export function isLineBreak(ch: number): boolean {
8383
return ch === Char.CarriageReturn || ch === Char.LineFeed;
8484
}
8585

86+
export function isNumber(ch: number): boolean {
87+
return ch >= Char._0 && ch <= Char._9 || ch === Char.Underscore;
88+
}
89+
8690
export function isDecimal(ch: number): boolean {
87-
return ch >= Char._0 && ch <= Char._9;
91+
return ch >= Char._0 && ch <= Char._9 || ch === Char.Underscore;
8892
}
8993

9094
export function isHex(ch: number): boolean {
91-
return isDecimal(ch) || (ch >= Char.a && ch <= Char.f) || (ch >= Char.A && ch <= Char.F);
95+
return isDecimal(ch) || (ch >= Char.a && ch <= Char.f) || (ch >= Char.A && ch <= Char.F) || ch === Char.Underscore;
9296
}
9397

9498
export function isOctal(ch: number): boolean {
95-
return ch >= Char._0 && ch <= Char._7;
99+
return ch >= Char._0 && ch <= Char._7 || ch === Char.Underscore;
96100
}
97101

98102
export function isBinary(ch: number): boolean {
99-
return ch === Char._0 || ch === Char._1;
103+
return ch === Char._0 || ch === Char._1 || ch === Char.Underscore;
100104
}

src/client/language/tokenizer.ts

Lines changed: 46 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
// tslint:disable-next-line:import-name
66
import Char from 'typescript-char';
7-
import { isBinary, isDecimal, isHex, isIdentifierChar, isIdentifierStartChar, isOctal } from './characters';
7+
import { isBinary, isDecimal, isHex, isIdentifierChar, isIdentifierStartChar, isOctal, isWhiteSpace } from './characters';
88
import { CharacterStream } from './characterStream';
99
import { TextRangeCollection } from './textRangeCollection';
1010
import { ICharacterStream, ITextRangeCollection, IToken, ITokenizer, TextRange, TokenizerMode, TokenType } from './types';
@@ -29,13 +29,8 @@ class Token extends TextRange implements IToken {
2929
export class Tokenizer implements ITokenizer {
3030
private cs: ICharacterStream = new CharacterStream('');
3131
private tokens: IToken[] = [];
32-
private floatRegex = /[-+]?(?:(?:\d*\.\d+)|(?:\d+\.?))(?:[Ee][+-]?\d+)?/;
3332
private mode = TokenizerMode.Full;
3433

35-
constructor() {
36-
//this.floatRegex.compile();
37-
}
38-
3934
public tokenize(text: string): ITextRangeCollection<IToken>;
4035
public tokenize(text: string, start: number, length: number, mode: TokenizerMode): ITextRangeCollection<IToken>;
4136

@@ -224,43 +219,74 @@ export class Tokenizer implements ITokenizer {
224219

225220
if (this.cs.currentChar === Char._0) {
226221
let radix = 0;
227-
// Try hex
228-
if (this.cs.nextChar === Char.x || this.cs.nextChar === Char.X) {
222+
// Try hex => hexinteger: "0" ("x" | "X") (["_"] hexdigit)+
223+
if ((this.cs.nextChar === Char.x || this.cs.nextChar === Char.X) && isHex(this.cs.lookAhead(2))) {
229224
this.cs.advance(2);
230225
while (isHex(this.cs.currentChar)) {
231226
this.cs.moveNext();
232227
}
233228
radix = 16;
234229
}
235-
// Try binary
236-
if (this.cs.nextChar === Char.b || this.cs.nextChar === Char.B) {
230+
// Try binary => bininteger: "0" ("b" | "B") (["_"] bindigit)+
231+
if ((this.cs.nextChar === Char.b || this.cs.nextChar === Char.B) && isBinary(this.cs.lookAhead(2))) {
237232
this.cs.advance(2);
238233
while (isBinary(this.cs.currentChar)) {
239234
this.cs.moveNext();
240235
}
241236
radix = 2;
242237
}
243-
// Try octal
244-
if (this.cs.nextChar === Char.o || this.cs.nextChar === Char.O) {
238+
// Try octal => octinteger: "0" ("o" | "O") (["_"] octdigit)+
239+
if ((this.cs.nextChar === Char.o || this.cs.nextChar === Char.O) && isOctal(this.cs.lookAhead(2))) {
245240
this.cs.advance(2);
246241
while (isOctal(this.cs.currentChar)) {
247242
this.cs.moveNext();
248243
}
249244
radix = 8;
250245
}
246+
if (radix > 0) {
247+
const text = this.cs.getText().substr(start + leadingSign, this.cs.position - start - leadingSign);
248+
if (!isNaN(parseInt(text, radix))) {
249+
this.tokens.push(new Token(TokenType.Number, start, text.length + leadingSign));
250+
return true;
251+
}
252+
}
253+
}
254+
255+
let decimal = false;
256+
// Try decimal int =>
257+
// decinteger: nonzerodigit (["_"] digit)* | "0" (["_"] "0")*
258+
// nonzerodigit: "1"..."9"
259+
// digit: "0"..."9"
260+
if (this.cs.currentChar >= Char._1 && this.cs.currentChar <= Char._9) {
261+
while (isDecimal(this.cs.currentChar)) {
262+
this.cs.moveNext();
263+
}
264+
decimal = this.cs.currentChar !== Char.Period && this.cs.currentChar !== Char.e && this.cs.currentChar !== Char.E;
265+
}
266+
267+
if (this.cs.currentChar === Char._0) { // "0" (["_"] "0")*
268+
while (this.cs.currentChar === Char._0 || this.cs.currentChar === Char.Underscore) {
269+
this.cs.moveNext();
270+
}
271+
decimal = this.cs.currentChar !== Char.Period && this.cs.currentChar !== Char.e && this.cs.currentChar !== Char.E;
272+
}
273+
274+
if (decimal) {
251275
const text = this.cs.getText().substr(start + leadingSign, this.cs.position - start - leadingSign);
252-
if (radix > 0 && parseInt(text.substr(2), radix)) {
276+
if (!isNaN(parseInt(text, 10))) {
253277
this.tokens.push(new Token(TokenType.Number, start, text.length + leadingSign));
254278
return true;
255279
}
256280
}
257281

258-
if (isDecimal(this.cs.currentChar) || this.cs.currentChar === Char.Period) {
259-
const candidate = this.cs.getText().substr(this.cs.position);
260-
const re = this.floatRegex.exec(candidate);
261-
if (re && re.length > 0 && re[0] && candidate.startsWith(re[0])) {
262-
this.tokens.push(new Token(TokenType.Number, start, re[0].length + leadingSign));
263-
this.cs.position = start + re[0].length + leadingSign;
282+
// Floating point
283+
if ((this.cs.currentChar >= Char._0 && this.cs.currentChar <= Char._9) || this.cs.currentChar === Char.Period) {
284+
while (!isWhiteSpace(this.cs.currentChar)) {
285+
this.cs.moveNext();
286+
}
287+
const text = this.cs.getText().substr(start, this.cs.position - start);
288+
if (!isNaN(parseFloat(text))) {
289+
this.tokens.push(new Token(TokenType.Number, start, this.cs.position - start));
264290
return true;
265291
}
266292
}
@@ -380,7 +406,7 @@ export class Tokenizer implements ITokenizer {
380406
case 'rf':
381407
case 'ur':
382408
case 'br':
383-
return 2;
409+
return 2;
384410
default:
385411
break;
386412
}

src/test/language/tokenizer.test.ts

Lines changed: 29 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ suite('Language.Tokenizer', () => {
193193
test('Hex number', () => {
194194
const t = new Tokenizer();
195195
const tokens = t.tokenize('1 0X2 0x3 0x');
196-
assert.equal(tokens.count, 4);
196+
assert.equal(tokens.count, 5);
197197

198198
assert.equal(tokens.getItemAt(0).type, TokenType.Number);
199199
assert.equal(tokens.getItemAt(0).length, 1);
@@ -204,13 +204,16 @@ suite('Language.Tokenizer', () => {
204204
assert.equal(tokens.getItemAt(2).type, TokenType.Number);
205205
assert.equal(tokens.getItemAt(2).length, 3);
206206

207-
assert.equal(tokens.getItemAt(3).type, TokenType.Unknown);
208-
assert.equal(tokens.getItemAt(3).length, 2);
207+
assert.equal(tokens.getItemAt(3).type, TokenType.Number);
208+
assert.equal(tokens.getItemAt(3).length, 1);
209+
210+
assert.equal(tokens.getItemAt(4).type, TokenType.Identifier);
211+
assert.equal(tokens.getItemAt(4).length, 1);
209212
});
210213
test('Binary number', () => {
211214
const t = new Tokenizer();
212215
const tokens = t.tokenize('1 0B1 0b010 0b3 0b');
213-
assert.equal(tokens.count, 6);
216+
assert.equal(tokens.count, 7);
214217

215218
assert.equal(tokens.getItemAt(0).type, TokenType.Number);
216219
assert.equal(tokens.getItemAt(0).length, 1);
@@ -227,13 +230,16 @@ suite('Language.Tokenizer', () => {
227230
assert.equal(tokens.getItemAt(4).type, TokenType.Identifier);
228231
assert.equal(tokens.getItemAt(4).length, 2);
229232

230-
assert.equal(tokens.getItemAt(5).type, TokenType.Unknown);
231-
assert.equal(tokens.getItemAt(5).length, 2);
233+
assert.equal(tokens.getItemAt(5).type, TokenType.Number);
234+
assert.equal(tokens.getItemAt(5).length, 1);
235+
236+
assert.equal(tokens.getItemAt(6).type, TokenType.Identifier);
237+
assert.equal(tokens.getItemAt(6).length, 1);
232238
});
233239
test('Octal number', () => {
234240
const t = new Tokenizer();
235241
const tokens = t.tokenize('1 0o4 0o077 -0o200 0o9 0oO');
236-
assert.equal(tokens.count, 7);
242+
assert.equal(tokens.count, 8);
237243

238244
assert.equal(tokens.getItemAt(0).type, TokenType.Number);
239245
assert.equal(tokens.getItemAt(0).length, 1);
@@ -253,8 +259,11 @@ suite('Language.Tokenizer', () => {
253259
assert.equal(tokens.getItemAt(5).type, TokenType.Identifier);
254260
assert.equal(tokens.getItemAt(5).length, 2);
255261

256-
assert.equal(tokens.getItemAt(6).type, TokenType.Unknown);
257-
assert.equal(tokens.getItemAt(6).length, 3);
262+
assert.equal(tokens.getItemAt(6).type, TokenType.Number);
263+
assert.equal(tokens.getItemAt(6).length, 1);
264+
265+
assert.equal(tokens.getItemAt(7).type, TokenType.Identifier);
266+
assert.equal(tokens.getItemAt(7).length, 2);
258267
});
259268
test('Decimal number', () => {
260269
const t = new Tokenizer();
@@ -301,6 +310,17 @@ suite('Language.Tokenizer', () => {
301310
assert.equal(tokens.getItemAt(5).type, TokenType.Number);
302311
assert.equal(tokens.getItemAt(5).length, 5);
303312
});
313+
test('Underscore numbers', () => {
314+
const t = new Tokenizer();
315+
const tokens = t.tokenize('+1_0_0_0 0_0 .5_00_3e-4 0xCAFE_F00D 10_000_000.0 0b_0011_1111_0100_1110');
316+
const lengths = [8, 3, 10, 11, 12, 22];
317+
assert.equal(tokens.count, 6);
318+
319+
for (let i = 0; i < tokens.count; i += 1) {
320+
assert.equal(tokens.getItemAt(i).type, TokenType.Number);
321+
assert.equal(tokens.getItemAt(i).length, lengths[i]);
322+
}
323+
});
304324
test('Simple expression, leading minus', () => {
305325
const t = new Tokenizer();
306326
const tokens = t.tokenize('x == -y');

0 commit comments

Comments
 (0)