Handle numbers formatted with underscores in tokenizer (#1819)

Mikhail Arkhipov · web-flow · commit 5fa57f827054 · 2018-05-31T15:16:04.000-07:00
* Undo changes * Test fixes * Increase timeout * Remove double event listening * Remove test * Revert "Remove test" This reverts commit e240c3f. * Revert "Remove double event listening" This reverts commit af573be. * #1096 The if statement is automatically formatted incorrectly * Merge fix * Add more tests * More tests * Typo * Test * Also better handle multiline arguments * Add a couple missing periods [skip ci] * Undo changes * Test fixes * Increase timeout * Remove double event listening * Remove test * Revert "Remove test" This reverts commit e240c3f. * Revert "Remove double event listening" This reverts commit af573be. * Merge fix * #1257 On type formatting errors for args and kwargs * Handle f-strings * Stop importing from test code * #1308 Single line statements leading to an indentation on the next line * #726 editing python after inline if statement invalid indent * Undo change * Move constant * Harden LS startup error checks * #1364 Intellisense doesn't work after specific const string * Telemetry for the analysis engine * PR feedback * Fix typo * Test baseline update * Jedi 0.12 * Priority to goto_definition * News * Replace unzip * Linux flavors + test * Grammar check * Grammar test * Test baselines * Add news * Pin dependency [skip ci] * Specify markdown as preferable format * Improve function argument detection * Specify markdown * Pythia setting * Baseline updates * Baseline update * Improve startup * Handle missing interpreter better * Handle interpreter change * Delete old file * Fix LS startup time reporting * Remove Async suffix from IFileSystem * Remove Pythia * Remove pre-packaged MSIL * Exe name on Unix * Plain linux * Fix casing * Fix message * Update PTVS engine activation steps * Type formatter eats space in from . 
* Fix casing * Remove flag * Don't wait for LS * Small test fixes * Update hover baselines * Rename the engine * Formatting 1 * Add support for 'rf' strings * Add two spaces before comment per PEP * Fix @ operator spacing * Handle module and unary ops * Type hints * Fix typo * Trailing comma * Require space after if * underscore numbers * Update list of keywords * PR feedback * News * Use a bit more Markdown in the news entry
diff --git a/news/2 Fixes/1779.md b/news/2 Fixes/1779.md
@@ -0,0 +1 @@
+`editor.formatOnType` no longer breaks numbers formatted with underscores.
diff --git a/src/client/language/characters.ts b/src/client/language/characters.ts
@@ -83,18 +83,22 @@ export function isLineBreak(ch: number): boolean {
     return ch === Char.CarriageReturn || ch === Char.LineFeed;
 }
 
+export function isNumber(ch: number): boolean {
+    return ch >= Char._0 && ch <= Char._9 || ch === Char.Underscore;
+}
+
 export function isDecimal(ch: number): boolean {
-    return ch >= Char._0 && ch <= Char._9;
+    return ch >= Char._0 && ch <= Char._9 || ch === Char.Underscore;
 }
 
 export function isHex(ch: number): boolean {
-    return isDecimal(ch) || (ch >= Char.a && ch <= Char.f) || (ch >= Char.A && ch <= Char.F);
+    return isDecimal(ch) || (ch >= Char.a && ch <= Char.f) || (ch >= Char.A && ch <= Char.F) || ch === Char.Underscore;
 }
 
 export function isOctal(ch: number): boolean {
-    return ch >= Char._0 && ch <= Char._7;
+    return ch >= Char._0 && ch <= Char._7 || ch === Char.Underscore;
 }
 
 export function isBinary(ch: number): boolean {
-    return ch === Char._0 || ch === Char._1;
+    return ch === Char._0 || ch === Char._1 || ch === Char.Underscore;
 }
diff --git a/src/client/language/tokenizer.ts b/src/client/language/tokenizer.ts
@@ -4,7 +4,7 @@
 
 // tslint:disable-next-line:import-name
 import Char from 'typescript-char';
-import { isBinary, isDecimal, isHex, isIdentifierChar, isIdentifierStartChar, isOctal } from './characters';
+import { isBinary, isDecimal, isHex, isIdentifierChar, isIdentifierStartChar, isOctal, isWhiteSpace } from './characters';
 import { CharacterStream } from './characterStream';
 import { TextRangeCollection } from './textRangeCollection';
 import { ICharacterStream, ITextRangeCollection, IToken, ITokenizer, TextRange, TokenizerMode, TokenType } from './types';
@@ -29,13 +29,8 @@ class Token extends TextRange implements IToken {
 export class Tokenizer implements ITokenizer {
     private cs: ICharacterStream = new CharacterStream('');
     private tokens: IToken[] = [];
-    private floatRegex = /[-+]?(?:(?:\d*\.\d+)|(?:\d+\.?))(?:[Ee][+-]?\d+)?/;
     private mode = TokenizerMode.Full;
 
-    constructor() {
-        //this.floatRegex.compile();
-    }
-
     public tokenize(text: string): ITextRangeCollection<IToken>;
     public tokenize(text: string, start: number, length: number, mode: TokenizerMode): ITextRangeCollection<IToken>;
 
@@ -224,43 +219,74 @@ export class Tokenizer implements ITokenizer {
 
         if (this.cs.currentChar === Char._0) {
             let radix = 0;
-            // Try hex
-            if (this.cs.nextChar === Char.x || this.cs.nextChar === Char.X) {
+            // Try hex => hexinteger: "0" ("x" | "X") (["_"] hexdigit)+
+            if ((this.cs.nextChar === Char.x || this.cs.nextChar === Char.X) && isHex(this.cs.lookAhead(2))) {
                 this.cs.advance(2);
                 while (isHex(this.cs.currentChar)) {
                     this.cs.moveNext();
                 }
                 radix = 16;
             }
-            // Try binary
-            if (this.cs.nextChar === Char.b || this.cs.nextChar === Char.B) {
+            // Try binary => bininteger: "0" ("b" | "B") (["_"] bindigit)+
+            if ((this.cs.nextChar === Char.b || this.cs.nextChar === Char.B) && isBinary(this.cs.lookAhead(2))) {
                 this.cs.advance(2);
                 while (isBinary(this.cs.currentChar)) {
                     this.cs.moveNext();
                 }
                 radix = 2;
             }
-            // Try octal
-            if (this.cs.nextChar === Char.o || this.cs.nextChar === Char.O) {
+            // Try octal => octinteger: "0" ("o" | "O") (["_"] octdigit)+
+            if ((this.cs.nextChar === Char.o || this.cs.nextChar === Char.O) && isOctal(this.cs.lookAhead(2))) {
                 this.cs.advance(2);
                 while (isOctal(this.cs.currentChar)) {
                     this.cs.moveNext();
                 }
                 radix = 8;
             }
+            if (radix > 0) {
+                const text = this.cs.getText().substr(start + leadingSign, this.cs.position - start - leadingSign);
+                if (!isNaN(parseInt(text, radix))) {
+                    this.tokens.push(new Token(TokenType.Number, start, text.length + leadingSign));
+                    return true;
+                }
+            }
+        }
+
+        let decimal = false;
+        // Try decimal int =>
+        //    decinteger: nonzerodigit (["_"] digit)* | "0" (["_"] "0")*
+        //    nonzerodigit: "1"..."9"
+        //    digit: "0"..."9"
+        if (this.cs.currentChar >= Char._1 && this.cs.currentChar <= Char._9) {
+            while (isDecimal(this.cs.currentChar)) {
+                this.cs.moveNext();
+            }
+            decimal = this.cs.currentChar !== Char.Period && this.cs.currentChar !== Char.e && this.cs.currentChar !== Char.E;
+        }
+
+        if (this.cs.currentChar === Char._0) { // "0" (["_"] "0")*
+            while (this.cs.currentChar === Char._0 || this.cs.currentChar === Char.Underscore) {
+                this.cs.moveNext();
+            }
+            decimal = this.cs.currentChar !== Char.Period && this.cs.currentChar !== Char.e && this.cs.currentChar !== Char.E;
+        }
+
+        if (decimal) {
             const text = this.cs.getText().substr(start + leadingSign, this.cs.position - start - leadingSign);
-            if (radix > 0 && parseInt(text.substr(2), radix)) {
+            if (!isNaN(parseInt(text, 10))) {
                 this.tokens.push(new Token(TokenType.Number, start, text.length + leadingSign));
                 return true;
             }
         }
 
-        if (isDecimal(this.cs.currentChar) || this.cs.currentChar === Char.Period) {
-            const candidate = this.cs.getText().substr(this.cs.position);
-            const re = this.floatRegex.exec(candidate);
-            if (re && re.length > 0 && re[0] && candidate.startsWith(re[0])) {
-                this.tokens.push(new Token(TokenType.Number, start, re[0].length + leadingSign));
-                this.cs.position = start + re[0].length + leadingSign;
+        // Floating point
+        if ((this.cs.currentChar >= Char._0 && this.cs.currentChar <= Char._9) || this.cs.currentChar === Char.Period) {
+            while (!isWhiteSpace(this.cs.currentChar)) {
+                this.cs.moveNext();
+            }
+            const text = this.cs.getText().substr(start, this.cs.position - start);
+            if (!isNaN(parseFloat(text))) {
+                this.tokens.push(new Token(TokenType.Number, start, this.cs.position - start));
                 return true;
             }
         }
@@ -380,7 +406,7 @@ export class Tokenizer implements ITokenizer {
                 case 'rf':
                 case 'ur':
                 case 'br':
-                     return 2;
+                    return 2;
                 default:
                     break;
             }
diff --git a/src/test/language/tokenizer.test.ts b/src/test/language/tokenizer.test.ts
@@ -193,7 +193,7 @@ suite('Language.Tokenizer', () => {
     test('Hex number', () => {
         const t = new Tokenizer();
         const tokens = t.tokenize('1 0X2 0x3 0x');
-        assert.equal(tokens.count, 4);
+        assert.equal(tokens.count, 5);
 
         assert.equal(tokens.getItemAt(0).type, TokenType.Number);
         assert.equal(tokens.getItemAt(0).length, 1);
@@ -204,13 +204,16 @@ suite('Language.Tokenizer', () => {
         assert.equal(tokens.getItemAt(2).type, TokenType.Number);
         assert.equal(tokens.getItemAt(2).length, 3);
 
-        assert.equal(tokens.getItemAt(3).type, TokenType.Unknown);
-        assert.equal(tokens.getItemAt(3).length, 2);
+        assert.equal(tokens.getItemAt(3).type, TokenType.Number);
+        assert.equal(tokens.getItemAt(3).length, 1);
+
+        assert.equal(tokens.getItemAt(4).type, TokenType.Identifier);
+        assert.equal(tokens.getItemAt(4).length, 1);
     });
     test('Binary number', () => {
         const t = new Tokenizer();
         const tokens = t.tokenize('1 0B1 0b010 0b3 0b');
-        assert.equal(tokens.count, 6);
+        assert.equal(tokens.count, 7);
 
         assert.equal(tokens.getItemAt(0).type, TokenType.Number);
         assert.equal(tokens.getItemAt(0).length, 1);
@@ -227,13 +230,16 @@ suite('Language.Tokenizer', () => {
         assert.equal(tokens.getItemAt(4).type, TokenType.Identifier);
         assert.equal(tokens.getItemAt(4).length, 2);
 
-        assert.equal(tokens.getItemAt(5).type, TokenType.Unknown);
-        assert.equal(tokens.getItemAt(5).length, 2);
+        assert.equal(tokens.getItemAt(5).type, TokenType.Number);
+        assert.equal(tokens.getItemAt(5).length, 1);
+
+        assert.equal(tokens.getItemAt(6).type, TokenType.Identifier);
+        assert.equal(tokens.getItemAt(6).length, 1);
     });
     test('Octal number', () => {
         const t = new Tokenizer();
         const tokens = t.tokenize('1 0o4 0o077 -0o200 0o9 0oO');
-        assert.equal(tokens.count, 7);
+        assert.equal(tokens.count, 8);
 
         assert.equal(tokens.getItemAt(0).type, TokenType.Number);
         assert.equal(tokens.getItemAt(0).length, 1);
@@ -253,8 +259,11 @@ suite('Language.Tokenizer', () => {
         assert.equal(tokens.getItemAt(5).type, TokenType.Identifier);
         assert.equal(tokens.getItemAt(5).length, 2);
 
-        assert.equal(tokens.getItemAt(6).type, TokenType.Unknown);
-        assert.equal(tokens.getItemAt(6).length, 3);
+        assert.equal(tokens.getItemAt(6).type, TokenType.Number);
+        assert.equal(tokens.getItemAt(6).length, 1);
+
+        assert.equal(tokens.getItemAt(7).type, TokenType.Identifier);
+        assert.equal(tokens.getItemAt(7).length, 2);
     });
     test('Decimal number', () => {
         const t = new Tokenizer();
@@ -301,6 +310,17 @@ suite('Language.Tokenizer', () => {
         assert.equal(tokens.getItemAt(5).type, TokenType.Number);
         assert.equal(tokens.getItemAt(5).length, 5);
     });
+    test('Underscore numbers', () => {
+        const t = new Tokenizer();
+        const tokens = t.tokenize('+1_0_0_0 0_0 .5_00_3e-4 0xCAFE_F00D 10_000_000.0 0b_0011_1111_0100_1110');
+        const lengths = [8, 3, 10, 11, 12, 22];
+        assert.equal(tokens.count, 6);
+
+        for (let i = 0; i < tokens.count; i += 1) {
+            assert.equal(tokens.getItemAt(i).type, TokenType.Number);
+            assert.equal(tokens.getItemAt(i).length, lengths[i]);
+        }
+    });
     test('Simple expression, leading minus', () => {
         const t = new Tokenizer();
         const tokens = t.tokenize('x == -y');

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+`editor.formatOnType` no longer breaks numbers formatted with underscores.
Original file line number	Diff line number	Diff line change
`@@ -83,18 +83,22 @@ export function isLineBreak(ch: number): boolean {`
`83`	`83`	`return ch === Char.CarriageReturn \|\| ch === Char.LineFeed;`
`84`	`84`	`}`
`85`	`85`
	`86`	`+export function isNumber(ch: number): boolean {`
	`87`	`+ return ch >= Char._0 && ch <= Char._9 \|\| ch === Char.Underscore;`
	`88`	`+}`
	`89`	`+`
`86`	`90`	`export function isDecimal(ch: number): boolean {`
`87`		`- return ch >= Char._0 && ch <= Char._9;`
	`91`	`+ return ch >= Char._0 && ch <= Char._9 \|\| ch === Char.Underscore;`
`88`	`92`	`}`
`89`	`93`
`90`	`94`	`export function isHex(ch: number): boolean {`
`91`		`- return isDecimal(ch) \|\| (ch >= Char.a && ch <= Char.f) \|\| (ch >= Char.A && ch <= Char.F);`
	`95`	`+ return isDecimal(ch) \|\| (ch >= Char.a && ch <= Char.f) \|\| (ch >= Char.A && ch <= Char.F) \|\| ch === Char.Underscore;`
`92`	`96`	`}`
`93`	`97`
`94`	`98`	`export function isOctal(ch: number): boolean {`
`95`		`- return ch >= Char._0 && ch <= Char._7;`
	`99`	`+ return ch >= Char._0 && ch <= Char._7 \|\| ch === Char.Underscore;`
`96`	`100`	`}`
`97`	`101`
`98`	`102`	`export function isBinary(ch: number): boolean {`
`99`		`- return ch === Char._0 \|\| ch === Char._1;`
	`103`	`+ return ch === Char._0 \|\| ch === Char._1 \|\| ch === Char.Underscore;`
`100`	`104`	`}`