Skip to content

Commit 0cf4af8

Browse files
feat: unicode modes (#85)
1 parent 17641ad commit 0cf4af8

File tree

11 files changed

+339
-38
lines changed

11 files changed

+339
-38
lines changed

Diff for: README.md

+20-14
Original file line numberDiff line numberDiff line change
@@ -127,18 +127,21 @@ See [Quantifiers API doc](https://callstack.github.io/ts-regex-builder/api/quant
127127

128128
### Character classes
129129

130-
| Character class | Regex Syntax | Description |
131-
| --------------------- | ------------ | ------------------------------------------------- |
132-
| `any` | `.` | Any character |
133-
| `word` | `\w` | Word character: letter, digit, underscore |
134-
| `digit` | `\d` | Digit character: 0 to 9 |
135-
| `whitespace` | `\s` | Whitespace character: space, tab, line break, ... |
136-
| `anyOf('abc')` | `[abc]` | Any of provided characters |
137-
| `charRange('a', 'z')` | `[a-z]` | Character in a range |
138-
| `charClass(...)` | `[...]` | Union of multiple character classes |
139-
| `negated(...)` | `[^...]` | Negation of a given character class |
140-
141-
See [Character Classes API doc](https://callstack.github.io/ts-regex-builder/api/character-classes) for more info.
130+
| Character class | Regex Syntax | Description |
131+
| ---------------------- | ------------ | ------------------------------------------------- |
132+
| `any` | `.` | Any character |
133+
| `word` | `\w` | Word character: letter, digit, underscore |
134+
| `digit` | `\d` | Digit character: 0 to 9 |
135+
| `whitespace` | `\s` | Whitespace character: space, tab, line break, ... |
136+
| `anyOf('abc')` | `[abc]` | Any of provided characters |
137+
| `charRange('a', 'z')` | `[a-z]` | Character in a range |
138+
| `charClass(...)` | `[...]` | Union of multiple character classes |
139+
| `negated(...)` | `[^...]` | Negation of a given character class |
140+
| `char(...)` | `\uXXXX` | Character specified by the given Unicode code point |
141+
| `unicodeProperty(...)` | `\p{...}` | Characters with given Unicode property |
142+
143+
144+
See [Character Classes API doc](https://callstack.github.io/ts-regex-builder/api/character-classes) and [Unicode API doc](https://callstack.github.io/ts-regex-builder/api/unicode) for more info.
142145

143146
### Assertions
144147

@@ -177,9 +180,12 @@ TS Regex Builder is inspired by [Swift Regex Builder API](https://developer.appl
177180

178181
## Reference
179182

180-
- [ECMAScript Regular Expression BNF Grammar](https://262.ecma-international.org/7.0/#sec-regular-expressions)
181-
- [Swift Regex Builder API docs](https://developer.apple.com/documentation/regexbuilder)
183+
- [ECMAScript Regular Expression BNF Grammar](https://tc39.es/ecma262/#sec-regular-expressions)
184+
- [Unicode Regular Expressions](https://www.unicode.org/reports/tr18/)
182185
- [Swift Evolution 351: Regex Builder DSL](https://github.com/apple/swift-evolution/blob/main/proposals/0351-regex-builder.md)
186+
- [Swift Regex Builder API docs](https://developer.apple.com/documentation/regexbuilder)
187+
188+
183189

184190
---
185191

Diff for: src/__tests__/builder.test.ts

+22-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { buildRegExp } from '..';
1+
import { buildRegExp, char, unicodeProperty } from '..';
22

33
test('`regexBuilder` flags', () => {
44
expect(buildRegExp('a').flags).toBe('');
@@ -32,3 +32,24 @@ test('`regexBuilder` flags', () => {
3232
}).flags,
3333
).toBe('gisy');
3434
});
35+
36+
test('`regexBuilder` throws when using unicode-aware features without `unicode` flag', () => {
37+
expect(() => buildRegExp(char(0x1234))).not.toThrow();
38+
expect(() => buildRegExp(char(0x12345), { unicode: true })).not.toThrow();
39+
expect(() => buildRegExp(unicodeProperty('Emoji_Presentation'), { unicode: true })).not.toThrow();
40+
41+
expect(() => buildRegExp(char(0x123456))).toThrowErrorMatchingInlineSnapshot(
42+
`"Expected a valid unicode code point but received 1193046"`,
43+
);
44+
expect(() => buildRegExp(char(0x12345))).toThrowErrorMatchingInlineSnapshot(
45+
`"The pattern "\\u{12345}" requires Unicode-aware mode. Please ensure the "unicode" flag is set."`,
46+
);
47+
expect(() =>
48+
buildRegExp(unicodeProperty('Emoji_Presentation')),
49+
).toThrowErrorMatchingInlineSnapshot(
50+
`"The pattern "\\p{Emoji_Presentation}" requires Unicode-aware mode. Please ensure the "unicode" flag is set."`,
51+
);
52+
expect(() => buildRegExp(/\P{Letter}/u)).toThrowErrorMatchingInlineSnapshot(
53+
`"The pattern "\\P{Letter}" requires Unicode-aware mode. Please ensure the "unicode" flag is set."`,
54+
);
55+
});

Diff for: src/builders.ts

+18
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,16 @@ import { encode } from './encoder';
1111
export function buildRegExp(sequence: RegexSequence, flags?: RegexFlags): RegExp {
1212
const pattern = encode(sequence).pattern;
1313
const flagsString = encodeFlags(flags ?? {});
14+
15+
if (!flags?.unicode) {
16+
const unicodeModePattern = getUnicodeModePattern(pattern);
17+
if (unicodeModePattern) {
18+
throw new Error(
19+
`The pattern "${unicodeModePattern}" requires Unicode-aware mode. Please ensure the "unicode" flag is set.`,
20+
);
21+
}
22+
}
23+
1424
return new RegExp(pattern, flagsString);
1525
}
1626

@@ -32,6 +42,14 @@ function encodeFlags(flags: RegexFlags): string {
3242
if (flags.hasIndices) result += 'd';
3343
if (flags.dotAll) result += 's';
3444
if (flags.sticky) result += 'y';
45+
if (flags.unicode) result += 'u';
3546

3647
return result;
3748
}
49+
50+
const unicodeModePatterns = /(?:\\u|\\p|\\P)\{.+?\}/;
51+
52+
function getUnicodeModePattern(pattern: string): string | null {
53+
const match = pattern.match(unicodeModePatterns);
54+
return match?.[0] ?? null;
55+
}
+154
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
import {
2+
buildRegExp,
3+
char,
4+
charClass,
5+
endOfString,
6+
type RegexSequence,
7+
startOfString,
8+
unicodeProperty,
9+
} from '../..';
10+
11+
function u(sequence: RegexSequence) {
12+
return buildRegExp(sequence, { unicode: true });
13+
}
14+
15+
test('`char` pattern', () => {
16+
// eslint-disable-next-line no-control-regex
17+
expect(char(0)).toEqualRegex(/\u0000/);
18+
// eslint-disable-next-line no-control-regex
19+
expect(char(0x1)).toEqualRegex(/\u0001/);
20+
// eslint-disable-next-line no-control-regex
21+
expect(char(0x12)).toEqualRegex(/\u0012/);
22+
expect(char(0x123)).toEqualRegex(/\u0123/);
23+
expect(char(0x1234)).toEqualRegex(/\u1234/);
24+
25+
// eslint-disable-next-line no-control-regex
26+
expect(u(char(0))).toEqualRegex(new RegExp('\\u0000', 'u'));
27+
// eslint-disable-next-line no-control-regex
28+
expect(u(char(0x1))).toEqualRegex(new RegExp('\\u0001', 'u'));
29+
expect(u(char(0x12))).toEqualRegex(
30+
// eslint-disable-next-line no-control-regex
31+
new RegExp('\\u0012', 'u'),
32+
);
33+
expect(char(0x0123)).toEqualRegex(/\u0123/);
34+
expect(char(0x1234)).toEqualRegex(/\u1234/);
35+
36+
expect(u(char(0x0123))).toEqualRegex(/\u0123/u);
37+
expect(u(char(0x1234))).toEqualRegex(/\u1234/u);
38+
expect(u(char(0x12345))).toEqualRegex(new RegExp('\\u{12345}', 'u'));
39+
expect(u(char(0x103456))).toEqualRegex(new RegExp('\\u{103456}', 'u'));
40+
});
41+
42+
test('`char` matching', () => {
43+
expect(char(0)).toMatchString('\u{0}');
44+
expect(char(0x1)).toMatchString('\u{1}');
45+
expect(char(0x12)).toMatchString('\u{12}}');
46+
expect(char(0x123)).toMatchString('\u{123}');
47+
expect(char(0x1234)).toMatchString('\u{1234}}');
48+
49+
expect(char('a'.codePointAt(0)!)).toMatchString('a');
50+
expect(char('ą'.codePointAt(0)!)).toMatchString('ą');
51+
expect(char('©'.codePointAt(0)!)).toMatchString('©');
52+
53+
expect(u(char(0))).toMatchString('\u{0}');
54+
expect(u(char(0))).not.toMatchString('a');
55+
expect(u(char(0x1))).toMatchString('\u{1}');
56+
expect(u(char(0x12))).toMatchString('\u{12}');
57+
expect(u(char(0x123))).toMatchString('\u{123}');
58+
expect(u(char(0x1234))).toMatchString('\u{1234}');
59+
expect(u(char(0x12345))).toMatchString('\u{12345}');
60+
expect(u(char(0x103456))).toMatchString('\u{103456}');
61+
62+
expect(u(char('a'.codePointAt(0)!))).toMatchString('a');
63+
expect(u(char('ą'.codePointAt(0)!))).toMatchString('ą');
64+
expect(u(char('©'.codePointAt(0)!))).toMatchString('©');
65+
expect(u(char('😎'.codePointAt(0)!))).toMatchString('😎');
66+
expect(u(char('😎'.codePointAt(0)!))).toMatchString('\u{1f60e}');
67+
});
68+
69+
test('`char` nesting matching', () => {
70+
expect(u(charClass(char('a'.codePointAt(0)!), char('ą'.codePointAt(0)!)))).toMatchString('a');
71+
expect(u(charClass(char('a'.codePointAt(0)!), char('ą'.codePointAt(0)!)))).toMatchString('ą');
72+
expect(u(charClass(char('a'.codePointAt(0)!), char('ą'.codePointAt(0)!)))).not.toMatchString('b');
73+
});
74+
75+
test('`char` edge cases handling', () => {
76+
expect(() => u(char(NaN))).toThrowErrorMatchingInlineSnapshot(
77+
`"Expected a valid unicode code point but received NaN"`,
78+
);
79+
expect(() => u(char(1.5))).toThrowErrorMatchingInlineSnapshot(
80+
`"Expected a valid unicode code point but received 1.5"`,
81+
);
82+
expect(() => u(char(-1))).toThrowErrorMatchingInlineSnapshot(
83+
`"Expected a valid unicode code point but received -1"`,
84+
);
85+
expect(() => u(char(0x110000))).toThrowErrorMatchingInlineSnapshot(
86+
`"Expected a valid unicode code point but received 1114112"`,
87+
);
88+
89+
expect(u(char(0x10ffff))).toEqualRegex(/\u{10ffff}/u);
90+
});
91+
92+
test('`unicodeProperty` pattern', () => {
93+
expect(u(unicodeProperty('General_Category', 'Letter'))).toEqualRegex(
94+
/\p{General_Category=Letter}/u,
95+
);
96+
expect(u(unicodeProperty('Letter'))).toEqualRegex(/\p{Letter}/u);
97+
expect(u(unicodeProperty('L'))).toEqualRegex(/\p{L}/u);
98+
expect(u(unicodeProperty('Lu'))).toEqualRegex(/\p{Lu}/u);
99+
expect(u(unicodeProperty('Ll'))).toEqualRegex(/\p{Ll}/u);
100+
expect(u(unicodeProperty('Lt'))).toEqualRegex(/\p{Lt}/u);
101+
expect(u(unicodeProperty('Lm'))).toEqualRegex(/\p{Lm}/u);
102+
expect(u(unicodeProperty('Lo'))).toEqualRegex(/\p{Lo}/u);
103+
104+
expect(u(unicodeProperty('Script', 'Latin'))).toEqualRegex('\\p{Script=Latin}');
105+
expect(u(unicodeProperty('Script', 'Grek'))).toEqualRegex('\\p{Script=Grek}');
106+
expect(u(unicodeProperty('sc', 'Cyrillic'))).toEqualRegex('\\p{sc=Cyrillic}');
107+
108+
expect(u(unicodeProperty('Script', 'Thaana'))).toEqualRegex('\\p{Script=Thaana}');
109+
expect(u(unicodeProperty('Script_Extensions', 'Thaana'))).toEqualRegex(
110+
'\\p{Script_Extensions=Thaana}',
111+
);
112+
expect(u(unicodeProperty('scx', 'Thaana'))).toEqualRegex('\\p{scx=Thaana}');
113+
114+
expect(u(unicodeProperty('Emoji'))).toEqualRegex('\\p{Emoji}');
115+
});
116+
117+
test('`unicodeProperty` matching', () => {
118+
expect(u(unicodeProperty('General_Category', 'Letter'))).toMatchString('A');
119+
expect(u(unicodeProperty('Letter'))).toMatchString('A');
120+
expect(u(unicodeProperty('L'))).toMatchString('A');
121+
122+
expect(u(unicodeProperty('Uppercase'))).toMatchString('A');
123+
expect(u(unicodeProperty('Uppercase'))).not.toMatchString('a');
124+
expect(u(unicodeProperty('Lu'))).toMatchString('A');
125+
126+
expect(u(unicodeProperty('Lowercase'))).toMatchString('a');
127+
expect(u(unicodeProperty('Lowercase'))).not.toMatchString('A');
128+
expect(u(unicodeProperty('Ll'))).toMatchString('a');
129+
130+
expect(u(unicodeProperty('Script', 'Latin'))).toMatchString('A');
131+
expect(u(unicodeProperty('Script', 'Latin'))).not.toMatchString('α');
132+
expect(u(unicodeProperty('Script', 'Grek'))).toMatchString('α');
133+
expect(u(unicodeProperty('Script', 'Grek'))).not.toMatchString('A');
134+
135+
// Basic emoji
136+
expect(u([startOfString, unicodeProperty('Emoji'), endOfString])).toMatchString('😎');
137+
expect(u([startOfString, unicodeProperty('Emoji'), endOfString])).toMatchString('🐌');
138+
139+
// Complex emoji with skin tone modifier
140+
expect(u(unicodeProperty('Emoji'))).toMatchString('☝🏼');
141+
expect(u([startOfString, unicodeProperty('Emoji'), endOfString])).not.toMatchString('☝🏼');
142+
});
143+
144+
test('`unicodeProperty` nesting matching', () => {
145+
expect(u(charClass(unicodeProperty('Lowercase'), unicodeProperty('White_Space')))).toMatchString(
146+
'a',
147+
);
148+
expect(u(charClass(unicodeProperty('Lowercase'), unicodeProperty('White_Space')))).toMatchString(
149+
' ',
150+
);
151+
expect(
152+
u(charClass(unicodeProperty('Lowercase'), unicodeProperty('White_Space'))),
153+
).not.toMatchString('A');
154+
});

Diff for: src/constructs/char-class.ts

+2-2
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ export function charRange(start: string, end: string): CharacterClass {
3232
}
3333

3434
export function anyOf(characters: string): CharacterClass {
35-
const chars = characters.split('').map((c) => escapeForCharacterClass(c));
35+
const chars = characters.split('').map((c) => escapeCharClass(c));
3636

3737
if (chars.length === 0) {
3838
throw new Error('`anyOf` should received at least one character');
@@ -52,6 +52,6 @@ export function negated(element: CharacterClass | CharacterEscape): EncodedRegex
5252
*/
5353
export const inverted = negated;
5454

55-
function escapeForCharacterClass(text: string): string {
55+
function escapeCharClass(text: string): string {
5656
return text.replace(/[\]\\]/g, '\\$&'); // $& means the whole matched string
5757
}

Diff for: src/constructs/char-escape.ts

+51
Original file line numberDiff line numberDiff line change
@@ -59,3 +59,54 @@ export const notWord = nonWord;
5959
* @deprecated Renamed to `nonWhitespace`.
6060
*/
6161
export const notWhitespace = nonWhitespace;
62+
63+
/**
64+
* Unicode character code point escape.
65+
*
66+
* Regex pattern:
67+
* - `\uXXXX`: 4-digit hex escape for code points below 0x10000.
68+
* - `\u{X}`: Unicode code point escape for code points above 0xFFFF.
69+
*
70+
* Note: for code points above 0xFFFF, the regex must be [unicode-aware](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode).
71+
*
72+
* @param codePoint The code point of the character to escape.
73+
* @returns A character class representing the unicode escape.
74+
*/
75+
export function char(codePoint: number): CharacterEscape {
76+
if (!Number.isInteger(codePoint) || codePoint < 0 || codePoint > 0x10ffff) {
77+
throw new RangeError(`Expected a valid unicode code point but received ${codePoint}`);
78+
}
79+
80+
let escape =
81+
codePoint < 0x10000
82+
? `\\u${codePoint.toString(16).padStart(4, '0')}` // 4-digit hex (works in all modes)
83+
: `\\u{${codePoint.toString(16)}}`; // 1-6 digit hex (requires unicode-aware mode)
84+
85+
return {
86+
precedence: 'atom',
87+
pattern: escape,
88+
chars: [escape],
89+
};
90+
}
91+
92+
/**
93+
* Unicode property escape matching a set of characters specified by a Unicode property.
94+
*
95+
* Regex pattern: `\p{Property}` or `\p{Property=Value}`
96+
* @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Unicode_character_class_escape
97+
*
98+
* Note: the regex must be [unicode-aware](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode).
99+
*
100+
* @param property Unicode property name.
101+
* @param value Unicode property value (optional).
102+
* @returns A character class representing the unicode property escape.
103+
*/
104+
export function unicodeProperty(property: string, value?: string): CharacterEscape {
105+
const escape = `\\p{${property}${value ? `=${value}` : ''}}`;
106+
107+
return {
108+
precedence: 'atom',
109+
pattern: escape,
110+
chars: [escape],
111+
};
112+
}

Diff for: src/index.ts

+10-8
Original file line numberDiff line numberDiff line change
@@ -5,35 +5,37 @@ export type { QuantifierOptions } from './constructs/quantifiers';
55
export type { RepeatOptions } from './constructs/repeat';
66

77
// Builders
8-
export { buildPattern, buildRegExp } from './builders';
8+
export { buildRegExp, buildPattern } from './builders';
99

1010
// Constructs
1111
export {
12+
startOfString,
1213
endOfString,
14+
wordBoundary,
1315
nonWordBoundary,
1416
notWordBoundary,
15-
startOfString,
16-
wordBoundary,
1717
} from './constructs/anchors';
1818
export { capture, ref } from './constructs/capture';
19-
export { anyOf, charClass, charRange, negated, inverted } from './constructs/char-class';
19+
export { charClass, charRange, anyOf, negated, inverted } from './constructs/char-class';
2020
export {
2121
any,
2222
digit,
2323
nonDigit,
24-
nonWhitespace,
24+
word,
2525
nonWord,
26+
whitespace,
27+
nonWhitespace,
2628
notDigit,
2729
notWhitespace,
2830
notWord,
29-
whitespace,
30-
word,
31+
char,
32+
unicodeProperty,
3133
} from './constructs/char-escape';
3234
export { choiceOf } from './constructs/choice-of';
3335
export { lookahead } from './constructs/lookahead';
3436
export { lookbehind } from './constructs/lookbehind';
3537
export { negativeLookahead } from './constructs/negative-lookahead';
3638
export { negativeLookbehind } from './constructs/negative-lookbehind';
37-
export { oneOrMore, optional, zeroOrMore } from './constructs/quantifiers';
39+
export { zeroOrMore, oneOrMore, optional } from './constructs/quantifiers';
3840
export { regex } from './constructs/regex';
3941
export { repeat } from './constructs/repeat';

0 commit comments

Comments
 (0)