Skip to content

Commit b59ecf4

Browse files
authored
Optimize surrogate decoding. (#894)
Use a slightly different approach to recognizing surrogate pairs, which avoids some duplicate computations. A slight speedup was measured locally when compiling with `compile js` and `compile exe`.
1 parent dc97530 commit b59ecf4

File tree

11 files changed

+159
-111
lines changed

11 files changed

+159
-111
lines changed

pkgs/characters/CHANGELOG.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
## 1.4.1-wip
1+
## 1.4.1
22

3-
- Run `dart format` with the new style.
3+
* Run `dart format` with the new style.
4+
* Performance improvement for non-BMP characters.
45

56
## 1.4.0
67

pkgs/characters/lib/src/characters_impl.dart

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -509,15 +509,16 @@ class StringCharacterRange implements CharacterRange {
509509
var index = _end;
510510
while (index < _string.length) {
511511
var char = _string.codeUnitAt(index);
512+
var surrogate = char ^ 0xD800;
512513
var category = categoryControl;
513514
var nextIndex = index + 1;
514-
if (char & 0xFC00 != 0xD800) {
515+
if (surrogate > 0x3FF) {
515516
category = low(char);
516517
} else if (nextIndex < _string.length) {
517-
var nextChar = _string.codeUnitAt(nextIndex);
518-
if (nextChar & 0xFC00 == 0xDC00) {
518+
var nextSurrogate = _string.codeUnitAt(nextIndex) ^ 0xDC00;
519+
if (nextSurrogate <= 0x3FF) {
519520
nextIndex += 1;
520-
category = high(char, nextChar);
521+
category = high(surrogate, nextSurrogate);
521522
}
522523
}
523524
state = move(state, category);

pkgs/characters/lib/src/grapheme_clusters/breaks.dart

Lines changed: 42 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -76,16 +76,17 @@ class Breaks {
7676
void step() {
7777
assert(cursor < end);
7878
var char = base.codeUnitAt(cursor++);
79-
if (char & 0xFC00 != 0xD800) {
79+
var surrogate = char ^ 0xD800;
80+
if (surrogate > 0x3FF) {
8081
state = move(state, low(char));
8182
return;
8283
}
8384
// The category of an unpaired lead surrogate is Control.
8485
int category;
85-
int nextChar;
86+
int nextSurrogate;
8687
if (cursor < end &&
87-
(nextChar = base.codeUnitAt(cursor)) & 0xFC00 == 0xDC00) {
88-
category = high(char, nextChar);
88+
(nextSurrogate = base.codeUnitAt(cursor) ^ 0xDC00) <= 0x3FF) {
89+
category = high(surrogate, nextSurrogate);
8990
cursor++;
9091
} else {
9192
category = categoryControl;
@@ -112,31 +113,36 @@ class Breaks {
112113
}
113114
var cursorBefore = cursor - 1;
114115
var prevChar = base.codeUnitAt(cursorBefore);
115-
int prevCategory;
116-
if (prevChar & 0xF800 != 0xD800) {
116+
var prevSurrogate = prevChar ^ 0xD800;
117+
if (prevSurrogate > 0x7FF) {
117118
// Not surrogate.
118-
prevCategory = low(prevChar);
119-
} else if (prevChar & 0xFC00 == 0xD800) {
120-
// Lead surrogate. Check for a following tail surrogate.
121-
int tailChar;
122-
if (cursor < end &&
123-
(tailChar = base.codeUnitAt(cursor)) & 0xFC00 == 0xDC00) {
124-
cursor += 1;
125-
prevCategory = high(prevChar, tailChar);
126-
} else {
127-
prevCategory = categoryControl;
128-
}
129-
} else {
119+
var prevCategory = low(prevChar);
120+
state = move(stateCAny, prevCategory);
121+
return cursorBefore;
122+
}
123+
int prevCategory;
124+
if (prevSurrogate > 0x3FF) {
130125
// Tail surrogate, check for prior lead surrogate.
131-
int leadChar;
126+
int leadSurrogate;
132127
var leadIndex = cursorBefore - 1;
128+
prevSurrogate &= 0x3FF;
133129
if (leadIndex >= start &&
134-
(leadChar = base.codeUnitAt(leadIndex)) & 0xFC00 == 0xD800) {
135-
prevCategory = high(leadChar, prevChar);
130+
(leadSurrogate = base.codeUnitAt(leadIndex) ^ 0xD800) <= 0x3FF) {
131+
prevCategory = high(leadSurrogate, prevSurrogate);
136132
cursorBefore = leadIndex;
137133
} else {
138134
prevCategory = categoryControl;
139135
}
136+
} else {
137+
// Lead surrogate. Check for a following tail surrogate.
138+
int tailSurrogate;
139+
if (cursor < end &&
140+
(tailSurrogate = base.codeUnitAt(cursor) ^ 0xDC00) <= 0x3FF) {
141+
cursor += 1;
142+
prevCategory = high(prevSurrogate, tailSurrogate);
143+
} else {
144+
prevCategory = categoryControl;
145+
}
140146
}
141147
state = move(stateCAny, prevCategory);
142148
return cursorBefore;
@@ -206,18 +212,19 @@ class BackBreaks {
206212
void step() {
207213
assert(cursor > start);
208214
var char = base.codeUnitAt(--cursor);
209-
if (char & 0xFC00 != 0xDC00) {
215+
var surrogate = char ^ 0xDC00;
216+
if (surrogate > 0x3FF) {
210217
var category = low(char);
211218
state = moveBack(state, category);
212219
return;
213220
}
214221
// Found tail surrogate, check for prior lead surrogate.
215222
// The category of an unpaired tail surrogate is Control.
216223
int category;
217-
int prevChar;
224+
int prevSurrogate;
218225
if (cursor >= start &&
219-
(prevChar = base.codeUnitAt(--cursor)) & 0xFC00 == 0xD800) {
220-
category = high(prevChar, char);
226+
(prevSurrogate = base.codeUnitAt(--cursor) ^ 0xD800) <= 0x3FF) {
227+
category = high(prevSurrogate, surrogate);
221228
} else {
222229
category = categoryControl;
223230
cursor++;
@@ -342,21 +349,23 @@ int previousBreak(String text, int start, int end, int index) {
342349
if (start < index && index < end) {
343350
var cursorBefore = index;
344351
var nextChar = text.codeUnitAt(index);
352+
var nextSurrogate = nextChar ^ 0xD800;
345353
var category = categoryControl;
346-
if (nextChar & 0xF800 != 0xD800) {
354+
if (nextSurrogate > 0x7FF) {
347355
category = low(nextChar);
348-
} else if (nextChar & 0xFC00 == 0xD800) {
356+
} else if (nextSurrogate <= 0x3FF) {
349357
var indexAfter = index + 1;
350358
if (indexAfter < end) {
351-
var secondChar = text.codeUnitAt(indexAfter);
352-
if (secondChar & 0xFC00 == 0xDC00) {
353-
category = high(nextChar, secondChar);
359+
var secondSurrogate = text.codeUnitAt(indexAfter) ^ 0xDC00;
360+
if (secondSurrogate <= 0x3FF) {
361+
category = high(nextSurrogate, secondSurrogate);
354362
}
355363
}
356364
} else {
357-
var prevChar = text.codeUnitAt(index - 1);
358-
if (prevChar & 0xFC00 == 0xD800) {
359-
category = high(prevChar, nextChar);
365+
var prevSurrogate = text.codeUnitAt(index - 1) ^ 0xD800;
366+
nextSurrogate &= 0x3FF;
367+
if (prevSurrogate <= 0x3FF) {
368+
category = high(prevSurrogate, nextSurrogate);
360369
cursorBefore -= 1;
361370
}
362371
}

pkgs/characters/lib/src/grapheme_clusters/table.dart

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1130,6 +1130,7 @@ const String _start = '\u1132\u166c\u166c\u206f\u11c0\u13fb\u166c\u166c\u166c'
11301130
@pragma('vm:prefer-inline')
11311131
@pragma('wasm:prefer-inline')
11321132
int low(int codeUnit) {
1133+
assert(codeUnit <= 0xFFFF);
11331134
var chunkStart = _start.codeUnitAt(codeUnit >> 5);
11341135
var index = chunkStart + (codeUnit & 31);
11351136
return _data.codeUnitAt(index);
@@ -1139,10 +1140,11 @@ int low(int codeUnit) {
11391140
@pragma('vm:prefer-inline')
11401141
@pragma('wasm:prefer-inline')
11411142
int high(int lead, int tail) {
1142-
var offset = (((0x3ff & lead) << 10) + (0x3ff & tail)) + (2048 << 8);
1143-
var chunkStart = _start.codeUnitAt(offset >> 8);
1144-
var index = chunkStart + (tail & 255);
1145-
return _data.codeUnitAt(index);
1143+
assert(lead <= 0x3FF && tail <= 0x3FF);
1144+
var chunkIndex = (tail >> 8) + (lead << 2);
1145+
var byteIndex = tail & 255;
1146+
var chunkStart = _start.codeUnitAt(2048 + chunkIndex);
1147+
return _data.codeUnitAt(chunkStart + byteIndex);
11461148
}
11471149

11481150
const _stateMachine = '\x15\x01)))µ\x8d\x01=QeyeyÉ)))ñð\x15\x01)))µ\x8d\x00=Qey'

pkgs/characters/pubspec.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
name: characters
2-
version: 1.4.1-wip
2+
version: 1.4.1
33
description: >-
44
String replacement with operations that are Unicode/grapheme cluster aware.
55
repository: https://github.com/dart-lang/core/tree/main/pkgs/characters
@@ -14,4 +14,4 @@ environment:
1414

1515
dev_dependencies:
1616
dart_flutter_team_lints: ^3.1.0
17-
test: ^1.16.6
17+
test: ^1.16.0

pkgs/characters/test/characters_test.dart

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -115,28 +115,29 @@ void main([List<String>? args]) {
115115
var zwj = '\u200d'; // U+200D, ZWJ
116116
var rainbow = '\u{1f308}'; // U+1F308, Rainbow. Category Pictogram
117117

118-
var rbflag = '$flag$white$zwj$rainbow';
119-
var string = '-$rbflag-';
118+
var rainbowFlag = '$flag$white$zwj$rainbow';
119+
var string = '-$rainbowFlag-';
120120
var range = CharacterRange.at(string, 1);
121121
expect(range.isEmpty, true);
122122
expect(range.moveNext(), true);
123-
expect(range.current, rbflag);
123+
expect(range.current, rainbowFlag);
124124

125125
range = range = CharacterRange.at(string, 2);
126126
expect(range.isEmpty, false);
127-
expect(range.current, rbflag);
127+
expect(range.current, rainbowFlag);
128128

129129
range = range = CharacterRange.at(string, 0, 2);
130130
expect(range.isEmpty, false);
131-
expect(range.current, '-$rbflag');
131+
expect(range.current, '-$rainbowFlag');
132132

133133
range = range = CharacterRange.at(string, 0, 2);
134134
expect(range.isEmpty, false);
135-
expect(range.current, '-$rbflag');
135+
expect(range.current, '-$rainbowFlag');
136136

137-
range = range = CharacterRange.at(string, 2, '-$rbflag'.length - 1);
137+
range =
138+
range = CharacterRange.at(string, 2, '-$rainbowFlag'.length - 1);
138139
expect(range.isEmpty, false);
139-
expect(range.current, rbflag);
140+
expect(range.current, rainbowFlag);
140141
expect(range.stringBeforeLength, 1);
141142

142143
range = range = CharacterRange.at(string, 0, string.length);

pkgs/characters/test/src/unicode_tests.dart

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ String testDescription(List<String> expected) {
3131
int categoryOf(int codePoint) {
3232
if (codePoint < 0x10000) return low(codePoint);
3333
var nonBmpOffset = codePoint - 0x10000;
34-
return high(0xD800 + (nonBmpOffset >> 10), 0xDC00 + (nonBmpOffset & 0x3ff));
34+
return high(nonBmpOffset >> 10, nonBmpOffset & 0x3ff);
3535
}
3636

3737
String partCategories(List<String> parts) {

pkgs/characters/tool/benchmark.dart

Lines changed: 61 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,14 @@ import '../test/src/unicode_grapheme_tests.dart';
1010
import '../test/src/various_tests.dart';
1111

1212
// Low-level benchmark of the grapheme cluster step functions.
13+
// Use ../benchmark/benchmark.dart for the more high-level `Characters`
14+
// methods.
1315

1416
void main(List<String> args) {
1517
var count = 5;
1618
if (args.isNotEmpty) {
1719
count = int.parse(args[0]);
1820
}
19-
var gcsf = 0;
20-
var gcsb = 0;
21-
2221
var text = genesis +
2322
hangul +
2423
genesis +
@@ -28,66 +27,94 @@ void main(List<String> args) {
2827
recJoin(zalgo);
2928
var codeUnits = text.length;
3029
var codePoints = text.runes.length;
30+
// Warmup.
31+
var gcSumForward = benchForward(text, -1, codePoints, codeUnits, 150);
32+
var gcSumBackwards = benchBackward(text, -1, codePoints, codeUnits, 150);
33+
if (gcSumForward != gcSumBackwards) {
34+
print(
35+
'ERROR: Did not count the same number of grapheme clusters: '
36+
'$gcSumForward forward vs. $gcSumBackwards backward.',
37+
);
38+
return;
39+
}
40+
3141
for (var i = 0; i < count; i++) {
32-
gcsf = benchForward(text, i, codePoints, codeUnits);
33-
gcsb = benchBackward(text, i, codePoints, codeUnits);
42+
gcSumForward = benchForward(text, i, codePoints, codeUnits, 1500);
43+
gcSumBackwards = benchBackward(text, i, codePoints, codeUnits, 1500);
3444
}
3545
print('gc: Grapheme Clusters, cp: Code Points, cu: Code Units.');
36-
if (gcsf != gcsb) {
46+
if (gcSumForward != gcSumBackwards) {
3747
print(
3848
'ERROR: Did not count the same number of grapheme clusters: '
39-
'$gcsf forward vs. $gcsb backward.',
49+
'$gcSumForward forward vs. $gcSumBackwards backward.',
4050
);
4151
} else {
42-
print('Total: $gcsf gc, $codePoints cp, $codeUnits cu');
43-
print('Avg ${(codePoints / gcsf).toStringAsFixed(3)} cp/gc');
44-
print('Avg ${(codeUnits / gcsf).toStringAsFixed(3)} cu/gc');
52+
var surrogates = codeUnits - codePoints;
53+
print(
54+
'Total: $gcSumForward gc, $codePoints cp, $codeUnits cu, '
55+
'$surrogates surrogates '
56+
'(${(surrogates / codePoints * 100).toStringAsFixed(3)}%)',
57+
);
58+
print('Avg ${(codePoints / gcSumForward).toStringAsFixed(3)} cp/gc');
59+
print('Avg ${(codeUnits / gcSumForward).toStringAsFixed(3)} cu/gc');
4560
}
4661
}
4762

4863
String recJoin(Iterable<List<String>> texts) =>
4964
texts.map((x) => x.join('')).join('\n');
5065

51-
int benchForward(String text, int i, int cp, int cu) {
66+
int benchForward(String text, int round, int cp, int cu, int limit) {
5267
var n = 0;
68+
var step = 10;
5369
var gc = 0;
5470
var e = 0;
5571
var sw = Stopwatch()..start();
5672
do {
57-
var breaks = Breaks(text, 0, text.length, stateSoTNoBreak);
58-
while (breaks.nextBreak() >= 0) {
59-
gc++;
73+
for (var i = 0; i < step; i++) {
74+
var breaks = Breaks(text, 0, text.length, stateSoTNoBreak);
75+
while (breaks.nextBreak() >= 0) {
76+
gc++;
77+
}
6078
}
6179
e = sw.elapsedMilliseconds;
62-
n++;
63-
} while (e < 2000);
64-
print(
65-
'Forward #$i: ${(gc / e).round()} gc/ms, '
66-
'${(n * cp / e).round()} cp/ms, '
67-
'${(n * cu / e).round()} cu/ms, '
68-
'$n rounds',
69-
);
80+
n += step;
81+
step += step;
82+
} while (e < limit);
83+
if (limit > 500) {
84+
print(
85+
'Forward #$round: ${(gc / e).round()} gc/ms, '
86+
'${(n * cp / e).round()} cp/ms, '
87+
'${(n * cu / e).round()} cu/ms, '
88+
'$n rounds in $e ms',
89+
);
90+
}
7091
return gc ~/ n;
7192
}
7293

73-
int benchBackward(String text, int i, int cp, int cu) {
94+
int benchBackward(String text, int round, int cp, int cu, int limit) {
7495
var n = 0;
96+
var step = 10;
7597
var gc = 0;
7698
var e = 0;
7799
var sw = Stopwatch()..start();
78100
do {
79-
var breaks = BackBreaks(text, text.length, 0, stateEoTNoBreak);
80-
while (breaks.nextBreak() >= 0) {
81-
gc++;
101+
for (var i = 0; i < step; i++) {
102+
var breaks = BackBreaks(text, text.length, 0, stateEoTNoBreak);
103+
while (breaks.nextBreak() >= 0) {
104+
gc++;
105+
}
82106
}
83107
e = sw.elapsedMilliseconds;
84-
n++;
85-
} while (e < 2000);
86-
print(
87-
'Backward #$i: ${(gc / e).round()} gc/ms, '
88-
'${(n * cp / e).round()} cp/ms, '
89-
'${(n * cu / e).round()} cu/ms, '
90-
'$n rounds',
91-
);
108+
n += step;
109+
step += step;
110+
} while (e < limit);
111+
if (limit > 500) {
112+
print(
113+
'Backward #$round: ${(gc / e).round()} gc/ms, '
114+
'${(n * cp / e).round()} cp/ms, '
115+
'${(n * cu / e).round()} cu/ms, '
116+
'$n rounds in $e ms',
117+
);
118+
}
92119
return gc ~/ n;
93120
}

0 commit comments

Comments
 (0)