Skip to content

Commit a0de358

Browse files
authored
merge-locals pass (#1334)
This optimizes the situation described in #1331. Namely, when x is copied into y, then on subsequent gets of x we could use y instead, and vice versa, as their value is equal. Specifically, this seems to get rid of the definite overlap in the live ranges of x and y, as removing it allows coalesce-locals to merge them. The pass therefore does nothing if the live range of y ends there anyhow. The danger here is that we may extend the live range so that it causes more conflicts with other things, so this is a heuristic, but I've tested it on every codebase I can find and it always produces a net win, even on one I saw a 0.4% reduction of code size, which surprised me. This is a fairly slow pass, because it uses LocalGraph which isn't much optimized. This PR includes a minor optimization for it, but we should rewrite it. Meanwhile this is just enabled in -O3 and -Oz. This PR also includes some fuzzing improvements, to better test stuff like this.
1 parent dc2c051 commit a0de358

11 files changed

+2328
-172
lines changed

src/ir/LocalGraph.cpp

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -127,11 +127,8 @@ void LocalGraph::visitLoop(Loop* curr) {
127127
// the get trivially has fewer sets, so it overrode the loop entry sets
128128
return;
129129
}
130-
std::vector<SetLocal*> intersection;
131-
std::set_intersection(beforeSets.begin(), beforeSets.end(),
132-
getSets.begin(), getSets.end(),
133-
std::back_inserter(intersection));
134-
if (intersection.size() < beforeSets.size()) {
130+
if (!std::includes(getSets.begin(), getSets.end(),
131+
beforeSets.begin(), beforeSets.end())) {
135132
// the get has not the same sets as in the loop entry
136133
return;
137134
}

src/ir/local-graph.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,7 @@ namespace wasm {
2828
// on this).
2929
//
3030
// TODO: the algorithm here is pretty simple, but also pretty slow,
31-
// we should optimize it. e.g. we rely on set_interaction
32-
// here, and worse we only use it to compute the size...
31+
// we should optimize it.
3332
struct LocalGraph : public PostWalker<LocalGraph> {
3433
// main API
3534

src/passes/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ SET(passes_SOURCES
1717
InstrumentMemory.cpp
1818
MemoryPacking.cpp
1919
MergeBlocks.cpp
20+
MergeLocals.cpp
2021
Metrics.cpp
2122
NameList.cpp
2223
OptimizeInstructions.cpp

src/passes/MergeLocals.cpp

Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
/*
2+
* Copyright 2016 WebAssembly Community Group participants
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
//
18+
// Merges locals when it is beneficial to do so.
19+
//
20+
// An obvious case is when locals are copied. In that case, two locals have the
21+
// same value in a range, and we can pick which of the two to use. For
22+
// example, in
23+
//
24+
// (if (result i32)
25+
// (tee_local $x
26+
// (get_local $y)
27+
// )
28+
// (i32.const 100)
29+
// (get_local $x)
30+
// )
31+
//
32+
// If that assignment of $y is never used again, everything is fine. But if
33+
// if is, then the live range of $y does not end in that get, and will
34+
// necessarily overlap with that of $x - making them appear to interfere
35+
// with each other in coalesce-locals, even though the value is identical.
36+
//
37+
// To fix that, we replace uses of $y with uses of $x. This extends $x's
38+
// live range and shrinks $y's live range. This tradeoff is not always good,
39+
// but $x and $y definitely overlap already, so trying to shrink the overlap
40+
// makes sense - if we remove the overlap entirely, we may be able to let
41+
// $x and $y be coalesced later.
42+
//
43+
// If we can remove only some of $y's uses, then we are definitely not
44+
// removing the overlap, and they do conflict. In that case, it's not clear
45+
// if this is beneficial or not, and we don't do it for now
46+
// TODO: investigate more
47+
//
48+
49+
#include <wasm.h>
50+
#include <pass.h>
51+
#include <wasm-builder.h>
52+
#include <ir/local-graph.h>
53+
54+
namespace wasm {
55+
56+
struct MergeLocals : public WalkerPass<PostWalker<MergeLocals, UnifiedExpressionVisitor<MergeLocals>>> {
57+
bool isFunctionParallel() override { return true; }
58+
59+
Pass* create() override { return new MergeLocals(); }
60+
61+
void doWalkFunction(Function* func) {
62+
// first, instrument the graph by modifying each copy
63+
// (set_local $x
64+
// (get_local $y)
65+
// )
66+
// to
67+
// (set_local $x
68+
// (tee_local $y
69+
// (get_local $y)
70+
// )
71+
// )
72+
// That is, we add a trivial assign of $y. This ensures we
73+
// have a new assignment of $y at the location of the copy,
74+
// which makes it easy for us to see if the value if $y
75+
// is still used after that point
76+
super::doWalkFunction(func);
77+
78+
// optimize the copies, merging when we can, and removing
79+
// the trivial assigns we added temporarily
80+
optimizeCopies();
81+
}
82+
83+
std::vector<SetLocal*> copies;
84+
85+
void visitSetLocal(SetLocal* curr) {
86+
if (auto* get = curr->value->dynCast<GetLocal>()) {
87+
if (get->index != curr->index) {
88+
Builder builder(*getModule());
89+
auto* trivial = builder.makeTeeLocal(get->index, get);
90+
curr->value = trivial;
91+
copies.push_back(curr);
92+
}
93+
}
94+
}
95+
96+
void optimizeCopies() {
97+
if (copies.empty()) return;
98+
// compute all dependencies
99+
LocalGraph preGraph(getFunction(), getModule());
100+
preGraph.computeInfluences();
101+
// optimize each copy
102+
std::unordered_map<SetLocal*, SetLocal*> optimizedToCopy, optimizedToTrivial;
103+
for (auto* copy : copies) {
104+
auto* trivial = copy->value->cast<SetLocal>();
105+
bool canOptimizeToCopy = false;
106+
auto& trivialInfluences = preGraph.setInfluences[trivial];
107+
if (!trivialInfluences.empty()) {
108+
canOptimizeToCopy = true;
109+
for (auto* influencedGet : trivialInfluences) {
110+
// this get uses the trivial write, so it uses the value in the copy.
111+
// however, it may depend on other writes too, if there is a merge/phi,
112+
// and in that case we can't do anything
113+
assert(influencedGet->index == trivial->index);
114+
if (preGraph.getSetses[influencedGet].size() == 1) {
115+
// this is ok
116+
assert(*preGraph.getSetses[influencedGet].begin() == trivial);
117+
} else {
118+
canOptimizeToCopy = false;
119+
break;
120+
}
121+
}
122+
}
123+
if (canOptimizeToCopy) {
124+
// worth it for this copy, do it
125+
for (auto* influencedGet : trivialInfluences) {
126+
influencedGet->index = copy->index;
127+
}
128+
optimizedToCopy[copy] = trivial;
129+
} else {
130+
// alternatively, we can try to remove the conflict in the opposite way: given
131+
// (set_local $x
132+
// (get_local $y)
133+
// )
134+
// we can look for uses of $x that could instead be uses of $y. this extends
135+
// $y's live range, but if it removes the conflict between $x and $y, it may be
136+
// worth it
137+
if (!trivialInfluences.empty()) { // if the trivial set we added has influences, it means $y lives on
138+
auto& copyInfluences = preGraph.setInfluences[copy];
139+
if (!copyInfluences.empty()) {
140+
bool canOptimizeToTrivial = true;
141+
for (auto* influencedGet : copyInfluences) {
142+
// as above, avoid merges/phis
143+
assert(influencedGet->index == copy->index);
144+
if (preGraph.getSetses[influencedGet].size() == 1) {
145+
// this is ok
146+
assert(*preGraph.getSetses[influencedGet].begin() == copy);
147+
} else {
148+
canOptimizeToTrivial = false;
149+
break;
150+
}
151+
}
152+
if (canOptimizeToTrivial) {
153+
// worth it for this copy, do it
154+
for (auto* influencedGet : copyInfluences) {
155+
influencedGet->index = trivial->index;
156+
}
157+
optimizedToTrivial[copy] = trivial;
158+
// note that we don't
159+
}
160+
}
161+
}
162+
}
163+
}
164+
if (!optimizedToCopy.empty() || !optimizedToTrivial.empty()) {
165+
// finally, we need to verify that the changes work properly, that is,
166+
// they use the value from the right place (and are not affected by
167+
// another set of the index we changed to).
168+
// if one does not work, we need to undo all its siblings (don't extend
169+
// the live range unless we are definitely removing a conflict, same
170+
// logic as before).
171+
LocalGraph postGraph(getFunction(), getModule());
172+
postGraph.computeInfluences();
173+
for (auto& pair : optimizedToCopy) {
174+
auto* copy = pair.first;
175+
auto* trivial = pair.second;
176+
auto& trivialInfluences = preGraph.setInfluences[trivial];
177+
for (auto* influencedGet : trivialInfluences) {
178+
// verify the set
179+
auto& sets = postGraph.getSetses[influencedGet];
180+
if (sets.size() != 1 || *sets.begin() != copy) {
181+
// not good, undo all the changes for this copy
182+
for (auto* undo : trivialInfluences) {
183+
undo->index = trivial->index;
184+
}
185+
break;
186+
}
187+
}
188+
}
189+
for (auto& pair : optimizedToTrivial) {
190+
auto* copy = pair.first;
191+
auto* trivial = pair.second;
192+
auto& copyInfluences = preGraph.setInfluences[copy];
193+
for (auto* influencedGet : copyInfluences) {
194+
// verify the set
195+
auto& sets = postGraph.getSetses[influencedGet];
196+
if (sets.size() != 1 || *sets.begin() != trivial) {
197+
// not good, undo all the changes for this copy
198+
for (auto* undo : copyInfluences) {
199+
undo->index = copy->index;
200+
}
201+
break;
202+
}
203+
}
204+
// if this change was ok, we can probably remove the copy itself,
205+
// but we leave that for other passes
206+
}
207+
}
208+
// remove the trivial sets
209+
for (auto* copy : copies) {
210+
copy->value = copy->value->cast<SetLocal>()->value;
211+
}
212+
}
213+
};
214+
215+
Pass *createMergeLocalsPass() {
216+
return new MergeLocals();
217+
}
218+
219+
} // namespace wasm
220+

src/passes/pass.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ void PassRegistry::registerPasses() {
8383
registerPass("instrument-memory", "instrument the build with code to intercept all loads and stores", createInstrumentMemoryPass);
8484
registerPass("memory-packing", "packs memory into separate segments, skipping zeros", createMemoryPackingPass);
8585
registerPass("merge-blocks", "merges blocks to their parents", createMergeBlocksPass);
86+
registerPass("merge-locals", "merges locals when beneficial", createMergeLocalsPass);
8687
registerPass("metrics", "reports metrics", createMetricsPass);
8788
registerPass("nm", "name list", createNameListPass);
8889
registerPass("optimize-instructions", "optimizes instruction combinations", createOptimizeInstructionsPass);
@@ -140,6 +141,10 @@ void PassRunner::addDefaultFunctionOptimizationPasses() {
140141
add("vacuum"); // previous pass creates garbage
141142
add("reorder-locals");
142143
add("remove-unused-brs"); // simplify-locals opens opportunities for optimizations
144+
// if we are willing to work hard, also optimize copies before coalescing
145+
if (options.optimizeLevel >= 3 || options.shrinkLevel >= 2) {
146+
add("merge-locals"); // very slow on e.g. sqlite
147+
}
143148
add("coalesce-locals");
144149
add("simplify-locals");
145150
add("vacuum"); // previous pass creates garbage

src/passes/passes.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ Pass* createInstrumentLocalsPass();
4242
Pass* createInstrumentMemoryPass();
4343
Pass* createMemoryPackingPass();
4444
Pass* createMergeBlocksPass();
45+
Pass* createMergeLocalsPass();
4546
Pass* createMinifiedPrinterPass();
4647
Pass* createMetricsPass();
4748
Pass* createNameListPass();

src/tools/fuzzing.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -480,6 +480,14 @@ class TranslateToFuzzReader {
480480
}
481481

482482
Expression* _makeConcrete(WasmType type) {
483+
auto choice = upTo(100);
484+
if (choice < 10) return makeConst(type);
485+
if (choice < 30) return makeSetLocal(type);
486+
if (choice < 50) return makeGetLocal(type);
487+
if (choice < 60) return makeBlock(type);
488+
if (choice < 70) return makeIf(type);
489+
if (choice < 80) return makeLoop(type);
490+
if (choice < 90) return makeBreak(type);
483491
switch (upTo(15)) {
484492
case 0: return makeBlock(type);
485493
case 1: return makeIf(type);
@@ -501,6 +509,12 @@ class TranslateToFuzzReader {
501509
}
502510

503511
Expression* _makenone() {
512+
auto choice = upTo(100);
513+
if (choice < 50) return makeSetLocal(none);
514+
if (choice < 60) return makeBlock(none);
515+
if (choice < 70) return makeIf(none);
516+
if (choice < 80) return makeLoop(none);
517+
if (choice < 90) return makeBreak(none);
504518
switch (upTo(11)) {
505519
case 0: return makeBlock(none);
506520
case 1: return makeIf(none);

src/tools/wasm-opt.cpp

Lines changed: 25 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -182,36 +182,38 @@ int main(int argc, const char* argv[]) {
182182
std::cout << "[extra-fuzz-command first output:]\n" << firstOutput << '\n';
183183
}
184184

185+
Module* curr = &wasm;
186+
Module other;
187+
188+
if (fuzzExec && fuzzBinary) {
189+
BufferWithRandomAccess buffer(false);
190+
// write the binary
191+
WasmBinaryWriter writer(&wasm, buffer, false);
192+
writer.write();
193+
// read the binary
194+
auto input = buffer.getAsChars();
195+
WasmBinaryBuilder parser(other, input, false);
196+
parser.read();
197+
bool valid = WasmValidator().validate(other, features);
198+
if (!valid) {
199+
WasmPrinter::printModule(&other);
200+
}
201+
assert(valid);
202+
curr = &other;
203+
}
204+
185205
if (options.runningPasses()) {
186206
if (options.debug) std::cerr << "running passes...\n";
187-
options.runPasses(wasm);
188-
bool valid = WasmValidator().validate(wasm, features);
207+
options.runPasses(*curr);
208+
bool valid = WasmValidator().validate(*curr, features);
189209
if (!valid) {
190-
WasmPrinter::printModule(&wasm);
210+
WasmPrinter::printModule(&*curr);
191211
}
192212
assert(valid);
193213
}
194214

195215
if (fuzzExec) {
196-
auto* compare = &wasm;
197-
Module second;
198-
if (fuzzBinary) {
199-
compare = &second;
200-
BufferWithRandomAccess buffer(false);
201-
// write the binary
202-
WasmBinaryWriter writer(&wasm, buffer, false);
203-
writer.write();
204-
// read the binary
205-
auto input = buffer.getAsChars();
206-
WasmBinaryBuilder parser(second, input, false);
207-
parser.read();
208-
bool valid = WasmValidator().validate(second, features);
209-
if (!valid) {
210-
WasmPrinter::printModule(&second);
211-
}
212-
assert(valid);
213-
}
214-
results.check(*compare);
216+
results.check(*curr);
215217
}
216218

217219
if (options.extra.count("output") > 0) {
@@ -220,7 +222,7 @@ int main(int argc, const char* argv[]) {
220222
writer.setDebug(options.debug);
221223
writer.setBinary(emitBinary);
222224
writer.setDebugInfo(debugInfo);
223-
writer.write(wasm, options.extra["output"]);
225+
writer.write(*curr, options.extra["output"]);
224226

225227
if (extraFuzzCommand.size() > 0) {
226228
auto secondOutput = runCommand(extraFuzzCommand);

0 commit comments

Comments
 (0)