Commit 141f7ca

Precompute into select arms (#6212)
E.g.

  (i32.add
    (select
      (i32.const 100)
      (i32.const 200)
      (..condition..)
    )
    (i32.const 50)
  )
  ;; =>
  (select
    (i32.const 150)
    (i32.const 250)
    (..condition..)
  )

We cannot fully precompute the select, but we can "partially precompute" it, by precomputing its arms using the parent. This may require looking several steps up the parent chain, which is an awkward operation in our simple walkers, so to do it we capture stacks of parents and operate directly on them. This is a little slower than a normal walk, so we only do it when we see a promising select, and only in -O2 and above (this makes the pass roughly 7% slower; not a large cost, but best to avoid it in -O1).
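For a runnable picture of that rewrite (an illustration added here, not part of the commit; the function name $example and the parameter standing in for the condition are assumptions):

  ;; Before: the add cannot be fully precomputed, since the select's
  ;; condition is unknown at compile time.
  (func $example (param $cond i32) (result i32)
    (i32.add
      (select
        (i32.const 100)
        (i32.const 200)
        (local.get $cond)
      )
      (i32.const 50)
    )
  )

  ;; After: the add has been folded into both arms; only the select remains.
  (func $example (param $cond i32) (result i32)
    (select
      (i32.const 150)
      (i32.const 250)
      (local.get $cond)
    )
  )

Both arms must be things the interpreter can evaluate against the parent, which is why constants and global.gets are the promising cases.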
1 parent 97a61bd commit 141f7ca

File tree

2 files changed: +1106 -12 lines changed


src/passes/Precompute.cpp

Lines changed: 318 additions & 12 deletions
@@ -27,16 +27,19 @@
 // looked at.
 //

-#include <ir/literal-utils.h>
-#include <ir/local-graph.h>
-#include <ir/manipulation.h>
-#include <ir/properties.h>
-#include <ir/utils.h>
-#include <pass.h>
-#include <support/unique_deferring_queue.h>
-#include <wasm-builder.h>
-#include <wasm-interpreter.h>
-#include <wasm.h>
+#include "ir/effects.h"
+#include "ir/iteration.h"
+#include "ir/literal-utils.h"
+#include "ir/local-graph.h"
+#include "ir/manipulation.h"
+#include "ir/properties.h"
+#include "ir/utils.h"
+#include "pass.h"
+#include "support/insert_ordered.h"
+#include "support/unique_deferring_queue.h"
+#include "wasm-builder.h"
+#include "wasm-interpreter.h"
+#include "wasm.h"

 namespace wasm {

@@ -210,9 +213,16 @@ struct Precompute
   GetValues getValues;
   HeapValues heapValues;

+  bool canPartiallyPrecompute;
+
   void doWalkFunction(Function* func) {
+    // Perform partial precomputing only when the optimization level is non-
+    // trivial, as it is slower and less likely to help.
+    canPartiallyPrecompute = getPassOptions().optimizeLevel >= 2;
+
     // Walk the function and precompute things.
     super::doWalkFunction(func);
+    partiallyPrecompute(func);
     if (!propagate) {
       return;
     }
@@ -226,11 +236,13 @@
       // another walk to apply them and perhaps other optimizations that are
       // unlocked.
       super::doWalkFunction(func);
+      // We could also try to partially precompute again, but that is a somewhat
+      // heavy operation, so we only do it the first time, and leave such things
+      // for later runs of this pass and for --converge.
     }
     // Note that in principle even more cycles could find further work here, in
     // very rare cases. To avoid constructing a LocalGraph again just for that
-    // unlikely chance, we leave such things for later runs of this pass and for
-    // --converge.
+    // unlikely chance, we leave such things for later.
   }

   template<typename T> void reuseConstantNode(T* curr, Flow flow) {
@@ -281,6 +293,9 @@ struct Precompute
     }
     if (flow.breaking()) {
       if (flow.breakTo == NONCONSTANT_FLOW) {
+        // This cannot be turned into a constant, but perhaps we can partially
+        // precompute it.
+        considerPartiallyPrecomputing(curr);
         return;
       }
       if (flow.breakTo == RETURN_FLOW) {
@@ -319,6 +334,273 @@ struct Precompute
     }
   }

+  // If we failed to precompute a constant, perhaps we can still precompute part
+  // of an expression. Specifically, consider this case:
+  //
+  //  (A
+  //    (select
+  //      (B)
+  //      (C)
+  //      (condition)
+  //    )
+  //  )
+  //
+  // Perhaps we can compute A(B) and A(C). If so, we can emit a better select:
+  //
+  //  (select
+  //    (constant result of A(B))
+  //    (constant result of A(C))
+  //    (condition)
+  //  )
+  //
+  // Note that in general for code size we want to move operations *out* of
+  // selects and ifs (OptimizeInstructions does that), but here we are
+  // computing two constants which replace three expressions, so it is
+  // worthwhile.
+  //
+  // To do such partial precomputing, in the main pass we note selects that look
+  // promising. If we find any then we do a second pass later just for that (as
+  // doing so requires walking up the stack in a manner that we want to avoid in
+  // the main pass for overhead reasons; see below).
+  //
+  // Note that selects are all we really need here: Other passes would turn an
+  // if into a select if the arms are simple enough, and only in those cases
+  // (simple arms) do we have a chance at partially precomputing. For example,
+  // if an arm is a constant then we can, but if it is a call then we can't.
+  // However, there are cases like an if with arms with side effects that end in
+  // precomputable things, that are missed atm TODO
+  std::unordered_set<Select*> partiallyPrecomputable;
+
+  void considerPartiallyPrecomputing(Expression* curr) {
+    if (!canPartiallyPrecompute) {
+      return;
+    }
+
+    if (auto* select = curr->dynCast<Select>()) {
+      // We only have a reasonable hope of success if the select arms are things
+      // like constants or global gets. At a first approximation, allow the set
+      // of things we allow in constant initializers (but we can probably allow
+      // more here TODO).
+      //
+      // We also ignore selects with no parent (that are the entire function
+      // body) as then there is nothing to optimize into their arms.
+      auto& wasm = *getModule();
+      if (Properties::isValidConstantExpression(wasm, select->ifTrue) &&
+          Properties::isValidConstantExpression(wasm, select->ifFalse) &&
+          getFunction()->body != select) {
+        partiallyPrecomputable.insert(select);
+      }
+    }
+  }
+
+  // To partially precompute selects we walk up the stack from them, like this:
+  //
+  //  (A
+  //    (B
+  //      (select
+  //        (C)
+  //        (D)
+  //        (condition)
+  //      )
+  //    )
+  //  )
+  //
+  // First we try to apply B to C and D. If that works, we arrive at this:
+  //
+  //  (A
+  //    (select
+  //      (constant result of B(C))
+  //      (constant result of B(D))
+  //      (condition)
+  //    )
+  //  )
+  //
+  // We can then proceed to perhaps apply A. However, even if we failed to apply
+  // B then we can try to apply A and B together, because that combination may
+  // succeed where incremental work fails, for example:
+  //
+  //  (global $C
+  //    (struct.new ;; outer
+  //      (struct.new ;; inner
+  //        (i32.const 10)
+  //      )
+  //    )
+  //  )
+  //
+  //  (struct.get ;; outer
+  //    (struct.get ;; inner
+  //      (select
+  //        (global.get $C)
+  //        (global.get $D)
+  //        (condition)
+  //      )
+  //    )
+  //  )
+  //
+  // Applying the inner struct.get to $C leads us to the inner struct.new, but
+  // that is an interior pointer in the global - it is not something we can
+  // refer to using a global.get, so precomputing it fails. However, when we
+  // apply both struct.gets at once we arrive at the outer struct.new, which is
+  // in fact the global $C, and we succeed.
+  void partiallyPrecompute(Function* func) {
+    if (!canPartiallyPrecompute || partiallyPrecomputable.empty()) {
+      // Nothing to do.
+      return;
+    }
+
+    // Walk the function to find the parent stacks of the promising selects. We
+    // copy the stacks and process them later. We do it like this because if we
+    // wanted to process stacks as we reached them then we'd trip over
+    // ourselves: when we optimize we replace a parent, but that parent is an
+    // expression we'll reach later in the walk, so modifying it is unsafe.
+    struct StackFinder : public ExpressionStackWalker<StackFinder> {
+      Precompute& parent;
+
+      StackFinder(Precompute& parent) : parent(parent) {}
+
+      // We will later iterate on this in the order of insertion, which keeps
+      // things deterministic, and also usually lets us do consecutive work
+      // like a select nested in another select's condition, simply because we
+      // will traverse the selects in postorder (however, because we cannot
+      // always succeed in an incremental manner - see the comment on this
+      // function - it is possible in theory that some work can happen only in a
+      // later execution of the pass).
+      InsertOrderedMap<Select*, ExpressionStack> stackMap;
+
+      void visitSelect(Select* curr) {
+        if (parent.partiallyPrecomputable.count(curr)) {
+          stackMap[curr] = expressionStack;
+        }
+      }
+    } stackFinder(*this);
+    stackFinder.walkFunction(func);
+
+    // Note which expressions we've modified as we go, as it is invalid to
+    // modify more than once. This could happen in theory in a situation like
+    // this:
+    //
+    //  (ternary.f32.max ;; fictional instruction for explanatory purposes
+    //    (select ..)
+    //    (select ..)
+    //    (f32.infinity)
+    //  )
+    //
+    // When we consider the first select we can see that the computation result
+    // is always infinity, so we can optimize here and replace the ternary. Then
+    // the same thing happens with the second select, causing the ternary to be
+    // replaced again, which is unsafe because it no longer exists after we
+    // precomputed it the first time. (Note that in this example the result is
+    // the same either way, but at least in theory an instruction could exist
+    // for which there was a difference.) In practice it does not seem that wasm
+    // has instructions capable of this atm, but this code is still useful to
+    // guard against future problems, and as a minor speedup (quickly skip code
+    // if it was already modified).
+    std::unordered_set<Expression*> modified;
+
+    for (auto& [select, stack] : stackFinder.stackMap) {
+      // Each stack ends in the select itself, and contains more than the select
+      // itself (otherwise we'd have ignored the select), i.e., the select has a
+      // parent that we can try to optimize into the arms.
+      assert(stack.back() == select);
+      assert(stack.size() >= 2);
+      Index selectIndex = stack.size() - 1;
+      assert(selectIndex >= 1);
+
+      if (modified.count(select)) {
+        // This select was modified; go to the next one.
+        continue;
+      }
+
+      // Go up through the parents, until we can't do any more work. At each
+      // parent we'll try to execute it and all intermediate parents into the
+      // select arms.
+      for (Index parentIndex = selectIndex - 1; parentIndex != Index(-1);
+           parentIndex--) {
+        auto* parent = stack[parentIndex];
+        if (modified.count(parent)) {
+          // This parent was modified; exit the loop on parents as no upper
+          // parent is valid to try either.
+          break;
+        }
+
+        // If the parent lacks a concrete type then we can't move it into the
+        // select: the select needs a concrete (and non-tuple) type. For
+        // example, if the parent is a drop or is unreachable, those are things
+        // we don't want to handle, and we stop here (once we see one such
+        // parent we can't expect to make any more progress).
+        if (!parent->type.isConcrete() || parent->type.isTuple()) {
+          break;
+        }
+
+        // We are precomputing the select arms, but leaving the condition as-is.
+        // If the condition breaks to the parent, then we can't move the parent
+        // into the select arms:
+        //
+        //  (block $name ;; this must stay outside of the select
+        //    (select
+        //      (B)
+        //      (C)
+        //      (block ;; condition
+        //        (br_if $name
+        //
+        // Ignore all control flow structures for simplicity, as they aren't
+        // interesting for us, and other passes should have removed them anyhow.
+        if (Properties::isControlFlowStructure(parent)) {
+          break;
+        }
+
+        // This looks promising, so try to precompute here. What we do is
+        // precompute twice, once with the select replaced with the left arm,
+        // and once with the right. If both succeed then we can create a new
+        // select (with the same condition as before) whose arms are the
+        // precomputed values.
+        auto isValidPrecomputation = [&](const Flow& flow) {
+          // For now we handle simple concrete values. We could also handle
+          // breaks in principle TODO
+          return canEmitConstantFor(flow.values) && !flow.breaking() &&
+                 flow.values.isConcrete();
+        };
+
+        // Find the pointer to the select in its immediate parent so that we can
+        // replace it first with one arm and then the other.
+        auto** pointerToSelect =
+          getChildPointerInImmediateParent(stack, selectIndex, func);
+        *pointerToSelect = select->ifTrue;
+        auto ifTrue = precomputeExpression(parent);
+        if (isValidPrecomputation(ifTrue)) {
+          *pointerToSelect = select->ifFalse;
+          auto ifFalse = precomputeExpression(parent);
+          if (isValidPrecomputation(ifFalse)) {
+            // Wonderful, we can precompute here! The select can now contain the
+            // computed values in its arms.
+            select->ifTrue = ifTrue.getConstExpression(*getModule());
+            select->ifFalse = ifFalse.getConstExpression(*getModule());
+            select->finalize();
+
+            // The parent of the select is now replaced by the select.
+            auto** pointerToParent =
+              getChildPointerInImmediateParent(stack, parentIndex, func);
+            *pointerToParent = select;
+
+            // Update state for further iterations: Mark everything modified and
+            // move the select to the parent's location.
+            for (Index i = parentIndex; i <= selectIndex; i++) {
+              modified.insert(stack[i]);
+            }
+            selectIndex = parentIndex;
+            stack[selectIndex] = select;
+            stack.resize(selectIndex + 1);
+          }
+        }
+
+        // Whether or not we succeeded in precomputing here, restore the
+        // parent's pointer to its original state (if we precomputed, the
+        // parent is no longer in use, but there is no harm in modifying it).
+        *pointerToSelect = select;
+      }
+    }
+  }
+
   void visitFunction(Function* curr) {
     // removing breaks can alter types
     ReFinalize().walkFunctionInModule(curr, getModule());
@@ -531,6 +813,30 @@

     return true;
   }
+
+  // Helpers for partial precomputing.
+
+  // Given a stack of expressions and the index of an expression in it, find
+  // the pointer to that expression in the parent. This gives us a pointer that
+  // allows us to replace the expression.
+  Expression** getChildPointerInImmediateParent(const ExpressionStack& stack,
+                                                Index index,
+                                                Function* func) {
+    if (index == 0) {
+      // There is nothing above this expression, so the pointer referring to it
+      // is the function's body.
+      return &func->body;
+    }
+
+    auto* child = stack[index];
+    for (auto** currChild : ChildIterator(stack[index - 1]).children) {
+      if (*currChild == child) {
+        return currChild;
+      }
+    }
+
+    WASM_UNREACHABLE("child not found in parent");
+  }
 };

 Pass* createPrecomputePass() { return new Precompute(false); }
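
To make the parent-chain walk in partiallyPrecompute concrete, here is a hedged sketch in wasm text format (the function name, constants, and condition are invented for this illustration; they are not taken from the commit or its tests). The select has two parents, the i32.add and then the i32.mul, and both get folded into the arms one step at a time:

  ;; Before:
  (func $chain (param $cond i32) (result i32)
    (i32.mul
      (i32.add
        (select
          (i32.const 1)
          (i32.const 2)
          (local.get $cond)
        )
        (i32.const 10)
      )
      (i32.const 3)
    )
  )

  ;; After the first step the arms hold 11 and 12; after the second, the
  ;; mul is folded in as well:
  (func $chain (param $cond i32) (result i32)
    (select
      (i32.const 33)
      (i32.const 36)
      (local.get $cond)
    )
  )

Each successful step replaces the select's immediate parent with the select itself (via getChildPointerInImmediateParent), so the next iteration can try the next parent up the captured stack. Per the commit's gating, this only fires when optimizeLevel >= 2, e.g. under a typical wasm-opt -O2 run that includes the precompute passes.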
