// looked at.
//

- #include "ir/literal-utils.h"
- #include "ir/local-graph.h"
- #include "ir/manipulation.h"
- #include "ir/properties.h"
- #include "ir/utils.h"
- #include "pass.h"
- #include "support/unique_deferring_queue.h"
- #include "wasm-builder.h"
- #include "wasm-interpreter.h"
- #include "wasm.h"
+ #include "ir/effects.h"
+ #include "ir/iteration.h"
+ #include "ir/literal-utils.h"
+ #include "ir/local-graph.h"
+ #include "ir/manipulation.h"
+ #include "ir/properties.h"
+ #include "ir/utils.h"
+ #include "pass.h"
+ #include "support/insert_ordered.h"
+ #include "support/unique_deferring_queue.h"
+ #include "wasm-builder.h"
+ #include "wasm-interpreter.h"
+ #include "wasm.h"

namespace wasm {

@@ -210,9 +213,16 @@ struct Precompute
  GetValues getValues;
  HeapValues heapValues;

+   bool canPartiallyPrecompute;
+
  void doWalkFunction(Function* func) {
+     // Perform partial precomputing only when the optimization level is non-
+     // trivial, as it is slower and less likely to help.
+     canPartiallyPrecompute = getPassOptions().optimizeLevel >= 2;
+
    // Walk the function and precompute things.
    super::doWalkFunction(func);
+     partiallyPrecompute(func);
    if (!propagate) {
      return;
    }
@@ -226,11 +236,13 @@ struct Precompute
      // another walk to apply them and perhaps other optimizations that are
      // unlocked.
      super::doWalkFunction(func);
+       // We could also try to partially precompute again, but that is a
+       // somewhat heavy operation, so we only do it the first time, and leave
+       // such things for later runs of this pass and for --converge.
    }
    // Note that in principle even more cycles could find further work here, in
    // very rare cases. To avoid constructing a LocalGraph again just for that
-     // unlikely chance, we leave such things for later runs of this pass and for
-     // --converge.
+     // unlikely chance, we leave such things for later.
  }

  template<typename T> void reuseConstantNode(T* curr, Flow flow) {
@@ -281,6 +293,9 @@ struct Precompute
    }
    if (flow.breaking()) {
      if (flow.breakTo == NONCONSTANT_FLOW) {
+         // This cannot be turned into a constant, but perhaps we can partially
+         // precompute it.
+         considerPartiallyPrecomputing(curr);
        return;
      }
      if (flow.breakTo == RETURN_FLOW) {
@@ -319,6 +334,273 @@ struct Precompute
    }
  }

+   // If we failed to precompute a constant, perhaps we can still precompute
+   // part of an expression. Specifically, consider this case:
+   //
+   //  (A
+   //    (select
+   //      (B)
+   //      (C)
+   //      (condition)
+   //    )
+   //  )
+   //
+   // Perhaps we can compute A(B) and A(C). If so, we can emit a better
+   // select:
+   //
+   //  (select
+   //    (constant result of A(B))
+   //    (constant result of A(C))
+   //    (condition)
+   //  )
+   //
+   // Note that in general for code size we want to move operations *out* of
+   // selects and ifs (OptimizeInstructions does that), but here we are
+   // computing two constants which replace three expressions, so it is
+   // worthwhile.
+   //
+   // To do such partial precomputing, in the main pass we note selects that
+   // look promising. If we find any then we do a second pass later just for
+   // that (as doing so requires walking up the stack in a manner that we want
+   // to avoid in the main pass for overhead reasons; see below).
+   //
+   // Note that selects are all we really need here: other passes would turn
+   // an if into a select if the arms are simple enough, and only in those
+   // cases (simple arms) do we have a chance at partially precomputing. For
+   // example, if an arm is a constant then we can, but if it is a call then
+   // we can't. However, there are cases, such as an if whose arms have side
+   // effects but end in precomputable things, that we currently miss. TODO
+   std::unordered_set<Select*> partiallyPrecomputable;
+
+   void considerPartiallyPrecomputing(Expression* curr) {
+     if (!canPartiallyPrecompute) {
+       return;
+     }
+
+     if (auto* select = curr->dynCast<Select>()) {
+       // We only have a reasonable hope of success if the select arms are
+       // things like constants or global gets. At a first approximation, allow
+       // the set of things we allow in constant initializers (but we can
+       // probably allow more here TODO).
+       //
+       // We also ignore selects with no parent (that are the entire function
+       // body) as then there is nothing to optimize into their arms.
+       auto& wasm = *getModule();
+       if (Properties::isValidConstantExpression(wasm, select->ifTrue) &&
+           Properties::isValidConstantExpression(wasm, select->ifFalse) &&
+           getFunction()->body != select) {
+         partiallyPrecomputable.insert(select);
+       }
+     }
+   }
+
+   // To partially precompute selects we walk up the stack from them, like
+   // this:
+   //
+   //  (A
+   //    (B
+   //      (select
+   //        (C)
+   //        (D)
+   //        (condition)
+   //      )
+   //    )
+   //  )
+   //
+   // First we try to apply B to C and D. If that works, we arrive at this:
+   //
+   //  (A
+   //    (select
+   //      (constant result of B(C))
+   //      (constant result of B(D))
+   //      (condition)
+   //    )
+   //  )
+   //
+   // We can then proceed to perhaps apply A. However, even if we fail to
+   // apply B, we can try to apply A and B together, because that combination
+   // may succeed where incremental work fails, for example:
+   //
+   //  (global $C
+   //    (struct.new ;; outer
+   //      (struct.new ;; inner
+   //        (i32.const 10)
+   //      )
+   //    )
+   //  )
+   //
+   //  (struct.get ;; outer
+   //    (struct.get ;; inner
+   //      (select
+   //        (global.get $C)
+   //        (global.get $D)
+   //        (condition)
+   //      )
+   //    )
+   //  )
+   //
+   // Applying the inner struct.get to $C leads us to the inner struct.new, but
+   // that is an interior pointer in the global - it is not something we can
+   // refer to using a global.get, so precomputing it fails. However, when we
+   // apply both struct.gets at once we arrive at the outer struct.new, which
+   // is in fact the global $C, and we succeed.
+   void partiallyPrecompute(Function* func) {
+     if (!canPartiallyPrecompute || partiallyPrecomputable.empty()) {
+       // Nothing to do.
+       return;
+     }
+
+     // Walk the function to find the parent stacks of the promising selects.
+     // We copy the stacks and process them later. We do it like this because
+     // if we wanted to process stacks as we reached them then we'd trip over
+     // ourselves: when we optimize we replace a parent, but that parent is an
+     // expression we'll reach later in the walk, so modifying it is unsafe.
+     struct StackFinder : public ExpressionStackWalker<StackFinder> {
+       Precompute& parent;
+
+       StackFinder(Precompute& parent) : parent(parent) {}
+
+       // We will later iterate on this in the order of insertion, which keeps
+       // things deterministic, and also usually lets us do consecutive work
+       // like a select nested in another select's condition, simply because
+       // we will traverse the selects in postorder (however, because we
+       // cannot always succeed in an incremental manner - see the comment on
+       // this function - it is possible in theory that some work can happen
+       // only in a later execution of the pass).
+       InsertOrderedMap<Select*, ExpressionStack> stackMap;
+
+       void visitSelect(Select* curr) {
+         if (parent.partiallyPrecomputable.count(curr)) {
+           stackMap[curr] = expressionStack;
+         }
+       }
+     } stackFinder(*this);
+     stackFinder.walkFunction(func);
+
+     // Note which expressions we've modified as we go, as it is invalid to
+     // modify more than once. This could happen in theory in a situation like
+     // this:
+     //
+     //  (ternary.f32.max ;; fictional instruction for explanatory purposes
+     //    (select ..)
+     //    (select ..)
+     //    (f32.infinity)
+     //  )
+     //
+     // When we consider the first select we can see that the computation
+     // result is always infinity, so we can optimize here and replace the
+     // ternary. Then the same thing happens with the second select, causing
+     // the ternary to be replaced again, which is unsafe because it no longer
+     // exists after we precomputed it the first time. (Note that in this
+     // example the result is the same either way, but at least in theory an
+     // instruction could exist for which there was a difference.) In practice
+     // it does not seem that wasm has instructions capable of this at the
+     // moment, but this code is still useful to guard against future
+     // problems, and as a minor speedup (quickly skip code if it was already
+     // modified).
+     std::unordered_set<Expression*> modified;
+
+     for (auto& [select, stack] : stackFinder.stackMap) {
+       // Each stack ends in the select itself, and contains more than the
+       // select itself (otherwise we'd have ignored the select), i.e., the
+       // select has a parent that we can try to optimize into the arms.
+       assert(stack.back() == select);
+       assert(stack.size() >= 2);
+       Index selectIndex = stack.size() - 1;
+       assert(selectIndex >= 1);
+
+       if (modified.count(select)) {
+         // This select was modified; go to the next one.
+         continue;
+       }
+
+       // Go up through the parents, until we can't do any more work. At each
+       // parent we'll try to execute it and all intermediate parents into the
+       // select arms.
+       for (Index parentIndex = selectIndex - 1; parentIndex != Index(-1);
+            parentIndex--) {
+         auto* parent = stack[parentIndex];
+         if (modified.count(parent)) {
+           // This parent was modified; exit the loop on parents as no upper
+           // parent is valid to try either.
+           break;
+         }
+
+         // If the parent lacks a concrete type then we can't move it into the
+         // select: the select needs a concrete (and non-tuple) type. For
+         // example, if the parent is a drop or is unreachable, those are
+         // things we don't want to handle, and we stop here (once we see one
+         // such parent we can't expect to make any more progress).
+         if (!parent->type.isConcrete() || parent->type.isTuple()) {
+           break;
+         }
+
+         // We are precomputing the select arms, but leaving the condition
+         // as-is. If the condition breaks to the parent, then we can't move
+         // the parent into the select arms:
+         //
+         //  (block $name ;; this must stay outside of the select
+         //    (select
+         //      (B)
+         //      (C)
+         //      (block ;; condition
+         //        (br_if $name
+         //
+         // Ignore all control flow structures for simplicity, as they aren't
+         // interesting for us, and other passes should have removed them
+         // anyhow.
+         if (Properties::isControlFlowStructure(parent)) {
+           break;
+         }
+
+         // This looks promising, so try to precompute here. What we do is
+         // precompute twice, once with the select replaced with the left arm,
+         // and once with the right. If both succeed then we can create a new
+         // select (with the same condition as before) whose arms are the
+         // precomputed values.
+         auto isValidPrecomputation = [&](const Flow& flow) {
+           // For now we handle simple concrete values. We could also handle
+           // breaks in principle TODO
+           return canEmitConstantFor(flow.values) && !flow.breaking() &&
+                  flow.values.isConcrete();
+         };
+
+         // Find the pointer to the select in its immediate parent so that we
+         // can replace it first with one arm and then the other.
+         auto** pointerToSelect =
+           getChildPointerInImmediateParent(stack, selectIndex, func);
+         *pointerToSelect = select->ifTrue;
+         auto ifTrue = precomputeExpression(parent);
+         if (isValidPrecomputation(ifTrue)) {
+           *pointerToSelect = select->ifFalse;
+           auto ifFalse = precomputeExpression(parent);
+           if (isValidPrecomputation(ifFalse)) {
+             // Wonderful, we can precompute here! The select can now contain
+             // the computed values in its arms.
+             select->ifTrue = ifTrue.getConstExpression(*getModule());
+             select->ifFalse = ifFalse.getConstExpression(*getModule());
+             select->finalize();
+
+             // The parent of the select is now replaced by the select.
+             auto** pointerToParent =
+               getChildPointerInImmediateParent(stack, parentIndex, func);
+             *pointerToParent = select;
+
+             // Update state for further iterations: Mark everything modified
+             // and move the select to the parent's location.
+             for (Index i = parentIndex; i <= selectIndex; i++) {
+               modified.insert(stack[i]);
+             }
+             selectIndex = parentIndex;
+             stack[selectIndex] = select;
+             stack.resize(selectIndex + 1);
+           }
+         }
+
+         // Whether we succeeded in precomputing here or not, restore the
+         // parent's pointer to its original state (if we precomputed, the
+         // parent is no longer in use, but there is no harm in modifying it).
+         *pointerToSelect = select;
+       }
+     }
+   }
+
  void visitFunction(Function* curr) {
    // removing breaks can alter types
    ReFinalize().walkFunctionInModule(curr, getModule());
@@ -531,6 +813,30 @@ struct Precompute

    return true;
  }
+
+   // Helpers for partial precomputing.
+
+   // Given a stack of expressions and the index of an expression in it, find
+   // the pointer to that expression in the parent. This gives us a pointer
+   // that allows us to replace the expression.
+   Expression** getChildPointerInImmediateParent(const ExpressionStack& stack,
+                                                 Index index,
+                                                 Function* func) {
+     if (index == 0) {
+       // There is nothing above this expression, so the pointer referring to
+       // it is the function's body.
+       return &func->body;
+     }
+
+     auto* child = stack[index];
+     for (auto** currChild : ChildIterator(stack[index - 1]).children) {
+       if (*currChild == child) {
+         return currChild;
+       }
+     }
+
+     WASM_UNREACHABLE("child not found in parent");
+   }

};

Pass* createPrecomputePass() { return new Precompute(false); }
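
To make the new rewrite concrete, here is a small hand-written illustration (hypothetical wat, not taken from the patch or its tests): a select whose arms are constants sits under a single i32.add parent, so the pass evaluates the add once per arm and, since both evaluations produce constants, folds the add into the select while leaving the condition untouched.

Before:
  (i32.add
    (select
      (i32.const 10)
      (i32.const 20)
      (local.get $x)
    )
    (i32.const 1)
  )

After partial precomputing:
  (select
    (i32.const 11)
    (i32.const 21)
    (local.get $x)
  )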