diff --git a/src/cmd/compile/internal/ssa/block.go b/src/cmd/compile/internal/ssa/block.go index 4a24a181e5de19..b484ed265c433a 100644 --- a/src/cmd/compile/internal/ssa/block.go +++ b/src/cmd/compile/internal/ssa/block.go @@ -257,6 +257,38 @@ func (b *Block) resetWithControl2(kind BlockKind, v, w *Value) { w.Uses++ } +// ReplaceSucc replaces b->oldSucc to b->newSucc, n indicates which predecessor +// index of newSucc refers to b. It is the responsibility of the caller to clear +// the corresponding predecessor of oldSucc. +func (b *Block) ReplaceSucc(oldSucc, newSucc *Block, n int) { + for i := 0; i < len(b.Succs); i++ { + succ := &b.Succs[i] + if succ.b == oldSucc { + succ.b = newSucc + succ.i = n + newSucc.Preds[n] = Edge{b, i} + return + } + } + panic(fmt.Sprintf("Can not found %v->%v", b, oldSucc)) +} + +// ReplacePred replaces oldPred->b to newPred->b, n indicates which successor +// index of newPred refers to b. It is the responsibility of the caller to clear +// the corresponding successor of oldPred. +func (b *Block) ReplacePred(oldPred, newPred *Block, n int) { + for i := 0; i < len(b.Preds); i++ { + pred := &b.Preds[i] + if pred.b == oldPred { + pred.b = newPred + pred.i = n + newPred.Succs[n] = Edge{b, i} + return + } + } + panic(fmt.Sprintf("Can not found %v->%v", oldPred, b)) +} + // truncateValues truncates b.Values at the ith element, zeroing subsequent elements. // The values in b.Values after i must already have had their args reset, // to maintain correct value uses counts. diff --git a/src/cmd/compile/internal/ssa/branchelim.go b/src/cmd/compile/internal/ssa/branchelim.go index f16959dd572973..158e5eca7b226f 100644 --- a/src/cmd/compile/internal/ssa/branchelim.go +++ b/src/cmd/compile/internal/ssa/branchelim.go @@ -424,6 +424,14 @@ func shouldElimIfElse(no, yes, post *Block, arch string) bool { } } +func isAccessMemory(v *Value) bool { + if v.Op == OpPhi || v.Type.IsMemory() || + v.MemoryArg() != nil || opcodeTable[v.Op].hasSideEffects { + return true + } + return false +} + // canSpeculativelyExecute reports whether every value in the block can // be evaluated without causing any observable side effects (memory // accesses, panics and so on) except for execution time changes. It @@ -436,8 +444,8 @@ func canSpeculativelyExecute(b *Block) bool { // don't fuse memory ops, Phi ops, divides (can panic), // or anything else with side-effects for _, v := range b.Values { - if v.Op == OpPhi || isDivMod(v.Op) || isPtrArithmetic(v.Op) || v.Type.IsMemory() || - v.MemoryArg() != nil || opcodeTable[v.Op].hasSideEffects { + if v.Op == OpPhi || isDivMod(v.Op) || isPtrArithmetic(v.Op) || + isAccessMemory(v) { return false } } diff --git a/src/cmd/compile/internal/ssa/check.go b/src/cmd/compile/internal/ssa/check.go index bbfdaceaad90b0..d4cee595fcdb48 100644 --- a/src/cmd/compile/internal/ssa/check.go +++ b/src/cmd/compile/internal/ssa/check.go @@ -460,33 +460,8 @@ func checkFunc(f *Func) { memCheck(f) } -func memCheck(f *Func) { - // Check that if a tuple has a memory type, it is second. - for _, b := range f.Blocks { - for _, v := range b.Values { - if v.Type.IsTuple() && v.Type.FieldType(0).IsMemory() { - f.Fatalf("memory is first in a tuple: %s\n", v.LongString()) - } - } - } - - // Single live memory checks. - // These checks only work if there are no memory copies. - // (Memory copies introduce ambiguity about which mem value is really live. - // probably fixable, but it's easier to avoid the problem.) - // For the same reason, disable this check if some memory ops are unused. 
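A usage note on the new ReplaceSucc/ReplacePred helpers added to block.go above: a minimal sketch (not part of the patch) of redirecting one edge, using a hypothetical in-package helper redirectEdge; removePred is the existing Block helper for dropping a predecessor edge, and the caller must still fix up any phis in oldSucc.

func redirectEdge(b, oldSucc, newSucc *Block, slot int) {
	// Remember where b sits in oldSucc.Preds before the edge is rewritten.
	stale := -1
	for i, e := range oldSucc.Preds {
		if e.b == b {
			stale = i
			break
		}
	}
	// b now branches to newSucc; newSucc.Preds[slot] records the new edge.
	b.ReplaceSucc(oldSucc, newSucc, slot)
	if stale >= 0 {
		// Caller's responsibility per the doc comment: clear the stale edge.
		oldSucc.removePred(stale)
	}
}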
- for _, b := range f.Blocks { - for _, v := range b.Values { - if (v.Op == OpCopy || v.Uses == 0) && v.Type.IsMemory() { - return - } - } - if b != f.Entry && len(b.Preds) == 0 { - return - } - } - - // Compute live memory at the end of each block. +// computeLastMem compute live memory at the end of each block. +func computeLastMem(f *Func) []*Value { lastmem := make([]*Value, f.NumBlocks()) ss := newSparseSet(f.NumValues()) for _, b := range f.Blocks { @@ -552,6 +527,36 @@ func memCheck(f *Func) { break } } + return lastmem +} + +func memCheck(f *Func) { + // Check that if a tuple has a memory type, it is second. + for _, b := range f.Blocks { + for _, v := range b.Values { + if v.Type.IsTuple() && v.Type.FieldType(0).IsMemory() { + f.Fatalf("memory is first in a tuple: %s\n", v.LongString()) + } + } + } + + // Single live memory checks. + // These checks only work if there are no memory copies. + // (Memory copies introduce ambiguity about which mem value is really live. + // probably fixable, but it's easier to avoid the problem.) + // For the same reason, disable this check if some memory ops are unused. + for _, b := range f.Blocks { + for _, v := range b.Values { + if (v.Op == OpCopy || v.Uses == 0) && v.Type.IsMemory() { + return + } + } + if b != f.Entry && len(b.Preds) == 0 { + return + } + } + + lastmem := computeLastMem(f) // Check merge points. for _, b := range f.Blocks { for _, v := range b.Values { diff --git a/src/cmd/compile/internal/ssa/compile.go b/src/cmd/compile/internal/ssa/compile.go index d125891f88c58f..9037c6049308ea 100644 --- a/src/cmd/compile/internal/ssa/compile.go +++ b/src/cmd/compile/internal/ssa/compile.go @@ -453,12 +453,13 @@ commas. For example: return fmt.Sprintf("Did not find a phase matching %s in -d=ssa/... debug option", phase) } +var EnableLoopOpts = buildcfg.Experiment.LoopOpts + // list of passes for the compiler var passes = [...]pass{ - // TODO: combine phielim and copyelim into a single pass? 
+ // Generic Optimizations {name: "number lines", fn: numberLines, required: true}, {name: "early phielim", fn: phielim}, - {name: "early copyelim", fn: copyelim}, {name: "early deadcode", fn: deadcode}, // remove generated dead code to avoid doing pointless work during opt {name: "short circuit", fn: shortcircuit}, {name: "decompose user", fn: decomposeUser, required: true}, @@ -484,9 +485,18 @@ var passes = [...]pass{ {name: "late fuse", fn: fuseLate}, {name: "dse", fn: dse}, {name: "memcombine", fn: memcombine}, - {name: "writebarrier", fn: writebarrier, required: true}, // expand write barrier ops + // Loop Optimizations + {name: "loop deadcode", fn: deadcode, disabled: !EnableLoopOpts}, // remove dead blocks before loop opts to avoid extra work + {name: "loop invariant code motion", fn: licm, disabled: !EnableLoopOpts}, // hoist loop invariant code out of loops + {name: "lcssa destruct", fn: phielim, disabled: !EnableLoopOpts}, // eliminate LCSSA proxy phi to restore general SSA form + {name: "loop sccp", fn: sccp, disabled: !EnableLoopOpts}, // optimize loop guard conditional test + {name: "loop opt", fn: opt, disabled: !EnableLoopOpts}, // further optimize loop guard conditional test + {name: "loop deadcode late", fn: deadcode, disabled: !EnableLoopOpts}, // remove dead loop guard to simplify cfg + {name: "loop nilcheckelim", fn: nilcheckelim, disabled: !EnableLoopOpts}, // remove duplicated nil check in loop guard + {name: "writebarrier", fn: writebarrier, required: true}, // expand write barrier ops {name: "insert resched checks", fn: insertLoopReschedChecks, disabled: !buildcfg.Experiment.PreemptibleLoops}, // insert resched checks in loops. + // Code Generation {name: "lower", fn: lower, required: true}, {name: "addressing modes", fn: addressingModes, required: false}, {name: "late lower", fn: lateLower, required: true}, @@ -497,7 +507,6 @@ var passes = [...]pass{ {name: "lowered deadcode", fn: deadcode, required: true}, {name: "checkLower", fn: checkLower, required: true}, {name: "late phielim", fn: phielim}, - {name: "late copyelim", fn: copyelim}, {name: "tighten", fn: tighten, required: true}, // move values closer to their uses {name: "late deadcode", fn: deadcode}, {name: "critical", fn: critical, required: true}, // remove critical edges @@ -508,7 +517,7 @@ var passes = [...]pass{ {name: "late nilcheck", fn: nilcheckelim2}, {name: "flagalloc", fn: flagalloc, required: true}, // allocate flags register {name: "regalloc", fn: regalloc, required: true}, // allocate int & float registers + stack slots - {name: "loop rotate", fn: loopRotate}, + {name: "layout loop", fn: layoutLoop}, {name: "trim", fn: trim}, // remove empty blocks } @@ -577,8 +586,8 @@ var passOrder = [...]constraint{ {"schedule", "flagalloc"}, // regalloc needs flags to be allocated first. {"flagalloc", "regalloc"}, - // loopRotate will confuse regalloc. - {"regalloc", "loop rotate"}, + // layout loop will confuse regalloc. + {"regalloc", "layout loop"}, // trim needs regalloc to be done first. {"regalloc", "trim"}, // memcombine works better if fuse happens first, to help merge stores. 
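To make the intent of the new loop passes concrete, a hypothetical input (illustrative only, not from the patch): the load of c.limit in the loop condition is loop invariant, so once the loop has been rotated and guarded, LICM can hoist it in front of the loop, and the "loop nilcheckelim" pass can remove the nil check that gets duplicated into the loop guard. These passes only run when the LoopOpts experiment (EnableLoopOpts above) is enabled.

type counter struct{ limit int }

// sum re-reads c.limit in its loop condition on every iteration; after the
// new loop passes the load (and the nil check on c) happens once, before the
// loop.
func sum(c *counter) int {
	total := 0
	for i := 0; i < c.limit; i++ {
		total += i
	}
	return total
}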
diff --git a/src/cmd/compile/internal/ssa/func.go b/src/cmd/compile/internal/ssa/func.go index 529c119dc3da5a..ae1e57fe8f37ab 100644 --- a/src/cmd/compile/internal/ssa/func.go +++ b/src/cmd/compile/internal/ssa/func.go @@ -86,6 +86,13 @@ type LocalSlotSplitKey struct { Type *types.Type // type of slot } +// assert is used for development sanity check +func assert(cond bool, fx string, msg ...interface{}) { + if !cond { + panic(fmt.Sprintf(fx, msg...)) + } +} + // NewFunc returns a new, empty function object. // Caller must reset cache before calling NewFunc. func (c *Config) NewFunc(fe Frontend, cache *Cache) *Func { @@ -298,7 +305,7 @@ func (f *Func) newValue(op Op, t *types.Type, b *Block, pos src.XPos) *Value { // newValueNoBlock allocates a new Value with the given fields. // The returned value is not placed in any block. Once the caller // decides on a block b, it must set b.Block and append -// the returned value to b.Values. +// the returned value to b.Values or simply use placeValue. func (f *Func) newValueNoBlock(op Op, t *types.Type, pos src.XPos) *Value { var v *Value if f.freeValues != nil { @@ -324,6 +331,12 @@ func (f *Func) newValueNoBlock(op Op, t *types.Type, pos src.XPos) *Value { return v } +// placeValue places new Value that not placed yet into given block. +func (block *Block) placeValue(v *Value) { + v.Block = block + block.Values = append(block.Values, v) +} + // LogStat writes a string key and int value as a warning in a // tab-separated format easily handled by spreadsheets or awk. // file names, lines, and function names are included to provide enough (?) diff --git a/src/cmd/compile/internal/ssa/layout.go b/src/cmd/compile/internal/ssa/layout.go index e4a8c6ffbf0dde..eeb84308b53bb9 100644 --- a/src/cmd/compile/internal/ssa/layout.go +++ b/src/cmd/compile/internal/ssa/layout.go @@ -88,6 +88,7 @@ func layoutOrder(f *Func) []*Block { } bid := f.Entry.ID + blockTrace := false blockloop: for { // add block to schedule @@ -120,7 +121,6 @@ blockloop: } // Pick the next block to schedule - // Pick among the successor blocks that have not been scheduled yet. // Use likely direction if we have it. var likely *Block @@ -131,10 +131,27 @@ blockloop: likely = b.Succs[1].b } if likely != nil && !scheduled[likely.ID] { + blockTrace = true bid = likely.ID continue } + // Pick the next block in the path trace if possible, trace starts with + // statically predicted branch, e.g. + // b0: ... If -> b1(likely),b2 + // b1: ... Plain -> b3 + // schedule the path trace b0->b1->b3 sequentially + if blockTrace { + if len(b.Succs) == 1 { + s := b.Succs[0].b + if !scheduled[s.ID] { + bid = s.ID + continue blockloop + } + } + blockTrace = false + } + // Use degree for now. bid = 0 // TODO: improve this part diff --git a/src/cmd/compile/internal/ssa/lcssa.go b/src/cmd/compile/internal/ssa/lcssa.go new file mode 100644 index 00000000000000..123ef740b78561 --- /dev/null +++ b/src/cmd/compile/internal/ssa/lcssa.go @@ -0,0 +1,403 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package ssa + +import ( + "fmt" + "sort" +) + +// ---------------------------------------------------------------------------- +// Loop Closed SSA Form +// +// loop closed SSA form is a special form of SSA form, which is used to simplify +// loop optimization. It ensures that all values defined inside the loop are only +// used within loop. 
The transformation looks up loop uses outside the loop and +// inserts the appropriate "proxy phi" at the loop exit, after which the outside +// of the loop does not use the loop def directly but the proxy phi. +// +// loop header: loop header: +// v3 = Phi(0, v4) v3 = Phi(0, v4) +// If cond->loop latch,loop exit If cond->loop latch,loop exit +// +// loop latch: loop latch: +// v4 = Add(v3, 1) => v4 = Add(v3, 1) +// Plain->loop header Plain->loop header +// +// loop exit: loop exit: +// v5 = Add(5, v3) v6 = Phi(v3) <= Proxy Phi +// Ret v18 v5 = Add(5, v6) +// Ret v18 +// +// Previously, v5 used v3 directly, where v5 is in the loop exit which is outside +// the loop. After LCSSA transformation, v5 uses v6, which in turn uses v3. Here, +// v6 is the proxy phi. In the context of LCSSA, we can consider the use block of +// v6 to be the loop header rather than the loop exit. This way, all values defined +// in the loop are loop "closed", i.e. only used within the loop. +// +// Any further changes to the loop definition only need to update the proxy phi, +// rather than iterating through all its uses and handling properties such as +// dominance relationships carefully, which is error prone and hard to maintain. + +// Def-Use utilities +type user struct { + def *Value // the definition + val *Value // used by value + block *Block // used by block's ctrl value + idx int // in which arg index of user is def located +} + +type defUses map[*Value][]*user + +func (u *user) String() string { + if u.val != nil { + return fmt.Sprintf("{%v:%v}", u.def, u.val) + } else { + return fmt.Sprintf("{%v:%v}", u.def, u.block) + } +} + +// useBlock returns the block where the def is used +func (u *user) useBlock() *Block { + if u.val != nil { + return u.val.Block + } else { + return u.block + } +} + +// replaceUse replaces the use of def with new use at given index +func (u *user) replaceUse(newUse *Value) { + if val := u.val; val != nil { + idx := u.idx + assert(val.Args[idx] == u.def, "sanity check") + val.SetArg(idx, newUse) + } else if block := u.block; block != nil { + idx := u.idx + assert(block.ControlValues()[idx] == u.def, "sanity check") + block.ReplaceControl(idx, newUse) + } else { + panic("def is neither used by value nor by block ctrl") + } +} + +// buildDefUses builds def-use map for given defs Values +func buildDefUses(fn *Func, defs []*Value) defUses { + defUses := make(defUses, 0) + for _, def := range defs { + if _, exist := defUses[def]; !exist { + // Many duplicate definitions, avoid redundant memory allocations + defUses[def] = make([]*user, 0, def.Uses) + } + } + for _, block := range fn.Blocks { + for _, val := range block.Values { + for iarg, arg := range val.Args { + if _, exist := defUses[arg]; exist { + defUses[arg] = append(defUses[arg], &user{arg, val, nil, iarg}) + } + } + } + for ictrl, ctrl := range block.ControlValues() { + if _, exist := defUses[ctrl]; exist { + defUses[ctrl] = append(defUses[ctrl], &user{ctrl, nil, block, ictrl}) + } + } + } + return defUses +} + +// stableDefs returns the defs in stable order for deterministic compilation +func stableDefs(defUses defUses) []*Value { + keys := make([]*Value, 0) + for k := range defUses { + keys = append(keys, k) + } + sort.SliceStable(keys, func(i, j int) bool { + return keys[i].ID < keys[j].ID + }) + + return keys +} + +type lcssa struct { + fn *Func + mphis []*Value // inserted memory proxy phi + e2phi map[*Block]*Value // exit block to proxy phi mapping +} + +// findUseBlock returns the block where the def is used. 
If the use is type of Phi, +// then the use block is the corresponding incoming block. Note that this is ONLY +// valid in context of LCSSA. +func findUseBlock(u *user) *Block { + var ub *Block + if val := u.val; val != nil { + if val.Op == OpPhi { + ipred := u.idx + ub = val.Block.Preds[ipred].b + } else { + ub = val.Block + } + } else { + ub = u.block + } + assert(ub != nil, "no use block") + return ub +} + +// containsBlock returns true if the block is part of the loop or part of the +// inner loop +func (ln *loopnest) containsBlock(loop *loop, block *Block) bool { + assert(ln.initializedChildren, "initialize loopnest children first") + + // Block is part of current loop? + if ln.b2l[block.ID] == loop { + return true + } + // Block is part of inner loop? + for _, child := range loop.children { + if ln.containsBlock(child, block) { + return true + } + } + return false +} + +// allocateProxyPhi allocates a proxy phi at specific loop exit +func (lc *lcssa) allocateProxyPhi(exit *Block, loopDef ...*Value) *Value { + assert(len(loopDef) > 0, "must have at least one loop def") + if phival, exist := lc.e2phi[exit]; exist { + return phival + } + + phi := lc.fn.newValueNoBlock(OpPhi, loopDef[0].Type, loopDef[0].Pos) + if len(loopDef) == 1 { + phiArgs := make([]*Value, len(exit.Preds)) + for idx := range exit.Preds { + phiArgs[idx] = loopDef[0] + } + phi.AddArgs(phiArgs...) + } else { + phi.AddArgs(loopDef...) + } + + exit.placeValue(phi) + lc.e2phi[exit] = phi + if phi.Type.IsMemory() { + lc.mphis = append(lc.mphis, phi) + } + return phi +} + +func (lc *lcssa) fixProxyPhiMem(fn *Func) { + if len(lc.mphis) == 0 { + // No mem proxy phi to fix + return + } + lastMem := computeLastMem(fn) + for _, phi := range lc.mphis { + assert(phi.Type.IsMemory(), "must be memory phi") + + for iarg, arg := range phi.Args { + mem := lastMem[phi.Block.Preds[iarg].b.ID] + if mem != arg && mem != nil { + if mem.Args[0] != arg { + fn.Fatalf("must use old memory") + } + oldPhiStr := phi.LongString() + phi.SetArg(iarg, mem) + if fn.pass.debug > 1 { + fmt.Printf("== Fix memory proxy phi %v to %v\n", + oldPhiStr, phi.LongString()) + } + } + } + } +} + +// placeProxyPhi places the proxy phi at loop exits to make sure all uses of a +// loop defined value are dominated by the proxy phi +func (lc *lcssa) placeProxyPhi(ln *loopnest, loop *loop, defs []*Value) bool { + defUses := buildDefUses(ln.f, defs) + + use2exits := make(map[*user][]*Block, 0) + loopDefs := stableDefs(defUses) + for _, loopDef := range loopDefs { + for _, use := range defUses[loopDef] { + useBlock := findUseBlock(use) + // It's an in-loop use? + if ln.b2l[useBlock.ID] == loop { + continue + } + + // Loop def does not dominate use? Possibly dead block + if !ln.sdom.IsAncestorEq(loopDef.Block, useBlock) { + continue + } + + // Possibly a dead block, ignore it + if len(useBlock.Preds) == 0 { + assert(useBlock.Kind == BlockInvalid, "why not otherwise") + continue + } + + // Only loop use that is not part of current loop takes into account. 
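+			// For instance (illustrative), if two loop exits e1 and e2 both
+			// branch to a common use block u, a proxy phi is placed in each
+			// of e1 and e2 and one more phi in u merges them; this is the
+			// "harder case" handled below and exercised by TestLoopUse2.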
+ if useBlock != loopDef.Block && !ln.containsBlock(loop, useBlock) { + // Simple case, try to find a loop exit that dominates the use + // block and place the proxy phi at this loop exit, this is the + // most common case + var domExit *Block + for _, exit := range loop.exits { + if ln.sdom.IsAncestorEq(exit, useBlock) { + domExit = exit + break + } + } + if domExit != nil { + use2exits[use] = append(use2exits[use], domExit) + continue + } + // Harder case, loop use block is not dominated by a single loop + // exit, instead it has many predecessors and all of them are + // dominated by different loop exits, we are probably reaching to + // it from all of these predecessors. In this case, we need to + // place the proxy phi at all loop exits and merge them at loop + // use block by yet another proxy phi + domExits := make([]*Block, 0, len(useBlock.Preds)) + for _, pred := range useBlock.Preds { + found := false + for _, e := range loop.exits { + if ln.sdom.IsAncestorEq(e, pred.b) { + domExits = append(domExits, e) + found = true + break + } + } + if !found { + break + } + } + if cap(domExits) == len(domExits) { + use2exits[use] = domExits + continue + } + + // Worst case, loop use block is not dominated by any of loop exits + // we start from all loop exits(including inner loop exits) though + // dominance frontier and see if we can reach to the use block, + // if so, we place the proxy phi at the loop exit that is closest + // to the use block. This is rare, but it does happen, give up + // for now as it's hard to handle. + // TODO(yyang): Correctly handle this case + if ln.f.pass.debug > 1 { + fmt.Printf("== Can not process use %v in %v\n", use, loop) + } + return false + } + } + } + + // For every use of loop def, place the proxy phi at proper exit block + // and replace such use with the proxy phi, this is the core of LCSSA, + // since proxy phi is "inside the loop" in context of LCSSA, now all uses + // of loop def are loop closed, e.g. lives in the loop. + for _, loopDef := range loopDefs { + uses := defUses[loopDef] + if len(uses) == 0 { + continue + } + // multiple uses shares the same proxy phi if they live in same exit block + // also note that only users of the same loop def could share proxy phi + lc.e2phi = make(map[*Block]*Value, 0) + for _, use := range uses { + useBlock := findUseBlock(use) + exits := use2exits[use] + if len(exits) == 1 { + domExit := exits[0] + // Replace all uses of loop def with new proxy phi + lcphi := lc.allocateProxyPhi(domExit, loopDef) + if ln.f.pass.debug > 1 { + fmt.Printf("== Replace use %v with proxy phi %v\n", + use, lcphi.LongString()) + } + use.replaceUse(lcphi) + } else if len(exits) > 1 { + // Place proxy phi at all dominator loop exits + phis := make([]*Value, 0, len(exits)) + for _, exit := range exits { + lcphi := lc.allocateProxyPhi(exit, loopDef) + phis = append(phis, lcphi) + } + // Merge them at loop use block by yet another proxy phi + lcphi := lc.allocateProxyPhi(useBlock, phis...) + use.replaceUse(lcphi) + if ln.f.pass.debug > 1 { + fmt.Printf("== Replace use %v with proxy phi %v\n", + use, lcphi.LongString()) + } + } + } + } + + // Since we may have placed memory proxy phi at some loop exits, which + // use loop def and produce new memory. If this block is a predecessor + // of another loop exit, we need to use memory proxy phi instead of loop + // def as a parameter of new proxy phi. 
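+	// For example (illustrative): if exit e1 defines a memory proxy phi m1
+	// and e1 is itself a predecessor of another exit e2, the proxy phi in e2
+	// must take m1 (the last memory of e1, as computed by computeLastMem)
+	// rather than the original in-loop memory def as its argument.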
+ lc.fixProxyPhiMem(ln.f) + + return true +} + +// BuildLoopClosedForm builds loop closed SSA form upon original loop, this is +// the cornerstone of other loop optimizations such as LICM, loop unswitching +// and empty loop elimination. +func (fn *Func) BuildLoopClosedForm(ln *loopnest, loop *loop) bool { + assert(ln.initializedExits && ln.initializedChildren, "must be initialized") + if len(loop.exits) == 0 { + return true + } + + sdom := ln.sdom // lcssa does not wire up CFG, reusing sdom is okay + domBlocks := make([]*Block, 0) + blocks := make([]*Block, 0) + blocks = append(blocks, loop.exits...) + + // Outside the loop we can only use values defined in the blocks of arbitrary + // loop exit dominators, so first collect these blocks and treat the Values + // in them as loop def + for len(blocks) > 0 { + block := blocks[0] + blocks = blocks[1:] + if block == loop.header { + continue + } + idom := sdom.Parent(block) + if ln.b2l[idom.ID] != loop { + continue + } + + domBlocks = append(domBlocks, idom) + blocks = append(blocks, idom) + } + + // Look for out-of-loop users of these loop defs + defs := make([]*Value, 0) + for _, block := range domBlocks { + for _, val := range block.Values { + if val.Uses == 0 { + continue + } + defs = append(defs, val) + } + } + + // For every use of loop def, place the proxy phi at the proper block + lc := &lcssa{ + fn: fn, + mphis: make([]*Value, 0, len(defs)), + e2phi: nil, + } + return lc.placeProxyPhi(ln, loop, defs) +} diff --git a/src/cmd/compile/internal/ssa/lcssa_test.go b/src/cmd/compile/internal/ssa/lcssa_test.go new file mode 100644 index 00000000000000..f25f5adb07a214 --- /dev/null +++ b/src/cmd/compile/internal/ssa/lcssa_test.go @@ -0,0 +1,203 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package ssa + +import ( + "cmd/compile/internal/types" + "testing" +) + +func doLCSSA(fun fun) bool { + CheckFunc(fun.f) + f := fun.f + loopnest := f.loopnest() + loopnest.assembleChildren() + loopnest.findExits() + for _, loop := range loopnest.loops { + if f.BuildLoopClosedForm(loopnest, loop) { + CheckFunc(fun.f) + return true + } + } + return false +} + +// Simple Case: use block is dominated by a single loop exit +func TestLoopUse1(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("ten", OpConst64, c.config.Types.Int64, 10, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "ten"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Goto("loopHeader")), + Bloc("loopExit", + Goto("useBlock")), + Bloc("useBlock", + Valu("use", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Exit("mem"))) + + if !doLCSSA(fun) { + t.Fatal("Failed to build LCSSA") + } + + // loop header: + // i = phi(0, inc) + // .... 
+ // + // loop exit: + // p1 = phi(i) <= proxy phi + // Plain useBlock + // + // useBlock: + // use = p1 + 1 + verifyNumValue(fun, t, OpPhi, 2 /*var i + 1 proxy phi*/) +} + +// Harder Case: use block is reachable from multiple loop exits +func TestLoopUse2(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("ten", OpConst64, c.config.Types.Int64, 10, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "ten"), + If("cmp", "loopBody", "loopExit")), + Bloc("loopBody", + Valu("cmp2", OpEq64, c.config.Types.Bool, 0, nil, "i", "one"), + If("cmp2", "loopExit1", "loopLatch")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Goto("loopHeader")), + Bloc("loopExit1", + Goto("useBlock")), + Bloc("loopExit", + Goto("useBlock")), + Bloc("useBlock", + Valu("use", OpMul64, c.config.Types.Int64, 0, nil, "i", "ten"), + Exit("mem"))) + + if !doLCSSA(fun) { + t.Fatal("Failed to build LCSSA") + } + + // loop header: + // i = phi(0, inc) + // .... + // + // loop exit: + // p1 = phi(i) <= proxy phi + // Plain useBlock + // + // loop exit1: + // p2 = phi(i) <= proxy phi + // Plain useBlock + // + // useBlock: + // p3 = phi(p1, p2) <= proxy phi + // use = p1 + 1 + verifyNumValue(fun, t, OpPhi, 4 /*var i + 3 proxy phi*/) +} + +// Used by ctrl valule +func TestLoopUse3(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("ten", OpConst64, c.config.Types.Int64, 10, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "ten"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Goto("loopHeader")), + Bloc("loopExit", + // used by ctrl value + If("cmp", "exit1", "exit2")), + Bloc("exit1", + Goto("exit2")), + Bloc("exit2", + Exit("mem"))) + + if !doLCSSA(fun) { + t.Fatal("Failed to build LCSSA") + } + + // loop header: + // i = phi(0, inc) + // .... 
+ // + // loop exit: + // p1 = phi(i) <= proxy phi + // If p1-> exit1, exit2 + verifyNumValue(fun, t, OpPhi, 2 /*var i + 1 proxy phi*/) +} + +// Used by Phi +func TestLoopUse4(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("ten", OpConst64, c.config.Types.Int64, 10, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "ten"), + If("cmp", "loopBody", "loopExit")), + Bloc("loopBody", + Valu("cmp2", OpEq64, c.config.Types.Bool, 0, nil, "i", "one"), + If("cmp2", "loopExit1", "loopLatch")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Goto("loopHeader")), + Bloc("loopExit1", + Goto("useBlock")), + Bloc("loopExit", + Goto("useBlock")), + Bloc("useBlock", + Valu("use", OpPhi, c.config.Types.Int64, 0, nil, "i", "i"), + Exit("mem"))) + + if !doLCSSA(fun) { + t.Fatal("Failed to build LCSSA") + } + + // loop header: + // i = phi(0, inc) + // .... + // + // loop exit: + // p1 = phi(i) <= proxy phi + // Plain useBlock + // + // loop exit1: + // p2 = phi(i) <= proxy phi + // Plain useBlock + // + // useBlock: + // use = phi(p1, p2) + verifyNumValue(fun, t, OpPhi, 3 /*var i + 2 proxy phi*/ +1 /*original phi*/) +} diff --git a/src/cmd/compile/internal/ssa/licm.go b/src/cmd/compile/internal/ssa/licm.go new file mode 100644 index 00000000000000..6b361b2f709c3e --- /dev/null +++ b/src/cmd/compile/internal/ssa/licm.go @@ -0,0 +1,384 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package ssa + +import ( + "fmt" + "sort" +) + +// ---------------------------------------------------------------------------- +// Loop Invariant Code Motion +// +// The main idea behind LICM is to move loop invariant values outside of the loop +// so that they are only executed once, instead of being repeatedly executed with +// each iteration of the loop. In the context of LICM, if a loop invariant can be +// speculatively executed, then it can be freely hoisted to the loop entry. +// However, if it cannot be speculatively executed, there is still a chance that +// it can be hoisted outside the loop under a few prerequisites: +// +// #1 Instruction is guaranteed to execute unconditionally +// #2 Instruction does not access memory locations that may alias with other +// memory operations inside the loop +// +// For #1, this is guaranteed by loop rotation, where the loop is guaranteed to +// execute at least once after rotation. But that's not the whole story. If the +// instruction is guarded by a conditional expression (e.g., loading from a memory +// address usually guarded by an IsInBound check), in this case, we try to hoist +// it only if the loop invariant dominates all loop exits, which implies that it +// will be executed unconditionally as soon as it enters the loop. 
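+//
+// For example (illustrative), in
+//
+//	for i := 0; i < n; i++ {
+//		if cond {
+//			v = *p // p is loop invariant, but the load is guarded by cond
+//		}
+//	}
+//
+// the load of *p might never execute in the original program, so it is only
+// hoisted if its block dominates every loop exit.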
+// For #2, we always pessimistically assume that they are must-aliases and stop +// optimizing if we saw both load and store + +func logInvariant(val *Value, src *Block, dest *Block) { + hoistType := "Simple" + if isHoistable(val) { + hoistType = "Complex" + } + if dest.Func.pass.debug > 2 { + fmt.Printf("Hoist%s %v from %v to %v in %v\n", + hoistType, val.LongString(), src, dest, dest.Func.Name) + } +} + +func moveTo(val *Value, block *Block) { + for valIdx, v := range val.Block.Values { + if val != v { + continue + } + val.moveTo(block, valIdx) + break + } +} + +func isMemoryDef(val *Value) bool { + switch val.Op { + case OpStore, OpMove, OpZero, OpStoreWB, OpMoveWB, OpZeroWB, + OpPanicBounds, OpPanicExtend, + OpPubBarrier, + OpVarDef, OpVarLive, OpKeepAlive: + return true + } + return false +} + +// alwaysExecute checks if Value is guaranteed to execute during loop iterations +// Otherwise, it should not be hoisted. The most common cases are invariants +// guarded by a conditional expression. +// TODO: If we can prove that Value can speculative execute nevertheless, e.g. +// Load from non-null pointer, this is not really necessary +func alwaysExecute(sdom SparseTree, loop *loop, val *Value) bool { + block := val.Block + // Because loop header can always jump to the loop exit, all blocks + // inside the loop are never post-dominated by any loop exit. + // Therefore, we need to first apply loop rotation to eliminate the path + // from the loop header to the loop exit. + for _, exit := range loop.exits { + if exit == loop.exit { + if !sdom.IsAncestorEq(block, loop.latch) { + return false + } + continue + } + if !sdom.IsAncestorEq(block, exit) { + return false + } + } + return true +} + +func isHoistable(val *Value) bool { + // The protagonist of the whole story + switch val.Op { + case OpLoad, OpStore, OpNilCheck, OpGetG, OpVarDef, OpConvert: + return true + } + return false +} + +type hoister struct { + fn *Func + sdom SparseTree + ln *loopnest + hoisted map[*Value]bool +} + +func (h *hoister) hoist(block *Block, val *Value) { + if arg := val.MemoryArg(); arg != nil { + // If val produces memory, all its uses should be replaced with incoming + // memory input of val + if isMemoryDef(val) { + mem := arg + for _, b := range h.fn.Blocks { + b.replaceUses(val, mem) + } + } + } + + srcBlock := val.Block + moveTo(val, block) + logInvariant(val, srcBlock, block) + h.hoisted[val] = true +} + +// tryHoist hoists profitable loop invariant to block that dominates the entire +// loop. Value is considered as loop invariant if all its inputs are defined +// outside the loop or all its inputs are loop invariants. 
Since loop invariant +// will immediately moved to dominator block of loop, the first rule actually +// already implies the second rule +func (h *hoister) tryHoist(loop *loop, invariants loopInvariants, val *Value) bool { + // Value is already hoisted + if hoisted, exist := h.hoisted[val]; exist { + return hoisted + } + // Value is type of Phi, we can not hoist it now + if val.Op == OpPhi { + h.hoisted[val] = false + return false + } + + // Try to hoist arguments of value first, they are guaranteed to be loop + // invariants but not necessarily hoistable + h.hoisted[val] = false + for _, arg := range val.Args { + if arg.Type.IsMemory() { + if !isMemoryDef(arg) { + continue + } + } + if _, isInvariant := invariants[arg]; isInvariant { + if !h.tryHoist(loop, invariants, arg) { + return false + } + } else { + // Value is not loop invariant, it must dominate the loop header + // or type of memory, simply check it + if arg.Op != OpUnknown && arg.Op != OpInvalid && + !arg.Type.IsMemory() && + !h.sdom.IsAncestorEq(arg.Block, loop.header) { + h.fn.Fatalf("arg %v must define outside loop", arg) + } + } + } + + // This catches most common case, e.g. arithmetic, bit operation, etc. + if !isAccessMemory(val) { + assert(val.MemoryArg() == nil, "sanity check") + h.hoist(loop.land, val) + return true + } + + // Instructions are selected ones? + if isHoistable(val) { + assert(loop.IsRotatedForm(), "loop must be rotated") + + // Instructions are guaranteed to execute unconditionally? + if !alwaysExecute(h.sdom, loop, val) { + if h.fn.pass.debug > 1 { + fmt.Printf("LICM failure: %v not always execute\n", val.LongString()) + } + return false + } + + h.hoist(loop.land, val) + return true + } + + if h.fn.pass.debug > 1 { + fmt.Printf("LICM failure: %v is not hoistable\n", val.LongString()) + } + return false +} + +// Hoisting memory def to loop land may break memory state of loop header, this +// should be fixed after CFG transformation done +func (h *hoister) fixMemoryState(loop *loop, startMem, endMem []*Value) { + // No instruction hoisted? 
Do nothing them + if len(h.hoisted) == 0 { + return + } + + // Find last memory def in loop entry, which in turns become last memory + // or loop guard, this implies that loop guard can not contain memory def + lastMem := endMem[loop.entry.ID] + for _, val := range loop.guard.Values { + if isMemoryDef(val) { + h.fn.Fatalf("Loop guard %v contains memory def %v", loop.guard, val) + } + } + + // Find last memory def in loop land + oldLastMem := lastMem + for _, val := range loop.land.Values { + if arg := val.MemoryArg(); arg != nil { + val.SetArg(len(val.Args)-1, lastMem) + } + if isMemoryDef(val) { + lastMem = val + } + } + + // If loop land has new memory def, memory state of loop header should be + // updated as well + if oldLastMem != lastMem { + headerMem := startMem[loop.header.ID] + if headerMem == nil { + h.fn.Fatalf("Canot find start memory of loop header %v", loop.header) + } + if headerMem.Op == OpPhi { + landIdx := -1 + for idx, pred := range loop.header.Preds { + if pred.b == loop.land { + landIdx = idx + break + } + } + headerMem.SetArg(landIdx, lastMem) + } else { + loop.header.replaceUses(headerMem, lastMem) + } + } +} + +type loopInvariants map[*Value]bool + +func stableKeys(li loopInvariants) []*Value { + keys := make([]*Value, 0) + for k, _ := range li { + keys = append(keys, k) + } + sort.SliceStable(keys, func(i, j int) bool { + return keys[i].ID < keys[j].ID + }) + return keys +} + +// findInviant finds all loop invariants within the loop +func (loop *loop) findInvariant(ln *loopnest) loopInvariants { + loopValues := make(map[*Value]bool) + invariants := make(map[*Value]bool) + loopBlocks := ln.findLoopBlocks(loop) + + // First, collect all def inside loop + hasLoad, hasStore := false, false + for _, block := range loopBlocks { + for _, value := range block.Values { + if value.Op == OpLoad { + hasLoad = true + } else if value.Op == OpStore { + hasStore = true + } else if value.Op.IsCall() { + if ln.f.pass.debug > 1 { + fmt.Printf("LICM failure: find call %v\n", value.LongString()) + } + return nil + } + loopValues[value] = true + } + } + + // See if loop contains both Load and Store and pessimistically assume that + // they are must-aliases and stop optimizing + // TODO: We can do better here by using type-based alias analysis in + // some cases + if hasLoad && hasStore { + if ln.f.pass.debug > 1 { + fmt.Printf("LICM failure: %v has both load and store\n", loop) + } + return nil + } + + changed := true + for changed { + numInvar := len(invariants) + for val, _ := range loopValues { + // If basic block is located in a nested loop rather than directly in + // the current loop, it will not be processed. 
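+			// (Such values are instead considered when the inner loop itself
+			// is processed, since licm walks every loop in the loop nest.)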
+ if ln.b2l[val.Block.ID] != loop { + continue + } + isInvariant := true + for _, use := range val.Args { + if use.Type.IsMemory() { + // Discard last memory value + continue + } + if _, exist := invariants[use]; exist { + continue + } + if _, exist := loopValues[use]; exist { + isInvariant = false + break + } + } + if isInvariant { + invariants[val] = true + } + } + changed = (len(invariants) != numInvar) + } + + return invariants +} + +// licm stands for Loop Invariant Code Motion, it hoists expressions that computes +// the same value outside loop +func licm(fn *Func) { + loopnest := fn.loopnest() + if loopnest.hasIrreducible { + return + } + if len(loopnest.loops) == 0 { + return + } + + loopnest.assembleChildren() + loopnest.findExits() + lcssa := make(map[*loop]bool, 0) + + // Transform all loops to loop closed form + for _, loop := range loopnest.loops { + lcssa[loop] = fn.BuildLoopClosedForm(loopnest, loop) + } + + h := &hoister{ + fn: fn, + ln: loopnest, + hoisted: make(map[*Value]bool), + } + // Remember initial memory subgraph before LICM + startMem, endMem := memState(fn) + for _, loop := range loopnest.loops { + // See if loop is in form of LCSSA + if wellFormed := lcssa[loop]; !wellFormed { + continue + } + + // Rotate the loop to ensures that loop executes at least once + if !fn.RotateLoop(loop) { + continue + } + + // Find loop invariants within the loop + invariants := loop.findInvariant(loopnest) + if invariants == nil || len(invariants) == 0 { + continue + } + + // Create a home for hoistable Values after rotation + if !loop.CreateLoopLand(fn) { + fn.Fatalf("Can not create loop land for %v", loop.LongString()) + } + + // All prerequisites are satisfied, try to hoist loop invariants + h.sdom = fn.Sdom() + for _, val := range stableKeys(invariants) { + h.tryHoist(loop, invariants, val) + } + + // Fix broken memory state given that CFG no longer changes + h.fixMemoryState(loop, startMem, endMem) + } +} diff --git a/src/cmd/compile/internal/ssa/licm_test.go b/src/cmd/compile/internal/ssa/licm_test.go new file mode 100644 index 00000000000000..4100c5f7d9b61d --- /dev/null +++ b/src/cmd/compile/internal/ssa/licm_test.go @@ -0,0 +1,192 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package ssa + +import ( + "cmd/compile/internal/types" + "fmt" + "testing" +) + +func doLICM(fun fun) { + CheckFunc(fun.f) + licm(fun.f) + CheckFunc(fun.f) +} + +func checkHoist(t *testing.T, fun fun, loopInvariants ...string) { + loopHeader := fun.blocks["loopHeader"] + // Find loop land block + sdom := fun.f.Sdom() + var loopLand *Block + for _, pred := range loopHeader.Preds { + if sdom.isAncestor(pred.b, loopHeader) { + loopLand = pred.b + break + } + } + if loopLand == nil { + fmt.Printf("== After LICM: %v\n", fun.f.String()) + t.Errorf("Error: loop land block not found\n") + } + if len(loopLand.Preds) != 1 || len(loopLand.Succs) != 1 { + fmt.Printf("== After LICM: %v\n", fun.f.String()) + t.Errorf("Error: bad loop land\n") + } + // Find expected loop invariant from loop land + cnt := 0 + for _, li := range loopInvariants { + for _, val := range loopLand.Values { + if val == fun.values[li] { + cnt++ + break + } + } + } + + if cnt != len(loopInvariants) { + fmt.Printf("== After LICM: %v\n", fun.f.String()) + t.Errorf("Error: loop invariant not found in loop land") + } +} + +// Hoist simple arithmetic loop invariant +// +// for i := 0; i < 10; i++ { +// li := 10 * 10 +// } +func TestHoistSimpleLI(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("ten", OpConst64, c.config.Types.Int64, 10, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("li", OpMul64, c.config.Types.Int64, 0, nil, "ten", "one"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "ten"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Goto("loopHeader")), + Bloc("loopExit", + Exit("mem"))) + + doLICM(fun) + checkHoist(t, fun, "li") +} + +// Hoist simple arithmetic but may trap execution +// +// func foo(arg1 int) +// for i := 0; i < 10; i++ { +// li := (10*10) / arg1 /*may be zero*/ +// } +// } +func TestHoistTrapDiv(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("arg1", OpArg, c.config.Types.Int64, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("ten", OpConst64, c.config.Types.Int64, 10, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("li", OpMul64, c.config.Types.Int64, 0, nil, "ten", "one"), + Valu("li2", OpDiv64, c.config.Types.Int64, 0, nil, "li", "arg1"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "ten"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Goto("loopHeader")), + Bloc("loopExit", + Exit("mem"))) + + doLICM(fun) + checkHoist(t, fun, "li", "li2") +} + +// Hoist load from loop +func TestHoistLoad(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("sb", OpSB, c.config.Types.Uintptr, 0, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + 
Valu("addr1", OpAddr, c.config.Types.Int64.PtrTo(), 0, nil, "sb"), + Valu("load", OpLoad, c.config.Types.Int8, 0, nil, "addr1", "mem"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "load"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "load", "i"), + Goto("loopHeader")), + Bloc("loopExit", + Exit("mem"))) + + doLICM(fun) + checkHoist(t, fun, "load", "addr1") +} + +func TestHoistStore(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("sb", OpSB, c.config.Types.Uintptr, 0, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("addr", OpAddr, c.config.Types.Int8.PtrTo(), 0, nil, "sb"), + Valu("store", OpStore, types.TypeMem, 0, nil, "addr", "one", "mem"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "one"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Goto("loopHeader")), + Bloc("loopExit", + Exit("mem"))) + + doLICM(fun) + checkHoist(t, fun, "store", "addr") +} + +// Hoist nil check from loop +func TestHoistNilCheck(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("sb", OpSB, c.config.Types.Uintptr, 0, nil), + Valu("addr", OpAddr, c.config.Types.Int8.PtrTo(), 0, nil, "sb"), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("nilcheck", OpNilCheck, c.config.Types.IntPtr, 0, nil, "addr", "mem"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "one"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Goto("loopHeader")), + Bloc("loopExit", + Exit("mem"))) + + doLICM(fun) + checkHoist(t, fun, "nilcheck") +} diff --git a/src/cmd/compile/internal/ssa/likelyadjust.go b/src/cmd/compile/internal/ssa/likelyadjust.go index 1d0e53cf5b6086..78313fb2844a88 100644 --- a/src/cmd/compile/internal/ssa/likelyadjust.go +++ b/src/cmd/compile/internal/ssa/likelyadjust.go @@ -4,26 +4,81 @@ package ssa -import ( - "fmt" -) - +import "fmt" + +// ---------------------------------------------------------------------------- +// The Loop +// +// The natural loop usually looks like in below IR form: +// +// loop entry +// │ +// │ ┌───loop latch +// ▼ ▼ ▲ +// loop header │ +// │ │ │ +// │ └──►loop body +// ▼ +// loop exit +// +// In the terminology, loop entry dominates the entire loop, loop header contains +// the loop conditional test, loop body refers to the code that is repeated, loop +// latch contains the backedge to loop header, for simple loops, the loop body is +// equal to loop latch, and loop exit refers to the block that dominated by the +// entire loop. 
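+//
+// In Go source terms (illustrative), for a loop such as
+//
+//	for i := 0; i < n; i++ { body }
+//
+// the entry computes the initial value of i, the header holds the i < n test,
+// the body and latch hold the loop body and the i++ increment with the
+// backedge, and the exit is the code that runs after the loop.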
+// +// After loop rotation, the loop will be transformed to below form with additional +// guard block and land block: +// +// loop entry +// │ +// │ +// ▼ +// ┌──loop guard +// │ │ +// │ │ +// │ ▼ +// | loop land <= safe land to place Values +// │ │ +// │ │ +// │ ▼ +// │ loop header◄──┐ +// │ │ │ +// │ │ │ +// │ ▼ │ +// │ loop body │ +// │ │ │ +// │ │ │ +// │ ▼ │ +// │ loop latch────┘ +// │ │ +// │ │ +// │ ▼ +// └─► loop exit +// +// Where loop guard ensures the loop body is executed at least once, and loop +// land is a safe place to place Values that are moved out of the loop and only +// executed once before the loop. type loop struct { header *Block // The header node of this (reducible) loop - outer *loop // loop containing this loop + entry *Block // loop entry which jumps to header directly + exit *Block // The unique main exit block of this loop, if any + latch *Block // Source of backedge, where increment happens + body *Block // The first loop body, near to the header + guard *Block // Ensure loop executed at least once after rotation + land *Block // Safe land block to place instructions after rotation + outer *loop // Outer loop containing this loop // By default, children, exits, and depth are not initialized. children []*loop // loops nested directly within this loop. Initialized by assembleChildren(). exits []*Block // exits records blocks reached by exits from this loop. Initialized by findExits(). - // Next three fields used by regalloc and/or + // Next four fields used by regalloc and/or // aid in computation of inner-ness and list of blocks. - nBlocks int32 // Number of blocks in this loop but not within inner loops - depth int16 // Nesting depth of the loop; 1 is outermost. Initialized by calculateDepths(). - isInner bool // True if never discovered to contain a loop - - // register allocation uses this. - containsUnavoidableCall bool // True if all paths through the loop have a call + nBlocks int32 // Number of blocks in this loop but not within inner loops + depth int16 // Nesting depth of the loop; 1 is outermost. Initialized by calculateDepths(). 
+ isInner bool // True if never discovered to contain a loop + containsUnavoidableCall bool // True if all paths through the loop have a call } // outerinner records that outer contains inner @@ -63,7 +118,7 @@ func checkContainsCall(bb *Block) bool { type loopnest struct { f *Func - b2l []*loop + b2l []*loop // block id to loop mapping po []*Block sdom SparseTree loops []*loop @@ -236,21 +291,36 @@ func likelyadjust(f *Func) { } func (l *loop) String() string { - return fmt.Sprintf("hdr:%s", l.header) + return fmt.Sprintf("Loop@%s", l.header) } -func (l *loop) LongString() string { - i := "" - o := "" - if l.isInner { - i = ", INNER" - } - if l.outer != nil { - o = ", o=" + l.outer.header.String() +func (loop *loop) LongString() string { + // Loop: loop header + // T: loop entry + // B: loop body + // E: loop exit + // L: loop latch + // G: loop guard + // + // * denotes main loop exit + if len(loop.exits) == 1 { + return fmt.Sprintf("Loop@%v(B@%v E@%v L@%v G@%v T@%v)", + loop.header, loop.body, loop.exit, loop.latch, loop.guard, loop.entry) + } else { + s := "" + for i, exit := range loop.exits { + s += exit.String() + if exit == loop.exit { + s += "*" + } + if i != len(loop.exits)-1 { + s += " " + } + } + return fmt.Sprintf("Loop@%v(B@%v E@(%v) L@%v G@%v T@%v)", + loop.header, loop.body, s, loop.latch, loop.guard, loop.entry) } - return fmt.Sprintf("hdr:%s%s%s", l.header, i, o) } - func (l *loop) isWithinOrEq(ll *loop) bool { if ll == nil { // nil means whole program return true @@ -511,6 +581,18 @@ func (ln *loopnest) calculateDepths() { ln.initializedDepth = true } +func removeDuplicate(blocks []*Block) []*Block { + allKeys := make(map[*Block]bool) + list := []*Block{} + for _, item := range blocks { + if _, value := allKeys[item]; !value { + allKeys[item] = true + list = append(list, item) + } + } + return list +} + // findExits uses loop depth information to find the // exits from a loop. func (ln *loopnest) findExits() { @@ -521,17 +603,29 @@ func (ln *loopnest) findExits() { b2l := ln.b2l for _, b := range ln.po { l := b2l[b.ID] - if l != nil && len(b.Succs) == 2 { - sl := b2l[b.Succs[0].b.ID] - if recordIfExit(l, sl, b.Succs[0].b) { - continue - } - sl = b2l[b.Succs[1].b.ID] - if recordIfExit(l, sl, b.Succs[1].b) { - continue + if l != nil { + if len(b.Succs) == 2 { + sl := b2l[b.Succs[0].b.ID] + if recordExit(l, sl, b.Succs[0].b) { + continue + } + sl = b2l[b.Succs[1].b.ID] + if recordExit(l, sl, b.Succs[1].b) { + continue + } + } else if len(b.Succs) > 2 { // JumpTable + assert(b.Kind == BlockJumpTable, "why not otherwise") + for _, s := range b.Succs { + sl := b2l[s.b.ID] + recordExit(l, sl, s.b) + } } } } + // Remove duplicated exits for every loop + for _, loop := range ln.loops { + loop.exits = removeDuplicate(loop.exits) + } ln.initializedExits = true } @@ -543,10 +637,10 @@ func (ln *loopnest) depth(b ID) int16 { return 0 } -// recordIfExit checks sl (the loop containing b) to see if it +// recordExit checks sl (the loop containing b) to see if it // is outside of loop l, and if so, records b as an exit block // from l and returns true. 
-func recordIfExit(l, sl *loop, b *Block) bool { +func recordExit(l, sl *loop, b *Block) bool { if sl != l { if sl == nil || sl.depth <= l.depth { l.exits = append(l.exits, b) @@ -578,3 +672,45 @@ func (l *loop) setDepth(d int16) { func (l *loop) iterationEnd(b *Block, b2l []*loop) bool { return b == l.header || b2l[b.ID] == nil || (b2l[b.ID] != l && b2l[b.ID].depth <= l.depth) } + +// contains checks if receiver loop contains inner loop in any depth +func (loop *loop) contains(inner *loop) bool { + // Find from current loop + for _, child := range loop.children { + if child == inner { + return true + } + } + // Find from child of current loop + for _, child := range loop.children { + if child.contains(inner) { + return true + } + } + return false +} + +// findLoopBlocks returnss all basic blocks, including those contained in nested loops. +func (ln *loopnest) findLoopBlocks(loop *loop) []*Block { + ln.assembleChildren() + loopBlocks := make([]*Block, 0) + for id, tloop := range ln.b2l { + if tloop == nil { + continue + } + if tloop == loop { + // Find block by id and append it + for _, block := range ln.f.Blocks { + if int32(block.ID) == int32(id) { + loopBlocks = append(loopBlocks, block) + break + } + } + } else if loop.contains(tloop) { + // Otherwise, check if this block is within inner loops + blocks := ln.findLoopBlocks(tloop) + loopBlocks = append(loopBlocks, blocks...) + } + } + return loopBlocks +} diff --git a/src/cmd/compile/internal/ssa/looprotate.go b/src/cmd/compile/internal/ssa/looprotate.go index 844a8f712447c9..4574e5508f88fc 100644 --- a/src/cmd/compile/internal/ssa/looprotate.go +++ b/src/cmd/compile/internal/ssa/looprotate.go @@ -1,12 +1,1111 @@ -// Copyright 2017 The Go Authors. All rights reserved. +// Copyright 2023 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package ssa -// loopRotate converts loops with a check-loop-condition-at-beginning -// to loops with a check-loop-condition-at-end. -// This helps loops avoid extra unnecessary jumps. +import ( + "fmt" + "sort" +) + +// ---------------------------------------------------------------------------- +// Loop Rotation +// +// Loop rotation transforms while/for loop to do-while style loop. The original +// natural loop is in form of below IR +// +// loop entry +// │ +// │ ┌───loop latch +// ▼ ▼ ▲ +// loop header │ +// │ │ │ +// │ └──►loop body +// ▼ +// loop exit +// +// We move the conditional test from loop header to loop latch, incoming backedge +// argument of conditional test should be updated as well otherwise we would lose +// one update. Also note that any other uses of moved values should be updated +// because moved Values now live in loop latch and may no longer dominates their +// uses. At this point, loop latch determines whether loop continues or exits +// based on rotated test. +// +// loop entry +// │ +// │ +// ▼ +// loop header◄──┐ +// │ │ +// │ │ +// ▼ │ +// loop body │ +// │ │ +// │ │ +// ▼ │ +// loop latch────┘ +// │ +// │ +// ▼ +// loop exit +// +// Now loop header and loop body are executed unconditionally, this may changes +// program semantics while original program executes them only if test is okay. +// A so-called loop guard is inserted to ensure loop is executed at least once. 
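+//
+// In source terms (illustrative pseudo code), rotation plus the guard rewrites
+//
+//	for cond { body }
+//
+// into
+//
+//	if cond {            // loop guard
+//		do { body } while cond
+//	}
+//
+// so the conditional test now sits at the bottom of the loop (the latch) and
+// the guard preserves the zero-iteration behavior of the original loop.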
+// +// loop entry +// │ +// │ +// ▼ +// ┌──loop guard +// │ │ +// │ │ +// │ ▼ +// │ loop header◄──┐ +// │ │ │ +// │ │ │ +// │ ▼ │ +// │ loop body │ +// │ │ │ +// │ │ │ +// │ ▼ │ +// │ loop latch────┘ +// │ │ +// │ │ +// │ ▼ +// └─► loop exit +// +// Loop header no longer dominates entire loop, loop guard dominates it instead. +// If Values defined in the loop were used outside loop, all these uses should be +// replaced by a new Phi node at loop exit which merges control flow from loop +// header and loop guard. Based on Loop Closed SSA Form, these Phis have already +// been created. All we need to do is simply reset their operands to accurately +// reflect the fact that loop exit is a merge point now. +// +// One of the main purposes of Loop Rotation is to assist other optimizations +// such as LICM. They may require that the rotated loop has a proper while safe +// block to place new Values, an optional loop land block is hereby created to +// give these optimizations a chance to keep them from being homeless. +// +// loop entry +// │ +// │ +// ▼ +// ┌──loop guard +// │ │ +// │ │ +// │ ▼ +// | loop land <= safe land to place Values +// │ │ +// │ │ +// │ ▼ +// │ loop header◄──┐ +// │ │ │ +// │ │ │ +// │ ▼ │ +// │ loop body │ +// │ │ │ +// │ │ │ +// │ ▼ │ +// │ loop latch────┘ +// │ │ +// │ │ +// │ ▼ +// └─► loop exit +// +// The detailed loop rotation algorithm is summarized as following steps +// +// 1. Transform the loop to Loop Closed SSA Form +// * All uses of loop defined Values will be replaced by uses of proxy phis +// +// 2. Check whether loop can apply loop rotate +// * Loop must be a natural loop and have a single exit and so on.. +// +// 3. Rotate loop conditional test and rewire loop edges +// * Rewire loop header to loop body unconditionally. +// * Rewire loop latch to header and exit based on new conditional test. +// * Create new loop guard block and rewire loop entry to loop guard. +// * Clone conditional test from loop header to loop guard. +// * Rewire loop guard to original loop header and loop exit +// +// 4. 
Reconcile broken data dependencies after the CFG transformation
+//   * Move the conditional test from loop header to loop latch
+//   * Update uses of moved Values because these defs no longer dominate their
+//     uses after they were moved to loop latch
+//   * Add the corresponding argument for phis at loop exits since a new edge
+//     from loop guard to loop exit has been created
+//   * Update proxy phis to use the loop phi's incoming argument which comes
+//     from loop latch since loop latch may terminate the loop now
+
+// checkLoopForm checks if the loop is well formed and returns the failure
+// reason if not.
+func (loop *loop) checkLoopForm(fn *Func, sdom SparseTree) string {
+	loopHeader := loop.header
+	// Check if the loop header is a well formed block
+	if len(loopHeader.Preds) != 2 || len(loopHeader.Succs) != 2 ||
+		loopHeader.Kind != BlockIf {
+		return "bad loop header"
+	}
+
+	// Check if a loop exit is adjacent to the loop header
+	fn.loopnest().findExits() // initialize loop exits
+	e1, e2 := loopHeader.Succs[1].b, loopHeader.Succs[0].b
+	found := false
+	for _, exit := range loop.exits {
+		if exit == e1 {
+			loop.exit = e1
+			loop.body = loopHeader.Succs[0].b
+			found = true
+			break
+		} else if exit == e2 {
+			loop.exit = e2
+			loop.body = loopHeader.Succs[1].b
+			found = true
+			break
+		}
+	}
+	if !found {
+		return "far loop exit beyond header"
+	}
+
+	loop.latch = loopHeader.Preds[1].b
+
+	// Check if the loop header dominates all loop exits
+	if len(loop.exits) != 1 {
+		for _, exit := range loop.exits {
+			if exit == loop.exit {
+				continue
+			}
+			// The loop header may not dominate all loop exits; give up on
+			// these exotic cases
+			if !sdom.IsAncestorEq(loopHeader, exit) {
+				return "loop exit is not dominated by header"
+			}
+		}
+	}
+
+	// Check that the loop conditional test is "trivial"
+	for _, ctrl := range loop.header.ControlValues() {
+		if !loop.isTrivial(sdom, ctrl, true) {
+			return "non trivial loop cond"
+		}
+	}
+
+	// Check that all loop uses are "trivial"
+	for ipred, pred := range loop.exit.Preds {
+		if pred.b == loop.header {
+			for _, val := range loop.exit.Values {
+				// TODO: Relax or remove this restriction
+				if val.Op == OpPhi {
+					if arg := val.Args[ipred]; arg.Block == loop.header {
+						if !loop.isTrivial(sdom, arg, false) {
+							return "use non trivial loop def outside loop"
+						}
+					}
+				} else if val.Block == loop.header {
+					if !loop.isTrivial(sdom, val, false) {
+						return "use non trivial loop def outside loop"
+					}
+				}
+			}
+			break
+		}
+	}
+	return ""
+}
+
+// A loop def is "trivial" if, starting from that value and walking its argument
+// chain until the loop phi defined in the loop header is reached, no intractable
+// values are encountered along the way and the lookup depth does not exceed
+// MaxDepth. We need this restriction because every value in the chain from the
+// loop phi to the trivial loop def may be cloned into another block, and cloning
+// without careful scrutiny would lead to code bloat and an extra performance
+// penalty.
+const (
+	InitDepth = 0
+	MaxDepth  = 5
+)
+
+type loopTrivialVal struct {
+	cloning  bool
+	valBlock *Block
+	touched  map[*Value]*Value
+	visited  map[*Value]bool
+}
+
+func (t *loopTrivialVal) clone(val *Value, dest *Block, depth int) *Value {
+	// A Phi, or a value that lives in a different block than the source block,
+	// cannot be part of the trivial loop def chain; do nothing
+	if val.Op == OpPhi || val.Block != t.valBlock {
+		return val
+	}
+
+	// If val is already cloned, use the cloned value instead.
+ if c, exist := t.touched[val]; exist { + return c + } + + // Clone val and its arguments recursively + clone := dest.Func.newValueNoBlock(val.Op, val.Type, val.Pos) + clone.AuxInt = val.AuxInt + clone.Aux = val.Aux + args := make([]*Value, len(val.Args)) + for i := 0; i < len(val.Args); i++ { + args[i] = t.clone(val.Args[i], dest, depth+1) + } + clone.AddArgs(args...) + dest.placeValue(clone) + t.touched[val] = clone // cache cloned value after cloning its arguments + return clone +} + +func (t *loopTrivialVal) move(val *Value, dest *Block, depth int) { + if val.Op == OpPhi || val.Block != t.valBlock { + return + } + for _, arg := range val.Args { + t.move(arg, dest, depth+1) + } + moveTo(val, dest) +} + +func (t *loopTrivialVal) update(val *Value, loop *loop, loopPhiIdx, depth int) { + // It's a Phi or value that lives different from source block? It must not + // part of trivial loop def chain, do nothing + if val.Op == OpPhi || val.Block != t.valBlock { + return + } + if _, hasCycle := t.visited[val]; hasCycle { + // Just skip it to avoid infinite recursion + return + } + t.visited[val] = true + for iarg, arg := range val.Args { + // If arg of val is a Phi which lives in loop header? + if arg.Op == OpPhi && arg.Block == loop.header { + // If expected incoming argument of arg is not visited, this implies + // that it may comes from loop latch, this is the most common case, + // update val to use incoming argument instead of arg. Otherwise, + // there is a cyclic dependency, see below for more details. + newUse := arg.Args[loopPhiIdx] + if _, livesInHeader := t.touched[newUse]; !livesInHeader { + // In original while/for loop, a critical edge is inserted at the + // end of each iteration, Phi values are updated. All subsequent + // uses of Phi rely on updated values. However, when converted + // to a do-while loop, Phi nodes may be used at the end of each + // iteration before they are updated. Therefore, we need to + // replace all subsequent uses of Phi with use of Phi parameter. + // This way, it is equivalent to using updated values of Phi + // values. Here is a simple example: + // + // Normal case, if v2 uses v1 phi, and the backedge operand v4 + // of v1 phi is located in the loop latch block, we only need to + // modify the usage of v1 by v2 to the usage of v4. This prevents + // loss of updates, and the dominance relationship will not be + // broken even after v2 is moved to the loop latch. + // + // Before: + // loop header: + // v1 = phi(0, v4) + // v2 = v1 + 1 + // If v2 < 3 -> loop body, loop exit + // + // loop latch: + // v4 = const 512 + // + // After: + // loop header: + // v1 = phi(0, v4) + // + // loop latch: + // v4 = const 512 + // v2 = v4 + 1 + // If v2 < 3 -> loop header, loop exit + val.SetArg(iarg, newUse) + // After updating uses of val, we may create yet another cyclic + // dependency, i.e. + // + // loop header: + // v1 = phi(0, v4) + // v2 = v1 + 1 + // If v2 < 3 -> loop body, loop exit + // + // loop latch: + // v4 = v2 + 1 + // + // After updating iarg of val to newUse, it becomes + // + // loop header: + // v1 = phi(0, v4) + // + // loop latch: + // v2 = v4 + 1 ;;; cyclic dependency + // v4 = v2 + 1 + // If v2 < 3 -> loop header, loop exit + // + // This is similiar to below case, and it would be properly handled + // by updateMovedUses. For now, we just skip it to avoid infinite + // recursion. 
+ } else { + // If there is a value v1 in the loop header that is used to define + // a v2 phi in the same basic block, and this v2 phi is used in + // turn to use the value v1, there is a cyclic dependency, i.e. + // + // loop header: + // v1 = phi(0, v2) ;;; cyclic dependency + // v2 = v1 + 1 + // If v2 < 3 -> loop body, loop exit + // + // In this case, we need to first convert the v1 phi into its + // normal form, where its back edge parameter uses the value defined + // in the loop latch. + // + // loop header: + // v1 = phi(0, v3) + // v2 = v1 + 1 + // If v2 < 3 -> loop body, loop exit + // + // loop latch: + // v3 = Copy v2 + // + // After this, the strange v1 phi is treated in the same way as + // other phis. After moving the conditional test to the loop latch, + // the relevant parameters will also be updated, i.e., v2 will + // use v3 instead of v1 phi: + // + // loop header: + // v1 = phi(0, v3) + // + // loop latch: + // v3 = Copy v2 + // v2 = v3 + 1 + // If v2 < 3 -> loop header, loop exit + // + // Finally, since v3 is use of v2, after moving v2 to the loop + // latch, updateMovedUses will update these uses and insert a + // new v4 Phi. + // + // loop header: + // v1 = phi(0, v3) + // v4 = phi(v2', v2) ;;; v2' lives in loop guard + // + // loop latch: + // v3 = Copy v4 + // v2 = v3 + 1 + // If v2 < 3 -> loop header, loop exit + + // Copy from cyclic dependency value and place it to loop latch + fn := arg.Block.Func + copy := fn.newValueNoBlock(OpCopy, arg.Type, arg.Pos) + if t.cloning { + // If we are cloning, we need to be very careful when updating + // the clonee, not the clone, otherwise, it can lead to another + // disastrous circular dependencies, e.g. + // + // loop header: + // v1 = phi(0, v3) + // + // loop latch: + // v3 = Copy v2 + // v2 = v3 + 1 + // If v2 < 3 -> loop header, loop exit + // + // critical block(between loop latch and loop exit): + // v3' = Copy v2 ;;; copy from v2 instead of v2' + // v2' = v3' + 1 + for clonee, clone := range t.touched { + if clone == val { + copy.SetArgs1(clonee) + break + } + } + if len(copy.Args) == 0 { + fn.Fatalf("can not found clone from clonee") + } + } else { + copy.SetArgs1(newUse) + } + loop.latch.placeValue(copy) + // Replace incoming argument of loop phi to copied value + arg.SetArg(loopPhiIdx, copy) + // Update val to use copied value as usual + val.SetArg(iarg, copy) + + if fn.pass.debug > 1 { + fmt.Printf("== Insert %v during updating %v\n", copy, val) + } + } + } else { + t.update(arg, loop, loopPhiIdx, depth+1) + } + } +} + +func (t *loopTrivialVal) valid(sdom SparseTree, val *Value, allowSideEffect bool, depth int) bool { + if depth >= MaxDepth { + return false + } + + if sdom.isAncestor(val.Block, t.valBlock) { + return true + } + + if val.Op == OpPhi { + if val.Block == t.valBlock { + return true + } + return false + } + + if !allowSideEffect { + if val.Op != OpLoad && isAccessMemory(val) { + return false + } + } + + for _, arg := range val.Args { + if !t.valid(sdom, arg, allowSideEffect, depth+1) { + return false + } + } + return true +} + +// isTrivial checks if val is "trivial" and returns true if it is, otherwise false. 
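+//
+// A rough illustration, assuming MaxDepth = 5 as defined above:
+//
+//	loop header:
+//	v1 = phi(0, v4)
+//	v2 = v1 + 1            ;;; trivial, one step away from the loop phi
+//	If v2 < 3 -> loop body, loop exit
+//
+// Here v2 is a trivial loop def: walking its arguments reaches the loop phi v1
+// within MaxDepth steps. A def whose argument chain exceeds MaxDepth, or (when
+// allowSideEffect is false) one that touches memory other than a plain Load,
+// is not trivial.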
+func (loop *loop) isTrivial(sdom SparseTree, val *Value, allowSideEffect bool) bool { + t := &loopTrivialVal{ + valBlock: loop.header, + } + return t.valid(sdom, val, allowSideEffect, InitDepth) +} + +// cloneTrivial clones val to destination block and updates its uses accordingly +func (loop *loop) cloneTrivial(val *Value, dest *Block, loopPhiIdx int) (*Value, map[*Value]*Value) { + t := &loopTrivialVal{ + cloning: true, + valBlock: val.Block, + touched: make(map[*Value]*Value), + visited: make(map[*Value]bool), + } + clone := t.clone(val, dest, InitDepth) + t.valBlock = dest + t.update(clone, loop, loopPhiIdx, InitDepth) + return clone, t.touched +} + +// moveTrivial moves val to destination block and updates its uses accordingly +func (loop *loop) moveTrivial(val *Value, dest *Block, cloned map[*Value]*Value, loopPhiIdx int) { + t := &loopTrivialVal{ + cloning: false, + valBlock: val.Block, + visited: make(map[*Value]bool), + } + t.move(val, dest, InitDepth) + t.valBlock = dest + t.touched = cloned + t.update(val, loop, loopPhiIdx, InitDepth) +} + +// moveCond moves conditional test from loop header to loop latch +func (loop *loop) moveCond(cond *Value, cloned map[*Value]*Value) { + if cond.Block != loop.header { + // More rare, ctrl Value is not live in loop header, do nothing + return + } + + if cond.Op == OpPhi { + // Rare case, Phi is used as conditional test, use its incoming argument + // If (Phi v1 v2) -> loop body, loop exit + // => If v1 -> loop header, loop exit + cond = cond.Args[LoopLatch2HeaderPredIdx] + loop.latch.SetControl(cond) + return + } + + // Normal case, update as usual + // If (Less v1 Phi(v2 v3)) -> loop body, loop exit + // => If (Less v1 v2) -> loop header, loop exit + loop.moveTrivial(cond, loop.latch, cloned, LoopLatch2HeaderPredIdx) +} + +// cloneCond clones conditional test from loop header to loop guard +func (loop *loop) cloneCond(cond *Value) (*Value, map[*Value]*Value) { + if cond.Block != loop.header { + // Dont clone if ctrl Value is not live in loop header + return cond, nil + } + + if cond.Op == OpPhi { + // Use incoming argument of Phi as conditional test directly + guardCond := cond.Args[LoopGuard2HeaderPredIdx] + return guardCond, nil + } + + // Clone as usual + return loop.cloneTrivial(cond, loop.guard, LoopGuard2HeaderPredIdx) +} + +const ( + LoopGuard2HeaderPredIdx = 0 + LoopLatch2HeaderPredIdx = 1 +) + +// rewireLoopHeader rewires loop header to loop body unconditionally +func (loop *loop) rewireLoopHeader() { + loopHeader := loop.header + loopHeader.Reset(BlockPlain) + + // loopHeader -> loopBody(0) + loopHeader.Succs = loopHeader.Succs[:1] + loopHeader.Succs[0] = Edge{loop.body, 0} + assert(len(loop.body.Preds) == 1, "why not otherwise") + loop.body.Preds[0] = Edge{loopHeader, 0} +} + +// rewireLoopLatch rewires loop latch to loop header and loop exit +func (loop *loop) rewireLoopLatch(ctrl *Value, exitIdx int) { + loopExit := loop.exit + loopLatch := loop.latch + loopHeader := loop.header + loopLatch.resetWithControl(BlockIf, ctrl) + loopLatch.Likely = loopHeader.Likely + loopLatch.Pos = ctrl.Pos + loopHeader.Likely = BranchUnknown + + var idx = -1 + for i := 0; i < len(loopExit.Preds); i++ { + if loopExit.Preds[i].b == loop.header { + idx = i + break + } + } + if exitIdx == 1 { + // loopLatch -> loopHeader(0), loopExit(1) + loopLatch.Succs = append(loopLatch.Succs, Edge{loopExit, idx}) + } else { + // loopLatch -> loopExit(0), loopHeader(1) + loopLatch.Succs = append([]Edge{{loopExit, idx}}, loopLatch.Succs[:]...) 
+ } + // loopExit <- loopLatch, ... + loopExit.Preds[idx] = Edge{loopLatch, exitIdx} + // loopHeader <- loopLatch, ... + for i := 0; i < len(loopHeader.Preds); i++ { + if loopHeader.Preds[i].b == loopLatch { + idx = i + break + } + } + loopHeader.Preds[idx] = Edge{loopLatch, 1 - exitIdx} +} + +// rewireLoopGuard rewires loop guard to loop header and loop exit +func (loop *loop) rewireLoopGuard(guardCond *Value, exitIdx int) { + assert(len(loop.guard.Preds) == 1, "already setup") + loopHeader := loop.header + loopGuard := loop.guard + loopGuard.Pos = loopHeader.Pos + loopGuard.Likely = loopHeader.Likely // respect header's branch predication + loopGuard.SetControl(guardCond) + + var idx = -1 + assert(len(loopHeader.Preds) == 2, "sanity check") + for i := 0; i < len(loopHeader.Preds); i++ { + if loopHeader.Preds[i].b != loop.latch { + idx = i + break + } + } + + loopExit := loop.exit + numExitPred := len(loopExit.Preds) + if exitIdx == 1 { + // loopGuard -> loopHeader(0), loopExit(1) + loopGuard.Succs = append(loopGuard.Succs, Edge{loopHeader, idx}) + loopGuard.Succs = append(loopGuard.Succs, Edge{loopExit, numExitPred}) + loopExit.Preds = append(loopExit.Preds, Edge{loopGuard, 1}) + loopHeader.Preds[idx] = Edge{loopGuard, 0} + } else { + // loopGuard -> loopExit(0), loopHeader(1) + loopGuard.Succs = append(loopGuard.Succs, Edge{loopExit, numExitPred}) + loopGuard.Succs = append(loopGuard.Succs, Edge{loopHeader, idx}) + loopExit.Preds = append(loopExit.Preds, Edge{loopGuard, 0}) + loopHeader.Preds[idx] = Edge{loopGuard, 1} + } +} + +// rewireLoopEntry rewires loop entry to loop guard +func (loop *loop) rewireLoopEntry(loopGuard *Block) { + assert(len(loop.header.Preds) == 2, "sanity check") + + // Find loop entry from predecessor of loop header + for _, pred := range loop.header.Preds { + if pred.b != loop.latch { + loop.entry = pred.b + break + } + } + assert(loop.entry != nil, "missing loop entry") + + // If loop entry is plain block, simply add edge from loop entry to guard + loopEntry := loop.entry + if len(loopEntry.Succs) == 1 { + // loopEntry(0) -> loopGuard + loopEntry.Succs = loopEntry.Succs[:0] + loopEntry.AddEdgeTo(loopGuard) + } else { + // Rewire corresponding successor of loop entry to loop guard (This could + // be constructed in artificial IR test, but does it really happen?...) + var idx = -1 + for isucc, succ := range loopEntry.Succs { + if succ.b == loop.header { + idx = isucc + break + } + } + // loopEntry(idx) -> loopGuard, ... + loopEntry.Succs[idx] = Edge{loopGuard, 0} + loopGuard.Preds = append(loopGuard.Preds, Edge{loopEntry, idx}) + } +} + +// insertBetween inserts an empty block in the middle of start and end block. +// If such block already exists, it will be returned instead. +func insertBetween(fn *Func, start, end *Block) *Block { + for _, succ := range start.Succs { + if succ.b == end { + break + } else if len(succ.b.Succs) == 1 && succ.b.Succs[0].b == end { + return succ.b + } + } + empty := fn.NewBlock(BlockPlain) + empty.Preds = make([]Edge, 1, 1) + empty.Succs = make([]Edge, 1, 1) + start.ReplaceSucc(end, empty, 0) + end.ReplacePred(start, empty, 0) + return empty +} + +func (loop *loop) findLoopGuardIndex() int { + if loop.header.Preds[0].b == loop.latch { + return 1 + } + return 0 +} + +func (loop *loop) findLoopBackedgeIndex() int { + return 1 - loop.findLoopGuardIndex() +} + +// Loop header no longer dominates loop exit, a new edge from loop guard to loop +// exit is created, this is not reflected in proxy phis in loop exits, i.e. 
these +// proxy phis miss one argument that comes from loop guard, we need to reconcile +// the divergence +// +// loop guard +// | +// loop exit loop exit / +// | => | / +// v1=phi(v1) v1=phi(v1 v1') <= add missing g2e argument v1' +// +// Since LCSSA ensures that all loop uses are closed, i.e. any out-of-loop uses +// are replaced by proxy phis in loop exit, we only need to add missing argument +// v1' to v1 proxy phi +func (loop *loop) addG2EArg(fn *Func, sdom SparseTree) { + var holder *Block + for _, val := range loop.exit.Values { + // Not even a phi? + if val.Op != OpPhi { + continue + } + // Num of args already satisfies the num of predecessors of loop exit? + if len(val.Args) == len(loop.exit.Preds) { + continue + } + if len(val.Args)+1 != len(loop.exit.Preds) { + fn.Fatalf("Only miss one g2e arg") + } + assert(val.Block == loop.exit, "sanity check") + + // If arguments of the phi is not matched with predecessors of loop exit, + // then add corresponding g2e argument to reflect the new edge from loop + // guard to loop exit + var g2eArg *Value // loop guard to loop exit + for iarg, arg := range val.Args { + exitPred := val.Block.Preds[iarg].b + // If this predecessor is either loop header or inserted block? + if exitPred == loop.latch || exitPred == holder { + if sdom.isAncestor(arg.Block, loop.header) { + // arg lives in block that dominates loop header, it could + // be used as g2eArg directly + g2eArg = arg + } else if arg.Block == loop.header { + // arg lives in loop header, find its counterpart from loop + // guard or create a new one if not exist + guardIdx := loop.findLoopGuardIndex() + + // It's a phi? Simply use its incoming argument that comes + // from loop guard as g2eArg + if arg.Op == OpPhi { + g2eArg = arg.Args[guardIdx] + } else { + // Otherwise, split critical edge from loop guard to exit + // and clone arg into new block, it becomes new g2eArg + holder = insertBetween(fn, loop.guard, loop.exit) + guardArg, _ := loop.cloneTrivial(arg, holder, guardIdx) + g2eArg = guardArg + } + } + } + } + + // Add g2e argument for phi to reconcile the divergence between the num + // of block predecessors and the num of phi arguments + if g2eArg == nil { + fn.Fatalf("Can not create new g2e arg for %v", val.LongString()) + } + newArgs := make([]*Value, len(loop.exit.Preds)) + copy(newArgs, val.Args) + newArgs[len(newArgs)-1] = g2eArg + oldVal := val.LongString() + val.resetArgs() + val.AddArgs(newArgs...) + if fn.pass.debug > 1 { + fmt.Printf("== Add g2e argument %v to %v(%v)\n", + g2eArg, val.LongString(), oldVal) + } + } +} + +func (loop *loop) findGuardArg(fn *Func, val *Value) *Value { + assert(val.Block == loop.header, "mirror comes from loop header") + guardIdx := loop.findLoopGuardIndex() + + // It's a phi? Simply use its incoming argument that comes from loop guard + // as counterpart + if val.Op == OpPhi { + return val.Args[guardIdx] + } + + // Otherwise, split critical edge from loop guard to loop exit and + // clone arg into the new block, this is the new counterpart + holder := insertBetween(fn, loop.guard, loop.exit) + guardArg, _ := loop.cloneTrivial(val, holder, guardIdx) + return guardArg +} + +func (loop *loop) findBackedgeArg(fn *Func, val *Value, start, end *Block) *Value { + assert(val.Block == loop.header, "mirror comes from loop header") + backedgeIdx := loop.findLoopBackedgeIndex() + + // It's a phi? 
Simply use its incoming argument that comes from loop latch + // as counterpart + if val.Op == OpPhi { + return val.Args[backedgeIdx] + } + + // Otherwise, split edge from start to end and clone arg into the new block, + // this is the new counterpart + holder := insertBetween(fn, start, end) + backedgeArg, _ := loop.cloneTrivial(val, holder, backedgeIdx) + return backedgeArg +} + +// Loop latch now terminates the loop. If proxy phi uses the loop phi that lives +// in loop header, it should be replaced by using the loop phi's incoming argument +// which comes from loop latch instead, this avoids losing one update. +// +// Before: +// loop header: +// v1 = phi(0, v4) +// +// loop latch: +// v4 = v1 + 1 +// +// loop exit +// v3 = phi(v1, ...) +// +// After: +// loop header: +// v1 = phi(0, v4) +// +// loop latch: +// v4 = v1 + 1 +// +// loop exit +// v3 = phi(v4, ...) ;; use v4 instead of v1 +func (loop *loop) updateLoopUse(fn *Func) { + fn.invalidateCFG() + sdom := fn.Sdom() + + for _, loopExit := range loop.exits { + // The loop exit is still dominated by loop header? + if sdom.isAncestor(loop.header, loopExit) { + continue + } + // Loop header no longer dominates this loop exit, find the corresponding + // incoming argument and update it for every phi in exit block + for _, val := range loopExit.Values { + if val.Op != OpPhi { + continue + } + + sdom := fn.Sdom() + loopExit := val.Block + for iarg, arg := range val.Args { + // Only arg lives in the loop header is of interest + if arg.Block != loop.header { + continue + } + // See if corresponding predecessor was not dominated by loop + // header, if so, use corresponding argument to avoid losing one + exitPred := loopExit.Preds[iarg].b + if !sdom.isAncestor(loop.header, exitPred) { + newArg := loop.findGuardArg(fn, arg) + val.SetArg(iarg, newArg) + if fn.pass.debug > 1 { + fmt.Printf("== Update guard arg %v\n", val.LongString()) + } + continue + } + + // If the predecessor of loop exit was dominated by loop latch, + // use corresponding argument to avoid losing one update + if sdom.IsAncestorEq(loop.latch, exitPred) { + newArg := loop.findBackedgeArg(fn, arg, exitPred, loopExit) + val.SetArg(iarg, newArg) + if fn.pass.debug > 1 { + fmt.Printf("== Update backedge arg %v\n", val.LongString()) + } + continue + } + } + } + } +} + +// If the loop conditional test is "trivial", we will move the chain of this +// conditional test values to the loop latch, after that, they may not dominate +// the in-loop uses anymore: +// +// loop header +// v1 = phi(0, ...) +// v2 = v1 + 1 +// If v2 < 3 ... +// +// loop body: +// v4 = v2 - 1 +// +// So we need to create a new phi v5 at the loop header to merge the control flow +// from the loop guard to the loop header and the loop latch to the loop header +// and use this phi to replace the in-loop use v4. e.g. +// +// loop header: +// v1 = phi(0, ...) +// v5 = phi(v2', v2) ;;; v2' lives in loop guard +// +// loop body: +// v4 = v5 - 1 +// +// loop latch: +// v2 = v1 + 1 +// If v2 < 3 ... 
+func (loop *loop) updateMovedUses(fn *Func, cloned map[*Value]*Value) { + // Find all moved values and sort them in order to ensure determinism + moved := make([]*Value, 0) + for key, _ := range cloned { + moved = append(moved, key) + } + sort.SliceStable(moved, func(i, j int) bool { + return moved[i].ID < moved[j].ID + }) + + // One def may have multiple uses, all of these uses should be replaced by + // the same def replacement + replacement := make(map[*Value]*Value) + // For each of moved value, find its uses inside loop + defUses := buildDefUses(fn, moved) + for _, def := range moved { + uses := defUses[def] + if def.Uses == 1 { + assert(uses[0].useBlock() == loop.latch, "used by another moved val") + continue + } + // For each use of def, if it is not one of the moved values or loop phi + // in loop header, replace it with inserted Phi + for _, use := range uses { + // Used by other moved value or by loop phi in header? Skip them as + // they are not needed to update + if use.val != nil { + if _, exist := cloned[use.val]; exist { + continue + } + if use.val.Op == OpPhi && use.val.Block == loop.header { + continue + } + } else { + if _, exist := cloned[use.block.ControlValues()[0]]; exist { + continue + } + } + // Since LCSSA ensures that all uses of loop defined values are in + // loop we can safely do replacement then + // TODO: Add verification here to check if it does lives inside loop + + // Create phi at loop header, merge control flow from loop guard and + // loop latch, and replace use with such phi. If phi already exists, + // use it instead of creating a new one. + var newUse *Value + if phi, exist := replacement[def]; exist { + newUse = phi + } else { + phi := fn.newValueNoBlock(OpPhi, def.Type, def.Pos) + // Merge control flow from loop guard and loop latch + arg1 := cloned[def] + arg2 := def + if arg1.Block != loop.guard { + fn.Fatalf("arg1 must be live in loop guard") + } + if arg2.Block != loop.latch { + fn.Fatalf("arg2 must be live in loop latch") + } + phi.AddArg2(arg1, arg2) + loop.header.placeValue(phi) + replacement[def] = phi + newUse = phi + } + if fn.pass.debug > 1 { + fmt.Printf("== Update moved use %v %v\n", use, newUse.LongString()) + } + use.replaceUse(newUse) + } + } +} + +// verifyRotatedForm verifies if given loop is rotated form +func (loop *loop) verifyRotatedForm(fn *Func) { + if len(loop.header.Succs) != 1 || len(loop.exit.Preds) < 2 || + len(loop.latch.Succs) != 2 || len(loop.guard.Succs) != 2 { + fn.Fatalf("Bad loop %v after rotation", loop.LongString()) + } +} + +// IsRotatedForm returns true if loop is rotated +func (loop *loop) IsRotatedForm() bool { + if loop.guard == nil { + return false + } + return true +} + +// CreateLoopLand creates a land block between loop guard and loop header, it +// executes only if entering loop. +func (loop *loop) CreateLoopLand(fn *Func) bool { + if !loop.IsRotatedForm() { + return false + } + if loop.land != nil { + return true + } + + // loopGuard -> loopLand + // loopLand -> loopHeader + loop.land = insertBetween(fn, loop.guard, loop.header) + + return true +} + +// RotateLoop rotates the original loop to become a do-while style loop, returns +// true if loop is rotated, false otherwise. 
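+//
+// As a source-level sketch (purely illustrative, mirroring the comments in
+// looprotate_test.go), rotation turns
+//
+//	for i := 0; i < n; i++ { body }
+//
+// into
+//
+//	if 0 < n {
+//		i := 0
+//		do {
+//			body
+//			i++
+//		} while i < n
+//	}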
+func (fn *Func) RotateLoop(loop *loop) bool { + if loop.IsRotatedForm() { + return true + } + + // Check loop form and bail out if failure + sdom := fn.Sdom() + if msg := loop.checkLoopForm(fn, sdom); msg != "" { + if fn.pass.debug > 0 { + fmt.Printf("Exotic %v for rotation: %s %v\n", loop.LongString(), msg, fn.Name) + } + return false + } + + exitIdx := 1 // which successor of loop header wires to loop exit + if loop.header.Succs[0].b == loop.exit { + exitIdx = 0 + } + + assert(len(loop.header.ControlValues()) == 1, "more than 1 ctrl value") + cond := loop.header.Controls[0] + + // Rewire loop header to loop body unconditionally + loop.rewireLoopHeader() + + // Rewire loop latch to header and exit based on new conditional test + loop.rewireLoopLatch(cond, exitIdx) + + // Create loop guard block + // TODO(yyang): Creation of loop guard can be skipped if original IR already + // exists such form. e.g. if 0 < len(b) { for i := 0; i < len(b); i++ {...} } + loopGuard := fn.NewBlock(BlockIf) + loop.guard = loopGuard + + // Rewire entry to loop guard instead of original loop header + loop.rewireLoopEntry(loopGuard) + + // Clone old conditional test and its arguments to control loop guard + guardCond, cloned := loop.cloneCond(cond) + + // Rewire loop guard to original loop header and loop exit + loop.rewireLoopGuard(guardCond, exitIdx) + + // CFG changes are all done here, then update data dependencies accordingly + + // Move conditional test from loop header to loop latch + loop.moveCond(cond, cloned) + + // Update uses of moved Values because these defs no longer dominates uses + // after they were moved to loop latch + loop.updateMovedUses(fn, cloned) + + // Add corresponding argument for phis at loop exits since new edge from + // loop guard to loop exit had been created + loop.addG2EArg(fn, sdom) + + // Update proxy phi to use the loop phi's incoming argument which comes from + // loop latch since loop latch may terminate the loop now + loop.updateLoopUse(fn) + + // Gosh, loop is rotated + loop.verifyRotatedForm(fn) + + if fn.pass.debug > 0 { + fmt.Printf("%v rotated in %v\n", loop.LongString(), fn.Name) + } + fn.invalidateCFG() + return true +} + +func moveBlock(slice []*Block, from, to int) []*Block { + if from < 0 || to < 0 || from >= len(slice) || to >= len(slice) { + return slice + } + + elem := slice[from] + if from < to { + copy(slice[from:], slice[from+1:to+1]) + } else { + copy(slice[to+1:], slice[to:from]) + } + + slice[to] = elem + return slice +} + +// layoutLoop converts loops with a check-loop-condition-at-beginning +// to loops with a check-loop-condition-at-end by reordering blocks. no +// CFG changes here. This helps loops avoid extra unnecessary jumps. // // loop: // CMPQ ... @@ -21,7 +1120,7 @@ package ssa // entry: // CMPQ ... // JLT loop -func loopRotate(f *Func) { +func layoutLoop(f *Func) { loopnest := f.loopnest() if loopnest.hasIrreducible { return @@ -30,84 +1129,36 @@ func loopRotate(f *Func) { return } - idToIdx := f.Cache.allocIntSlice(f.NumBlocks()) - defer f.Cache.freeIntSlice(idToIdx) - for i, b := range f.Blocks { - idToIdx[b.ID] = i - } - - // Set of blocks we're moving, by ID. - move := map[ID]struct{}{} - - // Map from block ID to the moving blocks that should - // come right after it. - after := map[ID][]*Block{} - - // Check each loop header and decide if we want to move it. 
for _, loop := range loopnest.loops { - b := loop.header - var p *Block // b's in-loop predecessor - for _, e := range b.Preds { + header := loop.header + // If loop rotation is already applied, loop latch should be right after + // all loop body blocks + if header.Kind == BlockPlain && len(header.Succs) == 1 { + continue + } + // Otherwise, place loop header right after all body blocks + var latch *Block // b's in-loop predecessor + for _, e := range header.Preds { if e.b.Kind != BlockPlain { continue } if loopnest.b2l[e.b.ID] != loop { continue } - p = e.b + latch = e.b } - if p == nil || p == b { + if latch == nil || latch == header { continue } - after[p.ID] = []*Block{b} - for { - nextIdx := idToIdx[b.ID] + 1 - if nextIdx >= len(f.Blocks) { // reached end of function (maybe impossible?) - break - } - nextb := f.Blocks[nextIdx] - if nextb == p { // original loop predecessor is next - break - } - if loopnest.b2l[nextb.ID] == loop { - after[p.ID] = append(after[p.ID], nextb) + iheader, ilatch := 0, 0 + for ib, b := range f.Blocks { + if b == header { + iheader = ib + } else if b == latch { + ilatch = ib } - b = nextb - } - // Swap b and p so that we'll handle p before b when moving blocks. - f.Blocks[idToIdx[loop.header.ID]] = p - f.Blocks[idToIdx[p.ID]] = loop.header - idToIdx[loop.header.ID], idToIdx[p.ID] = idToIdx[p.ID], idToIdx[loop.header.ID] - - // Place b after p. - for _, b := range after[p.ID] { - move[b.ID] = struct{}{} - } - } - - // Move blocks to their destinations in a single pass. - // We rely here on the fact that loop headers must come - // before the rest of the loop. And that relies on the - // fact that we only identify reducible loops. - j := 0 - // Some blocks that are not part of a loop may be placed - // between loop blocks. In order to avoid these blocks from - // being overwritten, use a temporary slice. - oldOrder := f.Cache.allocBlockSlice(len(f.Blocks)) - defer f.Cache.freeBlockSlice(oldOrder) - copy(oldOrder, f.Blocks) - for _, b := range oldOrder { - if _, ok := move[b.ID]; ok { - continue - } - f.Blocks[j] = b - j++ - for _, a := range after[b.ID] { - f.Blocks[j] = a - j++ } - } - if j != len(oldOrder) { - f.Fatalf("bad reordering in looprotate") + // Reordering the loop blocks from [header,body,latch] to [latch,body,header] + f.Blocks = moveBlock(f.Blocks, iheader, ilatch) } } diff --git a/src/cmd/compile/internal/ssa/looprotate_test.go b/src/cmd/compile/internal/ssa/looprotate_test.go new file mode 100644 index 00000000000000..8004b76740aa4d --- /dev/null +++ b/src/cmd/compile/internal/ssa/looprotate_test.go @@ -0,0 +1,689 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
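+
+// This file exercises RotateLoop on hand-constructed SSA: each test below builds
+// a small CFG with c.Fun, optionally puts it into loop closed SSA form first,
+// rotates every loop, and then checks the resulting CFG shape and value counts.
+// The comment on each test sketches the source-level loop being modeled.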
+ +package ssa + +import ( + "cmd/compile/internal/types" + "testing" +) + +func doLoopRotation(fun fun) bool { + CheckFunc(fun.f) + f := fun.f + loopnest := f.loopnest() + loopnest.assembleChildren() + loopnest.findExits() + for _, loop := range loopnest.loops { + if !f.RotateLoop(loop) { + return false + } + CheckFunc(fun.f) + } + return true +} + +func doLoopRotationWithLCSSSA(fun fun) bool { + CheckFunc(fun.f) + f := fun.f + loopnest := f.loopnest() + loopnest.assembleChildren() + loopnest.findExits() + for _, loop := range loopnest.loops { + if !f.BuildLoopClosedForm(loopnest, loop) { + panic("Failed to build loop closed form") + } + } + + for _, loop := range loopnest.loops { + if !f.RotateLoop(loop) { + return false + } + CheckFunc(fun.f) + } + return true +} + +func verifyRotatedCFG(fun fun, t *testing.T) { + // CFG is correctly wired? + cfg := map[string][]string{ + "loopHeader": {"loopLatch", "loopBody"}, + "loopLatch": {"loopHeader", "loopExit"}, + "loopBody": {"loopLatch"}, + } + for k, succs := range cfg { + for _, b := range fun.f.Blocks { + if fun.blocks[k] == b { + for _, succ := range succs { + succb := fun.blocks[succ] + found := false + for _, s := range b.Succs { + if s.b == succb { + found = true + break + } + } + if !found { + t.Fatalf("Illegal CFG") + } + } + } + break + } + } +} + +func verifyNumValue(fun fun, t *testing.T, expectedOp Op, expectedNum int) { + // Data flow is correctly set up? + num := 0 + for _, b := range fun.f.Blocks { + for _, val := range b.Values { + if val.Op == expectedOp { + num++ + } + } + } + if num != expectedNum { + t.Fatalf("unexpected num of operation %v", expectedOp) + } +} + +// The original loop looks like in below form +// +// for i := 0; i < 10; i++ { +// } +// +// After loop rotation, it should be like below +// +// if 0 < 10 { +// i := 0 +// do { +// i++ +// } while i < 10 +// } +// +// Loop defs are not used outside the loop, so simply performing loop rotation +// w/o LCSSA is okay. +func TestSimpleLoop(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("ten", OpConst64, c.config.Types.Int64, 10, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "ten"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Goto("loopHeader")), + Bloc("loopExit", + Exit("mem"))) + + if !doLoopRotation(fun) { + t.Fatal("Loop rotation failed") + } + + verifyRotatedCFG(fun, t) + + // one lives in loop latch and one lives in loop guard + verifyNumValue(fun, t, OpLess64, 2) +} + +// Loop header contains Values that may takes side effects and it was used by +// condiitonal test. +// +// for i := 0; i < *load; i++ { +// } +// +// After loop rotation, it should be like below +// +// if 0 < *load { +// i := 0 +// do { +// i+=*load +// } while *load < 10 +// } +// +// Loop defs are not used outside the loop, so simply performing loop rotation +// w/o LCSSA is okay. 
+func TestComplexLoop(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("sb", OpSB, c.config.Types.Uintptr, 0, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("addr", OpAddr, c.config.Types.Int64.PtrTo(), 0, nil, "sb"), + Valu("store", OpStore, types.TypeMem, 0, nil, "addr", "one", "mem"), + Valu("load", OpLoad, c.config.Types.Int64, 0, nil, "addr", "store"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "load"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "load", "i"), + Goto("loopHeader")), + Bloc("loopExit", + Exit("mem"))) + + if !doLoopRotation(fun) { + t.Fatal("Loop rotation failed") + } + + verifyRotatedCFG(fun, t) + + // one lives in loop latch and one lives in loop guard + verifyNumValue(fun, t, OpLess64, 2) + verifyNumValue(fun, t, OpLoad, 2) + verifyNumValue(fun, t, OpAddr, 2) + verifyNumValue(fun, t, OpStore, 2) +} + +// Similiar to TestSimpleLoop, but control value is not live in loop header +// +// i := 0 +// cmp := i < 10 +// for ; cmp; i++ { +// } +// +// After loop rotation, it should be like below +// +// i := 0 +// cmp := i < 10 +// if cmp { +// i := 0 +// do { +// i++ +// } while cmp +// } +// +// Loop defs are not used outside the loop, so simply performing loop rotation +// w/o LCSSA is okay. +func TestSimpleLoopCtrlElsewhere(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("ten", OpConst64, c.config.Types.Int64, 10, nil), + Valu("i", OpConst64, c.config.Types.Int64, 0, nil), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "ten"), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("phi", OpPhi, c.config.Types.Int64, 0, nil, "i", "inc"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "phi"), + Goto("loopHeader")), + Bloc("loopExit", + Exit("mem"))) + + if !doLoopRotation(fun) { + t.Fatal("Loop rotation failed") + } + + verifyRotatedCFG(fun, t) + + // no copy, no clone + verifyNumValue(fun, t, OpLess64, 1) + verifyNumValue(fun, t, OpPhi, 1) +} + +// Even more harder, Values in loop header have cyclic dependencies, i.e. +// +// loop header: +// +// v1 = phi(.., v3) +// v3 = add(v1, 1) +// If v3 < 10, ... 
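+//
+// After rotation the cycle is expected to be broken by a Copy placed in the
+// loop latch (the assertions below check for exactly one OpCopy there), matching
+// the scheme described in the update() comments in looprotate.go.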
+func TestCondCyclicLoop(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("ten", OpConst64, c.config.Types.Int64, 10, nil), + Goto("loopHeader")), + Bloc("loopHeader", + // cyclic dependency in loop header + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "inc", "ten"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Goto("loopHeader")), + Bloc("loopExit", + Exit("mem"))) + + if !doLoopRotation(fun) { + t.Fatal("Loop rotation failed") + } + + verifyRotatedCFG(fun, t) + + // one lives in loop latch and one lives in loop guard + verifyNumValue(fun, t, OpLess64, 2) + verifyNumValue(fun, t, OpCopy, 1) + verifyNumValue(fun, t, OpPhi, 2) + + for _, b := range fun.f.Blocks { + for _, val := range b.Values { + switch val.Op { + case OpCopy: + if val.Block != fun.blocks["loopLatch"] { + t.Fatalf("copy must be in loop latch") + } + } + } + } +} + +// Cyclic dependencies may appear during updating +// +// loop header: +// v1 = phi(.., v4) +// v3 = add(v1, 1) +// If v3 < 10, ... +// +// loop latch: +// v4 = add(v3, 1) +func TestNewCyclicLoop(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("ten", OpConst64, c.config.Types.Int64, 10, nil), + Goto("loopHeader")), + Bloc("loopHeader", + // cyclic dependency in loop header + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc2"), + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "inc", "ten"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Valu("inc2", OpAdd64, c.config.Types.Int64, 0, nil, "one", "inc"), + Goto("loopHeader")), + Bloc("loopExit", + Exit("mem"))) + + if !doLoopRotation(fun) { + t.Fatal("Loop rotation failed") + } + + verifyRotatedCFG(fun, t) + + // one lives in loop latch and one lives in loop guard + verifyNumValue(fun, t, OpLess64, 2) + // no copy because inc2 explicitly uses inc + verifyNumValue(fun, t, OpPhi, 2) +} + +// Use loop phi outside the loop, this requires LCSSA, which creates proxy phi +// and use such phi outside the loop. +// +// if 0 < 10 { +// i := 0 +// do { +// i++ +// } while i < 10 +// use := i * 10 +// } +func TestOutsideLoopUses(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("ten", OpConst64, c.config.Types.Int64, 10, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "ten"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Goto("loopHeader")), + Bloc("loopExit", + Valu("use", OpMul64, c.config.Types.Int64, 0, nil, "i", "ten"), + Goto("exit")), + Bloc("exit", + Exit("mem"))) + + // doLoopRotation fails because loop phi is used outside the loop. 
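+	// Build loop closed SSA form first so the out-of-loop use is routed through
+	// a proxy phi, then rotate.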
+ if !doLoopRotationWithLCSSSA(fun) { + t.Fatal("Loop rotation failed") + } + + verifyRotatedCFG(fun, t) + + // one lives in loop latch and one lives in loop guard + verifyNumValue(fun, t, OpLess64, 2) + + loopExit := fun.blocks["loopExit"] + for _, val := range loopExit.Values { + if val.Op == OpPhi { + if len(val.Args) != len(loopExit.Preds) { + t.Fatalf("num of phi arguments mismatched with num of predecessors") + } + if 1 != val.Uses { + t.Fatalf("proxy phi must be used by p") + } + for _, arg := range val.Args { + switch arg.Op { + case OpConst64, OpAdd64: + default: + t.Fatalf("proxy phi must have only constants and add operands") + } + } + } + } +} + +// Ditto, but the loop phi has cyclic dependencies. +func TestPhiCondCyclicLoop(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("true", OpConstBool, c.config.Types.Bool, 1, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("phi", OpPhi, c.config.Types.Bool, 0, nil, "true", "false"), + Valu("false", OpConstBool, c.config.Types.Bool, 0, nil), + If("phi", "loopLatch", "loopExit")), + Bloc("loopLatch", + Goto("loopHeader")), + Bloc("loopExit", + Exit("mem"))) + + if !doLoopRotation(fun) { + t.Fatal("Loop rotation failed") + } + + verifyRotatedCFG(fun, t) + // phi will not copy to loop guard, so only one phi exists + verifyNumValue(fun, t, OpPhi, 1) +} + +// Loop has multiple exits +// +// for i := 0; i < 10; i++ { +// if i == 1 { +// return +// } +// } +// +// After loop rotation, it should be like below +// +// if 0 < 10 { +// i := 0 +// do { +// if i == 1 { +// return +// } +// i++ +// } while i < 10 +// } +// +// Loop defs are not used outside the loop, so simply performing loop rotation +// w/o LCSSA is okay. +func TestMultiExitLoop(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("ten", OpConst64, c.config.Types.Int64, 10, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "ten"), + If("cmp", "loopBody", "loopExit")), + Bloc("loopBody", + Valu("cmp2", OpEq64, c.config.Types.Bool, 0, nil, "i", "one"), + If("cmp2", "loopExit1", "loopLatch")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Goto("loopHeader")), + Bloc("loopExit1", + Exit("mem")), + Bloc("loopExit", + Exit("mem"))) + + if !doLoopRotation(fun) { + t.Fatal("Loop rotation failed") + } + + verifyRotatedCFG(fun, t) + + // one lives in loop latch and one lives in loop guard + verifyNumValue(fun, t, OpLess64, 2) +} + +// Loop contains multiple exits, and every loop exit block contians at least one +// use that uses loop phi. 
+// +// if 0 < 10 { +// i := 0 +// do { +// if i == 1 { +// use1 = i * 10 +// return +// } +// i++ +// } while i < 10 +// } +// use2 = i * 10 +func TestMultiExitLoopUses(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("ten", OpConst64, c.config.Types.Int64, 10, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "ten"), + If("cmp", "loopBody", "loopExit")), + Bloc("loopBody", + Valu("cmp2", OpEq64, c.config.Types.Bool, 0, nil, "i", "one"), + If("cmp2", "loopExit1", "loopLatch")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Goto("loopHeader")), + Bloc("loopExit1", + Valu("use1", OpMul64, c.config.Types.Int64, 0, nil, "i", "ten"), + Exit("mem")), + Bloc("loopExit", + Valu("use2", OpMul64, c.config.Types.Int64, 0, nil, "i", "ten"), + Exit("mem"))) + + if !doLoopRotationWithLCSSSA(fun) { + t.Fatal("Loop rotation failed") + } + + verifyRotatedCFG(fun, t) + + // one lives in loop latch and one lives in loop guard + verifyNumValue(fun, t, OpLess64, 2) + verifyNumValue(fun, t, OpPhi, 1 /*var i*/ +2 /*proxy phi*/) +} + +// Even harder, Values defined in loop header are used everywhere. +func TestMultiExitLoopUsesEverywhere(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("ten", OpConst64, c.config.Types.Int64, 10, nil), + Valu("sb", OpSB, c.config.Types.Uintptr, 0, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("addr", OpAddr, c.config.Types.Int64.PtrTo(), 0, nil, "sb"), + Valu("load", OpLoad, c.config.Types.Int64, 0, nil, "addr", "mem"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "load"), + If("cmp", "loopBody", "loopExit")), + Bloc("loopBody", + Valu("use3", OpMul64, c.config.Types.Int64, 0, nil, "i", "load"), + Valu("cmp2", OpEq64, c.config.Types.Bool, 0, nil, "i", "one"), + If("cmp2", "loopExit1", "loopLatch")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Goto("loopHeader")), + Bloc("loopExit1", + Valu("use1", OpMul64, c.config.Types.Int64, 0, nil, "i", "load"), + Exit("mem")), + Bloc("loopExit", + Valu("use2", OpMul64, c.config.Types.Int64, 0, nil, "i", "load"), + Exit("mem"))) + + if !doLoopRotationWithLCSSSA(fun) { + t.Fatal("Loop rotation failed") + } + + verifyRotatedCFG(fun, t) + + // one lives in loop latch and one lives in loop guard + verifyNumValue(fun, t, OpLess64, 2) + verifyNumValue(fun, t, OpLoad, 2) + numOfPhi := 2 /*two proxy phi in exit1*/ + 2 /*two proxy phi in exit*/ + + 2 /*i and inserted phi for load*/ + verifyNumValue(fun, t, OpPhi, numOfPhi) +} + +// Rotation the Loop inclduing nesting children +func TestNestLoopRotation(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("ten", OpConst64, c.config.Types.Int64, 10, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", 
OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "ten"), + If("cmp", "loopHeader2", "loopExit")), + Bloc("loopHeader2", + Valu("k", OpPhi, c.config.Types.Int64, 0, nil, "i", "inc2"), + Valu("cmp2", OpEq64, c.config.Types.Bool, 0, nil, "k", "one"), + If("cmp2", "loopLatch2", "loopLatch")), + Bloc("loopLatch2", + Valu("inc2", OpAdd64, c.config.Types.Int64, 0, nil, "one", "k"), + Goto("loopHeader2")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Goto("loopHeader")), + Bloc("loopExit", + Exit("mem"))) + + if !doLoopRotationWithLCSSSA(fun) { + t.Fatal("Loop rotation failed") + } +} + +// Store is defined in loop header and used outside the loop indirectly. +func TestBadLoop(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("sb", OpSB, c.config.Types.Uintptr, 0, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("addr", OpAddr, c.config.Types.Int64.PtrTo(), 0, nil, "sb"), + Valu("store", OpStore, types.TypeMem, 0, nil, "addr", "one", "mem"), + Valu("load", OpLoad, c.config.Types.Int64, 0, nil, "addr", "store"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "load"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "load", "i"), + Goto("loopHeader")), + Bloc("loopExit", + Valu("use", OpMul64, c.config.Types.Int64, 0, nil, "load", "one"), + Exit("mem"))) + + if doLoopRotationWithLCSSSA(fun) != false { + t.Fatal("Loop rotation is expected to fail") + } +} + +// Loop def is non trivial because it excesses max depth +func TestBadLoop2(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("sb", OpSB, c.config.Types.Uintptr, 0, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("addr", OpAddr, c.config.Types.Int64.PtrTo(), 0, nil, "sb"), + Valu("load", OpLoad, c.config.Types.Int64, 0, nil, "addr", "mem"), + Valu("depth5", OpAdd64, c.config.Types.Int64, 0, nil, "one", "load"), + Valu("depth4", OpAdd64, c.config.Types.Int64, 0, nil, "one", "depth5"), + Valu("depth3", OpAdd64, c.config.Types.Int64, 0, nil, "one", "depth4"), + Valu("depth2", OpAdd64, c.config.Types.Int64, 0, nil, "one", "depth3"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "depth2"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "load", "i"), + Goto("loopHeader")), + Bloc("loopExit", + Valu("use", OpMul64, c.config.Types.Int64, 0, nil, "load", "one"), + Exit("mem"))) + + if doLoopRotationWithLCSSSA(fun) != false { + t.Fatal("Loop rotation is expected to fail") + } +} + +// Loop header has multiple entries +func TestBadLoop3(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry1", + Bloc("loopEntry1", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "zero", "one"), + If("cmp", 
"loopHeader", "loopEntry2")), + Bloc("loopEntry2", + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "one", "inc"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Goto("loopHeader")), + Bloc("loopExit", + Exit("mem"))) + + if doLoopRotationWithLCSSSA(fun) != false { + t.Fatal("Loop rotation is expected to fail") + } +} diff --git a/src/cmd/compile/internal/ssa/phielim.go b/src/cmd/compile/internal/ssa/phielim.go index 4fc942375fdef3..cadea4eeabfeba 100644 --- a/src/cmd/compile/internal/ssa/phielim.go +++ b/src/cmd/compile/internal/ssa/phielim.go @@ -35,6 +35,8 @@ func phielim(f *Func) { break } } + + copyelim(f) } // phielimValue tries to convert the phi v to a copy. diff --git a/src/cmd/compile/internal/ssa/regalloc.go b/src/cmd/compile/internal/ssa/regalloc.go index 2325b9ee458412..676093756cb2bf 100644 --- a/src/cmd/compile/internal/ssa/regalloc.go +++ b/src/cmd/compile/internal/ssa/regalloc.go @@ -1242,12 +1242,17 @@ func (s *regAllocState) regalloc(f *Func) { // we get the right behavior for a block which branches to itself. for _, e := range b.Succs { succ := e.b + pidx := e.i + if succ.Kind == BlockPlain && len(succ.Values) == 0 { + ne := succ.Succs[0] + succ = ne.b + pidx = ne.i + } // TODO: prioritize likely successor? for _, x := range s.startRegs[succ.ID] { desired.add(x.v.ID, x.r) } // Process phi ops in succ. - pidx := e.i for _, v := range succ.Values { if v.Op != OpPhi { break diff --git a/src/cmd/compile/internal/ssa/schedule.go b/src/cmd/compile/internal/ssa/schedule.go index fb38f40d63ab11..a0f9defee5325f 100644 --- a/src/cmd/compile/internal/ssa/schedule.go +++ b/src/cmd/compile/internal/ssa/schedule.go @@ -11,6 +11,7 @@ import ( "sort" ) +// Larger numbers are scheduled closer to the end of the block. const ( ScorePhi = iota // towards top of block ScoreArg // must occur at the top of the entry block @@ -204,6 +205,13 @@ func schedule(f *Func) { continue } score[c.ID] = ScoreControl + // schedule arguments of control values closer if they are defined + // in the same block and not compute score yet. + for _, arg := range c.Args { + if arg.Block == b && score[arg.ID] == ScoreDefault { + score[arg.ID] = ScoreControl - 1 + } + } } } priq.score = score @@ -390,6 +398,9 @@ func storeOrder(values []*Value, sset *sparseSet, storeNumber []int32) []*Value hasNilCheck := false sset.clear() // sset is the set of stores that are used in other values for _, v := range values { + if v.Op == OpInvalid { + continue + } if v.Type.IsMemory() { stores = append(stores, v) if v.Op == OpInitMem || v.Op == OpPhi { diff --git a/src/cmd/compile/internal/ssa/tighten.go b/src/cmd/compile/internal/ssa/tighten.go index 85b6a84cc3f426..6a5a2f32c46761 100644 --- a/src/cmd/compile/internal/ssa/tighten.go +++ b/src/cmd/compile/internal/ssa/tighten.go @@ -23,11 +23,7 @@ func tighten(f *Func) { defer f.Cache.freeBoolSlice(canMove) // Compute the memory states of each block. - startMem := f.Cache.allocValueSlice(f.NumBlocks()) - defer f.Cache.freeValueSlice(startMem) - endMem := f.Cache.allocValueSlice(f.NumBlocks()) - defer f.Cache.freeValueSlice(endMem) - memState(f, startMem, endMem) + startMem, _ := memState(f) for _, b := range f.Blocks { for _, v := range b.Values { @@ -214,7 +210,9 @@ func phiTighten(f *Func) { // 3. The algorithm first obtains the memory state of some blocks in the tree // in the first step. 
Then floods the known memory state to other nodes in // the second step. -func memState(f *Func, startMem, endMem []*Value) { +func memState(f *Func) ([]*Value, []*Value) { + startMem := make([]*Value, f.NumBlocks()) + endMem := make([]*Value, f.NumBlocks()) // This slice contains the set of blocks that have had their startMem set but this // startMem value has not yet been propagated to the endMem of its predecessors changed := make([]*Block, 0) @@ -266,4 +264,5 @@ func memState(f *Func, startMem, endMem []*Value) { } } } + return startMem, endMem } diff --git a/src/cmd/compile/internal/ssa/value.go b/src/cmd/compile/internal/ssa/value.go index 4eaab40354c171..9ab7e554eb18fe 100644 --- a/src/cmd/compile/internal/ssa/value.go +++ b/src/cmd/compile/internal/ssa/value.go @@ -105,6 +105,13 @@ func (v *Value) AuxInt32() int32 { return int32(v.AuxInt) } +func (v *Value) AuxInt64() int64 { + if opcodeTable[v.Op].auxType != auxInt64 { + v.Fatalf("op %s doesn't have an int64 aux field", v.Op) + } + return int64(v.AuxInt) +} + // AuxUnsigned returns v.AuxInt as an unsigned value for OpConst*. // v.AuxInt is always sign-extended to 64 bits, even if the // represented value is unsigned. This undoes that sign extension. diff --git a/src/internal/goexperiment/exp_loopopts_off.go b/src/internal/goexperiment/exp_loopopts_off.go new file mode 100644 index 00000000000000..fd9018132e752c --- /dev/null +++ b/src/internal/goexperiment/exp_loopopts_off.go @@ -0,0 +1,8 @@ +// Code generated by mkconsts.go. DO NOT EDIT. + +//go:build !goexperiment.loopopts + +package goexperiment + +const LoopOpts = false +const LoopOptsInt = 0 diff --git a/src/internal/goexperiment/exp_loopopts_on.go b/src/internal/goexperiment/exp_loopopts_on.go new file mode 100644 index 00000000000000..d372fd33ba564c --- /dev/null +++ b/src/internal/goexperiment/exp_loopopts_on.go @@ -0,0 +1,8 @@ +// Code generated by mkconsts.go. DO NOT EDIT. + +//go:build goexperiment.loopopts + +package goexperiment + +const LoopOpts = true +const LoopOptsInt = 1 diff --git a/src/internal/goexperiment/exp_range_off.go b/src/internal/goexperiment/exp_range_off.go new file mode 100644 index 00000000000000..82f5dc71b28320 --- /dev/null +++ b/src/internal/goexperiment/exp_range_off.go @@ -0,0 +1,8 @@ +// Code generated by mkconsts.go. DO NOT EDIT. + +//go:build !goexperiment.range + +package goexperiment + +const Range = false +const RangeInt = 0 diff --git a/src/internal/goexperiment/exp_range_on.go b/src/internal/goexperiment/exp_range_on.go new file mode 100644 index 00000000000000..1d0f30f49f83ed --- /dev/null +++ b/src/internal/goexperiment/exp_range_on.go @@ -0,0 +1,8 @@ +// Code generated by mkconsts.go. DO NOT EDIT. + +//go:build goexperiment.range + +package goexperiment + +const Range = true +const RangeInt = 1 diff --git a/src/internal/goexperiment/flags.go b/src/internal/goexperiment/flags.go index dacc4c3b135732..cc44f8b6a04707 100644 --- a/src/internal/goexperiment/flags.go +++ b/src/internal/goexperiment/flags.go @@ -127,4 +127,8 @@ type Flags struct { // ExecTracer2 controls whether to use the new execution trace // implementation. ExecTracer2 bool + + // LoopOpts enables aggressive loop optimizations on SSA, which may takes + // more time to compile but produce faster code. 
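+	// It is typically enabled by building with GOEXPERIMENT=loopopts (see the
+	// generated exp_loopopts_on.go / exp_loopopts_off.go files above).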
+ LoopOpts bool } diff --git a/test/nilptr3.go b/test/nilptr3.go index 2cc510beb635df..33963b75620f11 100644 --- a/test/nilptr3.go +++ b/test/nilptr3.go @@ -155,7 +155,7 @@ func f4(x *[10]int) { // and the offset is small enough that if x is nil, the address will still be // in the first unmapped page of memory. - _ = x[9] // ERROR "generated nil check" // bug: would like to remove this check (but nilcheck and load are in different blocks) + _ = x[9] // ERROR "removed nil check" for { if x[9] != 0 { // ERROR "removed nil check" diff --git a/test/opt_branchlikely.go b/test/opt_branchlikely.go index 0aee33f87a578b..f68d746db63fea 100644 --- a/test/opt_branchlikely.go +++ b/test/opt_branchlikely.go @@ -12,7 +12,7 @@ package foo func f(x, y, z int) int { a := 0 - for i := 0; i < x; i++ { // ERROR "Branch prediction rule stay in loop" + for i := 0; i < x; i++ { // ERROR "Branch prediction rule" for j := 0; j < y; j++ { // ERROR "Branch prediction rule stay in loop" a += j } @@ -45,7 +45,7 @@ func g(x, y, z int) int { panic("help help help") } if x != 0 { // ERROR "Branch prediction rule default < ret" - for i := 0; i < x; i++ { // ERROR "Branch prediction rule stay in loop" + for i := 0; i < x; i++ { // ERROR "Branch prediction rule" if x == 4 { // ERROR "Branch prediction rule stay in loop" return a } @@ -62,7 +62,7 @@ func g(x, y, z int) int { func h(x, y, z int) int { a := 0 - for i := 0; i < x; i++ { // ERROR "Branch prediction rule stay in loop" + for i := 0; i < x; i++ { // ERROR "Branch prediction rule" for j := 0; j < y; j++ { // ERROR "Branch prediction rule stay in loop" a += j if i == j { // ERROR "Branch prediction rule stay in loop" diff --git a/test/writebarrier.go b/test/writebarrier.go index 1b30fa509e5503..ddd37895780594 100644 --- a/test/writebarrier.go +++ b/test/writebarrier.go @@ -60,7 +60,7 @@ func f3a(x *string, y *string) { } func f4(x *[2]string, y [2]string) { - *x = y // ERROR "write barrier" + *x = y // no barrier (dead store) z := y // no barrier *x = z // ERROR "write barrier"