diff --git a/src/cmd/compile/internal/ssa/block.go b/src/cmd/compile/internal/ssa/block.go index 4a24a181e5de19..b484ed265c433a 100644 --- a/src/cmd/compile/internal/ssa/block.go +++ b/src/cmd/compile/internal/ssa/block.go @@ -257,6 +257,38 @@ func (b *Block) resetWithControl2(kind BlockKind, v, w *Value) { w.Uses++ } +// ReplaceSucc replaces b->oldSucc to b->newSucc, n indicates which predecessor +// index of newSucc refers to b. It is the responsibility of the caller to clear +// the corresponding predecessor of oldSucc. +func (b *Block) ReplaceSucc(oldSucc, newSucc *Block, n int) { + for i := 0; i < len(b.Succs); i++ { + succ := &b.Succs[i] + if succ.b == oldSucc { + succ.b = newSucc + succ.i = n + newSucc.Preds[n] = Edge{b, i} + return + } + } + panic(fmt.Sprintf("Can not found %v->%v", b, oldSucc)) +} + +// ReplacePred replaces oldPred->b to newPred->b, n indicates which successor +// index of newPred refers to b. It is the responsibility of the caller to clear +// the corresponding successor of oldPred. +func (b *Block) ReplacePred(oldPred, newPred *Block, n int) { + for i := 0; i < len(b.Preds); i++ { + pred := &b.Preds[i] + if pred.b == oldPred { + pred.b = newPred + pred.i = n + newPred.Succs[n] = Edge{b, i} + return + } + } + panic(fmt.Sprintf("Can not found %v->%v", oldPred, b)) +} + // truncateValues truncates b.Values at the ith element, zeroing subsequent elements. // The values in b.Values after i must already have had their args reset, // to maintain correct value uses counts. diff --git a/src/cmd/compile/internal/ssa/branchelim.go b/src/cmd/compile/internal/ssa/branchelim.go index f16959dd572973..158e5eca7b226f 100644 --- a/src/cmd/compile/internal/ssa/branchelim.go +++ b/src/cmd/compile/internal/ssa/branchelim.go @@ -424,6 +424,14 @@ func shouldElimIfElse(no, yes, post *Block, arch string) bool { } } +func isAccessMemory(v *Value) bool { + if v.Op == OpPhi || v.Type.IsMemory() || + v.MemoryArg() != nil || opcodeTable[v.Op].hasSideEffects { + return true + } + return false +} + // canSpeculativelyExecute reports whether every value in the block can // be evaluated without causing any observable side effects (memory // accesses, panics and so on) except for execution time changes. It @@ -436,8 +444,8 @@ func canSpeculativelyExecute(b *Block) bool { // don't fuse memory ops, Phi ops, divides (can panic), // or anything else with side-effects for _, v := range b.Values { - if v.Op == OpPhi || isDivMod(v.Op) || isPtrArithmetic(v.Op) || v.Type.IsMemory() || - v.MemoryArg() != nil || opcodeTable[v.Op].hasSideEffects { + if v.Op == OpPhi || isDivMod(v.Op) || isPtrArithmetic(v.Op) || + isAccessMemory(v) { return false } } diff --git a/src/cmd/compile/internal/ssa/check.go b/src/cmd/compile/internal/ssa/check.go index bbfdaceaad90b0..d4cee595fcdb48 100644 --- a/src/cmd/compile/internal/ssa/check.go +++ b/src/cmd/compile/internal/ssa/check.go @@ -460,33 +460,8 @@ func checkFunc(f *Func) { memCheck(f) } -func memCheck(f *Func) { - // Check that if a tuple has a memory type, it is second. - for _, b := range f.Blocks { - for _, v := range b.Values { - if v.Type.IsTuple() && v.Type.FieldType(0).IsMemory() { - f.Fatalf("memory is first in a tuple: %s\n", v.LongString()) - } - } - } - - // Single live memory checks. - // These checks only work if there are no memory copies. - // (Memory copies introduce ambiguity about which mem value is really live. - // probably fixable, but it's easier to avoid the problem.) - // For the same reason, disable this check if some memory ops are unused. 
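A usage note on the new ReplaceSucc/ReplacePred helpers added to block.go above: a minimal sketch (not part of the patch) of redirecting one edge, using a hypothetical in-package helper redirectEdge; removePred is the existing Block helper for dropping a predecessor edge, and the caller must still fix up any phis in oldSucc.

func redirectEdge(b, oldSucc, newSucc *Block, slot int) {
	// Remember where b sits in oldSucc.Preds before the edge is rewritten.
	stale := -1
	for i, e := range oldSucc.Preds {
		if e.b == b {
			stale = i
			break
		}
	}
	// b now branches to newSucc; newSucc.Preds[slot] records the new edge.
	b.ReplaceSucc(oldSucc, newSucc, slot)
	if stale >= 0 {
		// Caller's responsibility per the doc comment: clear the stale edge.
		oldSucc.removePred(stale)
	}
}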
- for _, b := range f.Blocks { - for _, v := range b.Values { - if (v.Op == OpCopy || v.Uses == 0) && v.Type.IsMemory() { - return - } - } - if b != f.Entry && len(b.Preds) == 0 { - return - } - } - - // Compute live memory at the end of each block. +// computeLastMem compute live memory at the end of each block. +func computeLastMem(f *Func) []*Value { lastmem := make([]*Value, f.NumBlocks()) ss := newSparseSet(f.NumValues()) for _, b := range f.Blocks { @@ -552,6 +527,36 @@ func memCheck(f *Func) { break } } + return lastmem +} + +func memCheck(f *Func) { + // Check that if a tuple has a memory type, it is second. + for _, b := range f.Blocks { + for _, v := range b.Values { + if v.Type.IsTuple() && v.Type.FieldType(0).IsMemory() { + f.Fatalf("memory is first in a tuple: %s\n", v.LongString()) + } + } + } + + // Single live memory checks. + // These checks only work if there are no memory copies. + // (Memory copies introduce ambiguity about which mem value is really live. + // probably fixable, but it's easier to avoid the problem.) + // For the same reason, disable this check if some memory ops are unused. + for _, b := range f.Blocks { + for _, v := range b.Values { + if (v.Op == OpCopy || v.Uses == 0) && v.Type.IsMemory() { + return + } + } + if b != f.Entry && len(b.Preds) == 0 { + return + } + } + + lastmem := computeLastMem(f) // Check merge points. for _, b := range f.Blocks { for _, v := range b.Values { diff --git a/src/cmd/compile/internal/ssa/compile.go b/src/cmd/compile/internal/ssa/compile.go index d125891f88c58f..9037c6049308ea 100644 --- a/src/cmd/compile/internal/ssa/compile.go +++ b/src/cmd/compile/internal/ssa/compile.go @@ -453,12 +453,13 @@ commas. For example: return fmt.Sprintf("Did not find a phase matching %s in -d=ssa/... debug option", phase) } +var EnableLoopOpts = buildcfg.Experiment.LoopOpts + // list of passes for the compiler var passes = [...]pass{ - // TODO: combine phielim and copyelim into a single pass? 
+ // Generic Optimizations {name: "number lines", fn: numberLines, required: true}, {name: "early phielim", fn: phielim}, - {name: "early copyelim", fn: copyelim}, {name: "early deadcode", fn: deadcode}, // remove generated dead code to avoid doing pointless work during opt {name: "short circuit", fn: shortcircuit}, {name: "decompose user", fn: decomposeUser, required: true}, @@ -484,9 +485,18 @@ var passes = [...]pass{ {name: "late fuse", fn: fuseLate}, {name: "dse", fn: dse}, {name: "memcombine", fn: memcombine}, - {name: "writebarrier", fn: writebarrier, required: true}, // expand write barrier ops + // Loop Optimizations + {name: "loop deadcode", fn: deadcode, disabled: !EnableLoopOpts}, // remove dead blocks before loop opts to avoid extra work + {name: "loop invariant code motion", fn: licm, disabled: !EnableLoopOpts}, // hoist loop invariant code out of loops + {name: "lcssa destruct", fn: phielim, disabled: !EnableLoopOpts}, // eliminate LCSSA proxy phi to restore general SSA form + {name: "loop sccp", fn: sccp, disabled: !EnableLoopOpts}, // optimize loop guard conditional test + {name: "loop opt", fn: opt, disabled: !EnableLoopOpts}, // further optimize loop guard conditional test + {name: "loop deadcode late", fn: deadcode, disabled: !EnableLoopOpts}, // remove dead loop guard to simplify cfg + {name: "loop nilcheckelim", fn: nilcheckelim, disabled: !EnableLoopOpts}, // remove duplicated nil check in loop guard + {name: "writebarrier", fn: writebarrier, required: true}, // expand write barrier ops {name: "insert resched checks", fn: insertLoopReschedChecks, disabled: !buildcfg.Experiment.PreemptibleLoops}, // insert resched checks in loops. + // Code Generation {name: "lower", fn: lower, required: true}, {name: "addressing modes", fn: addressingModes, required: false}, {name: "late lower", fn: lateLower, required: true}, @@ -497,7 +507,6 @@ var passes = [...]pass{ {name: "lowered deadcode", fn: deadcode, required: true}, {name: "checkLower", fn: checkLower, required: true}, {name: "late phielim", fn: phielim}, - {name: "late copyelim", fn: copyelim}, {name: "tighten", fn: tighten, required: true}, // move values closer to their uses {name: "late deadcode", fn: deadcode}, {name: "critical", fn: critical, required: true}, // remove critical edges @@ -508,7 +517,7 @@ var passes = [...]pass{ {name: "late nilcheck", fn: nilcheckelim2}, {name: "flagalloc", fn: flagalloc, required: true}, // allocate flags register {name: "regalloc", fn: regalloc, required: true}, // allocate int & float registers + stack slots - {name: "loop rotate", fn: loopRotate}, + {name: "layout loop", fn: layoutLoop}, {name: "trim", fn: trim}, // remove empty blocks } @@ -577,8 +586,8 @@ var passOrder = [...]constraint{ {"schedule", "flagalloc"}, // regalloc needs flags to be allocated first. {"flagalloc", "regalloc"}, - // loopRotate will confuse regalloc. - {"regalloc", "loop rotate"}, + // layout loop will confuse regalloc. + {"regalloc", "layout loop"}, // trim needs regalloc to be done first. {"regalloc", "trim"}, // memcombine works better if fuse happens first, to help merge stores. 
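To make the intent of the new loop passes concrete, a hypothetical input (illustrative only, not from the patch): the load of c.limit in the loop condition is loop invariant, so once the loop has been rotated and guarded, LICM can hoist it in front of the loop, and the "loop nilcheckelim" pass can remove the nil check that gets duplicated into the loop guard. These passes only run when the LoopOpts experiment (EnableLoopOpts above) is enabled.

type counter struct{ limit int }

// sum re-reads c.limit in its loop condition on every iteration; after the
// new loop passes the load (and the nil check on c) happens once, before the
// loop.
func sum(c *counter) int {
	total := 0
	for i := 0; i < c.limit; i++ {
		total += i
	}
	return total
}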
diff --git a/src/cmd/compile/internal/ssa/func.go b/src/cmd/compile/internal/ssa/func.go index 529c119dc3da5a..ae1e57fe8f37ab 100644 --- a/src/cmd/compile/internal/ssa/func.go +++ b/src/cmd/compile/internal/ssa/func.go @@ -86,6 +86,13 @@ type LocalSlotSplitKey struct { Type *types.Type // type of slot } +// assert is used for development sanity check +func assert(cond bool, fx string, msg ...interface{}) { + if !cond { + panic(fmt.Sprintf(fx, msg...)) + } +} + // NewFunc returns a new, empty function object. // Caller must reset cache before calling NewFunc. func (c *Config) NewFunc(fe Frontend, cache *Cache) *Func { @@ -298,7 +305,7 @@ func (f *Func) newValue(op Op, t *types.Type, b *Block, pos src.XPos) *Value { // newValueNoBlock allocates a new Value with the given fields. // The returned value is not placed in any block. Once the caller // decides on a block b, it must set b.Block and append -// the returned value to b.Values. +// the returned value to b.Values or simply use placeValue. func (f *Func) newValueNoBlock(op Op, t *types.Type, pos src.XPos) *Value { var v *Value if f.freeValues != nil { @@ -324,6 +331,12 @@ func (f *Func) newValueNoBlock(op Op, t *types.Type, pos src.XPos) *Value { return v } +// placeValue places new Value that not placed yet into given block. +func (block *Block) placeValue(v *Value) { + v.Block = block + block.Values = append(block.Values, v) +} + // LogStat writes a string key and int value as a warning in a // tab-separated format easily handled by spreadsheets or awk. // file names, lines, and function names are included to provide enough (?) diff --git a/src/cmd/compile/internal/ssa/layout.go b/src/cmd/compile/internal/ssa/layout.go index e4a8c6ffbf0dde..eeb84308b53bb9 100644 --- a/src/cmd/compile/internal/ssa/layout.go +++ b/src/cmd/compile/internal/ssa/layout.go @@ -88,6 +88,7 @@ func layoutOrder(f *Func) []*Block { } bid := f.Entry.ID + blockTrace := false blockloop: for { // add block to schedule @@ -120,7 +121,6 @@ blockloop: } // Pick the next block to schedule - // Pick among the successor blocks that have not been scheduled yet. // Use likely direction if we have it. var likely *Block @@ -131,10 +131,27 @@ blockloop: likely = b.Succs[1].b } if likely != nil && !scheduled[likely.ID] { + blockTrace = true bid = likely.ID continue } + // Pick the next block in the path trace if possible, trace starts with + // statically predicted branch, e.g. + // b0: ... If -> b1(likely),b2 + // b1: ... Plain -> b3 + // schedule the path trace b0->b1->b3 sequentially + if blockTrace { + if len(b.Succs) == 1 { + s := b.Succs[0].b + if !scheduled[s.ID] { + bid = s.ID + continue blockloop + } + } + blockTrace = false + } + // Use degree for now. bid = 0 // TODO: improve this part diff --git a/src/cmd/compile/internal/ssa/lcssa.go b/src/cmd/compile/internal/ssa/lcssa.go new file mode 100644 index 00000000000000..123ef740b78561 --- /dev/null +++ b/src/cmd/compile/internal/ssa/lcssa.go @@ -0,0 +1,403 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package ssa + +import ( + "fmt" + "sort" +) + +// ---------------------------------------------------------------------------- +// Loop Closed SSA Form +// +// loop closed SSA form is a special form of SSA form, which is used to simplify +// loop optimization. It ensures that all values defined inside the loop are only +// used within loop. 
The transformation looks up loop uses outside the loop and +// inserts the appropriate "proxy phi" at the loop exit, after which the outside +// of the loop does not use the loop def directly but the proxy phi. +// +// loop header: loop header: +// v3 = Phi(0, v4) v3 = Phi(0, v4) +// If cond->loop latch,loop exit If cond->loop latch,loop exit +// +// loop latch: loop latch: +// v4 = Add(v3, 1) => v4 = Add(v3, 1) +// Plain->loop header Plain->loop header +// +// loop exit: loop exit: +// v5 = Add(5, v3) v6 = Phi(v3) <= Proxy Phi +// Ret v18 v5 = Add(5, v6) +// Ret v18 +// +// Previously, v5 used v3 directly, where v5 is in the loop exit which is outside +// the loop. After LCSSA transformation, v5 uses v6, which in turn uses v3. Here, +// v6 is the proxy phi. In the context of LCSSA, we can consider the use block of +// v6 to be the loop header rather than the loop exit. This way, all values defined +// in the loop are loop "closed", i.e. only used within the loop. +// +// Any further changes to the loop definition only need to update the proxy phi, +// rather than iterating through all its uses and handling properties such as +// dominance relationships carefully, which is error prone and hard to maintain. + +// Def-Use utilities +type user struct { + def *Value // the definition + val *Value // used by value + block *Block // used by block's ctrl value + idx int // in which arg index of user is def located +} + +type defUses map[*Value][]*user + +func (u *user) String() string { + if u.val != nil { + return fmt.Sprintf("{%v:%v}", u.def, u.val) + } else { + return fmt.Sprintf("{%v:%v}", u.def, u.block) + } +} + +// useBlock returns the block where the def is used +func (u *user) useBlock() *Block { + if u.val != nil { + return u.val.Block + } else { + return u.block + } +} + +// replaceUse replaces the use of def with new use at given index +func (u *user) replaceUse(newUse *Value) { + if val := u.val; val != nil { + idx := u.idx + assert(val.Args[idx] == u.def, "sanity check") + val.SetArg(idx, newUse) + } else if block := u.block; block != nil { + idx := u.idx + assert(block.ControlValues()[idx] == u.def, "sanity check") + block.ReplaceControl(idx, newUse) + } else { + panic("def is neither used by value nor by block ctrl") + } +} + +// buildDefUses builds def-use map for given defs Values +func buildDefUses(fn *Func, defs []*Value) defUses { + defUses := make(defUses, 0) + for _, def := range defs { + if _, exist := defUses[def]; !exist { + // Many duplicate definitions, avoid redundant memory allocations + defUses[def] = make([]*user, 0, def.Uses) + } + } + for _, block := range fn.Blocks { + for _, val := range block.Values { + for iarg, arg := range val.Args { + if _, exist := defUses[arg]; exist { + defUses[arg] = append(defUses[arg], &user{arg, val, nil, iarg}) + } + } + } + for ictrl, ctrl := range block.ControlValues() { + if _, exist := defUses[ctrl]; exist { + defUses[ctrl] = append(defUses[ctrl], &user{ctrl, nil, block, ictrl}) + } + } + } + return defUses +} + +// stableDefs returns the defs in stable order for deterministic compilation +func stableDefs(defUses defUses) []*Value { + keys := make([]*Value, 0) + for k := range defUses { + keys = append(keys, k) + } + sort.SliceStable(keys, func(i, j int) bool { + return keys[i].ID < keys[j].ID + }) + + return keys +} + +type lcssa struct { + fn *Func + mphis []*Value // inserted memory proxy phi + e2phi map[*Block]*Value // exit block to proxy phi mapping +} + +// findUseBlock returns the block where the def is used. 
If the use is type of Phi, +// then the use block is the corresponding incoming block. Note that this is ONLY +// valid in context of LCSSA. +func findUseBlock(u *user) *Block { + var ub *Block + if val := u.val; val != nil { + if val.Op == OpPhi { + ipred := u.idx + ub = val.Block.Preds[ipred].b + } else { + ub = val.Block + } + } else { + ub = u.block + } + assert(ub != nil, "no use block") + return ub +} + +// containsBlock returns true if the block is part of the loop or part of the +// inner loop +func (ln *loopnest) containsBlock(loop *loop, block *Block) bool { + assert(ln.initializedChildren, "initialize loopnest children first") + + // Block is part of current loop? + if ln.b2l[block.ID] == loop { + return true + } + // Block is part of inner loop? + for _, child := range loop.children { + if ln.containsBlock(child, block) { + return true + } + } + return false +} + +// allocateProxyPhi allocates a proxy phi at specific loop exit +func (lc *lcssa) allocateProxyPhi(exit *Block, loopDef ...*Value) *Value { + assert(len(loopDef) > 0, "must have at least one loop def") + if phival, exist := lc.e2phi[exit]; exist { + return phival + } + + phi := lc.fn.newValueNoBlock(OpPhi, loopDef[0].Type, loopDef[0].Pos) + if len(loopDef) == 1 { + phiArgs := make([]*Value, len(exit.Preds)) + for idx := range exit.Preds { + phiArgs[idx] = loopDef[0] + } + phi.AddArgs(phiArgs...) + } else { + phi.AddArgs(loopDef...) + } + + exit.placeValue(phi) + lc.e2phi[exit] = phi + if phi.Type.IsMemory() { + lc.mphis = append(lc.mphis, phi) + } + return phi +} + +func (lc *lcssa) fixProxyPhiMem(fn *Func) { + if len(lc.mphis) == 0 { + // No mem proxy phi to fix + return + } + lastMem := computeLastMem(fn) + for _, phi := range lc.mphis { + assert(phi.Type.IsMemory(), "must be memory phi") + + for iarg, arg := range phi.Args { + mem := lastMem[phi.Block.Preds[iarg].b.ID] + if mem != arg && mem != nil { + if mem.Args[0] != arg { + fn.Fatalf("must use old memory") + } + oldPhiStr := phi.LongString() + phi.SetArg(iarg, mem) + if fn.pass.debug > 1 { + fmt.Printf("== Fix memory proxy phi %v to %v\n", + oldPhiStr, phi.LongString()) + } + } + } + } +} + +// placeProxyPhi places the proxy phi at loop exits to make sure all uses of a +// loop defined value are dominated by the proxy phi +func (lc *lcssa) placeProxyPhi(ln *loopnest, loop *loop, defs []*Value) bool { + defUses := buildDefUses(ln.f, defs) + + use2exits := make(map[*user][]*Block, 0) + loopDefs := stableDefs(defUses) + for _, loopDef := range loopDefs { + for _, use := range defUses[loopDef] { + useBlock := findUseBlock(use) + // It's an in-loop use? + if ln.b2l[useBlock.ID] == loop { + continue + } + + // Loop def does not dominate use? Possibly dead block + if !ln.sdom.IsAncestorEq(loopDef.Block, useBlock) { + continue + } + + // Possibly a dead block, ignore it + if len(useBlock.Preds) == 0 { + assert(useBlock.Kind == BlockInvalid, "why not otherwise") + continue + } + + // Only loop use that is not part of current loop takes into account. 
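+			// For instance (illustrative), if two loop exits e1 and e2 both
+			// branch to a common use block u, a proxy phi is placed in each
+			// of e1 and e2 and one more phi in u merges them; this is the
+			// "harder case" handled below and exercised by TestLoopUse2.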
+ if useBlock != loopDef.Block && !ln.containsBlock(loop, useBlock) { + // Simple case, try to find a loop exit that dominates the use + // block and place the proxy phi at this loop exit, this is the + // most common case + var domExit *Block + for _, exit := range loop.exits { + if ln.sdom.IsAncestorEq(exit, useBlock) { + domExit = exit + break + } + } + if domExit != nil { + use2exits[use] = append(use2exits[use], domExit) + continue + } + // Harder case, loop use block is not dominated by a single loop + // exit, instead it has many predecessors and all of them are + // dominated by different loop exits, we are probably reaching to + // it from all of these predecessors. In this case, we need to + // place the proxy phi at all loop exits and merge them at loop + // use block by yet another proxy phi + domExits := make([]*Block, 0, len(useBlock.Preds)) + for _, pred := range useBlock.Preds { + found := false + for _, e := range loop.exits { + if ln.sdom.IsAncestorEq(e, pred.b) { + domExits = append(domExits, e) + found = true + break + } + } + if !found { + break + } + } + if cap(domExits) == len(domExits) { + use2exits[use] = domExits + continue + } + + // Worst case, loop use block is not dominated by any of loop exits + // we start from all loop exits(including inner loop exits) though + // dominance frontier and see if we can reach to the use block, + // if so, we place the proxy phi at the loop exit that is closest + // to the use block. This is rare, but it does happen, give up + // for now as it's hard to handle. + // TODO(yyang): Correctly handle this case + if ln.f.pass.debug > 1 { + fmt.Printf("== Can not process use %v in %v\n", use, loop) + } + return false + } + } + } + + // For every use of loop def, place the proxy phi at proper exit block + // and replace such use with the proxy phi, this is the core of LCSSA, + // since proxy phi is "inside the loop" in context of LCSSA, now all uses + // of loop def are loop closed, e.g. lives in the loop. + for _, loopDef := range loopDefs { + uses := defUses[loopDef] + if len(uses) == 0 { + continue + } + // multiple uses shares the same proxy phi if they live in same exit block + // also note that only users of the same loop def could share proxy phi + lc.e2phi = make(map[*Block]*Value, 0) + for _, use := range uses { + useBlock := findUseBlock(use) + exits := use2exits[use] + if len(exits) == 1 { + domExit := exits[0] + // Replace all uses of loop def with new proxy phi + lcphi := lc.allocateProxyPhi(domExit, loopDef) + if ln.f.pass.debug > 1 { + fmt.Printf("== Replace use %v with proxy phi %v\n", + use, lcphi.LongString()) + } + use.replaceUse(lcphi) + } else if len(exits) > 1 { + // Place proxy phi at all dominator loop exits + phis := make([]*Value, 0, len(exits)) + for _, exit := range exits { + lcphi := lc.allocateProxyPhi(exit, loopDef) + phis = append(phis, lcphi) + } + // Merge them at loop use block by yet another proxy phi + lcphi := lc.allocateProxyPhi(useBlock, phis...) + use.replaceUse(lcphi) + if ln.f.pass.debug > 1 { + fmt.Printf("== Replace use %v with proxy phi %v\n", + use, lcphi.LongString()) + } + } + } + } + + // Since we may have placed memory proxy phi at some loop exits, which + // use loop def and produce new memory. If this block is a predecessor + // of another loop exit, we need to use memory proxy phi instead of loop + // def as a parameter of new proxy phi. 
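+	// For example (illustrative): if exit e1 defines a memory proxy phi m1
+	// and e1 is itself a predecessor of another exit e2, the proxy phi in e2
+	// must take m1 (the last memory of e1, as computed by computeLastMem)
+	// rather than the original in-loop memory def as its argument.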
+ lc.fixProxyPhiMem(ln.f) + + return true +} + +// BuildLoopClosedForm builds loop closed SSA form upon original loop, this is +// the cornerstone of other loop optimizations such as LICM, loop unswitching +// and empty loop elimination. +func (fn *Func) BuildLoopClosedForm(ln *loopnest, loop *loop) bool { + assert(ln.initializedExits && ln.initializedChildren, "must be initialized") + if len(loop.exits) == 0 { + return true + } + + sdom := ln.sdom // lcssa does not wire up CFG, reusing sdom is okay + domBlocks := make([]*Block, 0) + blocks := make([]*Block, 0) + blocks = append(blocks, loop.exits...) + + // Outside the loop we can only use values defined in the blocks of arbitrary + // loop exit dominators, so first collect these blocks and treat the Values + // in them as loop def + for len(blocks) > 0 { + block := blocks[0] + blocks = blocks[1:] + if block == loop.header { + continue + } + idom := sdom.Parent(block) + if ln.b2l[idom.ID] != loop { + continue + } + + domBlocks = append(domBlocks, idom) + blocks = append(blocks, idom) + } + + // Look for out-of-loop users of these loop defs + defs := make([]*Value, 0) + for _, block := range domBlocks { + for _, val := range block.Values { + if val.Uses == 0 { + continue + } + defs = append(defs, val) + } + } + + // For every use of loop def, place the proxy phi at the proper block + lc := &lcssa{ + fn: fn, + mphis: make([]*Value, 0, len(defs)), + e2phi: nil, + } + return lc.placeProxyPhi(ln, loop, defs) +} diff --git a/src/cmd/compile/internal/ssa/lcssa_test.go b/src/cmd/compile/internal/ssa/lcssa_test.go new file mode 100644 index 00000000000000..f25f5adb07a214 --- /dev/null +++ b/src/cmd/compile/internal/ssa/lcssa_test.go @@ -0,0 +1,203 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package ssa + +import ( + "cmd/compile/internal/types" + "testing" +) + +func doLCSSA(fun fun) bool { + CheckFunc(fun.f) + f := fun.f + loopnest := f.loopnest() + loopnest.assembleChildren() + loopnest.findExits() + for _, loop := range loopnest.loops { + if f.BuildLoopClosedForm(loopnest, loop) { + CheckFunc(fun.f) + return true + } + } + return false +} + +// Simple Case: use block is dominated by a single loop exit +func TestLoopUse1(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("ten", OpConst64, c.config.Types.Int64, 10, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "ten"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Goto("loopHeader")), + Bloc("loopExit", + Goto("useBlock")), + Bloc("useBlock", + Valu("use", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Exit("mem"))) + + if !doLCSSA(fun) { + t.Fatal("Failed to build LCSSA") + } + + // loop header: + // i = phi(0, inc) + // .... 
+ // + // loop exit: + // p1 = phi(i) <= proxy phi + // Plain useBlock + // + // useBlock: + // use = p1 + 1 + verifyNumValue(fun, t, OpPhi, 2 /*var i + 1 proxy phi*/) +} + +// Harder Case: use block is reachable from multiple loop exits +func TestLoopUse2(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("ten", OpConst64, c.config.Types.Int64, 10, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "ten"), + If("cmp", "loopBody", "loopExit")), + Bloc("loopBody", + Valu("cmp2", OpEq64, c.config.Types.Bool, 0, nil, "i", "one"), + If("cmp2", "loopExit1", "loopLatch")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Goto("loopHeader")), + Bloc("loopExit1", + Goto("useBlock")), + Bloc("loopExit", + Goto("useBlock")), + Bloc("useBlock", + Valu("use", OpMul64, c.config.Types.Int64, 0, nil, "i", "ten"), + Exit("mem"))) + + if !doLCSSA(fun) { + t.Fatal("Failed to build LCSSA") + } + + // loop header: + // i = phi(0, inc) + // .... + // + // loop exit: + // p1 = phi(i) <= proxy phi + // Plain useBlock + // + // loop exit1: + // p2 = phi(i) <= proxy phi + // Plain useBlock + // + // useBlock: + // p3 = phi(p1, p2) <= proxy phi + // use = p1 + 1 + verifyNumValue(fun, t, OpPhi, 4 /*var i + 3 proxy phi*/) +} + +// Used by ctrl valule +func TestLoopUse3(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("ten", OpConst64, c.config.Types.Int64, 10, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "ten"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Goto("loopHeader")), + Bloc("loopExit", + // used by ctrl value + If("cmp", "exit1", "exit2")), + Bloc("exit1", + Goto("exit2")), + Bloc("exit2", + Exit("mem"))) + + if !doLCSSA(fun) { + t.Fatal("Failed to build LCSSA") + } + + // loop header: + // i = phi(0, inc) + // .... 
+ // + // loop exit: + // p1 = phi(i) <= proxy phi + // If p1-> exit1, exit2 + verifyNumValue(fun, t, OpPhi, 2 /*var i + 1 proxy phi*/) +} + +// Used by Phi +func TestLoopUse4(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("ten", OpConst64, c.config.Types.Int64, 10, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "ten"), + If("cmp", "loopBody", "loopExit")), + Bloc("loopBody", + Valu("cmp2", OpEq64, c.config.Types.Bool, 0, nil, "i", "one"), + If("cmp2", "loopExit1", "loopLatch")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Goto("loopHeader")), + Bloc("loopExit1", + Goto("useBlock")), + Bloc("loopExit", + Goto("useBlock")), + Bloc("useBlock", + Valu("use", OpPhi, c.config.Types.Int64, 0, nil, "i", "i"), + Exit("mem"))) + + if !doLCSSA(fun) { + t.Fatal("Failed to build LCSSA") + } + + // loop header: + // i = phi(0, inc) + // .... + // + // loop exit: + // p1 = phi(i) <= proxy phi + // Plain useBlock + // + // loop exit1: + // p2 = phi(i) <= proxy phi + // Plain useBlock + // + // useBlock: + // use = phi(p1, p2) + verifyNumValue(fun, t, OpPhi, 3 /*var i + 2 proxy phi*/ +1 /*original phi*/) +} diff --git a/src/cmd/compile/internal/ssa/licm.go b/src/cmd/compile/internal/ssa/licm.go new file mode 100644 index 00000000000000..6b361b2f709c3e --- /dev/null +++ b/src/cmd/compile/internal/ssa/licm.go @@ -0,0 +1,384 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package ssa + +import ( + "fmt" + "sort" +) + +// ---------------------------------------------------------------------------- +// Loop Invariant Code Motion +// +// The main idea behind LICM is to move loop invariant values outside of the loop +// so that they are only executed once, instead of being repeatedly executed with +// each iteration of the loop. In the context of LICM, if a loop invariant can be +// speculatively executed, then it can be freely hoisted to the loop entry. +// However, if it cannot be speculatively executed, there is still a chance that +// it can be hoisted outside the loop under a few prerequisites: +// +// #1 Instruction is guaranteed to execute unconditionally +// #2 Instruction does not access memory locations that may alias with other +// memory operations inside the loop +// +// For #1, this is guaranteed by loop rotation, where the loop is guaranteed to +// execute at least once after rotation. But that's not the whole story. If the +// instruction is guarded by a conditional expression (e.g., loading from a memory +// address usually guarded by an IsInBound check), in this case, we try to hoist +// it only if the loop invariant dominates all loop exits, which implies that it +// will be executed unconditionally as soon as it enters the loop. 
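+//
+// For example (illustrative), in
+//
+//	for i := 0; i < n; i++ {
+//		if cond {
+//			v = *p // p is loop invariant, but the load is guarded by cond
+//		}
+//	}
+//
+// the load of *p might never execute in the original program, so it is only
+// hoisted if its block dominates every loop exit.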
+// For #2, we always pessimistically assume that they are must-aliases and stop +// optimizing if we saw both load and store + +func logInvariant(val *Value, src *Block, dest *Block) { + hoistType := "Simple" + if isHoistable(val) { + hoistType = "Complex" + } + if dest.Func.pass.debug > 2 { + fmt.Printf("Hoist%s %v from %v to %v in %v\n", + hoistType, val.LongString(), src, dest, dest.Func.Name) + } +} + +func moveTo(val *Value, block *Block) { + for valIdx, v := range val.Block.Values { + if val != v { + continue + } + val.moveTo(block, valIdx) + break + } +} + +func isMemoryDef(val *Value) bool { + switch val.Op { + case OpStore, OpMove, OpZero, OpStoreWB, OpMoveWB, OpZeroWB, + OpPanicBounds, OpPanicExtend, + OpPubBarrier, + OpVarDef, OpVarLive, OpKeepAlive: + return true + } + return false +} + +// alwaysExecute checks if Value is guaranteed to execute during loop iterations +// Otherwise, it should not be hoisted. The most common cases are invariants +// guarded by a conditional expression. +// TODO: If we can prove that Value can speculative execute nevertheless, e.g. +// Load from non-null pointer, this is not really necessary +func alwaysExecute(sdom SparseTree, loop *loop, val *Value) bool { + block := val.Block + // Because loop header can always jump to the loop exit, all blocks + // inside the loop are never post-dominated by any loop exit. + // Therefore, we need to first apply loop rotation to eliminate the path + // from the loop header to the loop exit. + for _, exit := range loop.exits { + if exit == loop.exit { + if !sdom.IsAncestorEq(block, loop.latch) { + return false + } + continue + } + if !sdom.IsAncestorEq(block, exit) { + return false + } + } + return true +} + +func isHoistable(val *Value) bool { + // The protagonist of the whole story + switch val.Op { + case OpLoad, OpStore, OpNilCheck, OpGetG, OpVarDef, OpConvert: + return true + } + return false +} + +type hoister struct { + fn *Func + sdom SparseTree + ln *loopnest + hoisted map[*Value]bool +} + +func (h *hoister) hoist(block *Block, val *Value) { + if arg := val.MemoryArg(); arg != nil { + // If val produces memory, all its uses should be replaced with incoming + // memory input of val + if isMemoryDef(val) { + mem := arg + for _, b := range h.fn.Blocks { + b.replaceUses(val, mem) + } + } + } + + srcBlock := val.Block + moveTo(val, block) + logInvariant(val, srcBlock, block) + h.hoisted[val] = true +} + +// tryHoist hoists profitable loop invariant to block that dominates the entire +// loop. Value is considered as loop invariant if all its inputs are defined +// outside the loop or all its inputs are loop invariants. 
Since loop invariant +// will immediately moved to dominator block of loop, the first rule actually +// already implies the second rule +func (h *hoister) tryHoist(loop *loop, invariants loopInvariants, val *Value) bool { + // Value is already hoisted + if hoisted, exist := h.hoisted[val]; exist { + return hoisted + } + // Value is type of Phi, we can not hoist it now + if val.Op == OpPhi { + h.hoisted[val] = false + return false + } + + // Try to hoist arguments of value first, they are guaranteed to be loop + // invariants but not necessarily hoistable + h.hoisted[val] = false + for _, arg := range val.Args { + if arg.Type.IsMemory() { + if !isMemoryDef(arg) { + continue + } + } + if _, isInvariant := invariants[arg]; isInvariant { + if !h.tryHoist(loop, invariants, arg) { + return false + } + } else { + // Value is not loop invariant, it must dominate the loop header + // or type of memory, simply check it + if arg.Op != OpUnknown && arg.Op != OpInvalid && + !arg.Type.IsMemory() && + !h.sdom.IsAncestorEq(arg.Block, loop.header) { + h.fn.Fatalf("arg %v must define outside loop", arg) + } + } + } + + // This catches most common case, e.g. arithmetic, bit operation, etc. + if !isAccessMemory(val) { + assert(val.MemoryArg() == nil, "sanity check") + h.hoist(loop.land, val) + return true + } + + // Instructions are selected ones? + if isHoistable(val) { + assert(loop.IsRotatedForm(), "loop must be rotated") + + // Instructions are guaranteed to execute unconditionally? + if !alwaysExecute(h.sdom, loop, val) { + if h.fn.pass.debug > 1 { + fmt.Printf("LICM failure: %v not always execute\n", val.LongString()) + } + return false + } + + h.hoist(loop.land, val) + return true + } + + if h.fn.pass.debug > 1 { + fmt.Printf("LICM failure: %v is not hoistable\n", val.LongString()) + } + return false +} + +// Hoisting memory def to loop land may break memory state of loop header, this +// should be fixed after CFG transformation done +func (h *hoister) fixMemoryState(loop *loop, startMem, endMem []*Value) { + // No instruction hoisted? 
Do nothing them + if len(h.hoisted) == 0 { + return + } + + // Find last memory def in loop entry, which in turns become last memory + // or loop guard, this implies that loop guard can not contain memory def + lastMem := endMem[loop.entry.ID] + for _, val := range loop.guard.Values { + if isMemoryDef(val) { + h.fn.Fatalf("Loop guard %v contains memory def %v", loop.guard, val) + } + } + + // Find last memory def in loop land + oldLastMem := lastMem + for _, val := range loop.land.Values { + if arg := val.MemoryArg(); arg != nil { + val.SetArg(len(val.Args)-1, lastMem) + } + if isMemoryDef(val) { + lastMem = val + } + } + + // If loop land has new memory def, memory state of loop header should be + // updated as well + if oldLastMem != lastMem { + headerMem := startMem[loop.header.ID] + if headerMem == nil { + h.fn.Fatalf("Canot find start memory of loop header %v", loop.header) + } + if headerMem.Op == OpPhi { + landIdx := -1 + for idx, pred := range loop.header.Preds { + if pred.b == loop.land { + landIdx = idx + break + } + } + headerMem.SetArg(landIdx, lastMem) + } else { + loop.header.replaceUses(headerMem, lastMem) + } + } +} + +type loopInvariants map[*Value]bool + +func stableKeys(li loopInvariants) []*Value { + keys := make([]*Value, 0) + for k, _ := range li { + keys = append(keys, k) + } + sort.SliceStable(keys, func(i, j int) bool { + return keys[i].ID < keys[j].ID + }) + return keys +} + +// findInviant finds all loop invariants within the loop +func (loop *loop) findInvariant(ln *loopnest) loopInvariants { + loopValues := make(map[*Value]bool) + invariants := make(map[*Value]bool) + loopBlocks := ln.findLoopBlocks(loop) + + // First, collect all def inside loop + hasLoad, hasStore := false, false + for _, block := range loopBlocks { + for _, value := range block.Values { + if value.Op == OpLoad { + hasLoad = true + } else if value.Op == OpStore { + hasStore = true + } else if value.Op.IsCall() { + if ln.f.pass.debug > 1 { + fmt.Printf("LICM failure: find call %v\n", value.LongString()) + } + return nil + } + loopValues[value] = true + } + } + + // See if loop contains both Load and Store and pessimistically assume that + // they are must-aliases and stop optimizing + // TODO: We can do better here by using type-based alias analysis in + // some cases + if hasLoad && hasStore { + if ln.f.pass.debug > 1 { + fmt.Printf("LICM failure: %v has both load and store\n", loop) + } + return nil + } + + changed := true + for changed { + numInvar := len(invariants) + for val, _ := range loopValues { + // If basic block is located in a nested loop rather than directly in + // the current loop, it will not be processed. 
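+			// (Such values are instead considered when the inner loop itself
+			// is processed, since licm walks every loop in the loop nest.)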
+ if ln.b2l[val.Block.ID] != loop { + continue + } + isInvariant := true + for _, use := range val.Args { + if use.Type.IsMemory() { + // Discard last memory value + continue + } + if _, exist := invariants[use]; exist { + continue + } + if _, exist := loopValues[use]; exist { + isInvariant = false + break + } + } + if isInvariant { + invariants[val] = true + } + } + changed = (len(invariants) != numInvar) + } + + return invariants +} + +// licm stands for Loop Invariant Code Motion, it hoists expressions that computes +// the same value outside loop +func licm(fn *Func) { + loopnest := fn.loopnest() + if loopnest.hasIrreducible { + return + } + if len(loopnest.loops) == 0 { + return + } + + loopnest.assembleChildren() + loopnest.findExits() + lcssa := make(map[*loop]bool, 0) + + // Transform all loops to loop closed form + for _, loop := range loopnest.loops { + lcssa[loop] = fn.BuildLoopClosedForm(loopnest, loop) + } + + h := &hoister{ + fn: fn, + ln: loopnest, + hoisted: make(map[*Value]bool), + } + // Remember initial memory subgraph before LICM + startMem, endMem := memState(fn) + for _, loop := range loopnest.loops { + // See if loop is in form of LCSSA + if wellFormed := lcssa[loop]; !wellFormed { + continue + } + + // Rotate the loop to ensures that loop executes at least once + if !fn.RotateLoop(loop) { + continue + } + + // Find loop invariants within the loop + invariants := loop.findInvariant(loopnest) + if invariants == nil || len(invariants) == 0 { + continue + } + + // Create a home for hoistable Values after rotation + if !loop.CreateLoopLand(fn) { + fn.Fatalf("Can not create loop land for %v", loop.LongString()) + } + + // All prerequisites are satisfied, try to hoist loop invariants + h.sdom = fn.Sdom() + for _, val := range stableKeys(invariants) { + h.tryHoist(loop, invariants, val) + } + + // Fix broken memory state given that CFG no longer changes + h.fixMemoryState(loop, startMem, endMem) + } +} diff --git a/src/cmd/compile/internal/ssa/licm_test.go b/src/cmd/compile/internal/ssa/licm_test.go new file mode 100644 index 00000000000000..4100c5f7d9b61d --- /dev/null +++ b/src/cmd/compile/internal/ssa/licm_test.go @@ -0,0 +1,192 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package ssa + +import ( + "cmd/compile/internal/types" + "fmt" + "testing" +) + +func doLICM(fun fun) { + CheckFunc(fun.f) + licm(fun.f) + CheckFunc(fun.f) +} + +func checkHoist(t *testing.T, fun fun, loopInvariants ...string) { + loopHeader := fun.blocks["loopHeader"] + // Find loop land block + sdom := fun.f.Sdom() + var loopLand *Block + for _, pred := range loopHeader.Preds { + if sdom.isAncestor(pred.b, loopHeader) { + loopLand = pred.b + break + } + } + if loopLand == nil { + fmt.Printf("== After LICM: %v\n", fun.f.String()) + t.Errorf("Error: loop land block not found\n") + } + if len(loopLand.Preds) != 1 || len(loopLand.Succs) != 1 { + fmt.Printf("== After LICM: %v\n", fun.f.String()) + t.Errorf("Error: bad loop land\n") + } + // Find expected loop invariant from loop land + cnt := 0 + for _, li := range loopInvariants { + for _, val := range loopLand.Values { + if val == fun.values[li] { + cnt++ + break + } + } + } + + if cnt != len(loopInvariants) { + fmt.Printf("== After LICM: %v\n", fun.f.String()) + t.Errorf("Error: loop invariant not found in loop land") + } +} + +// Hoist simple arithmetic loop invariant +// +// for i := 0; i < 10; i++ { +// li := 10 * 10 +// } +func TestHoistSimpleLI(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("ten", OpConst64, c.config.Types.Int64, 10, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("li", OpMul64, c.config.Types.Int64, 0, nil, "ten", "one"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "ten"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Goto("loopHeader")), + Bloc("loopExit", + Exit("mem"))) + + doLICM(fun) + checkHoist(t, fun, "li") +} + +// Hoist simple arithmetic but may trap execution +// +// func foo(arg1 int) +// for i := 0; i < 10; i++ { +// li := (10*10) / arg1 /*may be zero*/ +// } +// } +func TestHoistTrapDiv(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("arg1", OpArg, c.config.Types.Int64, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("ten", OpConst64, c.config.Types.Int64, 10, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("li", OpMul64, c.config.Types.Int64, 0, nil, "ten", "one"), + Valu("li2", OpDiv64, c.config.Types.Int64, 0, nil, "li", "arg1"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "ten"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Goto("loopHeader")), + Bloc("loopExit", + Exit("mem"))) + + doLICM(fun) + checkHoist(t, fun, "li", "li2") +} + +// Hoist load from loop +func TestHoistLoad(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("sb", OpSB, c.config.Types.Uintptr, 0, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + 
Valu("addr1", OpAddr, c.config.Types.Int64.PtrTo(), 0, nil, "sb"), + Valu("load", OpLoad, c.config.Types.Int8, 0, nil, "addr1", "mem"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "load"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "load", "i"), + Goto("loopHeader")), + Bloc("loopExit", + Exit("mem"))) + + doLICM(fun) + checkHoist(t, fun, "load", "addr1") +} + +func TestHoistStore(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("sb", OpSB, c.config.Types.Uintptr, 0, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("addr", OpAddr, c.config.Types.Int8.PtrTo(), 0, nil, "sb"), + Valu("store", OpStore, types.TypeMem, 0, nil, "addr", "one", "mem"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "one"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Goto("loopHeader")), + Bloc("loopExit", + Exit("mem"))) + + doLICM(fun) + checkHoist(t, fun, "store", "addr") +} + +// Hoist nil check from loop +func TestHoistNilCheck(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("sb", OpSB, c.config.Types.Uintptr, 0, nil), + Valu("addr", OpAddr, c.config.Types.Int8.PtrTo(), 0, nil, "sb"), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("nilcheck", OpNilCheck, c.config.Types.IntPtr, 0, nil, "addr", "mem"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "one"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Goto("loopHeader")), + Bloc("loopExit", + Exit("mem"))) + + doLICM(fun) + checkHoist(t, fun, "nilcheck") +} diff --git a/src/cmd/compile/internal/ssa/likelyadjust.go b/src/cmd/compile/internal/ssa/likelyadjust.go index 1d0e53cf5b6086..78313fb2844a88 100644 --- a/src/cmd/compile/internal/ssa/likelyadjust.go +++ b/src/cmd/compile/internal/ssa/likelyadjust.go @@ -4,26 +4,81 @@ package ssa -import ( - "fmt" -) - +import "fmt" + +// ---------------------------------------------------------------------------- +// The Loop +// +// The natural loop usually looks like in below IR form: +// +// loop entry +// │ +// │ ┌───loop latch +// ▼ ▼ ▲ +// loop header │ +// │ │ │ +// │ └──►loop body +// ▼ +// loop exit +// +// In the terminology, loop entry dominates the entire loop, loop header contains +// the loop conditional test, loop body refers to the code that is repeated, loop +// latch contains the backedge to loop header, for simple loops, the loop body is +// equal to loop latch, and loop exit refers to the block that dominated by the +// entire loop. 
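+//
+// In Go source terms (illustrative), for a loop such as
+//
+//	for i := 0; i < n; i++ { body }
+//
+// the entry computes the initial value of i, the header holds the i < n test,
+// the body and latch hold the loop body and the i++ increment with the
+// backedge, and the exit is the code that runs after the loop.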
+// +// After loop rotation, the loop will be transformed to below form with additional +// guard block and land block: +// +// loop entry +// │ +// │ +// ▼ +// ┌──loop guard +// │ │ +// │ │ +// │ ▼ +// | loop land <= safe land to place Values +// │ │ +// │ │ +// │ ▼ +// │ loop header◄──┐ +// │ │ │ +// │ │ │ +// │ ▼ │ +// │ loop body │ +// │ │ │ +// │ │ │ +// │ ▼ │ +// │ loop latch────┘ +// │ │ +// │ │ +// │ ▼ +// └─► loop exit +// +// Where loop guard ensures the loop body is executed at least once, and loop +// land is a safe place to place Values that are moved out of the loop and only +// executed once before the loop. type loop struct { header *Block // The header node of this (reducible) loop - outer *loop // loop containing this loop + entry *Block // loop entry which jumps to header directly + exit *Block // The unique main exit block of this loop, if any + latch *Block // Source of backedge, where increment happens + body *Block // The first loop body, near to the header + guard *Block // Ensure loop executed at least once after rotation + land *Block // Safe land block to place instructions after rotation + outer *loop // Outer loop containing this loop // By default, children, exits, and depth are not initialized. children []*loop // loops nested directly within this loop. Initialized by assembleChildren(). exits []*Block // exits records blocks reached by exits from this loop. Initialized by findExits(). - // Next three fields used by regalloc and/or + // Next four fields used by regalloc and/or // aid in computation of inner-ness and list of blocks. - nBlocks int32 // Number of blocks in this loop but not within inner loops - depth int16 // Nesting depth of the loop; 1 is outermost. Initialized by calculateDepths(). - isInner bool // True if never discovered to contain a loop - - // register allocation uses this. - containsUnavoidableCall bool // True if all paths through the loop have a call + nBlocks int32 // Number of blocks in this loop but not within inner loops + depth int16 // Nesting depth of the loop; 1 is outermost. Initialized by calculateDepths(). 
+ isInner bool // True if never discovered to contain a loop + containsUnavoidableCall bool // True if all paths through the loop have a call } // outerinner records that outer contains inner @@ -63,7 +118,7 @@ func checkContainsCall(bb *Block) bool { type loopnest struct { f *Func - b2l []*loop + b2l []*loop // block id to loop mapping po []*Block sdom SparseTree loops []*loop @@ -236,21 +291,36 @@ func likelyadjust(f *Func) { } func (l *loop) String() string { - return fmt.Sprintf("hdr:%s", l.header) + return fmt.Sprintf("Loop@%s", l.header) } -func (l *loop) LongString() string { - i := "" - o := "" - if l.isInner { - i = ", INNER" - } - if l.outer != nil { - o = ", o=" + l.outer.header.String() +func (loop *loop) LongString() string { + // Loop: loop header + // T: loop entry + // B: loop body + // E: loop exit + // L: loop latch + // G: loop guard + // + // * denotes main loop exit + if len(loop.exits) == 1 { + return fmt.Sprintf("Loop@%v(B@%v E@%v L@%v G@%v T@%v)", + loop.header, loop.body, loop.exit, loop.latch, loop.guard, loop.entry) + } else { + s := "" + for i, exit := range loop.exits { + s += exit.String() + if exit == loop.exit { + s += "*" + } + if i != len(loop.exits)-1 { + s += " " + } + } + return fmt.Sprintf("Loop@%v(B@%v E@(%v) L@%v G@%v T@%v)", + loop.header, loop.body, s, loop.latch, loop.guard, loop.entry) } - return fmt.Sprintf("hdr:%s%s%s", l.header, i, o) } - func (l *loop) isWithinOrEq(ll *loop) bool { if ll == nil { // nil means whole program return true @@ -511,6 +581,18 @@ func (ln *loopnest) calculateDepths() { ln.initializedDepth = true } +func removeDuplicate(blocks []*Block) []*Block { + allKeys := make(map[*Block]bool) + list := []*Block{} + for _, item := range blocks { + if _, value := allKeys[item]; !value { + allKeys[item] = true + list = append(list, item) + } + } + return list +} + // findExits uses loop depth information to find the // exits from a loop. func (ln *loopnest) findExits() { @@ -521,17 +603,29 @@ func (ln *loopnest) findExits() { b2l := ln.b2l for _, b := range ln.po { l := b2l[b.ID] - if l != nil && len(b.Succs) == 2 { - sl := b2l[b.Succs[0].b.ID] - if recordIfExit(l, sl, b.Succs[0].b) { - continue - } - sl = b2l[b.Succs[1].b.ID] - if recordIfExit(l, sl, b.Succs[1].b) { - continue + if l != nil { + if len(b.Succs) == 2 { + sl := b2l[b.Succs[0].b.ID] + if recordExit(l, sl, b.Succs[0].b) { + continue + } + sl = b2l[b.Succs[1].b.ID] + if recordExit(l, sl, b.Succs[1].b) { + continue + } + } else if len(b.Succs) > 2 { // JumpTable + assert(b.Kind == BlockJumpTable, "why not otherwise") + for _, s := range b.Succs { + sl := b2l[s.b.ID] + recordExit(l, sl, s.b) + } } } } + // Remove duplicated exits for every loop + for _, loop := range ln.loops { + loop.exits = removeDuplicate(loop.exits) + } ln.initializedExits = true } @@ -543,10 +637,10 @@ func (ln *loopnest) depth(b ID) int16 { return 0 } -// recordIfExit checks sl (the loop containing b) to see if it +// recordExit checks sl (the loop containing b) to see if it // is outside of loop l, and if so, records b as an exit block // from l and returns true. 
-func recordIfExit(l, sl *loop, b *Block) bool { +func recordExit(l, sl *loop, b *Block) bool { if sl != l { if sl == nil || sl.depth <= l.depth { l.exits = append(l.exits, b) @@ -578,3 +672,45 @@ func (l *loop) setDepth(d int16) { func (l *loop) iterationEnd(b *Block, b2l []*loop) bool { return b == l.header || b2l[b.ID] == nil || (b2l[b.ID] != l && b2l[b.ID].depth <= l.depth) } + +// contains checks if receiver loop contains inner loop in any depth +func (loop *loop) contains(inner *loop) bool { + // Find from current loop + for _, child := range loop.children { + if child == inner { + return true + } + } + // Find from child of current loop + for _, child := range loop.children { + if child.contains(inner) { + return true + } + } + return false +} + +// findLoopBlocks returnss all basic blocks, including those contained in nested loops. +func (ln *loopnest) findLoopBlocks(loop *loop) []*Block { + ln.assembleChildren() + loopBlocks := make([]*Block, 0) + for id, tloop := range ln.b2l { + if tloop == nil { + continue + } + if tloop == loop { + // Find block by id and append it + for _, block := range ln.f.Blocks { + if int32(block.ID) == int32(id) { + loopBlocks = append(loopBlocks, block) + break + } + } + } else if loop.contains(tloop) { + // Otherwise, check if this block is within inner loops + blocks := ln.findLoopBlocks(tloop) + loopBlocks = append(loopBlocks, blocks...) + } + } + return loopBlocks +} diff --git a/src/cmd/compile/internal/ssa/looprotate.go b/src/cmd/compile/internal/ssa/looprotate.go index 844a8f712447c9..4574e5508f88fc 100644 --- a/src/cmd/compile/internal/ssa/looprotate.go +++ b/src/cmd/compile/internal/ssa/looprotate.go @@ -1,12 +1,1111 @@ -// Copyright 2017 The Go Authors. All rights reserved. +// Copyright 2023 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package ssa -// loopRotate converts loops with a check-loop-condition-at-beginning -// to loops with a check-loop-condition-at-end. -// This helps loops avoid extra unnecessary jumps. +import ( + "fmt" + "sort" +) + +// ---------------------------------------------------------------------------- +// Loop Rotation +// +// Loop rotation transforms while/for loop to do-while style loop. The original +// natural loop is in form of below IR +// +// loop entry +// │ +// │ ┌───loop latch +// ▼ ▼ ▲ +// loop header │ +// │ │ │ +// │ └──►loop body +// ▼ +// loop exit +// +// We move the conditional test from loop header to loop latch, incoming backedge +// argument of conditional test should be updated as well otherwise we would lose +// one update. Also note that any other uses of moved values should be updated +// because moved Values now live in loop latch and may no longer dominates their +// uses. At this point, loop latch determines whether loop continues or exits +// based on rotated test. +// +// loop entry +// │ +// │ +// ▼ +// loop header◄──┐ +// │ │ +// │ │ +// ▼ │ +// loop body │ +// │ │ +// │ │ +// ▼ │ +// loop latch────┘ +// │ +// │ +// ▼ +// loop exit +// +// Now loop header and loop body are executed unconditionally, this may changes +// program semantics while original program executes them only if test is okay. +// A so-called loop guard is inserted to ensure loop is executed at least once. 
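+//
+// In source terms (illustrative pseudo code), rotation plus the guard rewrites
+//
+//	for cond { body }
+//
+// into
+//
+//	if cond {            // loop guard
+//		do { body } while cond
+//	}
+//
+// so the conditional test now sits at the bottom of the loop (the latch) and
+// the guard preserves the zero-iteration behavior of the original loop.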
+// +// loop entry +// │ +// │ +// ▼ +// ┌──loop guard +// │ │ +// │ │ +// │ ▼ +// │ loop header◄──┐ +// │ │ │ +// │ │ │ +// │ ▼ │ +// │ loop body │ +// │ │ │ +// │ │ │ +// │ ▼ │ +// │ loop latch────┘ +// │ │ +// │ │ +// │ ▼ +// └─► loop exit +// +// Loop header no longer dominates entire loop, loop guard dominates it instead. +// If Values defined in the loop were used outside loop, all these uses should be +// replaced by a new Phi node at loop exit which merges control flow from loop +// header and loop guard. Based on Loop Closed SSA Form, these Phis have already +// been created. All we need to do is simply reset their operands to accurately +// reflect the fact that loop exit is a merge point now. +// +// One of the main purposes of Loop Rotation is to assist other optimizations +// such as LICM. They may require that the rotated loop has a proper while safe +// block to place new Values, an optional loop land block is hereby created to +// give these optimizations a chance to keep them from being homeless. +// +// loop entry +// │ +// │ +// ▼ +// ┌──loop guard +// │ │ +// │ │ +// │ ▼ +// | loop land <= safe land to place Values +// │ │ +// │ │ +// │ ▼ +// │ loop header◄──┐ +// │ │ │ +// │ │ │ +// │ ▼ │ +// │ loop body │ +// │ │ │ +// │ │ │ +// │ ▼ │ +// │ loop latch────┘ +// │ │ +// │ │ +// │ ▼ +// └─► loop exit +// +// The detailed loop rotation algorithm is summarized as following steps +// +// 1. Transform the loop to Loop Closed SSA Form +// * All uses of loop defined Values will be replaced by uses of proxy phis +// +// 2. Check whether loop can apply loop rotate +// * Loop must be a natural loop and have a single exit and so on.. +// +// 3. Rotate loop conditional test and rewire loop edges +// * Rewire loop header to loop body unconditionally. +// * Rewire loop latch to header and exit based on new conditional test. +// * Create new loop guard block and rewire loop entry to loop guard. +// * Clone conditional test from loop header to loop guard. +// * Rewire loop guard to original loop header and loop exit +// +// 4. 
Reconcile broken data dependencies after the CFG transformation
+//   * Move the conditional test from loop header to loop latch
+//   * Update uses of moved Values because these defs no longer dominate their
+//     uses after they were moved to loop latch
+//   * Add the corresponding argument for phis at loop exits since a new edge
+//     from loop guard to loop exit has been created
+//   * Update proxy phis to use the loop phi's incoming argument which comes
+//     from loop latch since loop latch may terminate the loop now
+
+// checkLoopForm checks if the loop is well formed and returns the failure
+// reason if not.
+func (loop *loop) checkLoopForm(fn *Func, sdom SparseTree) string {
+	loopHeader := loop.header
+	// Check if the loop header is a well formed block
+	if len(loopHeader.Preds) != 2 || len(loopHeader.Succs) != 2 ||
+		loopHeader.Kind != BlockIf {
+		return "bad loop header"
+	}
+
+	// Check if a loop exit is adjacent to the loop header
+	fn.loopnest().findExits() // initialize loop exits
+	e1, e2 := loopHeader.Succs[1].b, loopHeader.Succs[0].b
+	found := false
+	for _, exit := range loop.exits {
+		if exit == e1 {
+			loop.exit = e1
+			loop.body = loopHeader.Succs[0].b
+			found = true
+			break
+		} else if exit == e2 {
+			loop.exit = e2
+			loop.body = loopHeader.Succs[1].b
+			found = true
+			break
+		}
+	}
+	if !found {
+		return "far loop exit beyond header"
+	}
+
+	loop.latch = loopHeader.Preds[1].b
+
+	// Check if the loop header dominates all loop exits
+	if len(loop.exits) != 1 {
+		for _, exit := range loop.exits {
+			if exit == loop.exit {
+				continue
+			}
+			// The loop header may not dominate all loop exits; give up on
+			// these exotic cases
+			if !sdom.IsAncestorEq(loopHeader, exit) {
+				return "loop exit is not dominated by header"
+			}
+		}
+	}
+
+	// Check that the loop conditional test is "trivial"
+	for _, ctrl := range loop.header.ControlValues() {
+		if !loop.isTrivial(sdom, ctrl, true) {
+			return "non trivial loop cond"
+		}
+	}
+
+	// Check that all loop uses are "trivial"
+	for ipred, pred := range loop.exit.Preds {
+		if pred.b == loop.header {
+			for _, val := range loop.exit.Values {
+				// TODO: Relax or remove this restriction
+				if val.Op == OpPhi {
+					if arg := val.Args[ipred]; arg.Block == loop.header {
+						if !loop.isTrivial(sdom, arg, false) {
+							return "use non trivial loop def outside loop"
+						}
+					}
+				} else if val.Block == loop.header {
+					if !loop.isTrivial(sdom, val, false) {
+						return "use non trivial loop def outside loop"
+					}
+				}
+			}
+			break
+		}
+	}
+	return ""
+}
+
+// A loop def is "trivial" if, starting from that value and walking its argument
+// chain until the loop phi defined in the loop header is reached, no intractable
+// values are encountered along the way and the lookup depth does not exceed
+// MaxDepth. We need this restriction because every value in the chain from the
+// loop phi to the trivial loop def may be cloned into another block, and cloning
+// without careful scrutiny would lead to code bloat and an extra performance
+// penalty.
+const (
+	InitDepth = 0
+	MaxDepth  = 5
+)
+
+type loopTrivialVal struct {
+	cloning  bool
+	valBlock *Block
+	touched  map[*Value]*Value
+	visited  map[*Value]bool
+}
+
+func (t *loopTrivialVal) clone(val *Value, dest *Block, depth int) *Value {
+	// A Phi, or a value that lives in a different block than the source block,
+	// cannot be part of the trivial loop def chain; do nothing
+	if val.Op == OpPhi || val.Block != t.valBlock {
+		return val
+	}
+
+	// If val is already cloned, use the cloned value instead.
+ if c, exist := t.touched[val]; exist { + return c + } + + // Clone val and its arguments recursively + clone := dest.Func.newValueNoBlock(val.Op, val.Type, val.Pos) + clone.AuxInt = val.AuxInt + clone.Aux = val.Aux + args := make([]*Value, len(val.Args)) + for i := 0; i < len(val.Args); i++ { + args[i] = t.clone(val.Args[i], dest, depth+1) + } + clone.AddArgs(args...) + dest.placeValue(clone) + t.touched[val] = clone // cache cloned value after cloning its arguments + return clone +} + +func (t *loopTrivialVal) move(val *Value, dest *Block, depth int) { + if val.Op == OpPhi || val.Block != t.valBlock { + return + } + for _, arg := range val.Args { + t.move(arg, dest, depth+1) + } + moveTo(val, dest) +} + +func (t *loopTrivialVal) update(val *Value, loop *loop, loopPhiIdx, depth int) { + // It's a Phi or value that lives different from source block? It must not + // part of trivial loop def chain, do nothing + if val.Op == OpPhi || val.Block != t.valBlock { + return + } + if _, hasCycle := t.visited[val]; hasCycle { + // Just skip it to avoid infinite recursion + return + } + t.visited[val] = true + for iarg, arg := range val.Args { + // If arg of val is a Phi which lives in loop header? + if arg.Op == OpPhi && arg.Block == loop.header { + // If expected incoming argument of arg is not visited, this implies + // that it may comes from loop latch, this is the most common case, + // update val to use incoming argument instead of arg. Otherwise, + // there is a cyclic dependency, see below for more details. + newUse := arg.Args[loopPhiIdx] + if _, livesInHeader := t.touched[newUse]; !livesInHeader { + // In original while/for loop, a critical edge is inserted at the + // end of each iteration, Phi values are updated. All subsequent + // uses of Phi rely on updated values. However, when converted + // to a do-while loop, Phi nodes may be used at the end of each + // iteration before they are updated. Therefore, we need to + // replace all subsequent uses of Phi with use of Phi parameter. + // This way, it is equivalent to using updated values of Phi + // values. Here is a simple example: + // + // Normal case, if v2 uses v1 phi, and the backedge operand v4 + // of v1 phi is located in the loop latch block, we only need to + // modify the usage of v1 by v2 to the usage of v4. This prevents + // loss of updates, and the dominance relationship will not be + // broken even after v2 is moved to the loop latch. + // + // Before: + // loop header: + // v1 = phi(0, v4) + // v2 = v1 + 1 + // If v2 < 3 -> loop body, loop exit + // + // loop latch: + // v4 = const 512 + // + // After: + // loop header: + // v1 = phi(0, v4) + // + // loop latch: + // v4 = const 512 + // v2 = v4 + 1 + // If v2 < 3 -> loop header, loop exit + val.SetArg(iarg, newUse) + // After updating uses of val, we may create yet another cyclic + // dependency, i.e. + // + // loop header: + // v1 = phi(0, v4) + // v2 = v1 + 1 + // If v2 < 3 -> loop body, loop exit + // + // loop latch: + // v4 = v2 + 1 + // + // After updating iarg of val to newUse, it becomes + // + // loop header: + // v1 = phi(0, v4) + // + // loop latch: + // v2 = v4 + 1 ;;; cyclic dependency + // v4 = v2 + 1 + // If v2 < 3 -> loop header, loop exit + // + // This is similiar to below case, and it would be properly handled + // by updateMovedUses. For now, we just skip it to avoid infinite + // recursion. 
+ } else { + // If there is a value v1 in the loop header that is used to define + // a v2 phi in the same basic block, and this v2 phi is used in + // turn to use the value v1, there is a cyclic dependency, i.e. + // + // loop header: + // v1 = phi(0, v2) ;;; cyclic dependency + // v2 = v1 + 1 + // If v2 < 3 -> loop body, loop exit + // + // In this case, we need to first convert the v1 phi into its + // normal form, where its back edge parameter uses the value defined + // in the loop latch. + // + // loop header: + // v1 = phi(0, v3) + // v2 = v1 + 1 + // If v2 < 3 -> loop body, loop exit + // + // loop latch: + // v3 = Copy v2 + // + // After this, the strange v1 phi is treated in the same way as + // other phis. After moving the conditional test to the loop latch, + // the relevant parameters will also be updated, i.e., v2 will + // use v3 instead of v1 phi: + // + // loop header: + // v1 = phi(0, v3) + // + // loop latch: + // v3 = Copy v2 + // v2 = v3 + 1 + // If v2 < 3 -> loop header, loop exit + // + // Finally, since v3 is use of v2, after moving v2 to the loop + // latch, updateMovedUses will update these uses and insert a + // new v4 Phi. + // + // loop header: + // v1 = phi(0, v3) + // v4 = phi(v2', v2) ;;; v2' lives in loop guard + // + // loop latch: + // v3 = Copy v4 + // v2 = v3 + 1 + // If v2 < 3 -> loop header, loop exit + + // Copy from cyclic dependency value and place it to loop latch + fn := arg.Block.Func + copy := fn.newValueNoBlock(OpCopy, arg.Type, arg.Pos) + if t.cloning { + // If we are cloning, we need to be very careful when updating + // the clonee, not the clone, otherwise, it can lead to another + // disastrous circular dependencies, e.g. + // + // loop header: + // v1 = phi(0, v3) + // + // loop latch: + // v3 = Copy v2 + // v2 = v3 + 1 + // If v2 < 3 -> loop header, loop exit + // + // critical block(between loop latch and loop exit): + // v3' = Copy v2 ;;; copy from v2 instead of v2' + // v2' = v3' + 1 + for clonee, clone := range t.touched { + if clone == val { + copy.SetArgs1(clonee) + break + } + } + if len(copy.Args) == 0 { + fn.Fatalf("can not found clone from clonee") + } + } else { + copy.SetArgs1(newUse) + } + loop.latch.placeValue(copy) + // Replace incoming argument of loop phi to copied value + arg.SetArg(loopPhiIdx, copy) + // Update val to use copied value as usual + val.SetArg(iarg, copy) + + if fn.pass.debug > 1 { + fmt.Printf("== Insert %v during updating %v\n", copy, val) + } + } + } else { + t.update(arg, loop, loopPhiIdx, depth+1) + } + } +} + +func (t *loopTrivialVal) valid(sdom SparseTree, val *Value, allowSideEffect bool, depth int) bool { + if depth >= MaxDepth { + return false + } + + if sdom.isAncestor(val.Block, t.valBlock) { + return true + } + + if val.Op == OpPhi { + if val.Block == t.valBlock { + return true + } + return false + } + + if !allowSideEffect { + if val.Op != OpLoad && isAccessMemory(val) { + return false + } + } + + for _, arg := range val.Args { + if !t.valid(sdom, arg, allowSideEffect, depth+1) { + return false + } + } + return true +} + +// isTrivial checks if val is "trivial" and returns true if it is, otherwise false. 
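+//
+// A rough illustration, assuming MaxDepth = 5 as defined above:
+//
+//	loop header:
+//	v1 = phi(0, v4)
+//	v2 = v1 + 1            ;;; trivial, one step away from the loop phi
+//	If v2 < 3 -> loop body, loop exit
+//
+// Here v2 is a trivial loop def: walking its arguments reaches the loop phi v1
+// within MaxDepth steps. A def whose argument chain exceeds MaxDepth, or (when
+// allowSideEffect is false) one that touches memory other than a plain Load,
+// is not trivial.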
+func (loop *loop) isTrivial(sdom SparseTree, val *Value, allowSideEffect bool) bool { + t := &loopTrivialVal{ + valBlock: loop.header, + } + return t.valid(sdom, val, allowSideEffect, InitDepth) +} + +// cloneTrivial clones val to destination block and updates its uses accordingly +func (loop *loop) cloneTrivial(val *Value, dest *Block, loopPhiIdx int) (*Value, map[*Value]*Value) { + t := &loopTrivialVal{ + cloning: true, + valBlock: val.Block, + touched: make(map[*Value]*Value), + visited: make(map[*Value]bool), + } + clone := t.clone(val, dest, InitDepth) + t.valBlock = dest + t.update(clone, loop, loopPhiIdx, InitDepth) + return clone, t.touched +} + +// moveTrivial moves val to destination block and updates its uses accordingly +func (loop *loop) moveTrivial(val *Value, dest *Block, cloned map[*Value]*Value, loopPhiIdx int) { + t := &loopTrivialVal{ + cloning: false, + valBlock: val.Block, + visited: make(map[*Value]bool), + } + t.move(val, dest, InitDepth) + t.valBlock = dest + t.touched = cloned + t.update(val, loop, loopPhiIdx, InitDepth) +} + +// moveCond moves conditional test from loop header to loop latch +func (loop *loop) moveCond(cond *Value, cloned map[*Value]*Value) { + if cond.Block != loop.header { + // More rare, ctrl Value is not live in loop header, do nothing + return + } + + if cond.Op == OpPhi { + // Rare case, Phi is used as conditional test, use its incoming argument + // If (Phi v1 v2) -> loop body, loop exit + // => If v1 -> loop header, loop exit + cond = cond.Args[LoopLatch2HeaderPredIdx] + loop.latch.SetControl(cond) + return + } + + // Normal case, update as usual + // If (Less v1 Phi(v2 v3)) -> loop body, loop exit + // => If (Less v1 v2) -> loop header, loop exit + loop.moveTrivial(cond, loop.latch, cloned, LoopLatch2HeaderPredIdx) +} + +// cloneCond clones conditional test from loop header to loop guard +func (loop *loop) cloneCond(cond *Value) (*Value, map[*Value]*Value) { + if cond.Block != loop.header { + // Dont clone if ctrl Value is not live in loop header + return cond, nil + } + + if cond.Op == OpPhi { + // Use incoming argument of Phi as conditional test directly + guardCond := cond.Args[LoopGuard2HeaderPredIdx] + return guardCond, nil + } + + // Clone as usual + return loop.cloneTrivial(cond, loop.guard, LoopGuard2HeaderPredIdx) +} + +const ( + LoopGuard2HeaderPredIdx = 0 + LoopLatch2HeaderPredIdx = 1 +) + +// rewireLoopHeader rewires loop header to loop body unconditionally +func (loop *loop) rewireLoopHeader() { + loopHeader := loop.header + loopHeader.Reset(BlockPlain) + + // loopHeader -> loopBody(0) + loopHeader.Succs = loopHeader.Succs[:1] + loopHeader.Succs[0] = Edge{loop.body, 0} + assert(len(loop.body.Preds) == 1, "why not otherwise") + loop.body.Preds[0] = Edge{loopHeader, 0} +} + +// rewireLoopLatch rewires loop latch to loop header and loop exit +func (loop *loop) rewireLoopLatch(ctrl *Value, exitIdx int) { + loopExit := loop.exit + loopLatch := loop.latch + loopHeader := loop.header + loopLatch.resetWithControl(BlockIf, ctrl) + loopLatch.Likely = loopHeader.Likely + loopLatch.Pos = ctrl.Pos + loopHeader.Likely = BranchUnknown + + var idx = -1 + for i := 0; i < len(loopExit.Preds); i++ { + if loopExit.Preds[i].b == loop.header { + idx = i + break + } + } + if exitIdx == 1 { + // loopLatch -> loopHeader(0), loopExit(1) + loopLatch.Succs = append(loopLatch.Succs, Edge{loopExit, idx}) + } else { + // loopLatch -> loopExit(0), loopHeader(1) + loopLatch.Succs = append([]Edge{{loopExit, idx}}, loopLatch.Succs[:]...) 
+ } + // loopExit <- loopLatch, ... + loopExit.Preds[idx] = Edge{loopLatch, exitIdx} + // loopHeader <- loopLatch, ... + for i := 0; i < len(loopHeader.Preds); i++ { + if loopHeader.Preds[i].b == loopLatch { + idx = i + break + } + } + loopHeader.Preds[idx] = Edge{loopLatch, 1 - exitIdx} +} + +// rewireLoopGuard rewires loop guard to loop header and loop exit +func (loop *loop) rewireLoopGuard(guardCond *Value, exitIdx int) { + assert(len(loop.guard.Preds) == 1, "already setup") + loopHeader := loop.header + loopGuard := loop.guard + loopGuard.Pos = loopHeader.Pos + loopGuard.Likely = loopHeader.Likely // respect header's branch predication + loopGuard.SetControl(guardCond) + + var idx = -1 + assert(len(loopHeader.Preds) == 2, "sanity check") + for i := 0; i < len(loopHeader.Preds); i++ { + if loopHeader.Preds[i].b != loop.latch { + idx = i + break + } + } + + loopExit := loop.exit + numExitPred := len(loopExit.Preds) + if exitIdx == 1 { + // loopGuard -> loopHeader(0), loopExit(1) + loopGuard.Succs = append(loopGuard.Succs, Edge{loopHeader, idx}) + loopGuard.Succs = append(loopGuard.Succs, Edge{loopExit, numExitPred}) + loopExit.Preds = append(loopExit.Preds, Edge{loopGuard, 1}) + loopHeader.Preds[idx] = Edge{loopGuard, 0} + } else { + // loopGuard -> loopExit(0), loopHeader(1) + loopGuard.Succs = append(loopGuard.Succs, Edge{loopExit, numExitPred}) + loopGuard.Succs = append(loopGuard.Succs, Edge{loopHeader, idx}) + loopExit.Preds = append(loopExit.Preds, Edge{loopGuard, 0}) + loopHeader.Preds[idx] = Edge{loopGuard, 1} + } +} + +// rewireLoopEntry rewires loop entry to loop guard +func (loop *loop) rewireLoopEntry(loopGuard *Block) { + assert(len(loop.header.Preds) == 2, "sanity check") + + // Find loop entry from predecessor of loop header + for _, pred := range loop.header.Preds { + if pred.b != loop.latch { + loop.entry = pred.b + break + } + } + assert(loop.entry != nil, "missing loop entry") + + // If loop entry is plain block, simply add edge from loop entry to guard + loopEntry := loop.entry + if len(loopEntry.Succs) == 1 { + // loopEntry(0) -> loopGuard + loopEntry.Succs = loopEntry.Succs[:0] + loopEntry.AddEdgeTo(loopGuard) + } else { + // Rewire corresponding successor of loop entry to loop guard (This could + // be constructed in artificial IR test, but does it really happen?...) + var idx = -1 + for isucc, succ := range loopEntry.Succs { + if succ.b == loop.header { + idx = isucc + break + } + } + // loopEntry(idx) -> loopGuard, ... + loopEntry.Succs[idx] = Edge{loopGuard, 0} + loopGuard.Preds = append(loopGuard.Preds, Edge{loopEntry, idx}) + } +} + +// insertBetween inserts an empty block in the middle of start and end block. +// If such block already exists, it will be returned instead. +func insertBetween(fn *Func, start, end *Block) *Block { + for _, succ := range start.Succs { + if succ.b == end { + break + } else if len(succ.b.Succs) == 1 && succ.b.Succs[0].b == end { + return succ.b + } + } + empty := fn.NewBlock(BlockPlain) + empty.Preds = make([]Edge, 1, 1) + empty.Succs = make([]Edge, 1, 1) + start.ReplaceSucc(end, empty, 0) + end.ReplacePred(start, empty, 0) + return empty +} + +func (loop *loop) findLoopGuardIndex() int { + if loop.header.Preds[0].b == loop.latch { + return 1 + } + return 0 +} + +func (loop *loop) findLoopBackedgeIndex() int { + return 1 - loop.findLoopGuardIndex() +} + +// Loop header no longer dominates loop exit, a new edge from loop guard to loop +// exit is created, this is not reflected in proxy phis in loop exits, i.e. 
these +// proxy phis miss one argument that comes from loop guard, we need to reconcile +// the divergence +// +// loop guard +// | +// loop exit loop exit / +// | => | / +// v1=phi(v1) v1=phi(v1 v1') <= add missing g2e argument v1' +// +// Since LCSSA ensures that all loop uses are closed, i.e. any out-of-loop uses +// are replaced by proxy phis in loop exit, we only need to add missing argument +// v1' to v1 proxy phi +func (loop *loop) addG2EArg(fn *Func, sdom SparseTree) { + var holder *Block + for _, val := range loop.exit.Values { + // Not even a phi? + if val.Op != OpPhi { + continue + } + // Num of args already satisfies the num of predecessors of loop exit? + if len(val.Args) == len(loop.exit.Preds) { + continue + } + if len(val.Args)+1 != len(loop.exit.Preds) { + fn.Fatalf("Only miss one g2e arg") + } + assert(val.Block == loop.exit, "sanity check") + + // If arguments of the phi is not matched with predecessors of loop exit, + // then add corresponding g2e argument to reflect the new edge from loop + // guard to loop exit + var g2eArg *Value // loop guard to loop exit + for iarg, arg := range val.Args { + exitPred := val.Block.Preds[iarg].b + // If this predecessor is either loop header or inserted block? + if exitPred == loop.latch || exitPred == holder { + if sdom.isAncestor(arg.Block, loop.header) { + // arg lives in block that dominates loop header, it could + // be used as g2eArg directly + g2eArg = arg + } else if arg.Block == loop.header { + // arg lives in loop header, find its counterpart from loop + // guard or create a new one if not exist + guardIdx := loop.findLoopGuardIndex() + + // It's a phi? Simply use its incoming argument that comes + // from loop guard as g2eArg + if arg.Op == OpPhi { + g2eArg = arg.Args[guardIdx] + } else { + // Otherwise, split critical edge from loop guard to exit + // and clone arg into new block, it becomes new g2eArg + holder = insertBetween(fn, loop.guard, loop.exit) + guardArg, _ := loop.cloneTrivial(arg, holder, guardIdx) + g2eArg = guardArg + } + } + } + } + + // Add g2e argument for phi to reconcile the divergence between the num + // of block predecessors and the num of phi arguments + if g2eArg == nil { + fn.Fatalf("Can not create new g2e arg for %v", val.LongString()) + } + newArgs := make([]*Value, len(loop.exit.Preds)) + copy(newArgs, val.Args) + newArgs[len(newArgs)-1] = g2eArg + oldVal := val.LongString() + val.resetArgs() + val.AddArgs(newArgs...) + if fn.pass.debug > 1 { + fmt.Printf("== Add g2e argument %v to %v(%v)\n", + g2eArg, val.LongString(), oldVal) + } + } +} + +func (loop *loop) findGuardArg(fn *Func, val *Value) *Value { + assert(val.Block == loop.header, "mirror comes from loop header") + guardIdx := loop.findLoopGuardIndex() + + // It's a phi? Simply use its incoming argument that comes from loop guard + // as counterpart + if val.Op == OpPhi { + return val.Args[guardIdx] + } + + // Otherwise, split critical edge from loop guard to loop exit and + // clone arg into the new block, this is the new counterpart + holder := insertBetween(fn, loop.guard, loop.exit) + guardArg, _ := loop.cloneTrivial(val, holder, guardIdx) + return guardArg +} + +func (loop *loop) findBackedgeArg(fn *Func, val *Value, start, end *Block) *Value { + assert(val.Block == loop.header, "mirror comes from loop header") + backedgeIdx := loop.findLoopBackedgeIndex() + + // It's a phi? 
Simply use its incoming argument that comes from loop latch + // as counterpart + if val.Op == OpPhi { + return val.Args[backedgeIdx] + } + + // Otherwise, split edge from start to end and clone arg into the new block, + // this is the new counterpart + holder := insertBetween(fn, start, end) + backedgeArg, _ := loop.cloneTrivial(val, holder, backedgeIdx) + return backedgeArg +} + +// Loop latch now terminates the loop. If proxy phi uses the loop phi that lives +// in loop header, it should be replaced by using the loop phi's incoming argument +// which comes from loop latch instead, this avoids losing one update. +// +// Before: +// loop header: +// v1 = phi(0, v4) +// +// loop latch: +// v4 = v1 + 1 +// +// loop exit +// v3 = phi(v1, ...) +// +// After: +// loop header: +// v1 = phi(0, v4) +// +// loop latch: +// v4 = v1 + 1 +// +// loop exit +// v3 = phi(v4, ...) ;; use v4 instead of v1 +func (loop *loop) updateLoopUse(fn *Func) { + fn.invalidateCFG() + sdom := fn.Sdom() + + for _, loopExit := range loop.exits { + // The loop exit is still dominated by loop header? + if sdom.isAncestor(loop.header, loopExit) { + continue + } + // Loop header no longer dominates this loop exit, find the corresponding + // incoming argument and update it for every phi in exit block + for _, val := range loopExit.Values { + if val.Op != OpPhi { + continue + } + + sdom := fn.Sdom() + loopExit := val.Block + for iarg, arg := range val.Args { + // Only arg lives in the loop header is of interest + if arg.Block != loop.header { + continue + } + // See if corresponding predecessor was not dominated by loop + // header, if so, use corresponding argument to avoid losing one + exitPred := loopExit.Preds[iarg].b + if !sdom.isAncestor(loop.header, exitPred) { + newArg := loop.findGuardArg(fn, arg) + val.SetArg(iarg, newArg) + if fn.pass.debug > 1 { + fmt.Printf("== Update guard arg %v\n", val.LongString()) + } + continue + } + + // If the predecessor of loop exit was dominated by loop latch, + // use corresponding argument to avoid losing one update + if sdom.IsAncestorEq(loop.latch, exitPred) { + newArg := loop.findBackedgeArg(fn, arg, exitPred, loopExit) + val.SetArg(iarg, newArg) + if fn.pass.debug > 1 { + fmt.Printf("== Update backedge arg %v\n", val.LongString()) + } + continue + } + } + } + } +} + +// If the loop conditional test is "trivial", we will move the chain of this +// conditional test values to the loop latch, after that, they may not dominate +// the in-loop uses anymore: +// +// loop header +// v1 = phi(0, ...) +// v2 = v1 + 1 +// If v2 < 3 ... +// +// loop body: +// v4 = v2 - 1 +// +// So we need to create a new phi v5 at the loop header to merge the control flow +// from the loop guard to the loop header and the loop latch to the loop header +// and use this phi to replace the in-loop use v4. e.g. +// +// loop header: +// v1 = phi(0, ...) +// v5 = phi(v2', v2) ;;; v2' lives in loop guard +// +// loop body: +// v4 = v5 - 1 +// +// loop latch: +// v2 = v1 + 1 +// If v2 < 3 ... 
+func (loop *loop) updateMovedUses(fn *Func, cloned map[*Value]*Value) { + // Find all moved values and sort them in order to ensure determinism + moved := make([]*Value, 0) + for key, _ := range cloned { + moved = append(moved, key) + } + sort.SliceStable(moved, func(i, j int) bool { + return moved[i].ID < moved[j].ID + }) + + // One def may have multiple uses, all of these uses should be replaced by + // the same def replacement + replacement := make(map[*Value]*Value) + // For each of moved value, find its uses inside loop + defUses := buildDefUses(fn, moved) + for _, def := range moved { + uses := defUses[def] + if def.Uses == 1 { + assert(uses[0].useBlock() == loop.latch, "used by another moved val") + continue + } + // For each use of def, if it is not one of the moved values or loop phi + // in loop header, replace it with inserted Phi + for _, use := range uses { + // Used by other moved value or by loop phi in header? Skip them as + // they are not needed to update + if use.val != nil { + if _, exist := cloned[use.val]; exist { + continue + } + if use.val.Op == OpPhi && use.val.Block == loop.header { + continue + } + } else { + if _, exist := cloned[use.block.ControlValues()[0]]; exist { + continue + } + } + // Since LCSSA ensures that all uses of loop defined values are in + // loop we can safely do replacement then + // TODO: Add verification here to check if it does lives inside loop + + // Create phi at loop header, merge control flow from loop guard and + // loop latch, and replace use with such phi. If phi already exists, + // use it instead of creating a new one. + var newUse *Value + if phi, exist := replacement[def]; exist { + newUse = phi + } else { + phi := fn.newValueNoBlock(OpPhi, def.Type, def.Pos) + // Merge control flow from loop guard and loop latch + arg1 := cloned[def] + arg2 := def + if arg1.Block != loop.guard { + fn.Fatalf("arg1 must be live in loop guard") + } + if arg2.Block != loop.latch { + fn.Fatalf("arg2 must be live in loop latch") + } + phi.AddArg2(arg1, arg2) + loop.header.placeValue(phi) + replacement[def] = phi + newUse = phi + } + if fn.pass.debug > 1 { + fmt.Printf("== Update moved use %v %v\n", use, newUse.LongString()) + } + use.replaceUse(newUse) + } + } +} + +// verifyRotatedForm verifies if given loop is rotated form +func (loop *loop) verifyRotatedForm(fn *Func) { + if len(loop.header.Succs) != 1 || len(loop.exit.Preds) < 2 || + len(loop.latch.Succs) != 2 || len(loop.guard.Succs) != 2 { + fn.Fatalf("Bad loop %v after rotation", loop.LongString()) + } +} + +// IsRotatedForm returns true if loop is rotated +func (loop *loop) IsRotatedForm() bool { + if loop.guard == nil { + return false + } + return true +} + +// CreateLoopLand creates a land block between loop guard and loop header, it +// executes only if entering loop. +func (loop *loop) CreateLoopLand(fn *Func) bool { + if !loop.IsRotatedForm() { + return false + } + if loop.land != nil { + return true + } + + // loopGuard -> loopLand + // loopLand -> loopHeader + loop.land = insertBetween(fn, loop.guard, loop.header) + + return true +} + +// RotateLoop rotates the original loop to become a do-while style loop, returns +// true if loop is rotated, false otherwise. 
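+//
+// As a source-level sketch (purely illustrative, mirroring the comments in
+// looprotate_test.go), rotation turns
+//
+//	for i := 0; i < n; i++ { body }
+//
+// into
+//
+//	if 0 < n {
+//		i := 0
+//		do {
+//			body
+//			i++
+//		} while i < n
+//	}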
+func (fn *Func) RotateLoop(loop *loop) bool { + if loop.IsRotatedForm() { + return true + } + + // Check loop form and bail out if failure + sdom := fn.Sdom() + if msg := loop.checkLoopForm(fn, sdom); msg != "" { + if fn.pass.debug > 0 { + fmt.Printf("Exotic %v for rotation: %s %v\n", loop.LongString(), msg, fn.Name) + } + return false + } + + exitIdx := 1 // which successor of loop header wires to loop exit + if loop.header.Succs[0].b == loop.exit { + exitIdx = 0 + } + + assert(len(loop.header.ControlValues()) == 1, "more than 1 ctrl value") + cond := loop.header.Controls[0] + + // Rewire loop header to loop body unconditionally + loop.rewireLoopHeader() + + // Rewire loop latch to header and exit based on new conditional test + loop.rewireLoopLatch(cond, exitIdx) + + // Create loop guard block + // TODO(yyang): Creation of loop guard can be skipped if original IR already + // exists such form. e.g. if 0 < len(b) { for i := 0; i < len(b); i++ {...} } + loopGuard := fn.NewBlock(BlockIf) + loop.guard = loopGuard + + // Rewire entry to loop guard instead of original loop header + loop.rewireLoopEntry(loopGuard) + + // Clone old conditional test and its arguments to control loop guard + guardCond, cloned := loop.cloneCond(cond) + + // Rewire loop guard to original loop header and loop exit + loop.rewireLoopGuard(guardCond, exitIdx) + + // CFG changes are all done here, then update data dependencies accordingly + + // Move conditional test from loop header to loop latch + loop.moveCond(cond, cloned) + + // Update uses of moved Values because these defs no longer dominates uses + // after they were moved to loop latch + loop.updateMovedUses(fn, cloned) + + // Add corresponding argument for phis at loop exits since new edge from + // loop guard to loop exit had been created + loop.addG2EArg(fn, sdom) + + // Update proxy phi to use the loop phi's incoming argument which comes from + // loop latch since loop latch may terminate the loop now + loop.updateLoopUse(fn) + + // Gosh, loop is rotated + loop.verifyRotatedForm(fn) + + if fn.pass.debug > 0 { + fmt.Printf("%v rotated in %v\n", loop.LongString(), fn.Name) + } + fn.invalidateCFG() + return true +} + +func moveBlock(slice []*Block, from, to int) []*Block { + if from < 0 || to < 0 || from >= len(slice) || to >= len(slice) { + return slice + } + + elem := slice[from] + if from < to { + copy(slice[from:], slice[from+1:to+1]) + } else { + copy(slice[to+1:], slice[to:from]) + } + + slice[to] = elem + return slice +} + +// layoutLoop converts loops with a check-loop-condition-at-beginning +// to loops with a check-loop-condition-at-end by reordering blocks. no +// CFG changes here. This helps loops avoid extra unnecessary jumps. // // loop: // CMPQ ... @@ -21,7 +1120,7 @@ package ssa // entry: // CMPQ ... // JLT loop -func loopRotate(f *Func) { +func layoutLoop(f *Func) { loopnest := f.loopnest() if loopnest.hasIrreducible { return @@ -30,84 +1129,36 @@ func loopRotate(f *Func) { return } - idToIdx := f.Cache.allocIntSlice(f.NumBlocks()) - defer f.Cache.freeIntSlice(idToIdx) - for i, b := range f.Blocks { - idToIdx[b.ID] = i - } - - // Set of blocks we're moving, by ID. - move := map[ID]struct{}{} - - // Map from block ID to the moving blocks that should - // come right after it. - after := map[ID][]*Block{} - - // Check each loop header and decide if we want to move it. 
for _, loop := range loopnest.loops { - b := loop.header - var p *Block // b's in-loop predecessor - for _, e := range b.Preds { + header := loop.header + // If loop rotation is already applied, loop latch should be right after + // all loop body blocks + if header.Kind == BlockPlain && len(header.Succs) == 1 { + continue + } + // Otherwise, place loop header right after all body blocks + var latch *Block // b's in-loop predecessor + for _, e := range header.Preds { if e.b.Kind != BlockPlain { continue } if loopnest.b2l[e.b.ID] != loop { continue } - p = e.b + latch = e.b } - if p == nil || p == b { + if latch == nil || latch == header { continue } - after[p.ID] = []*Block{b} - for { - nextIdx := idToIdx[b.ID] + 1 - if nextIdx >= len(f.Blocks) { // reached end of function (maybe impossible?) - break - } - nextb := f.Blocks[nextIdx] - if nextb == p { // original loop predecessor is next - break - } - if loopnest.b2l[nextb.ID] == loop { - after[p.ID] = append(after[p.ID], nextb) + iheader, ilatch := 0, 0 + for ib, b := range f.Blocks { + if b == header { + iheader = ib + } else if b == latch { + ilatch = ib } - b = nextb - } - // Swap b and p so that we'll handle p before b when moving blocks. - f.Blocks[idToIdx[loop.header.ID]] = p - f.Blocks[idToIdx[p.ID]] = loop.header - idToIdx[loop.header.ID], idToIdx[p.ID] = idToIdx[p.ID], idToIdx[loop.header.ID] - - // Place b after p. - for _, b := range after[p.ID] { - move[b.ID] = struct{}{} - } - } - - // Move blocks to their destinations in a single pass. - // We rely here on the fact that loop headers must come - // before the rest of the loop. And that relies on the - // fact that we only identify reducible loops. - j := 0 - // Some blocks that are not part of a loop may be placed - // between loop blocks. In order to avoid these blocks from - // being overwritten, use a temporary slice. - oldOrder := f.Cache.allocBlockSlice(len(f.Blocks)) - defer f.Cache.freeBlockSlice(oldOrder) - copy(oldOrder, f.Blocks) - for _, b := range oldOrder { - if _, ok := move[b.ID]; ok { - continue - } - f.Blocks[j] = b - j++ - for _, a := range after[b.ID] { - f.Blocks[j] = a - j++ } - } - if j != len(oldOrder) { - f.Fatalf("bad reordering in looprotate") + // Reordering the loop blocks from [header,body,latch] to [latch,body,header] + f.Blocks = moveBlock(f.Blocks, iheader, ilatch) } } diff --git a/src/cmd/compile/internal/ssa/looprotate_test.go b/src/cmd/compile/internal/ssa/looprotate_test.go new file mode 100644 index 00000000000000..8004b76740aa4d --- /dev/null +++ b/src/cmd/compile/internal/ssa/looprotate_test.go @@ -0,0 +1,689 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
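+
+// This file exercises RotateLoop on hand-constructed SSA: each test below builds
+// a small CFG with c.Fun, optionally puts it into loop closed SSA form first,
+// rotates every loop, and then checks the resulting CFG shape and value counts.
+// The comment on each test sketches the source-level loop being modeled.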
+ +package ssa + +import ( + "cmd/compile/internal/types" + "testing" +) + +func doLoopRotation(fun fun) bool { + CheckFunc(fun.f) + f := fun.f + loopnest := f.loopnest() + loopnest.assembleChildren() + loopnest.findExits() + for _, loop := range loopnest.loops { + if !f.RotateLoop(loop) { + return false + } + CheckFunc(fun.f) + } + return true +} + +func doLoopRotationWithLCSSSA(fun fun) bool { + CheckFunc(fun.f) + f := fun.f + loopnest := f.loopnest() + loopnest.assembleChildren() + loopnest.findExits() + for _, loop := range loopnest.loops { + if !f.BuildLoopClosedForm(loopnest, loop) { + panic("Failed to build loop closed form") + } + } + + for _, loop := range loopnest.loops { + if !f.RotateLoop(loop) { + return false + } + CheckFunc(fun.f) + } + return true +} + +func verifyRotatedCFG(fun fun, t *testing.T) { + // CFG is correctly wired? + cfg := map[string][]string{ + "loopHeader": {"loopLatch", "loopBody"}, + "loopLatch": {"loopHeader", "loopExit"}, + "loopBody": {"loopLatch"}, + } + for k, succs := range cfg { + for _, b := range fun.f.Blocks { + if fun.blocks[k] == b { + for _, succ := range succs { + succb := fun.blocks[succ] + found := false + for _, s := range b.Succs { + if s.b == succb { + found = true + break + } + } + if !found { + t.Fatalf("Illegal CFG") + } + } + } + break + } + } +} + +func verifyNumValue(fun fun, t *testing.T, expectedOp Op, expectedNum int) { + // Data flow is correctly set up? + num := 0 + for _, b := range fun.f.Blocks { + for _, val := range b.Values { + if val.Op == expectedOp { + num++ + } + } + } + if num != expectedNum { + t.Fatalf("unexpected num of operation %v", expectedOp) + } +} + +// The original loop looks like in below form +// +// for i := 0; i < 10; i++ { +// } +// +// After loop rotation, it should be like below +// +// if 0 < 10 { +// i := 0 +// do { +// i++ +// } while i < 10 +// } +// +// Loop defs are not used outside the loop, so simply performing loop rotation +// w/o LCSSA is okay. +func TestSimpleLoop(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("ten", OpConst64, c.config.Types.Int64, 10, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "ten"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Goto("loopHeader")), + Bloc("loopExit", + Exit("mem"))) + + if !doLoopRotation(fun) { + t.Fatal("Loop rotation failed") + } + + verifyRotatedCFG(fun, t) + + // one lives in loop latch and one lives in loop guard + verifyNumValue(fun, t, OpLess64, 2) +} + +// Loop header contains Values that may takes side effects and it was used by +// condiitonal test. +// +// for i := 0; i < *load; i++ { +// } +// +// After loop rotation, it should be like below +// +// if 0 < *load { +// i := 0 +// do { +// i+=*load +// } while *load < 10 +// } +// +// Loop defs are not used outside the loop, so simply performing loop rotation +// w/o LCSSA is okay. 
+func TestComplexLoop(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("sb", OpSB, c.config.Types.Uintptr, 0, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("addr", OpAddr, c.config.Types.Int64.PtrTo(), 0, nil, "sb"), + Valu("store", OpStore, types.TypeMem, 0, nil, "addr", "one", "mem"), + Valu("load", OpLoad, c.config.Types.Int64, 0, nil, "addr", "store"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "load"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "load", "i"), + Goto("loopHeader")), + Bloc("loopExit", + Exit("mem"))) + + if !doLoopRotation(fun) { + t.Fatal("Loop rotation failed") + } + + verifyRotatedCFG(fun, t) + + // one lives in loop latch and one lives in loop guard + verifyNumValue(fun, t, OpLess64, 2) + verifyNumValue(fun, t, OpLoad, 2) + verifyNumValue(fun, t, OpAddr, 2) + verifyNumValue(fun, t, OpStore, 2) +} + +// Similiar to TestSimpleLoop, but control value is not live in loop header +// +// i := 0 +// cmp := i < 10 +// for ; cmp; i++ { +// } +// +// After loop rotation, it should be like below +// +// i := 0 +// cmp := i < 10 +// if cmp { +// i := 0 +// do { +// i++ +// } while cmp +// } +// +// Loop defs are not used outside the loop, so simply performing loop rotation +// w/o LCSSA is okay. +func TestSimpleLoopCtrlElsewhere(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("ten", OpConst64, c.config.Types.Int64, 10, nil), + Valu("i", OpConst64, c.config.Types.Int64, 0, nil), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "ten"), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("phi", OpPhi, c.config.Types.Int64, 0, nil, "i", "inc"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "phi"), + Goto("loopHeader")), + Bloc("loopExit", + Exit("mem"))) + + if !doLoopRotation(fun) { + t.Fatal("Loop rotation failed") + } + + verifyRotatedCFG(fun, t) + + // no copy, no clone + verifyNumValue(fun, t, OpLess64, 1) + verifyNumValue(fun, t, OpPhi, 1) +} + +// Even more harder, Values in loop header have cyclic dependencies, i.e. +// +// loop header: +// +// v1 = phi(.., v3) +// v3 = add(v1, 1) +// If v3 < 10, ... 
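+//
+// After rotation the cycle is expected to be broken by a Copy placed in the
+// loop latch (the assertions below check for exactly one OpCopy there), matching
+// the scheme described in the update() comments in looprotate.go.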
+func TestCondCyclicLoop(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("ten", OpConst64, c.config.Types.Int64, 10, nil), + Goto("loopHeader")), + Bloc("loopHeader", + // cyclic dependency in loop header + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "inc", "ten"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Goto("loopHeader")), + Bloc("loopExit", + Exit("mem"))) + + if !doLoopRotation(fun) { + t.Fatal("Loop rotation failed") + } + + verifyRotatedCFG(fun, t) + + // one lives in loop latch and one lives in loop guard + verifyNumValue(fun, t, OpLess64, 2) + verifyNumValue(fun, t, OpCopy, 1) + verifyNumValue(fun, t, OpPhi, 2) + + for _, b := range fun.f.Blocks { + for _, val := range b.Values { + switch val.Op { + case OpCopy: + if val.Block != fun.blocks["loopLatch"] { + t.Fatalf("copy must be in loop latch") + } + } + } + } +} + +// Cyclic dependencies may appear during updating +// +// loop header: +// v1 = phi(.., v4) +// v3 = add(v1, 1) +// If v3 < 10, ... +// +// loop latch: +// v4 = add(v3, 1) +func TestNewCyclicLoop(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("ten", OpConst64, c.config.Types.Int64, 10, nil), + Goto("loopHeader")), + Bloc("loopHeader", + // cyclic dependency in loop header + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc2"), + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "inc", "ten"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Valu("inc2", OpAdd64, c.config.Types.Int64, 0, nil, "one", "inc"), + Goto("loopHeader")), + Bloc("loopExit", + Exit("mem"))) + + if !doLoopRotation(fun) { + t.Fatal("Loop rotation failed") + } + + verifyRotatedCFG(fun, t) + + // one lives in loop latch and one lives in loop guard + verifyNumValue(fun, t, OpLess64, 2) + // no copy because inc2 explicitly uses inc + verifyNumValue(fun, t, OpPhi, 2) +} + +// Use loop phi outside the loop, this requires LCSSA, which creates proxy phi +// and use such phi outside the loop. +// +// if 0 < 10 { +// i := 0 +// do { +// i++ +// } while i < 10 +// use := i * 10 +// } +func TestOutsideLoopUses(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("ten", OpConst64, c.config.Types.Int64, 10, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "ten"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Goto("loopHeader")), + Bloc("loopExit", + Valu("use", OpMul64, c.config.Types.Int64, 0, nil, "i", "ten"), + Goto("exit")), + Bloc("exit", + Exit("mem"))) + + // doLoopRotation fails because loop phi is used outside the loop. 
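+	// Build loop closed SSA form first so the out-of-loop use is routed through
+	// a proxy phi, then rotate.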
+ if !doLoopRotationWithLCSSSA(fun) { + t.Fatal("Loop rotation failed") + } + + verifyRotatedCFG(fun, t) + + // one lives in loop latch and one lives in loop guard + verifyNumValue(fun, t, OpLess64, 2) + + loopExit := fun.blocks["loopExit"] + for _, val := range loopExit.Values { + if val.Op == OpPhi { + if len(val.Args) != len(loopExit.Preds) { + t.Fatalf("num of phi arguments mismatched with num of predecessors") + } + if 1 != val.Uses { + t.Fatalf("proxy phi must be used by p") + } + for _, arg := range val.Args { + switch arg.Op { + case OpConst64, OpAdd64: + default: + t.Fatalf("proxy phi must have only constants and add operands") + } + } + } + } +} + +// Ditto, but the loop phi has cyclic dependencies. +func TestPhiCondCyclicLoop(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("true", OpConstBool, c.config.Types.Bool, 1, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("phi", OpPhi, c.config.Types.Bool, 0, nil, "true", "false"), + Valu("false", OpConstBool, c.config.Types.Bool, 0, nil), + If("phi", "loopLatch", "loopExit")), + Bloc("loopLatch", + Goto("loopHeader")), + Bloc("loopExit", + Exit("mem"))) + + if !doLoopRotation(fun) { + t.Fatal("Loop rotation failed") + } + + verifyRotatedCFG(fun, t) + // phi will not copy to loop guard, so only one phi exists + verifyNumValue(fun, t, OpPhi, 1) +} + +// Loop has multiple exits +// +// for i := 0; i < 10; i++ { +// if i == 1 { +// return +// } +// } +// +// After loop rotation, it should be like below +// +// if 0 < 10 { +// i := 0 +// do { +// if i == 1 { +// return +// } +// i++ +// } while i < 10 +// } +// +// Loop defs are not used outside the loop, so simply performing loop rotation +// w/o LCSSA is okay. +func TestMultiExitLoop(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("ten", OpConst64, c.config.Types.Int64, 10, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "ten"), + If("cmp", "loopBody", "loopExit")), + Bloc("loopBody", + Valu("cmp2", OpEq64, c.config.Types.Bool, 0, nil, "i", "one"), + If("cmp2", "loopExit1", "loopLatch")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Goto("loopHeader")), + Bloc("loopExit1", + Exit("mem")), + Bloc("loopExit", + Exit("mem"))) + + if !doLoopRotation(fun) { + t.Fatal("Loop rotation failed") + } + + verifyRotatedCFG(fun, t) + + // one lives in loop latch and one lives in loop guard + verifyNumValue(fun, t, OpLess64, 2) +} + +// Loop contains multiple exits, and every loop exit block contians at least one +// use that uses loop phi. 
+// +// if 0 < 10 { +// i := 0 +// do { +// if i == 1 { +// use1 = i * 10 +// return +// } +// i++ +// } while i < 10 +// } +// use2 = i * 10 +func TestMultiExitLoopUses(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("ten", OpConst64, c.config.Types.Int64, 10, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "ten"), + If("cmp", "loopBody", "loopExit")), + Bloc("loopBody", + Valu("cmp2", OpEq64, c.config.Types.Bool, 0, nil, "i", "one"), + If("cmp2", "loopExit1", "loopLatch")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Goto("loopHeader")), + Bloc("loopExit1", + Valu("use1", OpMul64, c.config.Types.Int64, 0, nil, "i", "ten"), + Exit("mem")), + Bloc("loopExit", + Valu("use2", OpMul64, c.config.Types.Int64, 0, nil, "i", "ten"), + Exit("mem"))) + + if !doLoopRotationWithLCSSSA(fun) { + t.Fatal("Loop rotation failed") + } + + verifyRotatedCFG(fun, t) + + // one lives in loop latch and one lives in loop guard + verifyNumValue(fun, t, OpLess64, 2) + verifyNumValue(fun, t, OpPhi, 1 /*var i*/ +2 /*proxy phi*/) +} + +// Even harder, Values defined in loop header are used everywhere. +func TestMultiExitLoopUsesEverywhere(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("ten", OpConst64, c.config.Types.Int64, 10, nil), + Valu("sb", OpSB, c.config.Types.Uintptr, 0, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("addr", OpAddr, c.config.Types.Int64.PtrTo(), 0, nil, "sb"), + Valu("load", OpLoad, c.config.Types.Int64, 0, nil, "addr", "mem"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "load"), + If("cmp", "loopBody", "loopExit")), + Bloc("loopBody", + Valu("use3", OpMul64, c.config.Types.Int64, 0, nil, "i", "load"), + Valu("cmp2", OpEq64, c.config.Types.Bool, 0, nil, "i", "one"), + If("cmp2", "loopExit1", "loopLatch")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Goto("loopHeader")), + Bloc("loopExit1", + Valu("use1", OpMul64, c.config.Types.Int64, 0, nil, "i", "load"), + Exit("mem")), + Bloc("loopExit", + Valu("use2", OpMul64, c.config.Types.Int64, 0, nil, "i", "load"), + Exit("mem"))) + + if !doLoopRotationWithLCSSSA(fun) { + t.Fatal("Loop rotation failed") + } + + verifyRotatedCFG(fun, t) + + // one lives in loop latch and one lives in loop guard + verifyNumValue(fun, t, OpLess64, 2) + verifyNumValue(fun, t, OpLoad, 2) + numOfPhi := 2 /*two proxy phi in exit1*/ + 2 /*two proxy phi in exit*/ + + 2 /*i and inserted phi for load*/ + verifyNumValue(fun, t, OpPhi, numOfPhi) +} + +// Rotation the Loop inclduing nesting children +func TestNestLoopRotation(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("ten", OpConst64, c.config.Types.Int64, 10, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", 
OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "ten"), + If("cmp", "loopHeader2", "loopExit")), + Bloc("loopHeader2", + Valu("k", OpPhi, c.config.Types.Int64, 0, nil, "i", "inc2"), + Valu("cmp2", OpEq64, c.config.Types.Bool, 0, nil, "k", "one"), + If("cmp2", "loopLatch2", "loopLatch")), + Bloc("loopLatch2", + Valu("inc2", OpAdd64, c.config.Types.Int64, 0, nil, "one", "k"), + Goto("loopHeader2")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Goto("loopHeader")), + Bloc("loopExit", + Exit("mem"))) + + if !doLoopRotationWithLCSSSA(fun) { + t.Fatal("Loop rotation failed") + } +} + +// Store is defined in loop header and used outside the loop indirectly. +func TestBadLoop(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("sb", OpSB, c.config.Types.Uintptr, 0, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("addr", OpAddr, c.config.Types.Int64.PtrTo(), 0, nil, "sb"), + Valu("store", OpStore, types.TypeMem, 0, nil, "addr", "one", "mem"), + Valu("load", OpLoad, c.config.Types.Int64, 0, nil, "addr", "store"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "load"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "load", "i"), + Goto("loopHeader")), + Bloc("loopExit", + Valu("use", OpMul64, c.config.Types.Int64, 0, nil, "load", "one"), + Exit("mem"))) + + if doLoopRotationWithLCSSSA(fun) != false { + t.Fatal("Loop rotation is expected to fail") + } +} + +// Loop def is non trivial because it excesses max depth +func TestBadLoop2(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry", + Bloc("loopEntry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("sb", OpSB, c.config.Types.Uintptr, 0, nil), + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "inc"), + Valu("addr", OpAddr, c.config.Types.Int64.PtrTo(), 0, nil, "sb"), + Valu("load", OpLoad, c.config.Types.Int64, 0, nil, "addr", "mem"), + Valu("depth5", OpAdd64, c.config.Types.Int64, 0, nil, "one", "load"), + Valu("depth4", OpAdd64, c.config.Types.Int64, 0, nil, "one", "depth5"), + Valu("depth3", OpAdd64, c.config.Types.Int64, 0, nil, "one", "depth4"), + Valu("depth2", OpAdd64, c.config.Types.Int64, 0, nil, "one", "depth3"), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "i", "depth2"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "load", "i"), + Goto("loopHeader")), + Bloc("loopExit", + Valu("use", OpMul64, c.config.Types.Int64, 0, nil, "load", "one"), + Exit("mem"))) + + if doLoopRotationWithLCSSSA(fun) != false { + t.Fatal("Loop rotation is expected to fail") + } +} + +// Loop header has multiple entries +func TestBadLoop3(t *testing.T) { + c := testConfig(t) + fun := c.Fun("loopEntry1", + Bloc("loopEntry1", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("zero", OpConst64, c.config.Types.Int64, 0, nil), + Valu("one", OpConst64, c.config.Types.Int64, 1, nil), + Valu("cmp", OpLess64, c.config.Types.Bool, 0, nil, "zero", "one"), + If("cmp", 
"loopHeader", "loopEntry2")), + Bloc("loopEntry2", + Goto("loopHeader")), + Bloc("loopHeader", + Valu("i", OpPhi, c.config.Types.Int64, 0, nil, "zero", "one", "inc"), + If("cmp", "loopLatch", "loopExit")), + Bloc("loopLatch", + Valu("inc", OpAdd64, c.config.Types.Int64, 0, nil, "one", "i"), + Goto("loopHeader")), + Bloc("loopExit", + Exit("mem"))) + + if doLoopRotationWithLCSSSA(fun) != false { + t.Fatal("Loop rotation is expected to fail") + } +} diff --git a/src/cmd/compile/internal/ssa/phielim.go b/src/cmd/compile/internal/ssa/phielim.go index 4fc942375fdef3..cadea4eeabfeba 100644 --- a/src/cmd/compile/internal/ssa/phielim.go +++ b/src/cmd/compile/internal/ssa/phielim.go @@ -35,6 +35,8 @@ func phielim(f *Func) { break } } + + copyelim(f) } // phielimValue tries to convert the phi v to a copy. diff --git a/src/cmd/compile/internal/ssa/regalloc.go b/src/cmd/compile/internal/ssa/regalloc.go index 2325b9ee458412..676093756cb2bf 100644 --- a/src/cmd/compile/internal/ssa/regalloc.go +++ b/src/cmd/compile/internal/ssa/regalloc.go @@ -1242,12 +1242,17 @@ func (s *regAllocState) regalloc(f *Func) { // we get the right behavior for a block which branches to itself. for _, e := range b.Succs { succ := e.b + pidx := e.i + if succ.Kind == BlockPlain && len(succ.Values) == 0 { + ne := succ.Succs[0] + succ = ne.b + pidx = ne.i + } // TODO: prioritize likely successor? for _, x := range s.startRegs[succ.ID] { desired.add(x.v.ID, x.r) } // Process phi ops in succ. - pidx := e.i for _, v := range succ.Values { if v.Op != OpPhi { break diff --git a/src/cmd/compile/internal/ssa/schedule.go b/src/cmd/compile/internal/ssa/schedule.go index fb38f40d63ab11..a0f9defee5325f 100644 --- a/src/cmd/compile/internal/ssa/schedule.go +++ b/src/cmd/compile/internal/ssa/schedule.go @@ -11,6 +11,7 @@ import ( "sort" ) +// Larger numbers are scheduled closer to the end of the block. const ( ScorePhi = iota // towards top of block ScoreArg // must occur at the top of the entry block @@ -204,6 +205,13 @@ func schedule(f *Func) { continue } score[c.ID] = ScoreControl + // schedule arguments of control values closer if they are defined + // in the same block and not compute score yet. + for _, arg := range c.Args { + if arg.Block == b && score[arg.ID] == ScoreDefault { + score[arg.ID] = ScoreControl - 1 + } + } } } priq.score = score @@ -390,6 +398,9 @@ func storeOrder(values []*Value, sset *sparseSet, storeNumber []int32) []*Value hasNilCheck := false sset.clear() // sset is the set of stores that are used in other values for _, v := range values { + if v.Op == OpInvalid { + continue + } if v.Type.IsMemory() { stores = append(stores, v) if v.Op == OpInitMem || v.Op == OpPhi { diff --git a/src/cmd/compile/internal/ssa/tighten.go b/src/cmd/compile/internal/ssa/tighten.go index 85b6a84cc3f426..6a5a2f32c46761 100644 --- a/src/cmd/compile/internal/ssa/tighten.go +++ b/src/cmd/compile/internal/ssa/tighten.go @@ -23,11 +23,7 @@ func tighten(f *Func) { defer f.Cache.freeBoolSlice(canMove) // Compute the memory states of each block. - startMem := f.Cache.allocValueSlice(f.NumBlocks()) - defer f.Cache.freeValueSlice(startMem) - endMem := f.Cache.allocValueSlice(f.NumBlocks()) - defer f.Cache.freeValueSlice(endMem) - memState(f, startMem, endMem) + startMem, _ := memState(f) for _, b := range f.Blocks { for _, v := range b.Values { @@ -214,7 +210,9 @@ func phiTighten(f *Func) { // 3. The algorithm first obtains the memory state of some blocks in the tree // in the first step. 
Then floods the known memory state to other nodes in // the second step. -func memState(f *Func, startMem, endMem []*Value) { +func memState(f *Func) ([]*Value, []*Value) { + startMem := make([]*Value, f.NumBlocks()) + endMem := make([]*Value, f.NumBlocks()) // This slice contains the set of blocks that have had their startMem set but this // startMem value has not yet been propagated to the endMem of its predecessors changed := make([]*Block, 0) @@ -266,4 +264,5 @@ func memState(f *Func, startMem, endMem []*Value) { } } } + return startMem, endMem } diff --git a/src/cmd/compile/internal/ssa/value.go b/src/cmd/compile/internal/ssa/value.go index 4eaab40354c171..9ab7e554eb18fe 100644 --- a/src/cmd/compile/internal/ssa/value.go +++ b/src/cmd/compile/internal/ssa/value.go @@ -105,6 +105,13 @@ func (v *Value) AuxInt32() int32 { return int32(v.AuxInt) } +func (v *Value) AuxInt64() int64 { + if opcodeTable[v.Op].auxType != auxInt64 { + v.Fatalf("op %s doesn't have an int64 aux field", v.Op) + } + return int64(v.AuxInt) +} + // AuxUnsigned returns v.AuxInt as an unsigned value for OpConst*. // v.AuxInt is always sign-extended to 64 bits, even if the // represented value is unsigned. This undoes that sign extension. diff --git a/src/internal/goexperiment/exp_loopopts_off.go b/src/internal/goexperiment/exp_loopopts_off.go new file mode 100644 index 00000000000000..fd9018132e752c --- /dev/null +++ b/src/internal/goexperiment/exp_loopopts_off.go @@ -0,0 +1,8 @@ +// Code generated by mkconsts.go. DO NOT EDIT. + +//go:build !goexperiment.loopopts + +package goexperiment + +const LoopOpts = false +const LoopOptsInt = 0 diff --git a/src/internal/goexperiment/exp_loopopts_on.go b/src/internal/goexperiment/exp_loopopts_on.go new file mode 100644 index 00000000000000..d372fd33ba564c --- /dev/null +++ b/src/internal/goexperiment/exp_loopopts_on.go @@ -0,0 +1,8 @@ +// Code generated by mkconsts.go. DO NOT EDIT. + +//go:build goexperiment.loopopts + +package goexperiment + +const LoopOpts = true +const LoopOptsInt = 1 diff --git a/src/internal/goexperiment/exp_range_off.go b/src/internal/goexperiment/exp_range_off.go new file mode 100644 index 00000000000000..82f5dc71b28320 --- /dev/null +++ b/src/internal/goexperiment/exp_range_off.go @@ -0,0 +1,8 @@ +// Code generated by mkconsts.go. DO NOT EDIT. + +//go:build !goexperiment.range + +package goexperiment + +const Range = false +const RangeInt = 0 diff --git a/src/internal/goexperiment/exp_range_on.go b/src/internal/goexperiment/exp_range_on.go new file mode 100644 index 00000000000000..1d0f30f49f83ed --- /dev/null +++ b/src/internal/goexperiment/exp_range_on.go @@ -0,0 +1,8 @@ +// Code generated by mkconsts.go. DO NOT EDIT. + +//go:build goexperiment.range + +package goexperiment + +const Range = true +const RangeInt = 1 diff --git a/src/internal/goexperiment/flags.go b/src/internal/goexperiment/flags.go index dacc4c3b135732..cc44f8b6a04707 100644 --- a/src/internal/goexperiment/flags.go +++ b/src/internal/goexperiment/flags.go @@ -127,4 +127,8 @@ type Flags struct { // ExecTracer2 controls whether to use the new execution trace // implementation. ExecTracer2 bool + + // LoopOpts enables aggressive loop optimizations on SSA, which may takes + // more time to compile but produce faster code. 
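+	// It is typically enabled by building with GOEXPERIMENT=loopopts (see the
+	// generated exp_loopopts_on.go / exp_loopopts_off.go files above).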
+ LoopOpts bool } diff --git a/test/nilptr3.go b/test/nilptr3.go index 2cc510beb635df..33963b75620f11 100644 --- a/test/nilptr3.go +++ b/test/nilptr3.go @@ -155,7 +155,7 @@ func f4(x *[10]int) { // and the offset is small enough that if x is nil, the address will still be // in the first unmapped page of memory. - _ = x[9] // ERROR "generated nil check" // bug: would like to remove this check (but nilcheck and load are in different blocks) + _ = x[9] // ERROR "removed nil check" for { if x[9] != 0 { // ERROR "removed nil check" diff --git a/test/opt_branchlikely.go b/test/opt_branchlikely.go index 0aee33f87a578b..f68d746db63fea 100644 --- a/test/opt_branchlikely.go +++ b/test/opt_branchlikely.go @@ -12,7 +12,7 @@ package foo func f(x, y, z int) int { a := 0 - for i := 0; i < x; i++ { // ERROR "Branch prediction rule stay in loop" + for i := 0; i < x; i++ { // ERROR "Branch prediction rule" for j := 0; j < y; j++ { // ERROR "Branch prediction rule stay in loop" a += j } @@ -45,7 +45,7 @@ func g(x, y, z int) int { panic("help help help") } if x != 0 { // ERROR "Branch prediction rule default < ret" - for i := 0; i < x; i++ { // ERROR "Branch prediction rule stay in loop" + for i := 0; i < x; i++ { // ERROR "Branch prediction rule" if x == 4 { // ERROR "Branch prediction rule stay in loop" return a } @@ -62,7 +62,7 @@ func g(x, y, z int) int { func h(x, y, z int) int { a := 0 - for i := 0; i < x; i++ { // ERROR "Branch prediction rule stay in loop" + for i := 0; i < x; i++ { // ERROR "Branch prediction rule" for j := 0; j < y; j++ { // ERROR "Branch prediction rule stay in loop" a += j if i == j { // ERROR "Branch prediction rule stay in loop" diff --git a/test/writebarrier.go b/test/writebarrier.go index 1b30fa509e5503..ddd37895780594 100644 --- a/test/writebarrier.go +++ b/test/writebarrier.go @@ -60,7 +60,7 @@ func f3a(x *string, y *string) { } func f4(x *[2]string, y [2]string) { - *x = y // ERROR "write barrier" + *x = y // no barrier (dead store) z := y // no barrier *x = z // ERROR "write barrier"