[RISCV] Select mask operands as virtual registers and eliminate uses of vmv0 #125026

Conversation
@llvm/pr-subscribers-backend-risc-v @llvm/pr-subscribers-llvm-globalisel

Author: Luke Lau (lukel97)

Changes

This is another attempt at #88496 to keep mask operands in SSA after instruction selection. Previously we selected the mask operands into vmv0, a singleton register class with exactly one register, V0. But the register allocator doesn't really support singleton register classes and we ran into errors like "ran out of registers during register allocation in function". This avoids this by introducing a pass just before register allocation that converts any use of vmv0 to a copy to $v0, i.e. what isel currently does today. That way the register allocator doesn't need to deal with the singleton register class, but we get the benefits of having the mask registers in SSA throughout the backend:

- This allows RISCVVLOptimizer to reduce the VLs of instructions that define mask registers
- It enables CSE and code sinking in more places
- It removes the need to peek through mask copies in RISCVISelDAGToDAG and keep track of V0 defs in RISCVVectorPeephole
As a follow up, we can move the elimination pass to after phi elimination and outside of SSA, which would unblock the pre-RA scheduler around masked pseudos. This might also help the issue that RISCVVectorMaskDAGMutation tries to solve. Patch is 354.26 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/125026.diff 52 Files Affected:
diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt
index 98d3615ebab58dd..9b23a5ab521c8d0 100644
--- a/llvm/lib/Target/RISCV/CMakeLists.txt
+++ b/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -63,6 +63,7 @@ add_llvm_target(RISCVCodeGen
RISCVVectorMaskDAGMutation.cpp
RISCVVectorPeephole.cpp
RISCVVLOptimizer.cpp
+ RISCVVMV0Elimination.cpp
RISCVZacasABIFix.cpp
GISel/RISCVCallLowering.cpp
GISel/RISCVInstructionSelector.cpp
diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h
index b1aee98739e8521..851eea135285246 100644
--- a/llvm/lib/Target/RISCV/RISCV.h
+++ b/llvm/lib/Target/RISCV/RISCV.h
@@ -107,6 +107,9 @@ void initializeRISCVPreLegalizerCombinerPass(PassRegistry &);
FunctionPass *createRISCVVLOptimizerPass();
void initializeRISCVVLOptimizerPass(PassRegistry &);
+
+FunctionPass *createRISCVVMV0EliminationPass();
+void initializeRISCVVMV0EliminationPass(PassRegistry &);
} // namespace llvm
#endif
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 9855028ead9e208..c79539a7e4d20bc 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -241,7 +241,6 @@ void RISCVDAGToDAGISel::addVectorLoadStoreOperands(
bool IsMasked, bool IsStridedOrIndexed, SmallVectorImpl<SDValue> &Operands,
bool IsLoad, MVT *IndexVT) {
SDValue Chain = Node->getOperand(0);
- SDValue Glue;
Operands.push_back(Node->getOperand(CurOp++)); // Base pointer.
@@ -252,11 +251,8 @@ void RISCVDAGToDAGISel::addVectorLoadStoreOperands(
}
if (IsMasked) {
- // Mask needs to be copied to V0.
SDValue Mask = Node->getOperand(CurOp++);
- Chain = CurDAG->getCopyToReg(Chain, DL, RISCV::V0, Mask, SDValue());
- Glue = Chain.getValue(1);
- Operands.push_back(CurDAG->getRegister(RISCV::V0, Mask.getValueType()));
+ Operands.push_back(Mask);
}
SDValue VL;
selectVLOp(Node->getOperand(CurOp++), VL);
@@ -278,8 +274,6 @@ void RISCVDAGToDAGISel::addVectorLoadStoreOperands(
}
Operands.push_back(Chain); // Chain.
- if (Glue)
- Operands.push_back(Glue);
}
void RISCVDAGToDAGISel::selectVLSEG(SDNode *Node, unsigned NF, bool IsMasked,
@@ -1831,19 +1825,13 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
return;
}
- // Mask needs to be copied to V0.
- SDValue Chain = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL,
- RISCV::V0, Mask, SDValue());
- SDValue Glue = Chain.getValue(1);
- SDValue V0 = CurDAG->getRegister(RISCV::V0, VT);
-
if (IsCmpConstant) {
SDValue Imm =
selectImm(CurDAG, SDLoc(Src2), XLenVT, CVal - 1, *Subtarget);
ReplaceNode(Node, CurDAG->getMachineNode(
VMSGTMaskOpcode, DL, VT,
- {MaskedOff, Src1, Imm, V0, VL, SEW, Glue}));
+ {MaskedOff, Src1, Imm, Mask, VL, SEW}));
return;
}
@@ -1854,7 +1842,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
// the agnostic result can be either undisturbed or all 1.
SDValue Cmp = SDValue(
CurDAG->getMachineNode(VMSLTMaskOpcode, DL, VT,
- {MaskedOff, Src1, Src2, V0, VL, SEW, Glue}),
+ {MaskedOff, Src1, Src2, Mask, VL, SEW}),
0);
// vmxor.mm vd, vd, v0 is used to update active value.
ReplaceNode(Node, CurDAG->getMachineNode(VMXOROpcode, DL, VT,
@@ -3274,12 +3262,10 @@ static bool vectorPseudoHasAllNBitUsers(SDNode *User, unsigned UserOpNo,
return false;
assert(RISCVII::hasVLOp(TSFlags));
- bool HasGlueOp = User->getGluedNode() != nullptr;
- unsigned ChainOpIdx = User->getNumOperands() - HasGlueOp - 1;
+ unsigned ChainOpIdx = User->getNumOperands() - 1;
bool HasChainOp = User->getOperand(ChainOpIdx).getValueType() == MVT::Other;
bool HasVecPolicyOp = RISCVII::hasVecPolicyOp(TSFlags);
- unsigned VLIdx =
- User->getNumOperands() - HasVecPolicyOp - HasChainOp - HasGlueOp - 2;
+ unsigned VLIdx = User->getNumOperands() - HasVecPolicyOp - HasChainOp - 2;
const unsigned Log2SEW = User->getConstantOperandVal(VLIdx + 1);
if (UserOpNo == VLIdx)
@@ -3739,43 +3725,7 @@ bool RISCVDAGToDAGISel::doPeepholeSExtW(SDNode *N) {
return false;
}
-// After ISel, a vector pseudo's mask will be copied to V0 via a CopyToReg
-// that's glued to the pseudo. This tries to look up the value that was copied
-// to V0.
-static SDValue getMaskSetter(SDValue MaskOp, SDValue GlueOp) {
- // Check that we're using V0 as a mask register.
- if (!isa<RegisterSDNode>(MaskOp) ||
- cast<RegisterSDNode>(MaskOp)->getReg() != RISCV::V0)
- return SDValue();
-
- // The glued user defines V0.
- const auto *Glued = GlueOp.getNode();
-
- if (!Glued || Glued->getOpcode() != ISD::CopyToReg)
- return SDValue();
-
- // Check that we're defining V0 as a mask register.
- if (!isa<RegisterSDNode>(Glued->getOperand(1)) ||
- cast<RegisterSDNode>(Glued->getOperand(1))->getReg() != RISCV::V0)
- return SDValue();
-
- SDValue MaskSetter = Glued->getOperand(2);
-
- // Sometimes the VMSET is wrapped in a COPY_TO_REGCLASS, e.g. if the mask came
- // from an extract_subvector or insert_subvector.
- if (MaskSetter->isMachineOpcode() &&
- MaskSetter->getMachineOpcode() == RISCV::COPY_TO_REGCLASS)
- MaskSetter = MaskSetter->getOperand(0);
-
- return MaskSetter;
-}
-
-static bool usesAllOnesMask(SDValue MaskOp, SDValue GlueOp) {
- // Check the instruction defining V0; it needs to be a VMSET pseudo.
- SDValue MaskSetter = getMaskSetter(MaskOp, GlueOp);
- if (!MaskSetter)
- return false;
-
+static bool usesAllOnesMask(SDValue MaskOp) {
const auto IsVMSet = [](unsigned Opc) {
return Opc == RISCV::PseudoVMSET_M_B1 || Opc == RISCV::PseudoVMSET_M_B16 ||
Opc == RISCV::PseudoVMSET_M_B2 || Opc == RISCV::PseudoVMSET_M_B32 ||
@@ -3786,14 +3736,7 @@ static bool usesAllOnesMask(SDValue MaskOp, SDValue GlueOp) {
// TODO: Check that the VMSET is the expected bitwidth? The pseudo has
// undefined behaviour if it's the wrong bitwidth, so we could choose to
// assume that it's all-ones? Same applies to its VL.
- return MaskSetter->isMachineOpcode() &&
- IsVMSet(MaskSetter.getMachineOpcode());
-}
-
-// Return true if we can make sure mask of N is all-ones mask.
-static bool usesAllOnesMask(SDNode *N, unsigned MaskOpIdx) {
- return usesAllOnesMask(N->getOperand(MaskOpIdx),
- N->getOperand(N->getNumOperands() - 1));
+ return MaskOp->isMachineOpcode() && IsVMSet(MaskOp.getMachineOpcode());
}
static bool isImplicitDef(SDValue V) {
@@ -3809,9 +3752,7 @@ static bool isImplicitDef(SDValue V) {
}
// Optimize masked RVV pseudo instructions with a known all-ones mask to their
-// corresponding "unmasked" pseudo versions. The mask we're interested in will
-// take the form of a V0 physical register operand, with a glued
-// register-setting instruction.
+// corresponding "unmasked" pseudo versions.
bool RISCVDAGToDAGISel::doPeepholeMaskedRVV(MachineSDNode *N) {
const RISCV::RISCVMaskedPseudoInfo *I =
RISCV::getMaskedPseudoInfo(N->getMachineOpcode());
@@ -3819,7 +3760,7 @@ bool RISCVDAGToDAGISel::doPeepholeMaskedRVV(MachineSDNode *N) {
return false;
unsigned MaskOpIdx = I->MaskOpIdx;
- if (!usesAllOnesMask(N, MaskOpIdx))
+ if (!usesAllOnesMask(N->getOperand(MaskOpIdx)))
return false;
// There are two classes of pseudos in the table - compares and
@@ -3843,18 +3784,13 @@ bool RISCVDAGToDAGISel::doPeepholeMaskedRVV(MachineSDNode *N) {
// Skip the passthru operand at index 0 if the unmasked don't have one.
bool ShouldSkip = !HasPassthru && MaskedHasPassthru;
for (unsigned I = ShouldSkip, E = N->getNumOperands(); I != E; I++) {
- // Skip the mask, and the Glue.
+ // Skip the mask
SDValue Op = N->getOperand(I);
- if (I == MaskOpIdx || Op.getValueType() == MVT::Glue)
+ if (I == MaskOpIdx)
continue;
Ops.push_back(Op);
}
- // Transitively apply any node glued to our new node.
- const auto *Glued = N->getGluedNode();
- if (auto *TGlued = Glued->getGluedNode())
- Ops.push_back(SDValue(TGlued, TGlued->getNumValues() - 1));
-
MachineSDNode *Result =
CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops);
@@ -3890,17 +3826,13 @@ static bool IsVMerge(SDNode *N) {
// The resulting policy is the effective policy the vmerge would have had,
// i.e. whether or not it's passthru operand was implicit-def.
bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
- SDValue Passthru, False, True, VL, Mask, Glue;
+ SDValue Passthru, False, True, VL, Mask;
assert(IsVMerge(N));
Passthru = N->getOperand(0);
False = N->getOperand(1);
True = N->getOperand(2);
Mask = N->getOperand(3);
VL = N->getOperand(4);
- // We always have a glue node for the mask at v0.
- Glue = N->getOperand(N->getNumOperands() - 1);
- assert(cast<RegisterSDNode>(Mask)->getReg() == RISCV::V0);
- assert(Glue.getValueType() == MVT::Glue);
// If the EEW of True is different from vmerge's SEW, then we can't fold.
if (True.getSimpleValueType() != N->getSimpleValueType(0))
@@ -3943,12 +3875,7 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
if (TII->get(TrueOpc).hasUnmodeledSideEffects())
return false;
- // The last operand of a masked instruction may be glued.
- bool HasGlueOp = True->getGluedNode() != nullptr;
-
- // The chain operand may exist either before the glued operands or in the last
- // position.
- unsigned TrueChainOpIdx = True.getNumOperands() - HasGlueOp - 1;
+ unsigned TrueChainOpIdx = True.getNumOperands() - 1;
bool HasChainOp =
True.getOperand(TrueChainOpIdx).getValueType() == MVT::Other;
@@ -3960,7 +3887,6 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
LoopWorklist.push_back(False.getNode());
LoopWorklist.push_back(Mask.getNode());
LoopWorklist.push_back(VL.getNode());
- LoopWorklist.push_back(Glue.getNode());
if (SDNode::hasPredecessorHelper(True.getNode(), Visited, LoopWorklist))
return false;
}
@@ -3968,7 +3894,7 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
// The vector policy operand may be present for masked intrinsics
bool HasVecPolicyOp = RISCVII::hasVecPolicyOp(TrueTSFlags);
unsigned TrueVLIndex =
- True.getNumOperands() - HasVecPolicyOp - HasChainOp - HasGlueOp - 2;
+ True.getNumOperands() - HasVecPolicyOp - HasChainOp - 2;
SDValue TrueVL = True.getOperand(TrueVLIndex);
SDValue SEW = True.getOperand(TrueVLIndex + 1);
@@ -4000,7 +3926,7 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
if (RISCVII::elementsDependOnVL(TrueBaseMCID.TSFlags) && (TrueVL != VL))
return false;
if (RISCVII::elementsDependOnMask(TrueBaseMCID.TSFlags) &&
- (Mask && !usesAllOnesMask(Mask, Glue)))
+ (Mask && !usesAllOnesMask(Mask)))
return false;
// Make sure it doesn't raise any observable fp exceptions, since changing the
@@ -4057,9 +3983,6 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
if (HasChainOp)
Ops.push_back(True.getOperand(TrueChainOpIdx));
- // Add the glue for the CopyToReg of mask->v0.
- Ops.push_back(Glue);
-
MachineSDNode *Result =
CurDAG->getMachineNode(MaskedOpc, DL, True->getVTList(), Ops);
Result->setFlags(True->getFlags());
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 268bfe70673a2ac..46cf27838d1ce23 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -3945,7 +3945,7 @@ class VPatUnaryMask<string intrinsic_name,
Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask")
(result_type result_reg_class:$passthru),
(op2_type op2_reg_class:$rs2),
- (mask_type V0),
+ (mask_type VMV0:$vm),
VLOpFrag, (XLenVT timm:$policy))),
(!cast<Instruction>(
!if(isSEWAware,
@@ -3953,7 +3953,7 @@ class VPatUnaryMask<string intrinsic_name,
inst#"_"#kind#"_"#vlmul.MX#"_MASK"))
(result_type result_reg_class:$passthru),
(op2_type op2_reg_class:$rs2),
- (mask_type V0), GPR:$vl, log2sew, (XLenVT timm:$policy))>;
+ (mask_type VMV0:$vm), GPR:$vl, log2sew, (XLenVT timm:$policy))>;
class VPatUnaryMaskRoundingMode<string intrinsic_name,
string inst,
@@ -3969,7 +3969,7 @@ class VPatUnaryMaskRoundingMode<string intrinsic_name,
Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask")
(result_type result_reg_class:$passthru),
(op2_type op2_reg_class:$rs2),
- (mask_type V0),
+ (mask_type VMV0:$vm),
(XLenVT timm:$round),
VLOpFrag, (XLenVT timm:$policy))),
(!cast<Instruction>(
@@ -3978,7 +3978,7 @@ class VPatUnaryMaskRoundingMode<string intrinsic_name,
inst#"_"#kind#"_"#vlmul.MX#"_MASK"))
(result_type result_reg_class:$passthru),
(op2_type op2_reg_class:$rs2),
- (mask_type V0),
+ (mask_type VMV0:$vm),
(XLenVT timm:$round),
GPR:$vl, log2sew, (XLenVT timm:$policy))>;
@@ -3996,7 +3996,7 @@ class VPatUnaryMaskRTZ<string intrinsic_name,
Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask")
(result_type result_reg_class:$passthru),
(op2_type op2_reg_class:$rs2),
- (mask_type V0),
+ (mask_type VMV0:$vm),
(XLenVT 0b001),
VLOpFrag, (XLenVT timm:$policy))),
(!cast<Instruction>(
@@ -4005,7 +4005,7 @@ class VPatUnaryMaskRTZ<string intrinsic_name,
inst#"_"#kind#"_"#vlmul.MX#"_MASK"))
(result_type result_reg_class:$passthru),
(op2_type op2_reg_class:$rs2),
- (mask_type V0),
+ (mask_type VMV0:$vm),
GPR:$vl, log2sew, (XLenVT timm:$policy))>;
class VPatMaskUnaryNoMask<string intrinsic_name,
@@ -4024,12 +4024,12 @@ class VPatMaskUnaryMask<string intrinsic_name,
Pat<(mti.Mask (!cast<Intrinsic>(intrinsic_name#"_mask")
(mti.Mask VR:$passthru),
(mti.Mask VR:$rs2),
- (mti.Mask V0),
+ (mti.Mask VMV0:$vm),
VLOpFrag)),
(!cast<Instruction>(inst#"_M_"#mti.BX#"_MASK")
(mti.Mask VR:$passthru),
(mti.Mask VR:$rs2),
- (mti.Mask V0), GPR:$vl, mti.Log2SEW, TU_MU)>;
+ (mti.Mask VMV0:$vm), GPR:$vl, mti.Log2SEW, TU_MU)>;
class VPatUnaryAnyMask<string intrinsic,
string inst,
@@ -4144,13 +4144,13 @@ class VPatBinaryMask<string intrinsic_name,
(result_type result_reg_class:$passthru),
(op1_type op1_reg_class:$rs1),
(op2_type op2_kind:$rs2),
- (mask_type V0),
+ (mask_type VMV0:$vm),
VLOpFrag)),
(!cast<Instruction>(inst#"_MASK")
(result_type result_reg_class:$passthru),
(op1_type op1_reg_class:$rs1),
(op2_type op2_kind:$rs2),
- (mask_type V0), GPR:$vl, sew)>;
+ (mask_type VMV0:$vm), GPR:$vl, sew)>;
class VPatBinaryMaskPolicy<string intrinsic_name,
string inst,
@@ -4166,13 +4166,13 @@ class VPatBinaryMaskPolicy<string intrinsic_name,
(result_type result_reg_class:$passthru),
(op1_type op1_reg_class:$rs1),
(op2_type op2_kind:$rs2),
- (mask_type V0),
+ (mask_type VMV0:$vm),
VLOpFrag, (XLenVT timm:$policy))),
(!cast<Instruction>(inst#"_MASK")
(result_type result_reg_class:$passthru),
(op1_type op1_reg_class:$rs1),
(op2_type op2_kind:$rs2),
- (mask_type V0), GPR:$vl, sew, (XLenVT timm:$policy))>;
+ (mask_type VMV0:$vm), GPR:$vl, sew, (XLenVT timm:$policy))>;
class VPatBinaryMaskPolicyRoundingMode<string intrinsic_name,
string inst,
@@ -4188,14 +4188,14 @@ class VPatBinaryMaskPolicyRoundingMode<string intrinsic_name,
(result_type result_reg_class:$passthru),
(op1_type op1_reg_class:$rs1),
(op2_type op2_kind:$rs2),
- (mask_type V0),
+ (mask_type VMV0:$vm),
(XLenVT timm:$round),
VLOpFrag, (XLenVT timm:$policy))),
(!cast<Instruction>(inst#"_MASK")
(result_type result_reg_class:$passthru),
(op1_type op1_reg_class:$rs1),
(op2_type op2_kind:$rs2),
- (mask_type V0),
+ (mask_type VMV0:$vm),
(XLenVT timm:$round),
GPR:$vl, sew, (XLenVT timm:$policy))>;
@@ -4214,13 +4214,13 @@ class VPatBinaryMaskSwapped<string intrinsic_name,
(result_type result_reg_class:$passthru),
(op2_type op2_kind:$rs2),
(op1_type op1_reg_class:$rs1),
- (mask_type V0),
+ (mask_type VMV0:$vm),
VLOpFrag)),
(!cast<Instruction>(inst#"_MASK")
(result_type result_reg_class:$passthru),
(op1_type op1_reg_class:$rs1),
(op2_type op2_kind:$rs2),
- (mask_type V0), GPR:$vl, sew)>;
+ (mask_type VMV0:$vm), GPR:$vl, sew)>;
class VPatTiedBinaryNoMask<string intrinsic_name,
string inst,
@@ -4306,12 +4306,12 @@ class VPatTiedBinaryMask<string intrinsic_name,
(result_type result_reg_class:$passthru),
(result_type result_reg_class:$passthru),
(op2_type op2_kind:$rs2),
- (mask_type V0),
+ (mask_type VMV0:$vm),
VLOpFrag, (XLenVT timm:$policy))),
(!cast<Instruction>(inst#"_MASK_TIED")
(result_type result_reg_class:$passthru),
(op2_type op2_kind:$rs2),
- (mask_type V0), GPR:$vl, sew, (XLenVT timm:$policy))>;
+ (mask_type VMV0:$vm), GPR:$vl, sew, (XLenVT timm:$policy))>;
class VPatTiedBinaryMaskRoundingMode<string intrinsic_name,
string inst,
@@ -4325,13 +4325,13 @@ class VPatTiedBinaryMaskRoundingMode<string intrinsic_name,
(result_type result_reg_class:$passthru),
(result_type result_reg_class:$passthru),
(op2_type op2_kind:$rs2),
- (mask_type V0),
+ (mask_type VMV0:$vm),
(XLenVT timm:$round),
VLOpFrag, (XLenVT timm:$policy))),
(!cast<Instruction>(inst#"_MASK_TIED")
(result_type result_reg_class:$passthru),
(op2_type op2_kind:$rs2),
- (mask_type V0),
+ (mask_type VMV0:$vm),
(XLenVT timm:$round),
GPR:$vl, sew, (XLenVT timm:$policy))>;
@@ -4447,13 +4447,13 @@ class VPatTernaryMaskPolicy<string intrinsic,
(result_type result_reg_class:$rs3),
(op1_type op1_reg_class:$rs1),
(op2_type op2_kind:$rs2),
- (mask_type V0),
+ (mask_type VMV0:$vm),
VLOpFrag, (XLenVT timm:$policy))),
(!cast<Instruction>(inst#...
[truncated]
Thanks! I just had a quick scan and I like this method to work around the limitation.
LGTM but please wait for another approval.
As we talked about last night, very supportive of this direction. I think this approach works out.
One suggestion: can we stage this into two commits to make the diffs easier to understand? If you do the new pass immediately after ISEL, I think you get something which is much closer to NFC. This lets us land the majority of the cleanup. Then you can do a separate change which moves it before regalloc, and we can focus on the interaction with all the passes being moved over.
@@ -1495,47 +1495,73 @@ declare <vscale x 16 x double> @llvm.vp.nearbyint.nxv16f64(<vscale x 16 x double
define <vscale x 16 x double> @vp_nearbyint_nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_nearbyint_nxv16f64:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
There are a couple of cases like this in the test diffs where we need a frame setup after the change and not before. I'm pretty sure this is an artifact of scheduling - and thus not blocking for this change - but this is interesting, and possibly worth a bit of digging into as a followup.
@@ -1515,40 +1515,36 @@ define <vscale x 16 x double> @vp_round_nxv16f64(<vscale x 16 x double> %va, <vs
; CHECK-NEXT: vmv1r.v v0, v6
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v16, v0.t
; CHECK-NEXT: addi a2, sp, 16
; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
Huh? We spill something, and then immediately fill it back to the same register?
(This is not new behavior, the prior code was actually worse.)
; CHECK-NEXT: beqz a0, .LBB4_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, mu
; CHECK-NEXT: vmv.v.i v9, 0
; CHECK-NEXT: vid.v v9, v0.t
Complete aside: We could use a start value of -1 here, use a vredmaxs instead, and move the conditional branch to be after the reduction, and guard only the slidedown and extract. Doing so would remove a vcpop. This assumes that the passthru case is exceedingly rare.
Actually, we could replace the slidedown entirely with another masked reduction (mask being a comparison of the vid to the result of the max), and feed the passthru through as the start value for an entirely branchless sequence. Not sure the required vrgather.vi is worth it there though.
bool V0Clobbered = V0ClobberedOnEntry.contains(MBB);
for (MachineInstr &MI : *MBB) {
  assert(!(MI.readsRegister(RISCV::V0, TRI) && V0Clobbered));
  if (MI.modifiesRegister(RISCV::V0, TRI))
I'm not sure this invariant holds. Consider the following:
// inline asm that sets V0
// call to some VP intrinsic with a mask
// inline asm which reads V0
This sequence is fully UB, but I think your assert might spurious fail in this case.
I added a test case for this in vmv0-elimination.ll, but couldn't get the assertion to trigger.
It looks like any inline asm that reads v0 (i.e. has a v0 input constraint: the vm constraint isn't affected here) will end up emitting a glued COPY to $v0 before it:
bb.0 (%ir-block.0):
liveins: $v8, $v9, $v0, $x10
%3:gpr = COPY $x10
%2:vr = COPY $v0
%1:vr = COPY $v9
%0:vr = COPY $v8
%4:vr = COPY %0:vr
%5:vr = COPY %1:vr
INLINEASM &"vadd.vv $0, $1, $2" [attdialect], $0:[regdef], implicit-def $v0, $1:[reguse:VR], %4:vr, $2:[reguse:VR], %5:vr
%8:vr = COPY $v0
%10:vmv0 = COPY %2:vr
%9:vrnov0 = PseudoVADD_VV_M1_MASK $noreg(tied-def 0), %0:vr, %1:vr, %10:vmv0, -1, 6, 0
PseudoVSE64_V_M1 killed %9:vrnov0, %3:gpr, -1, 6 :: (store (<vscale x 1 x s64>) into %ir.p)
%7:vr = COPY %0:vr
$v0 = COPY %8:vr
INLINEASM &"vadd.vv $0, $1, $2" [attdialect], $0:[regdef:VR], def %6:vr, $1:[reguse:VR], %7:vr, $2:[reguse], $v0
$v8 = COPY %6:vr
PseudoRET implicit $v8
So any inline asm which reads V0 shouldn't be clobbered. Should we maintain this stricter invariant then?
};

#ifndef NDEBUG
// Assert that we won't clobber any existing reads of V0 where we need to
Can we use the livein list on the basic blocks?
I've replaced the map with an assertion that none of the successors have v0 as a live-in if v0 is clobbered.
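For reference, a minimal sketch of what such a live-in based check could look like (not the actual patch; MF and TRI are assumed to be in scope):

#ifndef NDEBUG
// If a block ends with V0 clobbered by an inserted copy, no successor should
// still list V0 as live on entry.
for (const MachineBasicBlock &MBB : MF) {
  bool V0Clobbered = false;
  for (const MachineInstr &MI : MBB) {
    if (MI.readsRegister(RISCV::V0, TRI))
      assert(!V0Clobbered && "Inserted copy clobbered a reader of V0");
    if (MI.modifiesRegister(RISCV::V0, TRI))
      V0Clobbered = true;
  }
  if (V0Clobbered)
    for (const MachineBasicBlock *Succ : MBB.successors())
      assert(!Succ->isLiveIn(RISCV::V0) &&
             "V0 clobbered at end of block but live into a successor");
}
#endif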
// register in the singleton vmv0 register class instead of copying them to $v0
// straight away, to make optimizing masks easier.
//
// However the register allocator struggles with singleton register classes and
This is a hand-wavy and poor explanation of the problem and makes it sound like a workaround. The problem is primarily with every other piece of code that can now produce situations where multiple values must be live at the same time.
To my knowledge, it is a workaround? I don't understand your second sentence. All the words individually make sense, I just don't get the connection to the point. I've never really understood why 3 virtual registers with overlapping live ranges in a 2 element register class is fine, but 2 overlapping virtual registers in a 1 element register class isn't.
Also, do you have better wording to use here? Or is this more than a request to reword a comment?
I've never really understood why 3 virtual registers with overlapping live ranges in a 2 element register class is fine, but 2 overlapping virtual registers in a 1 element register class isn't.
Maybe this is just a limitation of how GreedyRegAlloc implements spilling, i.e. it requires at least two registers in a class?
One way to think of RISCVVMV0Elimination is that it "spills" vmv0 register classes to vr register classes by inserting copies and inflating the register class. That way we're removing any overlapping live ranges.
The problem is if you have more than one value that needs to be live at the same point, i.e. at the same instruction.
%0:my_rc = DEF0
%1:my_rc = DEF1
SOME_INST implicit %0, implicit %1
This is the kind of case that will fail. Other cases, where they don't need to be live at the same point, should work.
Off the top of my head, the reason we were ending up with instructions with two separate uses of vmv0 was that copies were getting coalesced into the vmv0 class. There shouldn't be any instructions with two vmv0 operands.
It might be worth trying to change this pass to inflate any unnecessary vmv0 register classes so we avoid this situation? Ideally in a separate PR so we minimise the test diff from isel's current behaviour
Luke, I'm not clear what you mean by "this pass". It sounds like you're referring to the VMV0 elimination pass from this patch, but that doesn't seem to make sense in context? Were you maybe referring to Register Coalescing?
My idea is that RISCVVMV0Elimination sits just before register allocation, after register coalescing. But instead of emitting copies to V0, it finds instructions with two or more distinct uses of VMV0.
Only one of the uses should need to be in VMV0, i.e. the mask operand.
For the other uses, it would either inflate the uses to VR or, if not possible, emit a COPY to VR.
That way each instruction would have at most one use of VMV0, and this avoids the scenario that @arsenm was mentioning, which should be allocatable.
I'm suggesting we could explore it as a follow up to this patch.
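To make the follow-up idea concrete, here is a rough sketch of rewriting the extra vmv0 uses on a single instruction (MI, MRI, TII, and MaskOpIdx are assumed to be available, e.g. via RISCV::getMaskedPseudoInfo; this is not code from the patch):

// Ensure at most one VMV0 use per instruction: every non-mask use of a VMV0
// vreg is rewritten to a fresh VR vreg fed by a COPY.
for (unsigned OpNo = 0, E = MI.getNumOperands(); OpNo != E; ++OpNo) {
  MachineOperand &MO = MI.getOperand(OpNo);
  if (!MO.isReg() || !MO.isUse() || !MO.getReg().isVirtual() ||
      OpNo == MaskOpIdx || // The mask operand is the one use that must stay VMV0.
      MRI.getRegClass(MO.getReg()) != &RISCV::VMV0RegClass)
    continue;
  Register NewReg = MRI.createVirtualRegister(&RISCV::VRRegClass);
  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
          TII->get(TargetOpcode::COPY), NewReg)
      .addReg(MO.getReg());
  MO.setReg(NewReg);
  // If no remaining use requires VMV0, MRI.recomputeRegClass() could instead
  // inflate the original vreg in place and avoid the COPY entirely.
}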
Force-pushed from d1a4034 to 06f71b0.
I've shuffled it back to just after RISCVVectorPeephole (so we can remove the V0 def map in it) and peeked through the first COPY to match the behaviour of SelectionDAG, so most of the test diffs should be gone now. The one place I couldn't get rid of the diffs is in the nearbyint tests. The changes are due to how the VFROUND pseudos are expanded in such a way that two pseudos might share the same $v0 COPY, instead of having one copy per pseudo, e.g.:
It's possible to remove redundant copies to $v0 in this pass but it ends up generating more test diffs, so I'm shelving that idea for later.
@@ -599,6 +602,10 @@ void RISCVPassConfig::addMachineSSAOptimization() {
}

void RISCVPassConfig::addPreRegAlloc() {
  // TODO: Move this as late as possible before regalloc
  if (TM->getOptLevel() == CodeGenOptLevel::None)
At -O0 we don't run addMachineSSAOptimization, so we still need to eliminate them here.
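A hedged sketch of what the hook under discussion plausibly looks like (the hunk quoted above is cut off, so the body of the if is a guess rather than a quote from the patch):

void RISCVPassConfig::addPreRegAlloc() {
  // TODO: Move this as late as possible before regalloc.
  // addMachineSSAOptimization (where the elimination pass normally runs) is
  // skipped at -O0, so eliminate vmv0 uses here in that case.
  if (TM->getOptLevel() == CodeGenOptLevel::None)
    addPass(createRISCVVMV0EliminationPass());
  // ... existing pre-RA passes ...
}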
This is another attempt at llvm#88496 to keep mask operands in SSA after instruction selection. Previously we selected the mask operands into vmv0, a singleton register class with exactly one register, V0. But the register allocator doesn't really support singleton register classes and we ran into errors like "ran out of registers during register allocation in function". This avoids this by introducing a pass just before register allocation that converts any use of vmv0 to a copy to $v0, i.e. what isel currently does today. That way the register allocator doesn't need to deal with the singleton register class, but get the benefits of having the mask registers in SSA throughout the backend: - This allows RISCVVLOptimizer to reduce the VLs of instructions that define mask registers - It enables CSE and code sinking in more places - It removes the need to peek through mask copies in RISCVISelDAGToDAG and keep track of V0 defs in RISCVVectorPeephole As a follow up, we can move the elimination pass to after phi elimination and outside of SSA, which would unblock the pre-RA scheduler around masked pseudos. This might also help the issue that RISCVVectorMaskDAGMutation tries to solve.
- Peek through copies to match isel better, remove most of test diff
- Use liveins list to check for clobbers across blocks
- Move messages into asserts
- Add test for inline asm use
- Assert no subregister index
Force-pushed from 06f71b0 to b91c582.
LGTM w/comments addressed.
Though, I'd encourage you to wait on review from @topperc. He's better at spotting subtle problems in this type of thing than I am.
if (isVMV0(MCOI)) {
  MachineOperand &MO = MI.getOperand(OpNo);
  Register Src = MO.getReg();
  assert(MO.isUse() && MO.getSubReg() == RISCV::NoSubRegister &&
A detail I'd missed, and that you should probably clarify in the patch description - you are only using the vmv0 register class for mask uses. In theory, we could use it for mask definitions too, but your code assumes we don't do that. This was part of the confusion on the assertion above - we were talking past each other on expectations.
         Src.isVirtual() && "vmv0 use in unexpected form");

  // Peek through a single copy to match what isel does.
  MachineInstr *SrcMI = MRI.getVRegDef(Src);
SrcMI isn't used after this, and can be folded into the if-clause: "if (def; condition)".
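In other words, something along these lines (the condition is illustrative rather than the actual check, which peeks through a single COPY):

if (MachineInstr *SrcMI = MRI.getVRegDef(Src); SrcMI->isFullCopy())
  Src = SrcMI->getOperand(1).getReg();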
MI.isInlineAsm() ||
    MRI.getVRegDef(MO.getReg())->isInlineAsm() &&
    "Non-inline-asm use of vmv0 left behind");
MadeChange = true;
This update is redundant. At this point, the value is unconditionally true at the return below. Please remove, and change the return to "return true;" below.
…ment about how the register coalescer produces unallocatable instructions
LGTM. Thanks! This is GREAT!
LGTM
I went back and looked at the … I wonder if we can fix that by using VRNoV0 on the dest/passthru operand. Due to the early-clobber we never end up using V0 for that operand today anyway. I have a ticket for that in our internal database from 2+ years ago:
This is the follow up to llvm#125026 that keeps mask operands in virtual register form for as long as possible throughout the backend. The diffs in this patch are from MachineCSE/MachineSink/RISCVVLOptimizer kicking in. The invariant that the mask COPY never has a subreg no longer holds after MachineCSE (it coalesces some copies), so it needed to be relaxed.
…of vmv0 (llvm#125026) This is another attempt at llvm#88496 to keep mask operands in SSA after instruction selection. Previously we selected the mask operands into vmv0, a singleton register class with exactly one register, V0. But the register allocator doesn't really support singleton register classes and we ran into errors like "ran out of registers during register allocation in function". This avoids this by introducing a pass just before register allocation that converts any use of vmv0 to a copy to $v0, i.e. what isel currently does today. That way the register allocator doesn't need to deal with the singleton register class, but we get the benefits of having the mask registers in SSA throughout the backend: - This allows RISCVVLOptimizer to reduce the VLs of instructions that define mask registers - It enables CSE and code sinking in more places - It removes the need to peek through mask copies in RISCVISelDAGToDAG and keep track of V0 defs in RISCVVectorPeephole This patch initially eliminates uses of vmv0s after RISCVVectorPeephole to keep the diff to a minimum, and a follow up patch will move it past the other MachineInstr SSA passes. Note that it doesn't try to remove any defs of vmv0 as we shouldn't have any instructions that have any vmv0 outputs. As a further follow up, we can move the elimination pass to after phi elimination and outside of SSA, which would unblock the pre-RA scheduler around masked pseudos. This might also help the issue that RISCVVectorMaskDAGMutation tries to solve.
This is another attempt at #88496 to keep mask operands in SSA after instruction selection.
Previously we selected the mask operands into vmv0, a singleton register class with exactly one register, V0.
But the register allocator doesn't really support singleton register classes and we ran into errors like "ran out of registers during register allocation in function".
This avoids this by introducing a pass just before register allocation that converts any use of vmv0 to a copy to $v0, i.e. what isel currently does today.
That way the register allocator doesn't need to deal with the singleton register class, but we get the benefits of having the mask registers in SSA throughout the backend:

- This allows RISCVVLOptimizer to reduce the VLs of instructions that define mask registers
- It enables CSE and code sinking in more places
- It removes the need to peek through mask copies in RISCVISelDAGToDAG and keep track of V0 defs in RISCVVectorPeephole
This patch initially eliminates uses of vmv0s after RISCVVectorPeephole to keep the diff to a minimum, and a follow up patch will move it past the other MachineInstr SSA passes.
Note that it doesn't try to remove any defs of vmv0 as we shouldn't have any instructions that have any vmv0 outputs.
As a further follow up, we can move the elimination pass to after phi elimination and outside of SSA, which would unblock the pre-RA scheduler around masked pseudos. This might also help the issue that RISCVVectorMaskDAGMutation tries to solve.
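To make the mechanics concrete, here is a rough sketch (not the actual pass; the real one also checks the instruction's operand constraints and peeks through copies) of the core rewrite the elimination pass performs: each use of a virtual register in the vmv0 class is replaced by a COPY into the physical $v0 right before the user, i.e. the form isel used to emit directly.

bool runOnMachineFunction(MachineFunction &MF) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
  bool MadeChange = false;
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      for (MachineOperand &MO : MI.uses()) {
        if (!MO.isReg() || !MO.getReg().isVirtual() ||
            MRI.getRegClass(MO.getReg()) != &RISCV::VMV0RegClass)
          continue;
        // Materialize the mask in $v0 immediately before its user...
        BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
                RISCV::V0)
            .addReg(MO.getReg());
        // ...and make the user read the physical register instead.
        MO.setReg(RISCV::V0);
        MadeChange = true;
      }
    }
  }
  return MadeChange;
}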