Divide vectors that surpass 4 element limit

V-FEXrt · V-FEXrt · commit a90026c858f8 · 2024-12-09T17:34:28.000-07:00
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
@@ -621,7 +621,7 @@ def CountBits :  DXILOp<31, unaryBits> {
 def FirstbitLo : DXILOp<32, unaryBits> {
   let Doc = "Returns the location of the first set bit starting from "
             "the lowest order bit and working upward.";
-  let LLVMIntrinsic = int_dx_firstbitlow;
+  let intrinsics = [ IntrinSelect<int_dx_firstbitlow> ];
   let arguments = [OverloadTy];
   let result = Int32Ty;
   let overloads =
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -108,8 +108,8 @@ class SPIRVInstructionSelector : public InstructionSelector {
                            unsigned Opcode) const;
 
   bool selectFirstBitSet64(Register ResVReg, const SPIRVType *ResType,
-                           MachineInstr &I, unsigned BitSetOpcode,
-                           bool SwapPrimarySide) const;
+                           MachineInstr &I, Register SrcReg,
+                           unsigned BitSetOpcode, bool SwapPrimarySide) const;
 
   bool selectGlobalValue(Register ResVReg, MachineInstr &I,
                          const MachineInstr *Init = nullptr) const;
@@ -3171,23 +3171,116 @@ bool SPIRVInstructionSelector::selectFirstBitSet32(Register ResVReg,
       .constrainAllUses(TII, TRI, RBI);
 }
 
-bool SPIRVInstructionSelector::selectFirstBitSet64(Register ResVReg,
-                                                   const SPIRVType *ResType,
-                                                   MachineInstr &I,
-                                                   unsigned BitSetOpcode,
-                                                   bool SwapPrimarySide) const {
-  Register OpReg = I.getOperand(2).getReg();
-
-  // 1. Split int64 into 2 pieces using a bitcast
+bool SPIRVInstructionSelector::selectFirstBitSet64(
+    Register ResVReg, const SPIRVType *ResType, MachineInstr &I,
+    Register SrcReg, unsigned BitSetOpcode, bool SwapPrimarySide) const {
   unsigned ComponentCount = GR.getScalarOrVectorComponentCount(ResType);
   SPIRVType *BaseType = GR.retrieveScalarOrVectorIntType(ResType);
+  bool ZeroAsNull = STI.isOpenCLEnv();
+  Register ConstIntZero =
+      GR.getOrCreateConstInt(0, I, BaseType, TII, ZeroAsNull);
+  Register ConstIntOne =
+      GR.getOrCreateConstInt(1, I, BaseType, TII, ZeroAsNull);
+
+  // SPIRV doesn't support vectors with more than 4 components. Since the
+  // algoritm below converts i64 -> i32x2 and i64x4 -> i32x8 it can only
+  // operate on vectors with 2 or less components. When largers vectors are
+  // seen. Split them, recurse, then recombine them.
+  if (ComponentCount > 2) {
+    unsigned LeftComponentCount = ComponentCount / 2;
+    unsigned RightComponentCount = ComponentCount - LeftComponentCount;
+    bool LeftIsVector = LeftComponentCount > 1;
+
+    // Split the SrcReg in half into 2 smaller vec registers
+    // (ie i64x4 -> i64x2, i64x2)
+    MachineIRBuilder MIRBuilder(I);
+    SPIRVType *OpType = GR.getOrCreateSPIRVIntegerType(64, MIRBuilder);
+    SPIRVType *LeftVecOpType;
+    SPIRVType *LeftVecResType;
+    if (LeftIsVector) {
+      LeftVecOpType =
+          GR.getOrCreateSPIRVVectorType(OpType, LeftComponentCount, MIRBuilder);
+      LeftVecResType = GR.getOrCreateSPIRVVectorType(
+          BaseType, LeftComponentCount, MIRBuilder);
+    } else {
+      LeftVecOpType = OpType;
+      LeftVecResType = BaseType;
+    }
+
+    SPIRVType *RightVecOpType =
+        GR.getOrCreateSPIRVVectorType(OpType, RightComponentCount, MIRBuilder);
+    SPIRVType *RightVecResType = GR.getOrCreateSPIRVVectorType(
+        BaseType, RightComponentCount, MIRBuilder);
+
+    Register LeftSideIn =
+        MRI->createVirtualRegister(GR.getRegClass(LeftVecOpType));
+    Register RightSideIn =
+        MRI->createVirtualRegister(GR.getRegClass(RightVecOpType));
+
+    bool Result;
+
+    if (LeftIsVector) {
+      auto MIB =
+          BuildMI(*I.getParent(), I, I.getDebugLoc(),
+                  TII.get(SPIRV::OpVectorShuffle))
+              .addDef(LeftSideIn)
+              .addUse(GR.getSPIRVTypeID(LeftVecOpType))
+              .addUse(SrcReg)
+              // Per the spec, repeat the vector if only one vec is needed
+              .addUse(SrcReg);
+
+      for (unsigned J = 0; J < LeftComponentCount; J++) {
+        MIB.addImm(J);
+      }
+
+      Result = MIB.constrainAllUses(TII, TRI, RBI);
+    } else {
+      Result =
+          selectOpWithSrcs(LeftSideIn, LeftVecOpType, I, {SrcReg, ConstIntZero},
+                           SPIRV::OpVectorExtractDynamic);
+    }
+
+    auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(),
+                       TII.get(SPIRV::OpVectorShuffle))
+                   .addDef(RightSideIn)
+                   .addUse(GR.getSPIRVTypeID(RightVecOpType))
+                   .addUse(SrcReg)
+                   // Per the spec, repeat the vector if only one vec is needed
+                   .addUse(SrcReg);
+
+    for (unsigned J = LeftComponentCount; J < ComponentCount; J++) {
+      MIB.addImm(J);
+    }
+
+    Result = Result && MIB.constrainAllUses(TII, TRI, RBI);
+
+    // Recursively call selectFirstBitSet64 on the 2 registers
+    Register LeftSideOut =
+        MRI->createVirtualRegister(GR.getRegClass(LeftVecResType));
+    Register RightSideOut =
+        MRI->createVirtualRegister(GR.getRegClass(RightVecResType));
+    Result = Result &&
+             selectFirstBitSet64(LeftSideOut, LeftVecResType, I, LeftSideIn,
+                                 BitSetOpcode, SwapPrimarySide);
+    Result = Result &&
+             selectFirstBitSet64(RightSideOut, RightVecResType, I, RightSideIn,
+                                 BitSetOpcode, SwapPrimarySide);
+
+    // Join the two resulting registers back into the return type
+    // (ie i32x2, i32x2 -> i32x4)
+    return Result &&
+           selectOpWithSrcs(ResVReg, ResType, I, {LeftSideOut, RightSideOut},
+                            SPIRV::OpCompositeConstruct);
+  }
+
+  // 1. Split int64 into 2 pieces using a bitcast
   MachineIRBuilder MIRBuilder(I);
   SPIRVType *PostCastType =
       GR.getOrCreateSPIRVVectorType(BaseType, 2 * ComponentCount, MIRBuilder);
   Register BitcastReg =
       MRI->createVirtualRegister(GR.getRegClass(PostCastType));
   bool Result =
-      selectOpWithSrcs(BitcastReg, PostCastType, I, {OpReg}, SPIRV::OpBitcast);
+      selectOpWithSrcs(BitcastReg, PostCastType, I, {SrcReg}, SPIRV::OpBitcast);
 
   // 2. Find the first set bit from the primary side for all the pieces in #1
   Register FBSReg = MRI->createVirtualRegister(GR.getRegClass(PostCastType));
@@ -3198,20 +3291,15 @@ bool SPIRVInstructionSelector::selectFirstBitSet64(Register ResVReg,
   Register HighReg = MRI->createVirtualRegister(GR.getRegClass(ResType));
   Register LowReg = MRI->createVirtualRegister(GR.getRegClass(ResType));
 
-  bool ZeroAsNull = STI.isOpenCLEnv();
   bool IsScalarRes = ResType->getOpcode() != SPIRV::OpTypeVector;
   if (IsScalarRes) {
     // if scalar do a vector extract
-    Result = Result &&
-             selectOpWithSrcs(HighReg, ResType, I,
-                              {FBSReg, GR.getOrCreateConstInt(0, I, ResType,
-                                                              TII, ZeroAsNull)},
-                              SPIRV::OpVectorExtractDynamic);
-    Result = Result &&
-             selectOpWithSrcs(LowReg, ResType, I,
-                              {FBSReg, GR.getOrCreateConstInt(1, I, ResType,
-                                                              TII, ZeroAsNull)},
-                              SPIRV::OpVectorExtractDynamic);
+    Result =
+        Result && selectOpWithSrcs(HighReg, ResType, I, {FBSReg, ConstIntZero},
+                                   SPIRV::OpVectorExtractDynamic);
+    Result =
+        Result && selectOpWithSrcs(LowReg, ResType, I, {FBSReg, ConstIntOne},
+                                   SPIRV::OpVectorExtractDynamic);
   } else {
     // if vector do a shufflevector
     auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(),
@@ -3324,7 +3412,7 @@ bool SPIRVInstructionSelector::selectFirstBitHigh(Register ResVReg,
   case 32:
     return selectFirstBitSet32(ResVReg, ResType, I, OpReg, BitSetOpcode);
   case 64:
-    return selectFirstBitSet64(ResVReg, ResType, I, BitSetOpcode,
+    return selectFirstBitSet64(ResVReg, ResType, I, OpReg, BitSetOpcode,
                                /*SwapPrimarySide=*/false);
   default:
     report_fatal_error(
@@ -3350,7 +3438,7 @@ bool SPIRVInstructionSelector::selectFirstBitLow(Register ResVReg,
   case 32:
     return selectFirstBitSet32(ResVReg, ResType, I, OpReg, BitSetOpcode);
   case 64:
-    return selectFirstBitSet64(ResVReg, ResType, I, BitSetOpcode,
+    return selectFirstBitSet64(ResVReg, ResType, I, OpReg, BitSetOpcode,
                                /*SwapPrimarySide=*/true);
   default:
     report_fatal_error("spv_firstbitlow only supports 16,32,64 bits.");
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbitlow.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbitlow.ll
@@ -5,6 +5,7 @@
 ; CHECK-DAG: OpMemoryModel Logical GLSL450
 ; CHECK-DAG: [[u32_t:%.+]] = OpTypeInt 32 0
 ; CHECK-DAG: [[u32x2_t:%.+]] = OpTypeVector [[u32_t]] 2
+; CHECK-DAG: [[u32x3_t:%.+]] = OpTypeVector [[u32_t]] 3
 ; CHECK-DAG: [[u32x4_t:%.+]] = OpTypeVector [[u32_t]] 4
 ; CHECK-DAG: [[const_0:%.*]] = OpConstant [[u32_t]] 0
 ; CHECK-DAG: [[const_0x2:%.*]] = OpConstantComposite [[u32x2_t]] [[const_0]] [[const_0]]
@@ -15,8 +16,12 @@
 ; CHECK-DAG: [[const_neg1x2:%.*]] = OpConstantComposite [[u32x2_t]] [[const_neg1]] [[const_neg1]]
 ; CHECK-DAG: [[u16_t:%.+]] = OpTypeInt 16 0
 ; CHECK-DAG: [[u16x2_t:%.+]] = OpTypeVector [[u16_t]] 2
+; CHECK-DAG: [[u16x3_t:%.+]] = OpTypeVector [[u16_t]] 3
+; CHECK-DAG: [[u16x4_t:%.+]] = OpTypeVector [[u16_t]] 4
 ; CHECK-DAG: [[u64_t:%.+]] = OpTypeInt 64 0
 ; CHECK-DAG: [[u64x2_t:%.+]] = OpTypeVector [[u64_t]] 2
+; CHECK-DAG: [[u64x3_t:%.+]] = OpTypeVector [[u64_t]] 3
+; CHECK-DAG: [[u64x4_t:%.+]] = OpTypeVector [[u64_t]] 4
 ; CHECK-DAG: [[bool_t:%.+]] = OpTypeBool
 ; CHECK-DAG: [[boolx2_t:%.+]] = OpTypeVector [[bool_t]] 2
 
@@ -30,8 +35,8 @@ entry:
   ret i32 %elt.firstbitlow
 }
 
-; CHECK-LABEL: Begin function firstbitlow_2xi32
-define noundef <2 x i32> @firstbitlow_2xi32(<2 x i32> noundef %a) {
+; CHECK-LABEL: Begin function firstbitlow_v2xi32
+define noundef <2 x i32> @firstbitlow_v2xi32(<2 x i32> noundef %a) {
 entry:
 ; CHECK: [[a:%.+]] = OpFunctionParameter [[u32x2_t]]
 ; CHECK: [[ret:%.+]] = OpExtInst [[u32x2_t]] [[glsl_450_ext]] FindILsb [[a]]
@@ -40,6 +45,26 @@ entry:
   ret <2 x i32> %elt.firstbitlow
 }
 
+; CHECK-LABEL: Begin function firstbitlow_v3xi32
+define noundef <3 x i32> @firstbitlow_v3xi32(<3 x i32> noundef %a) {
+entry:
+; CHECK: [[a:%.+]] = OpFunctionParameter [[u32x3_t]]
+; CHECK: [[ret:%.+]] = OpExtInst [[u32x3_t]] [[glsl_450_ext]] FindILsb [[a]]
+; CHECK: OpReturnValue [[ret]]
+  %elt.firstbitlow = call <3 x i32> @llvm.spv.firstbitlow.v3i32(<3 x i32> %a)
+  ret <3 x i32> %elt.firstbitlow
+}
+
+; CHECK-LABEL: Begin function firstbitlow_v4xi32
+define noundef <4 x i32> @firstbitlow_v4xi32(<4 x i32> noundef %a) {
+entry:
+; CHECK: [[a:%.+]] = OpFunctionParameter [[u32x4_t]]
+; CHECK: [[ret:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindILsb [[a]]
+; CHECK: OpReturnValue [[ret]]
+  %elt.firstbitlow = call <4 x i32> @llvm.spv.firstbitlow.v4i32(<4 x i32> %a)
+  ret <4 x i32> %elt.firstbitlow
+}
+
 ; CHECK-LABEL: Begin function firstbitlow_i16
 define noundef i32 @firstbitlow_i16(i16 noundef %a) {
 entry:
@@ -62,6 +87,28 @@ entry:
   ret <2 x i32> %elt.firstbitlow
 }
 
+; CHECK-LABEL: Begin function firstbitlow_v3xi16
+define noundef <3 x i32> @firstbitlow_v3xi16(<3 x i16> noundef %a) {
+entry:
+; CHECK: [[a16:%.+]] = OpFunctionParameter [[u16x3_t]]
+; CHECK: [[a32:%.+]] = OpUConvert [[u32x3_t]] [[a16]]
+; CHECK: [[ret:%.+]] = OpExtInst [[u32x3_t]] [[glsl_450_ext]] FindILsb [[a32]]
+; CHECK: OpReturnValue [[ret]]
+  %elt.firstbitlow = call <3 x i32> @llvm.spv.firstbitlow.v3i16(<3 x i16> %a)
+  ret <3 x i32> %elt.firstbitlow
+}
+
+; CHECK-LABEL: Begin function firstbitlow_v4xi16
+define noundef <4 x i32> @firstbitlow_v4xi16(<4 x i16> noundef %a) {
+entry:
+; CHECK: [[a16:%.+]] = OpFunctionParameter [[u16x4_t]]
+; CHECK: [[a32:%.+]] = OpUConvert [[u32x4_t]] [[a16]]
+; CHECK: [[ret:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindILsb [[a32]]
+; CHECK: OpReturnValue [[ret]]
+  %elt.firstbitlow = call <4 x i32> @llvm.spv.firstbitlow.v4i16(<4 x i16> %a)
+  ret <4 x i32> %elt.firstbitlow
+}
+
 ; CHECK-LABEL: Begin function firstbitlow_i64
 define noundef i32 @firstbitlow_i64(i64 noundef %a) {
 entry:
@@ -96,6 +143,74 @@ entry:
   ret <2 x i32> %elt.firstbitlow
 }
 
+; CHECK-LABEL: Begin function firstbitlow_v3i64
+define noundef <3 x i32> @firstbitlow_v3i64(<3 x i64> noundef %a) {
+entry:
+; Split the i64x3 into i64, i64x2
+; CHECK: [[a:%.+]] = OpFunctionParameter [[u64x3_t]]
+; CHECK: [[left:%.+]] = OpVectorExtractDynamic [[u64_t]] [[a]] [[const_0]]
+; CHECK: [[right:%.+]] = OpVectorShuffle [[u64x2_t]] [[a]] [[a]] 1 2
+
+; Do firstbitlow on i64, i64x2
+; CHECK: [[left_cast:%.+]] = OpBitcast [[u32x2_t]] [[left]]
+; CHECK: [[left_lsb_bits:%.+]] = OpExtInst [[u32x2_t]] [[glsl_450_ext]] FindILsb [[left_cast]]
+; CHECK: [[left_high_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[left_lsb_bits]] [[const_0]]
+; CHECK: [[left_low_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[left_lsb_bits]] [[const_1]]
+; CHECK: [[left_should_use_high:%.+]] = OpIEqual [[bool_t]] [[left_low_bits]] [[const_neg1]]
+; CHECK: [[left_ans_bits:%.+]] = OpSelect [[u32_t]] [[left_should_use_high]] [[left_high_bits]] [[left_low_bits]]
+; CHECK: [[left_ans_offset:%.+]] = OpSelect [[u32_t]] [[left_should_use_high]] [[const_32]] [[const_0]]
+; CHECK: [[left_res:%.+]] = OpIAdd [[u32_t]] [[left_ans_offset]] [[left_ans_bits]]
+
+; CHECK: [[right_cast:%.+]] = OpBitcast [[u32x4_t]] [[right]]
+; CHECK: [[right_lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindILsb [[right_cast]]
+; CHECK: [[right_high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[right_lsb_bits]] [[right_lsb_bits]] 0 2
+; CHECK: [[right_low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[right_lsb_bits]] [[right_lsb_bits]] 1 3
+; CHECK: [[right_should_use_high:%.+]] = OpIEqual [[boolx2_t]] [[right_low_bits]] [[const_neg1x2]]
+; CHECK: [[right_ans_bits:%.+]] = OpSelect [[u32x2_t]] [[right_should_use_high]] [[right_high_bits]] [[right_low_bits]]
+; CHECK: [[right_ans_offset:%.+]] = OpSelect [[u32x2_t]] [[right_should_use_high]] [[const_32x2]] [[const_0x2]]
+; CHECK: [[right_res:%.+]] = OpIAdd [[u32x2_t]] [[right_ans_offset]] [[right_ans_bits]]
+
+; Merge the resulting i32, i32x2 into the final i32x3 and return it
+; CHECK: [[ret:%.+]] = OpCompositeConstruct [[u32x3_t]] [[left_res]] [[right_res]]
+; CHECK: OpReturnValue [[ret]]
+  %elt.firstbitlow = call <3 x i32> @llvm.spv.firstbitlow.v3i64(<3 x i64> %a)
+  ret <3 x i32> %elt.firstbitlow
+}
+
+; CHECK-LABEL: Begin function firstbitlow_v4i64
+define noundef <4 x i32> @firstbitlow_v4i64(<4 x i64> noundef %a) {
+entry:
+; Split the i64x4 into 2 i64x2
+; CHECK: [[a:%.+]] = OpFunctionParameter [[u64x4_t]]
+; CHECK: [[left:%.+]] = OpVectorShuffle [[u64x2_t]] [[a]] [[a]] 0 1
+; CHECK: [[right:%.+]] = OpVectorShuffle [[u64x2_t]] [[a]] [[a]] 2 3
+
+; Do firstbitlow on the 2 i64x2
+; CHECK: [[left_cast:%.+]] = OpBitcast [[u32x4_t]] [[left]]
+; CHECK: [[left_lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindILsb [[left_cast]]
+; CHECK: [[left_high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[left_lsb_bits]] [[left_lsb_bits]] 0 2
+; CHECK: [[left_low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[left_lsb_bits]] [[left_lsb_bits]] 1 3
+; CHECK: [[left_should_use_high:%.+]] = OpIEqual [[boolx2_t]] [[left_low_bits]] [[const_neg1x2]]
+; CHECK: [[left_ans_bits:%.+]] = OpSelect [[u32x2_t]] [[left_should_use_high]] [[left_high_bits]] [[left_low_bits]]
+; CHECK: [[left_ans_offset:%.+]] = OpSelect [[u32x2_t]] [[left_should_use_high]] [[const_32x2]] [[const_0x2]]
+; CHECK: [[left_res:%.+]] = OpIAdd [[u32x2_t]] [[left_ans_offset]] [[left_ans_bits]]
+
+; CHECK: [[right_cast:%.+]] = OpBitcast [[u32x4_t]] [[right]]
+; CHECK: [[right_lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindILsb [[right_cast]]
+; CHECK: [[right_high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[right_lsb_bits]] [[right_lsb_bits]] 0 2
+; CHECK: [[right_low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[right_lsb_bits]] [[right_lsb_bits]] 1 3
+; CHECK: [[right_should_use_high:%.+]] = OpIEqual [[boolx2_t]] [[right_low_bits]] [[const_neg1x2]]
+; CHECK: [[right_ans_bits:%.+]] = OpSelect [[u32x2_t]] [[right_should_use_high]] [[right_high_bits]] [[right_low_bits]]
+; CHECK: [[right_ans_offset:%.+]] = OpSelect [[u32x2_t]] [[right_should_use_high]] [[const_32x2]] [[const_0x2]]
+; CHECK: [[right_res:%.+]] = OpIAdd [[u32x2_t]] [[right_ans_offset]] [[right_ans_bits]]
+
+; Merge the resulting 2 i32x2 into the final i32x4 and return it
+; CHECK: [[ret:%.+]] = OpCompositeConstruct [[u32x4_t]] [[left_res]] [[right_res]]
+; CHECK: OpReturnValue [[ret]]
+  %elt.firstbitlow = call <4 x i32> @llvm.spv.firstbitlow.v4i64(<4 x i64> %a)
+  ret <4 x i32> %elt.firstbitlow
+}
+
 ;declare i16 @llvm.spv.firstbitlow.i16(i16)
 ;declare i32 @llvm.spv.firstbitlow.i32(i32)
 ;declare i64 @llvm.spv.firstbitlow.i64(i64)