-
Notifications
You must be signed in to change notification settings - Fork 13.4k
[HLSL][SPIRV][DXIL] Implement dot4add_i8packed intrinsic
#113623
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
✅ With the latest revision this PR passed the C/C++ code formatter.
8583e87
to
81dfa26
Compare
@llvm/pr-subscribers-backend-x86 @llvm/pr-subscribers-llvm-ir
Author: Finn Plummer (inbelic)
Changes: Resolves #99220
Full diff: https://github.com/llvm/llvm-project/pull/113623.diff
12 Files Affected:
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 90475a361bb8f8..eb6b07e8858602 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -4792,6 +4792,12 @@ def HLSLDotProduct : LangBuiltin<"HLSL_LANG"> {
let Prototype = "void(...)";
}
+def HLSLDot4AddI8Packed : LangBuiltin<"HLSL_LANG"> {
+ let Spellings = ["__builtin_hlsl_dot4add_i8packed"];
+ let Attributes = [NoThrow, Const];
+ let Prototype = "int(unsigned int, unsigned int, int)";
+}
+
def HLSLFrac : LangBuiltin<"HLSL_LANG"> {
let Spellings = ["__builtin_hlsl_elementwise_frac"];
let Attributes = [NoThrow, Const];
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 28f28c70b5ae52..13ed0f99da9815 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18722,7 +18722,17 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
/*ReturnType=*/T0->getScalarType(),
getDotProductIntrinsic(CGM.getHLSLRuntime(), VecTy0->getElementType()),
ArrayRef<Value *>{Op0, Op1}, nullptr, "hlsl.dot");
- } break;
+ }
+ case Builtin::BI__builtin_hlsl_dot4add_i8packed: {
+ Value *A = EmitScalarExpr(E->getArg(0));
+ Value *B = EmitScalarExpr(E->getArg(1));
+ Value *C = EmitScalarExpr(E->getArg(2));
+
+ Intrinsic::ID ID = CGM.getHLSLRuntime().getDot4AddI8PackedIntrinsic();
+ return Builder.CreateIntrinsic(
+ /*ReturnType=*/C->getType(), ID, ArrayRef<Value *>{A, B, C}, nullptr,
+ "hlsl.dot4add.i8packed");
+ }
case Builtin::BI__builtin_hlsl_lerp: {
Value *X = EmitScalarExpr(E->getArg(0));
Value *Y = EmitScalarExpr(E->getArg(1));
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h
index ff7df41b5c62e7..8b1141375106cc 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.h
+++ b/clang/lib/CodeGen/CGHLSLRuntime.h
@@ -89,6 +89,7 @@ class CGHLSLRuntime {
GENERATE_HLSL_INTRINSIC_FUNCTION(FDot, fdot)
GENERATE_HLSL_INTRINSIC_FUNCTION(SDot, sdot)
GENERATE_HLSL_INTRINSIC_FUNCTION(UDot, udot)
+ GENERATE_HLSL_INTRINSIC_FUNCTION(Dot4AddI8Packed, dot4add_i8packed)
GENERATE_HLSL_INTRINSIC_FUNCTION(WaveIsFirstLane, wave_is_first_lane)
GENERATE_HLSL_INTRINSIC_FUNCTION(WaveReadLaneAt, wave_readlane)
diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
index 30dce60b3ff702..d10bfcbeed97ea 100644
--- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h
+++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
@@ -894,6 +894,16 @@ uint64_t dot(uint64_t3, uint64_t3);
_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot)
uint64_t dot(uint64_t4, uint64_t4);
+//===----------------------------------------------------------------------===//
+// dot4add builtins
+//===----------------------------------------------------------------------===//
+
+/// \fn int dot4add_i8packed(uint A, uint B, int C)
+
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.4)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot4add_i8packed)
+int dot4add_i8packed(unsigned int, unsigned int, int);
+
//===----------------------------------------------------------------------===//
// exp builtins
//===----------------------------------------------------------------------===//
diff --git a/clang/test/CodeGenHLSL/builtins/dot4add_i8packed.hlsl b/clang/test/CodeGenHLSL/builtins/dot4add_i8packed.hlsl
new file mode 100644
index 00000000000000..ea1a33d6267d2f
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/dot4add_i8packed.hlsl
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \
+// RUN: dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \
+// RUN: FileCheck %s -DTARGET=dx
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \
+// RUN: spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \
+// RUN: FileCheck %s -DTARGET=spv
+
+// Test basic lowering to runtime function call.
+
+// CHECK-LABEL: test
+int test(uint a, uint b, int c) {
+ // CHECK: %[[RET:.*]] = call [[TY:i32]] @llvm.[[TARGET]].dot4add.i8packed([[TY]] %[[#]], [[TY]] %[[#]], [[TY]] %[[#]])
+ // CHECK: ret [[TY]] %[[RET]]
+ return dot4add_i8packed(a, b, c);
+}
+
+// CHECK: declare [[TY]] @llvm.[[TARGET]].dot4add.i8packed([[TY]], [[TY]], [[TY]])
diff --git a/clang/test/SemaHLSL/BuiltIns/dot4add_i8packed-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/dot4add_i8packed-errors.hlsl
new file mode 100644
index 00000000000000..ac0b430bfaf945
--- /dev/null
+++ b/clang/test/SemaHLSL/BuiltIns/dot4add_i8packed-errors.hlsl
@@ -0,0 +1,28 @@
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -emit-llvm-only -disable-llvm-passes -verify
+
+int test_too_few_arg0() {
+ return __builtin_hlsl_dot4add_i8packed();
+ // expected-error@-1 {{too few arguments to function call, expected 3, have 0}}
+}
+
+int test_too_few_arg1(int p0) {
+ return __builtin_hlsl_dot4add_i8packed(p0);
+ // expected-error@-1 {{too few arguments to function call, expected 3, have 1}}
+}
+
+int test_too_few_arg2(int p0) {
+ return __builtin_hlsl_dot4add_i8packed(p0, p0);
+ // expected-error@-1 {{too few arguments to function call, expected 3, have 2}}
+}
+
+int test_too_many_arg(int p0) {
+ return __builtin_hlsl_dot4add_i8packed(p0, p0, p0, p0);
+ // expected-error@-1 {{too many arguments to function call, expected 3, have 4}}
+}
+
+struct S { float f; };
+
+int test_expr_struct_type_check(S p0, int p1) {
+ return __builtin_hlsl_dot4add_i8packed(p0, p1, p1);
+ // expected-error@-1 {{no viable conversion from 'S' to 'unsigned int'}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td
index e30d37f69f781e..8cd5ff9006c1b7 100644
--- a/llvm/include/llvm/IR/IntrinsicsDirectX.td
+++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td
@@ -69,6 +69,7 @@ def int_dx_udot :
DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
[llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>],
[IntrNoMem, Commutative] >;
+ def int_dx_dot4add_i8packed : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_dx_frac : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
def int_dx_degrees : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>;
diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
index 6df2eb156a0774..ebea18cd932617 100644
--- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td
+++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
@@ -83,6 +83,7 @@ let TargetPrefix = "spv" in {
DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
[llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>],
[IntrNoMem, Commutative] >;
+ def int_spv_dot4add_i8packed : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_spv_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrConvergent]>;
def int_spv_wave_readlane : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrConvergent, IntrNoMem]>;
def int_spv_sign : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_any_ty], [IntrNoMem]>;
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index 147b32b1ca9903..f0f40de5009f52 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -779,6 +779,16 @@ def FlattenedThreadIdInGroup : DXILOp<96, flattenedThreadIdInGroup> {
let attributes = [Attributes<DXIL1_0, [ReadNone]>];
}
+def Dot4AddI8Packed : DXILOp<163, dot4AddPacked> {
+ let Doc = "signed dot product of 4 x i8 vectors packed into i32, with "
+ "accumulate to i32";
+ let LLVMIntrinsic = int_dx_dot4add_i8packed;
+ let arguments = [Int32Ty, Int32Ty, Int32Ty];
+ let result = Int32Ty;
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+}
+
def AnnotateHandle : DXILOp<217, annotateHandle> {
let Doc = "annotate handle with resource properties";
let arguments = [HandleTy, ResPropsTy];
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index d9377fe4b91a1a..d297b2fa07209f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -164,6 +164,10 @@ class SPIRVInstructionSelector : public InstructionSelector {
bool selectIntegerDot(Register ResVReg, const SPIRVType *ResType,
MachineInstr &I) const;
+ template <bool Signed>
+ bool selectDot4AddPacked(Register ResVReg, const SPIRVType *ResType,
+ MachineInstr &I) const;
+
void renderImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
int OpIdx) const;
void renderFImm64(MachineInstrBuilder &MIB, const MachineInstr &I,
@@ -1694,6 +1698,84 @@ bool SPIRVInstructionSelector::selectIntegerDot(Register ResVReg,
return Result;
}
+// Since pre-1.6 SPIRV has no DotProductInput4x8BitPacked implementation,
+// extract the elements of the packed inputs, multiply them and add the result
+// to the accumulator.
+template <bool Signed>
+bool SPIRVInstructionSelector::selectDot4AddPacked(Register ResVReg,
+ const SPIRVType *ResType,
+ MachineInstr &I) const {
+ assert(I.getNumOperands() == 5);
+ assert(I.getOperand(2).isReg());
+ assert(I.getOperand(3).isReg());
+ assert(I.getOperand(4).isReg());
+ MachineBasicBlock &BB = *I.getParent();
+
+ bool Result = false;
+
+ // Acc = C
+ Register Acc = I.getOperand(4).getReg();
+ SPIRVType *EltType = GR.getOrCreateSPIRVIntegerType(8, I, TII);
+ auto ExtractOp =
+ Signed ? SPIRV::OpBitFieldSExtract : SPIRV::OpBitFieldUExtract;
+
+ // Extract the i8 element, multiply and add it to the accumulator
+ for (unsigned i = 0; i < 4; i++) {
+ // A[i]
+ Register AElt = MRI->createVirtualRegister(&SPIRV::IDRegClass);
+ Result |= BuildMI(BB, I, I.getDebugLoc(), TII.get(ExtractOp))
+ .addDef(AElt)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(I.getOperand(2).getReg())
+ .addUse(GR.getOrCreateConstInt(i * 8, I, EltType, TII))
+ .addImm(8)
+ .constrainAllUses(TII, TRI, RBI);
+
+ // B[i]
+ Register BElt = MRI->createVirtualRegister(&SPIRV::IDRegClass);
+ Result |= BuildMI(BB, I, I.getDebugLoc(), TII.get(ExtractOp))
+ .addDef(BElt)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(I.getOperand(3).getReg())
+ .addUse(GR.getOrCreateConstInt(i * 8, I, EltType, TII))
+ .addImm(8)
+ .constrainAllUses(TII, TRI, RBI);
+
+ // A[i] * B[i]
+ Register Mul = MRI->createVirtualRegister(&SPIRV::IDRegClass);
+ Result |= BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpIMulS))
+ .addDef(Mul)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(AElt)
+ .addUse(BElt)
+ .constrainAllUses(TII, TRI, RBI);
+
+ // Discard 24 highest-bits so that stored i32 register is i8 equivalent
+ Register MaskMul = MRI->createVirtualRegister(&SPIRV::IDRegClass);
+ Result |= BuildMI(BB, I, I.getDebugLoc(), TII.get(ExtractOp))
+ .addDef(MaskMul)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(Mul)
+ .addUse(GR.getOrCreateConstInt(0, I, EltType, TII))
+ .addImm(8)
+ .constrainAllUses(TII, TRI, RBI);
+
+ // Acc = Acc + A[i] * B[i]
+ Register Sum =
+ i < 3 ? MRI->createVirtualRegister(&SPIRV::IDRegClass) : ResVReg;
+ Result |= BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpIAddS))
+ .addDef(Sum)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(Acc)
+ .addUse(MaskMul)
+ .constrainAllUses(TII, TRI, RBI);
+
+ Acc = Sum;
+ }
+
+ return Result;
+}
+
/// Transform saturate(x) to clamp(x, 0.0f, 1.0f) as SPIRV
/// does not have a saturate builtin.
bool SPIRVInstructionSelector::selectSaturate(Register ResVReg,
@@ -2527,6 +2609,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
case Intrinsic::spv_udot:
case Intrinsic::spv_sdot:
return selectIntegerDot(ResVReg, ResType, I);
+ case Intrinsic::spv_dot4add_i8packed:
+ return selectDot4AddPacked<true>(ResVReg, ResType, I);
case Intrinsic::spv_all:
return selectAll(ResVReg, ResType, I);
case Intrinsic::spv_any:
diff --git a/llvm/test/CodeGen/DirectX/dot4add_i8packed.ll b/llvm/test/CodeGen/DirectX/dot4add_i8packed.ll
new file mode 100644
index 00000000000000..7df0520505cea6
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/dot4add_i8packed.ll
@@ -0,0 +1,10 @@
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-compute %s | FileCheck %s
+
+define void @main(i32 %a, i32 %b, i32 %c) {
+entry:
+; CHECK: call i32 @dx.op.dot4AddPacked(i32 163, i32 %a, i32 %b, i32 %c)
+ %0 = call i32 @llvm.dx.dot4add.i8packed(i32 %a, i32 %b, i32 %c)
+ ret void
+}
+
+declare i32 @llvm.dx.dot4add.i8packed(i32, i32, i32)
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/dot4add_i8packed.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/dot4add_i8packed.ll
new file mode 100644
index 00000000000000..35e2a731071103
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/dot4add_i8packed.ll
@@ -0,0 +1,48 @@
+; RUN: llc -O0 -mtriple=spirv32v1.3-vulkan-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32v1.3-vulkan-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: %[[#int_32:]] = OpTypeInt 32 0
+; CHECK-DAG: %[[#int_8:]] = OpTypeInt 8 0
+; CHECK-DAG: %[[#zero:]] = OpConstantNull %[[#int_8]]
+; CHECK-DAG: %[[#eight:]] = OpConstant %[[#int_8]] 8
+; CHECK-DAG: %[[#sixteen:]] = OpConstant %[[#int_8]] 16
+; CHECK-DAG: %[[#twentyfour:]] = OpConstant %[[#int_8]] 24
+; CHECK-LABEL: Begin function test_dot
+define noundef i32 @test_dot(i32 noundef %a, i32 noundef %b, i32 noundef %c) {
+entry:
+; CHECK: %[[#A:]] = OpFunctionParameter %[[#int_32]]
+; CHECK: %[[#B:]] = OpFunctionParameter %[[#int_32]]
+; CHECK: %[[#C:]] = OpFunctionParameter %[[#int_32]]
+
+; First element of the packed vector
+; CHECK: %[[#A0:]] = OpBitFieldSExtract %[[#int_32]] %[[#A]] %[[#zero]] 8
+; CHECK: %[[#B0:]] = OpBitFieldSExtract %[[#int_32]] %[[#B]] %[[#zero]] 8
+; CHECK: %[[#MUL0:]] = OpIMul %[[#int_32]] %[[#A0]] %[[#B0]]
+; CHECK: %[[#MASK0:]] = OpBitFieldSExtract %[[#int_32]] %[[#MUL0]] %[[#zero]] 8
+; CHECK: %[[#ACC0:]] = OpIAdd %[[#int_32]] %[[#C]] %[[#MASK0]]
+
+; Second element of the packed vector
+; CHECK: %[[#A1:]] = OpBitFieldSExtract %[[#int_32]] %[[#A]] %[[#eight]] 8
+; CHECK: %[[#B1:]] = OpBitFieldSExtract %[[#int_32]] %[[#B]] %[[#eight]] 8
+; CHECK: %[[#MUL1:]] = OpIMul %[[#int_32]] %[[#A1]] %[[#B1]]
+; CHECK: %[[#MASK1:]] = OpBitFieldSExtract %[[#int_32]] %[[#MUL1]] %[[#zero]] 8
+; CHECK: %[[#ACC1:]] = OpIAdd %[[#int_32]] %[[#ACC0]] %[[#MASK1]]
+
+; Third element of the packed vector
+; CHECK: %[[#A2:]] = OpBitFieldSExtract %[[#int_32]] %[[#A]] %[[#sixteen]] 8
+; CHECK: %[[#B2:]] = OpBitFieldSExtract %[[#int_32]] %[[#B]] %[[#sixteen]] 8
+; CHECK: %[[#MUL2:]] = OpIMul %[[#int_32]] %[[#A2]] %[[#B2]]
+; CHECK: %[[#MASK2:]] = OpBitFieldSExtract %[[#int_32]] %[[#MUL2]] %[[#zero]] 8
+; CHECK: %[[#ACC2:]] = OpIAdd %[[#int_32]] %[[#ACC1]] %[[#MASK2]]
+
+; Fourth element of the packed vector
+; CHECK: %[[#A3:]] = OpBitFieldSExtract %[[#int_32]] %[[#A]] %[[#twentyfour]] 8
+; CHECK: %[[#B3:]] = OpBitFieldSExtract %[[#int_32]] %[[#B]] %[[#twentyfour]] 8
+; CHECK: %[[#MUL3:]] = OpIMul %[[#int_32]] %[[#A3]] %[[#B3]]
+; CHECK: %[[#MASK3:]] = OpBitFieldSExtract %[[#int_32]] %[[#MUL3]] %[[#zero]] 8
+; CHECK: %[[#ACC3:]] = OpIAdd %[[#int_32]] %[[#ACC2]] %[[#MASK3]]
+
+; CHECK: OpReturnValue %[[#ACC3]]
+ %spv.dot = call i32 @llvm.spv.dot4add.i8packed(i32 %a, i32 %b, i32 %c)
+ ret i32 %spv.dot
+}
|
@llvm/pr-subscribers-hlsl
Author: Finn Plummer (inbelic)
Changes: Resolves #99220
Full diff: https://github.com/llvm/llvm-project/pull/113623.diff
12 Files Affected:
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 90475a361bb8f8..eb6b07e8858602 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -4792,6 +4792,12 @@ def HLSLDotProduct : LangBuiltin<"HLSL_LANG"> {
let Prototype = "void(...)";
}
+def HLSLDot4AddI8Packed : LangBuiltin<"HLSL_LANG"> {
+ let Spellings = ["__builtin_hlsl_dot4add_i8packed"];
+ let Attributes = [NoThrow, Const];
+ let Prototype = "int(unsigned int, unsigned int, int)";
+}
+
def HLSLFrac : LangBuiltin<"HLSL_LANG"> {
let Spellings = ["__builtin_hlsl_elementwise_frac"];
let Attributes = [NoThrow, Const];
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 28f28c70b5ae52..13ed0f99da9815 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18722,7 +18722,17 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
/*ReturnType=*/T0->getScalarType(),
getDotProductIntrinsic(CGM.getHLSLRuntime(), VecTy0->getElementType()),
ArrayRef<Value *>{Op0, Op1}, nullptr, "hlsl.dot");
- } break;
+ }
+ case Builtin::BI__builtin_hlsl_dot4add_i8packed: {
+ Value *A = EmitScalarExpr(E->getArg(0));
+ Value *B = EmitScalarExpr(E->getArg(1));
+ Value *C = EmitScalarExpr(E->getArg(2));
+
+ Intrinsic::ID ID = CGM.getHLSLRuntime().getDot4AddI8PackedIntrinsic();
+ return Builder.CreateIntrinsic(
+ /*ReturnType=*/C->getType(), ID, ArrayRef<Value *>{A, B, C}, nullptr,
+ "hlsl.dot4add.i8packed");
+ }
case Builtin::BI__builtin_hlsl_lerp: {
Value *X = EmitScalarExpr(E->getArg(0));
Value *Y = EmitScalarExpr(E->getArg(1));
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h
index ff7df41b5c62e7..8b1141375106cc 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.h
+++ b/clang/lib/CodeGen/CGHLSLRuntime.h
@@ -89,6 +89,7 @@ class CGHLSLRuntime {
GENERATE_HLSL_INTRINSIC_FUNCTION(FDot, fdot)
GENERATE_HLSL_INTRINSIC_FUNCTION(SDot, sdot)
GENERATE_HLSL_INTRINSIC_FUNCTION(UDot, udot)
+ GENERATE_HLSL_INTRINSIC_FUNCTION(Dot4AddI8Packed, dot4add_i8packed)
GENERATE_HLSL_INTRINSIC_FUNCTION(WaveIsFirstLane, wave_is_first_lane)
GENERATE_HLSL_INTRINSIC_FUNCTION(WaveReadLaneAt, wave_readlane)
diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
index 30dce60b3ff702..d10bfcbeed97ea 100644
--- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h
+++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
@@ -894,6 +894,16 @@ uint64_t dot(uint64_t3, uint64_t3);
_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot)
uint64_t dot(uint64_t4, uint64_t4);
+//===----------------------------------------------------------------------===//
+// dot4add builtins
+//===----------------------------------------------------------------------===//
+
+/// \fn int dot4add_i8packed(uint A, uint B, int C)
+
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.4)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot4add_i8packed)
+int dot4add_i8packed(unsigned int, unsigned int, int);
+
//===----------------------------------------------------------------------===//
// exp builtins
//===----------------------------------------------------------------------===//
diff --git a/clang/test/CodeGenHLSL/builtins/dot4add_i8packed.hlsl b/clang/test/CodeGenHLSL/builtins/dot4add_i8packed.hlsl
new file mode 100644
index 00000000000000..ea1a33d6267d2f
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/dot4add_i8packed.hlsl
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \
+// RUN: dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \
+// RUN: FileCheck %s -DTARGET=dx
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \
+// RUN: spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \
+// RUN: FileCheck %s -DTARGET=spv
+
+// Test basic lowering to runtime function call.
+
+// CHECK-LABEL: test
+int test(uint a, uint b, int c) {
+ // CHECK: %[[RET:.*]] = call [[TY:i32]] @llvm.[[TARGET]].dot4add.i8packed([[TY]] %[[#]], [[TY]] %[[#]], [[TY]] %[[#]])
+ // CHECK: ret [[TY]] %[[RET]]
+ return dot4add_i8packed(a, b, c);
+}
+
+// CHECK: declare [[TY]] @llvm.[[TARGET]].dot4add.i8packed([[TY]], [[TY]], [[TY]])
diff --git a/clang/test/SemaHLSL/BuiltIns/dot4add_i8packed-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/dot4add_i8packed-errors.hlsl
new file mode 100644
index 00000000000000..ac0b430bfaf945
--- /dev/null
+++ b/clang/test/SemaHLSL/BuiltIns/dot4add_i8packed-errors.hlsl
@@ -0,0 +1,28 @@
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -emit-llvm-only -disable-llvm-passes -verify
+
+int test_too_few_arg0() {
+ return __builtin_hlsl_dot4add_i8packed();
+ // expected-error@-1 {{too few arguments to function call, expected 3, have 0}}
+}
+
+int test_too_few_arg1(int p0) {
+ return __builtin_hlsl_dot4add_i8packed(p0);
+ // expected-error@-1 {{too few arguments to function call, expected 3, have 1}}
+}
+
+int test_too_few_arg2(int p0) {
+ return __builtin_hlsl_dot4add_i8packed(p0, p0);
+ // expected-error@-1 {{too few arguments to function call, expected 3, have 2}}
+}
+
+int test_too_many_arg(int p0) {
+ return __builtin_hlsl_dot4add_i8packed(p0, p0, p0, p0);
+ // expected-error@-1 {{too many arguments to function call, expected 3, have 4}}
+}
+
+struct S { float f; };
+
+int test_expr_struct_type_check(S p0, int p1) {
+ return __builtin_hlsl_dot4add_i8packed(p0, p1, p1);
+ // expected-error@-1 {{no viable conversion from 'S' to 'unsigned int'}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td
index e30d37f69f781e..8cd5ff9006c1b7 100644
--- a/llvm/include/llvm/IR/IntrinsicsDirectX.td
+++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td
@@ -69,6 +69,7 @@ def int_dx_udot :
DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
[llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>],
[IntrNoMem, Commutative] >;
+ def int_dx_dot4add_i8packed : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_dx_frac : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
def int_dx_degrees : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>;
diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
index 6df2eb156a0774..ebea18cd932617 100644
--- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td
+++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
@@ -83,6 +83,7 @@ let TargetPrefix = "spv" in {
DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
[llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>],
[IntrNoMem, Commutative] >;
+ def int_spv_dot4add_i8packed : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_spv_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrConvergent]>;
def int_spv_wave_readlane : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrConvergent, IntrNoMem]>;
def int_spv_sign : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_any_ty], [IntrNoMem]>;
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index 147b32b1ca9903..f0f40de5009f52 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -779,6 +779,16 @@ def FlattenedThreadIdInGroup : DXILOp<96, flattenedThreadIdInGroup> {
let attributes = [Attributes<DXIL1_0, [ReadNone]>];
}
+def Dot4AddI8Packed : DXILOp<163, dot4AddPacked> {
+ let Doc = "signed dot product of 4 x i8 vectors packed into i32, with "
+ "accumulate to i32";
+ let LLVMIntrinsic = int_dx_dot4add_i8packed;
+ let arguments = [Int32Ty, Int32Ty, Int32Ty];
+ let result = Int32Ty;
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+}
+
def AnnotateHandle : DXILOp<217, annotateHandle> {
let Doc = "annotate handle with resource properties";
let arguments = [HandleTy, ResPropsTy];
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index d9377fe4b91a1a..d297b2fa07209f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -164,6 +164,10 @@ class SPIRVInstructionSelector : public InstructionSelector {
bool selectIntegerDot(Register ResVReg, const SPIRVType *ResType,
MachineInstr &I) const;
+ template <bool Signed>
+ bool selectDot4AddPacked(Register ResVReg, const SPIRVType *ResType,
+ MachineInstr &I) const;
+
void renderImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
int OpIdx) const;
void renderFImm64(MachineInstrBuilder &MIB, const MachineInstr &I,
@@ -1694,6 +1698,84 @@ bool SPIRVInstructionSelector::selectIntegerDot(Register ResVReg,
return Result;
}
+// Since pre-1.6 SPIRV has no DotProductInput4x8BitPacked implementation,
+// extract the elements of the packed inputs, multiply them and add the result
+// to the accumulator.
+template <bool Signed>
+bool SPIRVInstructionSelector::selectDot4AddPacked(Register ResVReg,
+ const SPIRVType *ResType,
+ MachineInstr &I) const {
+ assert(I.getNumOperands() == 5);
+ assert(I.getOperand(2).isReg());
+ assert(I.getOperand(3).isReg());
+ assert(I.getOperand(4).isReg());
+ MachineBasicBlock &BB = *I.getParent();
+
+ bool Result = false;
+
+ // Acc = C
+ Register Acc = I.getOperand(4).getReg();
+ SPIRVType *EltType = GR.getOrCreateSPIRVIntegerType(8, I, TII);
+ auto ExtractOp =
+ Signed ? SPIRV::OpBitFieldSExtract : SPIRV::OpBitFieldUExtract;
+
+ // Extract the i8 element, multiply and add it to the accumulator
+ for (unsigned i = 0; i < 4; i++) {
+ // A[i]
+ Register AElt = MRI->createVirtualRegister(&SPIRV::IDRegClass);
+ Result |= BuildMI(BB, I, I.getDebugLoc(), TII.get(ExtractOp))
+ .addDef(AElt)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(I.getOperand(2).getReg())
+ .addUse(GR.getOrCreateConstInt(i * 8, I, EltType, TII))
+ .addImm(8)
+ .constrainAllUses(TII, TRI, RBI);
+
+ // B[i]
+ Register BElt = MRI->createVirtualRegister(&SPIRV::IDRegClass);
+ Result |= BuildMI(BB, I, I.getDebugLoc(), TII.get(ExtractOp))
+ .addDef(BElt)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(I.getOperand(3).getReg())
+ .addUse(GR.getOrCreateConstInt(i * 8, I, EltType, TII))
+ .addImm(8)
+ .constrainAllUses(TII, TRI, RBI);
+
+ // A[i] * B[i]
+ Register Mul = MRI->createVirtualRegister(&SPIRV::IDRegClass);
+ Result |= BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpIMulS))
+ .addDef(Mul)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(AElt)
+ .addUse(BElt)
+ .constrainAllUses(TII, TRI, RBI);
+
+ // Discard 24 highest-bits so that stored i32 register is i8 equivalent
+ Register MaskMul = MRI->createVirtualRegister(&SPIRV::IDRegClass);
+ Result |= BuildMI(BB, I, I.getDebugLoc(), TII.get(ExtractOp))
+ .addDef(MaskMul)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(Mul)
+ .addUse(GR.getOrCreateConstInt(0, I, EltType, TII))
+ .addImm(8)
+ .constrainAllUses(TII, TRI, RBI);
+
+ // Acc = Acc + A[i] * B[i]
+ Register Sum =
+ i < 3 ? MRI->createVirtualRegister(&SPIRV::IDRegClass) : ResVReg;
+ Result |= BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpIAddS))
+ .addDef(Sum)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(Acc)
+ .addUse(MaskMul)
+ .constrainAllUses(TII, TRI, RBI);
+
+ Acc = Sum;
+ }
+
+ return Result;
+}
+
/// Transform saturate(x) to clamp(x, 0.0f, 1.0f) as SPIRV
/// does not have a saturate builtin.
bool SPIRVInstructionSelector::selectSaturate(Register ResVReg,
@@ -2527,6 +2609,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
case Intrinsic::spv_udot:
case Intrinsic::spv_sdot:
return selectIntegerDot(ResVReg, ResType, I);
+ case Intrinsic::spv_dot4add_i8packed:
+ return selectDot4AddPacked<true>(ResVReg, ResType, I);
case Intrinsic::spv_all:
return selectAll(ResVReg, ResType, I);
case Intrinsic::spv_any:
diff --git a/llvm/test/CodeGen/DirectX/dot4add_i8packed.ll b/llvm/test/CodeGen/DirectX/dot4add_i8packed.ll
new file mode 100644
index 00000000000000..7df0520505cea6
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/dot4add_i8packed.ll
@@ -0,0 +1,10 @@
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-compute %s | FileCheck %s
+
+define void @main(i32 %a, i32 %b, i32 %c) {
+entry:
+; CHECK: call i32 @dx.op.dot4AddPacked(i32 163, i32 %a, i32 %b, i32 %c)
+ %0 = call i32 @llvm.dx.dot4add.i8packed(i32 %a, i32 %b, i32 %c)
+ ret void
+}
+
+declare i32 @llvm.dx.dot4add.i8packed(i32, i32, i32)
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/dot4add_i8packed.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/dot4add_i8packed.ll
new file mode 100644
index 00000000000000..35e2a731071103
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/dot4add_i8packed.ll
@@ -0,0 +1,48 @@
+; RUN: llc -O0 -mtriple=spirv32v1.3-vulkan-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32v1.3-vulkan-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: %[[#int_32:]] = OpTypeInt 32 0
+; CHECK-DAG: %[[#int_8:]] = OpTypeInt 8 0
+; CHECK-DAG: %[[#zero:]] = OpConstantNull %[[#int_8]]
+; CHECK-DAG: %[[#eight:]] = OpConstant %[[#int_8]] 8
+; CHECK-DAG: %[[#sixteen:]] = OpConstant %[[#int_8]] 16
+; CHECK-DAG: %[[#twentyfour:]] = OpConstant %[[#int_8]] 24
+; CHECK-LABEL: Begin function test_dot
+define noundef i32 @test_dot(i32 noundef %a, i32 noundef %b, i32 noundef %c) {
+entry:
+; CHECK: %[[#A:]] = OpFunctionParameter %[[#int_32]]
+; CHECK: %[[#B:]] = OpFunctionParameter %[[#int_32]]
+; CHECK: %[[#C:]] = OpFunctionParameter %[[#int_32]]
+
+; First element of the packed vector
+; CHECK: %[[#A0:]] = OpBitFieldSExtract %[[#int_32]] %[[#A]] %[[#zero]] 8
+; CHECK: %[[#B0:]] = OpBitFieldSExtract %[[#int_32]] %[[#B]] %[[#zero]] 8
+; CHECK: %[[#MUL0:]] = OpIMul %[[#int_32]] %[[#A0]] %[[#B0]]
+; CHECK: %[[#MASK0:]] = OpBitFieldSExtract %[[#int_32]] %[[#MUL0]] %[[#zero]] 8
+; CHECK: %[[#ACC0:]] = OpIAdd %[[#int_32]] %[[#C]] %[[#MASK0]]
+
+; Second element of the packed vector
+; CHECK: %[[#A1:]] = OpBitFieldSExtract %[[#int_32]] %[[#A]] %[[#eight]] 8
+; CHECK: %[[#B1:]] = OpBitFieldSExtract %[[#int_32]] %[[#B]] %[[#eight]] 8
+; CHECK: %[[#MUL1:]] = OpIMul %[[#int_32]] %[[#A1]] %[[#B1]]
+; CHECK: %[[#MASK1:]] = OpBitFieldSExtract %[[#int_32]] %[[#MUL1]] %[[#zero]] 8
+; CHECK: %[[#ACC1:]] = OpIAdd %[[#int_32]] %[[#ACC0]] %[[#MASK1]]
+
+; Third element of the packed vector
+; CHECK: %[[#A2:]] = OpBitFieldSExtract %[[#int_32]] %[[#A]] %[[#sixteen]] 8
+; CHECK: %[[#B2:]] = OpBitFieldSExtract %[[#int_32]] %[[#B]] %[[#sixteen]] 8
+; CHECK: %[[#MUL2:]] = OpIMul %[[#int_32]] %[[#A2]] %[[#B2]]
+; CHECK: %[[#MASK2:]] = OpBitFieldSExtract %[[#int_32]] %[[#MUL2]] %[[#zero]] 8
+; CHECK: %[[#ACC2:]] = OpIAdd %[[#int_32]] %[[#ACC1]] %[[#MASK2]]
+
+; Fourth element of the packed vector
+; CHECK: %[[#A3:]] = OpBitFieldSExtract %[[#int_32]] %[[#A]] %[[#twentyfour]] 8
+; CHECK: %[[#B3:]] = OpBitFieldSExtract %[[#int_32]] %[[#B]] %[[#twentyfour]] 8
+; CHECK: %[[#MUL3:]] = OpIMul %[[#int_32]] %[[#A3]] %[[#B3]]
+; CHECK: %[[#MASK3:]] = OpBitFieldSExtract %[[#int_32]] %[[#MUL3]] %[[#zero]] 8
+; CHECK: %[[#ACC3:]] = OpIAdd %[[#int_32]] %[[#ACC2]] %[[#MASK3]]
+
+; CHECK: OpReturnValue %[[#ACC3]]
+ %spv.dot = call i32 @llvm.spv.dot4add.i8packed(i32 %a, i32 %b, i32 %c)
+ ret i32 %spv.dot
+}
|
@llvm/pr-subscribers-clang-codegen
Author: Finn Plummer (inbelic)
Changes: Resolves #99220
Full diff: https://github.com/llvm/llvm-project/pull/113623.diff
12 Files Affected:
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 90475a361bb8f8..eb6b07e8858602 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -4792,6 +4792,12 @@ def HLSLDotProduct : LangBuiltin<"HLSL_LANG"> {
let Prototype = "void(...)";
}
+def HLSLDot4AddI8Packed : LangBuiltin<"HLSL_LANG"> {
+ let Spellings = ["__builtin_hlsl_dot4add_i8packed"];
+ let Attributes = [NoThrow, Const];
+ let Prototype = "int(unsigned int, unsigned int, int)";
+}
+
def HLSLFrac : LangBuiltin<"HLSL_LANG"> {
let Spellings = ["__builtin_hlsl_elementwise_frac"];
let Attributes = [NoThrow, Const];
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 28f28c70b5ae52..13ed0f99da9815 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18722,7 +18722,17 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
/*ReturnType=*/T0->getScalarType(),
getDotProductIntrinsic(CGM.getHLSLRuntime(), VecTy0->getElementType()),
ArrayRef<Value *>{Op0, Op1}, nullptr, "hlsl.dot");
- } break;
+ }
+ case Builtin::BI__builtin_hlsl_dot4add_i8packed: {
+ Value *A = EmitScalarExpr(E->getArg(0));
+ Value *B = EmitScalarExpr(E->getArg(1));
+ Value *C = EmitScalarExpr(E->getArg(2));
+
+ Intrinsic::ID ID = CGM.getHLSLRuntime().getDot4AddI8PackedIntrinsic();
+ return Builder.CreateIntrinsic(
+ /*ReturnType=*/C->getType(), ID, ArrayRef<Value *>{A, B, C}, nullptr,
+ "hlsl.dot4add.i8packed");
+ }
case Builtin::BI__builtin_hlsl_lerp: {
Value *X = EmitScalarExpr(E->getArg(0));
Value *Y = EmitScalarExpr(E->getArg(1));
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h
index ff7df41b5c62e7..8b1141375106cc 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.h
+++ b/clang/lib/CodeGen/CGHLSLRuntime.h
@@ -89,6 +89,7 @@ class CGHLSLRuntime {
GENERATE_HLSL_INTRINSIC_FUNCTION(FDot, fdot)
GENERATE_HLSL_INTRINSIC_FUNCTION(SDot, sdot)
GENERATE_HLSL_INTRINSIC_FUNCTION(UDot, udot)
+ GENERATE_HLSL_INTRINSIC_FUNCTION(Dot4AddI8Packed, dot4add_i8packed)
GENERATE_HLSL_INTRINSIC_FUNCTION(WaveIsFirstLane, wave_is_first_lane)
GENERATE_HLSL_INTRINSIC_FUNCTION(WaveReadLaneAt, wave_readlane)
diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
index 30dce60b3ff702..d10bfcbeed97ea 100644
--- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h
+++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
@@ -894,6 +894,16 @@ uint64_t dot(uint64_t3, uint64_t3);
_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot)
uint64_t dot(uint64_t4, uint64_t4);
+//===----------------------------------------------------------------------===//
+// dot4add builtins
+//===----------------------------------------------------------------------===//
+
+/// \fn int dot4add_i8packed(uint A, uint B, int C)
+
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.4)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot4add_i8packed)
+int dot4add_i8packed(unsigned int, unsigned int, int);
+
//===----------------------------------------------------------------------===//
// exp builtins
//===----------------------------------------------------------------------===//
diff --git a/clang/test/CodeGenHLSL/builtins/dot4add_i8packed.hlsl b/clang/test/CodeGenHLSL/builtins/dot4add_i8packed.hlsl
new file mode 100644
index 00000000000000..ea1a33d6267d2f
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/dot4add_i8packed.hlsl
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \
+// RUN: dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \
+// RUN: FileCheck %s -DTARGET=dx
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \
+// RUN: spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \
+// RUN: FileCheck %s -DTARGET=spv
+
+// Test basic lowering to runtime function call.
+
+// CHECK-LABEL: test
+int test(uint a, uint b, int c) {
+ // CHECK: %[[RET:.*]] = call [[TY:i32]] @llvm.[[TARGET]].dot4add.i8packed([[TY]] %[[#]], [[TY]] %[[#]], [[TY]] %[[#]])
+ // CHECK: ret [[TY]] %[[RET]]
+ return dot4add_i8packed(a, b, c);
+}
+
+// CHECK: declare [[TY]] @llvm.[[TARGET]].dot4add.i8packed([[TY]], [[TY]], [[TY]])
diff --git a/clang/test/SemaHLSL/BuiltIns/dot4add_i8packed-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/dot4add_i8packed-errors.hlsl
new file mode 100644
index 00000000000000..ac0b430bfaf945
--- /dev/null
+++ b/clang/test/SemaHLSL/BuiltIns/dot4add_i8packed-errors.hlsl
@@ -0,0 +1,28 @@
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -emit-llvm-only -disable-llvm-passes -verify
+
+int test_too_few_arg0() {
+ return __builtin_hlsl_dot4add_i8packed();
+ // expected-error@-1 {{too few arguments to function call, expected 3, have 0}}
+}
+
+int test_too_few_arg1(int p0) {
+ return __builtin_hlsl_dot4add_i8packed(p0);
+ // expected-error@-1 {{too few arguments to function call, expected 3, have 1}}
+}
+
+int test_too_few_arg2(int p0) {
+ return __builtin_hlsl_dot4add_i8packed(p0, p0);
+ // expected-error@-1 {{too few arguments to function call, expected 3, have 2}}
+}
+
+int test_too_many_arg(int p0) {
+ return __builtin_hlsl_dot4add_i8packed(p0, p0, p0, p0);
+ // expected-error@-1 {{too many arguments to function call, expected 3, have 4}}
+}
+
+struct S { float f; };
+
+int test_expr_struct_type_check(S p0, int p1) {
+ return __builtin_hlsl_dot4add_i8packed(p0, p1, p1);
+ // expected-error@-1 {{no viable conversion from 'S' to 'unsigned int'}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td
index e30d37f69f781e..8cd5ff9006c1b7 100644
--- a/llvm/include/llvm/IR/IntrinsicsDirectX.td
+++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td
@@ -69,6 +69,7 @@ def int_dx_udot :
DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
[llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>],
[IntrNoMem, Commutative] >;
+ def int_dx_dot4add_i8packed : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_dx_frac : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
def int_dx_degrees : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>;
diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
index 6df2eb156a0774..ebea18cd932617 100644
--- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td
+++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
@@ -83,6 +83,7 @@ let TargetPrefix = "spv" in {
DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
[llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>],
[IntrNoMem, Commutative] >;
+ def int_spv_dot4add_i8packed : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_spv_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrConvergent]>;
def int_spv_wave_readlane : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrConvergent, IntrNoMem]>;
def int_spv_sign : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_any_ty], [IntrNoMem]>;
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index 147b32b1ca9903..f0f40de5009f52 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -779,6 +779,16 @@ def FlattenedThreadIdInGroup : DXILOp<96, flattenedThreadIdInGroup> {
let attributes = [Attributes<DXIL1_0, [ReadNone]>];
}
+def Dot4AddI8Packed : DXILOp<163, dot4AddPacked> {
+ let Doc = "signed dot product of 4 x i8 vectors packed into i32, with "
+ "accumulate to i32";
+ let LLVMIntrinsic = int_dx_dot4add_i8packed;
+ let arguments = [Int32Ty, Int32Ty, Int32Ty];
+ let result = Int32Ty;
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+}
+
def AnnotateHandle : DXILOp<217, annotateHandle> {
let Doc = "annotate handle with resource properties";
let arguments = [HandleTy, ResPropsTy];
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index d9377fe4b91a1a..d297b2fa07209f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -164,6 +164,10 @@ class SPIRVInstructionSelector : public InstructionSelector {
bool selectIntegerDot(Register ResVReg, const SPIRVType *ResType,
MachineInstr &I) const;
+ template <bool Signed>
+ bool selectDot4AddPacked(Register ResVReg, const SPIRVType *ResType,
+ MachineInstr &I) const;
+
void renderImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
int OpIdx) const;
void renderFImm64(MachineInstrBuilder &MIB, const MachineInstr &I,
@@ -1694,6 +1698,84 @@ bool SPIRVInstructionSelector::selectIntegerDot(Register ResVReg,
return Result;
}
+// Since pre-1.6 SPIRV has no DotProductInput4x8BitPacked implementation,
+// extract the elements of the packed inputs, multiply them and add the result
+// to the accumulator.
+template <bool Signed>
+bool SPIRVInstructionSelector::selectDot4AddPacked(Register ResVReg,
+ const SPIRVType *ResType,
+ MachineInstr &I) const {
+ assert(I.getNumOperands() == 5);
+ assert(I.getOperand(2).isReg());
+ assert(I.getOperand(3).isReg());
+ assert(I.getOperand(4).isReg());
+ MachineBasicBlock &BB = *I.getParent();
+
+ bool Result = false;
+
+ // Acc = C
+ Register Acc = I.getOperand(4).getReg();
+ SPIRVType *EltType = GR.getOrCreateSPIRVIntegerType(8, I, TII);
+ auto ExtractOp =
+ Signed ? SPIRV::OpBitFieldSExtract : SPIRV::OpBitFieldUExtract;
+
+ // Extract the i8 element, multiply and add it to the accumulator
+ for (unsigned i = 0; i < 4; i++) {
+ // A[i]
+ Register AElt = MRI->createVirtualRegister(&SPIRV::IDRegClass);
+ Result |= BuildMI(BB, I, I.getDebugLoc(), TII.get(ExtractOp))
+ .addDef(AElt)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(I.getOperand(2).getReg())
+ .addUse(GR.getOrCreateConstInt(i * 8, I, EltType, TII))
+ .addImm(8)
+ .constrainAllUses(TII, TRI, RBI);
+
+ // B[i]
+ Register BElt = MRI->createVirtualRegister(&SPIRV::IDRegClass);
+ Result |= BuildMI(BB, I, I.getDebugLoc(), TII.get(ExtractOp))
+ .addDef(BElt)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(I.getOperand(3).getReg())
+ .addUse(GR.getOrCreateConstInt(i * 8, I, EltType, TII))
+ .addImm(8)
+ .constrainAllUses(TII, TRI, RBI);
+
+ // A[i] * B[i]
+ Register Mul = MRI->createVirtualRegister(&SPIRV::IDRegClass);
+ Result |= BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpIMulS))
+ .addDef(Mul)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(AElt)
+ .addUse(BElt)
+ .constrainAllUses(TII, TRI, RBI);
+
+ // Discard 24 highest-bits so that stored i32 register is i8 equivalent
+ Register MaskMul = MRI->createVirtualRegister(&SPIRV::IDRegClass);
+ Result |= BuildMI(BB, I, I.getDebugLoc(), TII.get(ExtractOp))
+ .addDef(MaskMul)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(Mul)
+ .addUse(GR.getOrCreateConstInt(0, I, EltType, TII))
+ .addImm(8)
+ .constrainAllUses(TII, TRI, RBI);
+
+ // Acc = Acc + A[i] * B[i]
+ Register Sum =
+ i < 3 ? MRI->createVirtualRegister(&SPIRV::IDRegClass) : ResVReg;
+ Result |= BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpIAddS))
+ .addDef(Sum)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(Acc)
+ .addUse(MaskMul)
+ .constrainAllUses(TII, TRI, RBI);
+
+ Acc = Sum;
+ }
+
+ return Result;
+}
+
/// Transform saturate(x) to clamp(x, 0.0f, 1.0f) as SPIRV
/// does not have a saturate builtin.
bool SPIRVInstructionSelector::selectSaturate(Register ResVReg,
@@ -2527,6 +2609,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
case Intrinsic::spv_udot:
case Intrinsic::spv_sdot:
return selectIntegerDot(ResVReg, ResType, I);
+ case Intrinsic::spv_dot4add_i8packed:
+ return selectDot4AddPacked<true>(ResVReg, ResType, I);
case Intrinsic::spv_all:
return selectAll(ResVReg, ResType, I);
case Intrinsic::spv_any:
diff --git a/llvm/test/CodeGen/DirectX/dot4add_i8packed.ll b/llvm/test/CodeGen/DirectX/dot4add_i8packed.ll
new file mode 100644
index 00000000000000..7df0520505cea6
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/dot4add_i8packed.ll
@@ -0,0 +1,10 @@
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-compute %s | FileCheck %s
+
+define void @main(i32 %a, i32 %b, i32 %c) {
+entry:
+; CHECK: call i32 @dx.op.dot4AddPacked(i32 163, i32 %a, i32 %b, i32 %c)
+ %0 = call i32 @llvm.dx.dot4add.i8packed(i32 %a, i32 %b, i32 %c)
+ ret void
+}
+
+declare i32 @llvm.dx.dot4add.i8packed(i32, i32, i32)
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/dot4add_i8packed.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/dot4add_i8packed.ll
new file mode 100644
index 00000000000000..35e2a731071103
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/dot4add_i8packed.ll
@@ -0,0 +1,48 @@
+; RUN: llc -O0 -mtriple=spirv32v1.3-vulkan-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32v1.3-vulkan-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: %[[#int_32:]] = OpTypeInt 32 0
+; CHECK-DAG: %[[#int_8:]] = OpTypeInt 8 0
+; CHECK-DAG: %[[#zero:]] = OpConstantNull %[[#int_8]]
+; CHECK-DAG: %[[#eight:]] = OpConstant %[[#int_8]] 8
+; CHECK-DAG: %[[#sixteen:]] = OpConstant %[[#int_8]] 16
+; CHECK-DAG: %[[#twentyfour:]] = OpConstant %[[#int_8]] 24
+; CHECK-LABEL: Begin function test_dot
+define noundef i32 @test_dot(i32 noundef %a, i32 noundef %b, i32 noundef %c) {
+entry:
+; CHECK: %[[#A:]] = OpFunctionParameter %[[#int_32]]
+; CHECK: %[[#B:]] = OpFunctionParameter %[[#int_32]]
+; CHECK: %[[#C:]] = OpFunctionParameter %[[#int_32]]
+
+; First element of the packed vector
+; CHECK: %[[#A0:]] = OpBitFieldSExtract %[[#int_32]] %[[#A]] %[[#zero]] 8
+; CHECK: %[[#B0:]] = OpBitFieldSExtract %[[#int_32]] %[[#B]] %[[#zero]] 8
+; CHECK: %[[#MUL0:]] = OpIMul %[[#int_32]] %[[#A0]] %[[#B0]]
+; CHECK: %[[#MASK0:]] = OpBitFieldSExtract %[[#int_32]] %[[#MUL0]] %[[#zero]] 8
+; CHECK: %[[#ACC0:]] = OpIAdd %[[#int_32]] %[[#C]] %[[#MASK0]]
+
+; Second element of the packed vector
+; CHECK: %[[#A1:]] = OpBitFieldSExtract %[[#int_32]] %[[#A]] %[[#eight]] 8
+; CHECK: %[[#B1:]] = OpBitFieldSExtract %[[#int_32]] %[[#B]] %[[#eight]] 8
+; CHECK: %[[#MUL1:]] = OpIMul %[[#int_32]] %[[#A1]] %[[#B1]]
+; CHECK: %[[#MASK1:]] = OpBitFieldSExtract %[[#int_32]] %[[#MUL1]] %[[#zero]] 8
+; CHECK: %[[#ACC1:]] = OpIAdd %[[#int_32]] %[[#ACC0]] %[[#MASK1]]
+
+; Third element of the packed vector
+; CHECK: %[[#A2:]] = OpBitFieldSExtract %[[#int_32]] %[[#A]] %[[#sixteen]] 8
+; CHECK: %[[#B2:]] = OpBitFieldSExtract %[[#int_32]] %[[#B]] %[[#sixteen]] 8
+; CHECK: %[[#MUL2:]] = OpIMul %[[#int_32]] %[[#A2]] %[[#B2]]
+; CHECK: %[[#MASK2:]] = OpBitFieldSExtract %[[#int_32]] %[[#MUL2]] %[[#zero]] 8
+; CHECK: %[[#ACC2:]] = OpIAdd %[[#int_32]] %[[#ACC1]] %[[#MASK2]]
+
+; Fourth element of the packed vector
+; CHECK: %[[#A3:]] = OpBitFieldSExtract %[[#int_32]] %[[#A]] %[[#twentyfour]] 8
+; CHECK: %[[#B3:]] = OpBitFieldSExtract %[[#int_32]] %[[#B]] %[[#twentyfour]] 8
+; CHECK: %[[#MUL3:]] = OpIMul %[[#int_32]] %[[#A3]] %[[#B3]]
+; CHECK: %[[#MASK3:]] = OpBitFieldSExtract %[[#int_32]] %[[#MUL3]] %[[#zero]] 8
+; CHECK: %[[#ACC3:]] = OpIAdd %[[#int_32]] %[[#ACC2]] %[[#MASK3]]
+
+; CHECK: OpReturnValue %[[#ACC3]]
+ %spv.dot = call i32 @llvm.spv.dot4add.i8packed(i32 %a, i32 %b, i32 %c)
+ ret i32 %spv.dot
+}
|
@llvm/pr-subscribers-backend-spir-v Author: Finn Plummer (inbelic) Changes
Resolves #99220 Full diff: https://github.com/llvm/llvm-project/pull/113623.diff 12 Files Affected:
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 90475a361bb8f8..eb6b07e8858602 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -4792,6 +4792,12 @@ def HLSLDotProduct : LangBuiltin<"HLSL_LANG"> {
let Prototype = "void(...)";
}
+def HLSLDot4AddI8Packed : LangBuiltin<"HLSL_LANG"> {
+ let Spellings = ["__builtin_hlsl_dot4add_i8packed"];
+ let Attributes = [NoThrow, Const];
+ let Prototype = "int(unsigned int, unsigned int, int)";
+}
+
def HLSLFrac : LangBuiltin<"HLSL_LANG"> {
let Spellings = ["__builtin_hlsl_elementwise_frac"];
let Attributes = [NoThrow, Const];
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 28f28c70b5ae52..13ed0f99da9815 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18722,7 +18722,17 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
/*ReturnType=*/T0->getScalarType(),
getDotProductIntrinsic(CGM.getHLSLRuntime(), VecTy0->getElementType()),
ArrayRef<Value *>{Op0, Op1}, nullptr, "hlsl.dot");
- } break;
+ }
+ case Builtin::BI__builtin_hlsl_dot4add_i8packed: {
+ Value *A = EmitScalarExpr(E->getArg(0));
+ Value *B = EmitScalarExpr(E->getArg(1));
+ Value *C = EmitScalarExpr(E->getArg(2));
+
+ Intrinsic::ID ID = CGM.getHLSLRuntime().getDot4AddI8PackedIntrinsic();
+ return Builder.CreateIntrinsic(
+ /*ReturnType=*/C->getType(), ID, ArrayRef<Value *>{A, B, C}, nullptr,
+ "hlsl.dot4add.i8packed");
+ }
case Builtin::BI__builtin_hlsl_lerp: {
Value *X = EmitScalarExpr(E->getArg(0));
Value *Y = EmitScalarExpr(E->getArg(1));
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h
index ff7df41b5c62e7..8b1141375106cc 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.h
+++ b/clang/lib/CodeGen/CGHLSLRuntime.h
@@ -89,6 +89,7 @@ class CGHLSLRuntime {
GENERATE_HLSL_INTRINSIC_FUNCTION(FDot, fdot)
GENERATE_HLSL_INTRINSIC_FUNCTION(SDot, sdot)
GENERATE_HLSL_INTRINSIC_FUNCTION(UDot, udot)
+ GENERATE_HLSL_INTRINSIC_FUNCTION(Dot4AddI8Packed, dot4add_i8packed)
GENERATE_HLSL_INTRINSIC_FUNCTION(WaveIsFirstLane, wave_is_first_lane)
GENERATE_HLSL_INTRINSIC_FUNCTION(WaveReadLaneAt, wave_readlane)
diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
index 30dce60b3ff702..d10bfcbeed97ea 100644
--- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h
+++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
@@ -894,6 +894,16 @@ uint64_t dot(uint64_t3, uint64_t3);
_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot)
uint64_t dot(uint64_t4, uint64_t4);
+//===----------------------------------------------------------------------===//
+// dot4add builtins
+//===----------------------------------------------------------------------===//
+
+/// \fn int dot4add_i8packed(uint A, uint B, int C)
+
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.4)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot4add_i8packed)
+int dot4add_i8packed(unsigned int, unsigned int, int);
+
//===----------------------------------------------------------------------===//
// exp builtins
//===----------------------------------------------------------------------===//
diff --git a/clang/test/CodeGenHLSL/builtins/dot4add_i8packed.hlsl b/clang/test/CodeGenHLSL/builtins/dot4add_i8packed.hlsl
new file mode 100644
index 00000000000000..ea1a33d6267d2f
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/dot4add_i8packed.hlsl
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \
+// RUN: dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \
+// RUN: FileCheck %s -DTARGET=dx
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \
+// RUN: spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \
+// RUN: FileCheck %s -DTARGET=spv
+
+// Test basic lowering to runtime function call.
+
+// CHECK-LABEL: test
+int test(uint a, uint b, int c) {
+ // CHECK: %[[RET:.*]] = call [[TY:i32]] @llvm.[[TARGET]].dot4add.i8packed([[TY]] %[[#]], [[TY]] %[[#]], [[TY]] %[[#]])
+ // CHECK: ret [[TY]] %[[RET]]
+ return dot4add_i8packed(a, b, c);
+}
+
+// CHECK: declare [[TY]] @llvm.[[TARGET]].dot4add.i8packed([[TY]], [[TY]], [[TY]])
diff --git a/clang/test/SemaHLSL/BuiltIns/dot4add_i8packed-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/dot4add_i8packed-errors.hlsl
new file mode 100644
index 00000000000000..ac0b430bfaf945
--- /dev/null
+++ b/clang/test/SemaHLSL/BuiltIns/dot4add_i8packed-errors.hlsl
@@ -0,0 +1,28 @@
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -emit-llvm-only -disable-llvm-passes -verify
+
+int test_too_few_arg0() {
+ return __builtin_hlsl_dot4add_i8packed();
+ // expected-error@-1 {{too few arguments to function call, expected 3, have 0}}
+}
+
+int test_too_few_arg1(int p0) {
+ return __builtin_hlsl_dot4add_i8packed(p0);
+ // expected-error@-1 {{too few arguments to function call, expected 3, have 1}}
+}
+
+int test_too_few_arg2(int p0) {
+ return __builtin_hlsl_dot4add_i8packed(p0, p0);
+ // expected-error@-1 {{too few arguments to function call, expected 3, have 2}}
+}
+
+int test_too_many_arg(int p0) {
+ return __builtin_hlsl_dot4add_i8packed(p0, p0, p0, p0);
+ // expected-error@-1 {{too many arguments to function call, expected 3, have 4}}
+}
+
+struct S { float f; };
+
+int test_expr_struct_type_check(S p0, int p1) {
+ return __builtin_hlsl_dot4add_i8packed(p0, p1, p1);
+ // expected-error@-1 {{no viable conversion from 'S' to 'unsigned int'}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td
index e30d37f69f781e..8cd5ff9006c1b7 100644
--- a/llvm/include/llvm/IR/IntrinsicsDirectX.td
+++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td
@@ -69,6 +69,7 @@ def int_dx_udot :
DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
[llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>],
[IntrNoMem, Commutative] >;
+ def int_dx_dot4add_i8packed : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_dx_frac : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
def int_dx_degrees : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>;
diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
index 6df2eb156a0774..ebea18cd932617 100644
--- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td
+++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
@@ -83,6 +83,7 @@ let TargetPrefix = "spv" in {
DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
[llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>],
[IntrNoMem, Commutative] >;
+ def int_spv_dot4add_i8packed : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_spv_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrConvergent]>;
def int_spv_wave_readlane : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrConvergent, IntrNoMem]>;
def int_spv_sign : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_any_ty], [IntrNoMem]>;
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index 147b32b1ca9903..f0f40de5009f52 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -779,6 +779,16 @@ def FlattenedThreadIdInGroup : DXILOp<96, flattenedThreadIdInGroup> {
let attributes = [Attributes<DXIL1_0, [ReadNone]>];
}
+def Dot4AddI8Packed : DXILOp<163, dot4AddPacked> {
+ let Doc = "signed dot product of 4 x i8 vectors packed into i32, with "
+ "accumulate to i32";
+ let LLVMIntrinsic = int_dx_dot4add_i8packed;
+ let arguments = [Int32Ty, Int32Ty, Int32Ty];
+ let result = Int32Ty;
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+}
+
def AnnotateHandle : DXILOp<217, annotateHandle> {
let Doc = "annotate handle with resource properties";
let arguments = [HandleTy, ResPropsTy];
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index d9377fe4b91a1a..d297b2fa07209f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -164,6 +164,10 @@ class SPIRVInstructionSelector : public InstructionSelector {
bool selectIntegerDot(Register ResVReg, const SPIRVType *ResType,
MachineInstr &I) const;
+ template <bool Signed>
+ bool selectDot4AddPacked(Register ResVReg, const SPIRVType *ResType,
+ MachineInstr &I) const;
+
void renderImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
int OpIdx) const;
void renderFImm64(MachineInstrBuilder &MIB, const MachineInstr &I,
@@ -1694,6 +1698,84 @@ bool SPIRVInstructionSelector::selectIntegerDot(Register ResVReg,
return Result;
}
+// Since pre-1.6 SPIRV has no DotProductInput4x8BitPacked implementation,
+// extract the elements of the packed inputs, multiply them and add the result
+// to the accumulator.
+template <bool Signed>
+bool SPIRVInstructionSelector::selectDot4AddPacked(Register ResVReg,
+ const SPIRVType *ResType,
+ MachineInstr &I) const {
+ assert(I.getNumOperands() == 5);
+ assert(I.getOperand(2).isReg());
+ assert(I.getOperand(3).isReg());
+ assert(I.getOperand(4).isReg());
+ MachineBasicBlock &BB = *I.getParent();
+
+ bool Result = false;
+
+ // Acc = C
+ Register Acc = I.getOperand(4).getReg();
+ SPIRVType *EltType = GR.getOrCreateSPIRVIntegerType(8, I, TII);
+ auto ExtractOp =
+ Signed ? SPIRV::OpBitFieldSExtract : SPIRV::OpBitFieldUExtract;
+
+ // Extract the i8 element, multiply and add it to the accumulator
+ for (unsigned i = 0; i < 4; i++) {
+ // A[i]
+ Register AElt = MRI->createVirtualRegister(&SPIRV::IDRegClass);
+ Result |= BuildMI(BB, I, I.getDebugLoc(), TII.get(ExtractOp))
+ .addDef(AElt)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(I.getOperand(2).getReg())
+ .addUse(GR.getOrCreateConstInt(i * 8, I, EltType, TII))
+ .addImm(8)
+ .constrainAllUses(TII, TRI, RBI);
+
+ // B[i]
+ Register BElt = MRI->createVirtualRegister(&SPIRV::IDRegClass);
+ Result |= BuildMI(BB, I, I.getDebugLoc(), TII.get(ExtractOp))
+ .addDef(BElt)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(I.getOperand(3).getReg())
+ .addUse(GR.getOrCreateConstInt(i * 8, I, EltType, TII))
+ .addImm(8)
+ .constrainAllUses(TII, TRI, RBI);
+
+ // A[i] * B[i]
+ Register Mul = MRI->createVirtualRegister(&SPIRV::IDRegClass);
+ Result |= BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpIMulS))
+ .addDef(Mul)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(AElt)
+ .addUse(BElt)
+ .constrainAllUses(TII, TRI, RBI);
+
+ // Discard 24 highest-bits so that stored i32 register is i8 equivalent
+ Register MaskMul = MRI->createVirtualRegister(&SPIRV::IDRegClass);
+ Result |= BuildMI(BB, I, I.getDebugLoc(), TII.get(ExtractOp))
+ .addDef(MaskMul)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(Mul)
+ .addUse(GR.getOrCreateConstInt(0, I, EltType, TII))
+ .addImm(8)
+ .constrainAllUses(TII, TRI, RBI);
+
+ // Acc = Acc + A[i] * B[i]
+ Register Sum =
+ i < 3 ? MRI->createVirtualRegister(&SPIRV::IDRegClass) : ResVReg;
+ Result |= BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpIAddS))
+ .addDef(Sum)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(Acc)
+ .addUse(MaskMul)
+ .constrainAllUses(TII, TRI, RBI);
+
+ Acc = Sum;
+ }
+
+ return Result;
+}
+
/// Transform saturate(x) to clamp(x, 0.0f, 1.0f) as SPIRV
/// does not have a saturate builtin.
bool SPIRVInstructionSelector::selectSaturate(Register ResVReg,
@@ -2527,6 +2609,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
case Intrinsic::spv_udot:
case Intrinsic::spv_sdot:
return selectIntegerDot(ResVReg, ResType, I);
+ case Intrinsic::spv_dot4add_i8packed:
+ return selectDot4AddPacked<true>(ResVReg, ResType, I);
case Intrinsic::spv_all:
return selectAll(ResVReg, ResType, I);
case Intrinsic::spv_any:
diff --git a/llvm/test/CodeGen/DirectX/dot4add_i8packed.ll b/llvm/test/CodeGen/DirectX/dot4add_i8packed.ll
new file mode 100644
index 00000000000000..7df0520505cea6
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/dot4add_i8packed.ll
@@ -0,0 +1,10 @@
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-compute %s | FileCheck %s
+
+define void @main(i32 %a, i32 %b, i32 %c) {
+entry:
+; CHECK: call i32 @dx.op.dot4AddPacked(i32 163, i32 %a, i32 %b, i32 %c)
+ %0 = call i32 @llvm.dx.dot4add.i8packed(i32 %a, i32 %b, i32 %c)
+ ret void
+}
+
+declare i32 @llvm.dx.dot4add.i8packed(i32, i32, i32)
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/dot4add_i8packed.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/dot4add_i8packed.ll
new file mode 100644
index 00000000000000..35e2a731071103
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/dot4add_i8packed.ll
@@ -0,0 +1,48 @@
+; RUN: llc -O0 -mtriple=spirv32v1.3-vulkan-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32v1.3-vulkan-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: %[[#int_32:]] = OpTypeInt 32 0
+; CHECK-DAG: %[[#int_8:]] = OpTypeInt 8 0
+; CHECK-DAG: %[[#zero:]] = OpConstantNull %[[#int_8]]
+; CHECK-DAG: %[[#eight:]] = OpConstant %[[#int_8]] 8
+; CHECK-DAG: %[[#sixteen:]] = OpConstant %[[#int_8]] 16
+; CHECK-DAG: %[[#twentyfour:]] = OpConstant %[[#int_8]] 24
+; CHECK-LABEL: Begin function test_dot
+define noundef i32 @test_dot(i32 noundef %a, i32 noundef %b, i32 noundef %c) {
+entry:
+; CHECK: %[[#A:]] = OpFunctionParameter %[[#int_32]]
+; CHECK: %[[#B:]] = OpFunctionParameter %[[#int_32]]
+; CHECK: %[[#C:]] = OpFunctionParameter %[[#int_32]]
+
+; First element of the packed vector
+; CHECK: %[[#A0:]] = OpBitFieldSExtract %[[#int_32]] %[[#A]] %[[#zero]] 8
+; CHECK: %[[#B0:]] = OpBitFieldSExtract %[[#int_32]] %[[#B]] %[[#zero]] 8
+; CHECK: %[[#MUL0:]] = OpIMul %[[#int_32]] %[[#A0]] %[[#B0]]
+; CHECK: %[[#MASK0:]] = OpBitFieldSExtract %[[#int_32]] %[[#MUL0]] %[[#zero]] 8
+; CHECK: %[[#ACC0:]] = OpIAdd %[[#int_32]] %[[#C]] %[[#MASK0]]
+
+; Second element of the packed vector
+; CHECK: %[[#A1:]] = OpBitFieldSExtract %[[#int_32]] %[[#A]] %[[#eight]] 8
+; CHECK: %[[#B1:]] = OpBitFieldSExtract %[[#int_32]] %[[#B]] %[[#eight]] 8
+; CHECK: %[[#MUL1:]] = OpIMul %[[#int_32]] %[[#A1]] %[[#B1]]
+; CHECK: %[[#MASK1:]] = OpBitFieldSExtract %[[#int_32]] %[[#MUL1]] %[[#zero]] 8
+; CHECK: %[[#ACC1:]] = OpIAdd %[[#int_32]] %[[#ACC0]] %[[#MASK1]]
+
+; Third element of the packed vector
+; CHECK: %[[#A2:]] = OpBitFieldSExtract %[[#int_32]] %[[#A]] %[[#sixteen]] 8
+; CHECK: %[[#B2:]] = OpBitFieldSExtract %[[#int_32]] %[[#B]] %[[#sixteen]] 8
+; CHECK: %[[#MUL2:]] = OpIMul %[[#int_32]] %[[#A2]] %[[#B2]]
+; CHECK: %[[#MASK2:]] = OpBitFieldSExtract %[[#int_32]] %[[#MUL2]] %[[#zero]] 8
+; CHECK: %[[#ACC2:]] = OpIAdd %[[#int_32]] %[[#ACC1]] %[[#MASK2]]
+
+; Fourth element of the packed vector
+; CHECK: %[[#A3:]] = OpBitFieldSExtract %[[#int_32]] %[[#A]] %[[#twentyfour]] 8
+; CHECK: %[[#B3:]] = OpBitFieldSExtract %[[#int_32]] %[[#B]] %[[#twentyfour]] 8
+; CHECK: %[[#MUL3:]] = OpIMul %[[#int_32]] %[[#A3]] %[[#B3]]
+; CHECK: %[[#MASK3:]] = OpBitFieldSExtract %[[#int_32]] %[[#MUL3]] %[[#zero]] 8
+; CHECK: %[[#ACC3:]] = OpIAdd %[[#int_32]] %[[#ACC2]] %[[#MASK3]]
+
+; CHECK: OpReturnValue %[[#ACC3]]
+ %spv.dot = call i32 @llvm.spv.dot4add.i8packed(i32 %a, i32 %b, i32 %c)
+ ret i32 %spv.dot
+}
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for checking with me. To generate better SPIR-V, we should try to use the OpSDot
instruction when possible. Please use the capability when targeting SPIR-V 1.6 or when the extension is available. Thanks.
; RUN: llc -O0 -mtriple=spirv32v1.3-vulkan-unknown %s -o - | FileCheck %s | ||
; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32v1.3-vulkan-unknown %s -o - -filetype=obj | spirv-val %} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You should be using `-mtriple=spirv-vulkan-unknown`. Also add tests that use Vulkan 1.3 (`-mtriple=spirv-vulkan1.3-unknown`, I believe), and a test that enables the extension: `--spirv-ext=+SPV_KHR_integer_dot_product`.
You might need changes to SPIRVModuleAnalysis.cpp to get this to work.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated the test cases to cover the different cases. It seems I am required to use `spirv[32|64]` over `spirv` to meet the `Int8` capability requirements.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You should be using `spirv`. You probably need to update `initAvailableCapabilitiesForVulkan` to allow the `Int8` capability. That is core in every version of SPIR-V, so you can make it always available.
728266c
to
580742e
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks good! I learned a couple things about SPIR-V by verifying this.
Just one quibble about the language version and a suggested followup.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM. I just want someone on the OpenCL side to make sure they are okay with the changes for OpenCL.
@@ -524,6 +524,9 @@ defm OpISubBorrow: BinOpTypedGen<"OpISubBorrow", 150, subc, 0, 1>; | |||
def OpUMulExtended: BinOp<"OpUMulExtended", 151>; | |||
def OpSMulExtended: BinOp<"OpSMulExtended", 152>; | |||
|
|||
def OpSDot: BinOp<"OpSDot", 4450>; | |||
def OpUDot: BinOp<"OpUDot", 4451>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do I understand correctly that you need both OpSDot/OpUDot and OpSDotKHR/OpUDotKHR here? The former for versions since 1.6, and the latter for previous SPIR-V versions (also by default at the moment)?
Regarding the comments about differentiating between I agree that these are different ops/capabilities in the SPIR-V spec. However from a quick attempt to implement this, I found that since the Capabilities share the same bit value of ( I think implementing a way to differentiate capabilities that have the same value is out of scope for this pr. So I would propose that we file a follow-up issue to deal with that and either:
WDYT? |
I'll check with the authors of spirv-as and spirv-dis, but here are my thoughts. This distinction makes no difference in the binary format, which is what is defined by the SPIR-V specification. The SPIR-V specification does not define the human-readable format. See https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#_example. The textual format output by glslang by default is actually very different. It is just that the format used by spirv tools happens to have one. If we use Spirv-Tools as the guide, then we can use either one interchangeably. Here are two examples:
I think it is fair for the llvm backend to do the same. The only time people will have a problem is when they use llvm to generate the asm, and try to generate a binary using an older spirv-as. That is easily fixed by generating the binary in llvm. |
Hi, former SPIR WG chair here, and long time maintainer of spirv-as and spirv-dis. @s-perron is correct: only the numbers in the binary are normative.
I recommend not addressing this, ever. It's not worth it. spirv-as will map both the old and new string names to the number; it is driven from the grammar file and knows all registered names for numbers. (e.g. see here for the declaration that capability strings spirv-dis maps the number to what it considers the best name, which is the string listed for the |
- create a clang built-in in Builtins.td - link dot4add_i8packed in hlsl_intrinsics.h - add lowering to spirv backend through expansion of operation as OPSDot is missing up to SPIRV 1.6 in SPIRVInstructionSelector.cpp - add dot4add_i8packed intrinsic to IntrinsicsDirectX.td and mapping to DXIL.td op Dot4AddI8Packed - add tests for HLSL intrinsic lowering to dx/spv intrinsic in dot4add_i8packed.hlsl - add tests for sema checks in dot4add_i8packed-errors.hlsl - add test of spir-v lowering in SPIRV/dot4add_i8packed.ll - add test to dxil lowering in DirectX/dot4add_i8packed.ll
we were unable to add this capability to SPIRV target information as a command line option of llc
- use OpSDot for the lowering when the target has capabilities to do so - fix HLSL_AVAILABILITY
- don't use immediate constant
- define the 4 capabilities and add them to OpenCL init when SPIRV version is 1.6 or greater - require these capabilities during analysis of OpSDot or OpUDot instructions - verify in test case that the capability/extensions are correctly emitted
- use getRegClass instead of IDRegClass - remove unneeded check of isvulkan/opencl - fix testcase to use fixed version and -verify-machineinstrs - fix Result bool to be from and to or
- add minimum version to capabilities
4439ebe
to
b77c090
Compare
Rebased to help resolve merge conflicts. Thanks a lot for the time and reviews. Really helpful and I learned a lot. |
|
||
_HLSL_AVAILABILITY(shadermodel, 6.4) | ||
_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot4add_i8packed) | ||
int dot4add_i8packed(unsigned int, unsigned int, int); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is there a reason you didn't do uint
here? It's the same, so just curious.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Whoops no. A discrepancy on my part.
Resolves #99220