[HLSL] [DXIL] Implement the AddUint64 HLSL function and the UAddc DXIL op (#127137)

Icohedron · inbelic · farzonl · web-flow · commit b4ecebe745dd · 2025-03-05T17:04:10.000-08:00
Fixes #99205. - Implements the HLSL intrinsic `AddUint64` used to perform unsigned 64-bit integer addition by using pairs of unsigned 32-bit integers instead of native 64-bit types - The LLVM intrinsic `uadd_with_overflow` is used in the implementation of `AddUint64` in `CGBuiltin.cpp` - The DXIL op `UAddc` was defined in `DXIL.td`, and a lowering of the LLVM intrinsic `uadd_with_overflow` to the `UAddc` DXIL op was implemented in `DXILOpLowering.cpp` Notes: - `__builtin_addc` was not able to be used to implement `AddUint64` in `hlsl_intrinsics.h` because its `CarryOut` argument is a pointer, and pointers are not supported in HLSL - A lowering of the LLVM intrinsic `uadd_with_overflow` to SPIR-V [already exists](https://github.com/llvm/llvm-project/blob/main/llvm/test/CodeGen/SPIRV/llvm-intrinsics/uadd.with.overflow.ll) - When lowering the LLVM intrinsic `uadd_with_overflow` to the `UAddc` DXIL op, the anonymous struct type `{ i32, i1 }` is replaced with a named struct type `%dx.types.i32c`. This aspect of the implementation may be changed when issue #113192 gets addressed - Fixes issues mentioned in the comments on the original PR #125319 --------- Co-authored-by: Finn Plummer <50529406+inbelic@users.noreply.github.com> Co-authored-by: Farzon Lotfi <farzonlotfi@microsoft.com> Co-authored-by: Chris B <beanz@abolishcrlf.org> Co-authored-by: Justin Bogner <mail@justinbogner.com>
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
@@ -4765,6 +4765,12 @@ def GetDeviceSideMangledName : LangBuiltin<"CUDA_LANG"> {
 }
 
 // HLSL
+def HLSLAddUint64: LangBuiltin<"HLSL_LANG"> {
+  let Spellings = ["__builtin_hlsl_adduint64"];
+  let Attributes = [NoThrow, Const];
+  let Prototype = "void(...)";
+}
+
 def HLSLResourceGetPointer : LangBuiltin<"HLSL_LANG"> {
   let Spellings = ["__builtin_hlsl_resource_getpointer"];
   let Attributes = [NoThrow];
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -10709,6 +10709,11 @@ def err_vector_incorrect_num_elements : Error<
   "%select{too many|too few}0 elements in vector %select{initialization|operand}3 (expected %1 elements, have %2)">;
 def err_altivec_empty_initializer : Error<"expected initializer">;
 
+def err_vector_incorrect_bit_count : Error<
+  "incorrect number of bits in vector operand (expected %select{|a multiple of}0 %1 bits, have %2)">;
+def err_integer_incorrect_bit_count : Error<
+  "incorrect number of bits in integer (expected %0 bits, have %1)">;
+
 def err_invalid_neon_type_code : Error<
   "incompatible constant for this __builtin_neon function">;
 def err_argument_invalid_range : Error<
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -19470,6 +19470,62 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
     return nullptr;
 
   switch (BuiltinID) {
+  case Builtin::BI__builtin_hlsl_adduint64: {
+    Value *OpA = EmitScalarExpr(E->getArg(0));
+    Value *OpB = EmitScalarExpr(E->getArg(1));
+    QualType Arg0Ty = E->getArg(0)->getType();
+    uint64_t NumElements = Arg0Ty->castAs<VectorType>()->getNumElements();
+    assert(Arg0Ty == E->getArg(1)->getType() &&
+           "AddUint64 operand types must match");
+    assert(Arg0Ty->hasIntegerRepresentation() &&
+           "AddUint64 operands must have an integer representation");
+    assert((NumElements == 2 || NumElements == 4) &&
+           "AddUint64 operands must have 2 or 4 elements");
+
+    llvm::Value *LowA;
+    llvm::Value *HighA;
+    llvm::Value *LowB;
+    llvm::Value *HighB;
+
+    // Obtain low and high words of inputs A and B
+    if (NumElements == 2) {
+      LowA = Builder.CreateExtractElement(OpA, (uint64_t)0, "LowA");
+      HighA = Builder.CreateExtractElement(OpA, (uint64_t)1, "HighA");
+      LowB = Builder.CreateExtractElement(OpB, (uint64_t)0, "LowB");
+      HighB = Builder.CreateExtractElement(OpB, (uint64_t)1, "HighB");
+    } else {
+      LowA = Builder.CreateShuffleVector(OpA, ArrayRef<int>{0, 2}, "LowA");
+      HighA = Builder.CreateShuffleVector(OpA, ArrayRef<int>{1, 3}, "HighA");
+      LowB = Builder.CreateShuffleVector(OpB, ArrayRef<int>{0, 2}, "LowB");
+      HighB = Builder.CreateShuffleVector(OpB, ArrayRef<int>{1, 3}, "HighB");
+    }
+
+    // Use an uadd_with_overflow to compute the sum of low words and obtain a
+    // carry value
+    llvm::Value *Carry;
+    llvm::Value *LowSum = EmitOverflowIntrinsic(
+        *this, llvm::Intrinsic::uadd_with_overflow, LowA, LowB, Carry);
+    llvm::Value *ZExtCarry =
+        Builder.CreateZExt(Carry, HighA->getType(), "CarryZExt");
+
+    // Sum the high words and the carry
+    llvm::Value *HighSum = Builder.CreateAdd(HighA, HighB, "HighSum");
+    llvm::Value *HighSumPlusCarry =
+        Builder.CreateAdd(HighSum, ZExtCarry, "HighSumPlusCarry");
+
+    if (NumElements == 4) {
+      return Builder.CreateShuffleVector(LowSum, HighSumPlusCarry,
+                                         ArrayRef<int>{0, 2, 1, 3},
+                                         "hlsl.AddUint64");
+    }
+
+    llvm::Value *Result = PoisonValue::get(OpA->getType());
+    Result = Builder.CreateInsertElement(Result, LowSum, (uint64_t)0,
+                                         "hlsl.AddUint64.upto0");
+    Result = Builder.CreateInsertElement(Result, HighSumPlusCarry, (uint64_t)1,
+                                         "hlsl.AddUint64");
+    return Result;
+  }
   case Builtin::BI__builtin_hlsl_resource_getpointer: {
     Value *HandleOp = EmitScalarExpr(E->getArg(0));
     Value *IndexOp = EmitScalarExpr(E->getArg(1));
diff --git a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h
@@ -174,6 +174,27 @@ float3 acos(float3);
 _HLSL_BUILTIN_ALIAS(__builtin_elementwise_acos)
 float4 acos(float4);
 
+//===----------------------------------------------------------------------===//
+// AddUint64 builtins
+//===----------------------------------------------------------------------===//
+
+/// \fn T AddUint64(T a, T b)
+/// \brief Implements unsigned 64-bit integer addition using pairs of unsigned
+/// 32-bit integers.
+/// \param x [in] The first unsigned 32-bit integer pair(s)
+/// \param y [in] The second unsigned 32-bit integer pair(s)
+///
+/// This function takes one or two pairs (low, high) of unsigned 32-bit integer
+/// values and returns pairs (low, high) of unsigned 32-bit integer
+/// values representing the result of unsigned 64-bit integer addition.
+
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_adduint64)
+uint32_t2 AddUint64(uint32_t2, uint32_t2);
+_HLSL_AVAILABILITY(shadermodel, 6.0)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_adduint64)
+uint32_t4 AddUint64(uint32_t4, uint32_t4);
+
 //===----------------------------------------------------------------------===//
 // all builtins
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
@@ -2086,6 +2086,18 @@ static bool CheckAllArgsHaveFloatRepresentation(Sema *S, CallExpr *TheCall) {
                                     checkAllFloatTypes);
 }
 
+static bool CheckUnsignedIntRepresentations(Sema *S, CallExpr *TheCall) {
+  auto checkUnsignedInteger = [](clang::QualType PassedType) -> bool {
+    clang::QualType BaseType =
+        PassedType->isVectorType()
+            ? PassedType->getAs<clang::VectorType>()->getElementType()
+            : PassedType;
+    return !BaseType->isUnsignedIntegerType();
+  };
+  return CheckAllArgTypesAreCorrect(S, TheCall, S->Context.UnsignedIntTy,
+                                    checkUnsignedInteger);
+}
+
 static bool CheckFloatOrHalfRepresentations(Sema *S, CallExpr *TheCall) {
   auto checkFloatorHalf = [](clang::QualType PassedType) -> bool {
     clang::QualType BaseType =
@@ -2277,6 +2289,52 @@ static bool CheckResourceHandle(
 // returning an ExprError
 bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
   switch (BuiltinID) {
+  case Builtin::BI__builtin_hlsl_adduint64: {
+    if (SemaRef.checkArgCount(TheCall, 2))
+      return true;
+    if (CheckVectorElementCallArgs(&SemaRef, TheCall))
+      return true;
+    if (CheckUnsignedIntRepresentations(&SemaRef, TheCall))
+      return true;
+
+    // CheckVectorElementCallArgs(...) guarantees both args are the same type.
+    assert(TheCall->getArg(0)->getType() == TheCall->getArg(1)->getType() &&
+           "Both args must be of the same type");
+
+    // ensure both args are vectors
+    auto *VTy = TheCall->getArg(0)->getType()->getAs<VectorType>();
+    if (!VTy) {
+      SemaRef.Diag(TheCall->getBeginLoc(), diag::err_vec_builtin_non_vector)
+          << TheCall->getDirectCallee() << /*all*/ 1;
+      return true;
+    }
+
+    // ensure arg integers are 32-bits
+    uint64_t ElementBitCount = getASTContext()
+                                   .getTypeSizeInChars(VTy->getElementType())
+                                   .getQuantity() *
+                               8;
+    if (ElementBitCount != 32) {
+      SemaRef.Diag(TheCall->getBeginLoc(),
+                   diag::err_integer_incorrect_bit_count)
+          << 32 << ElementBitCount;
+      return true;
+    }
+
+    // ensure both args are vectors of total bit size of a multiple of 64
+    int NumElementsArg = VTy->getNumElements();
+    if (NumElementsArg != 2 && NumElementsArg != 4) {
+      SemaRef.Diag(TheCall->getBeginLoc(), diag::err_vector_incorrect_bit_count)
+          << 1 /*a multiple of*/ << 64 << NumElementsArg * ElementBitCount;
+      return true;
+    }
+
+    ExprResult A = TheCall->getArg(0);
+    QualType ArgTyA = A.get()->getType();
+    // return type is the same as the input type
+    TheCall->setType(ArgTyA);
+    break;
+  }
   case Builtin::BI__builtin_hlsl_resource_getpointer: {
     if (SemaRef.checkArgCount(TheCall, 2) ||
         CheckResourceHandle(&SemaRef, TheCall, 0) ||
diff --git a/clang/test/CodeGenHLSL/builtins/AddUint64.hlsl b/clang/test/CodeGenHLSL/builtins/AddUint64.hlsl
@@ -0,0 +1,58 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
+// RUN:  -emit-llvm -disable-llvm-passes -o - | \
+// RUN:  FileCheck %s --check-prefixes=CHECK
+
+
+// CHECK-LABEL: define noundef <2 x i32> @_Z20test_AddUint64_uint2Dv2_jS_(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i32>, align 8
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i32>, align 8
+// CHECK-NEXT:    store <2 x i32> [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    store <2 x i32> [[B]], ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    [[LOWA:%.*]] = extractelement <2 x i32> [[TMP0]], i64 0
+// CHECK-NEXT:    [[HIGHA:%.*]] = extractelement <2 x i32> [[TMP0]], i64 1
+// CHECK-NEXT:    [[LOWB:%.*]] = extractelement <2 x i32> [[TMP1]], i64 0
+// CHECK-NEXT:    [[HIGHB:%.*]] = extractelement <2 x i32> [[TMP1]], i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[LOWA]], i32 [[LOWB]])
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP2]], 0
+// CHECK-NEXT:    [[CARRYZEXT:%.*]] = zext i1 [[TMP3]] to i32
+// CHECK-NEXT:    [[HIGHSUM:%.*]] = add i32 [[HIGHA]], [[HIGHB]]
+// CHECK-NEXT:    [[HIGHSUMPLUSCARRY:%.*]] = add i32 [[HIGHSUM]], [[CARRYZEXT]]
+// CHECK-NEXT:    [[HLSL_ADDUINT64_UPTO0:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i64 0
+// CHECK-NEXT:    [[HLSL_ADDUINT64:%.*]] = insertelement <2 x i32> [[HLSL_ADDUINT64_UPTO0]], i32 [[HIGHSUMPLUSCARRY]], i64 1
+// CHECK-NEXT:    ret <2 x i32> [[HLSL_ADDUINT64]]
+//
+uint2 test_AddUint64_uint2(uint2 a, uint2 b) {
+  return AddUint64(a, b);
+}
+
+// CHECK-LABEL: define noundef <4 x i32> @_Z20test_AddUint64_uint4Dv4_jS_(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <4 x i32>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <4 x i32>, align 16
+// CHECK-NEXT:    store <4 x i32> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store <4 x i32> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[LOWA:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT:    [[HIGHA:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT:    [[LOWB:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT:    [[HIGHB:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = call { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32> [[LOWA]], <2 x i32> [[LOWB]])
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <2 x i32>, <2 x i1> } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i1> } [[TMP2]], 0
+// CHECK-NEXT:    [[CARRYZEXT:%.*]] = zext <2 x i1> [[TMP3]] to <2 x i32>
+// CHECK-NEXT:    [[HIGHSUM:%.*]] = add <2 x i32> [[HIGHA]], [[HIGHB]]
+// CHECK-NEXT:    [[HIGHSUMPLUSCARRY:%.*]] = add <2 x i32> [[HIGHSUM]], [[CARRYZEXT]]
+// CHECK-NEXT:    [[HLSL_ADDUINT64:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[HIGHSUMPLUSCARRY]], <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+// CHECK-NEXT:    ret <4 x i32> [[HLSL_ADDUINT64]]
+//
+uint4 test_AddUint64_uint4(uint4 a, uint4 b) {
+  return AddUint64(a, b);
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/AddUint64-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/AddUint64-errors.hlsl
@@ -0,0 +1,46 @@
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
+
+uint2 test_too_few_arg() {
+  return __builtin_hlsl_adduint64();
+  // expected-error@-1 {{too few arguments to function call, expected 2, have 0}}
+}
+
+uint4 test_too_many_arg(uint4 a) {
+  return __builtin_hlsl_adduint64(a, a, a);
+  // expected-error@-1 {{too many arguments to function call, expected 2, have 3}}
+}
+
+uint2 test_mismatched_arg_types(uint2 a, uint4 b) {
+  return __builtin_hlsl_adduint64(a, b);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_adduint64' must have the same type}}
+}
+
+uint2 test_bad_num_arg_elements(uint3 a, uint3 b) {
+  return __builtin_hlsl_adduint64(a, b);
+  // expected-error@-1 {{incorrect number of bits in vector operand (expected a multiple of 64 bits, have 96)}}
+}
+
+uint2 test_scalar_arg_type(uint a) {
+  return __builtin_hlsl_adduint64(a, a);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_adduint64' must be vectors}}
+}
+
+uint2 test_uint64_args(uint16_t2 a) {
+  return __builtin_hlsl_adduint64(a, a);
+  // expected-error@-1 {{incorrect number of bits in integer (expected 32 bits, have 16)}}
+}
+
+uint2 test_signed_integer_args(int2 a, int2 b) {
+  return __builtin_hlsl_adduint64(a, b);
+// expected-error@-1 {{passing 'int2' (aka 'vector<int, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(unsigned int)))) unsigned int' (vector of 2 'unsigned int' values)}}
+}
+
+struct S {
+  uint2 a;
+};
+
+uint2 test_incorrect_arg_type(S a) {
+  return __builtin_hlsl_adduint64(a, a);
+  // expected-error@-1 {{passing 'S' to parameter of incompatible type 'unsigned int'}}
+}
+
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
@@ -56,6 +56,7 @@ def HandleTy : DXILOpParamType;
 def ResBindTy : DXILOpParamType;
 def ResPropsTy : DXILOpParamType;
 def SplitDoubleTy : DXILOpParamType;
+def BinaryWithCarryTy : DXILOpParamType;
 
 class DXILOpClass;
 
@@ -744,6 +745,16 @@ def UMin : DXILOp<40, binary> {
   let attributes = [Attributes<DXIL1_0, [ReadNone]>];
 }
 
+def UAddc : DXILOp<44, binaryWithCarryOrBorrow > {
+  let Doc = "unsigned add of 32-bit operand with the carry";
+  let intrinsics = [IntrinSelect<int_uadd_with_overflow>];
+  let arguments = [OverloadTy, OverloadTy];
+  let result = BinaryWithCarryTy;
+  let overloads = [Overloads<DXIL1_0, [Int32Ty]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
 def FMad : DXILOp<46, tertiary> {
   let Doc = "Floating point arithmetic multiply/add operation. fmad(m,a,b) = m "
             "* a + b.";
diff --git a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
@@ -253,6 +253,14 @@ static StructType *getSplitDoubleType(LLVMContext &Context) {
   return StructType::create({Int32Ty, Int32Ty}, "dx.types.splitdouble");
 }
 
+static StructType *getBinaryWithCarryType(LLVMContext &Context) {
+  if (auto *ST = StructType::getTypeByName(Context, "dx.types.i32c"))
+    return ST;
+  Type *Int32Ty = Type::getInt32Ty(Context);
+  Type *Int1Ty = Type::getInt1Ty(Context);
+  return StructType::create({Int32Ty, Int1Ty}, "dx.types.i32c");
+}
+
 static Type *getTypeFromOpParamType(OpParamType Kind, LLVMContext &Ctx,
                                     Type *OverloadTy) {
   switch (Kind) {
@@ -308,6 +316,8 @@ static Type *getTypeFromOpParamType(OpParamType Kind, LLVMContext &Ctx,
     return getResPropsType(Ctx);
   case OpParamType::SplitDoubleTy:
     return getSplitDoubleType(Ctx);
+  case OpParamType::BinaryWithCarryTy:
+    return getBinaryWithCarryType(Ctx);
   }
   llvm_unreachable("Invalid parameter kind");
   return nullptr;
diff --git a/llvm/test/CodeGen/DirectX/UAddc.ll b/llvm/test/CodeGen/DirectX/UAddc.ll
diff --git a/llvm/test/CodeGen/DirectX/UAddc_errors.ll b/llvm/test/CodeGen/DirectX/UAddc_errors.ll
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/AddUint64.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/AddUint64.ll