intel · frasercrmck · Jul 3, 2024 · Jul 4, 2024 · Jul 4, 2024 · Jul 4, 2024
@@ -259,6 +259,22 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
       addNVVMMetadata(F, "maxntidx", MWGS->getZDimVal());
       addNVVMMetadata(F, "maxntidy", MWGS->getYDimVal());
       addNVVMMetadata(F, "maxntidz", MWGS->getXDimVal());
+    } else if (auto Attr = F->getFnAttribute("sycl-max-work-group-size");
+               Attr.isValid()) {
+      // Split values in the comma-separated list integers.
+      SmallVector<StringRef, 3> ValStrs;
+      Attr.getValueAsString().split(ValStrs, ',');
+      assert(ValStrs.size() == 3 && "Must have all three dimensions for "
+                                    "sycl-max-work-group-size property");
+
+      static constexpr const char *Annots[] = {"maxntidx", "maxntidy",
+                                               "maxntidz"};
+      for (auto [AnnotStr, ValStr] : zip(Annots, reverse(ValStrs))) {
+        int Value = 0;
+        bool Error = ValStr.getAsInteger(10, Value);
+        assert(!Error && "The attribute's value is not a number");
+        addNVVMMetadata(F, AnnotStr, Value);
+      }
     }
 
     auto attrValue = [&](Expr *E) {
@@ -278,6 +294,13 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
         addNVVMMetadata(F, "minctasm", attrValue(MWGPCU->getValue()));
         HasMinWorkGroupPerCU = true;
       }
+    } else if (auto Attr =
+                   F->getFnAttribute("sycl-min-work-groups-per-multiprocessor");
+               Attr.isValid()) {
+      int Value = 0;
+      bool Error = Attr.getValueAsString().getAsInteger(10, Value);
+      assert(!Error && "The attribute's value is not a number");
+      addNVVMMetadata(F, "minctasm", Value);
     }
 
     if (const auto *MWGPMP =
@@ -291,6 +314,13 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
         // The value is guaranteed to be > 0, pass it to the metadata.
         addNVVMMetadata(F, "maxclusterrank", attrValue(MWGPMP->getValue()));
       }
+    } else if (auto Attr =
+                   F->getFnAttribute("sycl-max-work-groups-per-cluster");
+               Attr.isValid()) {
+      int Value = 0;
+      bool Error = Attr.getValueAsString().getAsInteger(10, Value);
+      assert(!Error && "The attribute's value is not a number");
+      addNVVMMetadata(F, "maxclusterrank", Value);
     }
   }
 

@@ -361,18 +361,29 @@ attributeToExecModeMetadata(const Attribute &Attr, Function &F) {
       AddFPControlMetadataForWidth(SPIRV_DENORM_PRESERVE, 64);
   }
 
-  if (AttrKindStr == "sycl-work-group-size" ||
-      AttrKindStr == "sycl-work-group-size-hint") {
+  static constexpr std::tuple<const char *, const char *, bool>
+      SimpleWGAttrs[] = {
+          {"sycl-work-group-size", "reqd_work_group_size",
+           /*RequiresAll3Dims*/ false},
+          {"sycl-work-group-size-hint", "work_group_size_hint",
+           /*RequiresAll3Dims*/ false},
+          {"sycl-max-work-group-size", "max_work_group_size",
+           /*RequiresAll3Dims*/ true},
+      };
+
+  for (auto &[AttrKind, MDStr, Req3D] : SimpleWGAttrs) {
+    if (AttrKindStr != AttrKind)
+      continue;
     // Split values in the comma-separated list integers.
-    SmallVector<StringRef, 3> ValStrs;
-    Attr.getValueAsString().split(ValStrs, ',');
+    SmallVector<StringRef, 3> AttrValStrs;
+    Attr.getValueAsString().split(AttrValStrs, ',');
 
-    assert(ValStrs.size() <= 3 &&
-           "sycl-work-group-size and sycl-work-group-size-hint currently only "
-           "support up to three values");
+    assert(((Req3D && AttrValStrs.size() == 3) ||
+            (!Req3D && AttrValStrs.size() <= 3)) &&
+           "Incorrect number of values for kernel property");
 
     // SYCL work-group sizes must be reversed for SPIR-V.
-    std::reverse(ValStrs.begin(), ValStrs.end());
+    std::reverse(AttrValStrs.begin(), AttrValStrs.end());
 
     // Use integer pointer size as closest analogue to size_t.
     IntegerType *IntPtrTy = DLayout.getIntPtrType(Ctx);
@@ -381,24 +392,29 @@ attributeToExecModeMetadata(const Attribute &Attr, Function &F) {
 
     // Get the integers from the strings.
     SmallVector<Metadata *, 3> MDVals;
-    for (StringRef ValStr : ValStrs)
+    for (StringRef ValStr : AttrValStrs)
       MDVals.push_back(ConstantAsMetadata::get(
           Constant::getIntegerValue(SizeTTy, APInt(SizeTBitSize, ValStr, 10))));
 
-    const char *MDName = (AttrKindStr == "sycl-work-group-size")
-                             ? "reqd_work_group_size"
-                             : "work_group_size_hint";
-    return std::pair<std::string, MDNode *>(MDName, MDNode::get(Ctx, MDVals));
+    return std::pair<std::string, MDNode *>(MDStr, MDNode::get(Ctx, MDVals));
   }
 
-  if (AttrKindStr == "sycl-sub-group-size") {
-    uint32_t SubGroupSize = getAttributeAsInteger<uint32_t>(Attr);
-    IntegerType *Ty = Type::getInt32Ty(Ctx);
-    Metadata *MDVal = ConstantAsMetadata::get(
-        Constant::getIntegerValue(Ty, APInt(32, SubGroupSize)));
-    SmallVector<Metadata *, 1> MD{MDVal};
-    return std::pair<std::string, MDNode *>("intel_reqd_sub_group_size",
-                                            MDNode::get(Ctx, MD));
+  static constexpr std::pair<const char *, const char *> SimpleI32Attrs[] = {
+      {"sycl-sub-group-size", "intel_reqd_sub_group_size"},
+      {"sycl-min-work-groups-per-multiprocessor",
+       "min_work_groups_per_multiprocessor"},
+      {"sycl-max-work-groups-per-cluster", "max_work_groups_per_cluster"},
+  };
+
+  for (auto [AttrKind, MDStr] : SimpleI32Attrs) {
+    if (AttrKindStr == AttrKind) {
+      uint32_t SubGroupSize = getAttributeAsInteger<uint32_t>(Attr);
+      IntegerType *Ty = Type::getInt32Ty(Ctx);
+      Metadata *MDVal = ConstantAsMetadata::get(
+          Constant::getIntegerValue(Ty, APInt(32, SubGroupSize)));
+      SmallVector<Metadata *, 1> MD{MDVal};
+      return std::pair<std::string, MDNode *>(MDStr, MDNode::get(Ctx, MD));
+    }
   }
 
   // The sycl-single-task attribute currently only has an effect when targeting

@@ -71,6 +71,31 @@ struct single_task_kernel_key {
   using value_t = property_value<single_task_kernel_key>;
 };
 
+struct max_work_group_size_key
+    : detail::compile_time_property_key<detail::PropKind::MaxWorkGroupSize> {
+  template <size_t Dim0, size_t Dim1, size_t Dim2>
+  using value_t = property_value<max_work_group_size_key,
+                                 std::integral_constant<size_t, Dim0>,
+                                 std::integral_constant<size_t, Dim1>,
+                                 std::integral_constant<size_t, Dim2>>;
+};
+
+struct min_work_groups_per_multiprocessor_key
+    : detail::compile_time_property_key<
+          detail::PropKind::MinWorkGroupsPerMultiprocessor> {
+  template <uint32_t Size>
+  using value_t = property_value<min_work_groups_per_multiprocessor_key,
+                                 std::integral_constant<uint32_t, Size>>;
+};
+
+struct max_work_groups_per_cluster_key
+    : detail::compile_time_property_key<
+          detail::PropKind::MaxWorkGroupsPerCluster> {
+  template <uint32_t Size>
+  using value_t = property_value<max_work_groups_per_cluster_key,
+                                 std::integral_constant<uint32_t, Size>>;
+};
+
 template <size_t Dim0, size_t... Dims>
 struct property_value<work_group_size_key, std::integral_constant<size_t, Dim0>,
                       std::integral_constant<size_t, Dims>...> {
@@ -138,6 +163,44 @@ template <> struct property_value<single_task_kernel_key> {
   using key_t = single_task_kernel_key;
 };
 
+template <size_t Dim0, size_t Dim1, size_t Dim2>
+struct property_value<max_work_group_size_key,
+                      std::integral_constant<size_t, Dim0>,
+                      std::integral_constant<size_t, Dim1>,
+                      std::integral_constant<size_t, Dim2>> {
+  static_assert(
+      detail::AllNonZero<Dim0, Dim1, Dim2>::value,
+      "max_work_group_size property must only contain non-zero values.");
+
+  using key_t = max_work_group_size_key;
+
+  constexpr size_t operator[](int Dim) const {
+    return std::array<size_t, 3>{Dim0, Dim1, Dim2}[Dim];
+  }
+};
+
+template <uint32_t Size>
+struct property_value<min_work_groups_per_multiprocessor_key,
+                      std::integral_constant<uint32_t, Size>> {
+  static_assert(Size != 0, "min_work_groups_per_multiprocessor_key property "
+                           "must contain a non-zero value.");
+
+  using key_t = min_work_groups_per_multiprocessor_key;
+  using value_t = std::integral_constant<uint32_t, Size>;
+  static constexpr uint32_t value = Size;
+};
+
+template <uint32_t Size>
+struct property_value<max_work_groups_per_cluster_key,
+                      std::integral_constant<uint32_t, Size>> {
+  static_assert(Size != 0, "max_work_groups_per_cluster_key property must "
+                           "contain a non-zero value.");
+
+  using key_t = max_work_groups_per_cluster_key;
+  using value_t = std::integral_constant<uint32_t, Size>;
+  static constexpr uint32_t value = Size;
+};
+
 template <size_t Dim0, size_t... Dims>
 inline constexpr work_group_size_key::value_t<Dim0, Dims...> work_group_size;
 
@@ -156,6 +219,18 @@ inline constexpr nd_range_kernel_key::value_t<Dims> nd_range_kernel;
 
 inline constexpr single_task_kernel_key::value_t single_task_kernel;
 
+template <size_t Dim0, size_t Dim1, size_t Dim2>
+inline constexpr max_work_group_size_key::value_t<Dim0, Dim1, Dim2>
+    max_work_group_size;
+
+template <uint32_t Size>
+inline constexpr min_work_groups_per_multiprocessor_key::value_t<Size>
+    min_work_groups_per_multiprocessor;
+
+template <uint32_t Size>
+inline constexpr max_work_groups_per_cluster_key::value_t<Size>
+    max_work_groups_per_cluster;
+
 struct work_group_progress_key
     : detail::compile_time_property_key<detail::PropKind::WorkGroupProgress> {
   template <forward_progress_guarantee Guarantee,
@@ -270,6 +345,21 @@ template <> struct PropertyMetaInfo<single_task_kernel_key::value_t> {
   static constexpr const char *name = "sycl-single-task-kernel";
   static constexpr int value = 0;
 };
+template <size_t Dim0, size_t Dim1, size_t Dim2>
+struct PropertyMetaInfo<max_work_group_size_key::value_t<Dim0, Dim1, Dim2>> {
+  static constexpr const char *name = "sycl-max-work-group-size";
+  static constexpr const char *value = SizeListToStr<Dim0, Dim1, Dim2>::value;
+};
+template <uint32_t Size>
+struct PropertyMetaInfo<min_work_groups_per_multiprocessor_key::value_t<Size>> {
+  static constexpr const char *name = "sycl-min-work-groups-per-multiprocessor";
+  static constexpr uint32_t value = Size;
+};
+template <uint32_t Size>
+struct PropertyMetaInfo<max_work_groups_per_cluster_key::value_t<Size>> {
+  static constexpr const char *name = "sycl-max-work-groups-per-cluster";
+  static constexpr uint32_t value = Size;
+};
 
 template <typename T, typename = void>
 struct HasKernelPropertiesGetMethod : std::false_type {};

@@ -212,8 +212,11 @@ enum PropKind : uint32_t {
   IncludeFiles = 71,
   RegisteredKernelNames = 72,
   ClusterLaunch = 73,
+  MaxWorkGroupSize = 74,
+  MinWorkGroupsPerMultiprocessor = 75,
+  MaxWorkGroupsPerCluster = 76,
   // PropKindSize must always be the last value.
-  PropKindSize = 74,
+  PropKindSize = 77,
 };
 
 struct property_key_base_tag {};

@@ -0,0 +1,27 @@
+// RUN: %clangxx -fsycl-device-only -S -Xclang -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK-IR
+// RUN: %clangxx -fsycl -fsyntax-only -Xclang -verify %s
+// expected-no-diagnostics
+
+#include <sycl/sycl.hpp>
+
+int main() {
+  sycl::queue Q;
+
+  constexpr auto Props = sycl::ext::oneapi::experimental::properties{
+      sycl::ext::oneapi::experimental::min_work_groups_per_multiprocessor<8>,
+      sycl::ext::oneapi::experimental::max_work_groups_per_cluster<4>,
+  };
+  // CHECK-IR: spir_kernel void @{{.*}}LaunchBoundsKernel(){{.*}} #[[LaunchBoundsAttrs:[0-9]+]]
+  // CHECK-IR-SAME: !max_work_groups_per_cluster ![[MaxWGsPerCMD:[0-9]+]]
+  // CHECK-IR-SAME: !min_work_groups_per_multiprocessor ![[MinWGsPerMPMD:[0-9]+]]
+  Q.single_task<class LaunchBoundsKernel>(Props, []() {});
+
+  return 0;
+}
+
+// CHECK-IR: attributes #[[LaunchBoundsAttrs]] = {
+// CHECK-IR-SAME: "sycl-max-work-groups-per-cluster"="4"
+// CHECK-IR-SAME: "sycl-min-work-groups-per-multiprocessor"="8"
+
+// CHECK-IR: ![[MaxWGsPerCMD]] = !{i32 4}
+// CHECK-IR: ![[MinWGsPerMPMD]] = !{i32 8}
@@ -0,0 +1,27 @@
+// REQUIRES: cuda
+
+// RUN: %clangxx -fsycl-device-only -fsycl-targets=nvptx64-nvidia-cuda -S -Xclang -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK-IR
+
+#include <sycl/sycl.hpp>
+
+int main() {
+  sycl::queue Q;
+
+  constexpr auto Props = sycl::ext::oneapi::experimental::properties{
+      sycl::ext::oneapi::experimental::min_work_groups_per_multiprocessor<8>,
+      sycl::ext::oneapi::experimental::max_work_groups_per_cluster<4>,
+  };
+
+  // CHECK-IR: define{{.*}}void @[[LaunchBoundsKernelFn:.*LaunchBoundsKernel0]](){{.*}} #[[LaunchBoundsAttrs:[0-9]+]]
+  Q.single_task<class LaunchBoundsKernel0>(Props, []() {});
+
+  return 0;
+}
+
+// CHECK-IR: attributes #[[LaunchBoundsAttrs]] = {
+// CHECK-IR-SAME: "sycl-max-work-groups-per-cluster"="4"
+// CHECK-IR-SAME: "sycl-min-work-groups-per-multiprocessor"="8"
+
+// CHECK-IR-DAG: !{ptr @[[LaunchBoundsKernelFn]], !"kernel", i32 1}
+// CHECK-IR-DAG: !{ptr @[[LaunchBoundsKernelFn]], !"minctasm", i32 8}
+// CHECK-IR-DAG: !{ptr @[[LaunchBoundsKernelFn]], !"maxclusterrank", i32 4}
@@ -0,0 +1,29 @@
+// RUN: %clangxx -fsycl-device-only -S -Xclang -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK-IR
+// RUN: %clangxx -fsycl -fsyntax-only -Xclang -verify %s
+// expected-no-diagnostics
+
+#include <sycl/sycl.hpp>
+
+int main() {
+  sycl::queue Q;
+  sycl::event Ev;
+
+  constexpr auto Props = sycl::ext::oneapi::experimental::properties{
+      sycl::ext::oneapi::experimental::max_work_group_size<8, 4, 2>};
+
+  // CHECK-IR: spir_kernel void @{{.*}}MaxWGSizeKernel0(){{.*}} #[[MaxWGSizeAttr1:[0-9]+]]
+  // CHECK-IR-SAME: !max_work_group_size ![[MaxWGSizeMD1:[0-9]+]]
+  Q.single_task<class MaxWGSizeKernel0>(Props, []() {});
+  // CHECK-IR: spir_kernel void @{{.*}}MaxWGSizeKernel1(){{.*}} #[[MaxWGSizeAttr1]]
+  // CHECK-IR-SAME: !max_work_group_size ![[MaxWGSizeMD1]]
+  Q.single_task<class MaxWGSizeKernel1>(Ev, Props, []() {});
+  // CHECK-IR: spir_kernel void @{{.*}}MaxWGSizeKernel2(){{.*}} #[[MaxWGSizeAttr1]]
+  // CHECK-IR-SAME: !max_work_group_size ![[MaxWGSizeMD1]]
+  Q.single_task<class MaxWGSizeKernel2>({Ev}, Props, []() {});
+
+  return 0;
+}
+
+// CHECK-IR: attributes #[[MaxWGSizeAttr1]] = { {{.*}}"sycl-max-work-group-size"="8,4,2"
+
+// CHECK-IR: ![[MaxWGSizeMD1]] = !{i64 2, i64 4, i64 8}
@@ -0,0 +1,25 @@
+// REQUIRES: cuda
+
+// RUN: %clangxx -fsycl-device-only -fsycl-targets=nvptx64-nvidia-cuda -S -Xclang -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK-IR
+
+#include <sycl/sycl.hpp>
+
+int main() {
+  sycl::queue Q;
+
+  constexpr auto Props = sycl::ext::oneapi::experimental::properties{
+      sycl::ext::oneapi::experimental::max_work_group_size<8, 4, 2>};
+
+  // CHECK-IR: define{{.*}}void @[[MaxWGSizeKernelFn:.*MaxWGSizeKernel0]](){{.*}} #[[MaxWGSizeAttr1:[0-9]+]]
+  Q.single_task<class MaxWGSizeKernel0>(Props, []() {});
+
+  return 0;
+}
+
+// CHECK-IR: attributes #[[MaxWGSizeAttr1]] = {
+// CHECK-IR-SAME: "sycl-max-work-group-size"="8,4,2"
+
+// CHECK-IR-DAG: !{ptr @[[MaxWGSizeKernelFn]], !"kernel", i32 1}
+// CHECK-IR-DAG: !{ptr @[[MaxWGSizeKernelFn]], !"maxntidx", i32 2}
+// CHECK-IR-DAG: !{ptr @[[MaxWGSizeKernelFn]], !"maxntidy", i32 4}
+// CHECK-IR-DAG: !{ptr @[[MaxWGSizeKernelFn]], !"maxntidz", i32 8}