@@ -118,11 +118,11 @@ struct GNRowwiseMomentsVectorizedFunctor
      sycl::nd_item<1> item) const {
    WelfordType val[VEC_SIZE];
    WelfordOp welford_op = {/*correction=*/0, /*take_sqrt=*/false, item};
-    auto g_start = item.get_group(0) * VEC_SIZE;
+    auto group_start = item.get_group(0) * VEC_SIZE;

#pragma unroll
    for (int v = 0; v < VEC_SIZE; ++v) {
-      const int64_t i = g_start + v;
+      const int64_t i = group_start + v;
      for (int64_t j = item.get_local_id(0) * VEC_SIZE; j < N_;
           j += item.get_local_range(0) * VEC_SIZE) {
        const int64_t vec_index = i * N_ + j;
@@ -153,8 +153,8 @@ struct GNRowwiseMomentsVectorizedFunctor
        mean_vec[v] = m1;
        rstd_vec[v] = c10::xpu::compat::rsqrt(m2 + static_cast<T_ACC>(eps_));
      }
-      *(reinterpret_cast<vec_t*>(mean_ + g_start)) = mean_vec;
-      *(reinterpret_cast<vec_t*>(rstd_ + g_start)) = rstd_vec;
+      *(reinterpret_cast<vec_t*>(mean_ + group_start)) = mean_vec;
+      *(reinterpret_cast<vec_t*>(rstd_ + group_start)) = rstd_vec;
    }
  }

@@ -934,6 +934,91 @@ struct ComputeInternalGradientsFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
  sycl_local_acc_t<T_ACC> db_shared_;
};

+template <typename T, int SIMD, int VEC_SIZE>
+struct ComputeInternalGradientsVectorizedFunctor
+    : public __SYCL_KER_CONFIG_CONVENTION__ {
+  using T_ACC = acc_type_device<T, kXPU>;
+  using vec_t = memory::aligned_vector<T, VEC_SIZE>;
+  using acc_vec_t = memory::aligned_vector<T_ACC, VEC_SIZE>;
+
+  [[intel::reqd_sub_group_size(SIMD)]] void operator()(
+      sycl::nd_item<1> item) const {
+    acc_vec_t sum1_vec;
+    acc_vec_t sum2_vec;
+
+#pragma unroll
+    for (int v = 0; v < VEC_SIZE; ++v) {
+      sum1_vec[v] = 0;
+      sum2_vec[v] = 0;
+    }
+
+    auto group_start = item.get_group(0) * VEC_SIZE;
+
+#pragma unroll
+    for (int v = 0; v < VEC_SIZE; ++v) {
+      const int64_t nc = group_start + v;
+      for (int64_t hw = item.get_local_id(0) * VEC_SIZE; hw < HxW_;
+           hw += item.get_local_range(0) * VEC_SIZE) {
+        const int64_t vec_index = nc * HxW_ + hw;
+        vec_t vec_dY_ =
+            *reinterpret_cast<vec_t*>(const_cast<T*>(dY_) + vec_index);
+        vec_t vec_X_ =
+            *reinterpret_cast<vec_t*>(const_cast<T*>(X_) + vec_index);
+
+#pragma unroll
+        for (int iv = 0; iv < VEC_SIZE; ++iv) {
+          sum1_vec[v] += static_cast<T_ACC>(vec_dY_[iv] * vec_X_[iv]);
+          sum2_vec[v] += static_cast<T_ACC>(vec_dY_[iv]);
+        }
+      }
+    }
+
+#pragma unroll
+    for (int v = 0; v < VEC_SIZE; ++v) {
+      sum1_vec[v] = GroupReduceSumWithoutBroadcast<T_ACC, SIMD>(
+          item, sum1_vec[v], ds_shared_);
+      sum2_vec[v] = GroupReduceSumWithoutBroadcast<T_ACC, SIMD>(
+          item, sum2_vec[v], db_shared_);
+    }
+
+    if (item.get_local_id(0) == 0) {
+      acc_vec_t ds_vec;
+      acc_vec_t db_vec;
+#pragma unroll
+      for (int v = 0; v < VEC_SIZE; ++v) {
+        ds_vec[v] = sum1_vec[v];
+        db_vec[v] = sum2_vec[v];
+      }
+      *(reinterpret_cast<acc_vec_t*>(ds_ + group_start)) = ds_vec;
+      *(reinterpret_cast<acc_vec_t*>(db_ + group_start)) = db_vec;
+    }
+  }
+
+  void sycl_ker_config_convention(sycl::handler& cgh) {
+    ds_shared_ =
+        sycl_local_acc_t<T_ACC>(get_group_reduce_group_size(SIMD), cgh);
+    db_shared_ =
+        sycl_local_acc_t<T_ACC>(get_group_reduce_group_size(SIMD), cgh);
+  }
+
+  ComputeInternalGradientsVectorizedFunctor(
+      int64_t HxW,
+      const T* dY,
+      const T* X,
+      T_ACC* ds,
+      T_ACC* db)
+      : HxW_(HxW), dY_(dY), X_(X), ds_(ds), db_(db) {}
+
+ private:
+  int64_t HxW_;
+  const T* dY_;
+  const T* X_;
+  T_ACC* ds_;
+  T_ACC* db_;
+  sycl_local_acc_t<T_ACC> ds_shared_;
+  sycl_local_acc_t<T_ACC> db_shared_;
+};
+
template <typename T, typename T_ACC>
struct GroupNormBackwardC1Functor {
  T_ACC operator()(T rstd, T gamma) const {
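For orientation: the functor added above computes, for each flattened (n, c) row of length HxW, the internal gradients ds = sum(dY * X) and db = sum(dY) in the accumulation type; the vectorization only changes how each row is traversed and stored. A minimal scalar host-side sketch of the same reduction (the function name internal_gradients_reference and the plain-float types are illustrative, not part of the patch):

// Scalar reference for the quantity each work-group produces: one ds/db
// pair per (n, c) row. Uses float throughout purely for illustration.
#include <cstdint>

void internal_gradients_reference(
    int64_t NC,       // N * C flattened rows
    int64_t HxW,      // row length
    const float* dY,  // gradient of the output, shape [NC, HxW]
    const float* X,   // input, shape [NC, HxW]
    float* ds,        // out: sum(dY * X) per row, shape [NC]
    float* db) {      // out: sum(dY) per row, shape [NC]
  for (int64_t nc = 0; nc < NC; ++nc) {
    float sum1 = 0.f; // corresponds to sum1_vec[v] in the kernel
    float sum2 = 0.f; // corresponds to sum2_vec[v] in the kernel
    for (int64_t hw = 0; hw < HxW; ++hw) {
      const int64_t i = nc * HxW + hw;
      sum1 += dY[i] * X[i];
      sum2 += dY[i];
    }
    ds[nc] = sum1;
    db[nc] = sum2;
  }
}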
@@ -1272,23 +1357,50 @@ void group_norm_backward_kernel_impl(
  }

  auto& queue = getCurrentSYCLQueue();
-
  int64_t simd = syclMaxSubGroupSize();
-  int64_t wg_size = HxW < get_group_reduce_group_size(simd)
-      ? simd
-      : get_group_reduce_group_size(simd);
-  group_norm_kernel_simd_choice_and_launch<
-      ComputeInternalGradientsFunctor<T, SIMD16>,
-      ComputeInternalGradientsFunctor<T, SIMD32>>(
-      simd,
-      sycl::range<1>(N * C * wg_size),
-      sycl::range<1>(wg_size),
-      queue,
-      HxW,
-      dY_data,
-      X_data,
-      ds_data,
-      db_data);
+
+  constexpr int VEC_SIZE = PREFERRED_VEC_SIZE;
+  int64_t wg_size = 0;
+
+  if (can_use_vectorization(dY_data, VEC_SIZE) &&
+      can_use_vectorization(X_data, VEC_SIZE) &&
+      can_use_vectorization(ds_data, VEC_SIZE) &&
+      can_use_vectorization(db_data, VEC_SIZE) && HxW % VEC_SIZE == 0 &&
+      (N * C) % VEC_SIZE == 0) {
+    using KernelS16T =
+        ComputeInternalGradientsVectorizedFunctor<T, SIMD16, VEC_SIZE>;
+    using KernelS32T =
+        ComputeInternalGradientsVectorizedFunctor<T, SIMD32, VEC_SIZE>;
+    wg_size = (HxW / VEC_SIZE) < get_group_reduce_group_size(simd)
+        ? simd
+        : get_group_reduce_group_size(simd);
+    group_norm_kernel_simd_choice_and_launch<KernelS16T, KernelS32T>(
+        simd,
+        sycl::range<1>((N * C / VEC_SIZE) * wg_size),
+        sycl::range<1>(wg_size),
+        queue,
+        HxW,
+        dY_data,
+        X_data,
+        ds_data,
+        db_data);
+  } else {
+    using KernelS16T = ComputeInternalGradientsFunctor<T, SIMD16>;
+    using KernelS32T = ComputeInternalGradientsFunctor<T, SIMD32>;
+    wg_size = HxW < get_group_reduce_group_size(simd)
+        ? simd
+        : get_group_reduce_group_size(simd);
+    group_norm_kernel_simd_choice_and_launch<KernelS16T, KernelS32T>(
+        simd,
+        sycl::range<1>(N * C * wg_size),
+        sycl::range<1>(wg_size),
+        queue,
+        HxW,
+        dY_data,
+        X_data,
+        ds_data,
+        db_data);
+  }

  if (dX.defined()) {
    Tensor c1 = at::empty({0}, X.options().dtype(kAccType));
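The vectorized branch above changes the launch geometry: each work-group now handles VEC_SIZE consecutive (n, c) rows, so the global range shrinks from N * C groups to N * C / VEC_SIZE groups of wg_size work-items, and the divisibility checks (HxW % VEC_SIZE == 0, (N * C) % VEC_SIZE == 0) keep every aligned_vector load and the final VEC_SIZE-wide ds/db store in bounds. A plausible sketch of the pointer test, assuming can_use_vectorization is essentially an alignment check against the vector width (the helper name aligned_for_vec below is hypothetical):

// Hypothetical stand-in for can_use_vectorization: the vectorized kernel
// reinterprets T* as aligned_vector<T, VEC_SIZE>*, so the base address must
// be a multiple of VEC_SIZE * sizeof(T) for those loads/stores to be legal.
#include <cstddef>
#include <cstdint>

template <typename T>
bool aligned_for_vec(const T* ptr, int vec_size) {
  return reinterpret_cast<std::uintptr_t>(ptr) % (vec_size * sizeof(T)) == 0;
}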
@@ -1373,8 +1485,8 @@ void group_norm_backward_kernel_impl(
      sycl_kernel_submit(sycl::range<1>(C), queue, caller);
    } else {
      // The algorithm for colwise reduction here is to accumulate each
-      // (subgroup_size) cols to a (subgroup_size^2) tile and write the tile to
-      // shared memory. Then do subgroup reduce for each col in the tile.
+      // (subgroup_size) cols to a (subgroup_size^2) tile and write the tile
+      // to shared memory. Then do subgroup reduce for each col in the tile.
      const int64_t kReduceTileSize = simd;
      const int64_t B = (C + kReduceTileSize - 1) / kReduceTileSize;
      auto global_range =
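The rewrapped comment above describes a two-phase tiled reduction: subgroup_size columns are first accumulated into a subgroup_size x subgroup_size tile in shared memory, and each column of the tile is then reduced by a subgroup. A serial sketch of that idea (plain C++ rather than SYCL; the names S, data, rows, and tiled_colwise_sum are illustrative, not from the kernel):

// Conceptual, single-threaded model of the tiled colwise reduction: phase 1
// folds every S-th row into an S x S tile (the shared-memory tile in the
// kernel); phase 2 reduces each of the S columns of that tile.
#include <array>
#include <cstdint>
#include <vector>

constexpr int S = 16; // stand-in for the subgroup size

std::array<float, S> tiled_colwise_sum(const std::vector<float>& data,
                                       int64_t rows) {
  // data is row-major with S columns: element (i, c) is data[i * S + c].
  std::array<std::array<float, S>, S> tile{};
  for (int r = 0; r < S; ++r) {   // phase 1: accumulate rows r, r+S, ... into tile row r
    for (int c = 0; c < S; ++c) {
      for (int64_t i = r; i < rows; i += S) {
        tile[r][c] += data[i * S + c];
      }
    }
  }
  std::array<float, S> col_sum{};
  for (int c = 0; c < S; ++c) {   // phase 2: reduce each tile column
    for (int r = 0; r < S; ++r) {
      col_sum[c] += tile[r][c];
    }
  }
  return col_sum;
}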