Skip to content

Commit 0e6f7fb

Browse files
[Cherry-pick Fleety_12] Fix Bigtensor (PaddlePaddle#76363) (PaddlePaddle#76371)
* big tensor: moe_permute/moe_unpermute/repeat_interleave/fused_transpose_wlch_split_quant
* fix
* fix int64
* fix int64 to int
1 parent b314b42 commit 0e6f7fb

File tree

4 files changed

+53
-18
lines changed

4 files changed

+53
-18
lines changed

paddle/phi/kernels/fusion/gpu/fused_transpose_wlch_split_quant_kernel.cu

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -100,8 +100,8 @@ __global__ void __launch_bounds__(512)
100100
reinterpret_cast<__nv_fp8_e4m3**>(meta + num_experts);
101101
float** scale_ptrs = reinterpret_cast<float**>(meta + num_experts * 2);
102102

103-
const size_t block_off_x = blockIdx.x * size_t(128);
104-
const size_t block_off_y = blockIdx.y * 128;
103+
const size_t block_off_x = static_cast<size_t>(blockIdx.x) * 128;
104+
const size_t block_off_y = static_cast<size_t>(blockIdx.y) * 128;
105105

106106
// 1. Load 128x128 block from input.
107107
for (uint32_t i = 0; i < 8; i++) {
@@ -156,7 +156,7 @@ __global__ void __launch_bounds__(512)
156156
off = (off / 64) * 64 + (off % 2) * 32 + (off % 64) / 2;
157157
}
158158
float scale_out = 1.0f / col_scale[off];
159-
size_t idx_y = blockIdx.x - expert_off / 128;
159+
size_t idx_y = static_cast<size_t>(blockIdx.x) - expert_off / 128;
160160
size_t idx_x = block_off_y + threadIdx.y * 32 + threadIdx.x;
161161
size_t idx = idx_y * H + idx_x;
162162
if (idx_x < H) {

paddle/phi/kernels/gpu/moe_permute_kernel.cu

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -245,8 +245,20 @@ void MoePermuteKernel(const Context &dev_ctx,
245245
DenseTensor *zipped_expertwise_rowmap,
246246
DenseTensor *token_prob_unzipped,
247247
DenseTensor *XScale_unzipped) {
248-
const int rows = X.dims()[0];
249-
const int cols = X.dims()[1];
248+
const int64_t rows = X.dims()[0];
249+
const int64_t cols = X.dims()[1];
250+
PADDLE_ENFORCE_LE(
251+
rows,
252+
std::numeric_limits<int32_t>::max(),
253+
common::errors::InvalidArgument("X.dims()[0] should be less than "
254+
"INT_MAX, received X.dims()[0]: (%ld)",
255+
rows));
256+
PADDLE_ENFORCE_LE(
257+
cols,
258+
std::numeric_limits<int32_t>::max(),
259+
common::errors::InvalidArgument("X.dims()[1] should be less than "
260+
"INT_MAX, received X.dims()[1]: (%ld)",
261+
cols));
250262
PADDLE_ENFORCE_LE(
251263
num_experts,
252264
MAX_NUM_EXPERTS,
@@ -256,7 +268,13 @@ void MoePermuteKernel(const Context &dev_ctx,
256268
"value.",
257269
MAX_NUM_EXPERTS,
258270
num_experts));
259-
const int quanted_cols = (XScale) ? XScale.get_ptr()->dims()[1] : 0;
271+
const int64_t quanted_cols = (XScale) ? XScale.get_ptr()->dims()[1] : 0;
272+
PADDLE_ENFORCE_LE(
273+
quanted_cols,
274+
std::numeric_limits<int32_t>::max(),
275+
common::errors::InvalidArgument("quanted_cols should be less than "
276+
"INT_MAX, received quanted_cols: (%ld)",
277+
quanted_cols));
260278

261279
// Expert base offset initialization, tensor numeric range [0, max_token_num]
262280
int expert_offset[MAX_NUM_EXPERTS];
@@ -281,7 +299,12 @@ void MoePermuteKernel(const Context &dev_ctx,
281299
dev_ctx.stream()));
282300
// ------------------- resource allocate -------------------------
283301
const int output_rows = tokens_cumulated;
284-
const int topk = expert_routemap_topk.dims()[1];
302+
const int64_t topk = expert_routemap_topk.dims()[1];
303+
PADDLE_ENFORCE_LE(
304+
topk,
305+
std::numeric_limits<int32_t>::max(),
306+
common::errors::InvalidArgument(
307+
"topk should be less than INT_MAX, received topk: (%ld)", topk));
285308
token_prob_unzipped->Resize({output_rows});
286309
if (do_gather) { // no gather, no resize.
287310
X_unzipped->Resize({output_rows, cols});
@@ -346,11 +369,11 @@ void MoePermuteKernel(const Context &dev_ctx,
346369
token_prob_unzipped,
347370
XScale_unzipped,
348371
&global_expertwise_block_cumsum,
349-
rows,
350-
cols,
351-
topk,
372+
static_cast<int>(rows),
373+
static_cast<int>(cols),
374+
static_cast<int>(topk),
352375
num_experts,
353-
quanted_cols,
376+
static_cast<int>(quanted_cols),
354377
do_gather);
355378
}
356379
#undef CUMSUM_BLOCK_SIZE

paddle/phi/kernels/gpu/moe_unpermute_kernel.cu

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -226,8 +226,13 @@ void MoeUnpermuteKernel(const Context &dev_ctx,
226226
const bool MP,
227227
DenseTensor *zipped_tokens,
228228
DenseTensor *zipped_probs_topk) {
229-
const int rows = unzipped_tokens.dims()[0];
230-
const int cols = unzipped_tokens.dims()[1];
229+
const int64_t cols = unzipped_tokens.dims()[1];
230+
PADDLE_ENFORCE_LE(cols,
231+
std::numeric_limits<int32_t>::max(),
232+
common::errors::InvalidArgument(
233+
"unzipped_tokens.dims()[1] should be less than "
234+
"INT_MAX, received unzipped_tokens.dims()[1]: (%ld)",
235+
cols));
231236
PADDLE_ENFORCE_LE(
232237
num_experts,
233238
MAX_NUM_EXPERTS,
@@ -237,7 +242,12 @@ void MoeUnpermuteKernel(const Context &dev_ctx,
237242
"value.",
238243
MAX_NUM_EXPERTS,
239244
num_experts));
240-
const int topk = expert_routemap_topk.dims()[1];
245+
const int64_t topk = expert_routemap_topk.dims()[1];
246+
PADDLE_ENFORCE_LE(
247+
topk,
248+
std::numeric_limits<int32_t>::max(),
249+
common::errors::InvalidArgument(
250+
"topk should be less than INT_MAX, received topk: (%ld)", topk));
241251
dev_ctx.template Alloc<T>(zipped_tokens);
242252
dev_ctx.template Alloc<float>(zipped_probs_topk);
243253
if (unzipped_tokens.numel() == 0) return; // 0-size tensor
@@ -258,8 +268,8 @@ void MoeUnpermuteKernel(const Context &dev_ctx,
258268
zipped_probs_topk,
259269
total_zipped_tokens_num,
260270
num_experts,
261-
cols,
262-
topk,
271+
static_cast<int>(cols),
272+
static_cast<int>(topk),
263273
MP);
264274
}
265275
} // namespace phi

paddle/phi/kernels/gpu/repeat_interleave_kernel.cu

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,8 @@ __global__ void index_select_cuda_kernel(const T* input,
3737
int64_t stride,
3838
int64_t size,
3939
int64_t delta) {
40-
const int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
40+
const int64_t idx =
41+
static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
4142
if (idx >= N) {
4243
return;
4344
}
@@ -198,7 +199,8 @@ __global__ void RepeatInterleaveVecKernel(const T* __restrict__ input,
198199
const int repeats) {
199200
using VecType = kps::details::VectorType<T, VecSize>;
200201

201-
const int64_t tid = (blockIdx.x * blockDim.x + threadIdx.x) * VecSize;
202+
const int64_t tid =
203+
(static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x) * VecSize;
202204
if (tid >= numel) return;
203205

204206
VecType* vec_output = reinterpret_cast<VecType*>(output);

0 commit comments

Comments (0)