|
14 | 14 |
|
15 | 15 | // The file has been adapted from DeepSeek DeepEP project |
16 | 16 | // Copyright (c) 2025 DeepSeek |
17 | | -// Licensed under the MIT License - https://github.com/deepseek-ai/DeepEP/blob/main/LICENSE |
| 17 | +// Licensed under the MIT License - |
| 18 | +// https://github.com/deepseek-ai/DeepEP/blob/main/LICENSE |
18 | 19 |
|
19 | 20 | #pragma once |
20 | | -#include<cstdint> |
| 21 | +#include <cstdint> |
21 | 22 | #include "paddle/fluid/distributed/collective/deep_ep/kernels/configs.cuh" |
22 | 23 | #include "paddle/fluid/distributed/collective/deep_ep/kernels/exception.cuh" |
23 | 24 |
|
24 | 25 | namespace deep_ep { |
25 | 26 |
|
// A typed, byte-addressed slice carved out of a linear device arena.
// Construction consumes `num_elems * sizeof(dtype_t)` bytes from the arena
// pointer and advances it, so successive Buffer constructions partition the
// arena front-to-back.
template <typename dtype_t>
struct Buffer {
 private:
  // Raw byte pointer to the start of this slice (nullptr for an empty buffer).
  uint8_t* ptr;

 public:
  // Size of this slice in bytes: num_elems * sizeof(dtype_t).
  int total_bytes;

  // Empty buffer: no storage, zero size.
  __device__ __forceinline__ Buffer() : ptr(nullptr), total_bytes(0) {}

  // Carve `num_elems` elements out of the arena at `gbl_ptr`, starting
  // `offset` elements past the arena cursor, then bump `gbl_ptr` past the
  // whole slice so the next allocation follows this one.
  __device__ __forceinline__ Buffer(void*& gbl_ptr,
                                    int num_elems,
                                    int offset = 0) {
    total_bytes = num_elems * sizeof(dtype_t);
    auto* base = reinterpret_cast<uint8_t*>(gbl_ptr);
    ptr = base + offset * sizeof(dtype_t);
    gbl_ptr = base + total_bytes;
  }

  // Advance another arena pointer by this buffer's size without re-slicing;
  // returns a copy of *this so calls can be chained.
  __device__ __forceinline__ Buffer advance_also(void*& gbl_ptr) {
    gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
    return *this;
  }

  // Typed view of the underlying bytes.
  __device__ __forceinline__ dtype_t* buffer() {
    return reinterpret_cast<dtype_t*>(ptr);
  }

  // Element access through the typed view (no bounds checking).
  __device__ __forceinline__ dtype_t& operator[](int idx) {
    return buffer()[idx];
  }
};
55 | 58 |
|
// Per-rank, per-SM ("channel") slices carved out of one or more device arenas.
// Layout within an arena: num_sms channels, each channel holding num_ranks
// slots of num_bytes. The single-rank specialization (kNumRanks == 1) slices
// one arena; the multi-rank specialization slices one arena per rank.
template <typename dtype_t, int kNumRanks = 1>
struct AsymBuffer {
 private:
  // Base pointer into each rank's arena, already offset to this SM's channel.
  uint8_t* ptrs[kNumRanks];
  // Bytes occupied by one slot: num_elems * sizeof(dtype_t).
  int num_bytes;

 public:
  // Total bytes consumed from each arena: num_bytes * num_ranks * num_sms.
  int total_bytes;

  // Single-rank constructor: slice `num_elems` elements per rank per SM out
  // of the arena at `gbl_ptr`, point at SM `sm_id`'s channel (plus `offset`
  // slots), and advance `gbl_ptr` past the whole allocation.
  __device__ __forceinline__ AsymBuffer(void*& gbl_ptr,
                                        int num_elems,
                                        int num_ranks,
                                        int sm_id = 0,
                                        int num_sms = 1,
                                        int offset = 0) {
    EP_STATIC_ASSERT(kNumRanks == 1, "");
    num_bytes = num_elems * sizeof(dtype_t);

    int per_channel_bytes = num_bytes * num_ranks;
    total_bytes = per_channel_bytes * num_sms;
    ptrs[0] = reinterpret_cast<uint8_t*>(gbl_ptr) + per_channel_bytes * sm_id +
              num_bytes * offset;
    gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
  }

  // Multi-rank constructor: same layout, but slices one arena per rank from
  // `gbl_ptrs` and advances every arena pointer past the allocation.
  __device__ __forceinline__ AsymBuffer(void** gbl_ptrs,
                                        int num_elems,
                                        int num_ranks,
                                        int sm_id = 0,
                                        int num_sms = 1,
                                        int offset = 0) {
    EP_STATIC_ASSERT(kNumRanks > 1, "");
    num_bytes = num_elems * sizeof(dtype_t);

    int per_channel_bytes = num_bytes * num_ranks;
    total_bytes = per_channel_bytes * num_sms;
    for (int i = 0; i < kNumRanks; ++i) {
      ptrs[i] = reinterpret_cast<uint8_t*>(gbl_ptrs[i]) +
                per_channel_bytes * sm_id + num_bytes * offset;
      gbl_ptrs[i] = reinterpret_cast<uint8_t*>(gbl_ptrs[i]) + total_bytes;
    }
  }

  // Shift every rank pointer forward by `shift` elements.
  __device__ __forceinline__ void advance(int shift) {
#pragma unroll
    for (int i = 0; i < kNumRanks; ++i)
      ptrs[i] = ptrs[i] + shift * sizeof(dtype_t);
  }

  // Advance an unrelated arena pointer by this buffer's size; returns a copy
  // of *this so calls can be chained.
  __device__ __forceinline__ AsymBuffer advance_also(void*& gbl_ptr) {
    gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
    return *this;
  }

  // Advance `kNumAlsoRanks` arena pointers by this buffer's size.
  template <int kNumAlsoRanks>
  __device__ __forceinline__ AsymBuffer advance_also(void** gbl_ptrs) {
    for (int i = 0; i < kNumAlsoRanks; ++i)
      gbl_ptrs[i] = reinterpret_cast<uint8_t*>(gbl_ptrs[i]) + total_bytes;
    return *this;
  }

  // Typed pointer to slot `idx` within this SM's channel (single-rank only).
  __device__ __forceinline__ dtype_t* buffer(int idx = 0) {
    EP_STATIC_ASSERT(kNumRanks == 1,
                     "`buffer` is only available for single rank case");
    return reinterpret_cast<dtype_t*>(ptrs[0] + num_bytes * idx);
  }

  // Typed pointer to slot `idx` of rank `rank_idx` (multi-rank only).
  // Fixed the copy-pasted diagnostic: the assert requires kNumRanks > 1,
  // so the message must name `buffer_by` and the multi-rank case.
  __device__ __forceinline__ dtype_t* buffer_by(int rank_idx, int idx = 0) {
    EP_STATIC_ASSERT(kNumRanks > 1,
                     "`buffer_by` is only available for multi-rank case");
    return reinterpret_cast<dtype_t*>(ptrs[rank_idx] + num_bytes * idx);
  }
};
117 | 132 |
|
// A symmetric send/recv buffer carved out of a device arena.
// kDecoupled == true: separate send and recv regions (2 * num_sms channels);
// kDecoupled == false: a single shared region accessed via `buffer()`.
template <typename dtype_t, bool kDecoupled = true>
struct SymBuffer {
 private:
  // NOTES: for non-decoupled case, `recv_ptr` is not used
  uint8_t* send_ptr;
  uint8_t* recv_ptr;
  // Bytes occupied by one slot: num_elems * sizeof(dtype_t).
  int num_bytes;

 public:
  // Total bytes consumed from the arena:
  // num_bytes * num_ranks * num_sms * (kDecoupled ? 2 : 1).
  int total_bytes;

  // Slice send (and, when decoupled, recv) channels for SM `sm_id` out of
  // the arena at `gbl_ptr`, then advance `gbl_ptr` past the allocation.
  __device__ __forceinline__ SymBuffer(void*& gbl_ptr,
                                       int num_elems,
                                       int num_ranks,
                                       int sm_id = 0,
                                       int num_sms = 1) {
    num_bytes = num_elems * sizeof(dtype_t);

    int per_channel_bytes = num_bytes * num_ranks;
    total_bytes =
        per_channel_bytes * num_sms * (static_cast<int>(kDecoupled) + 1);
    send_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + per_channel_bytes * sm_id;
    // Recv region sits after all num_sms send channels; meaningful only
    // when kDecoupled is true.
    recv_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) +
               per_channel_bytes * (sm_id + num_sms);
    gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
  }

  // Typed pointer to send slot `idx`. Requires kDecoupled == true, so the
  // diagnostic must name the decoupled case (message was inverted before).
  __device__ __forceinline__ dtype_t* send_buffer(int idx = 0) {
    EP_STATIC_ASSERT(kDecoupled,
                     "`send_buffer` is only available for decoupled case");
    return reinterpret_cast<dtype_t*>(send_ptr + num_bytes * idx);
  }

  // Typed pointer to recv slot `idx`. Requires kDecoupled == true (message
  // fixed to match the asserted condition).
  __device__ __forceinline__ dtype_t* recv_buffer(int idx = 0) {
    EP_STATIC_ASSERT(kDecoupled,
                     "`recv_buffer` is only available for decoupled case");
    return reinterpret_cast<dtype_t*>(recv_ptr + num_bytes * idx);
  }

  // Typed pointer to the shared slot `idx`. Requires kDecoupled == false
  // (message fixed to match the asserted condition).
  __device__ __forceinline__ dtype_t* buffer(int idx = 0) {
    EP_STATIC_ASSERT(not kDecoupled,
                     "`buffer` is only available for non-decoupled case");
    return reinterpret_cast<dtype_t*>(send_ptr + num_bytes * idx);
  }
};
155 | 178 |
|
156 | | -} // namespace deep_ep |
| 179 | +} // namespace deep_ep |