Skip to content

Commit fb512c4

Browse files
committed
[CodeStyle] Format all .cuh files
1 parent 275e2db commit fb512c4

File tree

20 files changed

+4551
-2953
lines changed

20 files changed

+4551
-2953
lines changed

.pre-commit-config.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ repos:
2626
- id: remove-crlf
2727
- id: remove-tabs
2828
name: Tabs remover (C++)
29-
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|xpu|kps)$
29+
files: \.(c|cc|cxx|cpp|cu|h|cuh|hpp|hxx|xpu|kps)$
3030
args: [--whitespaces-count, '2']
3131
- id: remove-tabs
3232
name: Tabs remover (Python)
@@ -35,15 +35,15 @@ repos:
3535
# Exclude some unit test files that require tabs.
3636
exclude: |
3737
(?x)^(
38-
test/dygraph_to_static/test_error.py
38+
test/dygraph_to_static/test_error.py
3939
)$
4040
- repo: local
4141
hooks:
4242
- id: copyright_checker
4343
name: copyright_checker
4444
entry: python ./tools/codestyle/copyright.py
4545
language: system
46-
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|xpu|kps|py|pyi|sh)$
46+
files: \.(c|cc|cxx|cpp|cu|h|cuh|hpp|hxx|proto|xpu|kps|py|pyi|sh)$
4747
exclude: |
4848
(?x)^(
4949
paddle/utils/.*|
@@ -72,7 +72,7 @@ repos:
7272
description: Format files with ClangFormat.
7373
entry: bash ./tools/codestyle/clang_format.sh -i
7474
language: system
75-
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|xpu|kps)$
75+
files: \.(c|cc|cxx|cpp|cu|h|cuh|hpp|hxx|xpu|kps)$
7676
- repo: local
7777
hooks:
7878
- id: cpplint-cpp-source

paddle/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh

Lines changed: 575 additions & 300 deletions
Large diffs are not rendered by default.

paddle/fluid/distributed/collective/deep_ep/kernels/buffer.cuh

Lines changed: 143 additions & 120 deletions
Original file line numberDiff line numberDiff line change
@@ -14,143 +14,166 @@
1414

1515
// The file has been adapted from DeepSeek DeepEP project
1616
// Copyright (c) 2025 DeepSeek
17-
// Licensed under the MIT License - https://github.com/deepseek-ai/DeepEP/blob/main/LICENSE
17+
// Licensed under the MIT License -
18+
// https://github.com/deepseek-ai/DeepEP/blob/main/LICENSE
1819

1920
#pragma once
20-
#include<cstdint>
21+
#include <cstdint>
2122
#include "paddle/fluid/distributed/collective/deep_ep/kernels/configs.cuh"
2223
#include "paddle/fluid/distributed/collective/deep_ep/kernels/exception.cuh"
2324

2425
namespace deep_ep {
2526

2627
template <typename dtype_t>
2728
struct Buffer {
28-
private:
29-
uint8_t* ptr;
30-
31-
public:
32-
int total_bytes;
33-
34-
__device__ __forceinline__ Buffer() : ptr(nullptr), total_bytes(0) {}
35-
36-
__device__ __forceinline__ Buffer(void* &gbl_ptr, int num_elems, int offset = 0) {
37-
total_bytes = num_elems * sizeof(dtype_t);
38-
ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + offset * sizeof(dtype_t);
39-
gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
40-
}
41-
42-
__device__ __forceinline__ Buffer advance_also(void* &gbl_ptr) {
43-
gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
44-
return *this;
45-
}
46-
47-
__device__ __forceinline__ dtype_t* buffer() {
48-
return reinterpret_cast<dtype_t*>(ptr);
49-
}
50-
51-
__device__ __forceinline__ dtype_t& operator[](int idx) {
52-
return buffer()[idx];
53-
}
29+
private:
30+
uint8_t* ptr;
31+
32+
public:
33+
int total_bytes;
34+
35+
__device__ __forceinline__ Buffer() : ptr(nullptr), total_bytes(0) {}
36+
37+
__device__ __forceinline__ Buffer(void*& gbl_ptr,
38+
int num_elems,
39+
int offset = 0) {
40+
total_bytes = num_elems * sizeof(dtype_t);
41+
ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + offset * sizeof(dtype_t);
42+
gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
43+
}
44+
45+
__device__ __forceinline__ Buffer advance_also(void*& gbl_ptr) {
46+
gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
47+
return *this;
48+
}
49+
50+
__device__ __forceinline__ dtype_t* buffer() {
51+
return reinterpret_cast<dtype_t*>(ptr);
52+
}
53+
54+
__device__ __forceinline__ dtype_t& operator[](int idx) {
55+
return buffer()[idx];
56+
}
5457
};
5558

5659
template <typename dtype_t, int kNumRanks = 1>
5760
struct AsymBuffer {
58-
private:
59-
uint8_t* ptrs[kNumRanks];
60-
int num_bytes;
61-
62-
public:
63-
int total_bytes;
64-
65-
__device__ __forceinline__ AsymBuffer(void* &gbl_ptr, int num_elems, int num_ranks,
66-
int sm_id = 0, int num_sms = 1, int offset = 0) {
67-
EP_STATIC_ASSERT(kNumRanks == 1, "");
68-
num_bytes = num_elems * sizeof(dtype_t);
69-
70-
int per_channel_bytes = num_bytes * num_ranks;
71-
total_bytes = per_channel_bytes * num_sms;
72-
ptrs[0] = reinterpret_cast<uint8_t*>(gbl_ptr) + per_channel_bytes * sm_id + num_bytes * offset;
73-
gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
74-
}
75-
76-
__device__ __forceinline__ AsymBuffer(void** gbl_ptrs, int num_elems, int num_ranks,
77-
int sm_id = 0, int num_sms = 1, int offset = 0) {
78-
EP_STATIC_ASSERT(kNumRanks > 1, "");
79-
num_bytes = num_elems * sizeof(dtype_t);
80-
81-
int per_channel_bytes = num_bytes * num_ranks;
82-
total_bytes = per_channel_bytes * num_sms;
83-
for (int i = 0; i < kNumRanks; ++ i) {
84-
ptrs[i] = reinterpret_cast<uint8_t*>(gbl_ptrs[i]) + per_channel_bytes * sm_id + num_bytes * offset;
85-
gbl_ptrs[i] = reinterpret_cast<uint8_t*>(gbl_ptrs[i]) + total_bytes;
86-
}
87-
}
88-
89-
__device__ __forceinline__ void advance(int shift) {
90-
#pragma unroll
91-
for (int i = 0; i < kNumRanks; ++ i)
92-
ptrs[i] = ptrs[i] + shift * sizeof(dtype_t);
93-
}
94-
95-
__device__ __forceinline__ AsymBuffer advance_also(void* &gbl_ptr) {
96-
gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
97-
return *this;
98-
}
99-
100-
template<int kNumAlsoRanks>
101-
__device__ __forceinline__ AsymBuffer advance_also(void** gbl_ptrs) {
102-
for (int i = 0; i < kNumAlsoRanks; ++ i)
103-
gbl_ptrs[i] = reinterpret_cast<uint8_t*>(gbl_ptrs[i]) + total_bytes;
104-
return *this;
105-
}
106-
107-
__device__ __forceinline__ dtype_t* buffer(int idx = 0) {
108-
EP_STATIC_ASSERT(kNumRanks == 1, "`buffer` is only available for single rank case");
109-
return reinterpret_cast<dtype_t*>(ptrs[0] + num_bytes * idx);
110-
}
111-
112-
__device__ __forceinline__ dtype_t* buffer_by(int rank_idx, int idx = 0) {
113-
EP_STATIC_ASSERT(kNumRanks > 1, "`buffer` is only available for single rank case");
114-
return reinterpret_cast<dtype_t*>(ptrs[rank_idx] + num_bytes * idx);
61+
private:
62+
uint8_t* ptrs[kNumRanks];
63+
int num_bytes;
64+
65+
public:
66+
int total_bytes;
67+
68+
__device__ __forceinline__ AsymBuffer(void*& gbl_ptr,
69+
int num_elems,
70+
int num_ranks,
71+
int sm_id = 0,
72+
int num_sms = 1,
73+
int offset = 0) {
74+
EP_STATIC_ASSERT(kNumRanks == 1, "");
75+
num_bytes = num_elems * sizeof(dtype_t);
76+
77+
int per_channel_bytes = num_bytes * num_ranks;
78+
total_bytes = per_channel_bytes * num_sms;
79+
ptrs[0] = reinterpret_cast<uint8_t*>(gbl_ptr) + per_channel_bytes * sm_id +
80+
num_bytes * offset;
81+
gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
82+
}
83+
84+
__device__ __forceinline__ AsymBuffer(void** gbl_ptrs,
85+
int num_elems,
86+
int num_ranks,
87+
int sm_id = 0,
88+
int num_sms = 1,
89+
int offset = 0) {
90+
EP_STATIC_ASSERT(kNumRanks > 1, "");
91+
num_bytes = num_elems * sizeof(dtype_t);
92+
93+
int per_channel_bytes = num_bytes * num_ranks;
94+
total_bytes = per_channel_bytes * num_sms;
95+
for (int i = 0; i < kNumRanks; ++i) {
96+
ptrs[i] = reinterpret_cast<uint8_t*>(gbl_ptrs[i]) +
97+
per_channel_bytes * sm_id + num_bytes * offset;
98+
gbl_ptrs[i] = reinterpret_cast<uint8_t*>(gbl_ptrs[i]) + total_bytes;
11599
}
100+
}
101+
102+
__device__ __forceinline__ void advance(int shift) {
103+
#pragma unroll
104+
for (int i = 0; i < kNumRanks; ++i)
105+
ptrs[i] = ptrs[i] + shift * sizeof(dtype_t);
106+
}
107+
108+
__device__ __forceinline__ AsymBuffer advance_also(void*& gbl_ptr) {
109+
gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
110+
return *this;
111+
}
112+
113+
template <int kNumAlsoRanks>
114+
__device__ __forceinline__ AsymBuffer advance_also(void** gbl_ptrs) {
115+
for (int i = 0; i < kNumAlsoRanks; ++i)
116+
gbl_ptrs[i] = reinterpret_cast<uint8_t*>(gbl_ptrs[i]) + total_bytes;
117+
return *this;
118+
}
119+
120+
__device__ __forceinline__ dtype_t* buffer(int idx = 0) {
121+
EP_STATIC_ASSERT(kNumRanks == 1,
122+
"`buffer` is only available for single rank case");
123+
return reinterpret_cast<dtype_t*>(ptrs[0] + num_bytes * idx);
124+
}
125+
126+
__device__ __forceinline__ dtype_t* buffer_by(int rank_idx, int idx = 0) {
127+
EP_STATIC_ASSERT(kNumRanks > 1,
128+
"`buffer` is only available for single rank case");
129+
return reinterpret_cast<dtype_t*>(ptrs[rank_idx] + num_bytes * idx);
130+
}
116131
};
117132

118133
template <typename dtype_t, bool kDecoupled = true>
119134
struct SymBuffer {
120-
private:
121-
// NOTES: for non-decoupled case, `recv_ptr` is not used
122-
uint8_t* send_ptr;
123-
uint8_t* recv_ptr;
124-
int num_bytes;
125-
126-
public:
127-
int total_bytes;
128-
129-
__device__ __forceinline__ SymBuffer(void* &gbl_ptr, int num_elems, int num_ranks,
130-
int sm_id = 0, int num_sms = 1) {
131-
num_bytes = num_elems * sizeof(dtype_t);
132-
133-
int per_channel_bytes = num_bytes * num_ranks;
134-
total_bytes = per_channel_bytes * num_sms * (static_cast<int>(kDecoupled) + 1);
135-
send_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + per_channel_bytes * sm_id;
136-
recv_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + per_channel_bytes * (sm_id + num_sms);
137-
gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
138-
}
139-
140-
__device__ __forceinline__ dtype_t* send_buffer(int idx = 0) {
141-
EP_STATIC_ASSERT(kDecoupled, "`send_buffer` is only available for non-decoupled case");
142-
return reinterpret_cast<dtype_t*>(send_ptr + num_bytes * idx);
143-
}
144-
145-
__device__ __forceinline__ dtype_t* recv_buffer(int idx = 0) {
146-
EP_STATIC_ASSERT(kDecoupled, "`recv_buffer` is only available for non-decoupled case");
147-
return reinterpret_cast<dtype_t*>(recv_ptr + num_bytes * idx);
148-
}
149-
150-
__device__ __forceinline__ dtype_t* buffer(int idx = 0) {
151-
EP_STATIC_ASSERT(not kDecoupled, "`buffer` is only available for decoupled case");
152-
return reinterpret_cast<dtype_t*>(send_ptr + num_bytes * idx);
153-
}
135+
private:
136+
// NOTES: for non-decoupled case, `recv_ptr` is not used
137+
uint8_t* send_ptr;
138+
uint8_t* recv_ptr;
139+
int num_bytes;
140+
141+
public:
142+
int total_bytes;
143+
144+
__device__ __forceinline__ SymBuffer(void*& gbl_ptr,
145+
int num_elems,
146+
int num_ranks,
147+
int sm_id = 0,
148+
int num_sms = 1) {
149+
num_bytes = num_elems * sizeof(dtype_t);
150+
151+
int per_channel_bytes = num_bytes * num_ranks;
152+
total_bytes =
153+
per_channel_bytes * num_sms * (static_cast<int>(kDecoupled) + 1);
154+
send_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + per_channel_bytes * sm_id;
155+
recv_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) +
156+
per_channel_bytes * (sm_id + num_sms);
157+
gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
158+
}
159+
160+
__device__ __forceinline__ dtype_t* send_buffer(int idx = 0) {
161+
EP_STATIC_ASSERT(kDecoupled,
162+
"`send_buffer` is only available for non-decoupled case");
163+
return reinterpret_cast<dtype_t*>(send_ptr + num_bytes * idx);
164+
}
165+
166+
__device__ __forceinline__ dtype_t* recv_buffer(int idx = 0) {
167+
EP_STATIC_ASSERT(kDecoupled,
168+
"`recv_buffer` is only available for non-decoupled case");
169+
return reinterpret_cast<dtype_t*>(recv_ptr + num_bytes * idx);
170+
}
171+
172+
__device__ __forceinline__ dtype_t* buffer(int idx = 0) {
173+
EP_STATIC_ASSERT(not kDecoupled,
174+
"`buffer` is only available for decoupled case");
175+
return reinterpret_cast<dtype_t*>(send_ptr + num_bytes * idx);
176+
}
154177
};
155178

156-
} // namespace deep_ep
179+
} // namespace deep_ep

paddle/fluid/distributed/collective/deep_ep/kernels/configs.cuh

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@
1414

1515
// The file has been adapted from DeepSeek DeepEP project
1616
// Copyright (c) 2025 DeepSeek
17-
// Licensed under the MIT License - https://github.com/deepseek-ai/DeepEP/blob/main/LICENSE
17+
// Licensed under the MIT License -
18+
// https://github.com/deepseek-ai/DeepEP/blob/main/LICENSE
1819

1920
#pragma once
2021

@@ -27,17 +28,20 @@
2728

2829
#define FINISHED_SUM_TAG 1024
2930
#define NUM_CPU_TIMEOUT_SECS 100
30-
#define NUM_TIMEOUT_CYCLES 200000000000ull // 200G cycles ~= 100s
31+
#define NUM_TIMEOUT_CYCLES 200000000000ull // 200G cycles ~= 100s
3132
#define NUM_WAIT_NANOSECONDS 500
3233

3334
#define LOW_LATENCY_SEND_PHASE 1
3435
#define LOW_LATENCY_RECV_PHASE 2
3536

3637
// Make CLion CUDA indexing work
3738
#ifdef __CLION_IDE__
38-
#define __CUDA_ARCH__ 900 // NOLINT(*-reserved-identifier)
39-
#define __CUDACC_RDC__ // NOLINT(*-reserved-identifier)
40-
__host__ __device__ __forceinline__ void host_device_printf(const char* format, ...) { asm volatile("trap;"); }
39+
#define __CUDA_ARCH__ 900 // NOLINT(*-reserved-identifier)
40+
#define __CUDACC_RDC__ // NOLINT(*-reserved-identifier)
41+
__host__ __device__ __forceinline__ void host_device_printf(const char* format,
42+
...) {
43+
asm volatile("trap;");
44+
}
4145
#define printf host_device_printf
4246
#endif
4347

0 commit comments

Comments
 (0)