|
14 | 14 |
|
15 | 15 | // The file has been adapted from DeepSeek DeepEP project |
16 | 16 | // Copyright (c) 2025 DeepSeek |
17 | | -// Licensed under the MIT License - https://github.com/deepseek-ai/DeepEP/blob/main/LICENSE |
| 17 | +// Licensed under the MIT License - |
| 18 | +// https://github.com/deepseek-ai/DeepEP/blob/main/LICENSE |
18 | 19 |
|
19 | 20 | #pragma once |
20 | | -#include<cstdint> |
| 21 | +#include <cstdint> |
21 | 22 | #include "paddle/fluid/distributed/collective/deep_ep/kernels/configs.cuh" |
22 | 23 | #include "paddle/fluid/distributed/collective/deep_ep/kernels/exception.cuh" |
23 | 24 |
|
24 | 25 | namespace deep_ep { |
25 | 26 |
|
// A typed, byte-addressed slice carved out of a linear device arena.
// Construction consumes `num_elems * sizeof(dtype_t)` bytes from the arena
// pointer and advances it, so successive Buffer constructions partition the
// arena front-to-back.
template <typename dtype_t>
struct Buffer {
 private:
  // Raw byte pointer to the start of this slice (nullptr for an empty buffer).
  uint8_t* ptr;

 public:
  // Size of this slice in bytes: num_elems * sizeof(dtype_t).
  int total_bytes;

  // Empty buffer: no storage, zero size.
  __device__ __forceinline__ Buffer() : ptr(nullptr), total_bytes(0) {}

  // Carve `num_elems` elements out of the arena at `gbl_ptr`, starting
  // `offset` elements past the arena cursor, then bump `gbl_ptr` past the
  // whole slice so the next allocation follows this one.
  __device__ __forceinline__ Buffer(void*& gbl_ptr,
                                    int num_elems,
                                    int offset = 0) {
    total_bytes = num_elems * sizeof(dtype_t);
    auto* base = reinterpret_cast<uint8_t*>(gbl_ptr);
    ptr = base + offset * sizeof(dtype_t);
    gbl_ptr = base + total_bytes;
  }

  // Advance another arena pointer by this buffer's size without re-slicing;
  // returns a copy of *this so calls can be chained.
  __device__ __forceinline__ Buffer advance_also(void*& gbl_ptr) {
    gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
    return *this;
  }

  // Typed view of the underlying bytes.
  __device__ __forceinline__ dtype_t* buffer() {
    return reinterpret_cast<dtype_t*>(ptr);
  }

  // Element access through the typed view (no bounds checking).
  __device__ __forceinline__ dtype_t& operator[](int idx) {
    return buffer()[idx];
  }
};
55 | 58 |
|
// Per-rank, per-SM ("channel") slices carved out of one or more device arenas.
// Layout within an arena: num_sms channels, each channel holding num_ranks
// slots of num_bytes. The single-rank specialization (kNumRanks == 1) slices
// one arena; the multi-rank specialization slices one arena per rank.
template <typename dtype_t, int kNumRanks = 1>
struct AsymBuffer {
 private:
  // Base pointer into each rank's arena, already offset to this SM's channel.
  uint8_t* ptrs[kNumRanks];
  // Bytes occupied by one slot: num_elems * sizeof(dtype_t).
  int num_bytes;

 public:
  // Total bytes consumed from each arena: num_bytes * num_ranks * num_sms.
  int total_bytes;

  // Single-rank constructor: slice `num_elems` elements per rank per SM out
  // of the arena at `gbl_ptr`, point at SM `sm_id`'s channel (plus `offset`
  // slots), and advance `gbl_ptr` past the whole allocation.
  __device__ __forceinline__ AsymBuffer(void*& gbl_ptr,
                                        int num_elems,
                                        int num_ranks,
                                        int sm_id = 0,
                                        int num_sms = 1,
                                        int offset = 0) {
    EP_STATIC_ASSERT(kNumRanks == 1, "");
    num_bytes = num_elems * sizeof(dtype_t);

    int per_channel_bytes = num_bytes * num_ranks;
    total_bytes = per_channel_bytes * num_sms;
    ptrs[0] = reinterpret_cast<uint8_t*>(gbl_ptr) + per_channel_bytes * sm_id +
              num_bytes * offset;
    gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
  }

  // Multi-rank constructor: same layout, but slices one arena per rank from
  // `gbl_ptrs` and advances every arena pointer past the allocation.
  __device__ __forceinline__ AsymBuffer(void** gbl_ptrs,
                                        int num_elems,
                                        int num_ranks,
                                        int sm_id = 0,
                                        int num_sms = 1,
                                        int offset = 0) {
    EP_STATIC_ASSERT(kNumRanks > 1, "");
    num_bytes = num_elems * sizeof(dtype_t);

    int per_channel_bytes = num_bytes * num_ranks;
    total_bytes = per_channel_bytes * num_sms;
    for (int i = 0; i < kNumRanks; ++i) {
      ptrs[i] = reinterpret_cast<uint8_t*>(gbl_ptrs[i]) +
                per_channel_bytes * sm_id + num_bytes * offset;
      gbl_ptrs[i] = reinterpret_cast<uint8_t*>(gbl_ptrs[i]) + total_bytes;
    }
  }

  // Shift every rank pointer forward by `shift` elements.
  __device__ __forceinline__ void advance(int shift) {
#pragma unroll
    for (int i = 0; i < kNumRanks; ++i)
      ptrs[i] = ptrs[i] + shift * sizeof(dtype_t);
  }

  // Advance an unrelated arena pointer by this buffer's size; returns a copy
  // of *this so calls can be chained.
  __device__ __forceinline__ AsymBuffer advance_also(void*& gbl_ptr) {
    gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
    return *this;
  }

  // Advance `kNumAlsoRanks` arena pointers by this buffer's size.
  template <int kNumAlsoRanks>
  __device__ __forceinline__ AsymBuffer advance_also(void** gbl_ptrs) {
    for (int i = 0; i < kNumAlsoRanks; ++i)
      gbl_ptrs[i] = reinterpret_cast<uint8_t*>(gbl_ptrs[i]) + total_bytes;
    return *this;
  }

  // Typed pointer to slot `idx` within this SM's channel (single-rank only).
  __device__ __forceinline__ dtype_t* buffer(int idx = 0) {
    EP_STATIC_ASSERT(kNumRanks == 1,
                     "`buffer` is only available for single rank case");
    return reinterpret_cast<dtype_t*>(ptrs[0] + num_bytes * idx);
  }

  // Typed pointer to slot `idx` of rank `rank_idx` (multi-rank only).
  // Fixed the copy-pasted diagnostic: the assert requires kNumRanks > 1,
  // so the message must name `buffer_by` and the multi-rank case.
  __device__ __forceinline__ dtype_t* buffer_by(int rank_idx, int idx = 0) {
    EP_STATIC_ASSERT(kNumRanks > 1,
                     "`buffer_by` is only available for multi-rank case");
    return reinterpret_cast<dtype_t*>(ptrs[rank_idx] + num_bytes * idx);
  }
};
117 | 132 |
|
// A symmetric send/recv buffer carved out of a device arena.
// kDecoupled == true: separate send and recv regions (2 * num_sms channels);
// kDecoupled == false: a single shared region accessed via `buffer()`.
template <typename dtype_t, bool kDecoupled = true>
struct SymBuffer {
 private:
  // NOTES: for non-decoupled case, `recv_ptr` is not used
  uint8_t* send_ptr;
  uint8_t* recv_ptr;
  // Bytes occupied by one slot: num_elems * sizeof(dtype_t).
  int num_bytes;

 public:
  // Total bytes consumed from the arena:
  // num_bytes * num_ranks * num_sms * (kDecoupled ? 2 : 1).
  int total_bytes;

  // Slice send (and, when decoupled, recv) channels for SM `sm_id` out of
  // the arena at `gbl_ptr`, then advance `gbl_ptr` past the allocation.
  __device__ __forceinline__ SymBuffer(void*& gbl_ptr,
                                       int num_elems,
                                       int num_ranks,
                                       int sm_id = 0,
                                       int num_sms = 1) {
    num_bytes = num_elems * sizeof(dtype_t);

    int per_channel_bytes = num_bytes * num_ranks;
    total_bytes =
        per_channel_bytes * num_sms * (static_cast<int>(kDecoupled) + 1);
    send_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + per_channel_bytes * sm_id;
    // Recv region sits after all num_sms send channels; meaningful only
    // when kDecoupled is true.
    recv_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) +
               per_channel_bytes * (sm_id + num_sms);
    gbl_ptr = reinterpret_cast<uint8_t*>(gbl_ptr) + total_bytes;
  }

  // Typed pointer to send slot `idx`. Requires kDecoupled == true, so the
  // diagnostic must name the decoupled case (message was inverted before).
  __device__ __forceinline__ dtype_t* send_buffer(int idx = 0) {
    EP_STATIC_ASSERT(kDecoupled,
                     "`send_buffer` is only available for decoupled case");
    return reinterpret_cast<dtype_t*>(send_ptr + num_bytes * idx);
  }

  // Typed pointer to recv slot `idx`. Requires kDecoupled == true (message
  // fixed to match the asserted condition).
  __device__ __forceinline__ dtype_t* recv_buffer(int idx = 0) {
    EP_STATIC_ASSERT(kDecoupled,
                     "`recv_buffer` is only available for decoupled case");
    return reinterpret_cast<dtype_t*>(recv_ptr + num_bytes * idx);
  }

  // Typed pointer to the shared slot `idx`. Requires kDecoupled == false
  // (message fixed to match the asserted condition).
  __device__ __forceinline__ dtype_t* buffer(int idx = 0) {
    EP_STATIC_ASSERT(not kDecoupled,
                     "`buffer` is only available for non-decoupled case");
    return reinterpret_cast<dtype_t*>(send_ptr + num_bytes * idx);
  }
};
155 | 178 |
|
156 | | -} // namespace deep_ep |
| 179 | +} // namespace deep_ep |