Skip to content

Commit 9147618

Browse files
committed
[CP][BugFix] Dev: fix unstable results from custom all-reduce (AR) (PaddlePaddle#4437)
1 parent fde827f commit 9147618

File tree

14 files changed

+21
-299
lines changed

14 files changed

+21
-299
lines changed

fastdeploy/distributed/communication.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def tensor_model_parallel_all_reduce(
5353
global _TP_AR
5454
if _TP_AR is not None and _TP_AR.should_custom_ar(input_):
5555
# TODO: supports different_group custom allreduce
56-
_TP_AR.custom_all_reduce(input_)
56+
input_ = _TP_AR.custom_all_reduce(input_)
5757
elif paddle.in_dynamic_mode():
5858
if group_ is not None:
5959
dist.all_reduce(input_, group=group_)
@@ -63,6 +63,7 @@ def tensor_model_parallel_all_reduce(
6363
dist.all_reduce(input_, group=mp_group)
6464
else:
6565
dist.all_reduce(input_)
66+
return input_
6667

6768
except:
6869
tensor_model_parallel_all_reduce = None

fastdeploy/distributed/custom_all_reduce/custom_all_reduce.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -212,13 +212,13 @@ def custom_all_reduce(self, input: paddle.Tensor) -> Optional[paddle.Tensor]:
212212
stream_capturing = lib.cudaStreamIsCapturing(stream)
213213
if stream_capturing.value == 1:
214214
# 1 is cudaStreamCaptureStatusActive: The stream is capturing.
215-
return self.all_reduce(input, input, registered=True)
215+
return self.all_reduce(input, registered=True)
216216
else:
217217
# If warm up, mimic the allocation pattern since custom
218218
# allreduce is out-of-place.
219219
return paddle.empty_like(input)
220220
else:
221-
return self.all_reduce(input, input, registered=False)
221+
return self.all_reduce(input, registered=False)
222222

223223
def close(self):
224224
if self._ptr:

fastdeploy/model_executor/layers/backends/dcu/fused_moe_triton_backends.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -243,5 +243,5 @@ def apply(
243243
out = intermediate_cache3.sum(axis=1)
244244

245245
if layer.tp_size > 1:
246-
tensor_model_parallel_all_reduce(out)
246+
out = tensor_model_parallel_all_reduce(out)
247247
return out

fastdeploy/model_executor/layers/backends/gcu/moe/fused_moe_method_gcu_backend.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ def compute_ffn(
180180
tensor_model_parallel_all_reduce,
181181
)
182182

183-
tensor_model_parallel_all_reduce(fused_moe_out)
183+
fused_moe_out = tensor_model_parallel_all_reduce(fused_moe_out)
184184

185185
return fused_moe_out
186186

fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_triton_metax_backend.py

Lines changed: 0 additions & 279 deletions
This file was deleted.

fastdeploy/model_executor/layers/linear.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -778,7 +778,7 @@ def forward_cuda(self, x: paddle.Tensor) -> paddle.Tensor:
778778
out = paddle.matmul(x, self.weight)
779779

780780
if self.reduce_results and self.nranks > 1:
781-
tensor_model_parallel_all_reduce(out, self.tp_group)
781+
out = tensor_model_parallel_all_reduce(out, self.tp_group)
782782
if not self.fd_config.quant_config and self.add_bias:
783783
out = paddle.add(out, self.bias)
784784
return out

fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -298,7 +298,7 @@ def apply_tp(
298298
)
299299

300300
if layer.reduce_results and layer.tp_size > 1:
301-
tensor_model_parallel_all_reduce(fused_moe_out)
301+
fused_moe_out = tensor_model_parallel_all_reduce(fused_moe_out, layer.fd_config.parallel_config.tp_group)
302302

303303
return fused_moe_out
304304

fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -575,6 +575,6 @@ def apply_tp(
575575
1.0,
576576
)[0]
577577
if layer.tp_size > 1:
578-
tensor_model_parallel_all_reduce(tmp_ffn_out)
578+
tmp_ffn_out = tensor_model_parallel_all_reduce(tmp_ffn_out)
579579

580580
return tmp_ffn_out

fastdeploy/model_executor/layers/moe/fused_moe_marlin_backend.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -353,6 +353,6 @@ def apply(
353353
ffn_out = ffn_out.sum(axis=1)
354354

355355
if layer.reduce_results and layer.tp_size > 1:
356-
tensor_model_parallel_all_reduce(ffn_out)
356+
ffn_out = tensor_model_parallel_all_reduce(ffn_out)
357357

358358
return ffn_out

fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -599,7 +599,7 @@ def apply(
599599
out = down_proj_out.sum(axis=1)
600600

601601
if layer.tp_size > 1:
602-
tensor_model_parallel_all_reduce(out)
602+
out = tensor_model_parallel_all_reduce(out)
603603

604604
return out
605605

@@ -997,6 +997,6 @@ def apply(
997997
out = intermediate_cache3.sum(axis=1)
998998

999999
if layer.tp_size > 1:
1000-
tensor_model_parallel_all_reduce(out)
1000+
out = tensor_model_parallel_all_reduce(out)
10011001

10021002
return out

0 commit comments

Comments (0)