
Commit d5f6915

Fix release of IPG buffer (#7376)
#6993 broke many paths in the ZeRO-1/2 optimizer. This PR fixes most of the issues that PR caused. Currently we still have one failing test in `unit/runtime/zero`:

```
====================================== short test summary info ======================================
FAILED test_zero.py::TestParamPartitioningSkipInit::test[dtype1] - RuntimeError: mat1 and mat2 must have the same dtype, but got Half and BFloat16
========= 1 failed, 204 passed, 66 skipped, 15 deselected, 5 warnings in 2305.03s (0:38:25) =========
```

---------

Signed-off-by: Masahiro Tanaka <[email protected]>
1 parent: 9606f8f

File tree

1 file changed (+5 additions, -5 deletions)


deepspeed/runtime/zero/stage_1_and_2.py

Lines changed: 5 additions & 5 deletions
```diff
@@ -109,7 +109,6 @@ class IPGBucket:
     has_moe_params: bool = False

     def clear(self):
-        self.buffer.clear()
         self.params.clear()
         self.grads.clear()
         self.elements = 0
```
```diff
@@ -734,7 +733,7 @@ def _round_robin_reorder(self, tensor_list, num_partitions):
     def _release_ipg_buffers(self):
         if self.contiguous_gradients:
             for bucket in self.ipg_buckets.values():
-                bucket.clear()
+                bucket.buffer.clear()

         self.grads_in_partition = None
         self.grads_in_partition_offset = 0
```
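Taken together, the first two hunks make the ownership explicit: `IPGBucket.clear()` resets only the per-step bookkeeping, and `_release_ipg_buffers()` is the single place the flat buffers are dropped. Below is a minimal, self-contained sketch of that split, using stub types and only the fields visible in this diff; it is not DeepSpeed's real implementation:

```python
# Sketch of the post-fix split between bucket bookkeeping and buffer release.
# Only fields visible in the diff are modeled; everything else is assumed.
from dataclasses import dataclass, field
from typing import Any, Dict, List


@dataclass
class IPGBucket:
    buffer: List[Any] = field(default_factory=list)  # reusable flat gradient buffers
    params: List[Any] = field(default_factory=list)  # params staged for reduction
    grads: List[Any] = field(default_factory=list)   # grads staged for reduction
    elements: int = 0
    has_moe_params: bool = False

    def clear(self):
        # After the fix: reset bookkeeping only; the buffer is deliberately
        # left intact so it can be reused across reduction steps.
        self.params.clear()
        self.grads.clear()
        self.elements = 0


class OptimizerSketch:
    """Stand-in for the ZeRO stage 1/2 optimizer; only the release path is shown."""

    def __init__(self, contiguous_gradients: bool = True):
        self.contiguous_gradients = contiguous_gradients
        self.ipg_buckets: Dict[Any, IPGBucket] = {}
        self.grads_in_partition = None
        self.grads_in_partition_offset = 0

    def _release_ipg_buffers(self):
        if self.contiguous_gradients:
            for bucket in self.ipg_buckets.values():
                # After the fix: drop only the flat buffers here, instead of
                # calling bucket.clear(), which would also wipe the params and
                # grads bookkeeping that other code paths still need.
                bucket.buffer.clear()
        self.grads_in_partition = None
        self.grads_in_partition_offset = 0
```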
```diff
@@ -1443,10 +1442,11 @@ def reduce_ipg_grads(self):
         if self.contiguous_gradients:
             if comm_dtype in self.extra_large_param_to_reduce:
                 assert len(bucket.params) == 1, "more than 1 param in ipg bucket, this shouldn't happen"
-                _, _, param_id = self.params[0]
-                assert self.get_param_id(self.extra_large_param_to_reduce
+                _, _, param_id = bucket.params[0]
+                assert self.get_param_id(self.extra_large_param_to_reduce[comm_dtype]
                                          ) == param_id, "param in ipg bucket does not match extra-large param"
-                extra_large_grad_reduc = self.get_gradient_for_reduction(self.extra_large_param_to_reduce)
+                extra_large_grad_reduc = self.get_gradient_for_reduction(
+                    self.extra_large_param_to_reduce[comm_dtype])
                 self.average_tensor(extra_large_grad_reduc.view(-1), comm_dtype)
                 del self.extra_large_param_to_reduce[comm_dtype]
             else:
```
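The third hunk adapts `reduce_ipg_grads()` to `extra_large_param_to_reduce` having become a dict keyed by communication dtype in #6993: the entry is now indexed with `comm_dtype`, and the param id is read from the bucket rather than from `self`. A minimal sketch of the corrected lookup follows, with stub types, a stand-in `get_param_id`, and the actual gradient reduction elided:

```python
# Sketch of the corrected per-dtype lookup for extra-large params.
# Bucket and ReducerSketch are stubs, not DeepSpeed's real classes;
# get_param_id stands in for the real param-id bookkeeping.
from dataclasses import dataclass, field
from typing import Any, Dict, List, Tuple


@dataclass
class Bucket:
    # (group_index, param, param_id) triples, mirroring the diff's unpacking
    params: List[Tuple[int, Any, int]] = field(default_factory=list)


class ReducerSketch:
    def __init__(self):
        # Keyed by communication dtype since #6993; the old code still
        # indexed it as if it held a single param.
        self.extra_large_param_to_reduce: Dict[Any, Any] = {}

    def get_param_id(self, param: Any) -> int:
        return id(param)  # stand-in for the real id bookkeeping

    def reduce_extra_large(self, bucket: Bucket, comm_dtype: Any) -> None:
        if comm_dtype in self.extra_large_param_to_reduce:
            assert len(bucket.params) == 1, "more than 1 param in ipg bucket, this shouldn't happen"
            _, _, param_id = bucket.params[0]  # fix: bucket.params, not self.params
            extra_large = self.extra_large_param_to_reduce[comm_dtype]  # fix: index by dtype
            assert self.get_param_id(extra_large) == param_id, \
                "param in ipg bucket does not match extra-large param"
            # ... fetch extra_large's gradient and average/reduce it here ...
            del self.extra_large_param_to_reduce[comm_dtype]  # consume the entry
```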
