Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions benchmarks/big_model_inference/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ pip install transformers
To reproduce or test a new setup, run

```py
python inference_acc.py model_name
python big_model_inference.py model_name
```

This script supports `gpt-j-6b`, `gpt-neox`, `opt` (30B version) and `T0pp` out of the box, but you can specify any valid checkpoint for `model_name`.
Expand Down Expand Up @@ -43,4 +43,4 @@ Note on the results:

You will also note that Accelerate does not use anymore GPU and CPU RAM than necessary:
- peak GPU memory is exactly the size of the model put on a given GPU
- peak CPU memory is either the size of the biggest checkpoint shard or the part of the model offloaded on CPU, whichever is bigger.
- peak CPU memory is either the size of the biggest checkpoint shard or the part of the model offloaded on CPU, whichever is bigger.
28 changes: 17 additions & 11 deletions benchmarks/big_model_inference/measures_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@
import psutil
import torch

from accelerate.test_utils.testing import get_backend


torch_device_type, _, _ = get_backend()
torch_accelerator_module = getattr(torch, torch_device_type, torch.cuda)


class PeakCPUMemory:
def __init__(self):
Expand Down Expand Up @@ -54,16 +60,16 @@ def start_measure():
measures = {"time": time.time()}

gc.collect()
torch.cuda.empty_cache()
torch_accelerator_module.empty_cache()

# CPU mem
measures["cpu"] = psutil.Process().memory_info().rss
cpu_peak_tracker.start()

# GPU mem
for i in range(torch.cuda.device_count()):
measures[str(i)] = torch.cuda.memory_allocated(i)
torch.cuda.reset_peak_memory_stats()
for i in range(torch_accelerator_module.device_count()):
measures[str(i)] = torch_accelerator_module.memory_allocated(i)
torch_accelerator_module.reset_peak_memory_stats()

return measures

Expand All @@ -73,26 +79,26 @@ def end_measure(start_measures):
measures = {"time": time.time() - start_measures["time"]}

gc.collect()
torch.cuda.empty_cache()
torch_accelerator_module.empty_cache()

# CPU mem
measures["cpu"] = (psutil.Process().memory_info().rss - start_measures["cpu"]) / 2**20
measures["cpu-peak"] = (cpu_peak_tracker.stop() - start_measures["cpu"]) / 2**20

# GPU mem
for i in range(torch.cuda.device_count()):
measures[str(i)] = (torch.cuda.memory_allocated(i) - start_measures[str(i)]) / 2**20
measures[f"{i}-peak"] = (torch.cuda.max_memory_allocated(i) - start_measures[str(i)]) / 2**20
for i in range(torch_accelerator_module.device_count()):
measures[str(i)] = (torch_accelerator_module.memory_allocated(i) - start_measures[str(i)]) / 2**20
measures[f"{i}-peak"] = (torch_accelerator_module.max_memory_allocated(i) - start_measures[str(i)]) / 2**20

return measures


def log_measures(measures, description):
print(f"{description}:")
print(f"- Time: {measures['time']:.2f}s")
for i in range(torch.cuda.device_count()):
print(f"- GPU {i} allocated: {measures[str(i)]:.2f}MiB")
for i in range(torch_accelerator_module.device_count()):
print(f"- {torch_device_type} {i} allocated: {measures[str(i)]:.2f}MiB")
peak = measures[f"{i}-peak"]
print(f"- GPU {i} peak: {peak:.2f}MiB")
print(f"- {torch_device_type} {i} peak: {peak:.2f}MiB")
print(f"- CPU RAM allocated: {measures['cpu']:.2f}MiB")
print(f"- CPU RAM peak: {measures['cpu-peak']:.2f}MiB")