Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
67 commits
Select commit Hold shift + click to select a range
d1ea793
deepspeed
haqishen Jul 18, 2023
5ef4792
shard
haqishen Jul 19, 2023
b5fac57
full param deepspeed works by this commit
haqishen Jul 25, 2023
0f7086b
offload optimizer & documentation
haqishen Jul 26, 2023
687c456
format & fix save deepspeed weight
haqishen Aug 2, 2023
3b7ff0d
format & update save_checkpoint
haqishen Aug 3, 2023
105a849
update pipfile
haqishen Aug 4, 2023
f583395
update pipfile
haqishen Aug 5, 2023
cbc50fb
zero init for transformers
haqishen Aug 9, 2023
ffee1c0
add some new config
haqishen Aug 9, 2023
f40ef52
fix bug
haqishen Aug 9, 2023
9cd37ab
min 1e6
haqishen Aug 10, 2023
69e9eb1
update deepspeed config
haqishen Aug 17, 2023
1415cdc
Merge main to deepspeed
haqishen Aug 17, 2023
9db3fbd
Merge branch 'main' into deepspeed
haqishen Aug 17, 2023
b0df016
Update requirements.txt
haqishen Aug 17, 2023
d30b51c
remove duplicate code
haqishen Aug 18, 2023
a4b76c3
Merge branch 'deepspeed' of github.com:h2oai/h2o-llmstudio into deeps…
haqishen Aug 18, 2023
67629ee
throw warning when compile w/ deepspeed
haqishen Aug 18, 2023
48d7f71
black
haqishen Aug 18, 2023
d1efef5
integrate deepspeed into wrap_model_distributed
haqishen Aug 18, 2023
d6b0748
remove unuse code
haqishen Aug 18, 2023
3f89359
style
haqishen Aug 18, 2023
5c253f2
fix bug
haqishen Aug 18, 2023
9ff717f
fix bug
haqishen Aug 18, 2023
405b207
Merge branch 'main' into deepspeed
haqishen Aug 18, 2023
b3495d4
max token len to 16k
haqishen Aug 18, 2023
7b78538
deepspeed save lora
haqishen Aug 21, 2023
892f47c
update get optimizer
haqishen Aug 21, 2023
f2dfb89
fix check disk
haqishen Aug 21, 2023
efe77bb
Merge branch 'main' into deepspeed
haqishen Aug 23, 2023
d297ec9
comment out offload CPU
haqishen Aug 28, 2023
a6781f1
Merge branch 'deepspeed' of github.com:h2oai/h2o-llmstudio into deeps…
haqishen Aug 28, 2023
e6e46dc
Merge branch 'main' into deepspeed
haqishen Aug 28, 2023
e16cab8
Pipfile.lock
haqishen Aug 28, 2023
65a1b2d
Merge branch 'main' into deepspeed
haqishen Aug 28, 2023
32b16a5
Update requirements.txt
haqishen Aug 28, 2023
eb4c990
Merge branch 'main' into deepspeed
haqishen Aug 28, 2023
e36fada
make black
haqishen Aug 29, 2023
bc4c239
Merge branch 'deepspeed' of github.com:h2oai/h2o-llmstudio into deeps…
haqishen Aug 29, 2023
b5e59e9
add default
haqishen Aug 29, 2023
24eeb16
minor fix
haqishen Sep 4, 2023
b9e5934
minor fix
haqishen Sep 4, 2023
a296cca
minor fix
haqishen Sep 4, 2023
11a4b8d
fix val loader
haqishen Sep 5, 2023
3efa2c9
potential val loader fix
psinger Sep 7, 2023
14bc17e
update
psinger Sep 8, 2023
0f40322
merge
psinger Sep 8, 2023
bd1e134
lock
psinger Sep 8, 2023
6f81182
Update requirements.txt
psinger Sep 8, 2023
62fc9c5
improve model saving for deepspeed
haqishen Sep 26, 2023
dbbbcdf
solved INFLIGHT problem
haqishen Sep 26, 2023
c023d19
update doc
haqishen Sep 26, 2023
2785f9f
deepspeed default push to hub by cpu
haqishen Sep 28, 2023
aa17c0b
Revert "improve model saving for deepspeed"
haqishen Oct 5, 2023
4491c16
remove unuse code
haqishen Oct 5, 2023
fa031f2
Merge branch 'main' into deepspeed
haqishen Oct 10, 2023
9337741
Update requirements.txt
haqishen Oct 10, 2023
263f48a
deepspeed==0.11.1
haqishen Oct 19, 2023
83429b6
Merge branch 'main' into deepspeed
haqishen Oct 19, 2023
882631a
Update requirements.txt
haqishen Oct 19, 2023
368f0af
temp fix for deepspeed slow gen
haqishen Oct 20, 2023
011e269
Merge branch 'deepspeed' of github.com:h2oai/h2o-llmstudio into deeps…
haqishen Oct 20, 2023
d5dbbfb
style
haqishen Oct 20, 2023
5b8499c
style
haqishen Oct 20, 2023
07bb4b2
fix
psinger Oct 24, 2023
91562e9
Merge branch 'main' into deepspeed
haqishen Oct 24, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ Jinja2 = ">=3.1.2, <4.0.0"
tenacity = ">=8.2.2, <9.0.0"
h2o-wave = "0.26"
tiktoken = "*"
deepspeed = ">=0.9.2"
mpi4py = ">=3.1.4"

[dev-packages]
pytest = "==7.1.3"
Expand Down
496 changes: 329 additions & 167 deletions Pipfile.lock

Large diffs are not rendered by default.

50 changes: 35 additions & 15 deletions app_utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,21 +136,41 @@ def start_process(
# )
else:
free_port = find_free_port()
p = subprocess.Popen(
[
"env",
f"CUDA_VISIBLE_DEVICES={','.join(gpu_list)}",
"torchrun",
f"--nproc_per_node={str(num_gpus)}",
f"--master_port={str(free_port)}",
"train_wave.py",
"-Y",
config_name,
"-Q",
",".join([str(x) for x in process_queue]),
],
env=env,
)
if cfg.environment.use_deepspeed:
logger.info("Starting deepspeed...")
p = subprocess.Popen(
[
"env",
"deepspeed",
"--include",
f"localhost:{','.join(gpu_list)}",
"--master_port",
f"{str(free_port)}",
"train_wave.py",
"-Y",
config_name,
"-Q",
",".join([str(x) for x in process_queue]),
],
env=env,
)
else:
logger.info("Starting torchrun...")
p = subprocess.Popen(
[
"env",
f"CUDA_VISIBLE_DEVICES={','.join(gpu_list)}",
"torchrun",
f"--nproc_per_node={str(num_gpus)}",
f"--master_port={str(free_port)}",
"train_wave.py",
"-Y",
config_name,
"-Q",
",".join([str(x) for x in process_queue]),
],
env=env,
)
logger.info(f"Percentage of RAM memory used: {psutil.virtual_memory().percent}")

return p
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Whether to offload the optimizer state to CPU to save more GPU RAM during training. Note that turning on offload_optimizer will further slow down training.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
The maximum number of parameters resident per GPU before they are released. Smaller values use less memory but slow down training.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Do not release a parameter if it will be reused within this many parameters. Smaller values use less memory but slow down training.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Whether to use DeepSpeed to save GPU RAM during training. Note that turning on DeepSpeed will slow down training.
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,10 @@ class ConfigNLPCausalLMEnvironment(DefaultConfig):

compile_model: bool = False
use_fsdp: bool = False
use_deepspeed: bool = False
deepspeed_offload_optimizer: bool = False
deepspeed_stage3_max_live_parameters: int = 1e9
deepspeed_stage3_max_reuse_distance: int = 1e9

find_unused_parameters: bool = False
trust_remote_code: bool = True
Expand Down Expand Up @@ -420,6 +424,20 @@ def __post_init__(self):

self._possible_values["number_of_workers"] = (1, multiprocessing.cpu_count(), 1)
self._possible_values["seed"] = possible_values.Number(step=1, min=-1)
self._possible_values[
"deepspeed_stage3_max_live_parameters"
] = possible_values.Number(step=1, min=1e7)
self._possible_values[
"deepspeed_stage3_max_reuse_distance"
] = possible_values.Number(step=1, min=1e7)
self._nesting.add(
[
"deepspeed_offload_optimizer",
"deepspeed_stage3_max_live_parameters",
"deepspeed_stage3_max_reuse_distance",
],
[Dependency(key="use_deepspeed", value=False, is_set=False)],
)


@dataclass
Expand Down
218 changes: 156 additions & 62 deletions llm_studio/src/utils/modeling_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@
from typing import Any, Dict

import coolname
import deepspeed
import numpy as np
import torch
from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
from torch.cuda.amp import autocast
from torch.distributed.fsdp.fully_sharded_data_parallel import (
FullyShardedDataParallel,
Expand Down Expand Up @@ -75,15 +77,27 @@ def save_checkpoint(model: torch.nn.Module, path: str, cfg: Any):
Dictionary with all the keys to save
"""

model = unwrap_model(model)

if hasattr(cfg.training, "lora") and cfg.training.lora:
model.backbone.save_pretrained(path)

checkpoint = {"model": model.state_dict()}

if path is not None:
torch.save(checkpoint, os.path.join(path, "checkpoint.pth"))
if cfg.environment.use_deepspeed:
if path is not None:
# gather model params from all ranks
model.save_checkpoint(os.path.join(path, "ds_checkpoint"))
if cfg.environment._local_rank == 0:
# load to cpu
state_dict = get_fp32_state_dict_from_zero_checkpoint(
os.path.join(path, "ds_checkpoint")
)
# save as normal checkpoint that can be loaded by `load_state_dict`
checkpoint = {"model": state_dict}
torch.save(checkpoint, os.path.join(path, "checkpoint.pth"))
shutil.rmtree(os.path.join(path, "ds_checkpoint"))
else:
if cfg.environment._local_rank == 0:
model = unwrap_model(model)
if hasattr(cfg.training, "lora") and cfg.training.lora:
model.backbone.save_pretrained(path)
checkpoint = {"model": model.state_dict()}
if path is not None:
torch.save(checkpoint, os.path.join(path, "checkpoint.pth"))


def load_model_weights(
Expand Down Expand Up @@ -169,6 +183,79 @@ def load_checkpoint(
logger.info(f"Weights loaded from: {weights_path}")


def deepspeed_initialize(
    model: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    lr_scheduler: torch.optim.lr_scheduler._LRScheduler,
    training_data: torch.utils.data.Dataset,
    validating_data: torch.utils.data.Dataset,
    cfg: Any,
):
    """Wrap model, optimizer, scheduler and data loaders with DeepSpeed ZeRO stage 3.

    Builds a DeepSpeed config from ``cfg`` (sized by the backbone's hidden
    dimension) and calls ``deepspeed.initialize`` twice: once for the real
    training setup and once with a dummy ``nn.Linear`` model solely to obtain a
    DeepSpeed-compatible validation dataloader.

    Args:
        model: Model to shard across ranks.
        optimizer: Optimizer to wrap (ignored for the dummy validation init).
        lr_scheduler: Learning-rate scheduler to step per DeepSpeed's schedule.
        training_data: Training dataset.
        validating_data: Validation dataset.
        cfg: Experiment config; reads ``llm_backbone``, ``architecture``,
            ``training`` and ``environment`` sub-configs.

    Returns:
        Tuple of (model_engine, optimizer, train_dataloader, val_dataloader,
        scheduler).

    Raises:
        ValueError: If the backbone config exposes neither ``hidden_size`` nor
            ``d_model``, so ZeRO bucket sizes cannot be derived.
    """
    # ZeRO bucket sizes below are scaled by the model's hidden dimension;
    # different HF config classes name it differently.
    mconfig = AutoConfig.from_pretrained(cfg.llm_backbone)
    if hasattr(mconfig, "hidden_size"):
        model_hidden_size = mconfig.hidden_size
    elif hasattr(mconfig, "d_model"):
        model_hidden_size = mconfig.d_model
    else:
        raise ValueError(
            f"DeepSpeed does not support backbone {cfg.llm_backbone!r}: "
            "its config exposes neither `hidden_size` nor `d_model`."
        )
    ds_config = {
        # Exactly one of fp16/bf16 is enabled, driven by the configured dtype.
        "fp16": {
            "enabled": cfg.architecture.backbone_dtype == "float16",
            "loss_scale_window": 100,
        },
        "bf16": {
            "enabled": cfg.architecture.backbone_dtype == "bfloat16",
            "loss_scale_window": 100,
        },
        # https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
        "zero_force_ds_cpu_optimizer": False,
        "zero_optimization": {
            "stage": 3,
            "overlap_comm": False,
            "contiguous_gradients": True,
            "reduce_bucket_size": model_hidden_size * model_hidden_size,
            # zero3
            "mics_shard_size": cfg.environment._world_size,
            "stage3_prefetch_bucket_size": 0.9 * model_hidden_size * model_hidden_size,
            "stage3_param_persistence_threshold": 10 * model_hidden_size,
            "stage3_max_live_parameters": cfg.environment.deepspeed_stage3_max_live_parameters,  # noqa: E501
            "stage3_max_reuse_distance": cfg.environment.deepspeed_stage3_max_reuse_distance,  # noqa: E501
            # zero++ options, kept for reference but currently disabled
            # "reduce_scatter": True,
            # "zero_quantized_weights": True,
            # "zero_hpz_partition_size": 16,
            # "zero_quantized_gradients": True,
        },
        "steps_per_print": 2000,
        "train_batch_size": cfg.training.batch_size * cfg.environment._world_size,
        "train_micro_batch_size_per_gpu": 1,
        "wall_clock_breakdown": False,
    }
    if cfg.environment.deepspeed_offload_optimizer:
        ds_config["zero_optimization"]["offload_optimizer"] = {
            "device": "cpu",
            "pin_memory": True,
        }
    # TODO: RuntimeError: Tensors must be CUDA and dense
    # if cfg.environment.deepspeed_offload_param:
    #     ds_config["zero_optimization"]["offload_param"] =
    #     {"device": "cpu", "pin_memory": True}

    model, optimizer, train_dataloader, scheduler = deepspeed.initialize(
        model=model,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        training_data=training_data,
        config_params=ds_config,
    )
    # Second init with a throwaway model only to get a DeepSpeed-built
    # validation dataloader (engine/optimizer/scheduler are discarded).
    _, _, val_dataloader, _ = deepspeed.initialize(
        model=torch.nn.Linear(1, 1),
        training_data=validating_data,
        config_params=ds_config,
    )
    return model, optimizer, train_dataloader, val_dataloader, scheduler


def wrap_model_distributed(model: torch.nn.Module, cfg: Any, fsdp: bool):
if fsdp:
auto_wrap_policy = None
Expand Down Expand Up @@ -214,59 +301,66 @@ def get_optimizer(model: torch.nn.Module, cfg: Any) -> torch.optim.Optimizer:
Returns:
Optimizer
"""

no_decay = ["bias", "LayerNorm.weight"]
differential_layers = cfg.training.differential_learning_rate_layers
optimizer = Optimizers.get(cfg.training.optimizer)(
[
{
"params": [
param
for name, param in model.named_parameters()
if (not any(layer in name for layer in differential_layers))
and (not any(nd in name for nd in no_decay))
# and param.requires_grad
],
"lr": cfg.training.learning_rate,
"weight_decay": cfg.training.weight_decay,
},
{
"params": [
param
for name, param in model.named_parameters()
if (not any(layer in name for layer in differential_layers))
and (any(nd in name for nd in no_decay))
# and param.requires_grad
],
"lr": cfg.training.learning_rate,
"weight_decay": 0,
},
{
"params": [
param
for name, param in model.named_parameters()
if (any(layer in name for layer in differential_layers))
and (not any(nd in name for nd in no_decay))
# and param.requires_grad
],
"lr": cfg.training.differential_learning_rate,
"weight_decay": cfg.training.weight_decay,
},
{
"params": [
param
for name, param in model.named_parameters()
if (any(layer in name for layer in differential_layers))
and (any(nd in name for nd in no_decay))
# and param.requires_grad
],
"lr": cfg.training.differential_learning_rate,
"weight_decay": 0,
},
],
lr=cfg.training.learning_rate,
weight_decay=cfg.training.weight_decay,
)
if cfg.environment.use_deepspeed and cfg.training.lora:
logger.info(
"Deepspeed /w Lora training do not support differential learning rate."
)
optimizer = Optimizers.get(cfg.training.optimizer)(
model.parameters(), lr=cfg.training.learning_rate
)
else:
no_decay = ["bias", "LayerNorm.weight"]
differential_layers = cfg.training.differential_learning_rate_layers
optimizer = Optimizers.get(cfg.training.optimizer)(
[
{
"params": [
param
for name, param in model.named_parameters()
if (not any(layer in name for layer in differential_layers))
and (not any(nd in name for nd in no_decay))
# and param.requires_grad
],
"lr": cfg.training.learning_rate,
"weight_decay": cfg.training.weight_decay,
},
{
"params": [
param
for name, param in model.named_parameters()
if (not any(layer in name for layer in differential_layers))
and (any(nd in name for nd in no_decay))
# and param.requires_grad
],
"lr": cfg.training.learning_rate,
"weight_decay": 0,
},
{
"params": [
param
for name, param in model.named_parameters()
if (any(layer in name for layer in differential_layers))
and (not any(nd in name for nd in no_decay))
# and param.requires_grad
],
"lr": cfg.training.differential_learning_rate,
"weight_decay": cfg.training.weight_decay,
},
{
"params": [
param
for name, param in model.named_parameters()
if (any(layer in name for layer in differential_layers))
and (any(nd in name for nd in no_decay))
# and param.requires_grad
],
"lr": cfg.training.differential_learning_rate,
"weight_decay": 0,
},
],
lr=cfg.training.learning_rate,
weight_decay=cfg.training.weight_decay,
)

return optimizer

Expand Down
Loading