
Commit 58e9972

Add LLM pretraining support (#622)
* support rerank model finetuning. Signed-off-by: Ye, Xinyu <[email protected]>
* adapt rerank model to transformers' scheme. Signed-off-by: Ye, Xinyu <[email protected]>
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* fix typo. Signed-off-by: Ye, Xinyu <[email protected]>
* refined readme. Signed-off-by: Ye, Xinyu <[email protected]>
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* add LLM pretraining support. Signed-off-by: Ye, Xinyu <[email protected]>
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* added readme. Signed-off-by: Ye, Xinyu <[email protected]>
* added rerank finetuning test. Signed-off-by: Ye, Xinyu <[email protected]>
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* added LLM pretraining test. Signed-off-by: Ye, Xinyu <[email protected]>

---------

Signed-off-by: Ye, Xinyu <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent d76751a commit 58e9972

6 files changed: +348 / -10 lines


comps/finetuning/README.md

Lines changed: 26 additions & 0 deletions
@@ -173,6 +173,32 @@ curl http://${your_ip}:8015/v1/fine_tuning/jobs \

```

+### 3.2.4 LLM Pretraining
+
+Use the following command to launch a job for LLM pretraining, such as `meta-llama/Llama-2-7b-hf`:
+
+```bash
+# create a finetuning job
+curl http://${your_ip}:8015/v1/fine_tuning/jobs \
+  -X POST \
+  -H "Content-Type: application/json" \
+  -d '{
+    "training_file": "test_data.json",
+    "model": "meta-llama/Llama-2-7b-hf",
+    "General":{
+      "task":"pretraining",
+      "lora_config":null
+    }
+  }'
+```
+
+Below is an example of the pretraining dataset format:
+
+```json
+{"text": "A girl with a blue tank top sitting watching three dogs."}
+{"text": "A boy with a blue tank top sitting watching three dogs."}
+```
+
## 3.3 Manage fine-tuning job

Below commands show how to list finetuning jobs, retrieve a finetuning job, cancel a finetuning job and list checkpoints of a finetuning job.
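The new README section above uses curl; for readers driving the service from Python, a roughly equivalent sketch is shown below. It is not part of this commit: the host placeholder and the use of the `requests` library are assumptions, while the port, route, and payload mirror the README example.

```python
# Hypothetical convenience sketch (not part of this commit): submit the same
# LLM pretraining job shown in the README's curl example, using requests.
import requests

YOUR_IP = "127.0.0.1"  # assumption: replace with the host running the finetuning service

payload = {
    "training_file": "test_data.json",  # must already be uploaded via /v1/files
    "model": "meta-llama/Llama-2-7b-hf",
    "General": {"task": "pretraining", "lora_config": None},
}

resp = requests.post(
    f"http://{YOUR_IP}:8015/v1/fine_tuning/jobs",
    json=payload,
    timeout=30,
)
resp.raise_for_status()
print(resp.json())  # the test below expects an id beginning with "ft-job"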

comps/finetuning/finetune_config.py

Lines changed: 3 additions & 2 deletions
@@ -16,6 +16,7 @@
DEVICE_CPU = "cpu"
DEVICE_HPU = "hpu"
DEVICE_GPU = "gpu"
+DEVICE_CUDA = "cuda"

ACCELERATE_STRATEGY_DDP = "DDP"
ACCELERATE_STRATEGY_FSDP = "FSDP"
@@ -57,7 +58,7 @@ def check_report_to(cls, v: str):

    @validator("task")
    def check_task(cls, v: str):
-        assert v in ["instruction_tuning", "rerank", "embedding"]
+        assert v in ["instruction_tuning", "pretraining", "rerank", "embedding"]
        return v


@@ -136,7 +137,7 @@ class TrainingConfig(BaseModel):
    def check_device(cls, v: str):
        # will convert to lower case
        if v:
-            assert v.lower() in [DEVICE_CPU, DEVICE_GPU, DEVICE_HPU]
+            assert v.lower() in [DEVICE_CPU, DEVICE_GPU, DEVICE_HPU, DEVICE_CUDA]
        return v.lower()

    @validator("hpu_execution_mode")

comps/finetuning/llm_on_ray/finetune/data_process.py

Lines changed: 34 additions & 1 deletion
@@ -18,7 +18,7 @@
IGNORE_INDEX = -100


-class DataProcessor:
+class InstructionDataProcessor:
    # We used the following prompts for fine-tuning the Alpaca model. You can find reference doc form this URL(https://github.com/tatsu-lab/stanford_alpaca/blob/main/README.md#data-release)
    def __init__(self, config, tokenizer):
        self.tokenizer = tokenizer
@@ -202,6 +202,39 @@ def tokenize(self, examples):
        return examples


+class PretrainingDataProcessor:
+    def __init__(self, config, tokenizer):
+        self.tokenizer = tokenizer
+        self.max_length = self.max_seq_length = config["Dataset"].get("max_length", 512)
+        self.truncation = config["Dataset"].get("truncation", True)
+        self.padding = config["Dataset"].get("padding", True)
+
+    def tokenize(self, examples):
+        keys = list(examples.data.keys())
+        if len(keys) != 1 and "text" not in keys:
+            raise ValueError("Unsupported dataset format")
+
+        key = keys[0] if len(keys) == 1 else "text"
+        examples["input_ids"] = []
+        examples["labels"] = []
+        examples["attention_mask"] = []
+        for exp in examples[key]:
+            results = self.tokenizer(
+                exp,
+                padding=self.padding,
+                truncation=self.truncation,
+                return_tensors=None,
+                max_length=self.max_length,
+            )
+
+            input_ids = results["input_ids"]
+            labels = copy.deepcopy(input_ids)
+            examples["input_ids"].append(results["input_ids"])
+            examples["labels"].append(labels)
+            examples["attention_mask"].append(results["attention_mask"])
+        return examples
+
+
class TrainDatasetForCE(Dataset):
    def __init__(self, dataset, args, tokenizer):
        self.dataset = dataset
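The sketch below shows how `PretrainingDataProcessor` would be exercised in isolation, mirroring the way `tokenize_dataset()` in finetune.py calls it through `datasets.map(..., batched=True)` (which supplies the batch object whose `.data` attribute the class reads). It is an illustration, not repo code: the tokenizer checkpoint `facebook/opt-125m` is borrowed from the test script, and the config dict carries only the `Dataset` keys the constructor reads.

```python
# Minimal usage sketch (assumes `datasets`, `transformers`, and this repo are installed).
from datasets import Dataset
from transformers import AutoTokenizer

from comps.finetuning.llm_on_ray.finetune.data_process import PretrainingDataProcessor

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")  # small model used by the test
config = {"Dataset": {"max_length": 128, "truncation": True, "padding": True}}
processor = PretrainingDataProcessor(config, tokenizer)

raw = Dataset.from_dict({"text": ["A girl with a blue tank top sitting watching three dogs."]})
tokenized = raw.map(processor.tokenize, batched=True, remove_columns=["text"])

# For causal-LM pretraining the labels are a copy of input_ids.
print(tokenized[0]["input_ids"][:8])
print(tokenized[0]["labels"][:8])
```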

comps/finetuning/llm_on_ray/finetune/finetune.py

Lines changed: 50 additions & 7 deletions
@@ -28,9 +28,10 @@
from comps.finetuning.finetune_config import FinetuneConfig
from comps.finetuning.llm_on_ray import common
from comps.finetuning.llm_on_ray.finetune.data_process import (
-    DataProcessor,
    EmbedCollator,
    GroupCollator,
+    InstructionDataProcessor,
+    PretrainingDataProcessor,
    TrainDatasetForCE,
    TrainDatasetForEmbedding,
)
@@ -198,9 +199,9 @@ def tokenize_dataset(config: Dict, tokenizer, dataset):
    if task == "instruction_tuning":
        group = config["Dataset"].get("group", True)
        block_size = config["Dataset"].get("block_size", 512)
-        tokenizer.pad_token = tokenizer.eos_token
+        tokenizer.pad_token = tokenizer.eos_token if not tokenizer.pad_token else tokenizer.pad_token

-        processor = DataProcessor(config, tokenizer)
+        processor = InstructionDataProcessor(config, tokenizer)

        for key in dataset:
            prompts = processor.make_prompt(dataset[key])
@@ -221,6 +222,48 @@ def tokenize_dataset(config: Dict, tokenizer, dataset):
            desc="Tokenize dataset",
        )

+        if group:
+
+            def group_texts(examples):
+                # Concatenate all texts.
+                concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
+                total_length = len(concatenated_examples[list(examples.keys())[0]])
+                # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+                # customize this part to your needs.
+                if total_length >= block_size:
+                    total_length = (total_length // block_size) * block_size
+                # Split by chunks of max_len.
+                result = {
+                    k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+                    for k, t in concatenated_examples.items()
+                }
+                return result
+
+            tokenized_dataset = tokenized_dataset.map(
+                group_texts,
+                batched=True,
+                load_from_cache_file=False,
+                desc=f"Grouping texts in chunks of {block_size}",
+            )
+
+        return tokenized_dataset
+    elif task == "pretraining":
+        group = True
+        block_size = config["Dataset"].get("block_size", 512)
+        tokenizer.pad_token = tokenizer.eos_token if not tokenizer.pad_token else tokenizer.pad_token
+
+        processor = PretrainingDataProcessor(config, tokenizer)
+
+        column_names = list(dataset["train"].features)
+
+        tokenized_dataset = dataset.map(
+            processor.tokenize,
+            remove_columns=column_names,
+            batched=True,
+            load_from_cache_file=False,
+            desc="Tokenize dataset",
+        )
+
    if group:

        def group_texts(examples):
@@ -258,7 +301,7 @@ def group_texts(examples):

def prepare_data_collator(config: Dict, tokenizer):
    task = config["General"].get("task", "instruction_tuning")
-    if task == "instruction_tuning":
+    if task == "instruction_tuning" or task == "pretraining":
        return transformers.DataCollatorForLanguageModeling(
            tokenizer=tokenizer, mlm=False, return_tensors="pt", pad_to_multiple_of=8
        )
@@ -280,10 +323,10 @@ def load_model(config: Dict):
    model_dtype = convert_dtype(config["Training"].get("mixed_precision", "no"))
    model_config = config["General"].get("config", {})
    task = config["General"].get("task", "instruction_tuning")
-    if task == "instruction_tuning":
+    if task == "instruction_tuning" or task == "pretraining":
        model = transformers.AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=model_dtype, **model_config)
        lora_config = config["General"].get("lora_config", None)
-        if lora_config:
+        if lora_config and task != "pretraining":
            peft_config = LoraConfig(**lora_config)
            model = get_peft_model(model, peft_config)
    elif task == "rerank":
@@ -326,7 +369,7 @@ def load_model(config: Dict):

def get_trainer(config: Dict, model, tokenizer, tokenized_dataset, data_collator):
    device = config["Training"]["device"]
-    if device in ["cpu", "gpu"]:
+    if device in ["cpu", "gpu", "cuda"]:
        training_args = convert_to_training_args(TrainingArguments, config)
        trainer = Trainer(
            model=model,
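The `group_texts` step added for instruction tuning, and reused for pretraining via `group = True`, is the standard causal-LM packing trick: tokenized sequences are concatenated and re-split into fixed `block_size` chunks, dropping the tail remainder. A tiny standalone sketch of the same logic, adapted from the diff with toy data and a small block size, shows the effect:

```python
# Standalone illustration of the group_texts() packing logic; toy data, block_size=4.
from itertools import chain

block_size = 4
examples = {
    "input_ids": [[1, 2, 3], [4, 5, 6, 7, 8], [9, 10]],
    "labels":    [[1, 2, 3], [4, 5, 6, 7, 8], [9, 10]],
}

def group_texts(examples):
    # Concatenate all sequences, then split into block_size chunks, dropping the remainder.
    concatenated = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated[list(examples.keys())[0]])
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    return {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated.items()
    }

print(group_texts(examples))
# {'input_ids': [[1, 2, 3, 4], [5, 6, 7, 8]], 'labels': [[1, 2, 3, 4], [5, 6, 7, 8]]}
```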
Lines changed: 118 additions & 0 deletions
@@ -0,0 +1,118 @@
+#!/bin/bash
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+set -x
+
+WORKPATH=$(dirname "$PWD")
+LOG_PATH="$WORKPATH/tests"
+ip_address=$(hostname -I | awk '{print $1}')
+finetuning_service_port=8015
+ray_port=8265
+
+function build_docker_images() {
+    cd $WORKPATH
+    echo $(pwd)
+    docker build -t opea/finetuning:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy --build-arg HF_TOKEN=$HF_TOKEN -f comps/finetuning/docker/Dockerfile_cpu .
+    if [ $? -ne 0 ]; then
+        echo "opea/finetuning built fail"
+        exit 1
+    else
+        echo "opea/finetuning built successful"
+    fi
+}
+
+function start_service() {
+    export no_proxy="localhost,127.0.0.1,"${ip_address}
+    docker run -d --name="finetuning-server" -p $finetuning_service_port:$finetuning_service_port -p $ray_port:$ray_port --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy opea/finetuning:latest
+    sleep 1m
+}
+
+function validate_microservice() {
+    cd $LOG_PATH
+    export no_proxy="localhost,127.0.0.1,"${ip_address}
+
+    # test /v1/dataprep upload file
+    URL="http://${ip_address}:$finetuning_service_port/v1/files"
+    cat <<EOF > test_data.json
+{"text": "Five women walk along a beach wearing flip-flops."}
+{"text": "A woman standing on a high cliff on one leg looking over a river."}
+{"text": "Two woman are playing instruments; one a clarinet, the other a violin."}
+{"text": "A girl with a blue tank top sitting watching three dogs."}
+{"text": "A yellow dog running along a forest path."}
+{"text": "It sets out essential activities in each phase along with critical factors related to those activities."}
+EOF
+    HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'file=@./test_data.json' -F purpose="fine-tune" -H 'Content-Type: multipart/form-data' "$URL")
+    HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://')
+    RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g')
+    SERVICE_NAME="finetuning-server - upload - file"
+
+    # Parse the JSON response
+    purpose=$(echo "$RESPONSE_BODY" | jq -r '.purpose')
+    filename=$(echo "$RESPONSE_BODY" | jq -r '.filename')
+
+    # Define expected values
+    expected_purpose="fine-tune"
+    expected_filename="test_data.json"
+
+    if [ "$HTTP_STATUS" -ne "200" ]; then
+        echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+        docker logs finetuning-server >> ${LOG_PATH}/finetuning-server_upload_file.log
+        exit 1
+    else
+        echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+    fi
+    # Check if the parsed values match the expected values
+    if [[ "$purpose" != "$expected_purpose" || "$filename" != "$expected_filename" ]]; then
+        echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY"
+        docker logs finetuning-server >> ${LOG_PATH}/finetuning-server_upload_file.log
+        exit 1
+    else
+        echo "[ $SERVICE_NAME ] Content is as expected."
+    fi
+
+    # test /v1/fine_tuning/jobs
+    URL="http://${ip_address}:$finetuning_service_port/v1/fine_tuning/jobs"
+    HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H 'Content-Type: application/json' -d '{"training_file": "test_data.json","model": "facebook/opt-125m","General":{"task":"pretraining","lora_config":null}}' "$URL")
+    HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://')
+    RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g')
+    SERVICE_NAME="finetuning-server - create finetuning job"
+
+    if [ "$HTTP_STATUS" -ne "200" ]; then
+        echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+        docker logs finetuning-server >> ${LOG_PATH}/finetuning-server_create.log
+        exit 1
+    else
+        echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+    fi
+    if [[ "$RESPONSE_BODY" != *'{"id":"ft-job'* ]]; then
+        echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY"
+        docker logs finetuning-server >> ${LOG_PATH}/finetuning-server_create.log
+        exit 1
+    else
+        echo "[ $SERVICE_NAME ] Content is as expected."
+    fi
+
+    sleep 3m
+}
+
+function stop_docker() {
+    cid=$(docker ps -aq --filter "name=finetuning-server*")
+    if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi
+}
+
+function main() {
+
+    stop_docker
+
+    build_docker_images
+    start_service
+
+    validate_microservice
+
+    stop_docker
+    echo y | docker system prune
+
+}
+
+main
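The test above uploads the JSONL dataset with a multipart curl request before creating the job. For completeness, a hedged Python equivalent of that upload step is sketched below; it is not part of this commit, and the localhost address is an assumption for a local run of the service on the test's port.

```python
# Illustrative Python equivalent of the test's upload step:
# POST the JSONL dataset to /v1/files with purpose "fine-tune" before creating a job.
import requests

BASE = "http://127.0.0.1:8015"  # assumption: same port as the test script, local host

with open("test_data.json", "rb") as f:
    resp = requests.post(
        f"{BASE}/v1/files",
        files={"file": ("test_data.json", f)},   # multipart, like curl -F 'file=@./test_data.json'
        data={"purpose": "fine-tune"},
        timeout=60,
    )
resp.raise_for_status()
print(resp.json())  # the test asserts .purpose == "fine-tune" and .filename == "test_data.json"
```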
