Skip to content

Commit c718602

Browse files
lkk12014402, root, and pre-commit-ci[bot]
authored
add resume finetuning checkpoint ut. (#646)
* add resume finetuning checkpoint ut.
* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci
* add final tuned model.

---------

Co-authored-by: root <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 94eb60f commit c718602

File tree

2 files changed

+93
-6
lines changed

2 files changed

+93
-6
lines changed

comps/finetuning/handlers.py

Lines changed: 17 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33

44
import os
55
import random
6+
import re
67
import time
78
import urllib.parse
89
import uuid
@@ -61,7 +62,7 @@ def update_job_status(job_id: FineTuningJobID):
6162
status = "cancelled" if status == "stopped" else status
6263
logger.info(f"Status of job {job_id} is '{status}'")
6364
running_finetuning_jobs[job_id].status = status
64-
if status == "finished" or status == "cancelled" or status == "failed":
65+
if status == "succeeded" or status == "cancelled" or status == "failed":
6566
break
6667
time.sleep(CHECK_JOB_STATUS_INTERVAL)
6768

@@ -190,7 +191,21 @@ def handle_list_finetuning_checkpoints(request: FineTuningJobIDRequest):
190191
checkpoints = []
191192
if os.path.exists(output_dir):
192193
# Iterate over the contents of the directory and add an entry for each
193-
for _ in os.listdir(output_dir): # Loop over directory contents
194+
files = os.listdir(output_dir)
195+
for file in files: # Loop over directory contents
196+
file_path = os.path.join(output_dir, file)
197+
if os.path.isdir(file_path) and file.startswith("checkpoint"):
198+
steps = re.findall("\d+", file)[0]
199+
checkpointsResponse = FineTuningJobCheckpoint(
200+
id=f"ftckpt-{uuid.uuid4()}", # Generate a unique ID
201+
created_at=int(time.time()), # Use the current timestamp
202+
fine_tuned_model_checkpoint=file_path, # Directory path itself
203+
fine_tuning_job_id=fine_tuning_job_id,
204+
object="fine_tuning.job.checkpoint",
205+
step_number=steps,
206+
)
207+
checkpoints.append(checkpointsResponse)
208+
if job.status == "succeeded":
194209
checkpointsResponse = FineTuningJobCheckpoint(
195210
id=f"ftckpt-{uuid.uuid4()}", # Generate a unique ID
196211
created_at=int(time.time()), # Use the current timestamp
@@ -199,7 +214,6 @@ def handle_list_finetuning_checkpoints(request: FineTuningJobIDRequest):
199214
object="fine_tuning.job.checkpoint",
200215
)
201216
checkpoints.append(checkpointsResponse)
202-
checkpoint_id_to_checkpoint_path[checkpointsResponse.id] = checkpointsResponse.fine_tuned_model_checkpoint
203217

204218
return checkpoints
205219

tests/test_finetuning_embedding_hpu.sh

Lines changed: 76 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,7 @@ function start_service() {
2828
sleep 1m
2929
}
3030

31+
3132
function validate_microservice() {
3233
cd $LOG_PATH
3334
export no_proxy="localhost,127.0.0.1,"${ip_address}
@@ -79,8 +80,10 @@ function validate_microservice() {
7980
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H 'Content-Type: application/json' -d '{"training_file": "test_embed_data.json","model": "BAAI/bge-base-en-v1.5","General":{"task":"embedding","lora_cofig":null,"save_strategy":"epoch"},"Dataset":{"query_max_len":128,"passage_max_len":128,"padding":"max_length"},"Training":{"epochs":3}}' "$URL")
8081
HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://')
8182
RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g')
83+
FINTUNING_ID=$(echo "$RESPONSE_BODY" | jq -r '.id')
8284
SERVICE_NAME="finetuning-server - create finetuning job"
8385

86+
8487
if [ "$HTTP_STATUS" -ne "200" ]; then
8588
echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
8689
docker logs finetuning-server >> ${LOG_PATH}/finetuning-server_create.log
@@ -96,10 +99,80 @@ function validate_microservice() {
9699
echo "[ $SERVICE_NAME ] Content is as expected."
97100
fi
98101

99-
sleep 10m
102+
# test /v1/fine_tuning/jobs/retrieve
103+
URL="http://${ip_address}:$finetuning_service_port/v1/fine_tuning/jobs/retrieve"
104+
for((i=1;i<=10;i++));
105+
do
106+
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": "'$FINTUNING_ID'"}' "$URL")
107+
echo $HTTP_RESPONSE
108+
RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g')
109+
STATUS=$(echo "$RESPONSE_BODY" | jq -r '.status')
110+
if [[ "$STATUS" == "succeeded" ]]; then
111+
echo "training: succeeded."
112+
break
113+
elif [[ "$STATUS" == "failed" ]]; then
114+
echo "training: failed."
115+
exit 1
116+
else
117+
echo "training: '$STATUS'"
118+
fi
119+
sleep 1m
120+
done
121+
122+
# test /v1/finetune/list_checkpoints
123+
URL="http://${ip_address}:$finetuning_service_port/v1/finetune/list_checkpoints"
124+
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": "'$FINTUNING_ID'"}' "$URL")
125+
echo $HTTP_RESPONSE
126+
RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g')
127+
fine_tuned_model_checkpoint=$(echo "$RESPONSE_BODY" | jq -r '.[0].fine_tuned_model_checkpoint')
128+
echo $fine_tuned_model_checkpoint
129+
130+
echo "start resume checkpoint............................................."
131+
# resume checkpoint /v1/fine_tuning/jobs
132+
URL="http://${ip_address}:$finetuning_service_port/v1/fine_tuning/jobs"
133+
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H 'Content-Type: application/json' -d '{"training_file": "test_embed_data.json","model": "BAAI/bge-base-en-v1.5","General":{"task":"embedding","lora_cofig":null,"save_strategy":"epoch","resume_from_checkpoint":"'$fine_tuned_model_checkpoint'"},"Dataset":{"query_max_len":128,"passage_max_len":128,"padding":"max_length"},"Training":{"epochs":5}}' "$URL")
134+
HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://')
135+
RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g')
136+
FINTUNING_ID=$(echo "$RESPONSE_BODY" | jq -r '.id')
137+
SERVICE_NAME="finetuning-server - resume checkpoint"
138+
139+
140+
if [ "$HTTP_STATUS" -ne "200" ]; then
141+
echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
142+
docker logs finetuning-server >> ${LOG_PATH}/finetuning-server_create.log
143+
exit 1
144+
else
145+
echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
146+
fi
147+
if [[ "$RESPONSE_BODY" != *'{"id":"ft-job'* ]]; then
148+
echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY"
149+
docker logs finetuning-server >> ${LOG_PATH}/finetuning-server_create.log
150+
exit 1
151+
else
152+
echo "[ $SERVICE_NAME ] Content is as expected."
153+
fi
154+
155+
# check training status /v1/fine_tuning/jobs/retrieve
156+
URL="http://${ip_address}:$finetuning_service_port/v1/fine_tuning/jobs/retrieve"
157+
for((i=1;i<=10;i++));
158+
do
159+
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": "'$FINTUNING_ID'"}' "$URL")
160+
echo $HTTP_RESPONSE
161+
RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g')
162+
STATUS=$(echo "$RESPONSE_BODY" | jq -r '.status')
163+
if [[ "$STATUS" == "succeeded" ]]; then
164+
echo "training: succeeded."
165+
break
166+
elif [[ "$STATUS" == "failed" ]]; then
167+
echo "training: failed."
168+
exit 1
169+
else
170+
echo "training: '$STATUS'"
171+
fi
172+
sleep 1m
173+
done
174+
100175

101-
# get logs
102-
docker logs finetuning-server >> ${LOG_PATH}/finetuning-server_create.log
103176
}
104177

105178
function stop_docker() {

0 commit comments

Comments
 (0)