
Commit 5b3053f

lkk12014402 authored
refine logging code. (#559)
* add ut and refine logging code.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci

* update microservice port.

---------

Co-authored-by: root <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 3e87c3b commit 5b3053f

File tree

10 files changed, +32 -81 lines changed

comps/finetuning/README.md

Lines changed: 8 additions & 8 deletions
@@ -61,7 +61,7 @@ docker build -t opea/finetuning:latest --build-arg https_proxy=$https_proxy --bu
 Start docker container with below command:
 
 ```bash
-docker run -d --name="finetuning-server" -p 8005:8005 --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/finetuning:latest
+docker run -d --name="finetuning-server" -p 8015:8015 --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/finetuning:latest
 ```
 
 ## 2.2 Setup on Gaudi2
@@ -81,7 +81,7 @@ Start docker container with below command:
 
 ```bash
 export HF_TOKEN=${your_huggingface_token}
-docker run --runtime=habana -e HABANA_VISIBLE_DEVICES=all -p 8005:8005 -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host -e https_proxy=$https_proxy -e http_proxy=$http_proxy -e no_proxy=$no_proxy -e HF_TOKEN=$HF_TOKEN opea/finetuning-gaudi:latest
+docker run --runtime=habana -e HABANA_VISIBLE_DEVICES=all -p 8015:8015 -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host -e https_proxy=$https_proxy -e http_proxy=$http_proxy -e no_proxy=$no_proxy -e HF_TOKEN=$HF_TOKEN opea/finetuning-gaudi:latest
 ```
 
 # 🚀3. Consume Finetuning Service
@@ -92,10 +92,10 @@ Assuming a training file `alpaca_data.json` is uploaded, it can be downloaded in
 
 ```bash
 # upload a training file
-curl http://${your_ip}:8005/v1/finetune/upload_training_files -X POST -H "Content-Type: multipart/form-data" -F "files=@./alpaca_data.json"
+curl http://${your_ip}:8015/v1/finetune/upload_training_files -X POST -H "Content-Type: multipart/form-data" -F "files=@./alpaca_data.json"
 
 # create a finetuning job
-curl http://${your_ip}:8005/v1/fine_tuning/jobs \
+curl http://${your_ip}:8015/v1/fine_tuning/jobs \
   -X POST \
   -H "Content-Type: application/json" \
   -d '{
@@ -104,18 +104,18 @@ curl http://${your_ip}:8005/v1/fine_tuning/jobs \
 }'
 
 # list finetuning jobs
-curl http://${your_ip}:8005/v1/fine_tuning/jobs -X GET
+curl http://${your_ip}:8015/v1/fine_tuning/jobs -X GET
 
 # retrieve one finetuning job
-curl http://localhost:8005/v1/fine_tuning/jobs/retrieve -X POST -H "Content-Type: application/json" -d '{
+curl http://localhost:8015/v1/fine_tuning/jobs/retrieve -X POST -H "Content-Type: application/json" -d '{
   "fine_tuning_job_id": ${fine_tuning_job_id}}'
 
 # cancel one finetuning job
 
-curl http://localhost:8005/v1/fine_tuning/jobs/cancel -X POST -H "Content-Type: application/json" -d '{
+curl http://localhost:8015/v1/fine_tuning/jobs/cancel -X POST -H "Content-Type: application/json" -d '{
   "fine_tuning_job_id": ${fine_tuning_job_id}}'
 
 # list checkpoints of a finetuning job
-curl http://${your_ip}:8005/v1/finetune/list_checkpoints -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": ${fine_tuning_job_id}}'
+curl http://${your_ip}:8015/v1/finetune/list_checkpoints -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": ${fine_tuning_job_id}}'
 
 ```
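For illustration only (no such snippet is part of this commit): the renumbered endpoints can be exercised from Python just as well as from curl. A minimal sketch, assuming the `requests` package, a server reachable at `localhost:8015`, and the README's `alpaca_data.json`; the job-body fields are assumptions following the OpenAI-style fine-tuning API, since the diff above elides the actual JSON payload.

```python
# Illustrative only; not part of commit 5b3053f.
# Assumes: `pip install requests`, finetuning server on localhost:8015,
# and a local alpaca_data.json training file (as in the README above).
import requests

BASE = "http://localhost:8015"

# Upload a training file (multipart field name "files", per the curl example).
with open("alpaca_data.json", "rb") as f:
    resp = requests.post(f"{BASE}/v1/finetune/upload_training_files", files={"files": f})
resp.raise_for_status()

# Create a finetuning job. Body fields are assumed from the OpenAI-style
# fine-tuning API; the model name comes from MODEL_CONFIG_FILE_MAP below.
job = requests.post(
    f"{BASE}/v1/fine_tuning/jobs",
    json={"training_file": "alpaca_data.json", "model": "meta-llama/Llama-2-7b-chat-hf"},
).json()

# List jobs on the same renumbered port.
print(requests.get(f"{BASE}/v1/fine_tuning/jobs").json())
```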

comps/finetuning/datasets/.gitkeep

Whitespace-only changes.

comps/finetuning/finetuning_service.py

Lines changed: 6 additions & 6 deletions
@@ -20,28 +20,28 @@
 )
 
 
-@register_microservice(name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8005)
+@register_microservice(name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8015)
 def create_finetuning_jobs(request: FineTuningJobsRequest, background_tasks: BackgroundTasks):
     return handle_create_finetuning_jobs(request, background_tasks)
 
 
 @register_microservice(
-    name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8005, methods=["GET"]
+    name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8015, methods=["GET"]
 )
 def list_finetuning_jobs():
     return handle_list_finetuning_jobs()
 
 
 @register_microservice(
-    name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/retrieve", host="0.0.0.0", port=8005
+    name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/retrieve", host="0.0.0.0", port=8015
 )
 def retrieve_finetuning_job(request: FineTuningJobIDRequest):
     job = handle_retrieve_finetuning_job(request)
     return job
 
 
 @register_microservice(
-    name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/cancel", host="0.0.0.0", port=8005
+    name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/cancel", host="0.0.0.0", port=8015
 )
 def cancel_finetuning_job(request: FineTuningJobIDRequest):
     job = handle_cancel_finetuning_job(request)
@@ -52,7 +52,7 @@ def cancel_finetuning_job(request: FineTuningJobIDRequest):
     name="opea_service@finetuning",
     endpoint="/v1/finetune/upload_training_files",
     host="0.0.0.0",
-    port=8005,
+    port=8015,
 )
 async def upload_training_files(
     files: Optional[Union[UploadFile, List[UploadFile]]] = File(None),
@@ -69,7 +69,7 @@ async def upload_training_files(
 
 
 @register_microservice(
-    name="opea_service@finetuning", endpoint="/v1/finetune/list_checkpoints", host="0.0.0.0", port=8005
+    name="opea_service@finetuning", endpoint="/v1/finetune/list_checkpoints", host="0.0.0.0", port=8015
 )
 def list_checkpoints(request: FineTuningJobIDRequest):
     checkpoints = handle_list_finetuning_checkpoints(request)
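The PR description mentions an added UT, but no test file appears in this commit's file list, so the sketch below is hypothetical rather than the PR's actual test: a minimal pytest smoke check that the GET route registered above answers on the new port, assuming `pytest`, `requests`, and a server already running.

```python
# Hypothetical smoke test; the UT referenced in the PR description is not
# shown in this commit view. Assumes pytest + requests and a server that is
# already listening on localhost:8015.
import requests

BASE = "http://localhost:8015"


def test_list_finetuning_jobs_responds():
    # /v1/fine_tuning/jobs is registered above with methods=["GET"].
    resp = requests.get(f"{BASE}/v1/fine_tuning/jobs", timeout=10)
    assert resp.status_code == 200
```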

comps/finetuning/handlers.py

Lines changed: 7 additions & 4 deletions
@@ -12,6 +12,7 @@
 from pydantic_yaml import parse_yaml_raw_as, to_yaml_file
 from ray.job_submission import JobSubmissionClient
 
+from comps import CustomLogger
 from comps.cores.proto.api_protocol import (
     FineTuningJob,
     FineTuningJobIDRequest,
@@ -20,6 +21,8 @@
 )
 from comps.finetuning.llm_on_ray.finetune.finetune_config import FinetuneConfig
 
+logger = CustomLogger("finetuning_handlers")
+
 MODEL_CONFIG_FILE_MAP = {
     "meta-llama/Llama-2-7b-chat-hf": "./models/llama-2-7b-chat-hf.yaml",
     "mistralai/Mistral-7B-v0.1": "./models/mistral-7b-v0.1.yaml",
@@ -50,7 +53,7 @@ def update_job_status(job_id: FineTuningJobID):
         status = str(job_status).lower()
         # Ray status "stopped" is OpenAI status "cancelled"
         status = "cancelled" if status == "stopped" else status
-        print(f"Status of job {job_id} is '{status}'")
+        logger.info(f"Status of job {job_id} is '{status}'")
         running_finetuning_jobs[job_id].status = status
         if status == "finished" or status == "cancelled" or status == "failed":
             break
@@ -102,7 +105,7 @@ def handle_create_finetuning_jobs(request: FineTuningJobsRequest, background_tas
     )
     finetune_config.General.output_dir = os.path.join(JOBS_PATH, job.id)
     if os.getenv("DEVICE", ""):
-        print(f"specific device: {os.getenv('DEVICE')}")
+        logger.info(f"specific device: {os.getenv('DEVICE')}")
         finetune_config.Training.device = os.getenv("DEVICE")
 
     finetune_config_file = f"{JOBS_PATH}/{job.id}.yaml"
@@ -117,7 +120,7 @@ def handle_create_finetuning_jobs(request: FineTuningJobsRequest, background_tas
         # Path to the local directory that contains the script.py file
         runtime_env={"working_dir": "./"},
     )
-    print(f"Submitted Ray job: {ray_job_id} ...")
+    logger.info(f"Submitted Ray job: {ray_job_id} ...")
 
     running_finetuning_jobs[job.id] = job
     finetuning_job_to_ray_job[job.id] = ray_job_id
@@ -169,7 +172,7 @@ async def save_content_to_local_disk(save_path: str, content):
             content = await content.read()
             fout.write(content)
     except Exception as e:
-        print(f"Write file failed. Exception: {e}")
+        logger.info(f"Write file failed. Exception: {e}")
         raise Exception(status_code=500, detail=f"Write file {save_path} failed. Exception: {e}")
 
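Every logging change in this commit follows the one pattern visible above: create a single module-level `CustomLogger` and route messages through it instead of `print`. A minimal sketch of that pattern, assuming only the `CustomLogger(name)` constructor and `.info()` method that the diffs themselves use (the module name below is hypothetical):

```python
# Minimal sketch of the refactoring pattern applied across this commit.
# Only CustomLogger(name) and .info() are taken from the diffs above.
from comps import CustomLogger

logger = CustomLogger("my_module")  # hypothetical logger name


def report_status(job_id: str, status: str):
    # Before: print(f"Status of job {job_id} is '{status}'")
    # After: a named logger, so messages carry a consistent source and level.
    logger.info(f"Status of job {job_id} is '{status}'")
```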

comps/finetuning/jobs/.gitkeep

Whitespace-only changes.

comps/finetuning/lanuch.sh renamed to comps/finetuning/launch.sh

Lines changed: 2 additions & 2 deletions
@@ -2,11 +2,11 @@
 # SPDX-License-Identifier: Apache-2.0
 
 if [[ -n "$RAY_PORT" ]];then
-    export RAY_ADDRESS=http://127.0.0.1:$RAY_PORT
     ray start --head --port $RAY_PORT
 else
-    export RAY_ADDRESS=http://127.0.0.1:8265
     ray start --head
+    export RAY_PORT=8265
 fi
 
+export RAY_ADDRESS=http://127.0.0.1:$RAY_PORT
 python finetuning_service.py

comps/finetuning/llm_on_ray/common/__init__.py

Lines changed: 0 additions & 1 deletion
@@ -3,5 +3,4 @@
 #
 # Copyright 2023 The LLM-on-Ray Authors.
 
-from .logging import logger
 from .torch_config import TorchConfig

comps/finetuning/llm_on_ray/common/common.py

Lines changed: 3 additions & 1 deletion
@@ -7,7 +7,9 @@
 import importlib
 import os
 
-from .logging import logger
+from comps import CustomLogger
+
+logger = CustomLogger("llm_on_ray")
 
 
 def import_all_modules(basedir, prefix=None):

comps/finetuning/llm_on_ray/common/logging.py

Lines changed: 0 additions & 56 deletions
This file was deleted.

comps/finetuning/llm_on_ray/finetune/finetune.py

Lines changed: 6 additions & 3 deletions
@@ -23,10 +23,13 @@
 from ray.air.config import ScalingConfig
 from ray.train.torch import TorchTrainer
 
+from comps import CustomLogger
 from comps.finetuning.llm_on_ray import common
 from comps.finetuning.llm_on_ray.finetune.data_process import DataProcessor
 from comps.finetuning.llm_on_ray.finetune.finetune_config import FinetuneConfig
 
+logger = CustomLogger("llm_on_ray/finetune")
+
 
 def adapt_transformers_to_device(config: Dict):
     device = config["Training"]["device"]
@@ -332,10 +335,10 @@ def train_func(config: Dict[str, Any]):
 
     training_args, trainer = get_trainer(config, model, tokenizer, tokenized_dataset, data_collator)
 
-    common.logger.info("train start")
+    logger.info("train start")
     trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
     trainer.save_model()
-    common.logger.info("train finish")
+    logger.info("train finish")
 
 
 def get_finetune_config():
@@ -401,7 +404,7 @@ def main(external_config=None):
     else:
         ray.init(runtime_env=runtime_env)
 
-    common.logger.info(f"ray available resources = {ray.available_resources()}")
+    logger.info(f"ray available resources = {ray.available_resources()}")
     use_gpu = True if device == "gpu" else False
     scaling_config = ScalingConfig(
         num_workers=num_training_workers,
