Skip to content

Commit b8ef563

Browse files
committed
[Iluvatar GPU] Fix CI failure caused by rebuild_padding params change
1 parent a498736 commit b8ef563

File tree

2 files changed

+90
-39
lines changed

2 files changed

+90
-39
lines changed

fastdeploy/model_executor/pre_and_post_process.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -732,7 +732,9 @@ def rebuild_padding(
732732
seq_lens_decoder,
733733
seq_lens_encoder,
734734
output_padding_offset,
735+
first_token_out,
735736
max_input_length,
737+
enable_logprob,
736738
)
737739
elif current_platform.is_gcu():
738740
from fastdeploy.model_executor.ops.gcu import rebuild_padding
Lines changed: 88 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,42 +1,91 @@
1+
import functools
import sys
import threading
import traceback

from fastdeploy import LLM, SamplingParams
from fastdeploy.utils import set_random_seed
37

4-
set_random_seed(123)
5-
6-
prompts = [
7-
"Hello, my name is",
8-
]
9-
10-
# 采样参数
11-
sampling_params = SamplingParams(temperature=0.8, top_p=0.00001, max_tokens=16)
12-
13-
# 加载模型
14-
llm = LLM(
15-
model="/data1/fastdeploy/ERNIE_300B_4L",
16-
tensor_parallel_size=8,
17-
max_model_len=8192,
18-
quantization="wint8",
19-
block_size=16,
20-
)
21-
22-
# 批量进行推理(llm内部基于资源情况进行请求排队、动态插入处理)
23-
outputs = llm.generate(prompts, sampling_params)
24-
25-
assert outputs[0].outputs.token_ids == [
26-
23768,
27-
97000,
28-
47814,
29-
59335,
30-
68170,
31-
183,
32-
49080,
33-
94717,
34-
82966,
35-
99140,
36-
31615,
37-
51497,
38-
94851,
39-
60764,
40-
10889,
41-
2,
42-
], f"{outputs[0].outputs.token_ids}"
8+
9+
def timeout(seconds):
    """Decorator that aborts the wrapped call with TimeoutError after *seconds*.

    The decorated function runs in a daemon thread; if it has not finished
    within the allotted time, TimeoutError is raised in the caller (the
    worker thread is left to die with the process). Any exception raised
    inside the worker is captured and re-raised in the caller.
    """

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Single mutable slot shared with the worker thread for the
            # return value and any captured exception.
            outcome = {"value": None, "error": None}

            def run():
                try:
                    outcome["value"] = func(*args, **kwargs)
                except Exception as exc:
                    outcome["error"] = exc

            worker = threading.Thread(target=run)
            worker.daemon = True
            worker.start()
            worker.join(seconds)

            if worker.is_alive():
                raise TimeoutError(f"Function timed out after {seconds} seconds")
            if outcome["error"]:
                raise outcome["error"]
            return outcome["value"]

        return wrapper

    return decorator
38+
39+
40+
@timeout(60)
def offline_infer_check():
    """Run one offline generation and verify the exact token-id output.

    Loads the ERNIE_300B_4L checkpoint with wint8 quantization, generates
    a deterministic completion for a single prompt, and asserts that the
    produced token ids match the known-good sequence. Prints "PASSED" on
    success; aborts with TimeoutError if the check exceeds 60 seconds.
    """
    set_random_seed(123)

    prompts = [
        "Hello, my name is",
    ]

    # Near-greedy sampling (tiny top_p) so the output is reproducible.
    sampling_params = SamplingParams(temperature=0.8, top_p=0.00001, max_tokens=16)

    # Load the model; the LLM engine queues and dynamically batches
    # requests internally based on available resources.
    llm = LLM(
        model="/data1/fastdeploy/ERNIE_300B_4L",
        tensor_parallel_size=8,
        max_model_len=8192,
        quantization="wint8",
        block_size=16,
    )

    # Batched inference over all prompts.
    outputs = llm.generate(prompts, sampling_params)

    # Known-good token ids for this prompt with the fixed seed above.
    expected_token_ids = [
        23768,
        97000,
        47814,
        59335,
        68170,
        183,
        49080,
        94717,
        82966,
        99140,
        31615,
        51497,
        94851,
        60764,
        10889,
        2,
    ]
    actual_token_ids = outputs[0].outputs.token_ids
    assert actual_token_ids == expected_token_ids, f"{actual_token_ids}"
    print("PASSED")
82+
83+
84+
if __name__ == "__main__":
    # Exit codes for CI: 0 = success, 124 = timed out (mirrors the GNU
    # `timeout` utility's convention), 1 = any other failure such as the
    # token-id assertion.
    try:
        # offline_infer_check() returns None; no value to capture.
        offline_infer_check()
        sys.exit(0)
    except TimeoutError:
        sys.exit(124)
    except Exception:
        # Surface the failure reason in CI logs instead of exiting silently.
        traceback.print_exc()
        sys.exit(1)

0 commit comments

Comments
 (0)