Skip to content

Commit da41d9d

Browse files
committed
xpu
1 parent b09ebb2 commit da41d9d

File tree

4 files changed

+13
-7
lines changed

4 files changed

+13
-7
lines changed

fastdeploy/model_executor/utils.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -353,6 +353,13 @@ def h2d_copy(dst, src, blocking=True):
353353
def v1_loader_support(fd_config):
354354
_v1_no_support_archs = ["Qwen2VLForConditionalGeneration"]
355355

356+
def _get_unsupported_quant():
357+
if current_platform.is_cuda():
358+
return {"w4a8", "w4afp8", "wint2"}
359+
elif current_platform.is_xpu():
360+
return {"w4a8", "w8a8"}
361+
return set()
362+
356363
def _err_msg(msg: str) -> str:
357364
logger.info(msg + "; fallback to the v0 loader for model loading.")
358365

@@ -375,7 +382,7 @@ def _err_msg(msg: str) -> str:
375382
else:
376383
moe_quant_type = fd_config.quant_config.name()
377384
dense_quant_type = fd_config.quant_config.name()
378-
unsupported_quant = {"w4a8", "w4afp8", "wint2"}
385+
unsupported_quant = _get_unsupported_quant()
379386

380387
if unsupported_quant & {moe_quant_type, dense_quant_type}:
381388
_err_msg("v1 loader currently does not support w4a8/w4afp8/wint2 quantization")

requirements_dcu.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ moviepy
2727
use-triton-in-paddle
2828
crcmod
2929
fastsafetensors==0.1.14
30+
safetensors==0.7.0rc0
3031
msgpack
3132
gunicorn
3233
opentelemetry-api>=1.24.0

requirements_metaxgpu.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ triton
2929
use-triton-in-paddle
3030
crcmod
3131
fastsafetensors==0.1.14
32+
safetensors==0.7.0rc0
3233
msgpack
3334
gunicorn
3435
modelscope

scripts/run_ci_xpu.sh

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
8484
--num-gpu-blocks-override 16384 \
8585
--max-model-len 32768 \
8686
--max-num-seqs 128 \
87-
--quantization wint4 \
88-
--load-choices default > server.log 2>&1 &
87+
--quantization wint4 > server.log 2>&1 &
8988

9089
sleep 60
9190
# 探活
@@ -160,8 +159,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
160159
--num-gpu-blocks-override 16384 \
161160
--max-model-len 32768 \
162161
--max-num-seqs 64 \
163-
--quantization "W4A8" \
164-
--load-choices default > server.log 2>&1 &
162+
--quantization "W4A8" > server.log 2>&1 &
165163

166164
sleep 60
167165
# 探活
@@ -239,8 +237,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
239237
--enable-mm \
240238
--mm-processor-kwargs '{"video_max_frames": 30}' \
241239
--limit-mm-per-prompt '{"image": 10, "video": 3}' \
242-
--reasoning-parser ernie-45-vl \
243-
--load-choices default > server.log 2>&1 &
240+
--reasoning-parser ernie-45-vl > server.log 2>&1 &
244241

245242
sleep 60
246243
# 探活

0 commit comments

Comments
 (0)