Commit e6e36a0

chore: upgrade flashinfer v0.2.6.post1 jit (sgl-project#6958)

Authored and committed by zhyncs, Alcanderian, Qiaolin-Yu, Fridge003, and mickqian.

Co-authored-by: alcanderian <[email protected]>
Co-authored-by: Qiaolin Yu <[email protected]>
Co-authored-by: Baizhou Zhang <[email protected]>
Co-authored-by: Mick <[email protected]>
Co-authored-by: ispobock <[email protected]>

1 parent: dfa836d

File tree: 14 files changed, +189 −27 lines

.github/workflows/vllm-dependency-test.yml

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ jobs:
       - name: Install dependencies
         run: |
           bash scripts/ci_install_dependency.sh
-          pip install "vllm==0.8.4"
+          pip install "vllm==0.9.0.1"
           pip install "bitsandbytes>=0.44.0"

      - name: Run VLLM dependency tests

lmms-eval

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Subproject commit 514082ea326d903f7dfed9ec04bdbc70b7018015

python/pyproject.toml

Lines changed: 8 additions & 6 deletions
@@ -49,10 +49,11 @@ runtime_common = [

 srt = [
     "sglang[runtime_common]",
-    "sgl-kernel==0.1.6.post1",
-    "flashinfer_python==0.2.5",
-    "torch==2.6.0",
-    "torchvision==0.21.0",
+    "sgl-kernel==0.1.7",
+    "flashinfer_python==0.2.6.post1",
+    "torch==2.7.1",
+    "torchaudio==2.7.1",
+    "torchvision==0.22.1",
     "cuda-python",
     "outlines>=0.0.44,<=0.1.11",
     "einops",
@@ -61,12 +62,13 @@ srt = [
 blackwell = [
     "sglang[runtime_common]",
     "sgl-kernel",
-    "torch==2.7.0",
+    "torch==2.7.1",
+    "torchaudio==2.7.1",
     "torchvision==0.22.0",
     "cuda-python",
     "outlines>=0.0.44,<=0.1.11",
     "einops",
-    "flashinfer_python==0.2.5",
+    "flashinfer_python==0.2.6.post1",
 ]

 # HIP (Heterogeneous-computing Interface for Portability) for AMD

python/sglang/srt/entrypoints/engine.py

Lines changed: 2 additions & 2 deletions
@@ -571,15 +571,15 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.2.5",
+            "0.2.6.post1",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
         )
     if _is_cuda:
         assert_pkg_version(
             "sgl-kernel",
-            "0.1.6.post1",
+            "0.1.7",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )
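
Note: the two `assert_pkg_version` calls above gate server startup on the new minimum versions. As a rough sketch of what such a gate does (an illustration built on `importlib.metadata` and `packaging`, not sglang's actual implementation):

from importlib.metadata import PackageNotFoundError, version

from packaging.version import Version


def assert_pkg_version(pkg: str, min_version: str, help_msg: str) -> None:
    """Raise if `pkg` is missing or older than `min_version`."""
    try:
        installed = Version(version(pkg))
    except PackageNotFoundError:
        raise RuntimeError(f"{pkg} is not installed. {help_msg}")
    if installed < Version(min_version):
        raise RuntimeError(
            f"{pkg}=={installed} is too old (need >= {min_version}). {help_msg}"
        )


assert_pkg_version(
    "flashinfer_python",
    "0.2.6.post1",
    "See https://docs.flashinfer.ai/installation.html.",
)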

Lines changed: 146 additions & 0 deletions

@@ -0,0 +1,146 @@
+{
+  "1": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "2": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "4": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "8": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "16": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "24": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "32": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "48": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "64": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "96": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "128": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "256": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 5
+  },
+  "512": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "1024": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "1536": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "2048": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "3072": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "4096": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 8,
+    "num_stages": 4
+  }
+}
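
Note: the new JSON file above (its path is clipped from this page, but the shape matches the fused MoE Triton tuning configs) maps a token count to Triton launch parameters tuned for that batch size. A minimal sketch of how such a table might be consumed, assuming (as in similar MoE kernels) that the nearest tuned key is picked at runtime; the function names here are hypothetical:

import json


def load_moe_config(path):
    # Keys in the JSON are token counts (M) stored as strings.
    with open(path) as f:
        return {int(m): cfg for m, cfg in json.load(f).items()}


def pick_config(configs, m):
    # Fall back to the tuned batch size closest to the actual token count.
    best_m = min(configs, key=lambda tuned_m: abs(tuned_m - m))
    return configs[best_m]


# e.g. pick_config(load_moe_config("config.json"), m=100) would select the
# entry tuned for M=96.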

python/sglang/srt/layers/moe/fused_moe_triton/layer.py

Lines changed: 1 addition & 0 deletions
@@ -316,6 +316,7 @@ def __init__(
         if params_dtype is None:
             params_dtype = torch.get_default_dtype()

+        self.hidden_size = hidden_size
         self.tp_size = (
             tp_size if tp_size is not None else get_tensor_model_parallel_world_size()
         )

python/sglang/srt/layers/multimodal.py

Lines changed: 3 additions & 3 deletions
@@ -32,8 +32,8 @@ def hash_kernel(
     offsets = block_start + tl.arange(0, BLOCK_SIZE)
     mask = offsets < n_elements

-    data = tl.load(input_ptr + offsets, mask=mask, other=0)
-    mixed = data ^ (offsets + XCONST)
+    data = tl.load(input_ptr + offsets, mask=mask, other=0).to(tl.int64)
+    mixed = data ^ (offsets.to(tl.int64) + XCONST)
     hash_val = mixed * PRIME
     hash_val = hash_val ^ (hash_val >> 16)
     hash_val = hash_val * (PRIME ^ XCONST)
@@ -53,7 +53,7 @@ def gpu_tensor_hash(tensor: torch.Tensor) -> int:
     BLOCK_SIZE = 1024
     grid = (triton.cdiv(n, BLOCK_SIZE),)

-    intermediate_hashes = torch.empty(n, dtype=torch.int32, device=tensor.device)
+    intermediate_hashes = torch.empty(n, dtype=torch.int64, device=tensor.device)

     hash_kernel[grid](
         tensor,
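
Note: this change widens the loaded data and the offsets to `tl.int64` before mixing, and the intermediate buffer to `torch.int64`, so the multiply-xor steps no longer wrap at 32 bits. A pure-Python sketch of the mixing steps visible in the hunk, with explicit 64-bit wrapping; the `PRIME` and `XCONST` values below are placeholders, not the kernel's actual constants:

MASK64 = (1 << 64) - 1
PRIME = 0x9E3779B1   # placeholder multiplier
XCONST = 0x85EBCA6B  # placeholder xor constant


def mix(data: int, offset: int) -> int:
    # Mirrors the kernel: xor with a salted offset, multiply,
    # shift-xor, then multiply again.
    mixed = (data ^ (offset + XCONST)) & MASK64
    h = (mixed * PRIME) & MASK64
    h ^= h >> 16
    h = (h * (PRIME ^ XCONST)) & MASK64
    return h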

python/sglang/srt/layers/quantization/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -114,7 +114,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
     if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
         raise ValueError(
             f"{quantization} quantization requires some operators from vllm. "
-            "Please install vllm by `pip install vllm==0.8.4`"
+            "Please install vllm by `pip install vllm==0.9.0.1`"
         )

     return QUANTIZATION_METHODS[quantization]
@@ -316,7 +316,7 @@ def new_apply(
         if correction_bias is not None:
             if not has_correction_bias:
                 raise ValueError(
-                    "Please increase the version of your vllm. Try `pip install vllm==0.8.4`"
+                    "Please increase the version of your vllm. Try `pip install vllm==0.9.0.1`"
                 )
             kwargs["e_score_correction_bias"] = correction_bias
         return original_apply(**kwargs)
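
Note: the second hunk lives in a compatibility shim that forwards `correction_bias` to vllm only when the installed version accepts it. A hedged sketch of that wrapper pattern (the names follow the diff; probing the signature with `inspect` is an assumption about how `has_correction_bias` could be derived):

import inspect


def wrap_apply(original_apply):
    # Assume support is detected by whether the wrapped function takes
    # an `e_score_correction_bias` keyword.
    has_correction_bias = (
        "e_score_correction_bias" in inspect.signature(original_apply).parameters
    )

    def new_apply(correction_bias=None, **kwargs):
        if correction_bias is not None:
            if not has_correction_bias:
                raise ValueError(
                    "Please increase the version of your vllm. "
                    "Try `pip install vllm==0.9.0.1`"
                )
            kwargs["e_score_correction_bias"] = correction_bias
        return original_apply(**kwargs)

    return new_apply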

python/sglang/test/test_utils.py

Lines changed: 0 additions & 1 deletion
@@ -81,7 +81,6 @@
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4,hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
 DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST = "Qwen/Qwen2.5-VL-3B-Instruct"
-DEFAULT_VLM_CHAT_TEMPLATE_FOR_TEST = "qwen2-vl"

 DEFAULT_IMAGE_URL = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
 DEFAULT_VIDEO_URL = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"

scripts/ci_install_dependency.sh

Lines changed: 12 additions & 3 deletions
@@ -10,15 +10,18 @@ bash "${SCRIPT_DIR}/killall_sglang.sh"
 pip install --upgrade pip

 # Clean up existing installations
-pip uninstall -y flashinfer flashinfer_python sgl-kernel sglang vllm
-pip cache purge
+pip uninstall -y flashinfer flashinfer_python sgl-kernel sglang vllm || true
+pip cache purge || true
 rm -rf /root/.cache/flashinfer
 rm -rf /usr/local/lib/python3.10/dist-packages/flashinfer*
 rm -rf /usr/local/lib/python3.10/dist-packages/sgl_kernel*

 # Install the main package
 pip install -e "python[dev]"

+# Show current packages
+pip list
+
 # Install additional dependencies
 pip install mooncake-transfer-engine==0.3.2.post1 nvidia-cuda-nvrtc-cu12

@@ -27,7 +30,13 @@ git clone --branch v0.3.3 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eva
 pip install -e lmms-eval/

 # Install FlashMLA for attention backend tests
-pip install git+https://github.com/deepseek-ai/FlashMLA.git
+# pip install git+https://github.com/deepseek-ai/FlashMLA.git

 # Install hf_xet
 pip install huggingface_hub[hf_xet]
+
+# Install xformers
+pip install -U xformers --index-url https://download.pytorch.org/whl/cu126 --no-deps --force-reinstall
+
+# Show current packages
+pip list
