Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
110 commits
Select commit Hold shift + click to select a range
b313689
fix issues with nvfp4 dense emulation in vllm (squash)
fxmarty-amd Mar 2, 2026
bc6ff39
address comments
fxmarty-amd Mar 2, 2026
14bc668
nvfp4 moe emulation support
fxmarty-amd Mar 2, 2026
a11d131
Merge branch 'upstream-nvfp4-simulation-support-rocm' into upstream-n…
fxmarty-amd Mar 2, 2026
95c6a4a
wip use TritonExperts
fxmarty-amd Mar 2, 2026
5a2cf8c
wip cleanup
fxmarty-amd Mar 2, 2026
0ea8f82
wip cleanup
fxmarty-amd Mar 2, 2026
d99373e
wip cleanup
fxmarty-amd Mar 2, 2026
7a5f2ba
fix activation quantization
fxmarty-amd Mar 2, 2026
457f9df
address comment
fxmarty-amd Mar 2, 2026
86d6316
aot weight dequantization
fxmarty-amd Mar 3, 2026
2cb040b
use emulation_dequantize_weights for quark OCP MX as well
fxmarty-amd Mar 3, 2026
7a67180
tiny fix
fxmarty-amd Mar 3, 2026
01b4dce
enable test on non-blackwell devices
fxmarty-amd Mar 3, 2026
aef916d
Merge branch 'upstream-nvfp4-simulation-support-moe' into upstream-nv…
fxmarty-amd Mar 3, 2026
c4aff81
add test
fxmarty-amd Mar 3, 2026
4710a00
add test
fxmarty-amd Mar 3, 2026
affdda7
support quark dense and moe nvfp4
fxmarty-amd Mar 3, 2026
da111bd
wip cleanup
fxmarty-amd Mar 3, 2026
0cc4207
bug fixes and add test
fxmarty-amd Mar 3, 2026
1d6c770
Merge branch 'main' into upstream-nvfp4-simulation-support-moe
fxmarty-amd Mar 4, 2026
cf189ef
cleanup
fxmarty-amd Mar 4, 2026
b83ea66
aot weight dequantization
fxmarty-amd Mar 4, 2026
b74afa8
use emulation_dequantize_weights for quark OCP MX as well
fxmarty-amd Mar 3, 2026
913824f
tiny fix
fxmarty-amd Mar 3, 2026
c473004
add test
fxmarty-amd Mar 3, 2026
43345ed
add test
fxmarty-amd Mar 3, 2026
6cc2a0d
Merge branch 'upstream-nvfp4-simulation-support-moe' into upstream-nv…
fxmarty-amd Mar 4, 2026
a8c7ee8
fix
fxmarty-amd Mar 4, 2026
ca2c2b8
Merge branch 'upstream-nvfp4-simulation-aot-weight-dequantization' in…
fxmarty-amd Mar 4, 2026
dbc5fb5
fix moe_mk.apply
fxmarty-amd Mar 4, 2026
6db0c7b
Merge branch 'main-upstream' into upstream-nvfp4-simulation-support-rocm
fxmarty-amd Mar 4, 2026
ec1f4b8
address comment
fxmarty-amd Mar 4, 2026
309cefb
Merge branch 'upstream-nvfp4-simulation-support-rocm' into upstream-n…
fxmarty-amd Mar 4, 2026
cca5040
fix
fxmarty-amd Mar 4, 2026
6f08a2d
Merge branch 'upstream-nvfp4-simulation-support-rocm' into upstream-n…
fxmarty-amd Mar 4, 2026
c7cfa6b
Merge branch 'upstream-nvfp4-simulation-aot-weight-dequantization' in…
fxmarty-amd Mar 4, 2026
0094cb9
important note about parallel layers
fxmarty-amd Mar 4, 2026
d440f75
fix wrong inversion
fxmarty-amd Mar 4, 2026
80b6b6c
Merge branch 'upstream-nvfp4-simulation-aot-weight-dequantization' in…
fxmarty-amd Mar 4, 2026
797b856
remove weight scale inversion
fxmarty-amd Mar 4, 2026
dc16065
use min for a13_scale
fxmarty-amd Mar 4, 2026
9007357
Merge branch 'main' into upstream-nvfp4-simulation-support-rocm
fxmarty-amd Mar 5, 2026
e7d72f5
address bowen's comments
fxmarty-amd Mar 6, 2026
e3a8ebd
Merge branch 'upstream-nvfp4-simulation-support-rocm' into upstream-n…
fxmarty-amd Mar 6, 2026
311d47d
linting
fxmarty-amd Mar 6, 2026
74e6eec
Merge branch 'upstream-nvfp4-simulation-support-rocm' into upstream-n…
fxmarty-amd Mar 6, 2026
bf46483
use a single global scale for a2 in MOE, following flashinfer default…
fxmarty-amd Mar 6, 2026
0b47522
do not modify test_blackwell_moe
fxmarty-amd Mar 6, 2026
4a5c5c1
fix test and typo
fxmarty-amd Mar 6, 2026
6ed0611
fix typo
fxmarty-amd Mar 6, 2026
80a37f6
Merge branch 'upstream-nvfp4-simulation-support-rocm' into upstream-n…
fxmarty-amd Mar 6, 2026
35c88a8
simplify test
fxmarty-amd Mar 6, 2026
d495ef7
Merge branch 'upstream-nvfp4-simulation-support-moe' into upstream-nv…
fxmarty-amd Mar 6, 2026
d439e80
remove outdated comment
fxmarty-amd Mar 6, 2026
de79775
Merge branch 'upstream-nvfp4-simulation-support-moe' into upstream-nv…
fxmarty-amd Mar 6, 2026
58b90f1
Merge branch 'upstream-nvfp4-simulation-aot-weight-dequantization' in…
fxmarty-amd Mar 6, 2026
a5da270
revert min change
fxmarty-amd Mar 6, 2026
2d9e65c
Merge branch 'main' into upstream-nvfp4-simulation-support-rocm
fxmarty-amd Mar 24, 2026
c6791f7
address Michael's comments
fxmarty-amd Mar 26, 2026
1fa136e
Merge branch 'upstream-nvfp4-simulation-support-rocm' into upstream-n…
fxmarty-amd Mar 30, 2026
56dd2bf
Merge branch 'main' into upstream-nvfp4-simulation-support-rocm
fxmarty-amd Apr 1, 2026
ad93d2a
linting
fxmarty-amd Apr 1, 2026
0d788d8
Merge branch 'upstream-nvfp4-simulation-support-rocm' into upstream-n…
fxmarty-amd Apr 1, 2026
e8a596f
Update vllm/model_executor/layers/quantization/compressed_tensors/sch…
fxmarty-amd Apr 1, 2026
c6adfe8
Update vllm/model_executor/layers/quantization/compressed_tensors/sch…
fxmarty-amd Apr 1, 2026
e36296a
move unsupported reasons warning in is_backend_supported
fxmarty-amd Apr 1, 2026
33f118f
Merge branch 'upstream-nvfp4-simulation-support-rocm' of https://gith…
fxmarty-amd Apr 1, 2026
44aadca
fix input
fxmarty-amd Apr 1, 2026
3f36269
Merge branch 'upstream-nvfp4-simulation-support-rocm' into upstream-n…
fxmarty-amd Apr 2, 2026
911b316
address Michael's comments
fxmarty-amd Apr 2, 2026
90a54e3
simulation -> emulation
fxmarty-amd Apr 2, 2026
74b9212
linting
fxmarty-amd Apr 2, 2026
d930b84
Merge branch 'main' into upstream-nvfp4-simulation-support-rocm
fxmarty-amd Apr 2, 2026
24ec4ce
pre-commit passes locally and should not take 50min
fxmarty-amd Apr 2, 2026
58439aa
Merge branch 'upstream-nvfp4-simulation-support-rocm' into upstream-n…
fxmarty-amd Apr 3, 2026
34fba54
Merge branch 'upstream-nvfp4-simulation-support-moe' into upstream-nv…
fxmarty-amd Apr 3, 2026
70b2d5d
remove unnecessary changes
fxmarty-amd Apr 3, 2026
0b6b325
fix
fxmarty-amd Apr 3, 2026
0b2de40
fix
fxmarty-amd Apr 3, 2026
8e61be3
Merge branch 'main' into upstream-nvfp4-simulation-support-moe
fxmarty-amd Apr 8, 2026
f2204ce
refactor OCP MX MOE emulation and address comment about moe_kernel_qu…
fxmarty-amd Apr 8, 2026
ca07f68
move to experts subfolder
fxmarty-amd Apr 8, 2026
223c275
simplifications
fxmarty-amd Apr 9, 2026
d8e9283
linting
fxmarty-amd Apr 9, 2026
1e1d139
Merge branch 'main' into upstream-nvfp4-simulation-support-moe
fxmarty-amd Apr 9, 2026
757c1bc
Merge branch 'upstream-nvfp4-simulation-support-moe' into upstream-nv…
fxmarty-amd Apr 9, 2026
68257c5
remove unnecessary changes
fxmarty-amd Apr 9, 2026
2b74e98
fix issues
fxmarty-amd Apr 9, 2026
896ef66
linting
fxmarty-amd Apr 9, 2026
3663f59
fix quant_dtype
fxmarty-amd Apr 9, 2026
28ef57d
outdated comment
fxmarty-amd Apr 9, 2026
adfb9da
precise comment about maybe_roundup_sizes
fxmarty-amd Apr 13, 2026
9513361
add Qwen3-30B-A3B-NVFP4, Qwen3.5-35B-A3B-MXFP4-TP2 to gfx942 tests
fxmarty-amd Apr 13, 2026
c06e387
Merge branch 'main' into upstream-nvfp4-simulation-support-moe
fxmarty-amd Apr 13, 2026
df32bf3
Merge branch 'upstream-nvfp4-simulation-support-moe' of https://githu…
fxmarty-amd Apr 13, 2026
58c499b
Merge branch 'upstream-nvfp4-simulation-support-moe' into upstream-nv…
fxmarty-amd Apr 13, 2026
dee3b31
update to use the kernel abstraction
fxmarty-amd Apr 13, 2026
4e7ab24
Merge branch 'main' into upstream-nvfp4-simulation-support-moe
fxmarty-amd Apr 15, 2026
bfc4f90
address comment
fxmarty-amd Apr 15, 2026
1e914e9
Merge branch 'main' into upstream-nvfp4-simulation-support-moe
fxmarty-amd Apr 16, 2026
ace24ac
Merge branch 'upstream-nvfp4-simulation-support-moe' into upstream-nv…
fxmarty-amd Apr 16, 2026
970797f
Merge branch 'main' into upstream-nvfp4-simulated-quark
fxmarty-amd Apr 16, 2026
f8cbdc2
Merge branch 'main' into upstream-nvfp4-simulated-quark
fxmarty-amd Apr 23, 2026
42518a1
Merge branch 'main' into upstream-nvfp4-simulated-quark
fxmarty-amd May 4, 2026
a6dd538
address review comment - precise warning
fxmarty-amd May 4, 2026
a1814ad
fix typo
fxmarty-amd May 4, 2026
41be38d
Merge branch 'main' into upstream-nvfp4-simulated-quark
fxmarty-amd May 5, 2026
4c729d3
remove outdated comments
fxmarty-amd May 5, 2026
6a8352d
retrigger ci: docker: Error response from daemon: failed to create ta…
fxmarty-amd May 5, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 54 additions & 2 deletions tests/quantization/test_quark.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,8 +240,13 @@ def get_model_args(
not QUARK_MXFP4_AVAILABLE,
reason=f"amd-quark>={QUARK_MXFP4_MIN_VERSION} is not available",
)
@pytest.mark.parametrize("config", WIKITEXT_ACCURACY_CONFIGS)
@pytest.mark.parametrize("tp_size", [1, 2])
@pytest.mark.parametrize(
"config",
[pytest.param(val, id=f"config:{val}") for val in WIKITEXT_ACCURACY_CONFIGS],
)
@pytest.mark.parametrize(
"tp_size", [pytest.param(val, id=f"tp_size:{val}") for val in [1, 2]]
)
def test_ocp_mx_wikitext_correctness(config: AccuracyTestConfig, tp_size: int):
device_count = torch.accelerator.device_count()
if device_count < tp_size:
Expand All @@ -268,6 +273,53 @@ def test_ocp_mx_wikitext_correctness(config: AccuracyTestConfig, tp_size: int):
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"


@pytest.mark.skipif(
    not QUARK_MXFP4_AVAILABLE,
    reason=f"amd-quark>={QUARK_MXFP4_MIN_VERSION} is not available",
)
@pytest.mark.parametrize(
    "tp_size", [pytest.param(val, id=f"tp_size:{val}") for val in [1, 2]]
)
def test_nvfp4_wikitext_correctness(tp_size: int):
    """Check wikitext word perplexity of a Quark NVFP4 model against a reference.

    The reference perplexity was measured on nvidia/Qwen3-30B-A3B-NVFP4; the
    run passes if the measured perplexity lies within ``tolerance`` of it.
    """
    device_count = torch.accelerator.device_count()
    if device_count < tp_size:
        pytest.skip(f"This test requires >={tp_size} gpus, got only {device_count}")

    # NOTE: expected_value from nvidia/Qwen3-30B-A3B-NVFP4
    expected_value = 11.2391
    # Absolute tolerance on the word perplexity (not a relative tolerance).
    tolerance = 0.25

    model_name = "amd-quark/Qwen3-30B-A3B-nvfp4-quark"
    task = "wikitext"

    config = AccuracyTestConfig(
        model_name=model_name,
        # The field is spelled "excepted_value" in AccuracyTestConfig.
        excepted_value=expected_value,
    )

    # Smaller cudagraph_capture_sizes to speed up the test.
    model_args = config.get_model_args(
        tp_size=tp_size,
        kwargs={
            "cudagraph_capture_sizes": [16],
        },
    )
    model_args.pop("add_bos_token")

    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        tasks=task,
        batch_size=64,
    )

    measured_value = results["results"][task]["word_perplexity,none"]
    assert abs(measured_value - expected_value) < tolerance, (
        f"Expected: {expected_value} | Measured: {measured_value}"
    )


@pytest.mark.parametrize("config", GSM8K_ACCURACY_CONFIGS)
@pytest.mark.skipif(
not QUARK_MXFP4_AVAILABLE,
Expand Down
53 changes: 52 additions & 1 deletion vllm/model_executor/layers/quantization/quark/quark.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
QuarkMoEMethod,
)
from vllm.model_executor.layers.quantization.quark.schemes import (
QuarkNVFP4,
QuarkOCP_MX,
QuarkScheme,
QuarkW4A8_MXFP4_FP8,
Expand Down Expand Up @@ -395,6 +396,54 @@ def _is_dynamic_per_token_w8a8(
and is_weight_symmetric
)

def _is_nvfp4(
self,
weight_quant: dict[str, Any] | list[dict[str, Any]] | None,
input_quant: dict[str, Any] | list[dict[str, Any]] | None,
) -> bool:
# Confirm weights and input quantized.
if weight_quant is None or input_quant is None:
return False

# Confirm both weight_quant and input_quant are lists with 2 elements
if not isinstance(weight_quant, list) or len(weight_quant) != 2:
return False
if not isinstance(input_quant, list) or len(input_quant) != 2:
return False

# First element should be fp4 with per_group quantization
is_fp4_per_group_weight = (
weight_quant[0].get("dtype") == "fp4"
and weight_quant[0].get("qscheme") == "per_group"
and weight_quant[0].get("group_size") == 16
and not weight_quant[0].get("is_dynamic")
)
is_fp4_per_group_input = (
input_quant[0].get("dtype") == "fp4"
and input_quant[0].get("qscheme") == "per_group"
and input_quant[0].get("group_size") == 16
and input_quant[0].get("is_dynamic")
)

# Second element should be fp8_e4m3 with per_tensor quantization
is_fp8_per_tensor_weight = (
weight_quant[1].get("dtype") == "fp8_e4m3"
and weight_quant[1].get("qscheme") == "per_tensor"
and not weight_quant[1].get("is_dynamic")
)
is_fp8_per_tensor_input = (
input_quant[1].get("dtype") == "fp8_e4m3"
and input_quant[1].get("qscheme") == "per_tensor"
and not input_quant[1].get("is_dynamic")
)

return (
is_fp4_per_group_weight # type: ignore[return-value]
and is_fp4_per_group_input
and is_fp8_per_tensor_weight
and is_fp8_per_tensor_input
)

def _is_w_ocp_mx_a_x(
self, weight_quant: dict[str, Any] | None, input_quant: dict[str, Any] | None
) -> bool:
Expand Down Expand Up @@ -543,7 +592,9 @@ def _get_scheme_from_config(
weight_config = cast(dict[str, Any], config.get("weight"))
input_config = cast(dict[str, Any], config.get("input_tensors"))

if self._is_fp8_w8a8(weight_config, input_config):
if self._is_nvfp4(weight_config, input_config):
return QuarkNVFP4()
elif self._is_fp8_w8a8(weight_config, input_config):
is_fp8_w8a8_supported = self._check_scheme_supported(
QuarkW8A8Fp8.get_min_capability(), error=False
)
Expand Down
Loading
Loading