Skip to content

Commit e6af9a4

Browse files
Knarf04 and valarLip authored
[MXFP4] Patch fp4_utils.py rounding logic following #975 (#2249)
* [MXFP4] Patch fp4_utils.py rounding logic following #975
* Fix formatting
* [MXFP4] Trim the padding for non-shuffled input

Co-authored-by: Lingpeng Jin <103567126+valarLip@users.noreply.github.com>
1 parent 8bd4656 commit e6af9a4

File tree

2 files changed

+108
-16
lines changed

2 files changed

+108
-16
lines changed

aiter/utility/fp4_utils.py

Lines changed: 53 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -296,30 +296,63 @@ def _dynamic_mxfp4_quant_kernel_asm_layout(
296296
# S101 -> +/- 3.0
297297
# S110 -> +/- 4.0
298298
# S111 -> +/- 6.0
299+
# FP4 format constants
300+
EXP_BIAS_FP32: tl.constexpr = 127
301+
EXP_BIAS_FP4: tl.constexpr = 1
302+
EBITS_F32: tl.constexpr = 8
303+
EBITS_FP4: tl.constexpr = 2
304+
MBITS_F32: tl.constexpr = 23
305+
MBITS_FP4: tl.constexpr = 1
306+
307+
max_normal: tl.constexpr = 6
308+
min_normal: tl.constexpr = 1
309+
299310
qx = qx.to(tl.uint32, bitcast=True)
300311

301-
# Extract sign, exponents and mantissa fields from FP32
312+
# Extract sign
302313
s = qx & 0x80000000
303-
e = (qx >> 23) & 0xFF
304-
m = qx & 0x7FFFFF
314+
# Set everything to positive, will add sign back at the end
315+
qx = qx ^ s
305316

306-
E8_BIAS: tl.constexpr = 127
307-
E2_BIAS: tl.constexpr = 1
317+
qx_fp32 = qx.to(tl.float32, bitcast=True)
318+
saturate_mask = qx_fp32 >= max_normal
319+
denormal_mask = (not saturate_mask) & (qx_fp32 < min_normal)
320+
normal_mask = not (saturate_mask | denormal_mask)
308321

309322
# Denormal numbers
310-
# If exponent is less than 127, then it's a denormal number
311-
# See above, for denormal number mantissa is always 1 and we set bit 1 of mantissa
312-
adjusted_exponents = tl.core.sub(E8_BIAS, e + 1, sanitize_overflow=False)
313-
m = tl.where(e < E8_BIAS, (0x400000 | (m >> 1)) >> adjusted_exponents, m)
323+
denorm_exp: tl.constexpr = (
324+
(EXP_BIAS_FP32 - EXP_BIAS_FP4) + (MBITS_F32 - MBITS_FP4) + 1
325+
)
326+
denorm_mask_int: tl.constexpr = denorm_exp << MBITS_F32
327+
denorm_mask_float: tl.constexpr = tl.cast(denorm_mask_int, tl.float32, bitcast=True)
328+
329+
denormal_x = qx_fp32 + denorm_mask_float
330+
denormal_x = denormal_x.to(tl.uint32, bitcast=True)
331+
denormal_x -= denorm_mask_int
332+
denormal_x = denormal_x.to(tl.uint8)
333+
334+
# Normal numbers
335+
normal_x = qx
336+
# resulting mantissa is odd
337+
mant_odd = (normal_x >> (MBITS_F32 - MBITS_FP4)) & 1
338+
# update exponent, rounding bias part 1
339+
val_to_add = ((EXP_BIAS_FP4 - EXP_BIAS_FP32) << MBITS_F32) + (1 << 21) - 1
340+
normal_x += val_to_add
341+
# rounding bias part 2
342+
normal_x += mant_odd
343+
# take the bits!
344+
normal_x = normal_x >> (MBITS_F32 - MBITS_FP4)
345+
normal_x = normal_x.to(tl.uint8)
314346

315-
# For normal numbers, bias is changed from 127 to 1, and for subnormals, we keep exponent as 0.
316-
# Note: E8_BIAS - E2_BIAS = 126, so for normals we subtract that.
317-
e = tl.maximum(e, E8_BIAS - E2_BIAS) - (E8_BIAS - E2_BIAS)
347+
# Merge results
348+
e2m1_value = tl.full(qx.type.get_block_shapes(), 0x7, dtype=tl.uint8)
349+
e2m1_value = tl.where(normal_mask, normal_x, e2m1_value)
350+
e2m1_value = tl.where(denormal_mask, denormal_x, e2m1_value)
318351

319-
# Combine sign, exponent, and mantissa, while saturating
320-
# rounding nearest with tie breaking up by adding +1 to one bit right of the LSB, then shift right
321-
e2m1_tmp = tl.minimum((((e << 2) | (m >> 21)) + 1) >> 1, 0x7)
322-
e2m1_value = ((s >> 28) | e2m1_tmp).to(tl.uint8)
352+
# add sign back
353+
sign_lp = s >> (MBITS_F32 + EBITS_F32 - MBITS_FP4 - EBITS_FP4)
354+
sign_lp = sign_lp.to(tl.uint8)
355+
e2m1_value = e2m1_value | sign_lp
323356

324357
e2m1_value = tl.reshape(e2m1_value, [BLOCK_SIZE, MXFP4_QUANT_BLOCK_SIZE // 2, 2])
325358
evens, odds = tl.split(e2m1_value)
@@ -422,6 +455,10 @@ def dynamic_mxfp4_quant(
422455
SHUFFLE=shuffle,
423456
)
424457

458+
if not shuffle:
459+
# Trim the padding if not shuffled
460+
blockscale_e8m0 = blockscale_e8m0[:M, :scaleN_valid].contiguous()
461+
425462
return (x_fp4.view(dtypes.fp4x2), blockscale_e8m0.view(dtypes.fp8_e8m0))
426463

427464

op_tests/triton_tests/quant/test_quant_mxfp4.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55
import pytest
66

77
from aiter.ops.triton.quant import dynamic_mxfp4_quant
8+
from aiter.utility.fp4_utils import (
9+
dynamic_mxfp4_quant as fp4_utils_dynamic_mxfp4_quant,
10+
)
811

912
DEBUG_MODE = False
1013

@@ -202,3 +205,55 @@ def test_dynamic_mxfp4_quant(M: int, N: int, dtype):
202205

203206
torch.testing.assert_close(triton_scale, torch_scale)
204207
torch.testing.assert_close(triton_out, torch_out)
208+
209+
210+
@pytest.mark.parametrize(
211+
"M, N",
212+
[
213+
(1, 4),
214+
(1, 28),
215+
(1, 32),
216+
(1, 64),
217+
(1, 68),
218+
(2, 4),
219+
(2, 28),
220+
(2, 32),
221+
(2, 64),
222+
(2, 68),
223+
(128, 4),
224+
(128, 28),
225+
(128, 32),
226+
(128, 64),
227+
(128, 68),
228+
(256, 32),
229+
(160, 40),
230+
(280, 20),
231+
],
232+
)
233+
@pytest.mark.parametrize("dtype", [torch.bfloat16])
234+
def test_fp4_utils_dynamic_mxfp4_quant(M: int, N: int, dtype):
235+
torch.cuda.empty_cache()
236+
torch.manual_seed(20)
237+
x = torch.randn((M, N), dtype=dtype, device="cuda")
238+
239+
if DEBUG_MODE:
240+
print(f"x.shape={x.shape} x={x}")
241+
242+
fp4_utils_out, fp4_utils_scale = fp4_utils_dynamic_mxfp4_quant(x)
243+
if DEBUG_MODE:
244+
print(
245+
f"fp4_utils_out.shape={fp4_utils_out.shape} fp4_utils_out={fp4_utils_out}"
246+
)
247+
print(
248+
f"fp4_utils_scale.shape={fp4_utils_scale.shape} fp4_utils_scale={fp4_utils_scale}"
249+
)
250+
251+
torch_out, torch_scale = torch_dynamic_mxfp4_quant(x)
252+
if DEBUG_MODE:
253+
print(f"torch_out.shape={torch_out.shape} torch_out={torch_out}")
254+
print(f"torch_scale.shape={torch_scale.shape} torch_scale={torch_scale}")
255+
256+
torch.testing.assert_close(
257+
fp4_utils_scale.view(torch.uint8).cpu(), torch_scale.cpu()
258+
)
259+
torch.testing.assert_close(fp4_utils_out.view(torch.uint8).cpu(), torch_out.cpu())

0 commit comments

Comments (0)