[OPUS] Add finfo class for float-valued type properties (#2330)

carlushuang · web-flow · commit cb0b0c8817fe · 2026-03-19T14:19:37.000+08:00
* [OPUS] Add finfo class for float-valued type properties (eps/max/min/tiny/bits)

Supports fp32, fp16, bf16, fp8, bf8, fp4, e8m0 with gfx950/gfx942 specializations.
Verified bitwise against torch.finfo on both MI355 (gfx950) and MI308 (gfx942).

* [OPUS] Use explicit opus:: namespace in test_finfo.cu

* [OPUS] Apply black formatting and update README compile times

* [OPUS] Use __gfx942__ guard instead of __gfx950__ in numeric_limits and finfo
diff --git a/csrc/include/opus/opus.hpp b/csrc/include/opus/opus.hpp
@@ -828,7 +828,8 @@ REGISTER_DTYPE(i8  , signed char)
 REGISTER_DTYPE(u8  , unsigned char)
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
-// numeric_limits -- min/max/lowest/quiet_nan/infinity for all registered dtypes
+// numeric_limits -- returns min/max/lowest/quiet_nan/infinity in the *original* dtype
+// (see finfo below for float-valued properties like eps/max/min/tiny)
 template<typename T> struct numeric_limits;
 
 template<> struct numeric_limits<fp32_t> {
@@ -858,10 +859,10 @@ template<> struct numeric_limits<bf16_t> {
 // fp8 E4M3: gfx950=OCP(ieee-like, NaN=0x7F), gfx942=fnuz(NaN=0x80). No infinity in either format.
 // NOTE: __builtin_bit_cast with _BitInt(8) is not yet constexpr in clang, so use static_cast via signed char.
 template<> struct numeric_limits<fp8_t> {
-#if defined(__gfx950__)
-    static constexpr unsigned char bin_min = 0x08, bin_max = 0x7E, bin_lowest = 0xFE, bin_qnan = 0x7F, bin_inf = 0x00;
-#else
+#if defined(__gfx942__)
     static constexpr unsigned char bin_min = 0x08, bin_max = 0x7F, bin_lowest = 0xFF, bin_qnan = 0x80, bin_inf = 0x00;
+#else
+    static constexpr unsigned char bin_min = 0x08, bin_max = 0x7E, bin_lowest = 0xFE, bin_qnan = 0x7F, bin_inf = 0x00;
 #endif
     OPUS_H_D static constexpr fp8_t min()       { return static_cast<fp8_t>(static_cast<signed char>(bin_min)); }
     OPUS_H_D static constexpr fp8_t max()       { return static_cast<fp8_t>(static_cast<signed char>(bin_max)); }
@@ -871,10 +872,10 @@ template<> struct numeric_limits<fp8_t> {
 };
 // bf8 E5M2: gfx950=OCP(ieee, has inf=0x7C, NaN=0x7E), gfx942=fnuz(no inf, NaN=0x80)
 template<> struct numeric_limits<bf8_t> {
-#if defined(__gfx950__)
-    static constexpr unsigned char bin_min = 0x04, bin_max = 0x7B, bin_lowest = 0xFB, bin_qnan = 0x7F, bin_inf = 0x7C;
-#else
+#if defined(__gfx942__)
     static constexpr unsigned char bin_min = 0x04, bin_max = 0x7F, bin_lowest = 0xFF, bin_qnan = 0x80, bin_inf = 0x00;
+#else
+    static constexpr unsigned char bin_min = 0x04, bin_max = 0x7B, bin_lowest = 0xFB, bin_qnan = 0x7F, bin_inf = 0x7C;
 #endif
     OPUS_H_D static constexpr bf8_t min()       { return static_cast<bf8_t>(bin_min); }
     OPUS_H_D static constexpr bf8_t max()       { return static_cast<bf8_t>(bin_max); }
@@ -927,6 +928,61 @@ template<> struct numeric_limits<u8_t> {
     OPUS_H_D static constexpr u8_t infinity()  { return 0; }
 };
 
+///////////////////////////////////////////////////////////////////////////////////////////////////////////
+// finfo -- like torch.finfo: eps/max/min/tiny returned as float, bits as int (min is the most negative finite value for signed types, not the smallest positive one)
+template<typename T> struct finfo;
+
+template<> struct finfo<fp32_t> {  // fp32 properties; each accessor returns a float constant built from its raw 32-bit pattern
+    static constexpr int bits = 32;
+    OPUS_H_D static constexpr float eps()  { return __builtin_bit_cast(float, 0x34000000u); }  // 2^-23
+    OPUS_H_D static constexpr float max()  { return __builtin_bit_cast(float, 0x7F7FFFFFu); }  // 3.4028235e+38 (largest finite value)
+    OPUS_H_D static constexpr float min()  { return __builtin_bit_cast(float, 0xFF7FFFFFu); }  // -3.4028235e+38 (same bits as max() with the sign bit set, i.e. the most negative finite value)
+    OPUS_H_D static constexpr float tiny() { return __builtin_bit_cast(float, 0x00800000u); }  // 2^-126 (smallest positive normal fp32)
+};
+template<> struct finfo<fp16_t> {  // fp16 properties, each widened to float and stored as an fp32 bit pattern
+    static constexpr int bits = 16;
+    OPUS_H_D static constexpr float eps()  { return __builtin_bit_cast(float, 0x3A800000u); }  // 2^-10 = 9.765625e-4
+    OPUS_H_D static constexpr float max()  { return __builtin_bit_cast(float, 0x477FE000u); }  // 65504.0
+    OPUS_H_D static constexpr float min()  { return __builtin_bit_cast(float, 0xC77FE000u); }  // -65504.0 (equals -max(), not the smallest positive value)
+    OPUS_H_D static constexpr float tiny() { return __builtin_bit_cast(float, 0x38800000u); }  // 2^-14
+};
+template<> struct finfo<bf16_t> {  // bf16 properties, each widened to float and stored as an fp32 bit pattern
+    static constexpr int bits = 16;
+    OPUS_H_D static constexpr float eps()  { return __builtin_bit_cast(float, 0x3C000000u); }  // 2^-7 = 0.0078125
+    OPUS_H_D static constexpr float max()  { return __builtin_bit_cast(float, 0x7F7F0000u); }  // 3.389531e+38
+    OPUS_H_D static constexpr float min()  { return __builtin_bit_cast(float, 0xFF7F0000u); }  // -3.389531e+38 (equals -max(), not the smallest positive value)
+    OPUS_H_D static constexpr float tiny() { return __builtin_bit_cast(float, 0x00800000u); }  // 2^-126 (same bit pattern as the fp32 tiny())
+};
+// fp8 E4M3: fnuz (float8_e4m3fnuz, bias=8) only when compiling for gfx942; OCP (float8_e4m3fn, bias=7) for every other target, gfx950 included
+template<> struct finfo<fp8_t> {
+    static constexpr int bits = 8;
+    OPUS_H_D static constexpr float eps()  { return __builtin_bit_cast(float, 0x3E000000u); }  // 2^-3 = 0.125 (shared by both encodings)
+#if defined(__gfx942__)
+    OPUS_H_D static constexpr float max()  { return __builtin_bit_cast(float, 0x43700000u); }  // 240.0
+    OPUS_H_D static constexpr float min()  { return __builtin_bit_cast(float, 0xC3700000u); }  // -240.0 (equals -max())
+    OPUS_H_D static constexpr float tiny() { return __builtin_bit_cast(float, 0x3C000000u); }  // 2^-7 = 0.0078125
+#else
+    OPUS_H_D static constexpr float max()  { return __builtin_bit_cast(float, 0x43E00000u); }  // 448.0
+    OPUS_H_D static constexpr float min()  { return __builtin_bit_cast(float, 0xC3E00000u); }  // -448.0 (equals -max())
+    OPUS_H_D static constexpr float tiny() { return __builtin_bit_cast(float, 0x3C800000u); }  // 2^-6 = 0.015625
+#endif
+};
+// bf8 E5M2: fnuz (float8_e5m2fnuz, bias=16) only when compiling for gfx942; OCP (float8_e5m2, bias=15) for every other target, gfx950 included
+template<> struct finfo<bf8_t> {
+    static constexpr int bits = 8;
+#if defined(__gfx942__)
+    OPUS_H_D static constexpr float eps()  { return __builtin_bit_cast(float, 0x3E000000u); }  // 2^-3 = 0.125; NOTE(review): 2 mantissa bits would suggest 2^-2 -- presumably kept at 0.125 to match torch.finfo(float8_e5m2fnuz), confirm
+    OPUS_H_D static constexpr float max()  { return __builtin_bit_cast(float, 0x47600000u); }  // 57344.0
+    OPUS_H_D static constexpr float min()  { return __builtin_bit_cast(float, 0xC7600000u); }  // -57344.0 (equals -max())
+    OPUS_H_D static constexpr float tiny() { return __builtin_bit_cast(float, 0x38000000u); }  // 2^-15
+#else
+    OPUS_H_D static constexpr float eps()  { return __builtin_bit_cast(float, 0x3E800000u); }  // 2^-2 = 0.25
+    OPUS_H_D static constexpr float max()  { return __builtin_bit_cast(float, 0x47600000u); }  // 57344.0 (same as the gfx942 branch)
+    OPUS_H_D static constexpr float min()  { return __builtin_bit_cast(float, 0xC7600000u); }  // -57344.0 (equals -max())
+    OPUS_H_D static constexpr float tiny() { return __builtin_bit_cast(float, 0x38800000u); }  // 2^-14
+#endif
+};
+
 template<typename C, typename... S, std::enable_if_t<is_dtype_v<C> && (is_constant_v<S> && ...), bool> = true>
 OPUS_H_D constexpr auto slice(C&& container, S&&.../*ss*/) { return container; }    // TODO: fallback slice a normal value does nothing
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -1039,6 +1095,24 @@ OPUS_DEFINE_DPACKS(uint4_t, unsigned char, 4, false)          // uint4x2
 OPUS_DEFINE_FPACKS(fp4_t,   unsigned char, 4, 2, 1, true)     // fp4x2
 OPUS_DEFINE_FPACKS(e8m0_t,  unsigned char, 8, 8, 0, false)    // fp4x2
 
+// finfo specializations for subbyte/packed types (defined after OPUS_DEFINE_FPACKS)
+// fp4 E2M1: 1 sign, 2 exp, 1 mantissa, bias=1 (no per-architecture variants in this specialization)
+template<> struct finfo<fp4_t> {
+    static constexpr int bits = 4;
+    OPUS_H_D static constexpr float eps()  { return __builtin_bit_cast(float, 0x3F000000u); }  // 2^-1 = 0.5
+    OPUS_H_D static constexpr float max()  { return __builtin_bit_cast(float, 0x40C00000u); }  // 6.0
+    OPUS_H_D static constexpr float min()  { return __builtin_bit_cast(float, 0xC0C00000u); }  // -6.0 (equals -max())
+    OPUS_H_D static constexpr float tiny() { return __builtin_bit_cast(float, 0x3F800000u); }  // 1.0
+};
+// e8m0: 8-bit exponent only, unsigned, bias=127 (no sign bit, so min() is positive and equals tiny())
+template<> struct finfo<e8m0_t> {
+    static constexpr int bits = 8;
+    OPUS_H_D static constexpr float eps()  { return __builtin_bit_cast(float, 0x3F800000u); }  // 1.0
+    OPUS_H_D static constexpr float max()  { return __builtin_bit_cast(float, 0x7F000000u); }  // 2^127
+    OPUS_H_D static constexpr float min()  { return __builtin_bit_cast(float, 0x00400000u); }  // 2^-127 (unsigned, no negative); this is an fp32 subnormal bit pattern
+    OPUS_H_D static constexpr float tiny() { return __builtin_bit_cast(float, 0x00400000u); }  // 2^-127; same fp32 subnormal bit pattern as min()
+};
+
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wuninitialized"
 #pragma clang diagnostic ignored "-Wc++20-extensions"
diff --git a/op_tests/opus/README.md b/op_tests/opus/README.md
@@ -20,9 +20,10 @@ op_tests/opus/
 │   ├── test_dtype_convert.cu    # FP32<->BF16/FP16/FP8/FP4 round-trip kernels
 │   ├── test_load_store_if.cu    # Predicated load/store + free function API tests
 │   ├── test_numeric_limits.cu   # opus::numeric_limits kernel
+│   ├── test_finfo.cu            # opus::finfo kernel
 │   ├── test_mdiv.cu             # opus::magic_div kernel
 │   ├── test_workgroup_barrier.cu# Workgroup barrier kernel
-│   ├── setup.py                 # Parallel hipcc build: 11 .cu -> .o -> .so
+│   ├── setup.py                 # Parallel hipcc build: 12 .cu -> .o -> .so
 │   └── test_opus_device.py      # Python test runner (builds .so, runs all tests)
 ├── run_tests.sh                 # Runs host test + device tests
 └── README.md
@@ -110,18 +111,19 @@ Total wall clock                    ~6.9 s
 ### Per-file device compile times
 
 ```
-test_vector_add.cu         187 ms
-test_async_load.cu         185 ms
-test_numeric_limits.cu     191 ms
-test_workgroup_barrier.cu  216 ms
-test_mdiv.cu               243 ms
-test_mxfp.cu               248 ms
-test_load_store_if.cu      354 ms
-test_dtype_convert.cu      506 ms
-test_mfma_f32.cu         1,445 ms
-test_mfma_f8.cu          1,654 ms
-test_mfma_f16.cu         1,712 ms  <-- critical path
-link                        31 ms
+test_finfo.cu              127 ms
+test_async_load.cu         130 ms
+test_numeric_limits.cu     143 ms
+test_vector_add.cu         147 ms
+test_workgroup_barrier.cu  147 ms
+test_mdiv.cu               167 ms
+test_load_store_if.cu      216 ms
+test_mxfp.cu               224 ms
+test_dtype_convert.cu      292 ms
+test_mfma_f32.cu           769 ms
+test_mfma_f16.cu           863 ms
+test_mfma_f8.cu            884 ms  <-- critical path
+link                        25 ms
 ```
 
 ## How to add a new device test
@@ -232,10 +234,11 @@ In `device/test_opus_device.py`:
 | `test_load_store_if` | free_func_vector_add | Free functions `opus::load`/`opus::store`, `is_gmem_v`/`is_mem_v` type traits | all |
 | `test_load_store_if` | predicated_async_load | `gmem::async_load_if`, free function `opus::async_load_if`, `layout_linear::operator+` | all |
 | `test_numeric_limits` | all types | `opus::numeric_limits<T>` for fp32/fp16/bf16/fp8/bf8/i32/i16/i8/u8 | all |
+| `test_finfo` | all float types | `opus::finfo<T>` (eps/max/min/tiny/bits) for fp32/fp16/bf16/fp8/bf8/fp4/e8m0 | all |
 | `test_mdiv` | 11 divisors | `opus::magic_div` integer division by magic multiply | all |
 | `test_workgroup_barrier` | cumulative + streamk | `opus::workgroup_barrier` cross-workgroup synchronization | all |
 
-Total: **50+ test calls** (14 MFMA + 4 MXFP + 1 vector_add + 1 async_load + 11 dtype_convert + 3 load_store_if + 9 numeric_limits + 11 mdiv + 4 workgroup_barrier).
+Total: **50+ test calls** (14 MFMA + 4 MXFP + 1 vector_add + 1 async_load + 11 dtype_convert + 3 load_store_if + 9 numeric_limits + 7 finfo + 11 mdiv + 4 workgroup_barrier).
 
 ## Notes
 
diff --git a/op_tests/opus/device/setup.py b/op_tests/opus/device/setup.py
@@ -33,6 +33,7 @@
     "test_mdiv.cu",
     "test_numeric_limits.cu",
     "test_workgroup_barrier.cu",
+    "test_finfo.cu",
 ]
 
 
diff --git a/op_tests/opus/device/test_finfo.cu b/op_tests/opus/device/test_finfo.cu
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: MIT
+// Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Device test for opus::finfo.
+// Single-thread kernel writes eps/max/min/tiny as float, and bits as an int whose bit pattern is stored in a float slot.
+
+#ifdef __HIP_DEVICE_COMPILE__
+// ── Device pass ─────────────────────────────────────────────────────────────
+#include "opus/opus.hpp"
+namespace {
+
+// Writes 5 consecutive float slots for type T: out[0..3] = eps, max, min, tiny; out[4] = the int `bits` reinterpreted bit-for-bit as a float (like __int_as_float, not a value conversion)
+template<typename T>
+__device__ void write_finfo(float* out) {
+    out[0] = opus::finfo<T>::eps();
+    out[1] = opus::finfo<T>::max();
+    out[2] = opus::finfo<T>::min();
+    out[3] = opus::finfo<T>::tiny();
+    out[4] = __builtin_bit_cast(float, opus::finfo<T>::bits);  // the reader must bit-cast this slot back to an integer to recover `bits`
+}
+
+__global__ void finfo_kernel(float* out) {  // out must hold at least 35 floats: 7 types x 5 slots each, in the order listed below
+    if (__builtin_amdgcn_workitem_id_x() != 0) return;  // only work-item 0 writes; any other work-item exits immediately
+    write_finfo<opus::fp32_t>(out +  0);
+    write_finfo<opus::fp16_t>(out +  5);
+    write_finfo<opus::bf16_t>(out + 10);
+    write_finfo<opus::fp8_t >(out + 15);
+    write_finfo<opus::bf8_t >(out + 20);
+    write_finfo<opus::fp4_t >(out + 25);
+    write_finfo<opus::e8m0_t>(out + 30);
+}
+} // anonymous namespace
+
+#else
+// ── Host pass ───────────────────────────────────────────────────────────────
+#include "hip_host_minimal.h"
+#include <cstdio>
+
+namespace {
+__global__ void finfo_kernel(float* out) {}  // empty host-pass definition; the real body is in the device pass (presumably here so the <<<>>> launch in run_finfo compiles -- confirm against hip_host_minimal.h)
+}
+
+extern "C" void run_finfo(void* d_out) {  // C-linkage entry point; d_out is handed to the kernel as float* (the device kernel writes 35 floats)
+    finfo_kernel<<<1, 1>>>(static_cast<float*>(d_out));  // one block, one thread
+    hipError_t err = hipDeviceSynchronize();  // wait for the kernel to finish before returning
+    if (err != hipSuccess) {
+        fprintf(stderr, "finfo_kernel failed: %s\n", hipGetErrorString(err));  // failure is only logged to stderr; nothing is returned to the caller
+    }
+}
+#endif
diff --git a/op_tests/opus/device/test_opus_device.py b/op_tests/opus/device/test_opus_device.py
@@ -11,7 +11,7 @@
   - MFMA variants (fp32, fp16, bf16, fp8, bf8)
   - MXFP variants (fp8, fp4) -- gfx950 only
   - vector_add, async_load, dtype_convert, predicated_copy, free_func_add,
-    predicated_async_load, numeric_limits, mdiv, workgroup_barrier
+    predicated_async_load, numeric_limits, finfo, mdiv, workgroup_barrier
 """
 
 import ctypes
@@ -141,6 +141,13 @@ def run_numeric_limits(self, Out):
         fn.argtypes = [_VP]
         fn(self._ptr(Out))
 
+    # -- finfo -- (calls the `run_finfo` symbol of the loaded library on the buffer `Out`)
+    def run_finfo(self, Out):  # test_finfo in this file passes a 35-element float32 tensor on the "cuda" device
+        fn = self._lib.run_finfo
+        fn.restype = None  # no return value is read from the native call
+        fn.argtypes = [_VP]  # single argument; _VP is presumably a void-pointer ctype defined elsewhere in this file
+        fn(self._ptr(Out))
+
     # -- mdiv --
     def run_mdiv(self, Dividends, OutQ, OutR, divisor):
         fn = self._lib.run_mdiv
@@ -1298,6 +1305,103 @@ def ref_int(dtype, size):
     return 0
 
 
+def test_finfo(mod):
+    """Test opus::finfo against torch.finfo reference values (bitwise comparison)."""
+    import struct
+
+    device = torch.device("cuda")
+
+    N_TYPES = 7  # fp32, fp16, bf16, fp8, bf8, fp4, e8m0
+    FIELDS_PER_TYPE = 5  # eps, max, min, tiny, bits
+    N = N_TYPES * FIELDS_PER_TYPE
+    out = torch.zeros(N, device=device, dtype=torch.float32)
+    mod.run_finfo(out)
+    raw = out.cpu()  # host copy; indexed below as raw[offset + field_index]
+
+    fails = 0  # total number of mismatching fields across all types
+    fields = ["eps", "max", "min", "tiny", "bits"]  # order matches the 5 slots written per type
+
+    def float_to_u32(f):  # fp32 bit pattern of f as an unsigned int (f is first rounded to fp32 by struct.pack)
+        return struct.unpack("I", struct.pack("f", float(f)))[0]
+
+    def u32_to_float(u):  # NOTE(review): not referenced anywhere in this function
+        return struct.unpack("f", struct.pack("I", u))[0]
+
+    def ref_from_torch_finfo(dtype):  # reference eps/max/min/tiny from torch.finfo; "bits" is added by the caller
+        fi = torch.finfo(dtype)
+        return {
+            "eps": fi.eps,
+            "max": fi.max,
+            "min": fi.min,
+            "tiny": fi.tiny,
+        }
+
+    fp8_dtype = _get_fp8_dtype()  # presumably the torch fp8 dtype for the current GPU (helper defined elsewhere) -- confirm
+    bf8_dtype = _get_bf8_dtype()  # presumably the torch bf8 dtype for the current GPU (helper defined elsewhere) -- confirm
+
+    # For fp4 and e8m0 there is no torch.finfo, so we provide manual reference.
+    # type_table rows below are (name, offset, torch_dtype_or_None, expected_bits, manual_ref_or_None).
+    fp4_ref = {"eps": 0.5, "max": 6.0, "min": -6.0, "tiny": 1.0, "bits": 4}
+    e8m0_ref = {
+        "eps": 1.0,
+        "max": 2.0**127,
+        "min": 2.0**-127,
+        "tiny": 2.0**-127,
+        "bits": 8,
+    }
+
+    type_table = [
+        ("fp32", 0, torch.float32, 32, None),
+        ("fp16", 5, torch.float16, 16, None),
+        ("bf16", 10, torch.bfloat16, 16, None),
+        ("fp8", 15, fp8_dtype, 8, None),
+        ("bf8", 20, bf8_dtype, 8, None),
+        ("fp4", 25, None, 4, fp4_ref),
+        ("e8m0", 30, None, 8, e8m0_ref),
+    ]
+
+    for name, offset, dtype, expected_bits, manual_ref in type_table:
+        if dtype is not None:
+            ref = ref_from_torch_finfo(dtype)
+            ref["bits"] = expected_bits  # expected bits come from the table, not from torch.finfo
+        else:
+            ref = manual_ref  # manual refs already carry their own "bits" entry
+
+        type_fails = 0
+        for j, field in enumerate(fields):
+            actual_f32 = raw[offset + j].item()
+            if field == "bits":
+                # bits is stored as __int_as_float(bits), extract the int
+                actual_val = struct.unpack("I", struct.pack("f", actual_f32))[0]
+                expected_val = ref["bits"]
+                if actual_val != expected_val:
+                    print(
+                        f"    {name}.{field}: {actual_val} != expected {expected_val}"
+                    )
+                    type_fails += 1
+            else:
+                # compare fp32 bit patterns rather than float values, so the check is exact
+                expected_f32 = float(ref[field])
+                actual_bits = float_to_u32(actual_f32)
+                expected_bits_val = float_to_u32(expected_f32)
+                if actual_bits != expected_bits_val:
+                    print(
+                        f"    {name}.{field}: 0x{actual_bits:08X} ({actual_f32}) "
+                        f"!= expected 0x{expected_bits_val:08X} ({expected_f32})"
+                    )
+                    type_fails += 1
+        if type_fails == 0:
+            print(f"  PASS: finfo<{name}> (all {len(fields)} fields)")
+        else:
+            print(f"  FAIL: finfo<{name}> ({type_fails} field(s) wrong)")
+            fails += type_fails
+
+    if fails:
+        print(f"  finfo: {fails} field(s) FAILED")
+        return 1  # counts as a single failure in the caller's tally, however many fields mismatched
+    print("  PASS: finfo all types correct")
+    return 0
+
+
 def test_wb_cumulative(mod):
     """Test workgroup_barrier wait_lt + inc: N workgroups contribute i+1 sequentially."""
     device = torch.device("cuda")
@@ -1395,6 +1499,7 @@ def main():
     failures += test_free_func_vector_add(mod)
     failures += test_predicated_async_load(mod)
     failures += test_numeric_limits(mod)
+    failures += test_finfo(mod)
     failures += test_mdiv(mod)
     failures += test_wb_cumulative(mod)
     failures += test_wb_streamk_reduce(mod)

Original file line number	Diff line number	Diff line change
`@@ -33,6 +33,7 @@`
`33`	`33`	`"test_mdiv.cu",`
`34`	`34`	`"test_numeric_limits.cu",`
`35`	`35`	`"test_workgroup_barrier.cu",`
	`36`	`+ "test_finfo.cu",`
`36`	`37`	`]`
`37`	`38`
`38`	`39`