Commit a967cfe ("benchmarks")
Parent: 66ae0bc

4 files changed (+141, -71 lines)

hopper/benchmark_attn.py (12 additions, 9 deletions)
@@ -13,7 +13,7 @@
     import cudnn
 except ImportError:
     cudnn = None
-# cudnn = None
+cudnn = None
 
 Timing = NamedTuple('timing', [('mean', float)])
 
@@ -24,8 +24,8 @@
 from flash_attn.utils.benchmark import benchmark_forward, benchmark_backward, benchmark_combined, benchmark_all, benchmark_fwd_bwd, pytorch_profiler
 from flash_attn.flash_attn_interface import flash_attn_func, flash_attn_varlen_func
 from flash_attn_interface import flash_attn_func as flash_attn_func_v3
-# from flash_attn_interface import flash_attn_with_kvcache as flash_attn_func_v3
-from flash_attn_interface import flash_attn_varlen_func as flash_attn_varlen_func_v3
+from flash_attn_interface import flash_attn_with_kvcache as flash_attn_func_v3
+# from flash_attn_interface import flash_attn_varlen_func as flash_attn_varlen_func_v3
 
 from triton.testing import do_bench
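
Note on the swap above: `flash_attn_func_v3` now points at the KV-cache entry point, so the benchmark times the inference path rather than the plain forward. A minimal usage sketch, assuming the hopper interface keeps the flash_attn 2.x-style argument names (`cache_seqlens` and `causal` are assumptions here, not taken from this diff):

```python
# Hedged sketch: single-token decode through flash_attn_with_kvcache.
# Keyword names follow the flash_attn 2.x-style API; details may differ.
import torch
from flash_attn_interface import flash_attn_with_kvcache

batch, seqlen_k, nheads, nkvheads, headdim = 1, 16384, 16, 1, 128
q = torch.randn(batch, 1, nheads, headdim, device='cuda', dtype=torch.bfloat16)
k_cache = torch.randn(batch, seqlen_k, nkvheads, headdim, device='cuda', dtype=torch.bfloat16)
v_cache = torch.randn_like(k_cache)
# Number of valid cache entries per batch element.
cache_seqlens = torch.full((batch,), seqlen_k, dtype=torch.int32, device='cuda')

out = flash_attn_with_kvcache(q, k_cache, v_cache, cache_seqlens=cache_seqlens, causal=True)
print(out.shape)  # (1, 1, 16, 128)
```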

@@ -226,21 +226,22 @@ def run(*args, **kwargs):
 softcap = 0.0
 V_colmajor = False
 deterministic = False
-batch_size = 2
+batch_size = 1
 # seqlen = 2048
-seqlen = 8192
+# seqlen = 8192
+seqlen = 2048 * 8
 # seqlen = 4096
 # seqlen = 2047
-dim = 2048
-# headdim = 128
+dim = 128 * 16
+headdim = 128
 # headdim = 64
-headdim = 256
+# headdim = 256
 # for headdim in [64, 128, 256]:
 # bs_seqlen_vals = [(32, 512), (16, 1024), (8, 2048), (4, 4096), (2, 8192), (1, 16384)]
 # bs_seqlen_vals = [(16, 1024), (8, 2048), (4, 4096), (2, 8192), (1, 16384)]
 # bs_seqlen_vals = [(32, 512), (16, 1024)]
 # bs_seqlen_vals = [(2, 64 * 132)]
-bs_seqlen_vals = [(2, 8192)]
+bs_seqlen_vals = [(1, 8192 * 2)]
 # bs_seqlen_vals = [(1, 16 * 1024)]
 time_f = {}
 time_b = {}
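
As a sanity check on the rewritten constants: `dim = 128 * 16` with `headdim = 128` gives 16 heads, and `seqlen = 2048 * 8` equals the `8192 * 2` in `bs_seqlen_vals`, so the scalar config and the list agree. A quick illustrative check (`nheads = dim // headdim` follows the convention used elsewhere in these benchmarks):

```python
# Illustrative check of the rewritten config (not part of the benchmark).
dim, headdim = 128 * 16, 128
batch_size, seqlen = 1, 2048 * 8
nheads = dim // headdim
assert (batch_size, seqlen) == (1, 8192 * 2)
print(nheads, seqlen)  # 16 heads, 16384 tokens
```
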
@@ -272,8 +273,10 @@ def run(*args, **kwargs):
     window_size = (-1, -1)
     # window_size = (seqlen // 2 - 1, 0)
     pack_gqa = None
+    # pack_gqa = True
     # seqlen_q = 64
     seqlen_q = seqlen
+    # seqlen_q = 1
     leftpad_k = None
     # leftpad_k = torch.full((batch_size,), 0, device=device, dtype=torch.int32)
     q = torch.randn(batch_size, seqlen_q, nheads, headdim, device=device, dtype=dtype_gen, requires_grad=True)

hopper/benchmark_flash_attention_fp8.py (116 additions, 49 deletions)
@@ -29,6 +29,7 @@
     import cudnn
 except ImportError:
     cudnn = None
+cudnn = None
 
 
 def convert_to_cudnn_type(torch_type):
@@ -198,14 +199,22 @@ def attention_pytorch(qkv, dropout_p=0.0, causal=True):
     output = torch.einsum('bhts,bshd->bthd', attention_drop , v)
     return output.to(dtype=qkv.dtype)
 
-def flops(batch, seqlen, headdim, nheads, causal, mode="fwd"):
+def flops(batch, q_seqlen, seqlen, headdim, nheads, causal, mode="fwd"):
     assert mode in ["fwd", "bwd", "fwd_bwd"]
-    f = 4 * batch * seqlen**2 * nheads * headdim // (2 if causal else 1)
+    f = 4 * batch * q_seqlen * seqlen * nheads * headdim // (2 if causal else 1)
     return f if mode == "fwd" else (2.5 * f if mode == "bwd" else 3.5 * f)
 
 def efficiency(flop, time):
     return (flop / time / 10**12) if not math.isnan(time) else 0.0
 
+def data_size(batch, q_seqlen, seqlen, headdim, nheads, nkvheads, nbytes, mode="fwd"):
+    assert mode in ["fwd"]
+    d_size = batch * nbytes * headdim * (q_seqlen * nheads * 2 + seqlen * nkvheads * 2)
+    return d_size
+
+def mem_bw(nbytes, time):
+    return (nbytes / time / 1024 / 1024 / 1024 / 1024) if not math.isnan(time) else 0.0
+
 def time_fwd(func, *args, **kwargs):
     time.sleep(1)  # Sleep to avoid residual power throttling from the previous benchmark
     time_f = benchmark_forward(func, *args, **kwargs)
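
The reworked `flops` counter lets the query length differ from the key/value length (decode runs with `q_seqlen = 1`), and the new `data_size`/`mem_bw` pair reports achieved bandwidth next to TFLOPs. A standalone restatement of the two formulas with the benchmark's default prefill shape; the numbers are purely illustrative:

```python
# Standalone restatement of the counters above (illustrative numbers only).
# Forward attention does two matmuls (QK^T and PV), each costing
# 2 * q_seqlen * seqlen * headdim multiply-adds per head; causal masking
# halves the work. data_size counts reading Q, K, V and writing O once each.

def flops(batch, q_seqlen, seqlen, headdim, nheads, causal):
    return 4 * batch * q_seqlen * seqlen * nheads * headdim // (2 if causal else 1)

def data_size(batch, q_seqlen, seqlen, headdim, nheads, nkvheads, nbytes):
    return batch * nbytes * headdim * (q_seqlen * nheads * 2 + seqlen * nkvheads * 2)

# Default prefill config: batch=1, seqlen=2048*8, 16 query heads, 1 KV head,
# headdim=128, fp8 (1 byte per element).
f = flops(1, 16384, 16384, 128, 16, causal=True)      # ~1.10e12 FLOPs
d = data_size(1, 16384, 16384, 128, 16, 1, nbytes=1)  # ~68 MiB moved
print(f"{f / 1e12:.2f} TFLOP, {d / 2**20:.0f} MiB")
```
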
@@ -216,30 +225,50 @@ def time_fwd(func, *args, **kwargs):
 
 repeats = 30
 device = 'cuda'
-# dtype = torch.float16
+dtype = torch.float16
 dtype = torch.float8_e4m3fn
+is_gqa = True
 
+# For prefill
+q_seqlen_val = None
 # bs_seqlen_vals = [(32, 512), (16, 1024), (8, 2048), (4, 4224), (2, 8448), (1, 8448 * 2)]
-bs_seqlen_vals = [(32, 512), (16, 1024), (8, 2048), (4, 4096), (2, 8192), (1, 8192 * 2)]
+# bs_seqlen_vals = [(32, 512), (16, 1024), (8, 2048), (4, 4096), (2, 8192), (1, 8192 * 2)]
+bs_seqlen_vals = [(1, 2048 * 8)]
+# bs_seqlen_vals = [(32, 8192), (32, 2048)]
+
+# For decode
+# q_seqlen_val = 1
+# q_seqlen_val = 4
+# bs_seqlen_vals = [(1, 128)]
 # bs_seqlen_vals = [(4, 4096), (2, 8192), (1, 8192 * 2)]
 # bs_seqlen_vals = [(32, 512), (16, 1024), (8, 2048)]
-causal_vals = [False, True]
-headdim_vals = [64, 128, 256]
-dim = 2048
+# bs_seqlen_vals = [(32, 8192*4), (32, 8192*2), (32, 8192), (32, 4096), (32, 2048), (32, 1024), (64, 8192*2), (128, 8192), (128, 4096), (128, 2048), (128, 1024)]
+
+
+# causal_vals = [False, True]
+causal_vals = [True]
+# headdim_vals = [64, 128, 256]
+headdim_vals = [128]
+dim = 128 * 16
 # dim = 256
 dropout_p = 0.0
 
-methods = (["Pytorch", "Flash3"]
-           + (["cuDNN"] if cudnn is not None else [])
-           # + (["Triton"] if attention_triton is not None else [])
-           # + (["xformers.c"] if xops is not None else [])
-           # + (["xformers.f"] if xops is not None else [])
-           )
+# scaling_recipe = 1
+scaling_recipe = 0
+
+methods = (["Flash3"])
+# methods = (["Pytorch", "Flash3"]
+#            + (["cuDNN"] if cudnn is not None else [])
+#            # + (["Triton"] if attention_triton is not None else [])
+#            # + (["xformers.c"] if xops is not None else [])
+#            # + (["xformers.f"] if xops is not None else [])
+#            )
 
 time_f = {}
 time_b = {}
 time_f_b = {}
 speed_f = {}
+mem_bw_f = {}
 speed_b = {}
 speed_f_b = {}
 for causal in causal_vals:
@@ -248,55 +277,89 @@ def time_fwd(func, *args, **kwargs):
             torch.cuda.empty_cache()
             config = (causal, headdim, batch_size, seqlen)
             nheads = dim // headdim
-            q, k, v = [torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=torch.bfloat16, requires_grad=False) for _ in range(3)]
+            nkvheads = 1 if is_gqa else nheads
+            if q_seqlen_val is not None:
+                q_seqlen = q_seqlen_val
+                q = torch.rand(batch_size, q_seqlen, nheads, headdim, device=device, dtype=torch.bfloat16, requires_grad=False)
+            else:
+                q = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=torch.bfloat16, requires_grad=False)
+                q_seqlen = seqlen
+            k, v = [torch.randn(batch_size, seqlen, nkvheads, headdim, device=device, dtype=torch.bfloat16, requires_grad=False) for _ in range(2)]
 
-            qkv = torch.stack([q, k, v], dim=2)
-            qkv = qkv.to(torch.bfloat16)
-            f = time_fwd(attention_pytorch, qkv, dropout_p, causal=causal, repeats=repeats, verbose=False)
-            time_f[config, "Pytorch"] = f
-            res_baseline = attention_pytorch(qkv, dropout_p, causal=causal)
-
-            if attention_triton is not None:
-                q_transposed = q.transpose(1, 2).contiguous().to(torch.float8_e4m3fn)
-                k_transposed = k.transpose(1, 2).contiguous().to(torch.float8_e4m3fn)
-                v_transposed = v.transpose(1, 2).contiguous().permute(0, 1, 3, 2).to(torch.float8_e4m3fn)
-                scale = 1 / math.sqrt(headdim)
-                f = time_fwd(
-                    attention_triton, q_transposed, k_transposed, v_transposed,
-                    causal, scale, repeats=5, verbose=False, desc='Triton'
-                )
-                f = time_fwd(
-                    attention_triton, q_transposed, k_transposed, v_transposed,
-                    causal, scale, repeats=repeats, verbose=False, desc='Triton'
-                )
-                time_f[config, "Triton"] = f
-                res = attention_triton(
-                    q_transposed, k_transposed, v_transposed.permute(0, 1, 3, 2),
-                    causal, scale
-                ).half().transpose(1, 2)
-                torch.testing.assert_close(res, res_baseline, atol=0.5, rtol=0.5)
+            # qkv = torch.stack([q, k, v], dim=2)
+            # qkv = qkv.to(torch.bfloat16)
+            # f = time_fwd(attention_pytorch, qkv, dropout_p, causal=causal, repeats=repeats, verbose=False)
+            # time_f[config, "Pytorch"] = f
+            # res_baseline = attention_pytorch(qkv, dropout_p, causal=causal)
+
+            # if attention_triton is not None:
+            #     q_transposed = q.transpose(1, 2).contiguous().to(torch.float8_e4m3fn)
+            #     k_transposed = k.transpose(1, 2).contiguous().to(torch.float8_e4m3fn)
+            #     v_transposed = v.transpose(1, 2).contiguous().permute(0, 1, 3, 2).to(torch.float8_e4m3fn)
+            #     scale = 1 / math.sqrt(headdim)
+            #     f = time_fwd(
+            #         attention_triton, q_transposed, k_transposed, v_transposed,
+            #         causal, scale, repeats=5, verbose=False, desc='Triton'
+            #     )
+            #     f = time_fwd(
+            #         attention_triton, q_transposed, k_transposed, v_transposed,
+            #         causal, scale, repeats=repeats, verbose=False, desc='Triton'
+            #     )
+            #     time_f[config, "Triton"] = f
+            #     res = attention_triton(
+            #         q_transposed, k_transposed, v_transposed.permute(0, 1, 3, 2),
+            #         causal, scale
+            #     ).half().transpose(1, 2)
+            #     torch.testing.assert_close(res, res_baseline, atol=0.5, rtol=0.5)
 
             # out = torch.empty_like(q)
             q, k, v = q.to(dtype), k.to(dtype), v.to(dtype)
             softmax_scale = q.shape[-1] ** (-0.5)
-            descale_q = torch.tensor([1.0], dtype=torch.float32, device='cuda')
-            descale_k = torch.tensor([1.0], dtype=torch.float32, device='cuda')
-            descale_v = torch.tensor([1.0], dtype=torch.float32, device='cuda')
+            if scaling_recipe == 0:
+                q_descale = torch.tensor([[1.0] * nkvheads] * batch_size, dtype=torch.float32, device='cuda')
+                k_descale = torch.tensor([[1.0] * nkvheads] * batch_size, dtype=torch.float32, device='cuda')
+                v_descale = torch.tensor([[1.0] * nkvheads] * batch_size, dtype=torch.float32, device='cuda')
+            elif scaling_recipe == 1:
+                q_descale = torch.tensor([[1.0] * int(q_seqlen * batch_size)] * nheads, dtype=torch.float32, device='cuda').T
+                k_descale = torch.tensor([[1.0] * int((seqlen + 223) / 224) * batch_size] * nkvheads, dtype=torch.float32, device='cuda').T
+                v_descale = torch.tensor([[1.0] * int((seqlen + 223) / 224) * batch_size] * nkvheads, dtype=torch.float32, device='cuda').T
+            else:
+                raise ValueError(f"Unsupported scaling recipe: {scaling_recipe}")
+
+            # print(f"{q_descale.shape=}, {q_descale.stride()=}, {k_descale.shape=}, {k_descale.stride()=}", flush=True)
 
             # f = time_fwd(flash_attn_func, q, k, v, causal=causal, repeats=repeats, verbose=False)
             f = time_fwd(
                 _flash_attn_forward,
                 q,
                 k,
                 v,
-                softmax_scale,
+                None, # k_new,
+                None, # v_new,
+                None, # qv,
+                None, # out,
+                None, # cu_seqlens_q,
+                None, # cu_seqlens_k,
+                None, # cu_seqlens_k_new,
+                None, # seqused_q,
+                None, # seqused_k,
+                None, # max_seqlen_q,
+                None, # max_seqlen_k,
+                None, # page_table,
+                None, # kv_batch_idx,
+                None, # leftpad_k,
+                None, # rotary_cos,
+                None, # rotary_sin,
+                None, # seqlens_rotary,
+                q_descale=q_descale,
+                k_descale=k_descale,
+                v_descale=v_descale,
+                softmax_scale=softmax_scale,
                 causal=causal,
                 window_size=(-1,-1),
-                descale_q=descale_q,
-                descale_k=descale_k,
-                descale_v=descale_v,
                 repeats=repeats,
-                verbose=False
+                verbose=False,
+                scaling_recipe=scaling_recipe,
             )
 
             # res = flash_attn_func(q, k, v, causal=causal)
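
The two `scaling_recipe` branches above encode different FP8 descale granularities: recipe 0 keeps one descale value per (batch, KV head), while recipe 1 keeps one per query row for Q and one per 224-row block of K/V, transposed so the sequence dimension leads. A shape-only sketch (dummy values; the block size comes from the `(seqlen + 223) / 224` rounding in the diff):

```python
# Shape-only sketch of the two descale layouts (dummy values).
import torch

batch_size, nheads, nkvheads = 1, 16, 1
q_seqlen = seqlen = 2048 * 8

# Recipe 0: one descale per (batch, kv-head).
q_descale_r0 = torch.ones(batch_size, nkvheads, dtype=torch.float32, device='cuda')
print(q_descale_r0.shape)  # (1, 1)

# Recipe 1: per-row descale for Q, per-224-row-block descale for K/V,
# built head-major and transposed so sequence * batch is the leading dim.
n_blocks = (seqlen + 223) // 224
q_descale_r1 = torch.ones(nheads, batch_size * q_seqlen, dtype=torch.float32, device='cuda').T
k_descale_r1 = torch.ones(nkvheads, batch_size * n_blocks, dtype=torch.float32, device='cuda').T
print(q_descale_r1.shape, k_descale_r1.shape)  # (16384, 16), (74, 1)
```
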
@@ -340,12 +403,16 @@ def time_fwd(func, *args, **kwargs):
             print(f"### causal={causal}, headdim={headdim}, batch_size={batch_size}, seqlen={seqlen} ###")
             for method in methods:
                 speed_f[config, method] = efficiency(
-                    flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd"),
+                    flops(batch_size, q_seqlen, seqlen, headdim, nheads, causal, mode="fwd"),
+                    time_f[config, method]
+                )
+                mem_bw_f[config, method] = mem_bw(
+                    data_size(batch_size, q_seqlen, seqlen, headdim, nheads, nkvheads, 1 if dtype == torch.float8_e4m3fn else 2, mode="fwd"),
                     time_f[config, method]
                 )
                 #print (time_f[config,method])
                 print(
-                    f"{method} fwd: {speed_f[config, method]:.2f} TFLOPs/s, {time_f[config, method] * 1e3} ms, "
+                    f"{method} fwd: {speed_f[config, method]:.2f} TFLOPs/s, {mem_bw_f[config, method]:.2f} TB/s, {time_f[config, method] * 1e3} ms, "
                 )
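
The reason for the new TB/s column: dividing the two counters gives arithmetic intensity in FLOPs per byte, which separates compute-bound prefill from bandwidth-bound decode. A rough sketch reusing the formulas above (illustrative numbers; decode is computed with `causal=False`, since a single query row attends to the whole cache):

```python
# Rough arithmetic-intensity estimate from the counters above (illustrative).
def intensity(batch, q_seqlen, seqlen, headdim, nheads, nkvheads, nbytes, causal):
    f = 4 * batch * q_seqlen * seqlen * nheads * headdim // (2 if causal else 1)
    d = batch * nbytes * headdim * (q_seqlen * nheads * 2 + seqlen * nkvheads * 2)
    return f / d

# fp8 prefill (q_seqlen == seqlen): ~15400 FLOPs/byte -> compute-bound.
print(intensity(1, 16384, 16384, 128, 16, 1, 1, causal=True))
# fp8 decode (q_seqlen == 1): ~32 FLOPs/byte -> memory-bound, so the
# TB/s column is the number that matters there.
print(intensity(1, 1, 16384, 128, 16, 1, 1, causal=False))
```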

hopper/mainloop_fwd_sm90_tma_gmma_ws.hpp (1 addition, 1 deletion)
@@ -883,7 +883,7 @@ struct CollectiveMainloopFwdSm90 {
                 // TODO: uncomment this for cp.async.
                 // copy(scale_copy_v_per_block, tVgV_per_block_scale(_, n_block), tVsV_per_block_scale(_, smem_pipe_write.index()));
                 // TODO: comment out this line to use cp.async.
-                tVsV_per_block_scale(_0{}, smem_pipe_write.index()) = tVgV_per_block_scale(_0{}, n_block);
+                copy(tVgV_per_block_scale(_, n_block), tVsV_per_block_scale(_, smem_pipe_write.index()));
             }
         }
         transpose_V(smem_pipe_write.index());
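
For readers not fluent in CuTe: the removed line assigned only element `_0{}` of the per-block V-scale tile into shared memory, whereas `copy(...)` moves the entire slice. A Python analogy of the behavioral difference (hedged; the real code operates on CuTe tensor views, not torch tensors):

```python
# Python analogy of the fix; the real code copies a CuTe tensor slice.
import torch

gmem_scales = torch.arange(1.0, 5.0)  # per-block V descale values in "gmem"
smem_scales = torch.zeros(4)          # staging buffer in "smem"

smem_scales[0] = gmem_scales[0]       # old line: only the first scale lands
print(smem_scales)                    # tensor([1., 0., 0., 0.])

smem_scales.copy_(gmem_scales)        # new line: every scale is copied
print(smem_scales)                    # tensor([1., 2., 3., 4.])
```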

hopper/test_flash_attn.py (12 additions, 12 deletions)
@@ -49,22 +49,22 @@
 
 
 # @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float8_e4m3fn])
-# @pytest.mark.parametrize("dtype", [torch.bfloat16] + ([torch.float16] if not DISABLE_FP16 else []) + ([torch.float8_e4m3fn] if not DISABLE_FP8 else []))
-@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn] if not DISABLE_FP8 else [])
+@pytest.mark.parametrize("dtype", [torch.bfloat16] + ([torch.float16] if not DISABLE_FP16 else []) + ([torch.float8_e4m3fn] if not DISABLE_FP8 else []))
+# @pytest.mark.parametrize("dtype", [torch.float8_e4m3fn] if not DISABLE_FP8 else [])
 # @pytest.mark.parametrize("dtype", [torch.bfloat16])
 # @pytest.mark.parametrize("dtype", [torch.float8_e4m3fn])
-# @pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"])
-@pytest.mark.parametrize("mha_type", ["mha"])
+@pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"])
+# @pytest.mark.parametrize("mha_type", ["mha"])
 # @pytest.mark.parametrize("has_qv", [False, True])
 @pytest.mark.parametrize("has_qv", [False])
-# @pytest.mark.parametrize("deterministic", [False, True])
-@pytest.mark.parametrize("deterministic", [False])
-# @pytest.mark.parametrize("softcap", [0.0] + ([15.0] if not DISABLE_SOFTCAP else []))
-@pytest.mark.parametrize("softcap", [0.0])
-# @pytest.mark.parametrize("local", [False] + ([True] if not DISABLE_LOCAL else []))
-@pytest.mark.parametrize("local", [False])
-# @pytest.mark.parametrize("causal", [False, True])
-@pytest.mark.parametrize("causal", [False])
+@pytest.mark.parametrize("deterministic", [False, True])
+# @pytest.mark.parametrize("deterministic", [False])
+@pytest.mark.parametrize("softcap", [0.0] + ([15.0] if not DISABLE_SOFTCAP else []))
+# @pytest.mark.parametrize("softcap", [0.0])
+@pytest.mark.parametrize("local", [False] + ([True] if not DISABLE_LOCAL else []))
+# @pytest.mark.parametrize("local", [False])
+@pytest.mark.parametrize("causal", [False, True])
+# @pytest.mark.parametrize("causal", [False])
 # @pytest.mark.parametrize("causal", [True])
 # @pytest.mark.parametrize("V_colmajor", [False, True])
 @pytest.mark.parametrize("V_colmajor", [False])
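
This hunk restores the full parameter grid that had been narrowed to an FP8, MHA-only configuration during development. Even counting only the axes visible here, the combinations multiply quickly (a rough count that assumes no `DISABLE_*` switch is set and ignores axes defined further down the file):

```python
# Rough count of combinations re-enabled by this hunk (illustrative only).
dtypes = 3         # bfloat16, float16, float8_e4m3fn
mha_types = 3      # "mha", "mqa", "gqa"
deterministic = 2  # False, True
softcap = 2        # 0.0, 15.0
local = 2          # False, True
causal = 2         # False, True
print(dtypes * mha_types * deterministic * softcap * local * causal)  # 144
```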
