@@ -376,9 +376,9 @@ def fn(match):
     return fn
 
 
-def _is_valid_qconv_post_op_fusion_pattern(has_binary_post_op=False):
+def _is_valid_qconv_post_op_fusion_pattern(has_binary_post_op=False, is_fp8=False):
     return (
-        _is_valid_qconv_binary_optimization_pattern()
+        _is_valid_qconv_binary_optimization_pattern(is_fp8=is_fp8)
         if has_binary_post_op
         else _is_valid_quantized_conv_optimization_pattern()
     )
@@ -408,9 +408,11 @@ def _is_valid_qlinear_post_op_fusion_pattern(has_binary_post_op=False):
     )
 
 
-def _is_valid_qconv_binary_optimization_pattern():
+def _is_valid_qconv_binary_optimization_pattern(is_fp8=False):
     return _is_valid_quantized_op_binary_optimization_pattern(
-        torch.ops.onednn.qconv_pointwise
+        torch.ops.onednn.qconv_pointwise,
+        # we don't insert q-dq for extra input in fp8 recipe
+        extra_input_from_dequant=not is_fp8,
     )
 
 
@@ -2016,12 +2018,13 @@ def _register_qconv_post_op_fusion_pass(
     pass_number,
     computation_op,
     post_op_attr,
+    is_fp8=False,
 ):
     has_binary_post_op = post_op_attr.binary_op_name != "none"
 
     @register_freezing_graph_pattern(
         pattern,
-        extra_check=_is_valid_qconv_post_op_fusion_pattern(has_binary_post_op),
+        extra_check=_is_valid_qconv_post_op_fusion_pattern(has_binary_post_op, is_fp8=is_fp8),
         pass_number=pass_number,
     )
     def qconv(match: Match, *args, **kwargs):
@@ -2097,7 +2100,7 @@ def qconv(match: Match, *args, **kwargs):
         else:
             accum = (
                 kwargs["accum"]
-                if output_dtype in [torch.uint8, torch.int8]
+                if output_dtype in [torch.uint8, torch.int8] or is_fp8
                 else kwargs["accum_after_dequant"]
             )
             accum_scale = (
@@ -2237,6 +2240,7 @@ def _register_qconv_unary_fusion():
                 3,  # pass_number
                 computation_op,  # computation_op
                 unary_attr,  # unary_attr
+                is_fp8=is_fp8,
             )
 
         # Priority 2 to match: QConv2d Unary pattern with fp32/bfloat16 output
@@ -2289,41 +2293,49 @@ def _register_qconv_unary_fusion():
                 4,  # pass_number
                 computation_op,  # computation_op
                 unary_attr,  # unary_attr
+                is_fp8=is_fp8,
             )
 
 
 def _register_qconv_binary_fusion():
-    for int8_mixed_bf16_with_inplace_add in [False, True]:
+    for int8_mixed_bf16_with_inplace_add, x_scale_zp_are_tensors in itertools.product([False, True], [False, True]):
+        qconv_binary_op = (
+            torch.ops.onednn.qconv2d_pointwise.binary_tensor
+            if x_scale_zp_are_tensors
+            else torch.ops.onednn.qconv2d_pointwise.binary
+        )
         # Priority 1 to match: QConv2d Binary or Binary-Unary pattern with int8 output
         swap_binary_inputs_list = [False, True]
         binary_replace_patterns = {}
-        for swap_inputs in swap_binary_inputs_list:
+        for swap_inputs, is_fp8 in itertools.product(swap_binary_inputs_list, [False, True]):
             binary_replace_patterns.update(
                 {
                     PostOpAttr(
                         "sum", 1.0, "none", [], ""
                     ): generate_pattern_with_output_quant(
                         generate_pattern_with_binary(
                             aten.add.Tensor,
-                            get_qconv_pt2e_pattern(users=1),
+                            get_qconv_pt2e_pattern(x_scale_zp_are_tensors, 1),
                             dequantize_accum_pattern,
                             int8_mixed_bf16_with_inplace_add,
                             swap_inputs=swap_inputs,
                         ),
+                        is_fp8=is_fp8,
                     ),
                     PostOpAttr(
                         "sum", 1.0, "relu", [], ""
                     ): generate_pattern_with_output_quant(
                         generate_pattern_with_unary(
                             generate_pattern_with_binary(
                                 aten.add.Tensor,
-                                get_qconv_pt2e_pattern(users=1),
+                                get_qconv_pt2e_pattern(x_scale_zp_are_tensors, 1),
                                 dequantize_accum_pattern,
                                 int8_mixed_bf16_with_inplace_add,
                                 swap_inputs=swap_inputs,
                             ),
                             aten.relu.default,
                         ),
+                        is_fp8=is_fp8,
                     ),
                 }
             )
@@ -2332,8 +2344,9 @@ def _register_qconv_binary_fusion():
             _register_qconv_post_op_fusion_pass(
                 patterns,
                 3,  # pass_number
-                torch.ops.onednn.qconv2d_pointwise.binary,  # computation_op
+                qconv_binary_op,  # computation_op
                 binary_unary_attr,  # binary_unary_attr
+                is_fp8=is_fp8,
             )
 
         # Priority 2 to match: QConv2d Binary-Unary pattern with fp32/bfloat16 output
@@ -2344,8 +2357,8 @@ def _register_qconv_binary_fusion():
                     PostOpAttr("sum", 1.0, "relu", [], ""): generate_pattern_with_unary(
                         generate_pattern_with_binary(
                             aten.add.Tensor,
-                            get_qconv_pt2e_pattern(users=1),
-                            KeywordArg("accum_after_dequant"),
+                            get_qconv_pt2e_pattern(x_scale_zp_are_tensors, 1),
+                            KeywordArg("accum") if is_fp8 else KeywordArg("accum_after_dequant"),
                             int8_mixed_bf16_with_inplace_add,
                             swap_inputs=swap_inputs,
                         ),
@@ -2362,15 +2375,17 @@ def _register_qconv_binary_fusion():
                 _register_qconv_post_op_fusion_pass(
                     patterns,
                     3,  # pass_number
-                    torch.ops.onednn.qconv2d_pointwise.binary,  # computation_op
+                    qconv_binary_op,  # computation_op
                     binary_unary_attr,  # binary_unary_attr
+                    is_fp8=is_fp8,
                 )
             else:
                 _register_qconv_post_op_fusion_pass(
                     patterns,
                     4,  # pass_number
-                    torch.ops.onednn.qconv2d_pointwise.binary,  # computation_op
+                    qconv_binary_op,  # computation_op
                     binary_unary_attr,  # binary_unary_attr
+                    is_fp8=is_fp8,
                 )
 
         # Priority 3: QConv2d Binary pattern with fp32/bfloat16 output
@@ -2382,8 +2397,8 @@ def _register_qconv_binary_fusion():
23822397 "sum" , 1.0 , "none" , [], ""
23832398 ): generate_pattern_with_binary (
23842399 aten .add .Tensor ,
2385- get_qconv_pt2e_pattern (users = 1 ),
2386- KeywordArg ("accum_after_dequant" ),
2400+ get_qconv_pt2e_pattern (x_scale_zp_are_tensors , 1 ),
2401+ KeywordArg ("accum" ) if is_fp8 else KeywordArg ( " accum_after_dequant" ),
23872402 int8_mixed_bf16_with_inplace_add ,
23882403 swap_inputs = swap_inputs ,
23892404 ),
@@ -2397,8 +2412,9 @@ def _register_qconv_binary_fusion():
             _register_qconv_post_op_fusion_pass(
                 patterns,
                 4 if int8_mixed_bf16_with_inplace_add else 5,  # pass_number
-                torch.ops.onednn.qconv2d_pointwise.binary,  # computation_op
+                qconv_binary_op,  # computation_op
                 binary_unary_attr,  # binary_unary_attr
+                is_fp8=is_fp8,
             )
 
 
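Note on the accum change in qconv() above: with the fp8 recipe no dequantize node is inserted on the extra sum input, so the pattern binds the raw "accum" keyword even though the output dtype is not uint8/int8. A minimal standalone sketch of that selection logic follows; pick_accum_kwarg is a hypothetical helper used only for illustration and is not part of this commit.

    # Illustrative sketch only; mirrors the condition changed in qconv() above.
    import torch

    def pick_accum_kwarg(output_dtype: torch.dtype, is_fp8: bool) -> str:
        # int8/uint8 outputs (and the fp8 recipe, which has no dequant on the
        # extra input) bind the raw "accum" input; other output dtypes matched
        # the dequantized extra input instead.
        if output_dtype in [torch.uint8, torch.int8] or is_fp8:
            return "accum"
        return "accum_after_dequant"

    print(pick_accum_kwarg(torch.float8_e4m3fn, is_fp8=True))   # accum
    print(pick_accum_kwarg(torch.bfloat16, is_fp8=False))       # accum_after_dequant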