Commit 5bb0389

Update on "[ET-VK] Introduce graph runtime shader library that enables dynamic shapes"

## Context

pytorch/pytorch#121598 introduces the ability to support dynamic shapes through tensor metadata updates. The idea is fairly simple. Instead of shaders accepting a single UBO with size data for all arguments:

```
layout(set = 0, binding = 2) uniform PRECISION restrict Block {
  ivec4 output_sizes;
  ivec4 other_sizes;
  float alpha;
}
```

shaders will accept separate UBOs for each piece of tensor metadata:

```
layout(set = 0, binding = 3) uniform PRECISION restrict OutSizes {
  ivec4 data;
}
out_sizes;

layout(set = 0, binding = 4) uniform PRECISION restrict InSizes {
  ivec4 data;
}
in_sizes;

layout(set = 0, binding = 5) uniform PRECISION restrict OtherSizes {
  ivec4 data;
}
other_sizes;

layout(set = 0, binding = 6) uniform PRECISION restrict Alpha {
  float data;
}
alpha;
```

Each UBO will be owned and maintained by the corresponding `vTensor` instance. To support resizing a graph input, every tensor in the graph only needs to update its metadata UBOs via the `tensor.virtual_resize(new_sizes)` call. Shader dispatches in subsequent command buffer submissions will then see the updated metadata and execute as if the tensors had the new sizes.

This changeset introduces a new shader library for the Vulkan graph runtime that enables dynamic shapes through this technique, rather than relying on the shader library from PyTorch Vulkan.

## Considerations

Technically, the UBO update technique could be applied to the shaders from PyTorch Vulkan as well. If so, why introduce a new shader library for the graph runtime?

The primary motivation is code quality. First, having `vTensor` supply UBOs for its own metadata greatly reduces the need for operator-specific ad-hoc `Params` structs that organize arguments to be written into an `api::UniformParamsBuffer`. Constructing an `ExecuteNode` for binary operators is now

```
graph.execute_nodes().emplace_back(new ExecuteNode(
    graph,
    api::shader_registry().get_shader_info(kernel_name.str()),
    global_size,
    local_size,
    {{out, api::MemoryAccessType::WRITE},
     {{arg1, arg2}, api::MemoryAccessType::READ}},
    {t_out.gpu_sizes_ubo(),
     t_in1.gpu_sizes_ubo(),
     t_in2.gpu_sizes_ubo(),
     graph.create_params_buffer(alpha_val)}))
```

instead of

```
ArithmeticParams block{
    get_size_as_ivec4(t_out),
    get_size_as_ivec4(t_in1),
    get_size_as_ivec4(t_in2),
    alpha_val,
};
api::UniformParamsBuffer params(graph.context(), block);

graph.execute_nodes().emplace_back(new ExecuteNode(
    graph,
    shader,
    global_size,
    local_size,
    {{out, api::MemoryAccessType::WRITE},
     {{arg1, arg2}, api::MemoryAccessType::READ}},
    std::move(params)));
```

Another consideration is that pytorch/pytorch#115948, which landed fairly recently, enables much more expressive shader templates through the use of Python code blocks in GLSL templates. This makes it easy to express shader variants for different data types, packing structures, and so on. Introducing a new shader library provides the opportunity to rewrite the shaders from PyTorch Vulkan in a more generic and extensible way.

Differential Revision: [D54754545](https://our.internmc.facebook.com/intern/diff/D54754545/)

[ghstack-poisoned]
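To make the resize flow described in the commit message concrete, here is a minimal host-side sketch. Only `virtual_resize()` is described by this change; the `compute_out_sizes` helper, the `submit_and_wait` call, and the concrete sizes are hypothetical placeholders used purely for illustration.

```
// Illustrative sketch only, not part of this changeset. Apart from
// virtual_resize(), the helper names below are assumptions.

// New sizes arrive for a graph input between two inferences.
std::vector<int64_t> new_in_sizes = {1, 64, 256};

// Each vTensor rewrites the contents of the sizes UBO it owns; no pipelines,
// descriptor sets, or command buffers need to be rebuilt for this.
t_in.virtual_resize(new_in_sizes);
t_out.virtual_resize(compute_out_sizes(new_in_sizes)); // hypothetical helper

// Re-submitting the previously recorded command buffer dispatches the same
// shaders; they read the refreshed out_sizes.data / in_sizes.data values and
// execute as if the tensors had always had the new sizes.
submit_and_wait(graph); // hypothetical submission call
```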
2 parents: 9be2c32 + 1a156e5

13 files changed, +42 -869 lines changed

backends/vulkan/TARGETS

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ load(":targets.bzl", "define_common_targets")
 
 oncall("executorch")
 
-define_common_targets()
+define_common_targets(is_fbcode = True)
 
 runtime.python_library(
     name = "vulkan_preprocess",

backends/vulkan/targets.bzl

Lines changed: 17 additions & 15 deletions
@@ -1,23 +1,22 @@
-load("@fbsource//tools/build_defs:fbsource_utils.bzl", "is_fbcode")
-load("@fbsource//tools/build_defs:glob_defs.bzl", "subdir_glob")
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 
-def get_glsl_image_format():
-    if native.read_config("pt", "vulkan_full_precision", "0") == "0":
-        return "rgba16f"
-    return "rgba32f"
-
-def vulkan_spv_shader_lib(name, spv_filegroup):
+def vulkan_spv_shader_lib(name, spv_filegroups, is_fbcode = False):
     gen_aten_vulkan_spv_target = "//caffe2/tools:gen_aten_vulkan_spv_bin"
     glslc_path = "//caffe2/fb/vulkan/dotslash:glslc"
-    if is_fbcode():
+    if is_fbcode:
         gen_aten_vulkan_spv_target = "//caffe2:gen_vulkan_spv_bin"
         glslc_path = "//caffe2/fb/vulkan/tools:glslc"
 
+    glsl_paths = []
+
+    # TODO(ssjia): remove the need for subpath once subdir_glob is enabled in OSS
+    for target, subpath in spv_filegroups.items():
+        glsl_paths.append("$(location {})/{}".format(target, subpath))
+
     genrule_cmd = [
         "$(exe {})".format(gen_aten_vulkan_spv_target),
-        "--glsl-paths $(location {})".format(spv_filegroup),
-        "--output-path $OUT --env FLOAT_IMAGE_FORMAT={}".format(get_glsl_image_format()),
+        "--glsl-paths {}".format(" ".join(glsl_paths)),
+        "--output-path $OUT",
         "--glslc-path=$(exe {})".format(glslc_path),
         "--tmp-dir-path=$OUT",
     ]
@@ -49,7 +48,7 @@ def vulkan_spv_shader_lib(name, spv_filegroup):
         ],
     )
 
-def define_common_targets():
+def define_common_targets(is_fbcode = False):
     runtime.genrule(
         name = "gen_vk_delegate_schema",
         srcs = [
@@ -89,14 +88,17 @@ def define_common_targets():
 
     runtime.filegroup(
         name = "vulkan_graph_runtime_shaders",
-        srcs = subdir_glob([
-            ("runtime/graph/ops/glsl", "*"),
+        srcs = native.glob([
+            "runtime/graph/ops/glsl/*",
         ]),
     )
 
     vulkan_spv_shader_lib(
         name = "vulkan_graph_runtime_shaderlib",
-        spv_filegroup = ":vulkan_graph_runtime_shaders",
+        spv_filegroups = {
+            ":vulkan_graph_runtime_shaders": "runtime/graph/ops/glsl",
+        },
+        is_fbcode = is_fbcode,
     )
 
     runtime.cxx_library(

examples/models/llama2/quantize.py

Lines changed: 12 additions & 6 deletions
@@ -916,8 +916,9 @@ def linear_forward_8da4w(
     x, weight_int8, scales, zeros, out_features, group_size, precision
 ):
     x = per_token_dynamic_quant(x)
-    origin_x_size = x.size()
-    x = x.reshape(-1, origin_x_size[-1])
+    # TODO: verify and remove following reshape code
+    # origin_x_size = x.size()
+    # x = x.reshape(-1, origin_x_size[-1])
 
     # TODO: better API
     # weight_int8 = torch.ops.quantized_decomposed.unpack_int4_to_int8(weight_int4packed)
@@ -939,8 +940,8 @@ def linear_forward_8da4w(
     # w_dq = w_dq.to(torch.float16)
     c = torch.nn.functional.linear(x, w_dq)
 
-    new_shape = origin_x_size[:-1] + (out_features,)
-    c = c.reshape(new_shape)
+    # new_shape = origin_x_size[:-1] + (out_features,)
+    # c = c.reshape(new_shape)
 
     return c
 
@@ -1144,7 +1145,8 @@ def __init__(
 
     def forward(self, input: torch.Tensor) -> torch.Tensor:
         input = input.to(self.precision)
-        input = F.pad(input, pad=(0, self.in_features - self.origin_in_features))
+        # padding is removed for perf
+        # input = F.pad(input, pad=(0, self.in_features - self.origin_in_features))
         return linear_forward_8da4w(
             input,
             self.weight,
@@ -1387,6 +1389,10 @@ def make_names_and_values_dict_func(q, qparams):
 
     def convert_for_runtime(self, model):
         replace_linear_8da4w(
-            model, self.groupsize, self.inner_k_tiles, self.padding_allowed
+            model,
+            self.groupsize,
+            self.padding_allowed,
+            torch.int8,
+            self.precision,
         )
         return model

examples/portable/scripts/export.py

Lines changed: 4 additions & 1 deletion
@@ -9,6 +9,8 @@
 import argparse
 import logging
 
+import torch
+
 from executorch.exir.capture import EdgeCompileConfig, ExecutorchBackendConfig
 
 from ...models import MODEL_NAME_TO_MODEL
@@ -75,4 +77,5 @@ def main() -> None:
 
 
 if __name__ == "__main__":
-    main()  # pragma: no cover
+    with torch.no_grad():
+        main()  # pragma: no cover

examples/sdk/scripts/export_bundled_program.py

Lines changed: 3 additions & 11 deletions
@@ -8,15 +8,11 @@
 
 import argparse
 
-from typing import List, Union
+from typing import List
 
 import torch
 
-from executorch.exir import (
-    ExecutorchProgram,
-    ExecutorchProgramManager,
-    MultiMethodExecutorchProgram,
-)
+from executorch.exir import ExecutorchProgramManager
 from executorch.sdk import BundledProgram
 from executorch.sdk.bundled_program.config import (
     MethodInputType,
@@ -33,11 +29,7 @@
 
 
 def save_bundled_program(
-    executorch_program: Union[
-        ExecutorchProgram,
-        MultiMethodExecutorchProgram,
-        ExecutorchProgramManager,
-    ],
+    executorch_program: ExecutorchProgramManager,
     method_test_suites: List[MethodTestSuite],
     output_path: str,
 ):

exir/__init__.py

Lines changed: 0 additions & 8 deletions
@@ -10,7 +10,6 @@
     _capture_legacy_do_not_use,
     CallSpec,
     capture,
-    capture_multiple,
     CaptureConfig,
     EdgeCompileConfig,
     ExecutorchBackendConfig,
@@ -23,9 +22,6 @@
     ExecutorchProgram,
     ExecutorchProgramManager,
     ExirExportedProgram,
-    multi_method_program_to_executorch,
-    MultiMethodExecutorchProgram,
-    MultiMethodExirExportedProgram,
     to_edge,
 )
 from executorch.exir.tracer import ExirDynamoConfig
@@ -37,7 +33,6 @@
     "emit_program",
     "EmitterOutput",
     "capture",
-    "capture_multiple",
     "_capture_legacy_do_not_use",
     "CallSpec",
     "ExportedProgram",
@@ -49,12 +44,9 @@
     "EdgeProgramManager",
     "ExecutorchProgramManager",
    "edge_to_executorch_passes",
-    "MultiMethodExirExportedProgram",
-    "MultiMethodExecutorchProgram",
     "CaptureConfig",
     "EdgeCompileConfig",
     "ExecutorchBackendConfig",
     "Value",
-    "multi_method_program_to_executorch",
     "ExirDynamoConfig",
 ]

exir/capture/__init__.py

Lines changed: 0 additions & 2 deletions
@@ -10,7 +10,6 @@
     _capture_legacy_do_not_use,
     CallSpec,
     capture,
-    capture_multiple,
 )
 
 from executorch.exir.capture._config import (
@@ -23,7 +22,6 @@
     "CallSpec",
     "capture",
     "_capture_legacy_do_not_use",
-    "capture_multiple",
     "CaptureConfig",
     "EdgeCompileConfig",
     "ExecutorchBackendConfig",

exir/capture/_capture.py

Lines changed: 2 additions & 133 deletions
@@ -9,12 +9,12 @@
 from collections import namedtuple
 from contextlib import contextmanager
 from types import MethodType
-from typing import Any, Callable, cast, Dict, List, Optional, Set, Tuple, Union
+from typing import Any, Callable, cast, List, Optional, Tuple
 
 import torch
 from executorch.exir.capture._config import CaptureConfig
 from executorch.exir.error import ExportError, ExportErrorType, InternalError
-from executorch.exir.program import ExirExportedProgram, MultiMethodExirExportedProgram
+from executorch.exir.program import ExirExportedProgram
 from executorch.exir.program._program import _transform, HackedUpExportedProgramDONOTUSE
 from executorch.exir.tracer import (
     _default_decomposition_table,
@@ -360,137 +360,6 @@ def convert_to_fake(x):
     return ExirExportedProgram(ep, False)
 
 
-@compatibility(is_backward_compatible=False)
-def capture_multiple(
-    m: Union[torch.nn.Module, Callable[..., Any]],
-    args: Union[Dict[str, Tuple[Value, ...]], Tuple[Value, ...]],
-    config: Optional[CaptureConfig] = None,
-    prim_getters: Optional[Set[str]] = None,
-    dynamic_shapes: Optional[Union[Dict[str, Any], List[Any]]] = None,
-):
-    """
-    capture_multiple traces either an nn.Module or just a callable with PyTorch
-    operations inside and produce a single MultiMethodExirExportedProgram that
-    can potentially have multiple entry points. When multiple entry points
-    are traced, each of them is stored separately in the resulting
-    MultiMethodExirExportedProgram without sharing state.
-
-    Args:
-        m: the `nn.Module` or callable to trace.
-
-        args: Tracing example inputs.
-
-        When `m` is an nn.Module, `args` can be
-        1) A dictionary that maps names of method to their tracing example inputs.
-        in this case, all specified methods will be captured.
-        2) A tuple. In this case, `forward` method of `m` will be captured. It is
-        equivalent to passing {"forward", tuple-type-args}
-
-        When `m` is a non-Module callable, `args` must be a Tuple containing
-        tracing example inputs.
-
-        config: A CaptureConfig object that specifies how to interpret the
-        program being captured.
-
-        prim_getters: A set of primitive getter functions to capture the return values of
-
-        dynamic_shapes: Input dynamic shapes.
-
-        When `m` is an nn.Module, `dynamic_shapes` is a dictionary that maps names of method
-        to their input dynamic shapes.
-
-        When `m` is a non-Module callable, `dynamic_shapes` is a list of input dynamic shapes.
-
-    Returns:
-        A MultiMethodExirExportedProgram.
-
-        if `m` is an nn.Module, returned program would have multiple
-        captured methods, each corresponding to one entry in args dictionary.
-
-        if `m` is a non-Module callable, returned program would have a single
-        captured method named `forward`.
-
-    Raises:
-        AssertionError if given method name do not reference a valid method
-        on the given nn.Module.
-    """
-    warnings.warn(
-        "This function is now deprecated, please use `torch.export and exir.to_edge` instead.",
-        DeprecationWarning,
-        stacklevel=1,
-    )
-    # Normalize m and args.
-    compile_specs = []
-    prim_getter_cache: Optional[Dict[str, Any]] = None
-    if isinstance(m, torch.nn.Module):
-        if dynamic_shapes is not None:
-            assert isinstance(
-                dynamic_shapes, dict
-            ), f"Expected a dict for dynamic_shapes, got {type(dynamic_shapes)}"
-
-        if isinstance(args, tuple):
-            compile_specs.append(
-                CompileSpec(
-                    "forward",
-                    m.forward,
-                    args,
-                    (
-                        dynamic_shapes["forward"]
-                        if dynamic_shapes and "forward" in dynamic_shapes
-                        else None
-                    ),
-                )
-            )
-        else:
-            assert isinstance(
-                args, dict
-            ), f"Expected a tuple or Dict[str, tuple], got {type(args)}"
-            for method_name, method_args in args.items():
-                compile_specs.append(
-                    CompileSpec(
-                        method_name,
-                        getattr(m, method_name),
-                        method_args,
-                        (
-                            dynamic_shapes[method_name]
-                            if dynamic_shapes and method_name in dynamic_shapes
-                            else None
-                        ),
-                    )
-                )
-        if prim_getters is not None:
-            prim_getter_cache = {}
-            for getter in prim_getters:
-                prim_getter_cache[getter] = getattr(m, getter)()
-    else:
-        # Reaching here means `m` is a non-Module callable.
-        assert isinstance(
-            m, Callable
-        ), f"Only nn.Module or callable allowed, got {type(m)}"
-        assert isinstance(
-            args, tuple
-        ), f"When tracing a non-Module callable, `args` must be a tuple of tracing inputs, but got {type(args)}"
-        assert (
-            prim_getters is None
-        ), "Caller should not specify primitive getter functions when only providing a callable as input"
-        if dynamic_shapes is not None:
-            assert isinstance(
-                dynamic_shapes, list
-            ), f"Expected a list for constraints, got {type(dynamic_shapes)}"
-        compile_specs.append(CompileSpec("forward", m, args, dynamic_shapes))
-
-    method_name_to_prog = {}
-    for compile_spec in compile_specs:
-        method_name_to_prog[compile_spec.method_name] = capture(
-            compile_spec.callable,
-            compile_spec.args,
-            config,
-            compile_spec.dynamic_shapes,
-        )
-
-    return MultiMethodExirExportedProgram(method_name_to_prog, prim_getter_cache)
-
-
 
 # This is to bootstrap the missing meta["val"] when 1. ph consists of scalar
 # 2. meta["val"] is not properly set in dispatch_trace.
 def _instantiate_missing_placeholder_val_with_real_inputs(gm, args):

exir/program/__init__.py

Lines changed: 0 additions & 6 deletions
@@ -13,9 +13,6 @@
     ExecutorchProgram,
     ExecutorchProgramManager,
     ExirExportedProgram,
-    multi_method_program_to_executorch,
-    MultiMethodExecutorchProgram,
-    MultiMethodExirExportedProgram,
     to_edge,
 )
 
@@ -25,9 +22,6 @@
     "_to_edge",
     "to_edge",
     "edge_to_executorch_passes",
-    "MultiMethodExirExportedProgram",
-    "MultiMethodExecutorchProgram",
-    "multi_method_program_to_executorch",
     "EdgeProgramManager",
     "ExecutorchProgramManager",
 ]
