
Commit 3c352fd

Merge branch 'main' into update_nightly_2
2 parents 4afd911 + 87dd81a commit 3c352fd

9 files changed: +165 -66 lines changed

9 files changed

+165
-66
lines changed

backends/cadence/hifi/operators/op_add.cpp

Lines changed: 15 additions & 2 deletions

@@ -138,8 +138,21 @@ Tensor& add_out(
   if ((out_type != ScalarType::Float) || (alpha_val != 1.0))
     optimized = 0;

-  if ((a_dim == 0) || (b_dim == 0))
-    optimized = 0;
+  bool float_types =
+      (a_type == ScalarType::Float) && (b_type == ScalarType::Float);
+
+  if ((a_dim == 0) && float_types) {
+    for (int i = 0; i < max_dim; i++)
+      out.mutable_data_ptr<float>()[i] =
+          a.const_data_ptr<float>()[0] + b.const_data_ptr<float>()[i];
+    return out;
+  }
+  if ((b_dim == 0) && float_types) {
+    for (int i = 0; i < max_dim; i++)
+      out.mutable_data_ptr<float>()[i] =
+          a.const_data_ptr<float>()[i] + b.const_data_ptr<float>()[0];
+    return out;
+  }

   if ((broadcast == 1) && (max_dim > kNnlibMaxDim))
     optimized = 0;
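
For readers skimming the diff: the new branches short-circuit the case where one operand is a 0-dimensional (scalar) float tensor by broadcasting that scalar across the other operand and returning early, instead of clearing `optimized` and falling back. A minimal standalone sketch of the same pattern (plain C++ with raw buffers standing in for the ExecuTorch Tensor API; the function name and signature here are illustrative only):

#include <cstddef>

// Sketch: broadcast a scalar lhs across rhs, writing element-wise sums to out.
// In the kernel above the pointers come from const_data_ptr<float>() /
// mutable_data_ptr<float>() and the loop bound comes from the operator's code.
void add_scalar_broadcast(float lhs_scalar, const float* rhs, float* out, std::size_t n) {
  for (std::size_t i = 0; i < n; i++) {
    out[i] = lhs_scalar + rhs[i];
  }
}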

backends/cadence/hifi/operators/op_div.cpp

Lines changed: 30 additions & 4 deletions

@@ -86,8 +86,21 @@ div_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) {
   if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float))
     optimized = 0;

-  if ((a_dim == 0) || (b_dim == 0))
-    optimized = 0;
+  bool float_types =
+      (a_type == ScalarType::Float) && (b_type == ScalarType::Float);
+
+  if ((a_dim == 0) && float_types) {
+    for (int i = 0; i < max_dim; i++)
+      out.mutable_data_ptr<float>()[i] =
+          a.const_data_ptr<float>()[0] / b.const_data_ptr<float>()[i];
+    return out;
+  }
+  if ((b_dim == 0) && float_types) {
+    for (int i = 0; i < max_dim; i++)
+      out.mutable_data_ptr<float>()[i] =
+          a.const_data_ptr<float>()[i] / b.const_data_ptr<float>()[0];
+    return out;
+  }

   if ((broadcast == 1) && (max_dim > kNnlibMaxDim))
     optimized = 0;
@@ -201,8 +214,21 @@ Tensor& div_out_mode(
   if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float))
     optimized = 0;

-  if ((a_dim == 0) || (b_dim == 0))
-    optimized = 0;
+  bool float_types =
+      (a_type == ScalarType::Float) && (b_type == ScalarType::Float);
+
+  if ((a_dim == 0) && float_types) {
+    for (int i = 0; i < max_dim; i++)
+      out.mutable_data_ptr<float>()[i] =
+          a.const_data_ptr<float>()[0] / b.const_data_ptr<float>()[i];
+    return out;
+  }
+  if ((b_dim == 0) && float_types) {
+    for (int i = 0; i < max_dim; i++)
+      out.mutable_data_ptr<float>()[i] =
+          a.const_data_ptr<float>()[i] / b.const_data_ptr<float>()[0];
+    return out;
+  }

   if ((broadcast == 1) && (max_dim > kNnlibMaxDim))
     optimized = 0;

backends/cadence/hifi/operators/op_mul.cpp

Lines changed: 16 additions & 3 deletions

@@ -104,10 +104,23 @@ mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) {
   int max_dim = a.dim() > b.dim() ? a.dim() : b.dim();
   max_dim = out.dim() > max_dim ? out.dim() : max_dim;

-  if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float))
-    optimized = 0;
+  bool float_types =
+      (a_type == ScalarType::Float) && (b_type == ScalarType::Float);
+
+  if ((a_dim == 0) && float_types) {
+    for (int i = 0; i < max_dim; i++)
+      out.mutable_data_ptr<float>()[i] =
+          a.const_data_ptr<float>()[0] * b.const_data_ptr<float>()[i];
+    return out;
+  }
+  if ((b_dim == 0) && float_types) {
+    for (int i = 0; i < max_dim; i++)
+      out.mutable_data_ptr<float>()[i] =
+          a.const_data_ptr<float>()[i] * b.const_data_ptr<float>()[0];
+    return out;
+  }

-  if ((a_dim == 0) || (b_dim == 0))
+  if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float))
     optimized = 0;

   if ((broadcast == 1) && (max_dim > kNnlibMaxDim))

backends/cadence/hifi/operators/op_sub.cpp

Lines changed: 15 additions & 2 deletions

@@ -133,8 +133,21 @@ Tensor& sub_out(
   if ((out_type != ScalarType::Float) || (alpha_val != 1.0))
     optimized = 0;

-  if ((a_dim == 0) || (b_dim == 0))
-    optimized = 0;
+  bool float_types =
+      (a_type == ScalarType::Float) && (b_type == ScalarType::Float);
+
+  if ((a_dim == 0) && float_types) {
+    for (int i = 0; i < max_dim; i++)
+      out.mutable_data_ptr<float>()[i] =
+          a.const_data_ptr<float>()[0] - b.const_data_ptr<float>()[i];
+    return out;
+  }
+  if ((b_dim == 0) && float_types) {
+    for (int i = 0; i < max_dim; i++)
+      out.mutable_data_ptr<float>()[i] =
+          a.const_data_ptr<float>()[i] - b.const_data_ptr<float>()[0];
+    return out;
+  }

   if ((broadcast == 1) && (max_dim > kNnlibMaxDim))
     optimized = 0;
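
Note that op_add.cpp, op_sub.cpp, op_mul.cpp, and op_div.cpp all gain the same two scalar-broadcast branches, differing only in the arithmetic operator. A hypothetical helper (not part of this commit) could factor the shared loop out; sketch under that assumption:

// Hypothetical helper, not in this commit: apply `op` between a scalar and a
// contiguous float buffer. Each operator would pass its own lambda.
template <typename Op>
void scalar_broadcast(const float* scalar, const float* vec, float* out, int n, Op op) {
  for (int i = 0; i < n; i++) {
    out[i] = op(scalar[0], vec[i]);
  }
}

// e.g. the a_dim == 0 branch of sub_out would then read:
//   scalar_broadcast(a.const_data_ptr<float>(), b.const_data_ptr<float>(),
//                    out.mutable_data_ptr<float>(), max_dim,
//                    [](float x, float y) { return x - y; });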

backends/vulkan/op_registry.py

Lines changed: 2 additions & 2 deletions

@@ -528,8 +528,6 @@ def register_view_op(features: OpFeatures):
         exir_ops.edge.aten.index_select.default,
         exir_ops.edge.aten.select_copy.int,
         # Tensor combination
-        exir_ops.edge.aten.split_with_sizes_copy.default,
-        exir_ops.edge.aten.split.Tensor,
         exir_ops.edge.aten.repeat.default,
         # Tensor creation
         exir_ops.edge.aten.arange.start_step,
@@ -563,6 +561,8 @@ def register_ported_op(features: OpFeatures):
         exir_ops.edge.aten.permute_copy.default,
         # Tensor combination
         exir_ops.edge.aten.cat.default,
+        exir_ops.edge.aten.split_with_sizes_copy.default,
+        exir_ops.edge.aten.split.Tensor,
     ]
 )
 def register_ported_op_all_packed_dims(features: OpFeatures):

backends/vulkan/runtime/graph/ops/impl/Split.cpp

Lines changed: 43 additions & 47 deletions

@@ -25,8 +25,6 @@ void add_split_with_sizes_default_node(
     ValueRef out_list_ref) {
   vTensorPtr t_in = graph.get_tensor(in);

-  VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim));
-
   ValueListPtr out_list = graph.get_value_list(out_list_ref);

   DimIndex dim_index = normalize_to_dim_index(*t_in, dim);
@@ -38,62 +36,60 @@
     ValueRef out_ref = (*out_list)[split_idx];

     vTensorPtr t_out = graph.get_tensor(out_ref);
-    VK_CHECK_COND(check_packed_dim_is(*t_out, WHCN::kChannelsDim));
     VK_CHECK_COND(dim_at(*t_out, dim_index) == split_size);
   }

-  if (dim_index == kWidth4D) {
-    utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false);
-    utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false);
+  const auto packed_dim = t_in->packed_dim();
+  const auto packed_dim_index = static_cast<DimIndex>(kWidth4D - packed_dim);

-    for (ValueRef out_ref : *out_list) {
-      // Doesn't need to use split_size since we have already verified that the
-      // output tensor's size matches with the split_size.
-      vTensorPtr t_out = graph.get_tensor(out_ref);
-      utils::ivec3 range = t_out->logical_limits();
-      add_copy_offset_node(
-          graph, in, range, src_offset, dst_offset, out_ref, false, true);
+  // Index of dimension to be concatenated in (w, h, c * b) coordinate system
+  const auto dim_xyz_index = std::min(2, -dim_index - 1);

-      src_offset[0] += range[0];
-    }
-  } else if (dim_index == kHeight4D) {
-    utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false);
-    utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false);
+  utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false);
+  utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false);

-    for (ValueRef out_ref : *out_list) {
-      vTensorPtr t_out = graph.get_tensor(out_ref);
-      utils::ivec3 range = t_out->logical_limits();
-      add_copy_offset_node(
-          graph, in, range, src_offset, dst_offset, out_ref, false, true);
+  const bool is_splitting_channel = (dim_index == kChannel4D);

-      src_offset[1] += range[1];
-    }
-  } else if (dim_index == kBatch4D) {
-    utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false);
-    utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false);
+  // if splitting channels
+  if (is_splitting_channel) {
+    // set source offset w as channel size of the input tensor
+    src_offset[3] = dim_at(t_in->sizes(), kChannel4D);
+  }

-    for (ValueRef out_ref : *out_list) {
-      vTensorPtr t_out = graph.get_tensor(out_ref);
-      utils::ivec3 range = t_out->logical_limits();
+  for (ValueRef out_ref : *out_list) {
+    // Doesn't need to use split_size since we have already verified that the
+    // output tensor's size matches with the split_size.
+    vTensorPtr t_out = graph.get_tensor(out_ref);
+    const auto out_channel_size = dim_at(t_out->sizes(), kChannel4D);
+    utils::ivec3 range = t_out->logical_limits();
+
+    if (dim_index == packed_dim_index) {
+      // if splitting channels, use add_copy_channel_offset_node function as
+      // add_copy_packed_dim_offset_node does not support channel packing
+      if (is_splitting_channel) {
+        add_copy_channel_offset_node(
+            graph, in, out_channel_size, src_offset[2], dst_offset[2], out_ref);
+        src_offset[dim_xyz_index] += out_channel_size;
+      } else {
+        // dst_offset[3] is not used now but will be used in the future when
+        // add_copy_packed_dim_offset_node will support channel packing
+        //
+        // set destination offset w as channel size of the output tensor if
+        // splitting channel
+        dst_offset[3] = is_splitting_channel ? out_channel_size : 0;
+        add_copy_packed_dim_offset_node(
+            graph, in, range, src_offset, dst_offset, out_ref);
+        src_offset[dim_xyz_index] += dim_at(t_out->sizes(), packed_dim_index);
+      }
+    } else {
+      // set destination offset w as channel size of the output tensor if
+      // splitting channels
+      dst_offset[3] = is_splitting_channel ? out_channel_size : 0;
       add_copy_offset_node(
           graph, in, range, src_offset, dst_offset, out_ref, false, true);
-
-      src_offset[2] += range[2];
-    }
-  } else if (dim_index == kChannel4D) {
-    int32_t src_offset = 0;
-    int32_t dst_offset = 0;
-
-    for (ValueRef out_ref : *out_list) {
-      vTensorPtr t_out = graph.get_tensor(out_ref);
-      int32_t range = dim_at<kChannel4D>(t_out->sizes());
-      add_copy_channel_offset_node(
-          graph, in, range, src_offset, dst_offset, out_ref);
-      src_offset += range;
+      src_offset[dim_xyz_index] +=
+          is_splitting_channel ? out_channel_size : range[dim_xyz_index];
     }
-
-  } else {
-    VK_THROW("not ipmlemented");
   }
 }

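The key generalization above is that the separate width/height/batch/channel branches collapse into one loop driven by dim_xyz_index = std::min(2, -dim_index - 1). A small sketch of that mapping, assuming the Vulkan backend's negative-indexed DimIndex convention (width = -1, height = -2, channel = -3, batch = -4; the exact enum values are an assumption here, not stated in the diff):

#include <algorithm>
#include <cassert>

// Sketch: map a negative DimIndex onto the x/y/z component of the copy offset.
// Channel and batch both clamp to z because they share the texture's z extent
// (the "(w, h, c * b)" coordinate system mentioned in the comment above).
int dim_to_xyz_index(int dim_index) {
  return std::min(2, -dim_index - 1);
}

int main() {
  assert(dim_to_xyz_index(-1) == 0); // width   -> x
  assert(dim_to_xyz_index(-2) == 1); // height  -> y
  assert(dim_to_xyz_index(-3) == 2); // channel -> z
  assert(dim_to_xyz_index(-4) == 2); // batch   -> z (clamped)
  return 0;
}
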
backends/vulkan/test/op_tests/cases.py

Lines changed: 13 additions & 0 deletions

@@ -922,30 +922,41 @@ def get_split_with_sizes_inputs():
     Test = namedtuple("VkSliceTest", ["self", "sizes", "dim"])
     test_cases = [
         # Split on Width
+        Test(self=(S1, 7, 10, 11), sizes=[1, 3, 2, 5], dim=3),
         Test(self=(S1, 7, 10, 10), sizes=[1, 2, 3, 4], dim=3),
+        Test(self=(7, 10, 11), sizes=[1, 3, 2, 5], dim=2),
         Test(self=(7, 10, 10), sizes=[1, 2, 3, 4], dim=2),
+        Test(self=(7, 10, 11), sizes=[3, 8], dim=2),
         Test(self=(7, 10, 10), sizes=[1, 9], dim=2),
         Test(self=(10, 10), sizes=[1, 9], dim=1),
         Test(self=(10,), sizes=[1, 9], dim=0),
         # Split on Height
+        Test(self=(S1, 7, 11, 10), sizes=[1, 3, 2, 5], dim=2),
         Test(self=(S1, 7, 10, 10), sizes=[1, 2, 3, 4], dim=2),
+        Test(self=(7, 11, 10), sizes=[1, 3, 2, 5], dim=1),
         Test(self=(7, 10, 10), sizes=[1, 2, 3, 4], dim=1),
+        Test(self=(7, 11, 11), sizes=[3, 8], dim=1),
         Test(self=(7, 10, 10), sizes=[10], dim=1),
         Test(self=(7, 6, 10), sizes=[1, 1, 1, 1, 1, 1], dim=1),
         Test(self=(10, 10), sizes=[1, 2, 3, 4], dim=0),
         # Split on Batch
         Test(self=(10, 7, 10, 10), sizes=[3, 6, 1], dim=0),
         Test(self=(10, 7, 10, 10), sizes=[10], dim=0),
         # Split on Channel
+        Test(self=(7, 13, 4, 8), sizes=[3, 5, 2, 3], dim=1),
         Test(self=(7, 13, 4, 8), sizes=[3, 6, 1, 3], dim=1),
+        Test(self=(7, 13, 4, 8), sizes=[3, 2, 2, 5, 1], dim=1),
         Test(self=(7, 13, 4, 8), sizes=[3, 3, 3, 3, 1], dim=1),
+        Test(self=(13, 4, 8), sizes=[3, 5, 2, 1, 2], dim=0),
         Test(self=(13, 4, 8), sizes=[3, 3, 3, 3, 1], dim=0),
         Test(self=(13, 4, 8), sizes=[2, 9, 2], dim=0),
         Test(self=(13, 4, 8), sizes=[13], dim=0),
     ]
     test_suite = VkTestSuite([tuple(tc) for tc in test_cases])

     test_suite.layouts = [
+        "utils::kWidthPacked",
+        "utils::kHeightPacked",
         "utils::kChannelsPacked",
     ]
     test_suite.data_gen = "make_seq_tensor"
@@ -997,6 +1008,8 @@ def get_split_tensor_inputs():
     )

     test_suite.layouts = [
+        "utils::kWidthPacked",
+        "utils::kHeightPacked",
         "utils::kChannelsPacked",
     ]
     test_suite.data_gen = "make_seq_tensor"

exir/emit/test/test_emit.py

Lines changed: 27 additions & 4 deletions

@@ -1534,26 +1534,49 @@ def forward(self, x):
         self.assertEqual(len(program.constant_buffer[1].storage), 8)

     def test_emit_lifted_tensor_constant(self) -> None:
-        class LiftedConstants(nn.Module):
+        class LiftedTensorConstants(nn.Module):
             def __init__(self):
                 super().__init__()

             def forward(self, x):
                 x = x * torch.tensor([[4, 3], [1, 2], [5, 6]], dtype=torch.float)
                 return x

-        model = LiftedConstants()
+        model = LiftedTensorConstants()
+        # Specify that we want to move non-lifted constants to external file
+        et_cfg = ExecutorchBackendConfig(external_constants=True)
+        program = to_edge(
+            export(model, (torch.ones(3, 2),), strict=True)
+        ).to_executorch(et_cfg)
+        program = program._emitter_output.program
+        exec_plan = program.execution_plan[0]
+        # There should only be 1 input to this model.
+        self.assertEqual(len(exec_plan.inputs), 1)
+        self.assertEqual(len(program.constant_buffer), 2)
+        self.assertEqual(len(program.constant_buffer[1].storage), 24)

+    def test_emit_lifted_constant(self) -> None:
+        class LiftedConstants(nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x):
+                x = x + 1
+                return x
+
+        model = LiftedConstants()
+        # Specify that we want to move non-lifted constants to external file
+        et_cfg = ExecutorchBackendConfig(external_constants=True)
         program = to_edge(
             export(model, (torch.ones(3, 2),), strict=True)
-        ).to_executorch()
+        ).to_executorch(et_cfg)

         program = program._emitter_output.program
         exec_plan = program.execution_plan[0]
         # There should only be 1 input to this model.
         self.assertEqual(len(exec_plan.inputs), 1)
         self.assertEqual(len(program.constant_buffer), 2)
-        self.assertEqual(len(program.constant_buffer[1].storage), 24)
+        self.assertEqual(len(program.constant_buffer[1].storage), 8)

     def test_mutable_buffers(self) -> None:
         def count_copies(gm: torch.fx.GraphModule) -> int:

exir/passes/external_constants_pass.py

Lines changed: 4 additions & 2 deletions

@@ -17,15 +17,17 @@ def external_constants_pass(
     gm: GraphModule,
 ) -> PassResult:
     """
-    Move all constants to external file.
+    Move all non-lifted constants to external file.
+    NOTE: Lifted constants are not moved as they are closer
+    to code than data.
     """
     mutated = False
     for module in gm.modules():
         if not isinstance(module, torch.fx.GraphModule):
             continue

         for node in module.graph.nodes:
-            if node.op == "placeholder":
+            if (node.op == "placeholder") and ("_lifted_tensor" not in node.name):
                 spec = node.meta.get("spec")
                 if isinstance(spec, TensorSpec) and spec.const:
                     node.meta["constant_tag"] = "_default_external_constant"
