pytorch
diff --git a/.lintrunner.toml b/.lintrunner.toml
Lines changed: 0 additions & 4 deletions
diff --git a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt
Lines changed: 1 addition & 8 deletions
diff --git a/kernels/portable/cpu/op_acos.cpp b/kernels/portable/cpu/op_acos.cpp
Lines changed: 2 additions & 3 deletions
diff --git a/kernels/portable/cpu/op_acosh.cpp b/kernels/portable/cpu/op_acosh.cpp
Lines changed: 2 additions & 3 deletions
diff --git a/kernels/portable/cpu/op_add.cpp b/kernels/portable/cpu/op_add.cpp
Lines changed: 4 additions & 8 deletions
diff --git a/kernels/portable/cpu/op_asin.cpp b/kernels/portable/cpu/op_asin.cpp
Lines changed: 2 additions & 3 deletions
diff --git a/kernels/portable/cpu/op_asinh.cpp b/kernels/portable/cpu/op_asinh.cpp
Lines changed: 2 additions & 3 deletions
diff --git a/kernels/portable/cpu/op_atan.cpp b/kernels/portable/cpu/op_atan.cpp
Lines changed: 2 additions & 3 deletions
diff --git a/kernels/portable/cpu/op_atan2.cpp b/kernels/portable/cpu/op_atan2.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/kernels/portable/cpu/op_atanh.cpp b/kernels/portable/cpu/op_atanh.cpp
Lines changed: 2 additions & 3 deletions
diff --git a/kernels/portable/cpu/op_ceil.cpp b/kernels/portable/cpu/op_ceil.cpp
Lines changed: 1 addition & 3 deletions
diff --git a/kernels/portable/cpu/op_clamp.cpp b/kernels/portable/cpu/op_clamp.cpp
Lines changed: 3 additions & 2 deletions
diff --git a/kernels/portable/cpu/op_cos.cpp b/kernels/portable/cpu/op_cos.cpp
Lines changed: 1 addition & 3 deletions
diff --git a/kernels/portable/cpu/op_cosh.cpp b/kernels/portable/cpu/op_cosh.cpp
Lines changed: 2 additions & 3 deletions
diff --git a/kernels/portable/cpu/op_elu.cpp b/kernels/portable/cpu/op_elu.cpp
Lines changed: 2 additions & 1 deletion
diff --git a/kernels/portable/cpu/op_erf.cpp b/kernels/portable/cpu/op_erf.cpp
Lines changed: 1 addition & 3 deletions
diff --git a/kernels/portable/cpu/op_exp.cpp b/kernels/portable/cpu/op_exp.cpp
Lines changed: 1 addition & 3 deletions
diff --git a/kernels/portable/cpu/op_expm1.cpp b/kernels/portable/cpu/op_expm1.cpp
Lines changed: 2 additions & 5 deletions
diff --git a/kernels/portable/cpu/op_floor.cpp b/kernels/portable/cpu/op_floor.cpp
Lines changed: 1 addition & 3 deletions
diff --git a/kernels/portable/cpu/op_fmod.cpp b/kernels/portable/cpu/op_fmod.cpp
Lines changed: 5 additions & 3 deletions
diff --git a/kernels/portable/cpu/op_isinf.cpp b/kernels/portable/cpu/op_isinf.cpp
Lines changed: 2 additions & 3 deletions
diff --git a/kernels/portable/cpu/op_isnan.cpp b/kernels/portable/cpu/op_isnan.cpp
Lines changed: 2 additions & 3 deletions
diff --git a/kernels/portable/cpu/op_log.cpp b/kernels/portable/cpu/op_log.cpp
Lines changed: 1 addition & 3 deletions
diff --git a/kernels/portable/cpu/op_log10.cpp b/kernels/portable/cpu/op_log10.cpp
Lines changed: 2 additions & 3 deletions
diff --git a/kernels/portable/cpu/op_log1p.cpp b/kernels/portable/cpu/op_log1p.cpp
Lines changed: 2 additions & 3 deletions
diff --git a/kernels/portable/cpu/op_log2.cpp b/kernels/portable/cpu/op_log2.cpp
Lines changed: 2 additions & 3 deletions
diff --git a/kernels/portable/cpu/op_maximum.cpp b/kernels/portable/cpu/op_maximum.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/kernels/portable/cpu/op_minimum.cpp b/kernels/portable/cpu/op_minimum.cpp
Lines changed: 2 additions & 1 deletion
diff --git a/kernels/portable/cpu/op_mul.cpp b/kernels/portable/cpu/op_mul.cpp
Lines changed: 3 additions & 1 deletion
diff --git a/kernels/portable/cpu/op_native_dropout.cpp b/kernels/portable/cpu/op_native_dropout.cpp
Lines changed: 4 additions & 6 deletions
@@ -271,10 +271,6 @@ exclude_patterns = [
     'examples/**',
     'exir/verification/bindings.cpp',
     'extension/**',
-    # Uses properly-gated (ET_USE_PYTORCH_HEADERS) ATen include.
-    'kernels/portable/cpu/util/elementwise_util.h',
-    'kernels/portable/cpu/util/math_util.h',
-    'kernels/portable/cpu/util/vectorized_math.h',
     'kernels/optimized/**',
     'runtime/core/exec_aten/**',
     # Want to be able to keep c10 in sync with PyTorch core.
 
@@ -69,15 +69,8 @@ if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
   target_compile_options(optimized_portable_kernels PUBLIC ${_common_compile_options})
   target_include_directories(optimized_portable_kernels PRIVATE ${TORCH_INCLUDE_DIRS})
   target_compile_definitions(optimized_portable_kernels PRIVATE ET_USE_PYTORCH_HEADERS)
-  gen_selected_ops(LIB_NAME "optimized_portable_ops_lib" OPS_SCHEMA_YAML "${_yaml}")
-  generate_bindings_for_kernels(
-    LIB_NAME "optimized_portable_ops_lib" FUNCTIONS_YAML "${_yaml}"
-  )
-  gen_operators_lib(
-    LIB_NAME "optimized_portable_ops_lib" KERNEL_LIBS optimized_portable_kernels DEPS executorch_core
-  )
   install(
-    TARGETS optimized_portable_kernels optimized_portable_ops_lib
+    TARGETS optimized_portable_kernels
     DESTINATION lib
   )
 endif()
 
@@ -15,9 +15,8 @@ namespace executor {
 namespace native {
 
 Tensor& acos_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
-  static constexpr const char op_name[] = "acos.out";
-  return internal::unary_ufunc_realhbbf16_to_floathbf16<op_name>(
-      [](auto x) { return executorch::math::acos(x); }, ctx, in, out);
+  return internal::unary_ufunc_realhbbf16_to_floathbf16(
+      std::acos, ctx, in, out);
 }
 
 } // namespace native
 
@@ -15,9 +15,8 @@ namespace executor {
 namespace native {
 
 Tensor& acosh_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
-  static constexpr const char op_name[] = "acosh.out";
-  return internal::unary_ufunc_realhbbf16_to_floathbf16<op_name>(
-      [](auto x) { return executorch::math::acosh(x); }, ctx, in, out);
+  return internal::unary_ufunc_realhbbf16_to_floathbf16(
+      std::acosh, ctx, in, out);
 }
 
 } // namespace native
 
@@ -102,18 +102,14 @@ Tensor& add_scalar_out(
   static constexpr const char op_name[] = "add.Scalar_out";
 
   ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
-    CTYPE_COMPUTE val_b = utils::scalar_to<CTYPE_COMPUTE>(b);
-    CTYPE_COMPUTE val_alpha = utils::scalar_to<CTYPE_COMPUTE>(alpha);
-    auto val_alpha_times_b = val_alpha * val_b;
     utils::apply_unitensor_elementwise_fn<
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-        [val_alpha_times_b](const auto val_a) {
-          // Cast here supports vectorization; either it does nothing
-          // or it casts from CTYPE_COMPUTE to
-          // Vectorized<CTYPE_COMPUTE>.
-          return val_a + decltype(val_a)(val_alpha_times_b);
+        [b, alpha](const auto val_a) {
+          CTYPE_COMPUTE val_b = utils::scalar_to<CTYPE_COMPUTE>(b);
+          CTYPE_COMPUTE val_alpha = utils::scalar_to<CTYPE_COMPUTE>(alpha);
+          return val_a + val_alpha * val_b;
         },
         ctx,
         a,
 
@@ -15,9 +15,8 @@ namespace executor {
 namespace native {
 
 Tensor& asin_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
-  static constexpr const char op_name[] = "asin.out";
-  return internal::unary_ufunc_realhbbf16_to_floathbf16<op_name>(
-      [](auto x) { return executorch::math::asin(x); }, ctx, in, out);
+  return internal::unary_ufunc_realhbbf16_to_floathbf16(
+      std::asin, ctx, in, out);
 }
 
 } // namespace native
 
@@ -15,9 +15,8 @@ namespace executor {
 namespace native {
 
 Tensor& asinh_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
-  static constexpr const char op_name[] = "asinh.out";
-  return internal::unary_ufunc_realhbbf16_to_floathbf16<op_name>(
-      [](auto x) { return executorch::math::asinh(x); }, ctx, in, out);
+  return internal::unary_ufunc_realhbbf16_to_floathbf16(
+      std::asinh, ctx, in, out);
 }
 
 } // namespace native
 
@@ -15,9 +15,8 @@ namespace executor {
 namespace native {
 
 Tensor& atan_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
-  static constexpr const char op_name[] = "atan.out";
-  return internal::unary_ufunc_realhbbf16_to_floathbf16<op_name>(
-      [](auto x) { return executorch::math::atan(x); }, ctx, in, out);
+  return internal::unary_ufunc_realhbbf16_to_floathbf16(
+      std::atan, ctx, in, out);
 }
 
 } // namespace native
 
@@ -60,7 +60,7 @@ Tensor& atan2_out(
         op_name,
         utils::SupportedTensorDtypes::FLOATHBF16>(
         [](const auto val_a, const auto val_b) {
-          return executorch::math::atan2(val_a, val_b);
+          return std::atan2(val_a, val_b);
         },
         ctx,
         a,
 
@@ -15,9 +15,8 @@ namespace executor {
 namespace native {
 
 Tensor& atanh_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
-  static constexpr const char op_name[] = "atanh.out";
-  return internal::unary_ufunc_realhbbf16_to_floathbf16<op_name>(
-      [](auto x) { return executorch::math::atanh(x); }, ctx, in, out);
+  return internal::unary_ufunc_realhbbf16_to_floathbf16(
+      std::atanh, ctx, in, out);
 }
 
 } // namespace native
 
@@ -17,9 +17,7 @@ namespace native {
 using executorch::aten::Tensor;
 
 Tensor& ceil_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
-  static constexpr const char op_name[] = "ceil.out";
-  return internal::unary_ufunc_realhbf16<op_name>(
-      [](auto x) { return executorch::math::ceil(x); }, ctx, in, out);
+  return internal::unary_ufunc_realhbf16(std::ceil, ctx, in, out);
 }
 
 } // namespace native
 
@@ -138,8 +138,9 @@ Tensor& clamp_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-        [has_min, min_opt, has_max, max_opt](const auto val_in) {
-          auto val_out = val_in;
+        [has_min, min_opt, has_max, max_opt](const CTYPE_COMPUTE val_in) {
+          // TODO: rewrite this to be vectorization-capable.
+          CTYPE_COMPUTE val_out = val_in;
           if (has_min) {
             val_out = utils::max_override(
                 val_out, utils::scalar_to<CTYPE_COMPUTE>(min_opt.value()));
 
@@ -15,9 +15,7 @@ namespace executor {
 namespace native {
 
 Tensor& cos_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
-  static constexpr const char op_name[] = "cos.out";
-  return internal::unary_ufunc_realhbbf16_to_floathbf16<op_name>(
-      [](auto x) { return executorch::math::cos(x); }, ctx, in, out);
+  return internal::unary_ufunc_realhbbf16_to_floathbf16(std::cos, ctx, in, out);
 }
 
 } // namespace native
 
@@ -15,9 +15,8 @@ namespace executor {
 namespace native {
 
 Tensor& cosh_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
-  static constexpr const char op_name[] = "cosh.out";
-  return internal::unary_ufunc_realhbbf16_to_floathbf16<op_name>(
-      [](auto x) { return executorch::math::cosh(x); }, ctx, in, out);
+  return internal::unary_ufunc_realhbbf16_to_floathbf16(
+      std::cosh, ctx, in, out);
 }
 
 } // namespace native
 
@@ -48,7 +48,8 @@ Tensor& elu_out(
         CTYPE,
         op_name,
         utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-        [negcoef, math_scale, math_input_scale](const CTYPE x) {
+        [negcoef, math_scale, math_input_scale](const auto x) {
+          // TODO: rewrite this to be vectorization-capable.
           return MathT(x) <= MathT(0)
               ? std::expm1(MathT(x) * math_input_scale) * negcoef
               : MathT(x) * math_scale;
 
@@ -15,9 +15,7 @@ namespace executor {
 namespace native {
 
 Tensor& erf_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
-  static constexpr const char op_name[] = "erf.out";
-  return internal::unary_ufunc_realhbbf16_to_floathbf16<op_name>(
-      [](auto x) { return executorch::math::erf(x); }, ctx, in, out);
+  return internal::unary_ufunc_realhbbf16_to_floathbf16(std::erf, ctx, in, out);
 }
 
 } // namespace native
 
@@ -15,9 +15,7 @@ namespace executor {
 namespace native {
 
 Tensor& exp_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
-  static constexpr const char op_name[] = "exp.out";
-  return internal::unary_ufunc_realhbbf16_to_floathbf16<op_name>(
-      [](auto x) { return executorch::math::exp(x); }, ctx, in, out);
+  return internal::unary_ufunc_realhbbf16_to_floathbf16(std::exp, ctx, in, out);
 }
 
 } // namespace native
 
@@ -7,19 +7,16 @@
  */
 
 #include <executorch/kernels/portable/cpu/pattern/pattern.h>
-#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 #include <cmath>
-#include <type_traits>
 
 namespace torch {
 namespace executor {
 namespace native {
 
 Tensor& expm1_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
-  static constexpr const char op_name[] = "expm1.out";
-  return internal::unary_ufunc_realhbbf16_to_floathbf16<op_name>(
-      [](auto x) { return executorch::math::expm1(x); }, ctx, in, out);
+  return internal::unary_ufunc_realhbbf16_to_floathbf16(
+      std::expm1, ctx, in, out);
 }
 
 } // namespace native
 
@@ -17,9 +17,7 @@ namespace native {
 using executorch::aten::Tensor;
 
 Tensor& floor_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
-  static constexpr const char op_name[] = "floor.out";
-  return internal::unary_ufunc_realhbf16<op_name>(
-      [](auto x) { return executorch::math::floor(x); }, ctx, in, out);
+  return internal::unary_ufunc_realhbf16(std::floor, ctx, in, out);
 }
 
 } // namespace native
 
@@ -61,7 +61,7 @@ Tensor& fmod_Tensor_out(
         utils::SupportedTensorDtypes::REALHBF16>(
         [&div_by_zero_error](
             const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
-          // TODO: rewrite this to be vectorization-capable?
+          // TODO: rewrite this to be vectorization-capable.
           CTYPE_COMPUTE value = 0;
           if (is_integral_type<CTYPE_COMPUTE, /*includeBool=*/true>::value) {
             if (val_b == 0) {
@@ -138,8 +138,10 @@ Tensor& fmod_Scalar_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::REALHBF16>(
-        [val_b](const auto val_a) {
-          return executorch::math::fmod(val_a, (decltype(val_a))val_b);
+        [val_b](const CTYPE_COMPUTE val_a) {
+          // TODO: rewrite this to be vectorization-capable.
+          CTYPE_COMPUTE value = std::fmod(val_a, val_b);
+          return value;
         },
         ctx,
         a,
 
@@ -17,9 +17,8 @@ namespace native {
 Tensor& isinf_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
   // Lambda is syntactic sugar needed to workaround compilation on some older
   // non-compatible distros where isnan is returning int rather than bool
-  static constexpr const char op_name[] = "isinf.out";
-  return internal::unary_ufunc_realhb_to_bool<op_name>(
-      [](auto x) -> bool { return std::isinf(x); }, ctx, in, out);
+  return internal::unary_ufunc_realhb_to_bool(
+      [](double x) -> bool { return std::isinf(x); }, ctx, in, out);
 }
 
 } // namespace native
 
@@ -17,9 +17,8 @@ namespace native {
 Tensor& isnan_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
   // Lambda is syntactic sugar needed to workaround compilation on some older
   // non-compatible distros where isnan is returning int rather than bool
-  static constexpr const char op_name[] = "isnan.out";
-  return internal::unary_ufunc_realhb_to_bool<op_name>(
-      [](auto x) -> bool { return std::isnan(x); }, ctx, in, out);
+  return internal::unary_ufunc_realhb_to_bool(
+      [](double x) -> bool { return std::isnan(x); }, ctx, in, out);
 }
 
 } // namespace native
 
@@ -15,9 +15,7 @@ namespace executor {
 namespace native {
 
 Tensor& log_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
-  static constexpr const char op_name[] = "log.out";
-  return internal::unary_ufunc_realhbbf16_to_floathbf16<op_name>(
-      [](auto x) { return executorch::math::log(x); }, ctx, in, out);
+  return internal::unary_ufunc_realhbbf16_to_floathbf16(std::log, ctx, in, out);
 }
 
 } // namespace native
 
@@ -15,9 +15,8 @@ namespace executor {
 namespace native {
 
 Tensor& log10_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
-  static constexpr const char op_name[] = "log10.out";
-  return internal::unary_ufunc_realhbbf16_to_floathbf16<op_name>(
-      [](auto x) { return executorch::math::log10(x); }, ctx, in, out);
+  return internal::unary_ufunc_realhbbf16_to_floathbf16(
+      std::log10, ctx, in, out);
 }
 
 } // namespace native
 
@@ -15,9 +15,8 @@ namespace executor {
 namespace native {
 
 Tensor& log1p_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
-  static constexpr const char op_name[] = "log1p.out";
-  return internal::unary_ufunc_realhbbf16_to_floathbf16<op_name>(
-      [](auto x) { return executorch::math::log1p(x); }, ctx, in, out);
+  return internal::unary_ufunc_realhbbf16_to_floathbf16(
+      std::log1p, ctx, in, out);
 }
 
 } // namespace native
 
@@ -15,9 +15,8 @@ namespace executor {
 namespace native {
 
 Tensor& log2_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
-  static constexpr const char op_name[] = "log2.out";
-  return internal::unary_ufunc_realhbbf16_to_floathbf16<op_name>(
-      [](auto x) { return executorch::math::log2(x); }, ctx, in, out);
+  return internal::unary_ufunc_realhbbf16_to_floathbf16(
+      std::log2, ctx, in, out);
 }
 
 } // namespace native
 
@@ -49,7 +49,7 @@ Tensor& maximum_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::REALHBBF16>(
-        [](const auto val_a, const auto val_b) {
+        [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
           return utils::max_override(val_a, val_b);
         },
         ctx,
 
@@ -49,7 +49,8 @@ Tensor& minimum_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::REALHBBF16>(
-        [](const auto val_a, const auto val_b) {
+        [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
+          // TODO: rewrite this to be vectorization-capable.
           return utils::min_override(val_a, val_b);
         },
         ctx,
 
@@ -72,7 +72,9 @@ Tensor& mul_out(
           CTYPE_COMPUTE,
           op_name,
           utils::SupportedTensorDtypes::REALHBBF16>(
-          [](const auto val_a, const auto val_b) { return val_a * val_b; },
+          [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
+            return val_a * val_b;
+          },
           ctx,
           a,
           utils::SupportedTensorDtypes::REALHBBF16,
 
@@ -57,11 +57,8 @@ std::tuple<Tensor&, Tensor&> native_dropout_out(
     }
     ET_SWITCH_FLOATHBF16_TYPES(
         input.scalar_type(), ctx, op_name, CTYPE_COMPUTE, [&]() {
-          utils::apply_bitensor_elementwise_fn<
-              CTYPE_COMPUTE,
-              op_name,
-              utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-              [](const CTYPE_COMPUTE val, const CTYPE_COMPUTE mask_val) {
+          utils::apply_bitensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
+              [](const auto val, const auto mask_val) {
                 if (!mask_val) {
                   return static_cast<decltype(val)>(0);
                 }
@@ -73,7 +70,8 @@ std::tuple<Tensor&, Tensor&> native_dropout_out(
               mask,
               // TODO: should really be just BOOL
               utils::SupportedTensorDtypes::BOOL_OR_BYTE,
-              out);
+              out,
+              utils::SupportedTensorDtypes::SAME_AS_COMMON);
         });
   } else if (input.numel() > 0) {
     std::memcpy(out.mutable_data_ptr(), input.data_ptr(), input.nbytes());
@@ -17,9 +17,7 @@ namespace native {
 using executorch::aten::Tensor;
 
 Tensor& ceil_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
-  static constexpr const char op_name[] = "ceil.out";
-  return internal::unary_ufunc_realhbf16<op_name>(
-      [](auto x) { return executorch::math::ceil(x); }, ctx, in, out);
+  return internal::unary_ufunc_realhbf16(std::ceil, ctx, in, out);
 }
 
 } // namespace native
@@ -15,9 +15,7 @@ namespace executor {
 namespace native {
 
 Tensor& cos_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
-  static constexpr const char op_name[] = "cos.out";
-  return internal::unary_ufunc_realhbbf16_to_floathbf16<op_name>(
-      [](auto x) { return executorch::math::cos(x); }, ctx, in, out);
+  return internal::unary_ufunc_realhbbf16_to_floathbf16(std::cos, ctx, in, out);
 }
 
 } // namespace native