From cef74f0144e9f0f22daf09d356d3af2a74428508 Mon Sep 17 00:00:00 2001
From: Scott Wolchok
Date: Wed, 8 May 2024 11:24:54 -0700
Subject: [PATCH 01/10] Use compile-time promotion to reduce max/min size & build time (#3459)

Summary: Yet another smaller pair of ops.

Reviewed By: manuelcandales

Differential Revision: D56807402
---
 kernels/portable/cpu/op_maximum.cpp | 68 ++++++++++++++++++++++------
 kernels/portable/cpu/op_minimum.cpp | 69 ++++++++++++++++++++++-------
 2 files changed, 108 insertions(+), 29 deletions(-)

diff --git a/kernels/portable/cpu/op_maximum.cpp b/kernels/portable/cpu/op_maximum.cpp
index 3e34035d5f6..4091f2cf8ca 100644
--- a/kernels/portable/cpu/op_maximum.cpp
+++ b/kernels/portable/cpu/op_maximum.cpp
@@ -20,6 +20,50 @@ const T& max(const T& a, const T& b) {
   return (b > a) ? b : a;
 }
 
+template <
+    bool can_cast,
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct MaximumInner;
+
+template <
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct MaximumInner<true, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT> {
+  static void run(const Tensor& a, const Tensor& b, Tensor& out) {
+    apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
+        // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue)
+        [](const CTYPE_A val_a, const CTYPE_B val_b) {
+          CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
+          CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
+          CTYPE_IN value = max(a_casted, b_casted);
+
+          return static_cast<CTYPE_OUT>(value);
+        },
+        a,
+        b,
+        out);
+  }
+};
+
+struct ReportCanCastBug {
+  static void run(const Tensor&, const Tensor&, Tensor&) {
+    ET_DCHECK_MSG(false, "BUG: canCast should have been checked above");
+  }
+};
+
+template <
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct MaximumInner<false, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT>
+    : public ReportCanCastBug {};
+
 } // namespace
 
 Tensor& maximum_out(
@@ -44,20 +88,16 @@ Tensor& maximum_out(
 
   ET_SWITCH_REALHB_TYPES(a_type, ctx, "maximum.out", CTYPE_A, [&]() {
     ET_SWITCH_REALHB_TYPES(b_type, ctx, "maximum.out", CTYPE_B, [&]() {
-      ET_SWITCH_REALB_TYPES(common_type, ctx, "maximum.out", CTYPE_IN, [&]() {
-        ET_SWITCH_REALHB_TYPES(out_type, ctx, "maximum.out", CTYPE_OUT, [&]() {
-          apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
-              [](const CTYPE_A val_a, const CTYPE_B val_b) {
-                CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
-                CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
-                CTYPE_IN value = max(a_casted, b_casted);
-
-                return static_cast<CTYPE_OUT>(value);
-              },
-              a,
-              b,
-              out);
-        });
+      using CTYPE_IN = typename torch::executor::
+          promote_types<CTYPE_A, CTYPE_B>::type;
+      ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
+      ET_SWITCH_REALHB_TYPES(out_type, ctx, "maximum.out", CTYPE_OUT, [&]() {
+        MaximumInner<
+            can_cast<CTYPE_IN, CTYPE_OUT>::value,
+            CTYPE_A,
+            CTYPE_B,
+            CTYPE_IN,
+            CTYPE_OUT>::run(a, b, out);
       });
     });
   });
diff --git a/kernels/portable/cpu/op_minimum.cpp b/kernels/portable/cpu/op_minimum.cpp
index 767a2c4ca59..7c106a63c4f 100644
--- a/kernels/portable/cpu/op_minimum.cpp
+++ b/kernels/portable/cpu/op_minimum.cpp
@@ -20,6 +20,50 @@ const T& min(const T& a, const T& b) {
   return (b < a) ? b : a;
 }
 
+template <
+    bool can_cast,
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct MinimumInner;
+
+template <
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct MinimumInner<true, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT> {
+  static void run(const Tensor& a, const Tensor& b, Tensor& out) {
+    apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
+        // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue)
+        [](const CTYPE_A val_a, const CTYPE_B val_b) {
+          CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
+          CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
+          CTYPE_IN value = min(a_casted, b_casted);
+
+          return static_cast<CTYPE_OUT>(value);
+        },
+        a,
+        b,
+        out);
+  }
+};
+
+struct ReportCanCastBug {
+  static void run(const Tensor&, const Tensor&, Tensor&) {
+    ET_DCHECK_MSG(false, "BUG: canCast should have been checked above");
+  }
+};
+
+template <
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct MinimumInner<false, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT>
+    : public ReportCanCastBug {};
+
 } // namespace
 
 Tensor& minimum_out(
@@ -44,22 +88,17 @@
 
   ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "minimum.out", CTYPE_A, [&]() {
     ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "minimum.out", CTYPE_B, [&]() {
+      using CTYPE_IN =
+          typename torch::executor::promote_types<CTYPE_A, CTYPE_B>::type;
+      ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
       ET_SWITCH_REAL_TYPES_AND(
-          Bool, common_type, ctx, "minimum.out", CTYPE_IN, [&]() {
-            ET_SWITCH_REAL_TYPES_AND(
-                Bool, out_type, ctx, "minimum.out", CTYPE_OUT, [&]() {
-                  apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
-                      [](const CTYPE_A val_a, const CTYPE_B val_b) {
-                        CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
-                        CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
-                        CTYPE_IN value = min(a_casted, b_casted);
-
-                        return static_cast<CTYPE_OUT>(value);
-                      },
-                      a,
-                      b,
-                      out);
-                });
+          Bool, out_type, ctx, "minimum.out", CTYPE_OUT, [&]() {
+            MinimumInner<
+                can_cast<CTYPE_IN, CTYPE_OUT>::value,
+                CTYPE_A,
+                CTYPE_B,
+                CTYPE_IN,
+                CTYPE_OUT>::run(a, b, out);
           });
     });
   });

From 689ffaf03270933e00a89a6eb768cdb06b03e35b Mon Sep 17 00:00:00 2001
From: Scott Wolchok
Date: Wed, 8 May 2024 11:24:54 -0700
Subject: [PATCH 02/10] Use compile-time promotion to reduce floor_divide size & build time (#3455)

Summary: Continuing rollout of this technique.
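
For readers who have not seen the earlier diffs in this stack: the pattern replaces the runtime ET_SWITCH over common_type with a common dtype computed at compile time from the two input C++ types (promote_types<CTYPE_A, CTYPE_B>), and routes (CTYPE_IN, CTYPE_OUT) combinations that the canCast check already rejects to a stub via partial specialization, so no kernel body is ever generated for them. The sketch below only illustrates that shape; it is not the ExecuTorch helpers themselves. promote, can_cast_v and MaxInner are simplified stand-ins made up for the example, and it builds on its own with C++17.

    // Standalone illustration only; assumed names (promote, can_cast_v,
    // MaxInner) are stand-ins, not the real ExecuTorch helpers.
    #include <cstddef>
    #include <cstdio>
    #include <type_traits>

    // Toy compile-time promotion: a floating operand promotes the pair to
    // float, otherwise the common type stays integral.
    template <typename A, typename B>
    struct promote {
      using type = std::conditional_t<
          std::is_floating_point_v<A> || std::is_floating_point_v<B>,
          float,
          long>;
    };

    // Toy cast rule: refuse to narrow a floating common type into an
    // integral output, mirroring what the runtime canCast() check enforces.
    template <typename IN, typename OUT>
    constexpr bool can_cast_v =
        !(std::is_floating_point_v<IN> && std::is_integral_v<OUT>);

    template <bool can_cast, typename A, typename B, typename IN, typename OUT>
    struct MaxInner; // primary template: declared, never defined

    // Real body: only instantiated for combinations the cast rule allows.
    template <typename A, typename B, typename IN, typename OUT>
    struct MaxInner<true, A, B, IN, OUT> {
      static void run(const A* a, const B* b, OUT* out, std::size_t n) {
        for (std::size_t i = 0; i < n; ++i) {
          IN ac = static_cast<IN>(a[i]);
          IN bc = static_cast<IN>(b[i]);
          out[i] = static_cast<OUT>(ac > bc ? ac : bc);
        }
      }
    };

    // Rejected combinations collapse to an empty stub, so no loop body is
    // ever code-generated for them; that is where the size win comes from.
    template <typename A, typename B, typename IN, typename OUT>
    struct MaxInner<false, A, B, IN, OUT> {
      static void run(const A*, const B*, OUT*, std::size_t) {}
    };

    int main() {
      const int a[2] = {1, 7};
      const float b[2] = {2.5f, 3.0f};
      float out[2] = {};
      // The common type is a compile-time fact of (int, float);
      // no runtime switch over it is needed.
      using IN = promote<int, float>::type;
      MaxInner<can_cast_v<IN, float>, int, float, IN, float>::run(a, b, out, 2);
      std::printf("%g %g\n", out[0], out[1]); // prints: 2.5 7
      return 0;
    }

The real ops keep the runtime canCast()/ET_KERNEL_CHECK guard as well, which is why the rejected specialization only needs the ET_DCHECK_MSG stub in ReportCanCastBug.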
Reviewed By: manuelcandales Differential Revision: D56827786 --- kernels/portable/cpu/op_floor_divide.cpp | 93 +++++++++++++------ .../core/exec_aten/util/scalar_type_util.h | 6 ++ 2 files changed, 70 insertions(+), 29 deletions(-) diff --git a/kernels/portable/cpu/op_floor_divide.cpp b/kernels/portable/cpu/op_floor_divide.cpp index 261f77ce617..0514df0ca25 100644 --- a/kernels/portable/cpu/op_floor_divide.cpp +++ b/kernels/portable/cpu/op_floor_divide.cpp @@ -20,6 +20,60 @@ namespace native { using Tensor = exec_aten::Tensor; using ScalarType = exec_aten::ScalarType; +namespace { +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct FloorDivideInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct FloorDivideInner { + static void + run(const Tensor& a, const Tensor& b, Tensor& out, bool& div_by_zero_error) { + apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [&div_by_zero_error](const CTYPE_A val_a, const CTYPE_B val_b) { + if (is_integral_type::value) { + if (val_b == 0) { + div_by_zero_error = true; + return static_cast(0); + } + } + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = utils::floor_divide(a_casted, b_casted); + + return static_cast(value); + }, + a, + b, + out); + } +}; + +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, Tensor&, bool&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct FloorDivideInner + : public ReportCanCastBug {}; + +} // namespace + Tensor& floor_divide_out( RuntimeContext& ctx, const Tensor& a, @@ -46,36 +100,17 @@ Tensor& floor_divide_out( Bool, a_type, ctx, "floor_divide.out", CTYPE_A, [&]() { ET_SWITCH_REAL_TYPES_AND( Bool, b_type, ctx, "floor_divide.out", CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); ET_SWITCH_REAL_TYPES( - common_type, ctx, "floor_divide.out", CTYPE_IN, [&]() { - ET_SWITCH_REAL_TYPES( - out_type, ctx, "floor_divide.out", CTYPE_OUT, [&]() { - apply_binary_elementwise_fn< - CTYPE_A, - CTYPE_B, - CTYPE_OUT>( - [common_type, &div_by_zero_error]( - const CTYPE_A val_a, const CTYPE_B val_b) { - if (isIntegralType( - common_type, /*includeBool=*/true)) { - if (val_b == 0) { - div_by_zero_error = true; - return static_cast(0); - } - } - CTYPE_IN a_casted = - static_cast(val_a); - CTYPE_IN b_casted = - static_cast(val_b); - CTYPE_IN value = utils::floor_divide( - a_casted, b_casted); - - return static_cast(value); - }, - a, - b, - out); - }); + out_type, ctx, "floor_divide.out", CTYPE_OUT, [&]() { + FloorDivideInner< + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out, div_by_zero_error); }); }); }); diff --git a/runtime/core/exec_aten/util/scalar_type_util.h b/runtime/core/exec_aten/util/scalar_type_util.h index 595ed7a1c02..084289520aa 100644 --- a/runtime/core/exec_aten/util/scalar_type_util.h +++ b/runtime/core/exec_aten/util/scalar_type_util.h @@ -349,6 +349,12 @@ inline constexpr bool isIntegralType( t == exec_aten::ScalarType::Short); } +template +struct is_integral_type + : public std::integral_constant< + bool, + isIntegralType(CppTypeToScalarType::value, includeBool)> {}; + inline constexpr bool isFloatingType(exec_aten::ScalarType t) 
{ return ( t == exec_aten::ScalarType::Double || t == exec_aten::ScalarType::Float || From ccb58e3968e22419fc76a1684184f4e6034bda02 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 8 May 2024 11:24:54 -0700 Subject: [PATCH 03/10] Use compile-time promotion to reduce remainder size & build time (#3458) Summary: Yet another op that can benefit from compile-time type promotion. Reviewed By: manuelcandales Differential Revision: D56831293 --- kernels/portable/cpu/op_remainder.cpp | 81 ++++++++++++++++++--------- kernels/test/op_remainder_test.cpp | 14 +++++ 2 files changed, 70 insertions(+), 25 deletions(-) diff --git a/kernels/portable/cpu/op_remainder.cpp b/kernels/portable/cpu/op_remainder.cpp index 9e48374a81a..7c858c1c08a 100644 --- a/kernels/portable/cpu/op_remainder.cpp +++ b/kernels/portable/cpu/op_remainder.cpp @@ -20,6 +20,52 @@ namespace native { using Tensor = exec_aten::Tensor; +namespace { +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct RemainderInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct RemainderInner { + static void run(const Tensor& a, const Tensor& b, Tensor& out) { + apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = utils::remainder_override(a_casted, b_casted); + + return static_cast(value); + }, + a, + b, + out); + } +}; + +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, Tensor&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct RemainderInner + : public ReportCanCastBug {}; + +} // namespace Tensor& remainder_Tensor_out( RuntimeContext& ctx, const Tensor& a, @@ -45,32 +91,17 @@ Tensor& remainder_Tensor_out( Bool, a_type, ctx, "remainder.Tensor_out", CTYPE_A, [&]() { ET_SWITCH_REAL_TYPES_AND( Bool, b_type, ctx, "remainder.Tensor_out", CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); ET_SWITCH_REAL_TYPES( - common_type, ctx, "remainder.Tensor_out", CTYPE_IN, [&]() { - ET_SWITCH_REAL_TYPES( - out_type, - ctx, - "remainder.Tensor_out", - CTYPE_OUT, - [&]() { - apply_binary_elementwise_fn< - CTYPE_A, - CTYPE_B, - CTYPE_OUT>( - [](const CTYPE_A val_a, const CTYPE_B val_b) { - CTYPE_IN a_casted = - static_cast(val_a); - CTYPE_IN b_casted = - static_cast(val_b); - CTYPE_IN value = utils::remainder_override( - a_casted, b_casted); - - return static_cast(value); - }, - a, - b, - out); - }); + out_type, ctx, "remainder.Tensor_out", CTYPE_OUT, [&]() { + RemainderInner< + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); }); }); }); diff --git a/kernels/test/op_remainder_test.cpp b/kernels/test/op_remainder_test.cpp index 4a550958a1a..254e8122b61 100644 --- a/kernels/test/op_remainder_test.cpp +++ b/kernels/test/op_remainder_test.cpp @@ -21,6 +21,7 @@ using exec_aten::Tensor; using torch::executor::testing::TensorFactory; class OpRemainderOutTest : public OperatorTest { + protected: Tensor& op_remainder_tensor_out( const Tensor& self, const Tensor& other, @@ -35,3 +36,16 @@ class OpRemainderOutTest : public OperatorTest { return 
torch::executor::aten::remainder_outf(context_, self, other, out); } }; + +TEST_F(OpRemainderOutTest, SmokeTest) { + TensorFactory tfDouble; + TensorFactory tfLong; + TensorFactory tfInt; + + Tensor self = tfLong.full({2, 2}, 46); + Tensor other = tfInt.full({2, 2}, 4); + Tensor out = tfDouble.zeros({2, 2}); + Tensor out_expected = tfDouble.full({2, 2}, 2.0); + op_remainder_tensor_out(self, other, out); + EXPECT_TENSOR_CLOSE(out, out_expected); +} From 61485b53058fe29f80bb356f425ab9589d253b3b Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 8 May 2024 11:24:54 -0700 Subject: [PATCH 04/10] Use compile-time promotion to reduce fmod size & build time (#3456) Summary: Almost done with Tensor ops that can benefit from compile-time promotion! Reviewed By: manuelcandales Differential Revision: D56835200 --- kernels/portable/cpu/op_fmod.cpp | 93 ++++++++++++++++++++++---------- kernels/test/op_fmod_test.cpp | 13 +++++ 2 files changed, 78 insertions(+), 28 deletions(-) diff --git a/kernels/portable/cpu/op_fmod.cpp b/kernels/portable/cpu/op_fmod.cpp index 0083c1379d5..42f83731199 100644 --- a/kernels/portable/cpu/op_fmod.cpp +++ b/kernels/portable/cpu/op_fmod.cpp @@ -19,6 +19,60 @@ namespace native { using Tensor = exec_aten::Tensor; +namespace { +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct FmodInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct FmodInner { + static void + run(const Tensor& a, const Tensor& b, Tensor& out, bool& div_by_zero_error) { + apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [&div_by_zero_error](const CTYPE_A val_a, const CTYPE_B val_b) { + if (is_integral_type::value) { + if (val_b == 0) { + div_by_zero_error = true; + return static_cast(0); + } + } + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = std::fmod(a_casted, b_casted); + + return static_cast(value); + }, + a, + b, + out); + } +}; + +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, Tensor&, bool&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct FmodInner + : public ReportCanCastBug {}; + +} // namespace + Tensor& fmod_Tensor_out( RuntimeContext& ctx, const Tensor& a, @@ -44,35 +98,18 @@ Tensor& fmod_Tensor_out( Bool, a_type, ctx, "fmod.Tensor_out", CTYPE_A, [&]() { ET_SWITCH_REAL_TYPES_AND( Bool, b_type, ctx, "fmod.Tensor_out", CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); ET_SWITCH_REAL_TYPES( - common_type, ctx, "fmod.Tensor_out", CTYPE_IN, [&]() { - ET_SWITCH_REAL_TYPES( - out_type, ctx, "fmod.Tensor_out", CTYPE_OUT, [&]() { - apply_binary_elementwise_fn< - CTYPE_A, - CTYPE_B, - CTYPE_OUT>( - [common_type, &div_by_zero_error]( - const CTYPE_A val_a, const CTYPE_B val_b) { - if (isIntegralType( - common_type, /*includeBool=*/true)) { - if (val_b == 0) { - div_by_zero_error = true; - return static_cast(0); - } - } - CTYPE_IN a_casted = - static_cast(val_a); - CTYPE_IN b_casted = - static_cast(val_b); - CTYPE_IN value = std::fmod(a_casted, b_casted); - - return static_cast(value); - }, - a, - b, - out); - }); + out_type, ctx, "fmod.Tensor_out", CTYPE_OUT, [&]() { + FmodInner< + !std::is_same::value && + can_cast::value, + 
CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out, div_by_zero_error); }); }); }); diff --git a/kernels/test/op_fmod_test.cpp b/kernels/test/op_fmod_test.cpp index 475d4ea5cb4..4ee4d84c1cc 100644 --- a/kernels/test/op_fmod_test.cpp +++ b/kernels/test/op_fmod_test.cpp @@ -32,3 +32,16 @@ class OpFmodTest : public OperatorTest { return torch::executor::aten::fmod_outf(context_, self, other, out); } }; + +TEST_F(OpFmodTest, SmokeTest) { + TensorFactory tfDouble; + TensorFactory tfLong; + TensorFactory tfInt; + + Tensor self = tfLong.full({2, 2}, 46); + Tensor other = tfInt.full({2, 2}, 4); + Tensor out = tfDouble.zeros({2, 2}); + Tensor out_expected = tfDouble.full({2, 2}, 2.0); + op_fmod_tensor_out(self, other, out); + EXPECT_TENSOR_CLOSE(out, out_expected); +} From 1a13a29927199ee7bf701cdeafe9f56c11586e66 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 8 May 2024 11:24:54 -0700 Subject: [PATCH 05/10] support Half in minimum and clamp (#3457) Summary: IIUC, these ops need to support Half but don't. Noticed it as a difference from maximum. Reviewed By: manuelcandales Differential Revision: D56846242 --- kernels/portable/cpu/op_clamp.cpp | 18 ++++++------ kernels/portable/cpu/op_minimum.cpp | 27 +++++++++-------- kernels/portable/cpu/util/math_util.h | 42 +++++++++++++++++++++++++++ kernels/test/op_clamp_test.cpp | 25 +++++++++++++--- kernels/test/op_minimum_test.cpp | 4 +++ 5 files changed, 89 insertions(+), 27 deletions(-) diff --git a/kernels/portable/cpu/op_clamp.cpp b/kernels/portable/cpu/op_clamp.cpp index 06c87d03f2d..50d7e8c374d 100644 --- a/kernels/portable/cpu/op_clamp.cpp +++ b/kernels/portable/cpu/op_clamp.cpp @@ -53,7 +53,7 @@ __ET_NODISCARD bool check_bounds( } }); } else if (isFloatingType(out_type)) { - ET_SWITCH_FLOAT_TYPES(out_type, ctx, "clamp", CTYPE_OUT, [&]() { + ET_SWITCH_FLOATH_TYPES(out_type, ctx, "clamp", CTYPE_OUT, [&]() { if (std::isfinite(val) && is_out_of_bounds(val)) { ET_LOG(Error, "%s value out of bounds", val_name); @@ -119,7 +119,7 @@ Tensor& clamp_out( ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out); - ET_SWITCH_REAL_TYPES(out_type, ctx, "clamp", CTYPE_OUT, [&]() { + ET_SWITCH_REALH_TYPES(out_type, ctx, "clamp", CTYPE_OUT, [&]() { // Extract optional min value CTYPE_OUT min = 0; if (has_min) { @@ -140,7 +140,7 @@ Tensor& clamp_out( }); } - ET_SWITCH_REAL_TYPES_AND(Bool, in_type, ctx, "clamp", CTYPE_IN, [&]() { + ET_SWITCH_REALHB_TYPES(in_type, ctx, "clamp", CTYPE_IN, [&]() { apply_unary_map_fn( [has_min, min, has_max, max](const CTYPE_IN val_in) { CTYPE_OUT val_out = static_cast(val_in); @@ -195,20 +195,20 @@ Tensor& clamp_tensor_out( ScalarType out_type = out.scalar_type(); if (has_min) { - common_type = promoteTypes(common_type, min_type); + common_type = promoteTypes(common_type, min_type, /*half_to_float*/ true); } if (has_max) { - common_type = promoteTypes(common_type, max_type); + common_type = promoteTypes(common_type, max_type, /*half_to_float*/ true); } ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); constexpr auto name = "clamp.Tensor_out"; - ET_SWITCH_REALB_TYPES(in_type, ctx, name, CTYPE_IN, [&]() { - ET_SWITCH_REALB_TYPES(min_type, ctx, name, CTYPE_MIN, [&]() { - ET_SWITCH_REALB_TYPES(max_type, ctx, name, CTYPE_MAX, [&]() { - ET_SWITCH_REALB_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { + ET_SWITCH_REALHB_TYPES(in_type, ctx, name, CTYPE_IN, [&]() { + ET_SWITCH_REALHB_TYPES(min_type, ctx, name, CTYPE_MIN, [&]() { + ET_SWITCH_REALHB_TYPES(max_type, ctx, name, CTYPE_MAX, [&]() { + 
ET_SWITCH_REALHB_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { apply_ternary_elementwise_fn< CTYPE_IN, CTYPE_MIN, diff --git a/kernels/portable/cpu/op_minimum.cpp b/kernels/portable/cpu/op_minimum.cpp index 7c106a63c4f..44c0efa8a67 100644 --- a/kernels/portable/cpu/op_minimum.cpp +++ b/kernels/portable/cpu/op_minimum.cpp @@ -81,25 +81,24 @@ Tensor& minimum_out( ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); - ScalarType common_type = promoteTypes(a_type, b_type); + ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); ScalarType out_type = out.scalar_type(); ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); - ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "minimum.out", CTYPE_A, [&]() { - ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "minimum.out", CTYPE_B, [&]() { - using CTYPE_IN = - typename torch::executor::promote_types::type; + ET_SWITCH_REALHB_TYPES(a_type, ctx, "minimum.out", CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, "minimum.out", CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; ET_DCHECK(CppTypeToScalarType::value == common_type); - ET_SWITCH_REAL_TYPES_AND( - Bool, out_type, ctx, "minimum.out", CTYPE_OUT, [&]() { - MinimumInner< - can_cast::value, - CTYPE_A, - CTYPE_B, - CTYPE_IN, - CTYPE_OUT>::run(a, b, out); - }); + ET_SWITCH_REALHB_TYPES(out_type, ctx, "minimum.out", CTYPE_OUT, [&]() { + MinimumInner< + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); + }); }); }); diff --git a/kernels/portable/cpu/util/math_util.h b/kernels/portable/cpu/util/math_util.h index 44cb47f8cba..df175147062 100644 --- a/kernels/portable/cpu/util/math_util.h +++ b/kernels/portable/cpu/util/math_util.h @@ -94,6 +94,48 @@ INT_T max_override(INT_T a, INT_T b) { return std::max(a, b); } +template < + typename T, + typename std::enable_if< + std::is_same::value, + bool>::type = true> +T min_override(T a, T b) { + const auto float_a = static_cast(a); + if (std::isnan(float_a)) { + return a; + } + const auto float_b = static_cast(b); + if (std::isnan(float_b)) { + return b; + } + + if (float_a < float_b) { + return a; + } + return b; +} + +template < + typename T, + typename std::enable_if< + std::is_same::value, + bool>::type = true> +T max_override(T a, T b) { + const auto float_a = static_cast(a); + if (std::isnan(float_a)) { + return a; + } + const auto float_b = static_cast(b); + if (std::isnan(float_b)) { + return b; + } + + if (float_a > float_b) { + return a; + } + return b; +} + /** * There is a slight difference in how std::fmod works compared to how ATen * determines remainders: diff --git a/kernels/test/op_clamp_test.cpp b/kernels/test/op_clamp_test.cpp index 871333482c8..0244fd55700 100644 --- a/kernels/test/op_clamp_test.cpp +++ b/kernels/test/op_clamp_test.cpp @@ -147,8 +147,16 @@ class OpClampOutTest : public OperatorTest { // Test cases that are compatible with float and double. 
template void run_floating_point_test_cases() { - constexpr auto kInfinity = - std::numeric_limits::ctype>::infinity(); + using ctype = typename TensorFactory::ctype; + using opt_infinity_type = std::conditional_t< + std::is_same::value, + float, + ctype>; + constexpr auto kInfinity = std::numeric_limits::infinity(); + const auto kOptInfinity = + OptScalar(static_cast(kInfinity)); + const auto kOptMinusInfinity = + OptScalar(static_cast(-kInfinity)); std::vector> test_cases = { { std::string(__func__) + ": Simple negative/positive clamp", @@ -178,7 +186,7 @@ class OpClampOutTest : public OperatorTest { std::string(__func__) + ": Infinite min", {2, 2}, // sizes {-10.1, -1.1, 1.1, 10.1}, // input_data - OptScalar(-kInfinity), // min + kOptMinusInfinity, // min OptScalar(5.5), // max {-10.1, -1.1, 1.1, 5.5}, // expected_data }, @@ -187,7 +195,7 @@ class OpClampOutTest : public OperatorTest { {2, 2}, // sizes {-10.1, -1.1, 1.1, 10.1}, // input_data OptScalar(-5.5), // min - OptScalar(kInfinity), // max + kOptInfinity, // max {-5.5, -1.1, 1.1, 10.1}, // expected_data }, { @@ -285,6 +293,15 @@ TEST_F(OpClampOutTest, LongTensors) { run_signed_integer_test_cases(); } +TEST_F(OpClampOutTest, HalfTensors) { + // Note that the integer test cases test the situation where the min/max value + // Scalars are integer types, demonstrating that floating point types can be + // clamped to integer values. + run_unsigned_integer_test_cases(); + run_signed_integer_test_cases(); + run_floating_point_test_cases(); +} + TEST_F(OpClampOutTest, FloatTensors) { // Note that the integer test cases test the situation where the min/max value // Scalars are integer types, demonstrating that floating point types can be diff --git a/kernels/test/op_minimum_test.cpp b/kernels/test/op_minimum_test.cpp index be43e0af07d..7e12374b8d1 100644 --- a/kernels/test/op_minimum_test.cpp +++ b/kernels/test/op_minimum_test.cpp @@ -65,6 +65,10 @@ TEST_F(OpMinimumOutTest, LongTensors) { test_minimum_out_same_size(); } +TEST_F(OpMinimumOutTest, HalfTensors) { + test_minimum_out_same_size(); +} + TEST_F(OpMinimumOutTest, FloatTensors) { test_minimum_out_same_size(); } From 74f4fc8cef726a0421727d0ac8cd1d9938c8cdd7 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 8 May 2024 11:24:54 -0700 Subject: [PATCH 06/10] use utils:{min,max}_override in {min,max}imum ops (#3453) Summary: Noticed this inconsistency with clamp. Reviewed By: manuelcandales Differential Revision: D56846313 --- kernels/portable/cpu/op_maximum.cpp | 8 ++------ kernels/portable/cpu/op_minimum.cpp | 8 ++------ kernels/portable/cpu/targets.bzl | 2 ++ 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/kernels/portable/cpu/op_maximum.cpp b/kernels/portable/cpu/op_maximum.cpp index 4091f2cf8ca..1353479b294 100644 --- a/kernels/portable/cpu/op_maximum.cpp +++ b/kernels/portable/cpu/op_maximum.cpp @@ -8,6 +8,7 @@ #include #include +#include #include namespace torch { @@ -15,11 +16,6 @@ namespace executor { namespace native { namespace { -template -const T& max(const T& a, const T& b) { - return (b > a) ? 
b : a; -} - template < bool can_cast, typename CTYPE_A, @@ -40,7 +36,7 @@ struct MaximumInner { [](const CTYPE_A val_a, const CTYPE_B val_b) { CTYPE_IN a_casted = static_cast(val_a); CTYPE_IN b_casted = static_cast(val_b); - CTYPE_IN value = max(a_casted, b_casted); + CTYPE_IN value = utils::max_override(a_casted, b_casted); return static_cast(value); }, diff --git a/kernels/portable/cpu/op_minimum.cpp b/kernels/portable/cpu/op_minimum.cpp index 44c0efa8a67..f18d1a6d368 100644 --- a/kernels/portable/cpu/op_minimum.cpp +++ b/kernels/portable/cpu/op_minimum.cpp @@ -8,6 +8,7 @@ #include #include +#include #include namespace torch { @@ -15,11 +16,6 @@ namespace executor { namespace native { namespace { -template -const T& min(const T& a, const T& b) { - return (b < a) ? b : a; -} - template < bool can_cast, typename CTYPE_A, @@ -40,7 +36,7 @@ struct MinimumInner { [](const CTYPE_A val_a, const CTYPE_B val_b) { CTYPE_IN a_casted = static_cast(val_a); CTYPE_IN b_casted = static_cast(val_b); - CTYPE_IN value = min(a_casted, b_casted); + CTYPE_IN value = utils::min_override(a_casted, b_casted); return static_cast(value); }, diff --git a/kernels/portable/cpu/targets.bzl b/kernels/portable/cpu/targets.bzl index 77796c68526..bffe2fcf48c 100644 --- a/kernels/portable/cpu/targets.bzl +++ b/kernels/portable/cpu/targets.bzl @@ -560,6 +560,7 @@ _ATEN_OPS = ( name = "op_maximum", deps = [ "//executorch/kernels/portable/cpu/util:broadcast_util", + "//executorch/kernels/portable/cpu/util:math_util", ":scalar_utils", ], ), @@ -591,6 +592,7 @@ _ATEN_OPS = ( name = "op_minimum", deps = [ "//executorch/kernels/portable/cpu/util:broadcast_util", + "//executorch/kernels/portable/cpu/util:math_util", ":scalar_utils", ], ), From 9be3d6d79226ee3baa506f3d749f26c74d11d32b Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 8 May 2024 11:24:54 -0700 Subject: [PATCH 07/10] Use compile-time promotion to reduce bitwise op size & build time (#3487) Summary: Finally getting close to the end of compile-time promotion for Tensor ops! Reviewed By: manuelcandales Differential Revision: D56855548 --- kernels/portable/cpu/op_bitwise_and.cpp | 63 ++++++-------------- kernels/portable/cpu/op_bitwise_or.cpp | 61 ++++++------------- kernels/portable/cpu/op_bitwise_xor.cpp | 64 ++++++-------------- kernels/portable/cpu/pattern/bitwise_op.h | 72 +++++++++++++++++++++++ kernels/portable/cpu/pattern/targets.bzl | 11 ++++ kernels/portable/cpu/scalar_utils.h | 22 +++---- kernels/portable/cpu/targets.bzl | 3 + 7 files changed, 152 insertions(+), 144 deletions(-) create mode 100644 kernels/portable/cpu/pattern/bitwise_op.h diff --git a/kernels/portable/cpu/op_bitwise_and.cpp b/kernels/portable/cpu/op_bitwise_and.cpp index b1078f780a4..de137afbec2 100644 --- a/kernels/portable/cpu/op_bitwise_and.cpp +++ b/kernels/portable/cpu/op_bitwise_and.cpp @@ -6,8 +6,10 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include +// patternlint-disable-next-line executorch-cpp-nostdinc +#include +#include #include #include #include @@ -17,20 +19,6 @@ namespace torch { namespace executor { namespace native { -namespace { - -template -CTYPE bitwise_and(CTYPE a, CTYPE b) { - return a & b; -} - -template <> -bool bitwise_and(bool a, bool b) { - return a && b; -} - -} // namespace - using Tensor = exec_aten::Tensor; Tensor& bitwise_and_Tensor_out( @@ -55,38 +43,23 @@ Tensor& bitwise_and_Tensor_out( Bool, a_type, ctx, "bitwise_and.Tensor_out", CTYPE_A, [&]() { ET_SWITCH_INT_TYPES_AND( Bool, b_type, ctx, "bitwise_and.Tensor_out", CTYPE_B, [&]() { - ET_SWITCH_INT_TYPES_AND( + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REAL_TYPES_AND( Bool, - common_type, + out_type, ctx, "bitwise_and.Tensor_out", - CTYPE_IN, + CTYPE_OUT, [&]() { - ET_SWITCH_REAL_TYPES_AND( - Bool, - out_type, - ctx, - "bitwise_and.Tensor_out", - CTYPE_OUT, - [&]() { - apply_binary_elementwise_fn< - CTYPE_A, - CTYPE_B, - CTYPE_OUT>( - [](const CTYPE_A val_a, const CTYPE_B val_b) { - CTYPE_IN a_casted = - static_cast(val_a); - CTYPE_IN b_casted = - static_cast(val_b); - CTYPE_IN value = - bitwise_and(a_casted, b_casted); - - return static_cast(value); - }, - a, - b, - out); - }); + internal::BitwiseOpInner< + can_cast::value, + std::bit_and, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); }); }); }); @@ -142,8 +115,8 @@ Tensor& bitwise_and_Scalar_out( static_cast(val_a); CTYPE_IN b_casted = static_cast(val_b); - CTYPE_IN value = - bitwise_and(a_casted, b_casted); + CTYPE_IN value = std::bit_and()( + a_casted, b_casted); return static_cast(value); }, diff --git a/kernels/portable/cpu/op_bitwise_or.cpp b/kernels/portable/cpu/op_bitwise_or.cpp index c13c68d3db4..39707de07ce 100644 --- a/kernels/portable/cpu/op_bitwise_or.cpp +++ b/kernels/portable/cpu/op_bitwise_or.cpp @@ -6,8 +6,10 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include +// patternlint-disable-next-line executorch-cpp-nostdinc +#include +#include #include #include #include @@ -17,20 +19,6 @@ namespace torch { namespace executor { namespace native { -namespace { - -template -CTYPE bitwise_or(CTYPE a, CTYPE b) { - return a | b; -} - -template <> -bool bitwise_or(bool a, bool b) { - return a || b; -} - -} // namespace - using Tensor = exec_aten::Tensor; Tensor& bitwise_or_Tensor_out( @@ -55,37 +43,23 @@ Tensor& bitwise_or_Tensor_out( Bool, a_type, ctx, "bitwise_or.Tensor_out", CTYPE_A, [&]() { ET_SWITCH_INT_TYPES_AND( Bool, b_type, ctx, "bitwise_or.Tensor_out", CTYPE_B, [&]() { - ET_SWITCH_INT_TYPES_AND( + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REAL_TYPES_AND( Bool, - common_type, + out_type, ctx, "bitwise_or.Tensor_out", - CTYPE_IN, + CTYPE_OUT, [&]() { - ET_SWITCH_REAL_TYPES_AND( - Bool, - out_type, - ctx, - "bitwise_or.Tensor_out", - CTYPE_OUT, - [&]() { - apply_binary_elementwise_fn< - CTYPE_A, - CTYPE_B, - CTYPE_OUT>( - [](const CTYPE_A val_a, const CTYPE_B val_b) { - CTYPE_IN a_casted = - static_cast(val_a); - CTYPE_IN b_casted = - static_cast(val_b); - CTYPE_IN value = bitwise_or(a_casted, b_casted); - - return static_cast(value); - }, - a, - b, - out); - }); + internal::BitwiseOpInner< + can_cast::value, + std::bit_or, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); }); }); }); @@ -141,7 +115,8 @@ Tensor& bitwise_or_Scalar_out( static_cast(val_a); CTYPE_IN b_casted = static_cast(val_b); - CTYPE_IN value = bitwise_or(a_casted, b_casted); + CTYPE_IN value = + std::bit_or()(a_casted, b_casted); return static_cast(value); }, diff --git a/kernels/portable/cpu/op_bitwise_xor.cpp b/kernels/portable/cpu/op_bitwise_xor.cpp index d2ea8a81cfb..1855485ee52 100644 --- a/kernels/portable/cpu/op_bitwise_xor.cpp +++ b/kernels/portable/cpu/op_bitwise_xor.cpp @@ -6,8 +6,10 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include +// patternlint-disable-next-line executorch-cpp-nostdinc +#include +#include #include #include #include @@ -17,20 +19,6 @@ namespace torch { namespace executor { namespace native { -namespace { - -template -CTYPE bitwise_xor(CTYPE a, CTYPE b) { - return a ^ b; -} - -template <> -bool bitwise_xor(bool a, bool b) { - return a != b; -} - -} // namespace - using Tensor = exec_aten::Tensor; Tensor& bitwise_xor_Tensor_out( @@ -38,7 +26,6 @@ Tensor& bitwise_xor_Tensor_out( const Tensor& a, const Tensor& b, Tensor& out) { - // Determine output size and resize for dynamic shapes ET_KERNEL_CHECK( ctx, resize_to_broadcast_target_size(a, b, out) == Error::Ok, @@ -56,38 +43,23 @@ Tensor& bitwise_xor_Tensor_out( Bool, a_type, ctx, "bitwise_xor.Tensor_out", CTYPE_A, [&]() { ET_SWITCH_INT_TYPES_AND( Bool, b_type, ctx, "bitwise_xor.Tensor_out", CTYPE_B, [&]() { - ET_SWITCH_INT_TYPES_AND( + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REAL_TYPES_AND( Bool, - common_type, + out_type, ctx, "bitwise_xor.Tensor_out", - CTYPE_IN, + CTYPE_OUT, [&]() { - ET_SWITCH_REAL_TYPES_AND( - Bool, - out_type, - ctx, - "bitwise_xor.Tensor_out", - CTYPE_OUT, - [&]() { - apply_binary_elementwise_fn< - CTYPE_A, - CTYPE_B, - CTYPE_OUT>( - [](const CTYPE_A val_a, const CTYPE_B val_b) { - CTYPE_IN a_casted = - static_cast(val_a); - CTYPE_IN b_casted = - static_cast(val_b); - CTYPE_IN value = - bitwise_xor(a_casted, b_casted); - - return static_cast(value); - }, - a, - b, - out); - }); + internal::BitwiseOpInner< + can_cast::value, + std::bit_xor, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); }); }); }); @@ -143,8 +115,8 @@ Tensor& bitwise_xor_Scalar_out( static_cast(val_a); CTYPE_IN b_casted = static_cast(val_b); - CTYPE_IN value = - bitwise_xor(a_casted, b_casted); + CTYPE_IN value = std::bit_xor()( + a_casted, b_casted); return static_cast(value); }, diff --git a/kernels/portable/cpu/pattern/bitwise_op.h b/kernels/portable/cpu/pattern/bitwise_op.h new file mode 100644 index 00000000000..dda4fe5cd55 --- /dev/null +++ b/kernels/portable/cpu/pattern/bitwise_op.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include + +namespace torch { +namespace executor { +namespace native { +namespace internal { + +template < + bool can_cast, + template + class OpFunc, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct BitwiseOpInner; + +template < + template + class OpFunc, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct BitwiseOpInner { + static void run(const Tensor& a, const Tensor& b, Tensor& out) { + apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = OpFunc()(a_casted, b_casted); + + return static_cast(value); + }, + a, + b, + out); + } +}; + +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, Tensor&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + template + class OpFunc, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct BitwiseOpInner + : public ReportCanCastBug {}; + +} // namespace internal +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/pattern/targets.bzl b/kernels/portable/cpu/pattern/targets.bzl index 360d991767b..7e0b71ed950 100644 --- a/kernels/portable/cpu/pattern/targets.bzl +++ b/kernels/portable/cpu/pattern/targets.bzl @@ -6,6 +6,17 @@ def define_common_targets(): The directory containing this targets.bzl file should also contain both TARGETS and BUCK files that call this function. """ + runtime.cxx_library( + name = "bitwise_op", + exported_headers = [ + "bitwise_op.h", + ], + compiler_flags = [], + deps = [ + "//executorch/runtime/kernel:kernel_includes", + ], + visibility = ["//executorch/kernels/portable/cpu/...", "//executorch/kernels/optimized/cpu/..."], + ) runtime.cxx_library( name = "pattern", diff --git a/kernels/portable/cpu/scalar_utils.h b/kernels/portable/cpu/scalar_utils.h index 989e7978fc3..3daf3e72526 100644 --- a/kernels/portable/cpu/scalar_utils.h +++ b/kernels/portable/cpu/scalar_utils.h @@ -84,9 +84,9 @@ template struct promote_type_with_scalar_type { private: static_assert( - std::is_same::value || - std::is_same::value || - std::is_same::value, + std::is_same::value || + std::is_same::value || + std::is_same::value, "scalar type can only be Bool, Long or Double"); static_assert( !is_qint_type::value, @@ -102,17 +102,19 @@ struct promote_type_with_scalar_type { "promote_type_with_scalar_type not valid for BFloat16"); using promote_type_with_scalar_type_not_respecting_half_to_float = typename std::conditional< - is_complex_type::value || std::is_same::value, + is_complex_type::value || + std::is_same::value, T1, typename std::conditional< - std::is_same::value, + std::is_same::value, typename std::conditional< - std::is_same::value, - internal::I8, + std::is_same::value, + torch::executor::internal::I8, T1>::type, - typename std:: - conditional::value, T1, internal::F4>:: - type>::type>::type; + typename std::conditional< + is_floating_point::value, + T1, + torch::executor::internal::F4>::type>::type>::type; public: using type = typename std::conditional< diff --git a/kernels/portable/cpu/targets.bzl b/kernels/portable/cpu/targets.bzl index bffe2fcf48c..7be1d94d2bf 100644 --- a/kernels/portable/cpu/targets.bzl +++ b/kernels/portable/cpu/targets.bzl @@ -142,6 +142,7 @@ _ATEN_OPS = ( deps = [ 
"//executorch/runtime/core/exec_aten/util:scalar_type_util", "//executorch/runtime/core/exec_aten/util:tensor_util", + "//executorch/kernels/portable/cpu/pattern:bitwise_op", "//executorch/kernels/portable/cpu/util:broadcast_util", "//executorch/kernels/portable/cpu/util:functional_util", ":scalar_utils", @@ -160,6 +161,7 @@ _ATEN_OPS = ( deps = [ "//executorch/runtime/core/exec_aten/util:scalar_type_util", "//executorch/runtime/core/exec_aten/util:tensor_util", + "//executorch/kernels/portable/cpu/pattern:bitwise_op", "//executorch/kernels/portable/cpu/util:broadcast_util", "//executorch/kernels/portable/cpu/util:functional_util", ":scalar_utils", @@ -170,6 +172,7 @@ _ATEN_OPS = ( deps = [ "//executorch/runtime/core/exec_aten/util:scalar_type_util", "//executorch/runtime/core/exec_aten/util:tensor_util", + "//executorch/kernels/portable/cpu/pattern:bitwise_op", "//executorch/kernels/portable/cpu/util:broadcast_util", "//executorch/kernels/portable/cpu/util:functional_util", ":scalar_utils", From 7bd0547efdd638600671a38dd0acf81d0c188a80 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 8 May 2024 11:24:54 -0700 Subject: [PATCH 08/10] Use compile-time promotion to reduce optimized mul op size & build time (#3532) Summary: another in a long line of fixes. Reviewed By: manuelcandales Differential Revision: D56896048 --- kernels/optimized/cpu/op_mul.cpp | 73 ++++++++++++++++++++++++++------ 1 file changed, 59 insertions(+), 14 deletions(-) diff --git a/kernels/optimized/cpu/op_mul.cpp b/kernels/optimized/cpu/op_mul.cpp index 3b2926a8a74..adcd8999150 100644 --- a/kernels/optimized/cpu/op_mul.cpp +++ b/kernels/optimized/cpu/op_mul.cpp @@ -41,6 +41,50 @@ bool can_use_optimized_path( (a.numel() == b.numel() && a.numel() == out.numel())); return can_use_optimized_path; } + +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MulInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MulInner { + static void run(const Tensor& a, const Tensor& b, Tensor& out) { + apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = a_casted * b_casted; + + return static_cast(value); + }, + a, + b, + out); + } +}; + +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, Tensor&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MulInner + : public ReportCanCastBug {}; } // namespace Tensor& opt_mul_out( @@ -86,20 +130,21 @@ Tensor& opt_mul_out( ET_SWITCH_REALHB_TYPES(a_type, ctx, "mul.out", CTYPE_A, [&]() { ET_SWITCH_REALHB_TYPES(b_type, ctx, "mul.out", CTYPE_B, [&]() { - ET_SWITCH_REALB_TYPES(common_type, ctx, "mul.out", CTYPE_IN, [&]() { - ET_SWITCH_REALHB_TYPES(out_type, ctx, "mul.out", CTYPE_OUT, [&]() { - apply_binary_elementwise_fn( - [](const CTYPE_A val_a, const CTYPE_B val_b) { - CTYPE_IN a_casted = static_cast(val_a); - CTYPE_IN b_casted = static_cast(val_b); - CTYPE_IN value = a_casted * b_casted; - - return static_cast(value); - }, - a, - b, - out); - }); + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REALHB_TYPES(out_type, ctx, "mul.out", CTYPE_OUT, 
[&]() { + apply_binary_elementwise_fn( + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = a_casted * b_casted; + + return static_cast(value); + }, + a, + b, + out); }); }); }); From eaff700c9d77e3d8180ffffe598b3afcff2cd25f Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 8 May 2024 11:24:54 -0700 Subject: [PATCH 09/10] Use compile-time promotion to reduce optimized add/sub op size & build time (#3533) Summary: Yet another pair of ops. Reviewed By: manuelcandales Differential Revision: D57023819 --- kernels/optimized/cpu/op_add.cpp | 83 ++++++++++++++++++++++++-------- kernels/optimized/cpu/op_sub.cpp | 82 +++++++++++++++++++++++-------- 2 files changed, 125 insertions(+), 40 deletions(-) diff --git a/kernels/optimized/cpu/op_add.cpp b/kernels/optimized/cpu/op_add.cpp index c11c9977fe5..b62c3b154fa 100644 --- a/kernels/optimized/cpu/op_add.cpp +++ b/kernels/optimized/cpu/op_add.cpp @@ -16,6 +16,55 @@ namespace torch { namespace executor { namespace native { +namespace { + +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct AddInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct AddInner { + static void + run(const Tensor& a, const Tensor& b, CTYPE_IN alpha_val, Tensor& out) { + apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [alpha_val](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = a_casted + alpha_val * b_casted; + + return static_cast(value); + }, + a, + b, + out); + } +}; + +template +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, CTYPE_IN, Tensor&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct AddInner + : public ReportCanCastBug {}; + +} // namespace using Tensor = exec_aten::Tensor; using ScalarType = exec_aten::ScalarType; @@ -69,26 +118,20 @@ Tensor& opt_add_out( ET_SWITCH_REALHB_TYPES(a_type, ctx, "add.out", CTYPE_A, [&]() { ET_SWITCH_REALHB_TYPES(b_type, ctx, "add.out", CTYPE_B, [&]() { - ET_SWITCH_REALB_TYPES(common_type, ctx, "add.out", CTYPE_IN, [&]() { - ET_SWITCH_REALHB_TYPES(out_type, ctx, "add.out", CTYPE_OUT, [&]() { - CTYPE_IN alpha_val; - ET_KERNEL_CHECK( - ctx, - utils::extract_scalar(alpha, &alpha_val), - InvalidArgument, ); - - apply_binary_elementwise_fn( - [alpha_val](const CTYPE_A val_a, const CTYPE_B val_b) { - CTYPE_IN a_casted = static_cast(val_a); - CTYPE_IN b_casted = static_cast(val_b); - CTYPE_IN value = a_casted + alpha_val * b_casted; - - return static_cast(value); - }, - a, - b, - out); - }); + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REALHB_TYPES(out_type, ctx, "add.out", CTYPE_OUT, [&]() { + CTYPE_IN alpha_val; + ET_KERNEL_CHECK( + ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, ); + + AddInner< + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, alpha_val, out); }); }); }); diff --git a/kernels/optimized/cpu/op_sub.cpp b/kernels/optimized/cpu/op_sub.cpp index 77917c0eda6..87368f3ed76 100644 --- a/kernels/optimized/cpu/op_sub.cpp +++ b/kernels/optimized/cpu/op_sub.cpp @@ -17,6 +17,55 
@@ namespace torch { namespace executor { namespace native { +namespace { + +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct SubInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct SubInner { + static void + run(const Tensor& a, const Tensor& b, CTYPE_IN alpha_val, Tensor& out) { + apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [alpha_val](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = a_casted - alpha_val * b_casted; + + return static_cast(value); + }, + a, + b, + out); + } +}; + +template +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, CTYPE_IN, Tensor&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct SubInner + : public ReportCanCastBug {}; + +} // namespace using Tensor = exec_aten::Tensor; using ScalarType = exec_aten::ScalarType; @@ -72,26 +121,19 @@ Tensor& opt_sub_out( ET_SWITCH_REALH_TYPES(a_type, ctx, "sub.out", CTYPE_A, [&]() { ET_SWITCH_REALH_TYPES(b_type, ctx, "sub.out", CTYPE_B, [&]() { - ET_SWITCH_REAL_TYPES(common_type, ctx, "sub.out", CTYPE_IN, [&]() { - ET_SWITCH_REALH_TYPES(out_type, ctx, "sub.out", CTYPE_OUT, [&]() { - CTYPE_IN alpha_val; - ET_KERNEL_CHECK( - ctx, - utils::extract_scalar(alpha, &alpha_val), - InvalidArgument, ); - - apply_binary_elementwise_fn( - [alpha_val](const CTYPE_A val_a, const CTYPE_B val_b) { - CTYPE_IN a_casted = static_cast(val_a); - CTYPE_IN b_casted = static_cast(val_b); - CTYPE_IN value = a_casted - alpha_val * b_casted; - - return static_cast(value); - }, - a, - b, - out); - }); + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REALH_TYPES(out_type, ctx, "sub.out", CTYPE_OUT, [&]() { + CTYPE_IN alpha_val; + ET_KERNEL_CHECK( + ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, ); + SubInner< + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, alpha_val, out); }); }); }); From 816b50cebfc97db48209901acd042fc0fed1dbb2 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 8 May 2024 11:24:54 -0700 Subject: [PATCH 10/10] Use compile-time promotion to reduce optimized le op size & build time (#3534) Summary: Yet another optimized op. 
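
For a rough sense of why this keeps paying off (ballpark only, assuming the ET_SWITCH macros involved cover on the order of eight real dtypes): switching over a_type, b_type, common_type and out_type stamps out the innermost comparison body once per combination, roughly 8^4 = 4096 instantiations, while deriving the common type at compile time from CTYPE_A and CTYPE_B leaves only the a/b/out switches, roughly 8^3 = 512. The ET_DCHECK against promoteTypes(a_type, b_type) is kept so the compile-time promote_types result cannot silently drift from the runtime promotion rule.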
Reviewed By: manuelcandales

Differential Revision: D57028967
---
 kernels/optimized/cpu/op_le.cpp | 35 +++++++++++++++--------------------
 1 file changed, 15 insertions(+), 20 deletions(-)

diff --git a/kernels/optimized/cpu/op_le.cpp b/kernels/optimized/cpu/op_le.cpp
index 05e7889671b..15481403c2d 100644
--- a/kernels/optimized/cpu/op_le.cpp
+++ b/kernels/optimized/cpu/op_le.cpp
@@ -53,31 +53,26 @@ Tensor& opt_le_tensor_out(
           a.numel());
     });
   } else {
-    ScalarType common_type = promoteTypes(a_type, b_type);
     ET_SWITCH_REAL_TYPES_AND(
         Bool, a_type, ctx, "le.Tensor_out", CTYPE_A, [&]() {
           ET_SWITCH_REAL_TYPES_AND(
              Bool, b_type, ctx, "le.Tensor_out", CTYPE_B, [&]() {
+                using CTYPE_IN = typename torch::executor::
+                    promote_types<CTYPE_A, CTYPE_B>::type;
+                ET_DCHECK(
+                    CppTypeToScalarType<CTYPE_IN>::value ==
+                    promoteTypes(a_type, b_type));
                 ET_SWITCH_REAL_TYPES_AND(
-                    Bool, common_type, ctx, "le.Tensor_out", CTYPE_IN, [&]() {
-                      ET_SWITCH_REAL_TYPES_AND(
-                          Bool,
-                          out_type,
-                          ctx,
-                          "le.Tensor_out",
-                          CTYPE_OUT,
-                          [&]() {
-                            const size_t n = a.numel();
-                            const CTYPE_A* a_data = a.const_data_ptr<CTYPE_A>();
-                            const CTYPE_B* b_data = b.const_data_ptr<CTYPE_B>();
-                            CTYPE_OUT* out_data =
-                                out.mutable_data_ptr<CTYPE_OUT>();
-                            for (auto i = 0; i < n; ++i) {
-                              out_data[i] = static_cast<CTYPE_OUT>(
-                                  static_cast<CTYPE_IN>(a_data[i]) <=
-                                  static_cast<CTYPE_IN>(b_data[i]));
-                            }
-                          });
+                    Bool, out_type, ctx, "le.Tensor_out", CTYPE_OUT, [&]() {
+                      const size_t n = a.numel();
+                      const CTYPE_A* a_data = a.const_data_ptr<CTYPE_A>();
+                      const CTYPE_B* b_data = b.const_data_ptr<CTYPE_B>();
+                      CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
+                      for (auto i = 0; i < n; ++i) {
+                        out_data[i] = static_cast<CTYPE_OUT>(
+                            static_cast<CTYPE_IN>(a_data[i]) <=
+                            static_cast<CTYPE_IN>(b_data[i]));
+                      }
                     });
               });
         });