diff --git a/kernels/portable/cpu/op_amax.cpp b/kernels/portable/cpu/op_amax.cpp
index 6030221d883..4ad409d4820 100644
--- a/kernels/portable/cpu/op_amax.cpp
+++ b/kernels/portable/cpu/op_amax.cpp
@@ -46,13 +46,17 @@ Tensor& amax_out(
   ReduceOverDimListPlan plan(in, dim_list);
   ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "amax.out", CTYPE, [&]() {
     CTYPE* out_data = out.mutable_data_ptr<CTYPE>();
-    for (const auto out_ix : c10::irange(out.numel())) {
-      out_data[out_ix] = plan.execute<CTYPE>(
-          [](CTYPE v, CTYPE max_v) {
-            return std::isnan(v) || v > max_v ? v : max_v;
-          },
-          out_ix);
-    }
+    const bool success = parallel_for_each_reduce_over_dim_list_output_index(
+        in, dim_list, out, [&](const auto begin, const auto end) {
+          for (const auto out_ix : c10::irange(begin, end)) {
+            out_data[out_ix] = plan.execute<CTYPE>(
+                [](CTYPE v, CTYPE max_v) {
+                  return std::isnan(v) || v > max_v ? v : max_v;
+                },
+                out_ix);
+          }
+        });
+    ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed");
   });
 
   return out;
diff --git a/kernels/portable/cpu/op_amin.cpp b/kernels/portable/cpu/op_amin.cpp
index e4979390a5d..396cb6c016d 100644
--- a/kernels/portable/cpu/op_amin.cpp
+++ b/kernels/portable/cpu/op_amin.cpp
@@ -45,13 +45,17 @@ Tensor& amin_out(
   ReduceOverDimListPlan plan(in, dim_list);
   ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "amin.out", CTYPE, [&]() {
     CTYPE* out_data = out.mutable_data_ptr<CTYPE>();
-    for (const auto out_ix : c10::irange(out.numel())) {
-      out_data[out_ix] = plan.execute<CTYPE>(
-          [](CTYPE v, CTYPE min_v) {
-            return std::isnan(v) || v < min_v ? v : min_v;
-          },
-          out_ix);
-    }
+    const bool success = parallel_for_each_reduce_over_dim_list_output_index(
+        in, dim_list, out, [&](const auto begin, const auto end) {
+          for (const auto out_ix : c10::irange(begin, end)) {
+            out_data[out_ix] = plan.execute<CTYPE>(
+                [](CTYPE v, CTYPE min_v) {
+                  return std::isnan(v) || v < min_v ? v : min_v;
+                },
+                out_ix);
+          }
+        });
+    ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed");
   });
 
   return out;
diff --git a/kernels/portable/cpu/op_any.cpp b/kernels/portable/cpu/op_any.cpp
index a368226db80..ee9e54fc0c3 100644
--- a/kernels/portable/cpu/op_any.cpp
+++ b/kernels/portable/cpu/op_any.cpp
@@ -96,16 +96,21 @@ Tensor& any_dims_out(
               static_cast<CTYPE_OUT>(static_cast<bool>(in_data[out_ix]));
         }
       } else {
-        for (const auto out_ix : c10::irange(out.numel())) {
-          bool any = false;
-          if (in_not_empty) {
-            any = plan->execute<CTYPE_IN, bool>(
-                [](CTYPE_IN v) { return static_cast<bool>(v); },
-                [](bool outv, bool acc) { return acc || outv; },
-                out_ix);
-          }
-          out_data[out_ix] = static_cast<CTYPE_OUT>(any);
-        }
+        const bool success =
+            parallel_for_each_reduce_over_dim_list_output_index(
+                in, dim_list, out, [&](const auto begin, const auto end) {
+                  for (const auto out_ix : c10::irange(begin, end)) {
+                    bool any = false;
+                    if (in_not_empty) {
+                      any = plan->execute<CTYPE_IN, bool>(
+                          [](CTYPE_IN v) { return static_cast<bool>(v); },
+                          [](bool outv, bool acc) { return acc || outv; },
+                          out_ix);
+                    }
+                    out_data[out_ix] = static_cast<CTYPE_OUT>(any);
+                  }
+                });
+        ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed");
       }
     });
   });
diff --git a/kernels/portable/cpu/op_mean.cpp b/kernels/portable/cpu/op_mean.cpp
index c13e2a09937..423c2564232 100644
--- a/kernels/portable/cpu/op_mean.cpp
+++ b/kernels/portable/cpu/op_mean.cpp
@@ -46,22 +46,27 @@ Tensor& mean_dim_out(
       out);
 
   MapReduceOverDimListPlan plan(in, dim_list);
-  ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "mean.out", CTYPE_IN, [&] {
-    ET_SWITCH_FLOATHBF16_TYPES(
-        out.scalar_type(), ctx, "mean.out", CTYPE_OUT, [&] {
-          CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
-          const size_t num = get_reduced_dim_product(in, dim_list);
-          for (const auto out_ix : c10::irange(out.numel())) {
-            CTYPE_OUT sum = 0;
-            if (in.numel() > 0) {
-              sum = plan.execute<CTYPE_IN, CTYPE_OUT>(
-                  [](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); },
-                  [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; },
-                  out_ix);
+  // @lint-ignore CLANGTIDY facebook-hte-CArray
+  static constexpr const char op_name[] = "mean.out";
+  ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] {
+    ET_SWITCH_FLOATHBF16_TYPES(out.scalar_type(), ctx, op_name, CTYPE_OUT, [&] {
+      CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
+      const size_t num = get_reduced_dim_product(in, dim_list);
+      const bool success = parallel_for_each_reduce_over_dim_list_output_index(
+          in, dim_list, out, [&](const auto begin, const auto end) {
+            for (const auto out_ix : c10::irange(begin, end)) {
+              CTYPE_OUT sum = 0;
+              if (in.numel() > 0) {
+                sum = plan.execute<CTYPE_IN, CTYPE_OUT>(
+                    [](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); },
+                    [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; },
+                    out_ix);
+              }
+              out_data[out_ix] = sum / static_cast<CTYPE_OUT>(num);
             }
-            out_data[out_ix] = sum / static_cast<CTYPE_OUT>(num);
-          }
-        });
+          });
+      ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed");
+    });
   });
 
   return out;
diff --git a/kernels/portable/cpu/op_sum.cpp b/kernels/portable/cpu/op_sum.cpp
index f58773a6769..550f6b9572f 100644
--- a/kernels/portable/cpu/op_sum.cpp
+++ b/kernels/portable/cpu/op_sum.cpp
@@ -50,23 +50,27 @@ Tensor& sum_dim_out(
   if (in.numel() > 0) {
     plan.emplace(in, dim_list);
   }
-  ET_SWITCH_REALHBBF16_TYPES(
-      in.scalar_type(), ctx, "sum.IntList_out", CTYPE_IN, [&] {
-        ET_SWITCH_REALHBBF16_TYPES(
-            out.scalar_type(), ctx, "sum.IntList_out", CTYPE_OUT, [&] {
-              CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
-              for (const auto out_ix : c10::irange(out.numel())) {
-                CTYPE_OUT sum = 0;
-                if (plan.has_value()) {
-                  sum = plan->execute<CTYPE_IN, CTYPE_OUT>(
-                      [](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); },
-                      [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; },
-                      out_ix);
-                }
-                out_data[out_ix] = sum;
+  // @lint-ignore CLANGTIDY facebook-hte-CArray
+  static constexpr const char op_name[] = "sum.IntList_out";
+  ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] {
+    ET_SWITCH_REALHBBF16_TYPES(out.scalar_type(), ctx, op_name, CTYPE_OUT, [&] {
+      CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
+      const bool success = parallel_for_each_reduce_over_dim_list_output_index(
+          in, dim_list, out, [&](const auto begin, const auto end) {
+            for (const auto out_ix : c10::irange(begin, end)) {
+              CTYPE_OUT sum = 0;
+              if (plan.has_value()) {
+                sum = plan->execute<CTYPE_IN, CTYPE_OUT>(
+                    [](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); },
+                    [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; },
+                    out_ix);
               }
-            });
-      });
+              out_data[out_ix] = sum;
+            }
+          });
+      ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed");
+    });
+  });
 
   return out;
 }
diff --git a/kernels/portable/cpu/op_var.cpp b/kernels/portable/cpu/op_var.cpp
index c5be3fdad62..f09f1d92bc9 100644
--- a/kernels/portable/cpu/op_var.cpp
+++ b/kernels/portable/cpu/op_var.cpp
@@ -21,6 +21,7 @@ namespace {
 
 template <typename CTYPE_IN, typename CTYPE_OUT>
 void compute_variance(
+    KernelRuntimeContext& ctx,
     const Tensor& in,
     Tensor& out,
     optional<ArrayRef<int64_t>> dim_list,
@@ -33,22 +34,26 @@ void compute_variance(
     }
   } else {
     MapReduceOverDimListPlan plan(in, dim_list);
-    for (const auto out_ix : c10::irange(out.numel())) {
-      CTYPE_OUT sum = plan.execute<CTYPE_IN, CTYPE_OUT>(
-          [](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); },
-          [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; },
-          out_ix);
-      CTYPE_OUT mean = sum / static_cast<CTYPE_OUT>(num);
-      CTYPE_OUT sum2 = plan.execute<CTYPE_IN, CTYPE_OUT>(
-          [mean](CTYPE_IN v) {
-            return (
-                (static_cast<CTYPE_OUT>(v) - mean) *
-                (static_cast<CTYPE_OUT>(v) - mean));
-          },
-          [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; },
-          out_ix);
-      out_data[out_ix] = sum2 / denominator;
-    }
+    const bool success = parallel_for_each_reduce_over_dim_list_output_index(
+        in, dim_list, out, [&](const auto begin, const auto end) {
+          for (const auto out_ix : c10::irange(begin, end)) {
+            CTYPE_OUT sum = plan.execute<CTYPE_IN, CTYPE_OUT>(
+                [](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); },
+                [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; },
+                out_ix);
+            CTYPE_OUT mean = sum / static_cast<CTYPE_OUT>(num);
+            CTYPE_OUT sum2 = plan.execute<CTYPE_IN, CTYPE_OUT>(
+                [mean](CTYPE_IN v) {
+                  return (
+                      (static_cast<CTYPE_OUT>(v) - mean) *
+                      (static_cast<CTYPE_OUT>(v) - mean));
+                },
+                [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; },
+                out_ix);
+            out_data[out_ix] = sum2 / denominator;
+          }
+        });
+    ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed");
   }
 }
 
@@ -90,7 +95,7 @@ Tensor& var_out(
 
   ET_SWITCH_FLOATHBF16_TYPES(in.scalar_type(), ctx, name, CTYPE_IN, [&] {
     ET_SWITCH_FLOATHBF16_TYPES(out.scalar_type(), ctx, name, CTYPE_OUT, [&] {
-      compute_variance<CTYPE_IN, CTYPE_OUT>(in, out, dim_list, num, denom);
+      compute_variance<CTYPE_IN, CTYPE_OUT>(ctx, in, out, dim_list, num, denom);
     });
   });
 
@@ -135,7 +140,7 @@ Tensor& var_correction_out(
 
   ET_SWITCH_FLOATHBF16_TYPES(in.scalar_type(), ctx, name, CTYPE_IN, [&] {
     ET_SWITCH_FLOATHBF16_TYPES(out.scalar_type(), ctx, name, CTYPE_OUT, [&] {
-      compute_variance<CTYPE_IN, CTYPE_OUT>(in, out, dim_list, num, denom);
+      compute_variance<CTYPE_IN, CTYPE_OUT>(ctx, in, out, dim_list, num, denom);
     });
   });
 
diff --git a/kernels/portable/cpu/util/reduce_util.h b/kernels/portable/cpu/util/reduce_util.h
index 1c6a6de4101..ff7589af4f5 100644
--- a/kernels/portable/cpu/util/reduce_util.h
+++ b/kernels/portable/cpu/util/reduce_util.h
@@ -823,11 +823,15 @@ template <typename Func>
     executorch::aten::optional<int64_t> dim,
     const Tensor& out,
     const Func& func) {
+#ifdef ET_USE_THREADPOOL
   const ssize_t reduction_size = get_reduced_dim_product(in, dim);
   const auto grain_size = std::max(
       static_cast<ssize_t>(1),
       static_cast<ssize_t>(executorch::extension::internal::GRAIN_SIZE) /
           reduction_size);
+#else // ET_USE_THREADPOOL
+  const auto grain_size = 1;
+#endif // ET_USE_THREADPOOL
   return executorch::extension::parallel_for(0, out.numel(), grain_size, func);
 }
 
@@ -842,11 +846,15 @@ template <typename Func>
     optional<ArrayRef<int64_t>> dim_list,
     const Tensor& out,
     const Func& func) {
+#ifdef ET_USE_THREADPOOL
   const ssize_t reduction_size = get_reduced_dim_product(in, dim_list);
   const auto grain_size = std::max(
       static_cast<ssize_t>(1),
       static_cast<ssize_t>(executorch::extension::internal::GRAIN_SIZE) /
           reduction_size);
+#else // ET_USE_THREADPOOL
+  const auto grain_size = 1;
+#endif // ET_USE_THREADPOOL
   return executorch::extension::parallel_for(0, out.numel(), grain_size, func);
 }
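
Every kernel hunk above follows the same shape: the serial per-output-index reduction loop moves into a lambda that receives a contiguous [begin, end) slice of output indices, and the new reduce_util.h helpers pick a grain size of GRAIN_SIZE divided by the per-output reduction length (clamped to at least 1), falling back to grain size 1 when ET_USE_THREADPOOL is not defined. A minimal sketch of that calling convention, assuming the helper and macros behave exactly as shown in this diff (the kernel body, CTYPE, and the max-style reduce lambda are illustrative, not part of the change):

// Sketch only: the per-output-index pattern applied in each kernel above.
ReduceOverDimListPlan plan(in, dim_list);
CTYPE* out_data = out.mutable_data_ptr<CTYPE>();
const bool success = parallel_for_each_reduce_over_dim_list_output_index(
    in, dim_list, out, [&](const auto begin, const auto end) {
      // Each task handles a contiguous [begin, end) slice of output indices;
      // every out_ix owns its own reduction, so no synchronization is needed.
      for (const auto out_ix : c10::irange(begin, end)) {
        out_data[out_ix] = plan.execute<CTYPE>(
            [](CTYPE v, CTYPE acc) { return v > acc ? v : acc; }, out_ix);
      }
    });
// The helper returns parallel_for's result; surface a scheduling failure as
// an Internal error instead of silently producing incomplete output.
ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed");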