diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index cf234c22c01..b6a2c500010 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -77,10 +77,20 @@ - arg_meta: null kernel_name: torch::executor::max_pool2d_with_indices_out +- op: maximum.out + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::maximum_out + - op: mean.out kernels: - arg_meta: null - kernel_name: cadence::impl::HiFi::mean_dim_out + kernel_name: cadence::impl::HiFi::mean_dim_out + +- op: minimum.out + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::minimum_out - op: mul.out kernels: @@ -92,6 +102,26 @@ - arg_meta: null kernel_name: torch::executor::permute_copy_out +- op: pow.Scalar_out + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::pow_Scalar_out + +- op: pow.Tensor_Scalar_out + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::pow_Tensor_Scalar_out + +- op: pow.Tensor_Tensor_out + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::pow_Tensor_Tensor_out + +- op: rsqrt.out + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::rsqrt_out + - op: sigmoid.out kernels: - arg_meta: null diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt index 9321cc544e7..3d321443f8b 100644 --- a/backends/cadence/hifi/kernels/CMakeLists.txt +++ b/backends/cadence/hifi/kernels/CMakeLists.txt @@ -9,10 +9,13 @@ add_library( cadence_kernels kernels.cpp ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/matmul_asym8uxasym8u_asym8u.cpp + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c ) diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h index 2c915661f85..10927adc2a2 100644 --- a/backends/cadence/hifi/kernels/kernels.h +++ b/backends/cadence/hifi/kernels/kernels.h @@ -15,6 +15,14 @@ #include "xa_nnlib_kernels_api.h" /* Potential NNLIB function/APIs */ + +extern "C" WORD32 xa_nn_broadcast_32_32( + WORD32* __restrict__ p_out, + const int* const out_shape, + WORD32* __restrict__ p_in, + const int* const in_shape, + int num_dims); + extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32( FLOAT32* __restrict__ p_out, const WORD32* const p_out_shape, @@ -47,6 +55,34 @@ extern "C" WORD32 xa_nn_elm_div_mode_broadcast_4D_f32xf32_f32( const WORD32* const p_inp2_shape, WORD32 mode); +extern "C" WORD32 xa_nn_elm_maximum_f32xf32_f32( + FLOAT32* __restrict__ p_out, + const FLOAT32* __restrict__ p_inp1, + const FLOAT32* __restrict__ p_inp2, + WORD32 num_elm); + +extern "C" WORD32 xa_nn_elm_maximum_broadcast_4D_f32xf32_f32( + FLOAT32* __restrict__ p_out, + const WORD32* const p_out_shape, + const FLOAT32* __restrict__ p_inp1, + const WORD32* const 
p_inp1_shape, + const FLOAT32* __restrict__ p_inp2, + const WORD32* const p_inp2_shape); + +extern "C" WORD32 xa_nn_elm_minimum_f32xf32_f32( + FLOAT32* __restrict__ p_out, + const FLOAT32* __restrict__ p_inp1, + const FLOAT32* __restrict__ p_inp2, + WORD32 num_elm); + +extern "C" WORD32 xa_nn_elm_minimum_broadcast_4D_f32xf32_f32( + FLOAT32* __restrict__ p_out, + const WORD32* const p_out_shape, + const FLOAT32* __restrict__ p_inp1, + const WORD32* const p_inp1_shape, + const FLOAT32* __restrict__ p_inp2, + const WORD32* const p_inp2_shape); + extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32( FLOAT32* __restrict__ p_out, const WORD32* const p_out_shape, @@ -55,6 +91,12 @@ extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32( const FLOAT32* __restrict__ p_inp2, const WORD32* const p_inp2_shape); +extern "C" void xa_nn_elm_pow_f32( + FLOAT32* restrict z, + const FLOAT32* restrict x, + const FLOAT32* restrict y, + WORD32 N); + extern "C" WORD32 xa_nn_elm_where_f32xf32_f32( FLOAT32* __restrict__ p_out, const FLOAT32* __restrict__ p_inp1, diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index fc00345465b..5e51f7fd3be 100644 --- a/backends/cadence/hifi/operators/CMakeLists.txt +++ b/backends/cadence/hifi/operators/CMakeLists.txt @@ -22,8 +22,12 @@ endif() set(_aten_ops__srcs "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_add.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_div.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_maximum.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mean.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_minimum.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mul.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_pow.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_rsqrt.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sigmoid.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sub.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_tanh.cpp" diff --git a/backends/cadence/hifi/operators/op_maximum.cpp b/backends/cadence/hifi/operators/op_maximum.cpp new file mode 100644 index 00000000000..f9a3658891b --- /dev/null +++ b/backends/cadence/hifi/operators/op_maximum.cpp @@ -0,0 +1,175 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include +#include + +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::aten::RuntimeContext; +using executorch::runtime::can_cast; +using executorch::runtime::canCast; +using executorch::runtime::CppTypeToScalarType; +using executorch::runtime::promoteTypes; +using torch::executor::apply_binary_elementwise_fn; +using torch::executor::Error; +using torch::executor::resize_to_broadcast_target_size; + + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { +namespace { + +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MaximumInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MaximumInner { + static void run(const Tensor& a, const Tensor& b, Tensor& out) { + apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = + torch::executor::native::utils::max_override(a_casted, b_casted); + + return static_cast(value); + }, + a, + b, + out); + } +}; + +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, Tensor&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MaximumInner + : public ReportCanCastBug {}; + +} // namespace + +Tensor& maximum_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + (void)ctx; + + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + constexpr int kNnlibMaxDim = 4; /*fallback if broadcast and dim > 4 */ + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + bool optimized = true; + /*find broadcast*/ + bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + bool broadcast = (a_is_broadcasted || b_is_broadcasted); + + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; + + if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) + optimized = false; + if ((broadcast == true) && (max_dim > kNnlibMaxDim)) + optimized = false; + + if (optimized) { + float* a_data = a.mutable_data_ptr(); + float* b_data = b.mutable_data_ptr(); + float* out_data = out.mutable_data_ptr(); + + if (broadcast == true) { + int out_shape[kNnlibMaxDim]; + int inp1_shape[kNnlibMaxDim]; + int inp2_shape[kNnlibMaxDim]; + + for (int i = 0; i < kNnlibMaxDim; i++) { + out_shape[i] = 1; + inp1_shape[i] = 1; + inp2_shape[i] = 1; + } + + int off_o = kNnlibMaxDim - out.dim(); + int off_a = kNnlibMaxDim - a.dim(); + int off_b = kNnlibMaxDim - b.dim(); + + for (int i = 0; i < out.dim(); i++) { + out_shape[i + off_o] = out.size(i); + } + + for (int i = 0; i < a.dim(); i++) + inp1_shape[i + off_a] = a.size(i); + + for (int i = 0; i < b.dim(); i++) + inp2_shape[i + off_b] = b.size(i); + + xa_nn_elm_maximum_broadcast_4D_f32xf32_f32( + out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape); + } else { + xa_nn_elm_maximum_f32xf32_f32(out_data, a_data, b_data, out.numel()); + } + return out; + } + ET_SWITCH_REALHB_TYPES(a_type, ctx, "maximum.out", CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, "maximum.out", CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REALHB_TYPES(out_type, ctx, "maximum.out", CTYPE_OUT, [&]() { + MaximumInner< + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/op_minimum.cpp b/backends/cadence/hifi/operators/op_minimum.cpp new file mode 100644 index 00000000000..6f81ad5c3e3 --- /dev/null +++ b/backends/cadence/hifi/operators/op_minimum.cpp @@ -0,0 +1,173 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include +#include + +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::aten::RuntimeContext; +using executorch::runtime::can_cast; +using executorch::runtime::canCast; +using executorch::runtime::CppTypeToScalarType; +using executorch::runtime::promoteTypes; +using torch::executor::apply_binary_elementwise_fn; +using torch::executor::Error; +using torch::executor::resize_to_broadcast_target_size; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { +namespace { + +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MinimumInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MinimumInner { + static void run(const Tensor& a, const Tensor& b, Tensor& out) { + apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = + torch::executor::native::utils::min_override(a_casted, b_casted); + + return static_cast(value); + }, + a, + b, + out); + } +}; + +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, Tensor&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MinimumInner + : public ReportCanCastBug {}; + +} // namespace + +Tensor& minimum_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + (void)ctx; + + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + constexpr int kNnlibMaxDim = 4; /*fallback if broadcast and dim > 4 */ + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + bool optimized = true; + /*find broadcast*/ + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool broadcast = (a_is_broadcasted || b_is_broadcasted); + + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; + + if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) + optimized = false; + if ((broadcast == true) && (max_dim > kNnlibMaxDim)) + optimized = false; + + if (optimized) { + float* a_data = a.mutable_data_ptr(); + float* b_data = b.mutable_data_ptr(); + float* out_data = out.mutable_data_ptr(); + + if (broadcast == true) { + int out_shape[kNnlibMaxDim]; + int inp1_shape[kNnlibMaxDim]; + int inp2_shape[kNnlibMaxDim]; + + for (int i = 0; i < kNnlibMaxDim; i++) { + out_shape[i] = 1; + inp1_shape[i] = 1; + inp2_shape[i] = 1; + } + + int off_o = kNnlibMaxDim - out.dim(); + int off_a = kNnlibMaxDim - a.dim(); + int off_b = kNnlibMaxDim - b.dim(); + + for (int i = 0; i < out.dim(); i++) { + out_shape[i + off_o] = out.size(i); + } + + for (int i = 0; i < a.dim(); i++) + inp1_shape[i + off_a] = a.size(i); + + for (int i = 0; i < b.dim(); i++) + inp2_shape[i + off_b] = b.size(i); + + xa_nn_elm_minimum_broadcast_4D_f32xf32_f32( + out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape); + } else { + xa_nn_elm_minimum_f32xf32_f32(out_data, a_data, b_data, out.numel()); + } + return out; + } + ET_SWITCH_REALHB_TYPES(a_type, ctx, "minimum.out", CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, "minimum.out", CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REALHB_TYPES(out_type, ctx, "minimum.out", CTYPE_OUT, [&]() { + MinimumInner< + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); + }); + }); + }); + return out; +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/op_pow.cpp b/backends/cadence/hifi/operators/op_pow.cpp new file mode 100644 index 00000000000..9669e961230 --- /dev/null +++ b/backends/cadence/hifi/operators/op_pow.cpp @@ -0,0 +1,354 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include + +using exec_aten::Scalar; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::runtime::can_cast; +using executorch::runtime::canCast; +using executorch::runtime::CppTypeToScalarType; +using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::promoteTypes; +using torch::executor::Error; +using torch::executor::resize_to_broadcast_target_size; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + +namespace { +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct PowInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct PowInner { + static void run(const Tensor& a, const Tensor& b, Tensor& out) { + torch::executor::apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = std::pow(a_casted, b_casted); + return static_cast(value); + }, + a, + b, + out); + } +}; + +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, Tensor&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct PowInner + : public ReportCanCastBug {}; + +} // namespace + +Tensor& pow_Tensor_Tensor_out( + KernelRuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + // Determine output size and resize for dynamic shapes + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK( + ctx, common_type != exec_aten::ScalarType::Bool, InvalidArgument, out); + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + constexpr auto name = "pow.Tensor_Tensor_out"; + constexpr int kNnlibMaxDim = 16; + int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim(); + bool optimized = true; + + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool broadcast = (a_is_broadcasted && b_is_broadcasted); + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; + + if (out_type != ScalarType::Float) + optimized = false; + + if (max_dim > kNnlibMaxDim) + optimized = false; + + WORD32 num_elm = out.numel(); + + if (optimized) { + if (broadcast) { + WORD32* __restrict__ ptr1 = + (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32)); + WORD32* __restrict__ ptr2 = + (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32)); + + WORD32* __restrict__ pin1 = + (WORD32* __restrict__)a.const_data_ptr(); + WORD32* __restrict__ pin2 = + (WORD32* __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[kNnlibMaxDim]; + WORD32 p_inp1_shape[kNnlibMaxDim]; + WORD32 p_inp2_shape[kNnlibMaxDim]; + + for (int i = 0; i < out_dim; i++) + p_out_shape[i] = out.size(i); + for (int i = 0; i < a_dim; i++) + p_inp1_shape[i] = a.size(i); + for (int i = 0; i < b_dim; i++) + p_inp2_shape[i] = b.size(i); + + xa_nn_broadcast_32_32(ptr1, p_out_shape, pin1, p_inp1_shape, out_dim); + + xa_nn_broadcast_32_32(ptr2, p_out_shape, pin2, p_inp2_shape, out_dim); + + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp1 = (const FLOAT32* __restrict__)ptr1; + const FLOAT32* __restrict__ p_inp2 = (const FLOAT32* __restrict__)ptr2; + + xa_nn_elm_pow_f32(p_out, p_inp1, p_inp2, num_elm); + + free(ptr1); + free(ptr2); + } else if (a_is_broadcasted && (!b_is_broadcasted)) { + FLOAT32* __restrict__ ptr1 = + (FLOAT32* __restrict__)malloc((num_elm + 2) * sizeof(WORD32)); + + FLOAT32* __restrict__ pin1 = + (FLOAT32* __restrict__)a.const_data_ptr(); + + WORD32 p_out_shape[kNnlibMaxDim]; + WORD32 p_inp1_shape[kNnlibMaxDim]; + + for (int i = 0; i < out_dim; i++) + p_out_shape[i] = out.size(i); + for (int i = 0; i < a_dim; i++) + p_inp1_shape[i] = a.size(i); + + xa_nn_broadcast_32_32( + (WORD32*)ptr1, p_out_shape, (WORD32*)pin1, p_inp1_shape, out_dim); + + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp1 = (const FLOAT32* __restrict__)ptr1; + const FLOAT32* __restrict__ p_inp2 = + (const FLOAT32* __restrict__)b.const_data_ptr(); + + xa_nn_elm_pow_f32(p_out, p_inp1, p_inp2, num_elm); + + free(ptr1); + } else if (b_is_broadcasted && (!a_is_broadcasted)) { + WORD32* __restrict__ ptr1 = + (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32)); + + WORD32* __restrict__ pin1 = + (WORD32* __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[kNnlibMaxDim]; + WORD32 p_inp1_shape[kNnlibMaxDim]; + + for (int i = 0; i < out_dim; i++) + p_out_shape[i] = out.size(i); + for (int i = 0; i < b_dim; i++) + p_inp1_shape[i] = b.size(i); + + xa_nn_broadcast_32_32(ptr1, p_out_shape, pin1, p_inp1_shape, out_dim); + + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp1 = + (const FLOAT32* __restrict__)a.const_data_ptr(); + const FLOAT32* __restrict__ p_inp2 = (const FLOAT32* __restrict__)ptr1; + + xa_nn_elm_pow_f32(p_out, p_inp1, p_inp2, num_elm); + + free(ptr1); + } else { + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp1 = + (const FLOAT32* __restrict__)a.const_data_ptr(); + const FLOAT32* __restrict__ p_inp2 = + (const FLOAT32* __restrict__)b.const_data_ptr(); + + xa_nn_elm_pow_f32(p_out, p_inp1, p_inp2, num_elm); + } + return out; + } + + ET_SWITCH_REALHB_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + 
ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REALH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { + PowInner< + !std::is_same::value && + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); + }); + }); + }); + + return out; +} + +Tensor& pow_Tensor_Scalar_out( + KernelRuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = torch::executor::native::utils::get_scalar_dtype(b); + ScalarType common_type = + torch::executor::native::utils::promote_type_with_scalar( + a_type, b, /*half_to_float*/ false); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out); + + constexpr auto name = "pow.Tensor_Scalar_out"; + if (common_type == ScalarType::Half) { + common_type = ScalarType::Float; + } + + ET_SWITCH_REALHB_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + ET_SWITCH_REAL_TYPES(common_type, ctx, name, CTYPE_IN, [&]() { + ET_SWITCH_REALH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { + CTYPE_B val_b = 0; + torch::executor::native::utils::extract_scalar(b, &val_b); + torch::executor::apply_unary_map_fn( + [val_b](const CTYPE_A val_a) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = std::pow(a_casted, b_casted); + + return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +Tensor& pow_Scalar_out( + KernelRuntimeContext& ctx, + const Scalar& a, + const Tensor& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, b.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = torch::executor::native::utils::get_scalar_dtype(a); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = + torch::executor::native::utils::promote_type_with_scalar( + b_type, a, /*half_to_float*/ false); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out); + + constexpr auto name = "pow.Scalar_out"; + if (common_type == ScalarType::Half) { + common_type = ScalarType::Float; + } + + ET_SWITCH_SCALAR_OBJ_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + ET_SWITCH_REAL_TYPES(common_type, ctx, name, CTYPE_IN, [&]() { + ET_SWITCH_REALH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { + CTYPE_A val_a = 0; + torch::executor::native::utils::extract_scalar(a, &val_a); + + torch::executor::apply_unary_map_fn( + [val_a](const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = std::pow(a_casted, b_casted); + return static_cast(value); + }, + b.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence + diff --git a/backends/cadence/hifi/operators/op_rsqrt.cpp b/backends/cadence/hifi/operators/op_rsqrt.cpp new file mode 100644 index 00000000000..1cf717988aa --- /dev/null +++ b/backends/cadence/hifi/operators/op_rsqrt.cpp @@ -0,0 +1,55 @@ 
+/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include + +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::aten::RuntimeContext; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { +namespace { + +double rsqrt(double x) { + return 1.0 / std::sqrt(x); +} + +} // namespace + +Tensor& rsqrt_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { + bool optimized = true; + + if (out.scalar_type() != ScalarType::Float) + optimized = false; + + if (optimized) { + WORD32 num_elm = out.numel(); + + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp = + (const FLOAT32* __restrict__)in.const_data_ptr(); + + xa_nn_elm_rsqrt_f32_f32(p_out, p_inp, num_elm); + return out; + } + + return torch::executor::native::internal:: + unary_ufunc_realhbbf16_to_floathbf16(rsqrt, ctx, in, out); +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/quantized_linear_out.cpp b/backends/cadence/hifi/operators/quantized_linear_out.cpp index 0f56a1a9631..accc6101329 100644 --- a/backends/cadence/hifi/operators/quantized_linear_out.cpp +++ b/backends/cadence/hifi/operators/quantized_linear_out.cpp @@ -26,6 +26,9 @@ using ::executorch::aten::Tensor; using ::executorch::runtime::getLeadingDims; using ::executorch::runtime::KernelRuntimeContext; + + // The nnlib kernel to compute quantized linear via matmul. + void _quantized_linear_asym8u( const Tensor& in, const Tensor& weight, @@ -37,37 +40,30 @@ void _quantized_linear_asym8u( int64_t out_zero_point, __ET_UNUSED const optional& offset, Tensor& out) { - // input comes in shape [leading_dims, in_dim] - // weight comes in shape [out_dim, in_dim] - // output comes in empty with shape [leading_dims, out_dim] - // Perform matrix multiply (M x N) x (N x P)' => M x P const int64_t leading_dims = getLeadingDims(in, in.dim() - 1); const int64_t out_dim = weight.size(0); // = out_dim const int64_t in_dim = weight.size(1); // = in_dim - const uint8_t* __restrict__ in_data = in.const_data_ptr(); const uint8_t* __restrict__ weight_data = weight.const_data_ptr(); const int32_t* __restrict__ bias_data = bias.const_data_ptr(); uint8_t* __restrict__ out_data = out.mutable_data_ptr(); - - // The nnlib kernel to compute quantized linear via matmul. int32_t ret = xa_nn_matmul_asym8uxasym8u_asym8u( - out_data, // p_out - weight_data, // p_mat1, - in_data, // p_mat2, - bias_data, // p_bias - out_dim, // rows of p_mat1 - in_dim, // cols of p_mat1 - in_dim, // row_stride of p_mat1 - leading_dims, // vec_count, i.e., rows of p_mat2 - in_dim, // vec_offset of p_mat2. 
- out_dim, // out_offset, i.e., offset of next output element written - 1, // out_stride, i.e., stride to go to next output row + out_data, + weight_data, + in_data, + bias_data, + out_dim, + in_dim, + in_dim, + leading_dims, + in_dim, + out_dim, + 1, -weight_zero_point.const_data_ptr()[0], // mat1_zero_bias -in_zero_point, // mat2_zero_bias - out_multiplier.const_data_ptr()[0], // out_multiplier - out_shift.const_data_ptr()[0], // out_shift - out_zero_point); // out_zero_bias + out_multiplier.const_data_ptr()[0], + out_shift.const_data_ptr()[0], + out_zero_point); ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear failed"); } diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c new file mode 100644 index 00000000000..cad3f1a25bb --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c @@ -0,0 +1,313 @@ +/******************************************************************************* +* Copyright (c) 2018-2024 Cadence Design Systems, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +******************************************************************************/ +/* + * xa_nn_broadcast_8_8.c + */ + +#include "xa_nnlib_common.h" +//#include "xa_nn_basic_state.h" + +#include +#include + +#include "stdio.h" + +/* + * This file is sourced from ../hifi5/xa_nn_broadcast_8_8.c + */ + +#define NUMDIMS_MAX 8 + +typedef struct bcast_expansion_struct_{ + size_t load_num_elem; + int replicate_loadedElm_times; + int repeat_operation; +} bcast_expansion_rule ; + +WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, + WORD32 *dst, WORD32 *src); + +void *xa_nn_memcpy(void * dest1,const void *src1, size_t n1) +{ + char *dest = (char *)dest1; + char *src = (char *)src1; + int n = (int)n1; + ae_int16x4 * __restrict d_align_addr, * __restrict s_align_addr; + int i; + void *orig_dest = dest; + + if (n < 32) { + return memcpy(dest, src, n); + } + + if ( !(((int) dest) %8) && !(((int) src) %8)) { // 64-bit aligned + s_align_addr = (ae_int16x4 *) src; + d_align_addr = (ae_int16x4 *) dest; + for (i=0; i>3; i++) { + d_align_addr[i] = s_align_addr[i]; + } + + for (i=(n&~7); i>3; i++) { + AE_LA16X4_IP(t, s_align, s_align_addr); + AE_LA16X4_IP(t2, s_align, s_align_addr); + AE_SA16X4_IP(t, d_align, d_align_addr); + AE_SA16X4_IP(t2, d_align, d_align_addr); + } + AE_SA64POS_FP(d_align, d_align_addr); + ae_int16 *s_src = (ae_int16 *) src; + ae_int16 *s_dest = (ae_int16 *) dest; + for (i=8*i; i8, -1); + + int i = 0; + + /* Check for valid IO shapes */ + for(i=0; i=0){ + + /* Find the sub-matrix size */ + while(in_shape[dim] != 1 && dim>=0){ + num_elem_load *= out_shape[dim]; + dim--; + } + + /* Find the number of times this sub-matrix needs to be copied */ + num_copy_times = 1; + while(in_shape[dim] == 1 && dim>=0){ + num_copy_times *= out_shape[dim]; + dim--; + } + + /* Find the number of times the above copy needs to be repeated */ + num_repeat = 1; + while(in_shape[dim] != 1 && dim>=0){ + num_repeat *= 1 * out_shape[dim]; + dim--; + } + + bcast_expansion_steps[k].load_num_elem = num_elem_load; + bcast_expansion_steps[k].replicate_loadedElm_times = num_copy_times; + bcast_expansion_steps[k].repeat_operation = num_repeat; + k++; + + num_elem_load = num_elem_load * num_copy_times * num_repeat; + } + + res = broadcast_node_32(bcast_expansion_steps, num_dims-1, + p_out, p_in); + (void)res; /* Unused return value */ + + return 0; +} + +WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, + WORD32 *dst, WORD32 *src) { + int step_itr=0, rep_itr=0; + int i=0, j=0, k=0; + bcast_expansion_rule *step = NULL; + + // ignore steps that are null + while(steps[step_id].repeat_operation == 0 && step_id>0){ + step_id--; + } + + // step is now the parent node for this iteration + step = &steps[step_id]; + size_t numLoadedElm = step->load_num_elem; + + WORD32 *cp_dst = dst; + WORD32 *cp_src = src; + WORD32 *cp_src_temp=NULL; + WORD32 *cp_dst_temp=NULL; + + if(numLoadedElm>32){ + if(step_id > 0){ + for(step_itr=0; step_itrrepeat_operation; step_itr++){ + src = broadcast_node_32(steps, step_id-1, dst, src); + cp_src = dst; + cp_dst = dst + numLoadedElm; + for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ + xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); + cp_dst += numLoadedElm; + } + dst = cp_dst; + } + return src; + } else { + if(numLoadedElm == 1){ + for(j=0; jrepeat_operation; j++){ +// memset((void*)cp_dst, (void*)cp_src, 4 * step->replicate_loadedElm_times); + for(i = 0; i < step->replicate_loadedElm_times; i++) + cp_dst[i] = cp_src[0]; + cp_dst += 
step->replicate_loadedElm_times; + cp_src++; + } + } else { + for(j=0; jrepeat_operation; j++){ + for(i=0; ireplicate_loadedElm_times; i++){ + xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); + cp_dst += numLoadedElm; + } + cp_src += numLoadedElm; + } + } + return cp_src; + } + } + else{ + if(step_id > 0){ + for(step_itr=0; step_itrrepeat_operation; step_itr++){ + src = broadcast_node_32(steps, step_id-1, dst, src); + cp_src = dst; + cp_dst = dst + numLoadedElm; + for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ + for(k=0; k<(int)numLoadedElm; k++){ + cp_src_temp = cp_src; + cp_dst_temp = cp_dst; + cp_dst_temp[k] = cp_src_temp[k]; + } + cp_dst += numLoadedElm; + } + dst = cp_dst; + } + return src; + } else { + if(numLoadedElm == 1){ + for(j=0; jrepeat_operation; j++){ +// memset((void*)cp_dst, *(WORD32 *)cp_src, 4 * step->replicate_loadedElm_times); + for(i = 0; i < step->replicate_loadedElm_times; i++) + cp_dst[i] = cp_src[0]; + cp_dst += step->replicate_loadedElm_times; + cp_src++; + } + } else { + for(j=0; j < step->repeat_operation; j++){ + for(i=0; i < step->replicate_loadedElm_times; i++){ + for(k=0; k<(int)(numLoadedElm); k++){ + cp_src_temp = cp_src; + cp_dst_temp = cp_dst; + cp_dst_temp[k] = cp_src_temp[k]; + + } + cp_dst += numLoadedElm; + } + cp_src += numLoadedElm; + } + } + return cp_src; + } + } +} diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32_32.c new file mode 100644 index 00000000000..34a7111ee78 --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32_32.c @@ -0,0 +1,313 @@ +/******************************************************************************* +* Copyright (c) 2018-2024 Cadence Design Systems, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +******************************************************************************/ +/* + * xa_nn_broadcast_32_32.c + */ + +#include "xa_nnlib_common.h" +//#include "xa_nn_basic_state.h" + +#include +#include + +#include "stdio.h" + +/* + * This file is sourced from ../hifi5/xa_nn_broadcast_8_8.c + */ + +#define NUMDIMS_MAX 8 + +typedef struct bcast_expansion_struct_{ + size_t load_num_elem; + int replicate_loadedElm_times; + int repeat_operation; +} bcast_expansion_rule ; + +WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, + WORD32 *dst, WORD32 *src); + +void *xa_nn_memcpy(void * dest1,const void *src1, size_t n1) +{ + char *dest = (char *)dest1; + char *src = (char *)src1; + int n = (int)n1; + ae_int16x4 * __restrict d_align_addr, * __restrict s_align_addr; + int i; + void *orig_dest = dest; + + if (n < 32) { + return memcpy(dest, src, n); + } + + if ( !(((int) dest) %8) && !(((int) src) %8)) { // 64-bit aligned + s_align_addr = (ae_int16x4 *) src; + d_align_addr = (ae_int16x4 *) dest; + for (i=0; i>3; i++) { + d_align_addr[i] = s_align_addr[i]; + } + + for (i=(n&~7); i>3; i++) { + AE_LA16X4_IP(t, s_align, s_align_addr); + AE_LA16X4_IP(t2, s_align, s_align_addr); + AE_SA16X4_IP(t, d_align, d_align_addr); + AE_SA16X4_IP(t2, d_align, d_align_addr); + } + AE_SA64POS_FP(d_align, d_align_addr); + ae_int16 *s_src = (ae_int16 *) src; + ae_int16 *s_dest = (ae_int16 *) dest; + for (i=8*i; i8, -1); + + int i = 0; + + /* Check for valid IO shapes */ + for(i=0; i=0){ + + /* Find the sub-matrix size */ + while(in_shape[dim] != 1 && dim>=0){ + num_elem_load *= out_shape[dim]; + dim--; + } + + /* Find the number of times this sub-matrix needs to be copied */ + num_copy_times = 1; + while(in_shape[dim] == 1 && dim>=0){ + num_copy_times *= out_shape[dim]; + dim--; + } + + /* Find the number of times the above copy needs to be repeated */ + num_repeat = 1; + while(in_shape[dim] != 1 && dim>=0){ + num_repeat *= 1 * out_shape[dim]; + dim--; + } + + bcast_expansion_steps[k].load_num_elem = num_elem_load; + bcast_expansion_steps[k].replicate_loadedElm_times = num_copy_times; + bcast_expansion_steps[k].repeat_operation = num_repeat; + k++; + + num_elem_load = num_elem_load * num_copy_times * num_repeat; + } + + res = broadcast_node_32(bcast_expansion_steps, num_dims-1, + p_out, p_in); + (void)res; /* Unused return value */ + + return 0; +} + +WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, + WORD32 *dst, WORD32 *src) { + int step_itr=0, rep_itr=0; + int i=0, j=0, k=0; + bcast_expansion_rule *step = NULL; + + // ignore steps that are null + while(steps[step_id].repeat_operation == 0 && step_id>0){ + step_id--; + } + + // step is now the parent node for this iteration + step = &steps[step_id]; + size_t numLoadedElm = step->load_num_elem; + + WORD32 *cp_dst = dst; + WORD32 *cp_src = src; + WORD32 *cp_src_temp=NULL; + WORD32 *cp_dst_temp=NULL; + + if(numLoadedElm>32){ + if(step_id > 0){ + for(step_itr=0; step_itrrepeat_operation; step_itr++){ + src = broadcast_node_32(steps, step_id-1, dst, src); + cp_src = dst; + cp_dst = dst + numLoadedElm; + for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ + xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); + cp_dst += numLoadedElm; + } + dst = cp_dst; + } + return src; + } else { + if(numLoadedElm == 1){ + for(j=0; jrepeat_operation; j++){ +// memset((void*)cp_dst, (void*)cp_src, 4 * step->replicate_loadedElm_times); + for(i = 0; i < step->replicate_loadedElm_times; i++) + cp_dst[i] = cp_src[0]; + cp_dst += 
step->replicate_loadedElm_times; + cp_src++; + } + } else { + for(j=0; jrepeat_operation; j++){ + for(i=0; ireplicate_loadedElm_times; i++){ + xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); + cp_dst += numLoadedElm; + } + cp_src += numLoadedElm; + } + } + return cp_src; + } + } + else{ + if(step_id > 0){ + for(step_itr=0; step_itrrepeat_operation; step_itr++){ + src = broadcast_node_32(steps, step_id-1, dst, src); + cp_src = dst; + cp_dst = dst + numLoadedElm; + for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ + for(k=0; k<(int)numLoadedElm; k++){ + cp_src_temp = cp_src; + cp_dst_temp = cp_dst; + cp_dst_temp[k] = cp_src_temp[k]; + } + cp_dst += numLoadedElm; + } + dst = cp_dst; + } + return src; + } else { + if(numLoadedElm == 1){ + for(j=0; jrepeat_operation; j++){ +// memset((void*)cp_dst, *(WORD32 *)cp_src, 4 * step->replicate_loadedElm_times); + for(i = 0; i < step->replicate_loadedElm_times; i++) + cp_dst[i] = cp_src[0]; + cp_dst += step->replicate_loadedElm_times; + cp_src++; + } + } else { + for(j=0; j < step->repeat_operation; j++){ + for(i=0; i < step->replicate_loadedElm_times; i++){ + for(k=0; k<(int)(numLoadedElm); k++){ + cp_src_temp = cp_src; + cp_dst_temp = cp_dst; + cp_dst_temp[k] = cp_src_temp[k]; + + } + cp_dst += numLoadedElm; + } + cp_src += numLoadedElm; + } + } + return cp_src; + } + } +} diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c new file mode 100644 index 00000000000..3af93fc00c1 --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c @@ -0,0 +1,847 @@ +/******************************************************************************* +* Copyright (c) 2018-2024 Cadence Design Systems, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +******************************************************************************/ +#include "nnlib-hifi4/xa_nnlib/include/xa_type_def.h" +#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_common_fpu.h" +#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nn_common.h" +#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_err_chk.h" +#include "nnlib-hifi4/xa_nnlib/algo/kernels/basic/hifi4/xa_nn_basic_state.h" +#include "nnlib-hifi4/xa_nnlib/include/nnlib/xa_nnlib_kernels_api.h" + +#if !HAVE_VFPU +DISCARD_FUN_FOR_NONVOID_RETURN( + WORD32, xa_nn_elm_maximum_f32xf32_f32, + ( + FLOAT32 *p_out, + const FLOAT32 *p_inp1, + const FLOAT32 *p_inp2, + WORD32 num_elm + ) + ) +#else +WORD32 xa_nn_elm_maximum_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm) +{ + + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1); + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1); + + int i; + xtfloatx2 *inp1 = (xtfloatx2 *)p_inp1; + xtfloatx2 *inp2 = (xtfloatx2 *)p_inp2; + xtfloatx2 *out = (xtfloatx2 *)p_out; + xtfloatx2 x1, x2, y; + unsigned char con1, con2; + xtbool2 con = int32_rtor_xtbool2(0x00000003); + + if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) + { + for(i=0;i < num_elm>>1;i++) + { + XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); + y = XT_MAX_SX2(x2, x1); + XT_SSX2IP( y, out, 2*sizeof(FLOAT32)); + } + } + else + { + ae_valign inp1_a, inp2_a, out_a; + + inp1_a = XT_LASX2PP(inp1); + inp2_a = XT_LASX2PP(inp2); + out_a = AE_ZALIGN64(); + /* Each iteration of loop is independent so safe to use concurrent pragma */ +#pragma concurrent + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp1_a, inp1); + XT_LASX2IP(x2, inp2_a, inp2); + y = XT_MAX_SX2(x2, x1); + XT_SASX2IP(y, out_a, out); + } + XT_SASX2POSFP(out_a, out); + } + // Remainder Loop + if (num_elm & 1) + { + xtfloat a1, a2, a; + XT_LSIP(a1, (xtfloat *)inp1, 0); + XT_LSIP(a2, (xtfloat *)inp2, 0); + a = XT_MAX_S(a1, a2); + XT_SSI(a, (xtfloat *)out, 0); + } + return 0; +} +#endif + +#if HAVE_VFPU +static void internal_elm_maximum_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 out_lc, + WORD32 in_lc, + xtbool sign_flag) +{ + int i, j; + + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + int num_simd2_ops; + int num_scalar_ops; + + if(out_lc) + { + num_simd2_ops = in_lc >> 1; + num_scalar_ops = in_lc & 1; + } + else + { + num_simd2_ops = (in_lc >> 2) << 1; + num_scalar_ops = in_lc & 3; + } + + xtfloatx2 x1, x2, y; + xtfloat a0, b0, c0; + + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + y = XT_MAX_SX2(x2, x1); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } 
+ } + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + y = XT_MAX_SX2(x2, x1); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_MAX_S(b0, a0); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } +} + +static void internal_elm_maximum_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm, + xtbool sign_flag) +{ + int i; + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + const int num_simd2_ops = num_elm >> 1; + const int num_scalar_ops = num_elm & 1; + + xtfloat a0_7, out; + xtfloatx2 x1, x2, y; + x2 = XT_LSI((xtfloat *)p_b, 0); + + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i]))) + { + return -1; + } + } + + WORD32 inp1_strides[4], inp2_strides[4]; + inp1_strides[3] = 1; + inp2_strides[3] = 1; + for(i = 2; i >= 0; i--) + { + ae_int32x2 d_str, d_shape; + d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); + d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); + d_str = AE_MULP32X2(d_str, d_shape); + inp1_strides[i] = AE_MOVAD32_H(d_str); + inp2_strides[i] = AE_MOVAD32_L(d_str); + } + + int need_broadcast = 0; + int inp1_const = 1, inp2_const = 1; + for(i = 0; i < 4; i++) + { + if(p_inp1_shape[i] != p_inp2_shape[i]) + { + if(p_inp1_shape[i] == 1) + inp1_strides[i] = 0; + else + inp2_strides[i] = 0; + + need_broadcast = 1; + } + if(p_inp1_shape[i] != 1) + inp1_const &= 0; + if(p_inp2_shape[i] != 1) + inp2_const &= 0; + } + int itr0, itr1, itr2; + + FLOAT32 *p_out_tmp = p_out; + const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; + const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; + if(need_broadcast == 0) + { + sign_flag = 0; + internal_elm_maximum_broadcast_2D_f32xf32_f32( + p_out, + p_inp1, + p_inp2, + 1, + p_out_shape[0] * inp1_strides[0], + sign_flag); + } + else if(inp1_strides[3] == inp2_strides[3]) + { + WORD32 in_lc, out_lc; + sign_flag = 0; + in_lc = p_out_shape[2] * p_out_shape[3]; + out_lc = 1; + if(inp1_strides[2] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[2]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + else if(inp2_strides[2] == 0) + { + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + internal_elm_maximum_broadcast_2D_f32xf32_f32( + p_out_tmp, + p_inp1_tmp0, + p_inp2_tmp0, + out_lc, + in_lc, + sign_flag); + p_out_tmp += in_lc * out_lc; + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + else if(inp1_const == 1 || inp2_const == 
1) + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + sign_flag = 1; + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + } + internal_elm_maximum_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp, + p_inp2_tmp, + p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], + sign_flag); + } + else + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[3]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + tmp_strides[2] = inp1_strides[2]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + inp1_strides[2] = inp2_strides[2]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + inp2_strides[2] = tmp_strides[2]; + } + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; + const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0; + for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) + { + { + internal_elm_maximum_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp1, + p_inp2_tmp1, + p_out_shape[3], + sign_flag); + } + p_out_tmp += p_out_shape[3]; + p_inp1_tmp1 += inp1_strides[2]; + p_inp2_tmp1 += inp2_strides[2]; + } + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + return 0; +} +#endif + +#if !HAVE_VFPU +DISCARD_FUN_FOR_NONVOID_RETURN( + WORD32, xa_nn_elm_minimum_f32xf32_f32, + ( + FLOAT32 *p_out, + const FLOAT32 *p_inp1, + const FLOAT32 *p_inp2, + WORD32 num_elm + ) + ) +#else +WORD32 xa_nn_elm_minimum_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm) +{ + + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1); + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1); + + int i; + xtfloatx2 *inp1 = (xtfloatx2 *)p_inp1; + xtfloatx2 *inp2 = (xtfloatx2 *)p_inp2; + xtfloatx2 *out = (xtfloatx2 *)p_out; + xtfloatx2 x1, x2, y; + unsigned char con1, con2; + xtbool2 con = int32_rtor_xtbool2(0x00000003); + + if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) + { + for(i=0;i < num_elm>>1;i++) + { + XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); + y = XT_MIN_SX2(x2, x1); + XT_SSX2IP( y, out, 2*sizeof(FLOAT32)); + } + } + else + { + ae_valign inp1_a, inp2_a, out_a; + + inp1_a = XT_LASX2PP(inp1); + inp2_a = XT_LASX2PP(inp2); + out_a = AE_ZALIGN64(); + /* Each iteration of loop is independent so safe to use concurrent pragma */ +#pragma concurrent + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp1_a, inp1); + XT_LASX2IP(x2, inp2_a, inp2); + y = XT_MIN_SX2(x2, x1); + XT_SASX2IP(y, out_a, out); + } + XT_SASX2POSFP(out_a, out); + } + // Remainder Loop + if (num_elm & 1) + { + xtfloat a1, a2, a; + XT_LSIP(a1, (xtfloat *)inp1, 0); + XT_LSIP(a2, (xtfloat *)inp2, 0); + a = XT_MIN_S(a1, a2); + XT_SSI(a, (xtfloat *)out, 0); + } + 
return 0; +} +#endif + +#if HAVE_VFPU +static void internal_elm_minimum_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 out_lc, + WORD32 in_lc, + xtbool sign_flag) +{ + int i, j; + + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + int num_simd2_ops; + int num_scalar_ops; + + if(out_lc) + { + num_simd2_ops = in_lc >> 1; + num_scalar_ops = in_lc & 1; + } + else + { + num_simd2_ops = (in_lc >> 2) << 1; + num_scalar_ops = in_lc & 3; + } + + xtfloatx2 x1, x2, y; + xtfloat a0, b0, c0; + + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + y = XT_MIN_SX2(x2, x1); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + y = XT_MIN_SX2(x2, x1); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_MIN_S(b0, a0); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } +} + +static void internal_elm_minimum_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm, + xtbool sign_flag) +{ + int i; + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + const int num_simd2_ops = num_elm >> 1; + const int num_scalar_ops = num_elm & 1; + + xtfloat a0_7, out; + xtfloatx2 x1, x2, y; + x2 = XT_LSI((xtfloat *)p_b, 0); + + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i p_inp2_shape[i] ? 
p_inp1_shape[i] : p_inp2_shape[i]))) + { + return -1; + } + } + + WORD32 inp1_strides[4], inp2_strides[4]; + inp1_strides[3] = 1; + inp2_strides[3] = 1; + for(i = 2; i >= 0; i--) + { + ae_int32x2 d_str, d_shape; + d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); + d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); + d_str = AE_MULP32X2(d_str, d_shape); + inp1_strides[i] = AE_MOVAD32_H(d_str); + inp2_strides[i] = AE_MOVAD32_L(d_str); + } + + int need_broadcast = 0; + int inp1_const = 1, inp2_const = 1; + for(i = 0; i < 4; i++) + { + if(p_inp1_shape[i] != p_inp2_shape[i]) + { + if(p_inp1_shape[i] == 1) + inp1_strides[i] = 0; + else + inp2_strides[i] = 0; + + need_broadcast = 1; + } + if(p_inp1_shape[i] != 1) + inp1_const &= 0; + if(p_inp2_shape[i] != 1) + inp2_const &= 0; + } + int itr0, itr1, itr2; + + FLOAT32 *p_out_tmp = p_out; + const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; + const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; + if(need_broadcast == 0) + { + sign_flag = 0; + internal_elm_minimum_broadcast_2D_f32xf32_f32( + p_out, + p_inp1, + p_inp2, + 1, + p_out_shape[0] * inp1_strides[0], + sign_flag); + } + else if(inp1_strides[3] == inp2_strides[3]) + { + WORD32 in_lc, out_lc; + sign_flag = 0; + in_lc = p_out_shape[2] * p_out_shape[3]; + out_lc = 1; + if(inp1_strides[2] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[2]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + else if(inp2_strides[2] == 0) + { + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + internal_elm_minimum_broadcast_2D_f32xf32_f32( + p_out_tmp, + p_inp1_tmp0, + p_inp2_tmp0, + out_lc, + in_lc, + sign_flag); + p_out_tmp += in_lc * out_lc; + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + else if(inp1_const == 1 || inp2_const == 1) + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + sign_flag = 1; + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + } + internal_elm_minimum_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp, + p_inp2_tmp, + p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], + sign_flag); + } + else + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[3]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + tmp_strides[2] = inp1_strides[2]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + inp1_strides[2] = inp2_strides[2]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + inp2_strides[2] = tmp_strides[2]; + } + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; + const FLOAT32 *__restrict__ 
p_inp2_tmp1 = p_inp2_tmp0; + for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) + { + { + internal_elm_minimum_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp1, + p_inp2_tmp1, + p_out_shape[3], + sign_flag); + } + p_out_tmp += p_out_shape[3]; + p_inp1_tmp1 += inp1_strides[2]; + p_inp2_tmp1 += inp2_strides[2]; + } + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + return 0; +} +#endif \ No newline at end of file diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c new file mode 100644 index 00000000000..4dcec52f973 --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c @@ -0,0 +1,1151 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. */ +/* These coded instructions, statements, and computer programs ("Cadence */ +/* Libraries") are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. */ +/* ------------------------------------------------------------------------ */ +/* IntegrIT, Ltd. www.integrIT.com, info@integrIT.com */ +/* */ +/* DSP Library */ +/* */ +/* This library contains copyrighted materials, trade secrets and other */ +/* proprietary information of IntegrIT, Ltd. This software is licensed for */ +/* use with Cadence processor cores only and must not be used for any other */ +/* processors and platforms. The license to use these sources was given to */ +/* Cadence, Inc. under Terms and Condition of a Software License Agreement */ +/* between Cadence, Inc. and IntegrIT, Ltd. */ +/* ------------------------------------------------------------------------ */ +/* Copyright (C) 2015-2018 IntegrIT, Limited. */ +/* All Rights Reserved. */ +/* ------------------------------------------------------------------------ */ +/* + NatureDSP Signal Processing Library. Vector mathematics + Vector operations + code optimized for HiFi4 core + IntegrIT, 2006-2018 +*/ + +#include "../include/NatureDSP_Signal_math.h" +#include "NatureDSP_types.h" +#include "xa_nn_common.h" + +/* Common helper macros. */ +#include "xa_nnlib_common_fpu.h" + +#include "xa_nnlib_common.h" +/* Constant tables. */ + +const union ufloat32uint32 ALIGN(8) xa_nnlib_pow2f_coef[] = +{ + { 0x39222a65 }, + { 0x3aaf931c }, + { 0x3c1d94fc }, + { 0x3d63578a }, + { 0x3e75fdf0 }, + { 0x3f317218 }, + { 0x3f800000 } + + //{ 0x3aaf931b }, + //{ 0x3c1e7220 }, + //{ 0x3d63578a }, + //{ 0x3e75fcc9 }, + //{ 0x3f317218 }, + //{ 0x3f800000 } + +}; + +const union ufloat32uint32 ALIGN(8) xa_nnlib_log2f_coef[] = +{ + { 0x3d726a49 }, + { 0x3dd91c88 }, + { 0x3ddde76c }, + { 0x3de21e63 }, + { 0x3dfe600b }, + { 0x3e124679 }, + { 0x3e2ab2f1 }, + { 0x3e4ccd1b }, + { 0x3e7fffde }, + { 0x3eaaaaaa }, + { 0x3f000000 }, + { 0x3f800000 }, + /* log2(e) */ + { 0x3fb8aa3b }, /* 1.4426950216 */ + { 0x32a57060 } /* 1.9259629891e-008 */ +}; + +const union ufloat32uint32 xa_nnlib_pow_plusInff ={0x7f800000}; + +const union ufloat32uint32 xa_nnlib_pow_qNaNf = { 0x7fc00000 }; + +#define MIN(a,b) ( (a)<(b) ? 
(a) : (b) ) +#define MAX(a,b) ( (a)>(b) ? (a) : (b) ) + +/*------------------------------------------------------------------------- + Power function + These routines calculate power function for 32-bit fixed-point numbers or + floating point numbers. + For the fixed point API, The base is represented in Q31, the exponent + is represented in Q6.25. Results are represented as normalized fixed point + number with separate mantissa in Q31 and exponent. + + Precision: + 32x32 32-bit inputs, 32-bit outputs + f floating point input, floating point output + + Accuracy: + 2 ULP for fixed point API + 2 ULP under condition that |y|<=100 + + Notes: +1. Scalar floating point raise to a power functions conform to ANSI C requirements on + standard math library functions in respect to treatment of errno and floating- + point exceptions. Vectorized function does not touch errno and may raise or not raise + floating point exceptions. +2. For floating point API, If x<0 is finite, y is finite and not an integer value, + then the respective result z is set to NaN +3. For fixed point API, function returns zero for all non-positive x. Fixed point + functions never touch errno + + Special cases: + x | y | Result | Extra Conditions + --------+--------+--------+--------------------- + floating point API + --------+--------+--------+--------------------- + +/-0 | y | +/-inf | odd y<0 + +/-0 | y | +inf | even y<0 + +/-0 | y | +/-0 | odd y>0 + +/-0 | y | 0 | even y>0 + +/-1 | +/-inf | 1 | + 1 | y | 1 | any y including NaN + x | +/-0 | 1 | any x including NaN + x | y | NaN | finite x<0 and finite + | | | non-integer y (see + | | | note 2) + x | -inf | +inf | |x|<1 + x | -inf | 0 | |x|>1 + x | +inf | 0 | |x|<1 + x | +inf | +inf | |x|>1 + -inf | y | -0 | y an odd integer <0 + -inf | y | 0 | y<0 and not an odd + | | | integer + -inf | y | -inf | y an odd integer >0 + -inf | y | +inf | y>0 and not an odd + | | | integer + +inf | y | 0 | y<0 + +inf | y | +inf | y>0 + --------+--------+--------+--------------------- + fixed point API + --------+--------+--------+--------------------- + x | y | 0 | x<=0 + --------+--------+--------+--------------------- + + Input: + x[N] input data,Q0.31 or floating point + y[N] input data,Q6.25 or floating point + N length of vectors + Output (fixed point API): + m[N] mantissa of output, Q31 + e[N] exponent of output + Output (floating point API): + z[N] results: floating point + + Restriction: + z,x,y,m should not overlap +-------------------------------------------------------------------------*/ + +#if !HAVE_VFPU && !HAVE_FPU +DISCARD_FUN(void, xa_nn_elm_pow_f32, (FLOAT32 * restrict z, const FLOAT32 * restrict y, const FLOAT32 * restrict x, WORD32 N)) +#elif HAVE_VFPU +#define sz_f32 (int)sizeof(FLOAT32) +static void mypowf(FLOAT32 * scr, + FLOAT32 * restrict z, + const FLOAT32 * restrict x, + const FLOAT32 * restrict y, + WORD32 N ) +{ + /* Table of different constants used in computations */ + static const int32_t c_tbl[] = + { + -126, + -150, + (int32_t)0x007FFFFF,/* max denormalized floating-point number / mantissa mask */ + (int32_t)0x4B800000,/* 2^24 */ + (int32_t)0x3F3504F3,/* sqrt(0.5) */ + (int32_t)0x3F000000,/* 0.5 */ + (int32_t)0xBF000000,/* -0.5 */ + -252, + 254 + }; + int n; + const xtfloatx2 * pX; + const xtfloatx2 * pY; + + const xtfloatx2 * restrict S_rd; + xtfloatx2 * restrict S_wr; + xtfloatx2 * restrict pZ; + const ae_int32 * restrict TBL; + const xtfloat * restrict TBL_LOG2; + const xtfloat * restrict TBL_POW2; + xtfloatx2 x0, y0, z0, t0, t1, ef0; + xtfloatx2 c2f, 
c3f, c4f; + xtfloatx2 _0, _1, half; + ae_int32x2 c0i, c1i, c5i, c7i, c8i; + ae_int32x2 e0, xi0, yi0, ex0; + xtbool2 bsx, bsy, bdenorm, bsmall; + ae_valign aX, aY, aZ; + + /* overall number of blocks; number of values in the current block */ + int blkLen; + /* Block size, blkLen <= blkSize */ + const int blkSize = MAX_ALLOCA_SZ / (3*sz_f32); + + + if (N <= 0) return; + + NASSERT(N % 2 == 0); + NASSERT_ALIGN16(scr); + + /* + * Data are processed in blocks of scratch area size. Further, the algorithm + * implementation is splitted in order to feed the optimizing compiler with a + * few loops of managable size. + */ + + + blkLen = 0; + TBL = (const ae_int32 *)c_tbl; + for (; N>0; N -= blkLen, x += blkSize, y += blkSize, z += blkSize) + { + blkLen = XT_MIN(N, blkSize); + _0 = 0.0f; + _1 = (1.0f); + half = (0.5f); + { + pX = (const xtfloatx2*)x; + S_wr = (xtfloatx2*)scr; + aX = AE_LA64_PP(pX); + for (n = 0; n<(blkLen >> 1); n++) + { + XT_LASX2IP(x0, aX, pX); + + x0 = XT_ABS_SX2(x0); + c0i = AE_L32_I(TBL, 0 * 4); /*-126*/ + c1i = AE_L32_I(TBL, 1 * 4); /*-150*/ + c2f = XT_LSI((xtfloat*)TBL, 2 * 4); + c3f = XT_LSI((xtfloat*)TBL, 3 * 4); + /* process denormalized values */ + bdenorm = XT_OLE_SX2(x0, c2f); + t0 = XT_MUL_SX2(x0, c3f); + XT_MOVT_SX2(x0, t0, bdenorm); + e0 = c0i; + AE_MOVT32X2(e0, c1i, bdenorm); + /* extract exponent */ + xi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(x0); + ex0 = AE_SRLI32(xi0, 23); + e0 = AE_ADD32(e0, ex0); + /* extract mantissa */ + ex0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(c2f);/* load mantissa mask */ //!!!!!!!!!!!!! + c5i = AE_L32_I(TBL, 5 * 4);/* 0.5 */ + xi0 = AE_AND32(xi0, ex0); + xi0 = AE_OR32(xi0, c5i); + x0 = XT_AE_MOVXTFLOATX2_FROMINT32X2(xi0); + /* adjust the mantissa to range [ sqrt(0.5) ; sqrt(2.0) ) */ + c4f = XT_LSI((xtfloat*)TBL, 4 * 4); + bsmall = XT_OLT_SX2(x0, c4f); + t0 = XT_ADD_SX2(x0, x0); + ex0 = AE_SUB32(e0, 1); + XT_MOVT_SX2(x0, t0, bsmall); + AE_MOVT32X2(e0, ex0, bsmall); + x0 = XT_SUB_SX2(_1, x0); //!!! + ef0 = XT_FLOAT_SX2(e0, 0); //!!! 
+ XT_SSX2IP(x0, S_wr, 2 * sz_f32); + XT_SSX2IP(ef0, S_wr, 2*2 * sz_f32); + } + } + __Pragma("no_reorder"); + /* */ + { + xtfloatx2 p0, p1, p2, p3, p4, p5, p6, p7, p8, p9; + xtfloatx2 p10, p11, p12, p13; + xtfloatx2 t2, w0, w1; + S_wr = ( xtfloatx2*)scr+2; + S_rd = (const xtfloatx2*)scr; + TBL_LOG2 = (const xtfloat *)xa_nnlib_log2f_coef; + for (n = 0; n<(blkLen >> 1); n++) + { + XT_LSX2IP(x0, S_rd, 3*2 * sz_f32); + //XT_LSX2IP(ef0, S_rd, 2 * sz_f32); + + /* evaluate polynomial approximation */ + /* Load table of coefficients */ + + p0 = XT_LSI(TBL_LOG2, 0 * 4); + p1 = XT_LSI(TBL_LOG2, 1 * 4); + p2 = XT_LSI(TBL_LOG2, 2 * 4); + p3 = XT_LSI(TBL_LOG2, 3 * 4); + p4 = XT_LSI(TBL_LOG2, 4 * 4); + p5 = XT_LSI(TBL_LOG2, 5 * 4); + p6 = XT_LSI(TBL_LOG2, 6 * 4); + p7 = XT_LSI(TBL_LOG2, 7 * 4); + p8 = XT_LSX(TBL_LOG2, 8 * 4); + p9 = XT_LSX(TBL_LOG2, 9 * 4); + + XT_MADD_SX2(p1, x0, p0); + XT_MADD_SX2(p2, x0, p1); + XT_MADD_SX2(p3, x0, p2); + XT_MADD_SX2(p4, x0, p3); + XT_MADD_SX2(p5, x0, p4); + XT_MADD_SX2(p6, x0, p5); + XT_MADD_SX2(p7, x0, p6); + XT_MADD_SX2(p8, x0, p7); + XT_MADD_SX2(p9, x0, p8); + t2 = p9; + XT_SSX2IP(t2, S_wr, 3*2 * sz_f32); + } + S_wr = (xtfloatx2*)scr; + S_rd = (const xtfloatx2*)scr; + for (n = 0; n<(blkLen >> 1); n++) + { + p10 = XT_LSX(TBL_LOG2, 10 * 4); + p11 = XT_LSX(TBL_LOG2, 11 * 4); + p12 = XT_LSX(TBL_LOG2, 12 * 4); + p13 = XT_LSX(TBL_LOG2, 13 * 4); + + XT_LSX2IP(x0, S_rd, 2 * sz_f32); + XT_LSX2IP(ef0, S_rd, 2 * sz_f32); + XT_LSX2IP(t2, S_rd, 2 * sz_f32); + /* next coefficients are computed in extended precision */ + t0 = XT_MUL_SX2(x0, t2); t1 = t0; + XT_MSUB_SX2(t1, x0, t2); + w0 = XT_ADD_SX2(t0, p10); + w1 = XT_SUB_SX2(w0, p10); + w1 = XT_SUB_SX2(t0, w1); + w1 = XT_SUB_SX2(w1, t1); + t0 = w0; t1 = w1; + w0 = XT_MUL_SX2(x0, t0); w1 = w0; + XT_MSUB_SX2(w1, x0, t0); t0 = w0; + XT_MSUB_SX2(w1, x0, t1); t1 = w1; + w0 = XT_ADD_SX2(t0, p11); + w1 = XT_SUB_SX2(w0, p11); + w1 = XT_SUB_SX2(t0, w1); + w1 = XT_SUB_SX2(w1, t1); + t0 = w0; t1 = w1; + x0 = XT_NEG_SX2(x0); + w0 = XT_MUL_SX2(x0, t0); w1 = w0; + XT_MSUB_SX2(w1, x0, t0); t0 = w0; + XT_MSUB_SX2(w1, x0, t1); t1 = w1; + /* multiply by log2(e) */ + w0 = XT_MUL_SX2(t0, p12); w1 = w0; + XT_MSUB_SX2(w1, t0, p12); + XT_MADD_SX2(w1, t1, p12); + XT_MSUB_SX2(w1, t0, p13); + t0 = w0; t1 = w1; + /* add exponent */ + w0 = XT_ADD_SX2(t0, ef0); + w1 = XT_SUB_SX2(w0, ef0); + w1 = XT_SUB_SX2(t0, w1); + t1 = XT_SUB_SX2(w1, t1);//!!!! + t0 = w0; // !!!!! + XT_SSX2IP(t0, S_wr, 2 * sz_f32); + XT_SSX2IP(t1, S_wr, 2*2 * sz_f32); + } + } + __Pragma("no_reorder"); + /* */ + { + xtfloatx2 xy, dxy, c0, c1; + xtfloatx2 p0, p1, p2, p3, p4, p5, p6; + S_wr = ( xtfloatx2*)scr+2; + S_rd = (const xtfloatx2*)scr; + TBL_POW2 = (const xtfloat *)xa_nnlib_pow2f_coef; + pY = (const xtfloatx2*)y; + aY = AE_LA64_PP(pY); + for (n = 0; n<(blkLen >> 1); n++) + { + XT_LSX2IP(t0, S_rd, 2 * sz_f32); + XT_LSX2IP(t1, S_rd, 2*2 * sz_f32); + + XT_LASX2IP(y0, aY, pY); + /* compute y*log2(x) and separate result into integer and fractional parts */ + xy = XT_FIROUND_SX2(XT_MUL_SX2(y0, t0)); + dxy = XT_NEG_SX2(xy); + XT_MADD_SX2(dxy, y0, t0); + XT_MADD_SX2(dxy, y0, t1); + dxy = XT_MIN_SX2(dxy, (xtfloatx2)1.0f); + dxy = XT_MAX_SX2(dxy, (xtfloatx2)-1.0f); + /* compute 2^fract */ + p0 = XT_LSI(TBL_POW2, 0 * 4); + p1 = XT_LSI(TBL_POW2, 1 * 4); + p2 = XT_LSI(TBL_POW2, 2 * 4); + p3 = XT_LSI(TBL_POW2, 3 * 4); + p4 = XT_LSI(TBL_POW2, 4 * 4); + + /* NOTE: do not change the order of computations and way of polynomial decomposition ! 
*/ + XT_MADD_SX2(p1, dxy, p0); + XT_MADD_SX2(p2, dxy, p1); + XT_MADD_SX2(p3, dxy, p2); + XT_MADD_SX2(p4, dxy, p3); + XT_SSX2IP(p4, S_wr, 3*2 * sz_f32); + } + __Pragma("no_reorder"); + S_wr = (xtfloatx2*)scr; + S_rd = (const xtfloatx2*)scr; + TBL_POW2 = (const xtfloat *)xa_nnlib_pow2f_coef; + pY = (const xtfloatx2*)y; + aY = AE_LA64_PP(pY); + for (n = 0; n<(blkLen >> 1); n++) + { + + XT_LSX2IP(t0, S_rd, 2 * sz_f32); + XT_LSX2IP(t1, S_rd, 2 * sz_f32); + XT_LSX2IP(p4, S_rd, 2 * sz_f32); + p5 = XT_LSI(TBL_POW2, 5 * 4); + p6 = XT_LSI(TBL_POW2, 6 * 4); + XT_LASX2IP(y0, aY, pY); + /* compute y*log2(x) and separate result into integer and fractional parts */ + xy = XT_FIROUND_SX2(XT_MUL_SX2(y0, t0)); + dxy = XT_NEG_SX2(xy); + XT_MADD_SX2(dxy, y0, t0); + XT_MADD_SX2(dxy, y0, t1); + dxy = XT_MIN_SX2(dxy, (xtfloatx2)1.0f); + dxy = XT_MAX_SX2(dxy, (xtfloatx2)-1.0f); + XT_MADD_SX2(p5, dxy, p4); + XT_MADD_SX2(p6, dxy, p5); + z0 = p6; + /* apply integer part */ + e0 = XT_TRUNC_SX2(xy, 0); + c7i = AE_L32_I(TBL, 7 * 4);/* -252 */ + c8i = AE_L32_X(TBL, 8 * 4);/* 254 */ + e0 = AE_MAX32(e0, c7i); + e0 = AE_MIN32(e0, c8i); + e0 = AE_ADD32(e0, c8i); + ex0 = AE_SRAI32(e0, 1); + e0 = AE_SUB32(e0, ex0); + ex0 = AE_SLLI32(ex0, 23); + e0 = AE_SLLI32(e0, 23); + c0 = XT_AE_MOVXTFLOATX2_FROMINT32X2(e0); + c1 = XT_AE_MOVXTFLOATX2_FROMINT32X2(ex0); + z0 = XT_MUL_SX2(z0, c1); + z0 = XT_MUL_SX2(z0, c0); //!!!!!!!!!!!! + XT_SSX2IP(z0, S_wr, 2 * sz_f32); + } + } + __Pragma("no_reorder"); + /* */ + { + xtbool2 b_yint, b_e0, b0, b_notspec; + xtbool2 b_yeqz, b_yinf, b_xeqz, b_xeq1, b_xinf; + xtbool2 b_NaN1, b_NaN2, b_one, b_Inf, b_zero; + uint32_t b0i, b1i; + uint32_t yeqz, yinf, xeqz, xeq1, xinf, sx, sy, yint; + uint32_t one, NaN1, Inf, zero; + xtfloatx2 xabs, spec; + ae_int32x2 sgn, zi0; + + S_rd = (const xtfloatx2*)scr; + pY = (const xtfloatx2*)y; + pX = (const xtfloatx2*)x; + pZ = ( xtfloatx2*)z; + aY = AE_LA64_PP(pY); + aX = AE_LA64_PP(pX); + aZ = AE_ZALIGN64(); + for (n = 0; n<(blkLen >> 1); n++) + { + XT_LSX2IP(z0, S_rd, 2 * sz_f32); + XT_LASX2IP(x0, aX, pX); + XT_LASX2IP(y0, aY, pY); + /* Take sign of x and y */ + xi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(x0); + yi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(y0); + bsx = XT_OLT_SX2(xi0, (xtfloatx2)0.0f); + bsy = XT_OLT_SX2(yi0, (xtfloatx2)0.0f); + + xabs = XT_ABS_SX2(x0); + /* check if y is integer */ + t0 = XT_FITRUNC_SX2(y0); + b_yint = XT_OEQ_SX2(t0, y0); + + /* check if y is odd */ + e0 = XT_TRUNC_SX2(y0, 0); //temp0 + b_e0 = AE_EQ32(e0, MAX_INT32);//~b_tmp0 + b0i = AE_MOVAB2(b_e0); + b1i = AE_MOVAB2(b_yint); + b0i = b1i&(~b0i); + b0 = AE_MOVBA2(b0i); + AE_MOVF32X2(e0, AE_ZERO32(), b0); + e0 = AE_SLLI32(e0, 31); + sgn = AE_AND32(e0, xi0); + /* process special numbers */ + b_yeqz = XT_OEQ_SX2((xtfloatx2)0.0f, y0); /* y ==0 */ + b_yinf = XT_OEQ_SX2(XT_ABS_SX2(y0), xa_nnlib_pow_plusInff.f); /* |y|==Inf */ + b_xeqz = XT_OEQ_SX2(x0, (xtfloatx2)0.0f); /* x ==0 */ + b_xeq1 = XT_OEQ_SX2(xabs, (xtfloatx2)1.0f); /* |x|==1 */ + b_xinf = XT_OEQ_SX2(xabs, xa_nnlib_pow_plusInff.f); /* |x|==INF */ + + yint = AE_MOVAB2(b_yint); + yeqz = AE_MOVAB2(b_yeqz); + yinf = AE_MOVAB2(b_yinf); + xeqz = AE_MOVAB2(b_xeqz); + xeq1 = AE_MOVAB2(b_xeq1); + xinf = AE_MOVAB2(b_xinf); + sx = AE_MOVAB2(bsx); + sy = AE_MOVAB2(bsy); + one = xeq1 & (yinf | (~sx)); /* |x|==1 && ( |y|==Inf || x>0 ) */ + one = one | yeqz; /* ( |x|==1 && ( |y|==Inf || x>0 ) ) || y==0 --> z=1.0 */ + NaN1 = sx&(~yint); /* x<0 && y is not an integer --> z=NaN */ + Inf = xinf&(~sy); /* x==INF && y>0 --> z=INF */ + Inf = Inf | (xeqz & sy); /* 
x==0 && y<0 --> z=INF */ + zero = xeqz &(~sy); /* x==0 && y>0 --> z=0.0 */ + zero = zero | (xinf & sy); /* x==INF && y<0 --> z=0.0 */ + + b_NaN1 = AE_MOVBA2(NaN1); + b_NaN2 = XT_UN_SX2(x0, y0); /* isnan(x) || isnan(y) --> z=NaN */ + b_one = AE_MOVBA2(one); + b_Inf = AE_MOVBA2(Inf); + b_zero = AE_MOVBA2(zero); + + /* Save special numbers and mask for special numbers */ + spec = (xtfloatx2)xa_nnlib_pow_qNaNf.f; + XT_MOVF_SX2(spec, half, b_NaN1); + XT_MOVT_SX2(spec, _0, b_zero); + XT_MOVT_SX2(spec, xa_nnlib_pow_plusInff.f, b_Inf); + XT_MOVT_SX2(spec, xa_nnlib_pow_qNaNf.f, b_NaN2); + XT_MOVT_SX2(spec, _1, b_one); + + b_notspec = XT_OEQ_SX2(spec, half); + /* Replace result with special numbers if needed */ + XT_MOVF_SX2(z0, spec, b_notspec); + /* Restore sign and store result */ + zi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(z0); + zi0 = AE_XOR32(zi0, sgn); + z0 = XT_AE_MOVXTFLOATX2_FROMINT32X2(zi0); + XT_SASX2IP(z0, aZ, pZ); + } + } + XT_SASX2POSFP(aZ, pZ); + } +} /* mypowf() */ +void xa_nn_elm_pow_f32( FLOAT32 * restrict z, + const FLOAT32 * restrict x, + const FLOAT32 * restrict y, + int N ) +{ + const int blkSize = MAX_ALLOCA_SZ/sz_f32; + /* Allocate a fixed-size scratch area on the stack. */ + FLOAT32 ALIGN(16) scr[blkSize]; + int M; + if ( N<=0 ) return; + M=N&~1; + if ( M ) + { + mypowf(scr,z,x,y,M); + y += M; + x += M; + z += M; + N&=1; + } + if (N) + { // processing the tail + static const int32_t c_tbl[] = + { + -126, + -150, + (int32_t)0x007FFFFF,/* max denormalized floating-point number / mantissa mask */ + (int32_t)0x4B800000,/* 2^24 */ + (int32_t)0x3F3504F3,/* sqrt(0.5) */ + (int32_t)0x3F000000,/* 0.5 */ + (int32_t)0xBF000000,/* -0.5 */ + -252, + 254 + }; + xtfloat x0, y0, t0, ef0, t1, t2; + xtfloat xy, dxy, z0, c0, c1; + xtfloat p0, p1, p2, p3, p4, p5, p6, p7, p8, p9; + xtfloat p10, p11, p12, p13, w0, w1; + xtbool bdenorm, bsmall; + ae_int32 e0, xi0, ex0; + x0=XT_LSI((const xtfloat*)x,0); + + x0 = XT_ABS_S(x0); + + /* process denormalized values */ + bdenorm = xtbool2_extract_0(XT_OLE_S(x0, XT_LSI((xtfloat*)c_tbl, 2 * 4))); + t0 = XT_MUL_S(x0, XT_LSI((xtfloat*)c_tbl, 3 * 4)); + XT_MOVT_S(x0, t0, (bdenorm)); + e0 = AE_L32_I((ae_int32 *)c_tbl, 0 * 4);; + AE_MOVT_32(e0, AE_L32_I((ae_int32 *)c_tbl, 1 * 4), (bdenorm)); + /* extract exponent */ + xi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(x0); + ex0 = AE_SRLI32(xi0, 23); + e0 = AE_ADD32(e0, ex0); + /* extract mantissa */ + ex0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(XT_LSI((xtfloat*)c_tbl, 2 * 4));/* load mantissa mask */ //!!!!!!!!!!!!! + xi0 = AE_AND32(xi0, ex0); + xi0 = AE_OR32(xi0, AE_L32_I((ae_int32 *)c_tbl, 5 * 4)); + x0 = XT_AE_MOVXTFLOATX2_FROMINT32X2(xi0); + /* adjust the mantissa to range [ sqrt(0.5) ; sqrt(2.0) ) */ + + bsmall = xtbool2_extract_0(XT_OLT_S(x0, XT_LSI((xtfloat*)c_tbl, 4 * 4))); + + + t0 = XT_ADD_S(x0, x0); + ex0 = AE_SUB32(e0, 1); + XT_MOVT_S(x0, t0, bsmall); + AE_MOVT_32(e0, ex0, bsmall); + x0 = XT_SUB_S(1.0f, x0); //!!! + ef0 = XT_FLOAT_S(e0, 0); //!!! 
+ + /* evaluate polynomial approximation */ + /* Load table of coefficients */ + + p0 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 0 * 4); + p1 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 1 * 4); + p2 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 2 * 4); + p3 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 3 * 4); + p4 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 4 * 4); + p5 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 5 * 4); + p6 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 6 * 4); + p7 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 7 * 4); + p8 = XT_LSX((const xtfloat *)xa_nnlib_log2f_coef, 8 * 4); + p9 = XT_LSX((const xtfloat *)xa_nnlib_log2f_coef, 9 * 4); + + + XT_MADD_S(p1, x0, p0); + XT_MADD_S(p2, x0, p1); + XT_MADD_S(p3, x0, p2); + XT_MADD_S(p4, x0, p3); + XT_MADD_S(p5, x0, p4); + XT_MADD_S(p6, x0, p5); + XT_MADD_S(p7, x0, p6); + XT_MADD_S(p8, x0, p7); + XT_MADD_S(p9, x0, p8); + t2 = p9; + + + p10 = XT_LSX((const xtfloat *)xa_nnlib_log2f_coef, 10 * 4); + p11 = XT_LSX((const xtfloat *)xa_nnlib_log2f_coef, 11 * 4); + p12 = XT_LSX((const xtfloat *)xa_nnlib_log2f_coef, 12 * 4); + p13 = XT_LSX((const xtfloat *)xa_nnlib_log2f_coef, 13 * 4); + + /* next coefficients are computed in extended precision */ + t0 = XT_MUL_S(x0, t2); t1 = t0; + XT_MSUB_S(t1, x0, t2); + w0 = XT_ADD_S(t0, p10); + w1 = XT_SUB_S(w0, p10); + w1 = XT_SUB_S(t0, w1); + w1 = XT_SUB_S(w1, t1); + t0 = w0; t1 = w1; + w0 = XT_MUL_S(x0, t0); w1 = w0; + XT_MSUB_S(w1, x0, t0); t0 = w0; + XT_MSUB_S(w1, x0, t1); t1 = w1; + w0 = XT_ADD_S(t0, p11); + w1 = XT_SUB_S(w0, p11); + w1 = XT_SUB_S(t0, w1); + w1 = XT_SUB_S(w1, t1); + t0 = w0; t1 = w1; + x0 = XT_NEG_S(x0); + w0 = XT_MUL_S(x0, t0); w1 = w0; + XT_MSUB_S(w1, x0, t0); t0 = w0; + XT_MSUB_S(w1, x0, t1); t1 = w1; + /* multiply by log2(e) */ + w0 = XT_MUL_S(t0, p12); w1 = w0; + XT_MSUB_S(w1, t0, p12); + XT_MADD_S(w1, t1, p12); + XT_MSUB_S(w1, t0, p13); + t0 = w0; t1 = w1; + /* add exponent */ + w0 = XT_ADD_S(t0, ef0); + w1 = XT_SUB_S(w0, ef0); + w1 = XT_SUB_S(t0, w1); + t1 = XT_SUB_S(w1, t1);//!!!! + t0 = w0; // !!!!! + + /* compute y*log2(x) and separate result into integer and fractional parts */ + y0 = XT_LSI((const xtfloat*)y, 0); + xy = XT_FIROUND_S(XT_MUL_S(y0, t0)); + dxy = XT_NEG_S(xy); + XT_MADD_S(dxy, y0, t0); + XT_MADD_S(dxy, y0, t1); + dxy = XT_MIN_S(dxy, (xtfloatx2)1.0f); + dxy = XT_MAX_S(dxy, (xtfloatx2)-1.0f); + /* compute 2^fract */ + p0 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 0 * 4); + p1 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 1 * 4); + p2 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 2 * 4); + p3 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 3 * 4); + p4 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 4 * 4); + p5 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 5 * 4); + p6 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 6 * 4); + /* NOTE: do not change the order of computations and way of polynomial decomposition ! 
*/ + XT_MADD_S(p1, dxy, p0); + XT_MADD_S(p2, dxy, p1); + XT_MADD_S(p3, dxy, p2); + XT_MADD_S(p4, dxy, p3); + XT_MADD_S(p5, dxy, p4); + XT_MADD_S(p6, dxy, p5); + z0 = p6; + /* apply integer part */ + e0 = XT_TRUNC_SX2(xy, 0); + e0 = AE_MAX32(e0, AE_L32_I((ae_int32 *)c_tbl, 7 * 4)); + e0 = AE_MIN32(e0, AE_L32_X((ae_int32 *)c_tbl, 8 * 4)); + e0 = AE_ADD32(e0, AE_L32_X((ae_int32 *)c_tbl, 8 * 4)); + ex0 = AE_SRAI32(e0, 1); + e0 = AE_SUB32(e0, ex0); + ex0 = AE_SLLI32(ex0, 23); + e0 = AE_SLLI32(e0, 23); + c0 = XT_AE_MOVXTFLOATX2_FROMINT32X2(e0); + c1 = XT_AE_MOVXTFLOATX2_FROMINT32X2(ex0); + z0 = XT_MUL_S(z0, c1); + z0 = XT_MUL_S(z0, c0); //!!!!!!!!!!!! + + + /* Take sign of x and y */ + { + xtbool2 bsx, bsy, b_yint, b_e0, b0, b_notspec; + + xtbool2 b_yeqz, b_yinf, b_xeqz, b_xeq1, b_xinf; + xtbool2 b_NaN1, b_NaN2, b_one, b_Inf, b_zero; + uint32_t b0i, b1i; + uint32_t yeqz, yinf, xeqz, xeq1, xinf, sx, sy, yint; + uint32_t one, NaN1, Inf, zero; + xtfloat xabs, spec; + ae_int32 sgn, zi0; + + x0 = XT_LSI((const xtfloat*)x, 0); + y0 = XT_LSI((const xtfloat*)y, 0); + xi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(x0); + bsx = (XT_OLT_S(x0, (xtfloat)0.0f)); + bsy = (XT_OLT_S(y0, (xtfloat)0.0f)); + + xabs = XT_ABS_S(x0); + /* check if y is integer */ + t0 = XT_FITRUNC_S(y0); + b_yint = (XT_OEQ_S(t0, y0)); + + /* check if y is odd */ + e0 = XT_TRUNC_S(y0, 0); //temp0 + b_e0 = (AE_EQ32(e0, MAX_INT32));//~b_tmp0 + b0i = AE_MOVAB2(b_e0); + b1i = AE_MOVAB2(b_yint); + b0i = b1i&(~b0i); + b0 = AE_MOVBA2(b0i); + AE_MOVF_32(e0, AE_ZERO32(), xtbool2_extract_0(b0)); + e0 = AE_SLLI32(e0, 31); + sgn = AE_AND32(e0, xi0); + /* process special numbers */ + b_yeqz = (XT_OEQ_S((xtfloatx2)0.0f, y0)); /* y ==0 */ + b_yinf = (XT_OEQ_S(XT_ABS_SX2(y0), xa_nnlib_pow_plusInff.f)); /* |y|==Inf */ + b_xeqz = (XT_OEQ_S(x0, (xtfloatx2)0.0f)); /* x ==0 */ + b_xeq1 = (XT_OEQ_S(xabs, (xtfloatx2)1.0f)); /* |x|==1 */ + b_xinf = (XT_OEQ_S(xabs, xa_nnlib_pow_plusInff.f)); /* |x|==INF */ + + yint = AE_MOVAB2 (b_yint); + yeqz = AE_MOVAB2 (b_yeqz); + yinf = AE_MOVAB2 (b_yinf); + xeqz = AE_MOVAB2 (b_xeqz); + xeq1 = AE_MOVAB2 (b_xeq1); + xinf = AE_MOVAB2 (b_xinf); + sx = AE_MOVAB2 (bsx); + sy = AE_MOVAB2 (bsy); + + one = xeq1 & (yinf | (~sx)); /* |x|==1 && ( |y|==Inf || x>0 ) */ + one = one | yeqz; /* ( |x|==1 && ( |y|==Inf || x>0 ) ) || y==0 --> z=1.0 */ + NaN1 = sx&(~yint); /* x<0 && y is not an integer --> z=NaN */ + Inf = xinf&(~sy); /* x==INF && y>0 --> z=INF */ + Inf = Inf | (xeqz & sy); /* x==0 && y<0 --> z=INF */ + zero = xeqz &(~sy); /* x==0 && y>0 --> z=0.0 */ + zero = zero | (xinf & sy); /* x==INF && y<0 --> z=0.0 */ + + b_NaN1 = AE_MOVBA2(NaN1); + b_NaN2 = XT_UN_SX2(x0, y0); /* isnan(x) || isnan(y) --> z=NaN */ + b_one = AE_MOVBA2(one); + b_Inf = AE_MOVBA2(Inf); + b_zero = AE_MOVBA2(zero); + + /* Save special numbers and mask for special numbers */ + spec = (xtfloat)xa_nnlib_pow_qNaNf.f; + XT_MOVF_S(spec, 0.5f, xtbool2_extract_0(b_NaN1)); + XT_MOVT_S(spec, 0.0f, xtbool2_extract_0(b_zero)); + XT_MOVT_S(spec, xa_nnlib_pow_plusInff.f, xtbool2_extract_0(b_Inf)); + XT_MOVT_S(spec, xa_nnlib_pow_qNaNf.f, xtbool2_extract_0(b_NaN2)); + XT_MOVT_S(spec, 1.0f, xtbool2_extract_0(b_one)); + + b_notspec = XT_OEQ_S(spec, 0.5f); + /* Replace result with special numbers if needed */ + XT_MOVF_S(z0, spec, xtbool2_extract_0(b_notspec)); + /* Restore sign and store result */ + zi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(z0); + zi0 = AE_XOR32(zi0, sgn); + z0 = XT_AE_MOVXTFLOATX2_FROMINT32X2(zi0); + + XT_SSI(z0,(xtfloat*)z,0); + + } + } + +} /* vec_powf() */ +#else 
+#define sz_f32 (int)sizeof(FLOAT32) +void xa_nn_elm_pow_f32(FLOAT32 * restrict z, + const FLOAT32 * restrict x, + const FLOAT32 * restrict y, + int N) +{ + + const int blkSizef = MAX_ALLOCA_SZ / sz_f32; + /* Allocate a fixed-size scratch area on the stack. */ + float ALIGN(16) scr[blkSizef]; + /* Table of different constants used in computations */ + static const int32_t c_tbl[] = + { + -126, + -150, + (int32_t)0x007FFFFF,/* max denormalized floating-point number / mantissa mask */ + (int32_t)0x4B800000,/* 2^24 */ + (int32_t)0x3F3504F3,/* sqrt(0.5) */ + (int32_t)0x3F000000,/* 0.5 */ + (int32_t)0xBF000000,/* -0.5 */ + -252, + 254 + }; + int n; + const xtfloat * pX; + const xtfloat * pY; + + const xtfloat * restrict S_rd; + xtfloat * restrict S_wr; + xtfloat * restrict pZ; + const ae_int32 * restrict TBL; + const xtfloat * restrict TBL_LOG2; + const xtfloat * restrict TBL_POW2; + xtfloat x0, y0, z0, t0, t1, ef0; + xtfloat c2f, c3f, c4f; + xtfloat _0, _1, half; + ae_int32x2 c0i, c1i, c5i, c6i, c7i, c8i; + ae_int32 e0, xi0, yi0, ex0; + xtbool bsx, bsy, bdenorm, bsmall; + + /* overall number of blocks; number of values in the current block */ + int blkLen; + /* Block size, blkLen <= blkSize */ + const int blkSize = MAX_ALLOCA_SZ / (3 * sz_f32); + + + if (N <= 0) return; + + NASSERT_ALIGN16(scr); + + /* + * Data are processed in blocks of scratch area size. Further, the algorithm + * implementation is splitted in order to feed the optimizing compiler with a + * few loops of managable size. + */ + + blkLen = 0; + TBL = (const ae_int32 *)c_tbl; + for (; N>0; N -= blkLen, x += blkSize, y += blkSize, z += blkSize) + { + blkLen = XT_MIN(N, blkSize); + _0 = 0.0f; + _1 = (1.0f); + half = (0.5f); + { + pX = (const xtfloat*)x; + S_wr = ( xtfloat*)scr; + + for (n = 0; n<(blkLen); n++) + { + XT_LSIP(x0, pX, sz_f32); + + x0 = XT_ABS_S(x0); + c0i = AE_L32_I(TBL, 0 * 4); /* -126 */ + c1i = AE_L32_I(TBL, 1 * 4); /* -150 */ + c2f = XT_LSI((xtfloat*)TBL, 2 * 4); + c3f = XT_LSI((xtfloat*)TBL, 3 * 4); + /* process denormalized values */ + bdenorm = XT_OLE_S(x0, c2f); + t0 = XT_MUL_S(x0, c3f); + XT_MOVT_S(x0, t0, bdenorm); + e0 = c0i; + + AE_MOVT_32(e0, c1i, bdenorm); + /* extract exponent */ + xi0 = XT_RFR(x0); + ex0 = AE_SRLI32(xi0, 23); + e0 = AE_ADD32(e0, ex0); + /* extract mantissa */ + ex0 = XT_RFR(c2f);/* load mantissa mask */ //!!!!!!!!!!!!! + c5i = AE_L32_I(TBL, 5 * 4);/* 0.5 */ + xi0 = AE_AND32(xi0, ex0); + xi0 = AE_OR32(xi0, c5i); + x0 = XT_WFR(xi0); + /* adjust the mantissa to range [ sqrt(0.5) ; sqrt(2.0) ) */ + c4f = XT_LSI((xtfloat*)TBL, 4 * 4); + bsmall = XT_OLT_S(x0, c4f); + t0 = XT_ADD_S(x0, x0); + ex0 = AE_SUB32(e0, 1); + XT_MOVT_S(x0, t0, bsmall); + AE_MOVT_32(e0, ex0, bsmall); + x0 = XT_SUB_S(_1, x0); //!!! + ef0 = XT_FLOAT_S(e0, 0); //!!! 
+ XT_SSIP(x0, S_wr, sz_f32); + XT_SSIP(ef0, S_wr, 2 * sz_f32); + + } + } + __Pragma("no_reorder"); + /* */ + { + xtfloat p0, p1, p2, p3, p4, p5, p6, p7, p8, p9; + xtfloat p10, p11, p12, p13; + xtfloat t2, w0, w1; + S_wr = ( xtfloat*)scr + 2; + S_rd = (const xtfloat*)scr; + TBL_LOG2 = (const xtfloat *)xa_nnlib_log2f_coef; + + for (n = 0; n<(blkLen); n++) + { + XT_LSIP(x0, S_rd, 3*sz_f32); + + /* evaluate polynomial approximation */ + /* Load table of coefficients */ + + p0 = XT_LSI(TBL_LOG2, 0 * 4); + p1 = XT_LSI(TBL_LOG2, 1 * 4); + p2 = XT_LSI(TBL_LOG2, 2 * 4); + p3 = XT_LSI(TBL_LOG2, 3 * 4); + p4 = XT_LSI(TBL_LOG2, 4 * 4); + p5 = XT_LSI(TBL_LOG2, 5 * 4); + p6 = XT_LSI(TBL_LOG2, 6 * 4); + p7 = XT_LSI(TBL_LOG2, 7 * 4); + p8 = XT_LSX(TBL_LOG2, 8 * 4); + p9 = XT_LSX(TBL_LOG2, 9 * 4); + + XT_MADD_S(p1, x0, p0); + XT_MADD_S(p2, x0, p1); + XT_MADD_S(p3, x0, p2); + XT_MADD_S(p4, x0, p3); + XT_MADD_S(p5, x0, p4); + XT_MADD_S(p6, x0, p5); + XT_MADD_S(p7, x0, p6); + XT_MADD_S(p8, x0, p7); + XT_MADD_S(p9, x0, p8); + t2 = p9; + XT_SSIP(t2, S_wr, 3 * sz_f32); + } + S_wr = ( xtfloat*)scr; + S_rd = (const xtfloat*)scr; + + for (n = 0; n<(blkLen); n++) + { + p10 = XT_LSX(TBL_LOG2, 10 * 4); + p11 = XT_LSX(TBL_LOG2, 11 * 4); + p12 = XT_LSX(TBL_LOG2, 12 * 4); + p13 = XT_LSX(TBL_LOG2, 13 * 4); + + XT_LSIP(x0, S_rd, sz_f32); + XT_LSIP(ef0, S_rd, sz_f32); + XT_LSIP(t2, S_rd, sz_f32); + + /* next coefficients are computed in extended precision */ + t0 = XT_MUL_S(x0, t2); t1 = t0; + XT_MSUB_S(t1, x0, t2); + w0 = XT_ADD_S(t0, p10); + w1 = XT_SUB_S(w0, p10); + w1 = XT_SUB_S(t0, w1); + w1 = XT_SUB_S(w1, t1); + t0 = w0; t1 = w1; + w0 = XT_MUL_S(x0, t0); w1 = w0; + XT_MSUB_S(w1, x0, t0); t0 = w0; + XT_MSUB_S(w1, x0, t1); t1 = w1; + w0 = XT_ADD_S(t0, p11); + w1 = XT_SUB_S(w0, p11); + w1 = XT_SUB_S(t0, w1); + w1 = XT_SUB_S(w1, t1); + t0 = w0; t1 = w1; + x0 = XT_NEG_S(x0); + w0 = XT_MUL_S(x0, t0); w1 = w0; + XT_MSUB_S(w1, x0, t0); t0 = w0; + XT_MSUB_S(w1, x0, t1); t1 = w1; + /* multiply by log2(e) */ + w0 = XT_MUL_S(t0, p12); w1 = w0; + XT_MSUB_S(w1, t0, p12); + XT_MADD_S(w1, t1, p12); + XT_MSUB_S(w1, t0, p13); + t0 = w0; t1 = w1; + /* add exponent */ + w0 = XT_ADD_S(t0, ef0); + w1 = XT_SUB_S(w0, ef0); + w1 = XT_SUB_S(t0, w1); + t1 = XT_SUB_S(w1, t1);//!!!! + t0 = w0; // !!!!! + XT_SSIP(t0, S_wr, sz_f32); + XT_SSIP(t1, S_wr, sz_f32); + } + } + __Pragma("no_reorder"); + /* */ + { + xtfloat xy, dxy, c0, c1, _m1;; + xtfloat p0, p1, p2, p3, p4, p5, p6; + S_wr = ( xtfloat*)scr; + S_rd = (const xtfloat*)scr; + TBL_POW2 = (const xtfloat *)xa_nnlib_pow2f_coef; + pY = (const xtfloat*)y; + _m1 = -1.0f; + for (n = 0; n<(blkLen); n++) + { + XT_LSIP(t0, S_rd, sz_f32); + XT_LSIP(t1, S_rd, sz_f32); + XT_LSIP(y0, pY, sz_f32); + /* compute y*log2(x) and separate result into integer and fractional parts */ + xy = XT_FLOAT_S(XT_ROUND_S(XT_MUL_S(y0, t0), 0), 0); + dxy = XT_NEG_S(xy); + XT_MADD_S(dxy, y0, t0); + XT_MADD_S(dxy, y0, t1); + c5i = AE_L32_I(TBL, 5 * 4);/* 0.5 */ + c6i = AE_L32_I(TBL, 6 * 4);/* -0.5 */ + dxy = XT_MIN_S(dxy, _1); + dxy = XT_MAX_S(dxy, _m1); + /* compute 2^fract */ + p0 = XT_LSI(TBL_POW2, 0 * 4); + p1 = XT_LSI(TBL_POW2, 1 * 4); + p2 = XT_LSI(TBL_POW2, 2 * 4); + p3 = XT_LSI(TBL_POW2, 3 * 4); + p4 = XT_LSI(TBL_POW2, 4 * 4); + p5 = XT_LSI(TBL_POW2, 5 * 4); + p6 = XT_LSI(TBL_POW2, 6 * 4); + /* NOTE: do not change the order of computations and way of polynomial decomposition ! 
*/ + XT_MADD_S(p1, dxy, p0); + XT_MADD_S(p2, dxy, p1); + XT_MADD_S(p3, dxy, p2); + XT_MADD_S(p4, dxy, p3); + XT_MADD_S(p5, dxy, p4); + XT_MADD_S(p6, dxy, p5); + z0 = p6; + /* apply integer part */ + e0 = XT_TRUNC_S(xy, 0); + c7i = AE_L32_I(TBL, 7 * 4);/* -252 */ + c8i = AE_L32_X(TBL, 8 * 4);/* 254 */ + e0 = AE_MAX32(e0, c7i); + e0 = AE_MIN32(e0, c8i); + e0 = AE_ADD32(e0, c8i); + ex0 = AE_SRAI32(e0, 1); + e0 = AE_SUB32(e0, ex0); + ex0 = AE_SLLI32(ex0, 23); + e0 = AE_SLLI32(e0, 23); + + c0 = XT_WFR(e0); + c1 = XT_WFR(ex0); + z0 = XT_MUL_S(z0, c1); + z0 = XT_MUL_S(z0, c0); //!!!!!!!!!!!! + XT_SSIP(z0, S_wr, sz_f32); + + } + } + __Pragma("no_reorder"); + /* */ + { + xtbool b_yint, b_e0, b0, b_notspec; + xtbool b_yeqz, b_yinf, b_xeqz, b_xeq1, b_xinf; + xtbool b_NaN1, b_NaN2, b_one, b_Inf, b_zero; + uint32_t b0i, b1i; + uint32_t yeqz, yinf, xeqz, xeq1, xinf, sx, sy, yint; + uint32_t one, NaN1, Inf, zero; + xtfloat xabs, spec; + ae_int32x2 sgn, zi0; + + S_rd = (const xtfloat*)scr; + pY = (const xtfloat*)y; + pX = (const xtfloat*)x; + pZ = (xtfloat*)z; + + for (n = 0; n<(blkLen); n++) + { + XT_LSIP(z0, S_rd, sz_f32); + XT_LSIP(x0, pX, sz_f32); + XT_LSIP(y0, pY, sz_f32); + + /* Take sign of x and y */ + xi0 = XT_RFR(x0); + yi0 = XT_RFR(y0); + bsx = XT_OLT_S(x0, (xtfloat)0.0f); + bsy = XT_OLT_S(y0, (xtfloat)0.0f); + + xabs = XT_ABS_S(x0); + /* check if y is integer */ + { /* validate if y is integral - all numbers bigger than 2^23 are assumed as integral */ + xtfloat t, c; + t = XT_ABS_S((xtfloat)y0); + c = 8388608.f; + XT_MOVT_S(c, t, XT_ULT_S(t, 8388608.f)); + t = c; + t0 = XT_FLOAT_S(XT_TRUNC_S(t, 0), 0); + b_yint = XT_OEQ_S(XT_FLOAT_S(XT_TRUNC_S(t, 0), 0), t); + } + + /* check if y is odd */ + e0 = XT_TRUNC_S(y0, 0); //temp0 + b_e0 = xtbool2_extract_0(AE_EQ32(e0, MAX_INT32));//~b_tmp0 + b0i = AE_MOVAB(b_e0); + b1i = AE_MOVAB(b_yint); + b0i = b1i&(~b0i); + b0 = AE_MOVBA(b0i); + AE_MOVF_32(e0, AE_ZERO32(), b0); + e0 = AE_SLLI32(e0, 31); + sgn = AE_AND32(e0, xi0); + /* process special numbers */ + b_yeqz = XT_OEQ_S((xtfloat)0.0f, y0); /* y ==0 */ + b_yinf = XT_OEQ_S(XT_ABS_S(y0), xa_nnlib_pow_plusInff.f); /* |y|==Inf */ + b_xeqz = XT_OEQ_S(x0, (xtfloat)0.0f); /* x ==0 */ + b_xeq1 = XT_OEQ_S(xabs, (xtfloat)1.0f); /* |x|==1 */ + b_xinf = XT_OEQ_S(xabs, xa_nnlib_pow_plusInff.f); /* |x|==INF */ + + yint = AE_MOVAB(b_yint); + yeqz = AE_MOVAB(b_yeqz); + yinf = AE_MOVAB(b_yinf); + xeqz = AE_MOVAB(b_xeqz); + xeq1 = AE_MOVAB(b_xeq1); + xinf = AE_MOVAB(b_xinf); + sx = AE_MOVAB(bsx); + sy = AE_MOVAB(bsy); + one = xeq1 & (yinf | (~sx)); /* |x|==1 && ( |y|==Inf || x>0 ) */ + one = one | yeqz; /* ( |x|==1 && ( |y|==Inf || x>0 ) ) || y==0 --> z=1.0 */ + NaN1 = sx&(~yint); /* x<0 && y is not an integer --> z=NaN */ + Inf = xinf&(~sy); /* x==INF && y>0 --> z=INF */ + Inf = Inf | (xeqz & sy); /* x==0 && y<0 --> z=INF */ + zero = xeqz &(~sy); /* x==0 && y>0 --> z=0.0 */ + zero = zero | (xinf & sy); /* x==INF && y<0 --> z=0.0 */ + + b_NaN1 = AE_MOVBA(NaN1); + b_NaN2 = XT_UN_S(x0, y0); /* isnan(x) || isnan(y) --> z=NaN */ + b_one = AE_MOVBA(one); + b_Inf = AE_MOVBA(Inf); + b_zero = AE_MOVBA(zero); + + /* Save special numbers and mask for special numbers */ + spec = (xtfloat)xa_nnlib_pow_qNaNf.f; + XT_MOVF_S(spec, half, b_NaN1); + XT_MOVT_S(spec, _0, b_zero); + XT_MOVT_S(spec, xa_nnlib_pow_plusInff.f, b_Inf); + XT_MOVT_S(spec, xa_nnlib_pow_qNaNf.f, b_NaN2); + XT_MOVT_S(spec, _1, b_one); + + b_notspec = XT_OEQ_S(spec, half); + /* Replace result with special numbers if needed */ + XT_MOVF_S(z0, spec, b_notspec); + /* 
Restore sign and store result */ + zi0 = XT_RFR(z0); + zi0 = AE_XOR32(zi0, sgn); + z0 = XT_WFR(zi0); + XT_SSIP(z0, pZ, sz_f32); + } + } + } + +} /* vec_powf() */ +#endif
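
Note (illustrative, not part of the patch): the stride-based 4D broadcast dispatch for the element-wise minimum above — zeroing an input's stride on dimensions where its size is 1 and then choosing a 2D or flat inner kernel — computes the same result as the plain scalar reference below. The helper names and the `(a < b) ? a : b` tie/NaN behaviour here are assumptions for clarity; the DSP MIN op may treat NaN differently.

#include <stddef.h>

/* Scalar reference sketch for 4D element-wise minimum with broadcasting. */
static float min_f(float a, float b) { return (a < b) ? a : b; }

static void ref_minimum_broadcast_4d(
    float *out, const int out_shape[4],
    const float *in1, const int in1_shape[4],
    const float *in2, const int in2_shape[4])
{
  /* Row-major strides per input; a size-1 dimension gets stride 0 so its single
     element is reused across the matching output dimension. */
  int s1[4], s2[4];
  int acc1 = 1, acc2 = 1;
  for (int d = 3; d >= 0; d--) {
    s1[d] = (in1_shape[d] == 1) ? 0 : acc1;
    s2[d] = (in2_shape[d] == 1) ? 0 : acc2;
    acc1 *= in1_shape[d];
    acc2 *= in2_shape[d];
  }
  size_t o = 0;
  for (int i0 = 0; i0 < out_shape[0]; i0++)
    for (int i1 = 0; i1 < out_shape[1]; i1++)
      for (int i2 = 0; i2 < out_shape[2]; i2++)
        for (int i3 = 0; i3 < out_shape[3]; i3++, o++)
          out[o] = min_f(in1[i0 * s1[0] + i1 * s1[1] + i2 * s1[2] + i3 * s1[3]],
                         in2[i0 * s2[0] + i1 * s2[1] + i2 * s2[2] + i3 * s2[3]]);
}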
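
Note (illustrative, not part of the patch): the finite path of xa_nn_elm_pow_f32 in both the vectorized and scalar variants follows the same 2^(y*log2(x)) decomposition: x is split into mantissa and exponent, log2 is evaluated by polynomial, y*log2(x) is split into integer and fractional parts, and 2^fraction is evaluated by a second polynomial with the integer part applied through the exponent field. A minimal scalar sketch of that structure, assuming x > 0 and with libm standing in for the polynomial approximations (the helper name is hypothetical):

#include <math.h>

static float ref_pow_core(float x, float y)
{
  int e;
  float m = frexpf(x, &e);         /* x = m * 2^e, m in [0.5, 1)                  */
  float l = (float)e + log2f(m);   /* kernel: polynomial in the adjusted mantissa */
  float t = y * l;
  float i = nearbyintf(t);         /* integer part, applied through the exponent  */
  float f = t - i;                 /* fractional part in [-0.5, 0.5]              */
  if (i >  254.0f) i =  254.0f;    /* kernel clamps the exponent to [-252, 254]   */
  if (i < -252.0f) i = -252.0f;
  return ldexpf(exp2f(f), (int)i); /* kernel: second polynomial for 2^f           */
}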
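
Note (illustrative, not part of the patch): the special-case selection at the end of xa_nn_elm_pow_f32 implements the floating-point table in the block comment above (the one/NaN/Inf/zero masks and the sign restored only for odd integer y). A scalar rendering of that logic, with a hypothetical helper name and libm standing in for the kernel's finite path:

#include <math.h>

static float ref_powf(float x, float y)
{
  int y_int = isfinite(y) && (truncf(y) == y);          /* y is a finite integer */
  int y_odd = y_int && (fmodf(fabsf(y), 2.0f) == 1.0f); /* ...and an odd one     */

  if (y == 0.0f || (fabsf(x) == 1.0f && (isinf(y) || x > 0.0f)))
    return 1.0f;                           /* y==0, or |x|==1 with |y|==Inf or x>0 */
  if (isnan(x) || isnan(y))
    return nanf("");
  if (isfinite(x) && x < 0.0f && isfinite(y) && truncf(y) != y)
    return nanf("");                       /* finite x<0 with finite non-integer y */
  if ((isinf(x) && y > 0.0f) || (x == 0.0f && y < 0.0f))
    return (y_odd && signbit(x)) ? -INFINITY : INFINITY;
  if ((x == 0.0f && y > 0.0f) || (isinf(x) && y < 0.0f))
    return (y_odd && signbit(x)) ? -0.0f : 0.0f;

  /* Finite case: |x|^y = 2^(y*log2(|x|)), sign restored for odd integer y. */
  float z = exp2f(y * log2f(fabsf(x)));
  return (y_odd && x < 0.0f) ? -z : z;
}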