diff --git a/cmake/developer_package/packaging/archive.cmake b/cmake/developer_package/packaging/archive.cmake index d4e9c65096d245..853b7649e88ddf 100644 --- a/cmake/developer_package/packaging/archive.cmake +++ b/cmake/developer_package/packaging/archive.cmake @@ -94,6 +94,8 @@ macro(ov_define_component_include_rules) set(OV_CPACK_COMP_PKG_CONFIG_EXCLUDE_ALL ${OV_CPACK_COMP_CORE_DEV_EXCLUDE_ALL}) # symbolic links set(OV_CPACK_COMP_LINKS_EXCLUDE_ALL ${OV_CPACK_COMP_CORE_DEV_EXCLUDE_ALL}) + # npu internal tools + unset(OV_CPACK_COMP_NPU_INTERNAL_EXCLUDE_ALL) endmacro() ov_define_component_include_rules() diff --git a/cmake/developer_package/packaging/common-libraries.cmake b/cmake/developer_package/packaging/common-libraries.cmake index 247f107b83b6fc..4ec96dc28b53e8 100644 --- a/cmake/developer_package/packaging/common-libraries.cmake +++ b/cmake/developer_package/packaging/common-libraries.cmake @@ -111,6 +111,8 @@ macro(ov_define_component_include_rules) set(OV_CPACK_COMP_PKG_CONFIG_EXCLUDE_ALL ${OV_CPACK_COMP_CORE_DEV_EXCLUDE_ALL}) # symbolic links set(OV_CPACK_COMP_LINKS_EXCLUDE_ALL ${OV_CPACK_COMP_CORE_DEV_EXCLUDE_ALL}) + # npu internal tools + set(OV_CPACK_COMP_NPU_INTERNAL_EXCLUDE_ALL EXCLUDE_FROM_ALL) endmacro() ov_define_component_include_rules() diff --git a/cmake/developer_package/packaging/debian/debian.cmake b/cmake/developer_package/packaging/debian/debian.cmake index 1b29dd7697d1c7..c7f49419111cea 100644 --- a/cmake/developer_package/packaging/debian/debian.cmake +++ b/cmake/developer_package/packaging/debian/debian.cmake @@ -118,6 +118,8 @@ macro(ov_define_component_include_rules) set(OV_CPACK_COMP_PKG_CONFIG_EXCLUDE_ALL ${OV_CPACK_COMP_CORE_DEV_EXCLUDE_ALL}) # symbolic links set(OV_CPACK_COMP_LINKS_EXCLUDE_ALL ${OV_CPACK_COMP_CORE_DEV_EXCLUDE_ALL}) + # npu internal tools + set(OV_CPACK_COMP_NPU_INTERNAL_EXCLUDE_ALL EXCLUDE_FROM_ALL) endmacro() ov_define_component_include_rules() diff --git a/cmake/developer_package/packaging/npm.cmake 
b/cmake/developer_package/packaging/npm.cmake index 996b55e0ba032f..24453965125348 100644 --- a/cmake/developer_package/packaging/npm.cmake +++ b/cmake/developer_package/packaging/npm.cmake @@ -85,6 +85,8 @@ macro(ov_define_component_include_rules) unset(OV_CPACK_COMP_PKG_CONFIG_EXCLUDE_ALL) # symbolic links unset(OV_CPACK_COMP_LINKS_EXCLUDE_ALL) + # npu internal tools + set(OV_CPACK_COMP_NPU_INTERNAL_EXCLUDE_ALL EXCLUDE_FROM_ALL) endmacro() ov_define_component_include_rules() diff --git a/cmake/developer_package/packaging/nsis.cmake b/cmake/developer_package/packaging/nsis.cmake index b84569e4f0b1d1..f5f9a233e8b87f 100644 --- a/cmake/developer_package/packaging/nsis.cmake +++ b/cmake/developer_package/packaging/nsis.cmake @@ -140,6 +140,8 @@ macro(ov_define_component_include_rules) set(OV_CPACK_COMP_PKG_CONFIG_EXCLUDE_ALL ${OV_CPACK_COMP_CORE_DEV_EXCLUDE_ALL}) # symbolic links set(OV_CPACK_COMP_LINKS_EXCLUDE_ALL ${OV_CPACK_COMP_CORE_DEV_EXCLUDE_ALL}) + # npu internal tools + set(OV_CPACK_COMP_NPU_INTERNAL_EXCLUDE_ALL EXCLUDE_FROM_ALL) endmacro() ov_define_component_include_rules() diff --git a/cmake/developer_package/packaging/rpm/rpm.cmake b/cmake/developer_package/packaging/rpm/rpm.cmake index 56a0a12647079c..7c9fb4f22a372d 100644 --- a/cmake/developer_package/packaging/rpm/rpm.cmake +++ b/cmake/developer_package/packaging/rpm/rpm.cmake @@ -109,6 +109,8 @@ macro(ov_define_component_include_rules) set(OV_CPACK_COMP_PKG_CONFIG_EXCLUDE_ALL ${OV_CPACK_COMP_CORE_DEV_EXCLUDE_ALL}) # symbolic links set(OV_CPACK_COMP_LINKS_EXCLUDE_ALL ${OV_CPACK_COMP_CORE_DEV_EXCLUDE_ALL}) + # npu internal tools + set(OV_CPACK_COMP_NPU_INTERNAL_EXCLUDE_ALL EXCLUDE_FROM_ALL) endmacro() ov_define_component_include_rules() diff --git a/src/plugins/intel_npu/CMakeLists.txt b/src/plugins/intel_npu/CMakeLists.txt index dd9ca9279349ab..ce06b2542ac31c 100644 --- a/src/plugins/intel_npu/CMakeLists.txt +++ b/src/plugins/intel_npu/CMakeLists.txt @@ -31,3 +31,7 @@ add_subdirectory(src) 
if(ENABLE_TESTS) add_subdirectory(tests) endif() + +add_subdirectory(tools) + +ov_cpack_add_component(${NPU_INTERNAL_COMPONENT} HIDDEN) diff --git a/src/plugins/intel_npu/tools/CMakeLists.txt b/src/plugins/intel_npu/tools/CMakeLists.txt new file mode 100644 index 00000000000000..c0e620981952e1 --- /dev/null +++ b/src/plugins/intel_npu/tools/CMakeLists.txt @@ -0,0 +1,8 @@ +# +# Copyright (C) 2024 Intel Corporation. +# SPDX-License-Identifier: Apache 2.0 +# + +add_subdirectory(common) +add_subdirectory(compile_tool) +add_subdirectory(single-image-test) diff --git a/src/plugins/intel_npu/tools/common/CMakeLists.txt b/src/plugins/intel_npu/tools/common/CMakeLists.txt new file mode 100644 index 00000000000000..a25e6126424617 --- /dev/null +++ b/src/plugins/intel_npu/tools/common/CMakeLists.txt @@ -0,0 +1,29 @@ +# +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +set(TARGET_NAME "npu_tools_utils") + +# +# Define the target +# + +ov_add_target(ADD_CPPLINT + TYPE STATIC + NAME ${TARGET_NAME} + ROOT ${CMAKE_CURRENT_SOURCE_DIR} + INCLUDES + PUBLIC + "$" + LINK_LIBRARIES + PRIVATE + openvino::runtime) + +set_target_properties(${TARGET_NAME} PROPERTIES + FOLDER ${CMAKE_CURRENT_SOURCE_DIR} + CXX_STANDARD 17) + +if (CMAKE_COMPILER_IS_GNUCXX) + target_compile_options(${TARGET_NAME} PRIVATE -Wall) +endif() diff --git a/src/plugins/intel_npu/tools/common/include/data_type_converters.hpp b/src/plugins/intel_npu/tools/common/include/data_type_converters.hpp new file mode 100644 index 00000000000000..c9c12b5154f398 --- /dev/null +++ b/src/plugins/intel_npu/tools/common/include/data_type_converters.hpp @@ -0,0 +1,272 @@ +// +// Copyright (C) 2024 Intel Corporation. 
+// SPDX-License-Identifier: Apache 2.0 +// + +#pragma once + +#include "openvino/core/except.hpp" +#include "openvino/core/type.hpp" +#include "openvino/core/type/element_type.hpp" +#include "openvino/core/type/element_type_traits.hpp" + +#include + +namespace npu { +namespace utils { + +namespace details { + +template +std::enable_if_t staticIf(Func&& func) { + func(); +} + +template +std::enable_if_t staticIf(Func&&) { +} + +// To overcome the syntax parse error, when `>` comparison operator is treated as +// template closing bracket +template +constexpr bool Greater(T1&& v1, T2&& v2) { + return v1 > v2; +} + +} // namespace details + +// +// Bool logic +// + +template +using not_ = std::negation; + +template +using or_ = std::disjunction; + +template +using and_ = std::conjunction; + +// +// enable_if +// + +template +using enable_t = std::enable_if_t<(Args::value && ...), T>; + +// +// Standart data types +// + +template +enable_t> checked_cast(InT value) { + return value; +} + +template +enable_t, std::is_signed, std::is_integral, std::is_signed, + not_>> +checked_cast(InT value) { + details::staticIf::lowest() < std::numeric_limits::lowest()>([&] { + OPENVINO_ASSERT(value >= std::numeric_limits::lowest(), "Can not safely cast ", + static_cast(value), " from ", ov::element::from(), " to ", + ov::element::from()); + }); + + details::staticIf::max(), std::numeric_limits::max())>([&] { + OPENVINO_ASSERT(value <= std::numeric_limits::max(), "Can not safely cast ", static_cast(value), + " from ", ov::element::from(), " to ", ov::element::from()); + }); + + return static_cast(value); +} + +template +enable_t, std::is_unsigned, std::is_integral, std::is_unsigned, + not_>> +checked_cast(InT value) { + details::staticIf::max(), std::numeric_limits::max())>([&] { + OPENVINO_ASSERT(value <= std::numeric_limits::max(), "Can not safely cast ", static_cast(value), + " from ", ov::element::from(), " to ", ov::element::from()); + }); + + return static_cast(value); +} + 
+template +enable_t, std::is_unsigned, std::is_integral, std::is_signed> checked_cast( + InT value) { + details::staticIf::max(), + static_cast>(std::numeric_limits::max()))>([&] { + OPENVINO_ASSERT(value <= static_cast>(std::numeric_limits::max()), + "Can not safely cast ", static_cast(value), " from ", ov::element::from(), + " to ", ov::element::from()); + }); + + return static_cast(value); +} + +template +enable_t, std::is_signed, std::is_integral, std::is_unsigned> checked_cast( + InT value) { + OPENVINO_ASSERT(value >= 0, "Can not safely cast ", static_cast(value), " from ", ov::element::from(), + " to ", ov::element::from()); + + details::staticIf>(std::numeric_limits::max()), + std::numeric_limits::max())>([&] { + OPENVINO_ASSERT(static_cast>(value) <= std::numeric_limits::max(), + "Can not safely cast ", static_cast(value), " from ", ov::element::from(), " to ", + ov::element::from()); + }); + + return static_cast(value); +} + +template +enable_t, std::is_integral> checked_cast(InT value) { + OPENVINO_ASSERT(value <= static_cast(std::numeric_limits::max()), "Can not safely cast ", value, + " from ", ov::element::from(), " to ", ov::element::from()); + + OPENVINO_ASSERT(value >= static_cast(std::numeric_limits::lowest()), "Can not safely cast ", value, + " from ", ov::element::from(), " to ", ov::element::from()); + + return static_cast(value); +} + +template +enable_t, std::is_signed, std::is_floating_point> checked_cast(InT value) { + OPENVINO_ASSERT(static_cast(static_cast(value)) == value, "Can not safely cast ", + static_cast(value), " from ", ov::element::from(), " to ", ov::element::from()); + + return static_cast(value); +} + +template +enable_t, std::is_unsigned, std::is_floating_point> checked_cast(InT value) { + OPENVINO_ASSERT(static_cast(static_cast(value)) == value, "Can not safely cast ", + static_cast(value), " from ", ov::element::from(), " to ", + ov::element::from()); + + return static_cast(value); +} + +template +enable_t, std::is_same> 
checked_cast(InT value) { + OPENVINO_ASSERT(static_cast(static_cast(value)) == value, "Can not safely cast ", value, " from ", + ov::element::from(), " to ", ov::element::from()); + + return static_cast(value); +} + +template +enable_t, std::is_same> checked_cast(InT value) { + return static_cast(value); +} + +// +// Custom float types +// + +template +enable_t> checked_cast(ov::bfloat16 val) { + return ov::float16(static_cast(val)); +} +template +enable_t> checked_cast(ov::bfloat16 val) { + return ov::float8_e4m3(static_cast(val)); +} +template +enable_t> checked_cast(ov::bfloat16 val) { + return ov::float8_e5m2(static_cast(val)); +} +template +enable_t> checked_cast(ov::float16 val) { + return ov::bfloat16(static_cast(val)); +} +template +enable_t> checked_cast(ov::float16 val) { + return ov::float8_e4m3(static_cast(val)); +} +template +enable_t> checked_cast(ov::float16 val) { + return ov::float8_e5m2(static_cast(val)); +} +template +enable_t> checked_cast(ov::float8_e4m3 val) { + return ov::float8_e5m2(static_cast(val)); +} +template +enable_t> checked_cast(ov::float8_e4m3 val) { + return ov::float16(static_cast(val)); +} +template +enable_t> checked_cast(ov::float8_e4m3 val) { + return ov::bfloat16(static_cast(val)); +} +template +enable_t> checked_cast(ov::float8_e5m2 val) { + return ov::float8_e4m3(static_cast(val)); +} +template +enable_t> checked_cast(ov::float8_e5m2 val) { + return ov::float16(static_cast(val)); +} +template +enable_t> checked_cast(ov::float8_e5m2 val) { + return ov::bfloat16(static_cast(val)); +} + +template +enable_t, std::is_same, + std::is_same>>> +checked_cast(ov::bfloat16 val) { + return checked_cast(static_cast(val)); +} +template +enable_t, std::is_same, + std::is_same>>> +checked_cast(ov::float16 val) { + return checked_cast(static_cast(val)); +} +template +enable_t, std::is_same, + std::is_same>>> +checked_cast(ov::float8_e4m3 val) { + return checked_cast(static_cast(val)); +} +template +enable_t, std::is_same, + std::is_same>>> 
+checked_cast(ov::float8_e5m2 val) { + return checked_cast(static_cast(val)); +} + +template +enable_t> checked_cast(InT val) { + return ov::bfloat16(checked_cast(val)); +} +template +enable_t> checked_cast(InT val) { + return ov::float16(checked_cast(val)); +} +template +enable_t> checked_cast(InT val) { + return ov::float8_e4m3(checked_cast(val)); +} +template +enable_t> checked_cast(InT val) { + return ov::float8_e5m2(checked_cast(val)); +} + +// +// Wrapper +// + +template +OutT convertValuePrecision(InT value) { + return checked_cast(value); +} + +} // namespace utils +} // namespace npu diff --git a/src/plugins/intel_npu/tools/common/include/tensor_utils.hpp b/src/plugins/intel_npu/tools/common/include/tensor_utils.hpp new file mode 100644 index 00000000000000..c6ca8f50fd3f94 --- /dev/null +++ b/src/plugins/intel_npu/tools/common/include/tensor_utils.hpp @@ -0,0 +1,62 @@ +// +// Copyright (C) 2024 Intel Corporation. +// SPDX-License-Identifier: Apache 2.0 +// + +#pragma once + +#include + +namespace npu { +namespace utils { + +/** + * @brief Copies the contents of one tensor into another one which bears the same shape and precision. + * + * @param in The source tensor + * @param out The destination tensor + */ +void copyTensor(const ov::Tensor& in, const ov::Tensor& out); + +/** + * @brief Copies the contents of one tensor into another one which bears the same shape. Precision conversions from + * source type to target type will be performed if required. + * + * @param in The source tensor + * @param out The destination tensor + */ +void convertTensorPrecision(const ov::Tensor& in, const ov::Tensor& out); + +/** + * @brief Constructs a tensor with the same content as the source but with the precision converted to the specified + * target. 
+ * + * @param in The source tensor + * @param precision The target precision + * @param ptr Optional, the constructed tensor will use this address for its buffer if specified + * @return The tensor obtained upon converting the precision. + */ +ov::Tensor toPrecision(const ov::Tensor& in, const ov::element::Type& precision, void* ptr = nullptr); + +/** + * @brief Constructs a tensor with the same content as the source but with the precision converted to FP32. + * + * @param in The source tensor + * @param ptr Optional, the constructed tensor will use this address for its buffer if specified + * @return The tensor obtained upon converting the precision. + */ +inline ov::Tensor toFP32(const ov::Tensor& in, void* ptr = nullptr) { + return toPrecision(in, ov::element::Type_t::f32, ptr); +} + +/** + * @brief Converts the precision used by a batch of tensors to FP32 and returns their buffers. The original tensors + * remain unchanged. + * + * @param tensors The source tensors + * @return The buffers of the tensors obtained upon precision conversion + */ +std::vector> parseTensorsAsFP32(const std::map& tensors); + +} // namespace utils +} // namespace npu diff --git a/src/plugins/intel_npu/tools/common/src/tensor_utils.cpp b/src/plugins/intel_npu/tools/common/src/tensor_utils.cpp new file mode 100644 index 00000000000000..470d737a2b9d31 --- /dev/null +++ b/src/plugins/intel_npu/tools/common/src/tensor_utils.cpp @@ -0,0 +1,472 @@ +// +// Copyright (C) 2024 Intel Corporation. 
+// SPDX-License-Identifier: Apache 2.0 +// + +#include "tensor_utils.hpp" + +#include "data_type_converters.hpp" + +#include +#include +#include +#include + +#include + +namespace { + +template +void convertTensorPrecisionImpl(const ov::Tensor& in, const ov::Tensor& out) { + const auto inputBuffer = in.data(); + OPENVINO_ASSERT(inputBuffer != nullptr, "Tensor was not allocated"); + + const auto outputBuffer = out.data(); + OPENVINO_ASSERT(outputBuffer != nullptr, "Tensor was not allocated"); + + for (size_t index = 0; index < in.get_size(); ++index) { + outputBuffer[index] = npu::utils::convertValuePrecision(inputBuffer[index]); + } +} + +} // namespace + +namespace npu { +namespace utils { + +void copyTensor(const ov::Tensor& in, const ov::Tensor& out) { + OPENVINO_ASSERT(in.get_element_type() == out.get_element_type(), "Precision mismatch"); + OPENVINO_ASSERT(in.get_shape() == out.get_shape(), "Shape mismatch"); + + const auto inputBuffer = in.data(); + OPENVINO_ASSERT(inputBuffer != nullptr, "Tensor was not allocated"); + + const auto outputBuffer = out.data(); + OPENVINO_ASSERT(outputBuffer != nullptr, "Tensor was not allocated"); + + std::copy_n(inputBuffer, in.get_byte_size(), outputBuffer); +} + +void convertTensorPrecision(const ov::Tensor& in, const ov::Tensor& out) { + OPENVINO_ASSERT(in.get_shape() == out.get_shape(), "Mismatch in Dims"); + + const ov::element::Type& inPrecision = in.get_element_type(); + const ov::element::Type& outPrecision = out.get_element_type(); + + if (inPrecision == outPrecision) { + copyTensor(in, out); + return; + } + +#define CASE(InT, OutT) \ + convertTensorPrecisionImpl, \ + ov::fundamental_type_for>(in, out); \ + break + + switch (inPrecision) { + case ov::element::Type_t::f64: { + switch (outPrecision) { + case ov::element::Type_t::f32: + CASE(f64, f32); + case ov::element::Type_t::u64: + CASE(f64, u64); + case ov::element::Type_t::i64: + CASE(f64, i64); + case ov::element::Type_t::u32: + CASE(f64, u32); + case 
ov::element::Type_t::i32: + CASE(f64, i32); + case ov::element::Type_t::u16: + CASE(f64, u16); + case ov::element::Type_t::i16: + CASE(f64, i16); + case ov::element::Type_t::u8: + CASE(f64, u8); + case ov::element::Type_t::i8: + CASE(f64, i8); + case ov::element::Type_t::f16: + CASE(f64, f16); + case ov::element::Type_t::bf16: + CASE(f64, bf16); + default: + OPENVINO_THROW("Unsupported combination of precisions ", inPrecision.get_type_name(), " -> ", + outPrecision.get_type_name()); + } + break; + } + case ov::element::Type_t::f32: { + switch (outPrecision) { + case ov::element::Type_t::f64: + CASE(f32, f64); + case ov::element::Type_t::u64: + CASE(f32, u64); + case ov::element::Type_t::i64: + CASE(f32, i64); + case ov::element::Type_t::u32: + CASE(f32, u32); + case ov::element::Type_t::i32: + CASE(f32, i32); + case ov::element::Type_t::u16: + CASE(f32, u16); + case ov::element::Type_t::i16: + CASE(f32, i16); + case ov::element::Type_t::u8: + CASE(f32, u8); + case ov::element::Type_t::i8: + CASE(f32, i8); + case ov::element::Type_t::f16: + CASE(f32, f16); + case ov::element::Type_t::bf16: + CASE(f32, bf16); + default: + OPENVINO_THROW("Unsupported combination of precisions ", inPrecision.get_type_name(), " -> ", + outPrecision.get_type_name()); + } + break; + } + case ov::element::Type_t::f16: { + switch (outPrecision) { + case ov::element::Type_t::f64: + CASE(f16, f64); + case ov::element::Type_t::f32: + CASE(f16, f32); + case ov::element::Type_t::bf16: + CASE(f16, bf16); + case ov::element::Type_t::u64: + CASE(f16, u64); + case ov::element::Type_t::i64: + CASE(f16, i64); + case ov::element::Type_t::u32: + CASE(f16, u32); + case ov::element::Type_t::i32: + CASE(f16, i32); + case ov::element::Type_t::u16: + CASE(f16, u16); + case ov::element::Type_t::i16: + CASE(f16, i16); + case ov::element::Type_t::u8: + CASE(f16, u8); + case ov::element::Type_t::i8: + CASE(f16, i8); + default: + OPENVINO_THROW("Unsupported combination of precisions ", 
inPrecision.get_type_name(), " -> ", + outPrecision.get_type_name()); + } + break; + } + case ov::element::Type_t::bf16: { + switch (outPrecision) { + case ov::element::Type_t::f64: + CASE(bf16, f64); + case ov::element::Type_t::f32: + CASE(bf16, f32); + case ov::element::Type_t::f16: + CASE(bf16, f16); + case ov::element::Type_t::u64: + CASE(bf16, u64); + case ov::element::Type_t::i64: + CASE(bf16, i64); + case ov::element::Type_t::u32: + CASE(bf16, u32); + case ov::element::Type_t::i32: + CASE(bf16, i32); + case ov::element::Type_t::u16: + CASE(bf16, u16); + case ov::element::Type_t::i16: + CASE(bf16, i16); + case ov::element::Type_t::u8: + CASE(bf16, u8); + case ov::element::Type_t::i8: + CASE(bf16, i8); + default: + OPENVINO_THROW("Unsupported combination of precisions ", inPrecision.get_type_name(), " -> ", + outPrecision.get_type_name()); + } + break; + } + case ov::element::Type_t::u64: { + switch (outPrecision) { + case ov::element::Type_t::f64: + CASE(u64, f64); + case ov::element::Type_t::f32: + CASE(u64, f32); + case ov::element::Type_t::i64: + CASE(u64, i64); + case ov::element::Type_t::u32: + CASE(u64, u32); + case ov::element::Type_t::i32: + CASE(u64, i32); + case ov::element::Type_t::u16: + CASE(u64, u16); + case ov::element::Type_t::i16: + CASE(u64, i16); + case ov::element::Type_t::u8: + CASE(u64, u8); + case ov::element::Type_t::i8: + CASE(u64, i8); + case ov::element::Type_t::f16: + CASE(u64, f16); + case ov::element::Type_t::bf16: + CASE(u64, bf16); + default: + OPENVINO_THROW("Unsupported combination of precisions ", inPrecision.get_type_name(), " -> ", + outPrecision.get_type_name()); + } + break; + } + case ov::element::Type_t::i64: { + switch (outPrecision) { + case ov::element::Type_t::f64: + CASE(i64, f64); + case ov::element::Type_t::f32: + CASE(i64, f32); + case ov::element::Type_t::u64: + CASE(i64, u64); + case ov::element::Type_t::u32: + CASE(i64, u32); + case ov::element::Type_t::i32: + CASE(i64, i32); + case ov::element::Type_t::u16: 
+ CASE(i64, u16); + case ov::element::Type_t::i16: + CASE(i64, i16); + case ov::element::Type_t::u8: + CASE(i64, u8); + case ov::element::Type_t::i8: + CASE(i64, i8); + case ov::element::Type_t::f16: + CASE(i64, f16); + case ov::element::Type_t::bf16: + CASE(i64, bf16); + default: + OPENVINO_THROW("Unsupported combination of precisions ", inPrecision.get_type_name(), " -> ", + outPrecision.get_type_name()); + } + break; + } + case ov::element::Type_t::u32: { + switch (outPrecision) { + case ov::element::Type_t::f64: + CASE(u32, f64); + case ov::element::Type_t::f32: + CASE(u32, f32); + case ov::element::Type_t::u64: + CASE(u32, u64); + case ov::element::Type_t::i64: + CASE(u32, i64); + case ov::element::Type_t::i32: + CASE(u32, i32); + case ov::element::Type_t::u16: + CASE(u32, u16); + case ov::element::Type_t::i16: + CASE(u32, i16); + case ov::element::Type_t::u8: + CASE(u32, u8); + case ov::element::Type_t::i8: + CASE(u32, i8); + case ov::element::Type_t::f16: + CASE(u32, f16); + case ov::element::Type_t::bf16: + CASE(u32, bf16); + default: + OPENVINO_THROW("Unsupported combination of precisions ", inPrecision.get_type_name(), " -> ", + outPrecision.get_type_name()); + } + break; + } + case ov::element::Type_t::i32: { + switch (outPrecision) { + case ov::element::Type_t::f64: + CASE(i32, f64); + case ov::element::Type_t::f32: + CASE(i32, f32); + case ov::element::Type_t::u64: + CASE(i32, u64); + case ov::element::Type_t::i64: + CASE(i32, i64); + case ov::element::Type_t::u32: + CASE(i32, u32); + case ov::element::Type_t::u16: + CASE(i32, u16); + case ov::element::Type_t::i16: + CASE(i32, i16); + case ov::element::Type_t::u8: + CASE(i32, u8); + case ov::element::Type_t::i8: + CASE(i32, i8); + case ov::element::Type_t::f16: + CASE(i32, f16); + case ov::element::Type_t::bf16: + CASE(i32, bf16); + default: + OPENVINO_THROW("Unsupported combination of precisions ", inPrecision.get_type_name(), " -> ", + outPrecision.get_type_name()); + } + break; + } + case 
ov::element::Type_t::u16: { + switch (outPrecision) { + case ov::element::Type_t::f64: + CASE(u16, f64); + case ov::element::Type_t::f32: + CASE(u16, f32); + case ov::element::Type_t::u64: + CASE(u16, u64); + case ov::element::Type_t::i64: + CASE(u16, i64); + case ov::element::Type_t::u32: + CASE(u16, u32); + case ov::element::Type_t::i32: + CASE(u16, i32); + case ov::element::Type_t::i16: + CASE(u16, i16); + case ov::element::Type_t::u8: + CASE(u16, u8); + case ov::element::Type_t::i8: + CASE(u16, i8); + case ov::element::Type_t::f16: + CASE(u16, f16); + case ov::element::Type_t::bf16: + CASE(u16, bf16); + default: + OPENVINO_THROW("Unsupported combination of precisions ", inPrecision.get_type_name(), " -> ", + outPrecision.get_type_name()); + } + break; + } + case ov::element::Type_t::i16: { + switch (outPrecision) { + case ov::element::Type_t::f64: + CASE(i16, f64); + case ov::element::Type_t::f32: + CASE(i16, f32); + case ov::element::Type_t::u64: + CASE(i16, u64); + case ov::element::Type_t::i64: + CASE(i16, i64); + case ov::element::Type_t::u32: + CASE(i16, u32); + case ov::element::Type_t::i32: + CASE(i16, i32); + case ov::element::Type_t::u16: + CASE(i16, u16); + case ov::element::Type_t::u8: + CASE(i16, u8); + case ov::element::Type_t::i8: + CASE(i16, i8); + case ov::element::Type_t::f16: + CASE(i16, f16); + case ov::element::Type_t::bf16: + CASE(i16, bf16); + default: + OPENVINO_THROW("Unsupported combination of precisions ", inPrecision.get_type_name(), " -> ", + outPrecision.get_type_name()); + } + break; + } + case ov::element::Type_t::u8: { + switch (outPrecision) { + case ov::element::Type_t::f64: + CASE(u8, f64); + case ov::element::Type_t::f32: + CASE(u8, f32); + case ov::element::Type_t::u64: + CASE(u8, u64); + case ov::element::Type_t::i64: + CASE(u8, i64); + case ov::element::Type_t::u32: + CASE(u8, u32); + case ov::element::Type_t::i32: + CASE(u8, i32); + case ov::element::Type_t::u16: + CASE(u8, u16); + case ov::element::Type_t::i16: + 
CASE(u8, i16); + case ov::element::Type_t::i8: + CASE(u8, i8); + case ov::element::Type_t::f16: + CASE(u8, f16); + case ov::element::Type_t::bf16: + CASE(u8, bf16); + default: + OPENVINO_THROW("Unsupported combination of precisions ", inPrecision.get_type_name(), " -> ", + outPrecision.get_type_name()); + } + break; + } + case ov::element::Type_t::i8: { + switch (outPrecision) { + case ov::element::Type_t::f64: + CASE(i8, f64); + case ov::element::Type_t::f32: + CASE(i8, f32); + case ov::element::Type_t::u64: + CASE(i8, u64); + case ov::element::Type_t::i64: + CASE(i8, i64); + case ov::element::Type_t::u32: + CASE(i8, u32); + case ov::element::Type_t::i32: + CASE(i8, i32); + case ov::element::Type_t::u16: + CASE(i8, u16); + case ov::element::Type_t::i16: + CASE(i8, i16); + case ov::element::Type_t::u8: + CASE(i8, u8); + case ov::element::Type_t::f16: + CASE(i8, f16); + case ov::element::Type_t::bf16: + CASE(i8, bf16); + default: + OPENVINO_THROW("Unsupported combination of precisions ", inPrecision.get_type_name(), " -> ", + outPrecision.get_type_name()); + } + break; + } + default: + OPENVINO_THROW("Unsupported combination of precisions ", inPrecision.get_type_name(), " -> ", + outPrecision.get_type_name()); + } + +#undef CASE +} + +ov::Tensor toPrecision(const ov::Tensor& in, const ov::element::Type& precision, void* ptr) { + if (in.get_element_type() == precision && ptr == nullptr) { + return in; + } + + ov::Tensor out; + + if (ptr == nullptr) { + out = ov::Tensor(precision, in.get_shape()); + } else { + out = ov::Tensor(precision, in.get_shape(), ptr); + } + + convertTensorPrecision(in, out); + + return out; +} + +std::vector> parseTensorsAsFP32(const std::map& tensors) { + std::vector> results; + + for (const auto& tensor : tensors) { + const ov::Tensor tensorFP32 = toFP32(tensor.second); + const auto dataBuffer = tensorFP32.data(); + OPENVINO_ASSERT(dataBuffer != nullptr); + + const size_t size = tensorFP32.get_size(); + std::vector result(size); + 
std::copy_n(dataBuffer, size, result.begin()); + + results.push_back(result); + } + + return results; +} + +} // namespace utils +} // namespace npu diff --git a/src/plugins/intel_npu/tools/compile_tool/CMakeLists.txt b/src/plugins/intel_npu/tools/compile_tool/CMakeLists.txt new file mode 100644 index 00000000000000..9d1e1d38b48188 --- /dev/null +++ b/src/plugins/intel_npu/tools/compile_tool/CMakeLists.txt @@ -0,0 +1,56 @@ +# +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +set(TARGET_NAME compile_tool) + +if (NOT DEFINED PROJECT_NAME) + cmake_minimum_required(VERSION 3.13 FATAL_ERROR) + project(compile_tool_standalone) + include("cmake/standalone.cmake") + return() +endif() + +# +# Define the target +# + +ov_add_target(ADD_CPPLINT + TYPE EXECUTABLE + NAME ${TARGET_NAME} + ROOT ${CMAKE_CURRENT_SOURCE_DIR} + LINK_LIBRARIES + PRIVATE + openvino::runtime + gflags + Threads::Threads) + +set_target_properties(${TARGET_NAME} PROPERTIES + FOLDER ${CMAKE_CURRENT_SOURCE_DIR} + CXX_STANDARD 17) + +if (CMAKE_COMPILER_IS_GNUCXX) + target_compile_options(${TARGET_NAME} PRIVATE -Wall) +endif() + +# TODO: fix warnings and remove this exception +if(CMAKE_COMPILER_IS_GNUCXX OR OV_COMPILER_IS_CLANG) + ov_add_compiler_flags(-Wno-missing-declarations) +endif() + +# +# Install +# + +install(TARGETS ${TARGET_NAME} + RUNTIME DESTINATION "tools/${TARGET_NAME}" + COMPONENT ${NPU_INTERNAL_COMPONENT} + ${OV_CPACK_COMP_NPU_INTERNAL_EXCLUDE_ALL}) + +if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/README.md") + install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/README.md" + DESTINATION "tools/${TARGET_NAME}" + COMPONENT ${NPU_INTERNAL_COMPONENT} + ${OV_CPACK_COMP_NPU_INTERNAL_EXCLUDE_ALL}) +endif() diff --git a/src/plugins/intel_npu/tools/compile_tool/README.md b/src/plugins/intel_npu/tools/compile_tool/README.md new file mode 100644 index 00000000000000..daebe97b40620c --- /dev/null +++ b/src/plugins/intel_npu/tools/compile_tool/README.md @@ -0,0 +1,101 @@ +# NPU 
Compile Tool + +This page demonstrates how to use NPU Compile Tool to convert OpenVINO™ Intermediate Representation (IR) of an AI model or a model in ONNX format to a "blob" file that is compiled by NPU NN Compiler and serialized to the format accessible for NPU Driver and NPU Runtime to execute. + + +## Description + +Compile tool is a C++ application that enables you to compile a model for inference on a specific device and export the compiled representation to a binary file. +With this tool, you can compile a model using supported OpenVINO Runtime devices on a machine that does not have the physical device connected, i.e. without NPU driver and Runtime loading, and then transfer a generated file to any machine with the target inference device available. + +Using Compile Tool is not a basic approach to end-to-end execution and/or application but mostly suitable for debugging and validation and some specific use cases. If one is looking for the standard way of reducing application startup delays by exporting and reusing the compiled model automatically, refer to [Model Caching article](https://docs.openvino.ai/2024/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.html#model-caching) + +## Workflow of the Compile tool + +First, the application reads command-line parameters and loads a model to the OpenVINO Runtime device. After that, the application exports a blob with the compiled model and writes it to the output file. + +## How to build + +### Within NPU Plugin build + +See [How to build](https://github.com/openvinotoolkit/openvino/wiki#how-to-build). If `ENABLE_INTEL_NPU=ON` is provided, no additional steps are required for Compile Tool. It will be built unconditionally with every NPU Plugin build. It can be found in `bin` folder. + +If you need to configure a release package layout and have Compile Tool in it, use `cmake --install --component npu_internal` from your `build` folder. 
After installation compile_tool executable can be found in `/tools/compile_tool` folder. + +### Standalone build + +#### Prerequisites +* [OpenVINO™ Runtime release package](https://docs.openvino.ai/2024/get-started/install-openvino.html) + +#### Build instructions +1. Download and install OpenVINO™ Runtime package +2. Build Compile Tool + ```sh + mkdir compile_tool_build && cd compile_tool_build + cmake -DOpenVINO_DIR=/runtime/cmake + cmake --build . --config Release + cmake --install . --prefix + ``` + > Note 1: command line instruction might differ on different platforms (e.g. Windows cmd) + > Note 2: this example is based on OpenVINO Archive distribution. If you have chosen another installation method, specifying OpenVINO_DIR and calling `setupvars` script might not be needed. Refer [documentation](https://docs.openvino.ai/2024/get-started/install-openvino.html) for details. + > Note 3: `` can be any directory on your filesystem that you want to use for installation including `` if you wish to extend OpenVINO package +3. Verify the installation + ```sh + source /setupvars.sh + /tools/compile_tool/compile_tool -h + ``` + > Note 1: command line might differ depending on your platform + > Note 2: this example is based on OpenVINO Archive distribution. If you have chosen another installation method, calling setupvars might not be needed. Refer [documentation](https://docs.openvino.ai/2024/get-started/install-openvino.html) for details. + + Successful build will show the information about Compile Tool CLI options + + +## How to run + +Running the application with the `-h` option yields the following usage message: +``` +OpenVINO Runtime version ......... 202x.y.z +Build ........... 202x.y.z-build-hash +Parsing command-line arguments +compile_tool [OPTIONS] + + Common options: + -h Optional. Print the usage message. + -m Required. Path to the XML model. + -d Required. Specify a target device for which executable network will be compiled. 
+ Use "-d HETERO:" format to specify HETERO plugin. + Use "-d MULTI:" format to specify MULTI plugin. + The application looks for a suitable plugin for the specified device. + -o Optional. Path to the output file. Default value: ".blob". + -c Optional. Path to the configuration file. + -ip Optional. Specifies precision for all input layers of the network. + -op Optional. Specifies precision for all output layers of the network. + -iop "" Optional. Specifies precision for input and output layers by name. + Example: -iop "input:FP16, output:FP16". + Notice that quotes are required. + Overwrites precision from ip and op options for specified layers. + -il Optional. Specifies layout for all input layers of the network. + -ol Optional. Specifies layout for all output layers of the network. + -iol "" Optional. Specifies layout for input and output layers by name. + Example: -iol "input:NCHW, output:NHWC". + Notice that quotes are required. + Overwrites layout from il and ol options for specified layers. + -iml Optional. Specifies model layout for all input layers of the network. + -oml Optional. Specifies model layout for all output layers of the network. + -ioml "" Optional. Specifies model layout for input and output tensors by name. + Example: -ionl "input:NCHW, output:NHWC". + Notice that quotes are required. + Overwrites layout from il and ol options for specified layers. + -shape Set shape for model input. For example, "input1[1,3,224,224],input2[1,4]" or "[1,3,224,224]" in case of one input size. This parameter affect model input shape and can be dynamic. For dynamic dimensions use symbol `?` or '-1'. Ex. [?,3,?,?]. For bounded dimensions specify range 'min..max'. Ex. [1..10,3,?,?]. +``` +Running the application with the empty list of options yields an error message. 
+ +For example, to compile a blob for inference on Intel® Core™ Ultra NPU, run the command below +``` +./compile_tool -m /model_name.xml -d NPU.3720 +``` + +You can pass a config file via `-c` option which allows you to specify some public or private properties. More details in [Supported Properties](https://github.com/openvinotoolkit/openvino/tree/master/src/plugins/intel_npu#supported-properties) and in [configs](https://github.com/openvinotoolkit/openvino/tree/master/src/plugins/intel_npu/src/al/include/intel_npu/al/config). For example, to use a custom build of NPU Compiler instead of the release Compiler distributed within NPU driver, create a config file with the following content: +``` +NPU_COMPILER_TYPE MLIR +``` diff --git a/src/plugins/intel_npu/tools/compile_tool/cmake/standalone.cmake b/src/plugins/intel_npu/tools/compile_tool/cmake/standalone.cmake new file mode 100644 index 00000000000000..1bd9e2946de56e --- /dev/null +++ b/src/plugins/intel_npu/tools/compile_tool/cmake/standalone.cmake @@ -0,0 +1,47 @@ +# +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +if("${CMAKE_BUILD_TYPE}" STREQUAL "") + set(CMAKE_BUILD_TYPE "Release") +endif() + +# +# OpenVINO package path should be specified via OpenVINO_DIR +# + +find_package(Threads REQUIRED) +find_package(OpenVINO REQUIRED COMPONENTS Runtime) + +# +# gflags is distributed in sources in OpenVINO packages so we need to build it explicitly +# + +if(EXISTS "${PACKAGE_PREFIX_DIR}/samples/cpp/thirdparty/gflags") + add_subdirectory("${PACKAGE_PREFIX_DIR}/samples/cpp/thirdparty/gflags" gflags EXCLUDE_FROM_ALL) +else() + find_package(gflags REQUIRED) +endif() + +set(DEPENDENCIES + Threads::Threads + gflags + openvino::runtime +) + +# NOTE(review): ${TARGET_NAME} is not created yet at this point (add_executable comes later in this file), +# so target_compile_options() on it would fail at configure time; apply -Wall with directory scope instead. +if (CMAKE_COMPILER_IS_GNUCXX) + add_compile_options(-Wall) +endif() + +file(GLOB SOURCES
"${CMAKE_CURRENT_SOURCE_DIR}/*.cpp") + +add_executable(${TARGET_NAME} ${SOURCES}) +target_link_libraries(${TARGET_NAME} ${DEPENDENCIES}) + +install(TARGETS ${TARGET_NAME} + DESTINATION "tools/${TARGET_NAME}" + COMPONENT npu_tools) diff --git a/src/plugins/intel_npu/tools/compile_tool/main.cpp b/src/plugins/intel_npu/tools/compile_tool/main.cpp new file mode 100644 index 00000000000000..4da65962be6aaf --- /dev/null +++ b/src/plugins/intel_npu/tools/compile_tool/main.cpp @@ -0,0 +1,593 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "openvino/core/partial_shape.hpp" +#include "openvino/openvino.hpp" + +#include "tools_helpers.hpp" + +static constexpr char help_message[] = "Optional. Print the usage message."; + +static constexpr char model_message[] = "Required. Path to the XML model."; + +static constexpr char targetDeviceMessage[] = + "Required. Specify a target device for which executable network will be compiled.\n" + " Use \"-d HETERO:\" format to " + "specify HETERO plugin.\n" + " Use \"-d MULTI:\" format to " + "specify MULTI plugin.\n" + " The application looks for a suitable plugin for the specified " + "device."; + +static constexpr char output_message[] = "Optional. Path to the output file. Default value: \".blob\"."; + +static constexpr char log_level_message[] = "Optional. Log level for OpenVINO library."; + +static constexpr char config_message[] = "Optional. Path to the configuration file."; + +static constexpr char inputs_precision_message[] = "Optional. Specifies precision for all input layers of the network."; + +static constexpr char outputs_precision_message[] = + "Optional. Specifies precision for all output layers of the network."; + +static constexpr char iop_message[] = + "Optional. 
Specifies precision for input and output layers by name.\n" + " Example: -iop \"input:FP16, output:FP16\".\n" + " Notice that quotes are required.\n" + " Overwrites precision from ip and op options for specified " + "layers."; + +static constexpr char inputs_layout_message[] = "Optional. Specifies layout for all input layers of the network."; + +static constexpr char outputs_layout_message[] = "Optional. Specifies layout for all output layers of the network."; + +static constexpr char iol_message[] = + "Optional. Specifies layout for input and output layers by name.\n" + " Example: -iol \"input:NCHW, output:NHWC\".\n" + " Notice that quotes are required.\n" + " Overwrites layout from il and ol options for specified layers."; + +static constexpr char inputs_model_layout_message[] = + "Optional. Specifies model layout for all input layers of the network."; + +static constexpr char outputs_model_layout_message[] = + "Optional. Specifies model layout for all output layers of the network."; + +static constexpr char ioml_message[] = + "Optional. Specifies model layout for input and output tensors by name.\n" + " Example: -ionl \"input:NCHW, output:NHWC\".\n" + " Notice that quotes are required.\n" + " Overwrites layout from il and ol options for specified layers."; + +static const char shape_message[] = + " Set shape for model input. For example, \"input1[1,3,224,224],input2[1,4]\" or \"[1,3,224,224]\"" + " in case of one input size. This parameter affect model input shape and can be dynamic." + " For dynamic dimensions use symbol `?` or '-1'. Ex. [?,3,?,?]." + " For bounded dimensions specify range 'min..max'. Ex. 
[1..10,3,?,?]."; + +static const char override_model_batch_size[] = "Enforce a model to be compiled for batch size"; + +DEFINE_bool(h, false, help_message); +DEFINE_string(m, "", model_message); +DEFINE_string(d, "", targetDeviceMessage); +DEFINE_string(o, "", output_message); +DEFINE_string(log_level, "", log_level_message); +DEFINE_string(c, "", config_message); +DEFINE_string(ip, "", inputs_precision_message); +DEFINE_string(op, "", outputs_precision_message); +DEFINE_string(iop, "", iop_message); +DEFINE_string(il, "", inputs_layout_message); +DEFINE_string(ol, "", outputs_layout_message); +DEFINE_string(iol, "", iol_message); +DEFINE_string(iml, "", inputs_model_layout_message); +DEFINE_string(oml, "", outputs_model_layout_message); +DEFINE_string(ioml, "", ioml_message); +DEFINE_string(shape, "", shape_message); +DEFINE_uint32(override_model_batch_size, 1, override_model_batch_size); + +namespace { +std::vector splitStringList(const std::string& str, char delim) { + if (str.empty()) + return {}; + + std::istringstream istr(str); + + std::vector result; + std::string elem; + while (std::getline(istr, elem, delim)) { + if (elem.empty()) { + continue; + } + result.emplace_back(std::move(elem)); + } + + return result; +} + +std::map parseArgMap(std::string argMap) { + argMap.erase(std::remove_if(argMap.begin(), argMap.end(), ::isspace), argMap.end()); + + const auto pairs = splitStringList(argMap, ','); + + std::map parsedMap; + for (auto&& pair : pairs) { + const auto lastDelimPos = pair.find_last_of(':'); + auto key = pair.substr(0, lastDelimPos); + auto value = pair.substr(lastDelimPos + 1); + + if (lastDelimPos == std::string::npos || key.empty() || value.empty()) { + throw std::invalid_argument("Invalid key/value pair " + pair + ". 
Expected :"); + } + + parsedMap[std::move(key)] = std::move(value); + } + + return parsedMap; +} +} // namespace +using supported_type_t = std::unordered_map; +ov::element::Type getType(std::string value, const supported_type_t& supported_precisions) { + std::transform(value.begin(), value.end(), value.begin(), ::toupper); + + const auto precision = supported_precisions.find(value); + if (precision == supported_precisions.end()) { + throw std::logic_error("\"" + value + "\"" + " is not a valid precision"); + } + + return precision->second; +} +ov::element::Type getType(const std::string& value) { + static const supported_type_t supported_types = { + {"FP32", ov::element::f32}, {"f32", ov::element::f32}, {"FP16", ov::element::f16}, + {"f16", ov::element::f16}, {"BF16", ov::element::bf16}, {"bf16", ov::element::bf16}, + {"U64", ov::element::u64}, {"u64", ov::element::u64}, {"I64", ov::element::i64}, + {"i64", ov::element::i64}, {"U32", ov::element::u32}, {"u32", ov::element::u32}, + {"I32", ov::element::i32}, {"i32", ov::element::i32}, {"U16", ov::element::u16}, + {"u16", ov::element::u16}, {"I16", ov::element::i16}, {"i16", ov::element::i16}, + {"U8", ov::element::u8}, {"u8", ov::element::u8}, {"I8", ov::element::i8}, + {"i8", ov::element::i8}, {"BOOL", ov::element::boolean}, {"boolean", ov::element::boolean}, + }; + + return getType(value, supported_types); +} + +bool isFP32(const ov::element::Type& type) { + return type == ov::element::f32; +} + +void boundDynamicShape(std::shared_ptr& model) { + for (auto&& item : model->get_parameters()) { + auto shape = item->get_partial_shape(); + if (shape.is_static()) { + continue; + } + auto rank = shape.rank(); + if (rank.is_dynamic()) { + throw std::logic_error("Rank \"" + rank.to_string() + "\" of the shape \"" + shape.to_string() + + "\" is dynamic which is not supported by NPU"); + } + auto layout = item->get_layout(); + if (!ov::layout::has_batch(layout)) { + item->set_layout(ov::Layout(layout.to_string().insert(1, 
"N,"))); + layout = item->get_layout(); + } + if (shape[ov::layout::batch_idx(layout)].is_dynamic()) { + std::cout << "WARNING: Shape \"" + shape.to_string() + "\"" + + " has dynamic batch size which is not supported by NPU\n" + " Setting batch to 1 forcibly" + << std::endl; + ov::set_batch(model, 1); + } + shape = item->get_partial_shape(); + if (shape.is_dynamic()) { + throw std::logic_error("Model's input shape \"" + shape.to_string() + "\"" + + " is dynamic which is not supported by NPU"); + } + } +} + +void setModelBatch(std::shared_ptr& model, uint32_t batch = 1) { + if (batch == 1) { + return; + } + for (auto&& item : model->get_parameters()) { + auto shape = item->get_partial_shape(); + auto rank = shape.rank(); + if (rank.is_dynamic()) { + throw std::logic_error("Rank \"" + rank.to_string() + "\" of the shape \"" + shape.to_string() + + "\" is dynamic which is not supported by NPU"); + } + auto layout = item->get_layout(); + if (!ov::layout::has_batch(layout)) { + item->set_layout(ov::Layout(layout.to_string().insert(1, "N,"))); + layout = item->get_layout(); + } + if (shape[ov::layout::batch_idx(layout)].is_dynamic()) { + throw std::logic_error("ERROR: Shape \"" + shape.to_string() + "\"" + + " has dynamic batch size which is not supported by NPU\n" + "Cannot apply fixed batch: " + + std::to_string(batch) + + ". 
Please remove the parameter from config: \"override_model_batch_size\""); + } + ov::set_batch(model, batch); + } +} + +void configurePrePostProcessing(std::shared_ptr& model, const std::string& ip, const std::string& op, + const std::string& iop, const std::string& il, const std::string& ol, + const std::string& iol, const std::string& iml, const std::string& oml, + const std::string& ioml) { + auto preprocessor = ov::preprocess::PrePostProcessor(model); + const auto inputs = model->inputs(); + const auto outputs = model->outputs(); + + if (!ip.empty()) { + auto type = getType(ip); + for (size_t i = 0; i < inputs.size(); i++) { + preprocessor.input(i).tensor().set_element_type(type); + } + } + + if (!op.empty()) { + auto type = getType(op); + for (size_t i = 0; i < outputs.size(); i++) { + preprocessor.output(i).tensor().set_element_type(type); + } + } + + if (!iop.empty()) { + const auto user_precisions_map = parseArgMap(iop); + for (auto&& item : user_precisions_map) { + const auto& tensor_name = item.first; + const auto type = getType(item.second); + + bool tensorFound = false; + for (size_t i = 0; i < inputs.size(); i++) { + if (inputs[i].get_names().count(tensor_name)) { + preprocessor.input(i).tensor().set_element_type(type); + tensorFound = true; + break; + } + } + if (!tensorFound) { + for (size_t i = 0; i < outputs.size(); i++) { + if (outputs[i].get_names().count(tensor_name)) { + preprocessor.output(i).tensor().set_element_type(type); + tensorFound = true; + break; + } + } + } + OPENVINO_ASSERT(tensorFound, "Model doesn't have input/output with tensor name: ", tensor_name); + } + } + if (!il.empty()) { + for (size_t i = 0; i < inputs.size(); i++) { + preprocessor.input(i).tensor().set_layout(ov::Layout(il)); + } + } + + if (!ol.empty()) { + for (size_t i = 0; i < outputs.size(); i++) { + preprocessor.output(i).tensor().set_layout(ov::Layout(ol)); + } + } + + if (!iol.empty()) { + const auto user_precisions_map = parseArgMap(iol); + for (auto&& item : 
user_precisions_map) { + const auto& tensor_name = item.first; + + bool tensorFound = false; + for (size_t i = 0; i < inputs.size(); i++) { + if (inputs[i].get_names().count(tensor_name)) { + preprocessor.input(i).tensor().set_layout(ov::Layout(item.second)); + tensorFound = true; + break; + } + } + if (!tensorFound) { + for (size_t i = 0; i < outputs.size(); i++) { + if (outputs[i].get_names().count(tensor_name)) { + preprocessor.output(i).tensor().set_layout(ov::Layout(item.second)); + tensorFound = true; + break; + } + } + } + OPENVINO_ASSERT(tensorFound, "Model doesn't have input/output with tensor name: ", tensor_name); + } + } + + if (!iml.empty()) { + for (size_t i = 0; i < inputs.size(); i++) { + preprocessor.input(i).model().set_layout(ov::Layout(iml)); + } + } + + if (!oml.empty()) { + for (size_t i = 0; i < outputs.size(); i++) { + preprocessor.output(i).model().set_layout(ov::Layout(oml)); + } + } + + if (!ioml.empty()) { + const auto user_precisions_map = parseArgMap(ioml); + for (auto&& item : user_precisions_map) { + const auto& tensor_name = item.first; + + bool tensorFound = false; + for (size_t i = 0; i < inputs.size(); i++) { + if (inputs[i].get_names().count(tensor_name)) { + preprocessor.input(i).model().set_layout(ov::Layout(item.second)); + tensorFound = true; + break; + } + } + if (!tensorFound) { + for (size_t i = 0; i < outputs.size(); i++) { + if (outputs[i].get_names().count(tensor_name)) { + preprocessor.output(i).model().set_layout(ov::Layout(item.second)); + tensorFound = true; + break; + } + } + } + OPENVINO_ASSERT(tensorFound, "Model doesn't have input/output with tensor name: ", tensor_name); + } + } + + model = preprocessor.build(); +} + +void printInputAndOutputsInfoShort(const ov::Model& network) { + std::cout << "Network inputs:" << std::endl; + for (auto&& param : network.get_parameters()) { + auto l = param->get_layout(); + std::cout << " " << param->get_friendly_name() << " : " << param->get_element_type() << " / " + << 
param->get_layout().to_string() << " / " << param->get_partial_shape().to_string() << std::endl; + } + std::cout << "Network outputs:" << std::endl; + for (auto&& result : network.get_results()) { + std::cout << " " << result->get_friendly_name() << " : " << result->get_element_type() << " / " + << result->get_layout().to_string() << " / " << result->get_output_partial_shape(0).to_string() + << std::endl; + } +} + +inline std::string fileNameNoExt(const std::string& filepath) { + auto pos = filepath.rfind('.'); + if (pos == std::string::npos) + return filepath; + return filepath.substr(0, pos); +} + +static void showUsage() { + std::cout << "compile_tool [OPTIONS]" << std::endl; + std::cout << std::endl; + std::cout << " Common options: " << std::endl; + std::cout << " -h " << help_message << std::endl; + std::cout << " -m " << model_message << std::endl; + std::cout << " -d " << targetDeviceMessage << std::endl; + std::cout << " -o " << output_message << std::endl; + std::cout << " -c " << config_message << std::endl; + std::cout << " -ip " << inputs_precision_message << std::endl; + std::cout << " -op " << outputs_precision_message << std::endl; + std::cout << " -iop \"\" " << iop_message << std::endl; + std::cout << " -il " << inputs_layout_message << std::endl; + std::cout << " -ol " << outputs_layout_message << std::endl; + std::cout << " -iol \"\" " << iol_message << std::endl; + std::cout << " -iml " << inputs_model_layout_message << std::endl; + std::cout << " -oml " << outputs_model_layout_message << std::endl; + std::cout << " -ioml \"\" " << ioml_message << std::endl; + std::cout << " -shape " << shape_message << std::endl; + std::cout << std::endl; +} + +static bool parseCommandLine(int* argc, char*** argv) { + gflags::ParseCommandLineNonHelpFlags(argc, argv, true); + + if (FLAGS_h) { + showUsage(); + return false; + } + + if (FLAGS_m.empty()) { + throw std::invalid_argument("Path to model xml file is required"); + } + + if (FLAGS_d.empty()) { + throw 
std::invalid_argument("Target device name is required"); + } + + if (1 < *argc) { + std::stringstream message; + message << "Unknown arguments: "; + for (auto arg = 1; arg < *argc; arg++) { + message << (*argv)[arg]; + if (arg < *argc) { + message << " "; + } + } + throw std::invalid_argument(message.str()); + } + + return true; +} + +static std::map parseConfigFile(char comment = '#') { + std::map config; + + std::ifstream file(FLAGS_c); + if (file.is_open()) { + std::string option; + while (std::getline(file, option)) { + if (option.empty() || option[0] == comment) { + continue; + } + size_t spacePos = option.find_first_of(" \t\n\r"); + OPENVINO_ASSERT(spacePos != std::string::npos, "Failed to find a space separator in " + "provided plugin config option: " + + option); + + std::string key = option.substr(0, spacePos); + + std::string value{}; + size_t valueStart = option.find_first_not_of(" \t\n\r", spacePos); + OPENVINO_ASSERT(valueStart != std::string::npos, "An invalid config parameter value detected, " + "it mustn't be empty: " + + option); + size_t valueEnd = option.find_last_not_of(" \t\n\r"); + value = option.substr(valueStart, valueEnd - valueStart + 1); + + config[key] = std::move(value); + } + } + return config; +} + +std::string getFileNameFromPath(const std::string& path, +#if defined(_WIN32) + const std::string& sep = "\\") { +#else + const std::string& sep = "/") { +#endif + const auto pos = path.rfind(sep); + if (std::string::npos == pos) { + return path; + } else { + return path.substr(pos + 1); + } +} + +using TimeDiff = std::chrono::milliseconds; + +void reshape(ov::OutputVector inputs_info, InputsInfo& info_map, std::shared_ptr& model) { + std::vector info_maps; + if (!FLAGS_shape.empty()) { + std::map> shapes_map = parseInputParameters(FLAGS_shape, inputs_info); + + if (FLAGS_override_model_batch_size != 1) { + throw std::logic_error("Incompatible params: \"shape\" and \"override_model_batch_size\""); + } + for (auto& item : inputs_info) { + 
InputInfo info; + auto name = item.get_any_name(); + + if (!shapes_map.empty()) { + if (shapes_map.count(name)) { + if (shapes_map.at(name).size() > 1) { + // Example: -shape input1[..][..] + throw std::logic_error("shape command line parameter doesn't support multiple " + "shapes for one input."); + } + info.partialShape = shapes_map.at(name)[0]; + } else { + info.partialShape = item.get_partial_shape(); + } + } + info_map[name] = info; + info_maps.push_back(info_map); + } + std::map newShapes; + for (auto& item : info_maps) { + for (auto& map : item) { + if (!newShapes.count(map.first)) { + newShapes[map.first] = map.second.partialShape; + } + } + } + model->reshape(newShapes); + } else { + if (FLAGS_d.find("NPU") != std::string::npos) { + boundDynamicShape(model); + } + + setModelBatch(model, FLAGS_override_model_batch_size); + } +} + +int main(int argc, char* argv[]) { + try { + TimeDiff loadNetworkTimeElapsed{0}; + + const auto& version = ov::get_openvino_version(); + std::cout << version.description << " version ......... "; + std::cout << OPENVINO_VERSION_MAJOR << "." << OPENVINO_VERSION_MINOR << "." << OPENVINO_VERSION_PATCH + << std::endl; + + std::cout << "Build ........... 
"; + std::cout << version.buildNumber << std::endl; + std::cout << "Parsing command-line arguments" << std::endl; + if (!parseCommandLine(&argc, &argv)) { + return EXIT_SUCCESS; + } + + ov::Core core; + std::cout << "Checking FLAGS_LOG_LEVEL " << FLAGS_log_level << std::endl; + + if (!FLAGS_log_level.empty()) { + std::cout << "Setting log level " << FLAGS_log_level << std::endl; + ov::log::Level level; + std::stringstream{FLAGS_log_level} >> level; + core.set_property(FLAGS_d, ov::log::level(level)); + } + + std::cout << "Reading model" << std::endl; + auto model = core.read_model(FLAGS_m); + auto inputs_info = std::const_pointer_cast(model)->inputs(); + InputsInfo info_map; + + std::cout << "Performing reshape" << std::endl; + reshape(inputs_info, info_map, model); + + std::cout << "Configuring model pre & post processing" << std::endl; + configurePrePostProcessing(model, FLAGS_ip, FLAGS_op, FLAGS_iop, FLAGS_il, FLAGS_ol, FLAGS_iol, FLAGS_iml, + FLAGS_oml, FLAGS_ioml); + std::cout << "Printing Input and Output Info from model" << std::endl; + printInputAndOutputsInfoShort(*model); + auto timeBeforeLoadNetwork = std::chrono::steady_clock::now(); + std::cout << "Parsing configuration file" << std::endl; + auto configs = parseConfigFile(); + + std::cout << "Compiling model" << std::endl; + auto compiledModel = core.compile_model(model, FLAGS_d, {configs.begin(), configs.end()}); + loadNetworkTimeElapsed = + std::chrono::duration_cast(std::chrono::steady_clock::now() - timeBeforeLoadNetwork); + std::string outputName = FLAGS_o; + if (outputName.empty()) { + outputName = getFileNameFromPath(fileNameNoExt(FLAGS_m)) + ".blob"; + } + + std::ofstream outputFile{outputName, std::ios::out | std::ios::binary}; + if (!outputFile.is_open()) { + std::cout << "Outputting file " << outputName << " can't be opened for writing" << std::endl; + return EXIT_FAILURE; + } else { + std::cout << "Writing into file - " << outputName << std::endl; + compiledModel.export_model(outputFile); + 
} + std::cout << "Done. LoadNetwork time elapsed: " << loadNetworkTimeElapsed.count() << " ms" << std::endl; + } catch (const std::exception& error) { + std::cerr << error.what() << std::endl; + return EXIT_FAILURE; + } catch (...) { + std::cerr << "Unknown/internal exception happened." << std::endl; + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +} diff --git a/src/plugins/intel_npu/tools/compile_tool/tools_helpers.hpp b/src/plugins/intel_npu/tools/compile_tool/tools_helpers.hpp new file mode 100644 index 00000000000000..6d42fd142b8971 --- /dev/null +++ b/src/plugins/intel_npu/tools/compile_tool/tools_helpers.hpp @@ -0,0 +1,81 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "openvino/openvino.hpp" + +struct InputInfo { + ov::element::Type type; + ov::PartialShape partialShape; + ov::Shape dataShape; + ov::Layout layout; +}; +using InputsInfo = std::map; + +std::string parameterNameToTensorName(std::string& name, std::vector>& inputs_info) { + auto count_name = std::any_of(inputs_info.begin(), inputs_info.end(), [name](ov::Output& port) { + return port.get_names().count(name) > 0; + }); + if (count_name) { + return name; + } else { + auto inputInfo = std::find_if(inputs_info.begin(), inputs_info.end(), [name](ov::Output& port) { + return name == port.get_node()->get_friendly_name(); + }); + if (inputInfo == inputs_info.end()) { + throw std::runtime_error("Provided I/O name \"" + name + + "\" is not found neither in tensor names nor in nodes names."); + } + return inputInfo->get_any_name(); + } +} + +std::map> parseInputParameters(std::string& parameter_string, + std::vector>& input_info) { + // Parse parameter string like "input0[value0],input1[value1]" or "[value]" (applied to all + // inputs) + std::map> return_value; + std::string search_string = parameter_string; + auto start_pos = 
search_string.find_first_of('['); + auto input_name = search_string.substr(0, start_pos); + while (start_pos != std::string::npos) { + auto end_pos = search_string.find_first_of(']'); + if (end_pos == std::string::npos) + break; + input_name = search_string.substr(0, start_pos); + auto input_value = search_string.substr(start_pos + 1, end_pos - start_pos - 1); + if (!input_name.empty()) { + return_value[parameterNameToTensorName(input_name, input_info)].push_back(input_value); + } else { + for (auto& item : input_info) { + return_value[item.get_any_name()].push_back(input_value); + } + } + search_string = search_string.substr(end_pos + 1); + if (search_string.empty() || (search_string.front() != ',' && search_string.front() != '[')) + break; + if (search_string.front() == ',') { + if (search_string.length() > 1) + search_string = search_string.substr(1); + else + throw std::logic_error("Can't parse input parameter string, there is nothing after the comma " + + parameter_string); + } + start_pos = search_string.find_first_of('['); + } + if (!search_string.empty()) + throw std::logic_error("Can't parse input parameter string: " + parameter_string); + return return_value; +} diff --git a/src/plugins/intel_npu/tools/single-image-test/CMakeLists.txt b/src/plugins/intel_npu/tools/single-image-test/CMakeLists.txt new file mode 100644 index 00000000000000..4bcfb0bc936bf8 --- /dev/null +++ b/src/plugins/intel_npu/tools/single-image-test/CMakeLists.txt @@ -0,0 +1,78 @@ +# +# Copyright (C) 2022-2024 Intel Corporation. 
+# SPDX-License-Identifier: Apache-2.0 +# + +set(TARGET_NAME single-image-test) + +if (NOT DEFINED PROJECT_NAME) + cmake_minimum_required(VERSION 3.13 FATAL_ERROR) + project(single-image-test_standalone) + include("cmake/standalone.cmake") + return() +endif() + +find_package(OpenCV QUIET COMPONENTS core imgproc imgcodecs) + +# +# check for missing dependencies +# + +set(MISSING_DEPENDENCIES "") +foreach(LIB opencv_core opencv_imgproc opencv_imgcodecs) + if(NOT TARGET ${LIB}) + list(APPEND MISSING_DEPENDENCIES ${LIB}) + endif() +endforeach() + +if(NOT MISSING_DEPENDENCIES STREQUAL "") + message(WARNING "${TARGET_NAME} tool is disabled due to missing dependencies: ${MISSING_DEPENDENCIES}") + return() +endif() + +# +# Define the target +# + +ov_add_target(ADD_CPPLINT + TYPE EXECUTABLE + NAME ${TARGET_NAME} + ROOT ${CMAKE_CURRENT_SOURCE_DIR} + LINK_LIBRARIES + PRIVATE + openvino::runtime + TBB::tbb + opencv_core + opencv_imgproc + opencv_imgcodecs + npu_tools_utils + gflags) + +set_target_properties(${TARGET_NAME} PROPERTIES + FOLDER ${CMAKE_CURRENT_SOURCE_DIR} + CXX_STANDARD 17) + +if (CMAKE_COMPILER_IS_GNUCXX) + target_compile_options(${TARGET_NAME} PRIVATE -Wall) +endif() + +# TODO: fix warnings and remove this exception +if(CMAKE_COMPILER_IS_GNUCXX OR OV_COMPILER_IS_CLANG) + ov_add_compiler_flags(-Wno-missing-declarations) +endif() + +# +# Install +# + +install(TARGETS ${TARGET_NAME} + RUNTIME DESTINATION "tools/${TARGET_NAME}" + COMPONENT ${NPU_INTERNAL_COMPONENT} + ${OV_CPACK_COMP_NPU_INTERNAL_EXCLUDE_ALL}) + +if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/README.md") + install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/README.md" + DESTINATION "tools/${TARGET_NAME}" + COMPONENT ${NPU_INTERNAL_COMPONENT} + ${OV_CPACK_COMP_NPU_INTERNAL_EXCLUDE_ALL}) +endif() diff --git a/src/plugins/intel_npu/tools/single-image-test/README.md b/src/plugins/intel_npu/tools/single-image-test/README.md new file mode 100644 index 00000000000000..185b1c018b658c --- /dev/null +++ 
b/src/plugins/intel_npu/tools/single-image-test/README.md @@ -0,0 +1,284 @@ +# NPU Single Image Test Tool + +This page demonstrates how to use NPU Single Image Test Tool for end-to-end accuracy validation on a single image or input file with OpenVINO™ Intermediate Representation (IR) of an AI model or a model in ONNX format. + + +## Description + +Single Image Test Tool is a C++ application that enables you to pass OpenVINO IR or ONNX model or pre-compiled blob and a single image or any other compatible file with the model inputs and get 2 sets of files with CPU outputs and NPU outputs that can be compared later or straight after the inference if `-run_test` option is passed. + +The tool can be configured to perform various preprocessing methods and output comparison algorithms depending on the model topology and its output semantics. See the tool help message below for the details. + +Using Single Image Test is not a basic approach to end-to-end validation or collecting release measures but is created for development CI checks and debugging. However, some methodologies might be useful if you're looking for examples of how to use NPU and preprocess data for the inference on NPU. If you're looking for the standard way of measuring accuracy, please refer to [Deep Learning accuracy validation framework](https://github.com/openvinotoolkit/open_model_zoo/tree/master/tools/accuracy_checker). + + +## How to build + +### Within NPU Plugin build + +See [How to build](https://github.com/openvinotoolkit/openvino/wiki#how-to-build). If `ENABLE_INTEL_NPU=ON` is provided and `OpenCV` project is linked to the current cmake project, no additional steps are required for Single Image Test. It will be built unconditionally with every NPU Plugin build. It can be found in `bin` folder. + +If you need to configure a release package layout and have Single Image Test in it, use `cmake --install --component npu_internal` from your `build` folder.
After installation single-image-test executable can be found in `/tools/single-image-test` folder. + +### Standalone build + +#### Prerequisites +* [OpenVINO™ Runtime release package](https://docs.openvino.ai/2024/get-started/install-openvino.html) +* [OpenCV: Open Source Computer Vision Library release package](https://opencv.org/get-started/) + +#### Build instructions +1. Download and install OpenVINO™ Runtime package +1. Download and install OpenCV package +1. Build Single Image Test Tool + ```sh + mkdir sit_build && cd sit_build + source /setupvars.sh + cmake -DOpenVINO_DIR=/runtime/cmake -DOpenCV_DIR= + cmake --build . --config Release + cmake --install . --prefix + ``` + > Note 1: command line instruction might differ on different platforms (e.g. Windows cmd) + > Note 2: this example is based on OpenVINO Archive distribution. If you have chosen another installation method, specifying OpenVINO_DIR and calling setupvars might not be needed. Refer [documentation](https://docs.openvino.ai/2024/get-started/install-openvino.html) for details. + > Note 3: depending on OpenCV installation method, there might not be a need to specify OpenCV_DIR. + > Note 4: depending on OpenCV version, cmake configs might be located somewhere else. You need to specify a directory that contains `OpenCVConfig.cmake` file + > Note 5: `` can be any directory on your filesystem that you want to use for installation including `` if you wish to extend OpenVINO package +1. Verify the installation + ```sh + source /setupvars.sh + source setup_vars_opencv4.sh + /tools/single-image-test/single-image-test -help + ``` + > Note 1: command line might differ depending on your platform + > Note 2: depending on OpenCV installation method, there might not be a need to call setupvars. + > Note 3: this example is based on OpenVINO Archive distribution. If you have chosen another installation method, calling setupvars might not be needed. 
Refer [documentation](https://docs.openvino.ai/2024/get-started/install-openvino.html) for details. + + Successful build will show the information about Single Image Test Tool CLI options + + +## How to run + +Running the application with the `-help` option yields the following usage message: +``` +single-image-test.exe: Usage: Release\single-image-test.exe[] + + Flags from C:\Users\mdoronin\work\applications.ai.vpu-accelerators.vpux-plugin\tools\single-image-test\main.cpp: + -box_tolerance (Box tolerance for 'detection' mode) type: double + default: 0.0001 + -classes (Number of classes for Yolo V3) type: int32 default: 80 + -color_format (Color format for input: RGB or BGR) type: string + default: "BGR" + -compiled_blob (Output compiled network file (compiled result blob)) + type: string default: "" + -confidence_threshold (Confidence threshold for Detection mode) + type: double default: 0.0001 + -config (Path to the configuration file (optional)) type: string + default: "" + -coords (Number of coordinates for Yolo V3) type: int32 default: 4 + -cosim_threshold (Threshold for 'cosim' mode) type: double + default: 0.90000000000000002 + -dataset (The dataset used to train the model. Useful for instances such as + semantic segmentation to visualize the accuracy per-class) type: string + default: "NONE" + -device (Device to use) type: string default: "" + -il (Input layout) type: string default: "" + -img_as_bin (Force binary input even if network expects an image) + type: bool default: false + -img_bin_precision (Specify the precision of the binary input files. Eg: + 'FP32,FP16,I32,I64,U8') type: string default: "" + -iml (Model input layout) type: string default: "" + -input (Input file(s)) type: string default: "" + -ip (Input precision (default: U8, available: FP32, FP16, I32, I64, U8)) + type: string default: "" + -is_tiny_yolo (Is it Tiny Yolo or not (true or false)?) 
type: bool + default: false + -log_level (IE logger level (optional)) type: string default: "" + -mean_values (Optional. Mean values to be used for the input image per + channel. Values to be provided in the [channel1,channel2,channel3] + format. Can be defined for desired input of the model, for example: + "--mean_values data[255,255,255],info[255,255,255]". The exact meaning + and order of channels depend on how the original model was trained. + Applying the values affects performance and may cause type conversion) + type: string default: "" + -mode (Comparison mode to use) type: string default: "" + -network (Network file (either XML or pre-compiled blob)) type: string + default: "" + -normalized_image (Images in [0, 1] range or not) type: bool default: false + -nrmse_loss_threshold (Threshold for 'nrmse' mode) type: double default: 1 + -num (Number of scales for Yolo V3) type: int32 default: 3 + -ol (Output layout) type: string default: "" + -oml (Model output layout) type: string default: "" + -op (Output precision (default: FP32, available: FP32, FP16, I32, I64, U8)) + type: string default: "" + -override_model_batch_size (Enforce a model to be compiled for batch size) + type: uint32 default: 1 + -pc (Report performance counters) type: bool default: false + -prob_tolerance (Probability tolerance for 'classification/ssd/yolo' mode) + type: double default: 0.0001 + -psnr_reference (PSNR reference value in dB) type: double default: 30 + -psnr_tolerance (Tolerance for 'psnr' mode) type: double default: 0.0001 + -raw_tolerance (Tolerance for 'raw' mode (absolute diff)) type: double + default: 0.0001 + -rrmse_loss_threshold (Threshold for 'rrmse' mode) type: double + default: 1.7976931348623157e+308 + -run_test (Run the test (compare current results with previously dumped)) + type: bool default: false + -scale_border (Scale border) type: uint32 default: 4 + -scale_values (Optional. Scale values to be used for the input image per + channel. 
Values are provided in the [channel1,channel2,channel3] format. + Can be defined for desired input of the model, for example: + "--scale_values data[255,255,255],info[255,255,255]". The exact meaning + and order of channels depend on how the original model was trained. If + both --mean_values and --scale_values are specified, the mean is + subtracted first and then scale is applied regardless of the order of + options in command line. Applying the values affects performance and may + cause type conversion) type: string default: "" + -sem_seg_classes (Number of classes for semantic segmentation) type: uint32 + default: 12 + -sem_seg_ignore_label (The number of the label to be ignored) type: uint32 + default: 4294967295 + -sem_seg_threshold (Threshold for 'semantic segmentation' mode) + type: double default: 0.97999999999999998 + -top_k (Top K parameter for 'classification' mode) type: uint32 default: 1 +``` + +For example, to run inference with mobilenet-v2 model on Intel® Core™ Ultra NPU on Windows 11 OS, run the commands below: + +1. 
Running inference on CPU to collect reference result + ``` + single-image-test.exe \ + --network \ + mobilenet-v2.xml \ + --input \ + validation-set/224x224/watch.bmp \ + --ip \ + FP16 \ + --op \ + FP16 \ + --device \ + CPU \ + --color_format \ + RGB \ + --il \ + NCHW \ + --ol \ + NC \ + --iml \ + NCHW \ + --oml \ + NC + ``` + expected output: + ``` + Parameters: + Network file: mobilenet-v2.xml + Input file(s): validation-set/224x224/watch.bmp + Output compiled network file: + Color format: RGB + Input precision: FP16 + Output precision: FP16 + Input layout: NCHW + Output layout: NC + Model input layout: NCHW + Model output layout: NC + Img as binary: 0 + Bin input file precision: + Device: CPU + Config file: + Run test: 0 + Performance counters: 0 + Mean_values [channel1,channel2,channel3] + Scale_values [channel1,channel2,channel3] + Log level: + + Run single image test + Load network mobilenet-v2.xml + Load input #0 from validation-set/224x224/watch.bmp as f16 [N,C,H,W] [1,3,224,224] + Dump input #0_case_0 to _mobilenet_v2_input_0_case_0.blob + Run inference on CPU + Latency: 100 ms + Dump reference output #0 to _mobilenet_v2_ref_out_0_case_0.blob + ``` + +1. Running inference on NPU and comparing results. In this example, it's considered that the model has been compiled before and exported as a blob file. You can pass OpenVINO IR with the same success but depending on a model and your setup, some additional configs might be needed in a config file or in CLI. 
+ + ``` + single-image-test.exe \ + --network \ + mobilenet-v2.blob \ + --input \ + validation-set/224x224/watch.bmp \ + --ip \ + FP16 \ + --op \ + FP16 \ + --device \ + NPU \ + --config \ + mobilenet-v2.conf \ + --run_test \ + -log_level \ + LOG_ERROR \ + --mode \ + classification \ + --top_k \ + 1 \ + --prob_tolerance \ + 0.6 \ + --color_format \ + RGB \ + --il \ + NCHW \ + --ol \ + NC \ + --iml \ + NCHW \ + --oml \ + NC + ``` + + the content of mobilenet-v2.conf: + ``` + NPU_COMPILER_TYPE DRIVER + NPU_PLATFORM VPU3720 + ``` + + expected output: + ``` + Parameters: + Network file: mobilenet-v2.blob + Input file(s): validation-set/224x224/watch.bmp + Output compiled network file: + Color format: RGB + Input precision: FP16 + Output precision: FP16 + Input layout: NCHW + Output layout: NC + Model input layout: NCHW + Model output layout: NC + Img as binary: 0 + Bin input file precision: + Device: NPU + Config file: mobilenet-v2.conf + Run test: 1 + Performance counters: 0 + Mean_values [channel1,channel2,channel3] + Scale_values [channel1,channel2,channel3] + Mode: classification + Top K: 1 + Tolerance: 0.6 + Log level: LOG_ERROR + + Run single image test + Import network mobilenet-v2.blob + Load input #0 from validation-set/224x224/watch.bmp as f16 [N,C,H,W] [1,3,224,224] + Dump input #0_case_0 to _mobilenet_v2_input_0_case_0.blob + Run inference on NPU + Latency: 3 ms + Load reference output #0 from _mobilenet_v2_ref_out_0_case_0.blob as f16 + Dump device output #0_case_0 to _mobilenet_v2_kmb_out_0_case_0.blob + Actual top: + 0 : 531 : 21.95 + Ref Top: + 0 : 531 : 21.95 + PASSED + ``` diff --git a/src/plugins/intel_npu/tools/single-image-test/cmake/standalone.cmake b/src/plugins/intel_npu/tools/single-image-test/cmake/standalone.cmake new file mode 100644 index 00000000000000..eaefd3a88a088b --- /dev/null +++ b/src/plugins/intel_npu/tools/single-image-test/cmake/standalone.cmake @@ -0,0 +1,58 @@ +# +# Copyright (C) 2024 Intel Corporation. 
+# SPDX-License-Identifier: Apache 2.0 +# + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +if("${CMAKE_BUILD_TYPE}" STREQUAL "") + set(CMAKE_BUILD_TYPE "Release") +endif() + +# +# OpenVINO and OpenCV package paths should be specified via OpenVINO_DIR and OpenCV_DIR +# TBB is distributed with OpenVINO packages however it is not exported from OpenVINO cmake config +# Therefore we need to find TBB explicitly. TBB_DIR is not requered as OpenVINO setupvars script sets its location +# + +find_package(Threads REQUIRED) +find_package(OpenVINO REQUIRED COMPONENTS Runtime) +find_package(TBB REQUIRED) +find_package(OpenCV REQUIRED COMPONENTS core imgproc imgcodecs) + +add_subdirectory("${CMAKE_CURRENT_SOURCE_DIR}/../common" common EXCLUDE_FROM_ALL) + +# +# gflags is distributed in sources in OpenVINO packages so we need to build it explicitly +# + +if(EXISTS "${PACKAGE_PREFIX_DIR}/samples/cpp/thirdparty/gflags") + add_subdirectory("${PACKAGE_PREFIX_DIR}/samples/cpp/thirdparty/gflags" gflags EXCLUDE_FROM_ALL) +else() + find_package(gflags REQUIRED) +endif() + +set(DEPENDENCIES + Threads::Threads + gflags + openvino::runtime + TBB::tbb + opencv_core + opencv_imgproc + opencv_imgcodecs + npu_tools_utils +) + +if (CMAKE_COMPILER_IS_GNUCXX) + target_compile_options(${TARGET_NAME} PRIVATE -Wall) +endif() + +file(GLOB SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp") + +add_executable(${TARGET_NAME} ${SOURCES}) +target_link_libraries(${TARGET_NAME} PRIVATE ${DEPENDENCIES}) + +install(TARGETS ${TARGET_NAME} + DESTINATION "tools/${TARGET_NAME}" + COMPONENT npu_tools) diff --git a/src/plugins/intel_npu/tools/single-image-test/image_quality_helper.cpp b/src/plugins/intel_npu/tools/single-image-test/image_quality_helper.cpp new file mode 100644 index 00000000000000..4db5e2ca6f3a62 --- /dev/null +++ b/src/plugins/intel_npu/tools/single-image-test/image_quality_helper.cpp @@ -0,0 +1,49 @@ +// +// Copyright (C) 2022-2024 Intel Corporation. 
+// SPDX-License-Identifier: Apache 2.0 +// + +// + +#include "image_quality_helper.hpp" + +#include +#include + +#include "data_type_converters.hpp" + + +float utils::runPSNRMetric(std::vector>& actOutput, + std::vector>& refOutput, + const size_t imgHeight, + const size_t imgWidth, + int scaleBorder, + bool normalizedImage) { + size_t colorScale; + float imageDiff; + float sum = 0; + + if (!normalizedImage) { + colorScale = 255; + } else { + colorScale = 1; + } + + for (size_t iout = 0; iout < actOutput.size(); ++iout) { + for (size_t h = scaleBorder; h < imgHeight - scaleBorder; h++) { + for (size_t w = scaleBorder; w < imgWidth - scaleBorder; w++) { + imageDiff = ((actOutput[iout][h * imgWidth + w] - refOutput[iout][h * imgWidth + w]) / + npu::utils::convertValuePrecision(colorScale)); + + sum = sum + (imageDiff * imageDiff); + } + } + } + + auto mse = sum / (imgWidth * imgHeight); + auto psnr = -10 * log10(mse); + + std::cout << "psnr: " << psnr << " Db" << std::endl; + + return psnr; +} diff --git a/src/plugins/intel_npu/tools/single-image-test/image_quality_helper.hpp b/src/plugins/intel_npu/tools/single-image-test/image_quality_helper.hpp new file mode 100644 index 00000000000000..911ac60201ad87 --- /dev/null +++ b/src/plugins/intel_npu/tools/single-image-test/image_quality_helper.hpp @@ -0,0 +1,18 @@ +// +// Copyright (C) 2022-2024 Intel Corporation. 
+// SPDX-License-Identifier: Apache 2.0 +// + +// + +#pragma once + +#include +#include + +namespace utils { + +float runPSNRMetric(std::vector>& actOutput, std::vector>& refOutput, + const size_t imgHeight, const size_t imgWidth, int scaleBorder, bool normalizedImage); + +} // namespace utils diff --git a/src/plugins/intel_npu/tools/single-image-test/main.cpp b/src/plugins/intel_npu/tools/single-image-test/main.cpp new file mode 100644 index 00000000000000..84cdc87df186ae --- /dev/null +++ b/src/plugins/intel_npu/tools/single-image-test/main.cpp @@ -0,0 +1,2069 @@ +// +// Copyright (C) 2022-2024 Intel Corporation. +// SPDX-License-Identifier: Apache 2.0 +// + +#include "image_quality_helper.hpp" +#include "semantic_segmentation_helpers.hpp" +#include "tensor_utils.hpp" +#include "yolo_helpers.hpp" + +#include +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +using TensorMap = std::map; + +struct TensorDescriptor { + ov::element::Type precision; + ov::Shape shape; + ov::Layout layout; +}; + +using TensorDescriptorMap = std::unordered_map; +using LayoutMap = std::unordered_map; + +/** + * @brief Provides a caseless equality function for STL algorithms. 
+ * @details Utility function taken from the OpenVINO implementation, formerly registered as + * "InferenceEngine::details::CaselessEq" + * @tparam Key + */ +template +class CaselessEq { +public: + bool operator()(const Key& a, const Key& b) const noexcept { + return a.size() == b.size() && + std::equal(std::begin(a), std::end(a), std::begin(b), [](const char cha, const char chb) { + return std::tolower(cha) == std::tolower(chb); + }); + } +}; + +CaselessEq strEq; + +// +// Command line options +// + +DEFINE_string(network, "", "Network file (either XML or pre-compiled blob)"); +DEFINE_string(input, "", "Input file(s)"); +DEFINE_string(compiled_blob, "", "Output compiled network file (compiled result blob)"); +DEFINE_uint32(override_model_batch_size, 1, "Enforce a model to be compiled for batch size"); +DEFINE_string(device, "", "Device to use"); +DEFINE_string(config, "", "Path to the configuration file (optional)"); +DEFINE_string(ip, "", "Input precision (default: U8, available: FP32, FP16, I32, I64, U8)"); +DEFINE_string(op, "", "Output precision (default: FP32, available: FP32, FP16, I32, I64, U8)"); +DEFINE_string(il, "", "Input layout"); +DEFINE_string(ol, "", "Output layout"); +DEFINE_string(iml, "", "Model input layout"); +DEFINE_string(oml, "", "Model output layout"); +DEFINE_bool(img_as_bin, false, "Force binary input even if network expects an image"); +DEFINE_bool(pc, false, "Report performance counters"); + +// for using input image mean and scale +static constexpr char mean_values_message[] = + "Optional. Mean values to be used for the input image per channel. " + "Values to be provided in the [channel1,channel2,channel3] format. " + "Can be defined for desired input of the model, for example: \"--mean_values " + "data[255,255,255],info[255,255,255]\". The exact meaning and order of channels depend on how the original " + "model " + "was trained. 
Applying the values affects performance and may cause type conversion"; + +static constexpr char scale_values_message[] = + "Optional. Scale values to be used for the input image per channel. " + "Values are provided in the [channel1,channel2,channel3] format. " + "Can be defined for desired input of the model, for example: \"--scale_values " + "data[255,255,255],info[255,255,255]\". " + "The exact meaning and order of channels depend on how the original model was trained. If both --mean_values " + "and " + "--scale_values are specified, the mean is subtracted first and then scale is applied regardless of the order " + "of " + "options in command line. Applying the values affects performance and may cause type conversion"; +DEFINE_string(mean_values, "", mean_values_message); +DEFINE_string(scale_values, "", scale_values_message); + +DEFINE_string(img_bin_precision, "", "Specify the precision of the binary input files. Eg: 'FP32,FP16,I32,I64,U8'"); + +DEFINE_bool(run_test, false, "Run the test (compare current results with previously dumped)"); +DEFINE_string(mode, "", "Comparison mode to use"); + +DEFINE_uint32(top_k, 1, "Top K parameter for 'classification' mode"); +DEFINE_double(prob_tolerance, 1e-4, "Probability tolerance for 'classification/ssd/yolo' mode"); + +DEFINE_double(raw_tolerance, 1e-4, "Tolerance for 'raw' mode (absolute diff)"); +DEFINE_double(cosim_threshold, 0.90, "Threshold for 'cosim' mode"); +DEFINE_double(rrmse_loss_threshold, std::numeric_limits::max(), "Threshold for 'rrmse' mode"); +DEFINE_double(nrmse_loss_threshold, 1.0, "Threshold for 'nrmse' mode"); +DEFINE_double(confidence_threshold, 1e-4, "Confidence threshold for Detection mode"); +DEFINE_double(box_tolerance, 1e-4, "Box tolerance for 'detection' mode"); + +DEFINE_double(psnr_reference, 30.0, "PSNR reference value in dB"); +DEFINE_double(psnr_tolerance, 1e-4, "Tolerance for 'psnr' mode"); + +DEFINE_string(log_level, "", "IE logger level (optional)"); +DEFINE_string(color_format, 
"BGR", "Color format for input: RGB or BGR"); +DEFINE_uint32(scale_border, 4, "Scale border"); +DEFINE_bool(normalized_image, false, "Images in [0, 1] range or not"); + +// for Yolo +DEFINE_bool(is_tiny_yolo, false, "Is it Tiny Yolo or not (true or false)?"); +DEFINE_int32(classes, 80, "Number of classes for Yolo V3"); +DEFINE_int32(coords, 4, "Number of coordinates for Yolo V3"); +DEFINE_int32(num, 3, "Number of scales for Yolo V3"); + +typedef std::chrono::high_resolution_clock Time; +// for Semantic Segmentation +DEFINE_uint32(sem_seg_classes, 12, "Number of classes for semantic segmentation"); +DEFINE_double(sem_seg_threshold, 0.98, "Threshold for 'semantic segmentation' mode"); +DEFINE_uint32(sem_seg_ignore_label, std::numeric_limits::max(), "The number of the label to be ignored"); +DEFINE_string(dataset, "NONE", + "The dataset used to train the model. Useful for instances such as semantic segmentation to visualize " + "the accuracy per-class"); +std::vector camVid12 = {"Sky", "Building", "Pole", "Road", "Pavement", "Tree", + "SignSymbol", "Fence", "Car", "Pedestrian", "Bicyclist", "Unlabeled"}; + +std::vector splitStringList(const std::string& str, char delim) { + std::vector out; + + if (str.empty()) { + return out; + } + + std::istringstream istr(str); + + std::string elem; + while (std::getline(istr, elem, delim)) { + if (elem.empty()) { + continue; + } + + out.push_back(std::move(elem)); + } + + return out; +} + +void parseCommandLine(int argc, char* argv[]) { + std::ostringstream usage; + usage << "Usage: " << argv[0] << "[]"; + gflags::SetUsageMessage(usage.str()); + + std::ostringstream version; + version << ov::get_openvino_version(); + gflags::SetVersionString(version.str()); + + gflags::ParseCommandLineFlags(&argc, &argv, true); + + std::cout << "Parameters:" << std::endl; + std::cout << " Network file: " << FLAGS_network << std::endl; + std::cout << " Input file(s): " << FLAGS_input << std::endl; + std::cout << " Output compiled network file: " << 
FLAGS_compiled_blob << std::endl; + std::cout << " Color format: " << FLAGS_color_format << std::endl; + std::cout << " Input precision: " << FLAGS_ip << std::endl; + std::cout << " Output precision: " << FLAGS_op << std::endl; + std::cout << " Input layout: " << FLAGS_il << std::endl; + std::cout << " Output layout: " << FLAGS_ol << std::endl; + std::cout << " Model input layout: " << FLAGS_iml << std::endl; + std::cout << " Model output layout: " << FLAGS_oml << std::endl; + std::cout << " Img as binary: " << FLAGS_img_as_bin << std::endl; + std::cout << " Bin input file precision: " << FLAGS_img_bin_precision << std::endl; + std::cout << " Device: " << FLAGS_device << std::endl; + std::cout << " Config file: " << FLAGS_config << std::endl; + std::cout << " Run test: " << FLAGS_run_test << std::endl; + std::cout << " Performance counters: " << FLAGS_pc << std::endl; + std::cout << " Mean_values [channel1,channel2,channel3] " << FLAGS_mean_values << std::endl; + std::cout << " Scale_values [channel1,channel2,channel3] " << FLAGS_scale_values << std::endl; + if (FLAGS_run_test) { + std::cout << " Mode: " << FLAGS_mode << std::endl; + if (strEq(FLAGS_mode, "classification")) { + std::cout << " Top K: " << FLAGS_top_k << std::endl; + std::cout << " Tolerance: " << FLAGS_prob_tolerance << std::endl; + } else if (strEq(FLAGS_mode, "raw")) { + std::cout << " Tolerance: " << FLAGS_raw_tolerance << std::endl; + } else if (strEq(FLAGS_mode, "cosim")) { + std::cout << " Threshold: " << FLAGS_cosim_threshold << std::endl; + } else if (strEq(FLAGS_mode, "psnr")) { + std::cout << " Reference: " << FLAGS_psnr_reference << std::endl; + std::cout << " Tolerance: " << FLAGS_psnr_tolerance << std::endl; + std::cout << " Scale_border: " << FLAGS_scale_border << std::endl; + std::cout << " Normalized_image: " << FLAGS_normalized_image << std::endl; + } else if (strEq(FLAGS_mode, "rrmse")) { + std::cout << " Threshold: " << FLAGS_rrmse_loss_threshold << std::endl; + } else if 
(strEq(FLAGS_mode, "nrmse")) { + std::cout << " Threshold: " << FLAGS_nrmse_loss_threshold << std::endl; + } + } + std::cout << " Log level: " << FLAGS_log_level << std::endl; + std::cout << std::endl; +} + +// +// OpenCV to OpenVINO conversion +// + +bool isImage(const ov::Shape& shape, const ov::Layout& layout) { + if (shape.size() == 4) { + const auto numChannels = shape[ov::layout::channels_idx(layout)]; + return (numChannels == 3) || (numChannels == 4); + } + + return false; +} + +/** + * @brief Computes the offsets for each axis in terms of number of elements. + * @details Taken from the OpenVINO implementation ("InferenceEngine::BlockingDesc::fillDesc") and slightly modified for + * the current usecase. + * + * @param shape The shape based on which the offsets will be computed. + * @return The resulting strides. + */ +std::vector getStrides(const ov::Shape& shape) { + std::vector strides(shape.size()); + + strides[strides.size() - 1] = 1; + + for (size_t i = 2; i <= shape.size(); i++) { + strides[strides.size() - i] = strides[strides.size() - (i - 1)] * shape[shape.size() - (i - 1)]; + } + + return strides; +} + +std::vector ovToCV(const ov::Tensor& tensor, const ov::Shape& shape, const ov::Layout& layout, + size_t batchInd = 0, size_t depthInd = 0) { + const ov::element::Type& precision = tensor.get_element_type(); + + OPENVINO_ASSERT(layout == ov::Layout("NCHW") || layout == ov::Layout("NCDHW"), + "Unsupported layout: ", layout.to_string()); + + OPENVINO_ASSERT(precision == ov::element::Type_t::u8 || precision == ov::element::Type_t::f32 || + precision == ov::element::Type_t::f16 || precision == ov::element::Type_t::i32, + "Unsupported precision: ", precision.get_type_name()); + + int cvType = 0; + size_t elemSize = 0; + + if (precision == ov::element::Type_t::u8) { + cvType = CV_8UC1; + elemSize = sizeof(uint8_t); + } else if (precision == ov::element::Type_t::f32) { + cvType = CV_32FC1; + elemSize = sizeof(float); + } else if (precision == 
ov::element::Type_t::f16) { + cvType = CV_16SC1; + elemSize = sizeof(ov::float16); + } else if (precision == ov::element::Type_t::i32) { + cvType = CV_32SC1; + elemSize = sizeof(int32_t); + } + + std::vector out; + + const size_t N = shape[ov::layout::batch_idx(layout)]; + const size_t C = shape[ov::layout::channels_idx(layout)]; + const size_t H = shape[ov::layout::height_idx(layout)]; + const size_t W = shape[ov::layout::width_idx(layout)]; + + const auto dataBuffer = reinterpret_cast(tensor.data()); + + if (layout == ov::Layout("NCHW")) { + OPENVINO_ASSERT(batchInd < N); + OPENVINO_ASSERT(C == 3 || C == 4, "Unsupported number of channels: ", C); + + out.resize(C); + for (size_t c = 0; c < C; ++c) { + out[c] = cv::Mat(static_cast(H), static_cast(W), cvType, + dataBuffer + (batchInd * C + c) * W * H * elemSize); + } + } else if (layout == ov::Layout("NCDHW")) { + const size_t D = shape[ov::layout::depth_idx(layout)]; + + const std::vector strides = getStrides(shape); + + const auto strideN = strides[ov::layout::batch_idx(layout)]; + const auto strideC = strides[ov::layout::channels_idx(layout)]; + const auto strideD = strides[ov::layout::depth_idx(layout)]; + + OPENVINO_ASSERT(batchInd < N); + OPENVINO_ASSERT(depthInd < D); + OPENVINO_ASSERT(C == 3 || C == 4, "Unsupported number of channels: ", C); + + out.resize(C); + for (size_t c = 0; c < C; ++c) { + out[c] = cv::Mat(static_cast(H), static_cast(W), cvType, + dataBuffer + (strideN * batchInd + strideC * c + strideD * depthInd) * elemSize); + } + } + + return out; +} + +/** + * @brief Converts the source data from its current precision to the one used by the destination buffer and then places + * the result inside it. + * @details The conversion is performed by the use of the "static_cast" operator. Depending on the types between which + * the conversions are made, this may lead to undefined behavior. 
+ * + * E.g.: Several experiments suggest float<->ov::float16 conversions may work fine, but float->uint8_t may lead to + * division by zero. + * + * @tparam InT The type of the source buffer + * @tparam OutT The type of the destination buffer + * @param destination Where the result will be stored + * @param source The data which shall be converted + * @param numberOfElements Indicates how many elements will be taken from the source buffer. + */ +template +void convertBufferType(OutT* destination, const InT* source, size_t numberOfElements) { + ov::parallel_for(numberOfElements, [source, destination](int64_t index) { + destination[index] = static_cast(source[index]); + }); +} + +void cvToOV(const cv::Mat& cvImg, const ov::Tensor& tensor, const ov::Shape& shape, const ov::Layout& layout, + const std::string& colorFormat) { + const ov::element::Type& precision = tensor.get_element_type(); + + OPENVINO_ASSERT(layout == ov::Layout("NHWC") || layout == ov::Layout("NCHW"), + "Unsupported layout: ", layout.to_string()); + + const auto N = shape[ov::layout::batch_idx(layout)]; + const auto C = shape[ov::layout::channels_idx(layout)]; + const auto H = shape[ov::layout::height_idx(layout)]; + const auto W = shape[ov::layout::width_idx(layout)]; + + OPENVINO_ASSERT(C == 3 || C == 4, "Unsupported number of channels: ", C); + + int cvType = 0; + + if (precision == ov::element::Type_t::u8) { + cvType = static_cast(CV_8UC(C)); + } else if (precision == ov::element::Type_t::f32) { + cvType = static_cast(CV_32FC(C)); + } else if (precision == ov::element::Type_t::f16) { + cvType = static_cast(CV_16SC(C)); + } else if (precision == ov::element::Type_t::i32) { + cvType = static_cast(CV_32SC(C)); + } else { + OPENVINO_ASSERT(precision == ov::element::Type_t::u8 || precision == ov::element::Type_t::f32 || + precision == ov::element::Type_t::f16 || precision == ov::element::Type_t::i32, + "Unsupported precision ", precision.get_type_name()); + } + + cv::Mat in; + + if (C == 3) { + if 
(colorFormat == "RGB") { + cv::cvtColor(cvImg, in, cv::COLOR_BGR2RGB); + } else { + in = cvImg; + } + } else { + if (colorFormat == "RGB") { + cv::cvtColor(cvImg, in, cv::COLOR_BGR2RGBA); + } else { + cv::cvtColor(cvImg, in, cv::COLOR_BGR2BGRA); + } + } + + if (precision != ov::element::Type_t::u8) { + in.convertTo(in, CV_32F); + } + + const auto pictureArea = static_cast(in.size().area()); + + if (W * H > pictureArea) { + cv::resize(in, in, cv::Size(static_cast(W), static_cast(H)), 0.0, 0.0, cv::INTER_AREA); + } else { + cv::resize(in, in, cv::Size(static_cast(W), static_cast(H)), 0.0, 0.0, cv::INTER_LINEAR); + } + + if (layout == ov::Layout("NHWC")) { + const auto dataBuffer = reinterpret_cast(tensor.data()); + + cv::Mat out(static_cast(H), static_cast(W), cvType, dataBuffer); + + if (precision == ov::element::Type_t::f16) { + const auto inPtr = in.ptr(); + const auto outPtr = out.ptr(); + convertBufferType(outPtr, inPtr, out.size().area() * C); + } else if (precision == ov::element::Type_t::i32) { + in.convertTo(out, CV_32S); + } else { + in.copyTo(out); + } + + for (size_t n = 1; n < N; ++n) { + cv::Mat batch(static_cast(H), static_cast(W), cvType, + dataBuffer + n * (out.size().area() * out.elemSize())); + out.copyTo(batch); + } + } else if (layout == ov::Layout("NCHW")) { + auto tensorPlanes = ovToCV(tensor, shape, layout, 0); + + if (precision != ov::element::Type_t::f16) { + cv::split(in, tensorPlanes); + } else { + std::vector inPlanes; + cv::split(in, inPlanes); + + OPENVINO_ASSERT(tensorPlanes.size() == inPlanes.size()); + + for (size_t i = 0; i < tensorPlanes.size(); ++i) { + const auto inPtr = inPlanes[i].ptr(); + const auto outPtr = tensorPlanes[i].ptr(); + convertBufferType(outPtr, inPtr, inPlanes[i].size().area()); + } + } + + for (size_t n = 1; n < N; ++n) { + const auto batchPlanes = ovToCV(tensor, shape, layout, n); + + OPENVINO_ASSERT(batchPlanes.size() == tensorPlanes.size()); + + for (size_t i = 0; i < tensorPlanes.size(); ++i) { + 
tensorPlanes[i].copyTo(batchPlanes[i]); + } + } + } +} + +std::vector splitFloat(const std::string& s, char delim) { + std::vector result; + std::stringstream ss(s); + std::string item; + + while (getline(ss, item, delim)) { + result.push_back(std::stof(item)); + } + return result; +} + +std::unordered_map> parseMeanOrScaleString(const std::string& mean_scale) { + std::unordered_map> result; + + // Format: layer1[255,255,255],layer2[255,255,255] for particular layers, + // or [255,255,255] for all layers + std::string search_string = mean_scale; + auto start_pos = search_string.find_first_of('['); + while (start_pos != std::string::npos) { + auto end_pos = search_string.find_first_of(']'); + if (end_pos == std::string::npos) + break; + auto input_name = search_string.substr(0, start_pos); + if (result.count(input_name) == 0) { + auto input_value_string = search_string.substr(start_pos + 1, end_pos - start_pos - 1); + result[input_name] = splitFloat(input_value_string, ','); + if (input_name.empty()) { + if (mean_scale != search_string) { + throw std::logic_error("Can't parse input parameter string: " + mean_scale + + ". Format of value: layer1[255,255,255],layer2[255,255,255] " + "for particular layers, or just [255,255,255] for all layers."); + } + search_string = search_string.substr(end_pos + 1); + break; + } + } else { + throw std::logic_error("Specifying mean and scale for the same layer/s" + " more than once is prohibited: " + + mean_scale); + } + + search_string = search_string.substr(end_pos + 1); + if (search_string.empty() || search_string.front() != ',') + break; + search_string = search_string.substr(1); + if (search_string.empty()) { + throw std::logic_error("Can't parse input parameter string: " + mean_scale + + ". 
Format of value: layer1[255,255,255],layer2[255,255,255] " + "for particular layers, or just [255,255,255] for all layers."); + } + start_pos = search_string.find_first_of('['); + } + if (!search_string.empty()) + throw std::logic_error("Can't parse input parameter string: " + mean_scale + + ". Format of value: layer1[255,255,255],layer2[255,255,255] " + "for particular layers, or just [255,255,255] for all layers."); + + return result; +} + +std::vector> parseMeanOrScale(const std::string& mean_scale, + const std::vector>& inputs_info) { + std::vector> result(inputs_info.size()); + + auto mean_or_scale_map = parseMeanOrScaleString(mean_scale); + + for (auto&& [layer_name, mean_or_scale] : mean_or_scale_map) { + if (!layer_name.empty()) { + // Add an explicit reference. Lambda expressions in C++17 cannot capture structured bindings. + const auto& layer_name_ref = layer_name; + auto required_input_it = std::find_if(inputs_info.begin(), inputs_info.end(), + [&layer_name_ref](const ov::Output& item) { + return item.get_any_name() == layer_name_ref; + }); + if (required_input_it != inputs_info.end()) { + result[std::distance(inputs_info.begin(), required_input_it)] = mean_or_scale; + } else { + throw std::logic_error(std::string("Input with name '") + layer_name + "' doesn't exist."); + } + } else { + for (size_t idx = 0; idx < inputs_info.size(); ++idx) { + result[idx] = mean_or_scale; + } + } + } + + return result; +} + +// +// File utils +// + +bool hasLoadableExt(const std::string& network_path) { + static const std::array ext_support_table{"xml", "onnx", "pdmodel", "pb", "tflite"}; + return std::any_of(ext_support_table.begin(), ext_support_table.end(), [&network_path](const char* ext) { + static constexpr const auto dot_symbol = '.'; + auto pos = network_path.rfind(dot_symbol); + std::string ext_name = {}; + if (pos != std::string::npos) + ext_name = network_path.substr(pos + 1); + return strEq(ext_name, ext); + }); +} + +std::string cleanName(std::string&& name) 
{ + std::replace_if( + name.begin(), name.end(), + [](unsigned char c) { + return !std::isalnum(c); + }, + '_'); + return std::move(name); +} + +ov::Tensor loadImage(const ov::element::Type& precision, const ov::Shape& shape, const ov::Layout& layout, + const std::string& filePath, const std::string& colorFormat) { + const auto frame = cv::imread(filePath, cv::IMREAD_COLOR); + OPENVINO_ASSERT(!frame.empty(), "Failed to open input image file ", filePath); + + const ov::Tensor tensor(precision, shape); + + cvToOV(frame, tensor, shape, layout, colorFormat); + + return tensor; +} + +ov::Tensor loadBinary(const ov::element::Type& modelPrecision, const ov::Shape& shape, const std::string& filePath, + const ov::element::Type& dataPrecision) { + std::ifstream binaryFile(filePath, std::ios_base::binary | std::ios_base::ate); + OPENVINO_ASSERT(binaryFile, "Failed to open input binary file: ", filePath); + const auto fileBytes = binaryFile.tellg(); + binaryFile.seekg(0, std::ios_base::beg); + OPENVINO_ASSERT(binaryFile.good(), "While reading a file an error is encountered"); + + const ov::Tensor requestedTensor(modelPrecision, shape); + const int reqTensorBytes = static_cast(requestedTensor.get_byte_size()); + + if (dataPrecision != modelPrecision && dataPrecision != ov::element::Type_t::undefined) { + std::cout << "Converting " << filePath << " input from " << dataPrecision << " to " << modelPrecision + << std::endl; + const ov::Tensor inputTensor(dataPrecision, shape); + binaryFile.read(reinterpret_cast(inputTensor.data()), static_cast(fileBytes)); + npu::utils::convertTensorPrecision(inputTensor, requestedTensor); + } else { + OPENVINO_ASSERT(fileBytes == reqTensorBytes, "File contains ", fileBytes, " bytes, but ", reqTensorBytes, + " expected"); + binaryFile.read(reinterpret_cast(requestedTensor.data()), static_cast(reqTensorBytes)); + } + + return requestedTensor; +} + +/** + * @brief Loads the contents of a locally stored file inside an OpenVINO tensor intended to be 
used as input in the + * context of the application. + * @details The data being loaded can either be an image or a binary file, the switch between these cases can be + * performed by setting the "img_as_bin" flag accordingly. If an image is being loaded, the OpenCV library is deployed + * for reading followed by a conversion to the OpenVINO format. If a binary file is loaded, the content's type is + * converted from "dataPrecision" to "modelPrecision" before constructing the tensor. + * + * @param modelPrecision The precision accepted by the model's input + * @param shape The shape accepted by the model's input + * @param layout The layout used by the model's input + * @param filePath Indicates the location of the file to be loaded + * @param colorFormat Indicates the color format only in the case when an image is being loaded. + * @param dataPrecision Indicates the precision used by the data found within the binary file. + * @return The tensor containing the loaded data. + */ +ov::Tensor loadInput(const ov::element::Type& modelPrecision, const ov::Shape& shape, const ov::Layout& layout, + const std::string& filePath, const std::string& colorFormat, + const ov::element::Type& dataPrecision = ov::element::Type_t::undefined) { + if (isImage(shape, layout) && !FLAGS_img_as_bin) { + return loadImage(modelPrecision, shape, layout, filePath, colorFormat); + } else { + return loadBinary(modelPrecision, shape, filePath, dataPrecision); + } +} + +ov::Tensor loadTensor(const ov::element::Type& precision, const ov::Shape& shape, const std::string& filePath) { + const ov::Tensor tensor(precision, shape); + + std::ifstream file(filePath, std::ios_base::in | std::ios_base::binary); + OPENVINO_ASSERT(file.is_open(), "Can't open file ", filePath, " for read"); + + const auto dataBuffer = reinterpret_cast(tensor.data()); + file.read(dataBuffer, static_cast(tensor.get_byte_size())); + + return tensor; +} + +void dumpTensor(const ov::Tensor& tensor, const std::string& filePath) { + 
std::ofstream file(filePath, std::ios_base::out | std::ios_base::binary); + OPENVINO_ASSERT(file.is_open(), "Can't open file ", filePath, " for write"); + + const auto dataBuffer = reinterpret_cast(tensor.data()); + file.write(dataBuffer, static_cast(tensor.get_byte_size())); +} + +std::map parseConfigFile() { + std::map config; + + std::ifstream file(FLAGS_config); + OPENVINO_ASSERT(file.is_open(), "Can't open file ", FLAGS_config, " for read"); + + std::string option; + while (std::getline(file, option)) { + if (option.empty() || option[0] == '#') { + continue; + } + size_t spacePos = option.find_first_of(" \t\n\r"); + OPENVINO_ASSERT(spacePos != std::string::npos, + "Invalid config parameter format. Space separator required here: ", option); + std::string key, value; + if (spacePos != std::string::npos) { + key = option.substr(0, spacePos); + size_t valueStart = option.find_first_not_of(" \t\n\r", spacePos); + OPENVINO_ASSERT(valueStart != std::string::npos, + "An invalid config parameter value detected, it mustn't be empty: ", option); + size_t valueEnd = option.find_last_not_of(" \t\n\r"); + value = option.substr(valueStart, valueEnd - valueStart + 1); + config[key] = value; + } + } + + return config; +} + +// This function formats performance counters in a same way as benchmark_app -pc does. 
+// It is a copy-paste from $OPENVINO_HOME/samples/cpp/common/utils/include/samples/common.hpp +using ProfVec = std::vector; +static void printPerformanceCounts(ProfVec performanceData, std::ostream& stream, std::string deviceName, + bool bshowHeader = true) { + std::chrono::microseconds totalTime = std::chrono::microseconds::zero(); + // Print performance counts + if (bshowHeader) { + stream << std::endl << "performance counts:" << std::endl << std::endl; + } + std::ios::fmtflags fmt(std::cout.flags()); + for (const auto& it : performanceData) { + std::string toPrint(it.node_name); + const int maxLayerName = 30; + + if (it.node_name.length() >= maxLayerName) { + toPrint = it.node_name.substr(0, maxLayerName - 4); + toPrint += "..."; + } + + stream << std::setw(maxLayerName) << std::left << toPrint; + switch (it.status) { + case ov::ProfilingInfo::Status::EXECUTED: + stream << std::setw(15) << std::left << "EXECUTED"; + break; + case ov::ProfilingInfo::Status::NOT_RUN: + stream << std::setw(15) << std::left << "NOT_RUN"; + break; + case ov::ProfilingInfo::Status::OPTIMIZED_OUT: + stream << std::setw(15) << std::left << "OPTIMIZED_OUT"; + break; + } + stream << std::setw(30) << std::left << "layerType: " + std::string(it.node_type) + " "; + stream << std::setw(20) << std::left << "realTime: " + std::to_string(it.real_time.count()); + stream << std::setw(20) << std::left << "cpu: " + std::to_string(it.cpu_time.count()); + stream << " execType: " << it.exec_type << std::endl; + if (it.real_time.count() > 0) { + totalTime += it.real_time; + } + } + stream << std::setw(20) << std::left << "Total time: " + std::to_string(totalTime.count()) << " microseconds" + << std::endl; + std::cout << std::endl; + std::cout << "Full device name: " << deviceName << std::endl; + std::cout << std::endl; + std::cout.flags(fmt); +} + +bool checkBBoxOutputs(std::vector& actualOutput, std::vector& refOutput, + const size_t imgWidth, const size_t imgHeight, const float boxTolerance, + const 
float probTolerance) { + std::cout << "Ref Top:" << std::endl; + for (size_t i = 0; i < refOutput.size(); ++i) { + const auto& bb = refOutput[i]; + std::cout << i << " : " << bb.idx << " : [(" << bb.left << " " << bb.top << "), (" << bb.right << " " + << bb.bottom << ")] : " << bb.prob * 100 << "%" << std::endl; + } + + std::cout << "Actual top:" << std::endl; + for (size_t i = 0; i < actualOutput.size(); ++i) { + const auto& bb = actualOutput[i]; + std::cout << i << " : " << bb.idx << " : [(" << bb.left << " " << bb.top << "), (" << bb.right << " " + << bb.bottom << ")] : " << bb.prob * 100 << "%" << std::endl; + } + + for (const auto& refBB : refOutput) { + bool found = false; + + float maxBoxError = 0.0f; + float maxProbError = 0.0f; + + for (const auto& actualBB : actualOutput) { + if (actualBB.idx != refBB.idx) { + continue; + } + + const utils::Box actualBox{actualBB.left / imgWidth, actualBB.top / imgHeight, + (actualBB.right - actualBB.left) / imgWidth, + (actualBB.bottom - actualBB.top) / imgHeight}; + const utils::Box refBox{refBB.left / imgWidth, refBB.top / imgHeight, (refBB.right - refBB.left) / imgWidth, + (refBB.bottom - refBB.top) / imgHeight}; + + const auto boxIntersection = boxIntersectionOverUnion(actualBox, refBox); + const auto boxError = 1.0f - boxIntersection; + maxBoxError = std::max(maxBoxError, boxError); + + const auto probError = std::fabs(actualBB.prob - refBB.prob); + maxProbError = std::max(maxProbError, probError); + + if (boxError > boxTolerance) { + continue; + } + + if (probError > probTolerance) { + continue; + } + + found = true; + break; + } + if (!found) { + std::cout << "maxBoxError=" << maxBoxError << " " + << "maxProbError=" << maxProbError << std::endl; + return false; + } + } + return true; +} + +// +// Classification mode +// + +std::vector> parseClassification(const float* dataBuffer, size_t dataBufferElementsCount) { + OPENVINO_ASSERT(dataBuffer != nullptr, "Received a tensor with no allocated buffer"); + + 
std::vector> res(dataBufferElementsCount); + for (size_t i = 0; i < dataBufferElementsCount; ++i) { + res[i].first = static_cast(i); + res[i].second = dataBuffer[i]; + } + + std::sort(res.begin(), res.end(), [](const std::pair& a, const std::pair& b) { + return a.second > b.second; + }); + + return res; +} + +std::vector>> parseClassificationBatch(const ov::Tensor& tensor, size_t batch_size) { + OPENVINO_ASSERT(batch_size, "batch_size can't be 0"); + OPENVINO_ASSERT(tensor.get_element_type() == ov::element::Type_t::f32, + "Unsupported precision: ", tensor.get_element_type().get_type_name()); + + std::vector>> ret; + + const float* dataBuffer = tensor.data(); + OPENVINO_ASSERT(dataBuffer != nullptr, "Received a tensor with no allocated buffer"); + + size_t batch_bundle_size = tensor.get_size() / batch_size; + OPENVINO_ASSERT(!(tensor.get_size() % batch_bundle_size), + "Tensor is a not batched tensor! Size: ", tensor.get_size(), + " can't be batched on a batch size: ", batch_size, " properly"); + + size_t i = 0; + for (; i < tensor.get_size(); i += batch_bundle_size) { + if (batch_size != 1) { + std::cout << "restore tensor from data bundle: (" << i << "/" << tensor.get_size() << " bytes)" + << std::endl; + } + ret.push_back(parseClassification(dataBuffer + i, batch_bundle_size)); + } + + OPENVINO_ASSERT(i == tensor.get_size()); + return ret; +} + +bool testClassification(const TensorMap& outputs, const TensorMap& references, size_t batch_size = 1) { + OPENVINO_ASSERT(outputs.size() == 1); + OPENVINO_ASSERT(outputs.size() == references.size()); + + const ov::Tensor outputFP32 = npu::utils::toFP32(outputs.begin()->second); + const ov::Tensor referenceFP32 = npu::utils::toFP32(references.begin()->second); + + OPENVINO_ASSERT(outputFP32.get_element_type() == referenceFP32.get_element_type()); + OPENVINO_ASSERT(outputFP32.get_shape() == referenceFP32.get_shape()); + OPENVINO_ASSERT(referenceFP32.get_element_type() == ov::element::Type_t::f32); + + auto probsBatch = 
parseClassificationBatch(outputFP32, batch_size); + auto refProbsBatch = parseClassificationBatch(referenceFP32, batch_size); + OPENVINO_ASSERT(refProbsBatch.size() == probsBatch.size(), + "Incorrect batch size of both output tensor: ", probsBatch.size(), + " and reference tensor: ", refProbsBatch.size(), ". Expected: ", batch_size); + for (size_t i = 0; i < batch_size; i++) { + OPENVINO_ASSERT(probsBatch[i].size() == refProbsBatch[i].size(), + "Incorrect size of referenced tensor in batch bundle number: (", i, "/", batch_size, ")", + ". Expected size: ", probsBatch[i].size(), ", got: ", refProbsBatch[i].size()); + OPENVINO_ASSERT(refProbsBatch[i].size() >= FLAGS_top_k); + refProbsBatch[i].resize(FLAGS_top_k); + } + + bool result = true; + for (size_t i = 0; i < probsBatch.size(); i++) { + if (batch_size != 1) { + std::cout << "Check tensor bundle: (" << i << "/" << batch_size << " batch)" << std::endl; + } + auto probs = probsBatch[i]; + const auto& refs = refProbsBatch[i]; + OPENVINO_ASSERT(probs.size() >= FLAGS_top_k); + probs.resize(FLAGS_top_k); + + std::cout << "Actual top:" << std::endl; + for (size_t j = 0; j < probs.size(); ++j) { + std::cout << " " << j << " : " << probs[j].first << " : " << probs[j].second << std::endl; + } + + std::cout << "Ref Top:" << std::endl; + for (size_t j = 0; j < refs.size(); ++j) { + std::cout << " " << j << " : " << refs[j].first << " : " << refs[j].second << std::endl; + } + + for (const auto& refElem : refs) { + const auto actualIt = + std::find_if(probs.cbegin(), probs.cend(), [&refElem](const std::pair& arg) { + return refElem.first == arg.first; + }); + if (actualIt == probs.end()) { + std::cout << "Ref result " << refElem.first << " was not found in actual results" << std::endl; + result = result && false; + continue; + } + + const auto& actualElem = *actualIt; + + if (refElem.second > actualElem.second) { + const auto probDiff = std::fabs(refElem.second - actualElem.second); + if (probDiff > FLAGS_prob_tolerance) { + 
std::cout << "Probability value mismatch for " << refElem.first << " : " << refElem.second << " vs " + << actualElem.second; + result = result && false; + } + } + } + } + + return result; +} + +// +// RAW mode +// + +bool compareTensors(const ov::Tensor& output, const ov::Tensor& reference) { + if (output.get_shape() != reference.get_shape()) { + std::cout << "Output and reference tensors have different shapes" << std::endl; + return false; + } + + const ov::Tensor outputFP32 = npu::utils::toFP32(output); + const ov::Tensor referenceFP32 = npu::utils::toFP32(reference); + + const auto outputBuffer = outputFP32.data(); + const auto referenceBuffer = referenceFP32.data(); + + const auto totalCount = referenceFP32.get_size(); + const auto printCount = std::min(totalCount, 10); + + for (size_t i = 0; i < totalCount; ++i) { + const auto referenceValue = referenceBuffer[i]; + const auto outputValue = outputBuffer[i]; + const auto absDiff = std::fabs(referenceValue - outputValue); + + if (i < printCount) { + std::cout << " " << i << " :" + << " ref : " << std::setw(10) << referenceValue << " output : " << std::setw(10) << outputValue + << " absdiff : " << std::setw(10) << absDiff << std::endl; + } + + if (absDiff > FLAGS_raw_tolerance) { + std::cout << "Absolute difference between output value " << outputValue << " and reference value " + << referenceValue << " at index " << i << " larger then tolerance" << std::endl; + return false; + } + } + + return true; +} + +bool testRAW(const TensorMap& outputTensors, const TensorMap& referenceTensors, size_t batch_size = 1) { + if (batch_size != 1) { + throw std::runtime_error( + "The testcase 'raw' doesn't support any `override_model_batch_size` values besides 1 yet"); + } + + if (outputTensors.size() != referenceTensors.size()) { + std::cout << "The number of predicted outputs differ from the number of references" << std::endl; + return false; + } + + for (const auto& [tensorName, outputTensor] : outputTensors) { + auto 
referenceTensorIterator = referenceTensors.find(tensorName); + OPENVINO_ASSERT(referenceTensorIterator != referenceTensors.end()); + + std::cout << "Compare " << tensorName << " with reference" << std::endl; + if (!compareTensors(outputTensor, referenceTensorIterator->second)) { + return false; + } + } + + return true; +} + +// +// Cosine-Similarity mode +// (using 'cosim_threshold' flag, with expected value in range [0.0 -> 1.0]) +// e.g. '--mode cosim --cosim_threshold 0.98' +// + +bool compareCoSim(const ov::Tensor& output, const ov::Tensor& reference) { + if (output.get_shape() != reference.get_shape()) { + std::cout << "Actual and reference blobs has different shape" << std::endl; + return false; + } + + const ov::Tensor outputFP32 = npu::utils::toFP32(output); + const ov::Tensor referenceFP32 = npu::utils::toFP32(reference); + + const auto outputBuffer = outputFP32.data(); + const auto referenceBuffer = referenceFP32.data(); + + const auto size = referenceFP32.get_size(); + + double numr = 0.0, denA = 0.0, denB = 0.0; + for (size_t i = 0; i < size; ++i) { + numr += outputBuffer[i] * referenceBuffer[i]; + denA += outputBuffer[i] * outputBuffer[i]; + denB += referenceBuffer[i] * referenceBuffer[i]; + } + + if (denA == 0 || denB == 0) { + std::cout << "Div by ZERO. Cannot compute CoSim metric" << std::endl; + return false; + } + + const auto similarity = numr / (sqrt(denA) * sqrt(denB)); + const double eps = 0.0000001; + // Some experiments revealed that when applying the CoSim metric to large buffers it could provide + // similarity values that are outside the [-1:1] interval due the big number of operations done on + // floating point value. A small epsilon value was added to extend the interval to [-(1+eps):1+eps] + // to ensure that the above check is not failing. 
+ if (similarity > (1.0 + eps) || similarity < -(1.0 + eps)) { + std::cout << "Invalid result " << similarity << " (valid range [-1 : +1])" << std::endl; + return false; + } + + std::cout << "Cosine similarity : " << similarity * 100 << "%" << std::endl; + return similarity > FLAGS_cosim_threshold; +} + +bool testCoSim(const TensorMap& outputs, const TensorMap& references, size_t batch_size = 1) { + if (batch_size != 1) { + throw std::runtime_error( + "The testcase 'testCoSim' doesn't support any `override_model_batch_size` values besides 1 yet"); + } + + if (outputs.size() != references.size()) { + std::cout << "The outputs and references differ in the number of tensors" << std::endl; + return false; + } + + for (const auto& [tensorName, output] : outputs) { + auto referencesIterator = references.find(tensorName); + OPENVINO_ASSERT(referencesIterator != references.end()); + + std::cout << "Compare " << tensorName << " with reference" << std::endl; + if (!compareCoSim(output, referencesIterator->second)) { + return false; + } + } + + return true; +} + +// +// Relative Root Mean Squared Error mode +// (using 'rrmse_loss_threshold' flag, with expected value in range [0.0 -> infinity)) +// e.g. 
'--mode rrmse --rrmse_loss_threshold 0.1' +// + +bool computeRRMSE(const ov::Tensor& output, const ov::Tensor& reference) { + if (output.get_shape() != reference.get_shape()) { + std::cout << "Output and reference tensors have different shapes" << std::endl; + return false; + } + + const ov::Tensor outputFP32 = npu::utils::toFP32(output); + const ov::Tensor referenceFP32 = npu::utils::toFP32(reference); + + const auto outputBuffer = outputFP32.data(); + const auto referenceBuffer = referenceFP32.data(); + + const auto size = referenceFP32.get_size(); + + double error = 0, sum = 0, diff; + for (size_t i = 0; i < size; ++i) { + diff = (outputBuffer[i] - referenceBuffer[i]); + sum += (outputBuffer[i] * outputBuffer[i]); + error += (diff * diff); + } + + if (sum == 0) { + if (error <= std::numeric_limits::epsilon()) { + std::cout << "The results perfectly match (error = 0). RRMSE loss could not be computed" << std::endl; + return true; + } + + std::cout << "Div by ZERO (Actual is the Zero Tensor). 
Cannot compute RRMSE loss" << std::endl; + return false; + } + + double rrmseLoss = sqrt(error / sum); + + std::cout << "RRMSE loss : " << rrmseLoss << " RRMSE threshold : " << FLAGS_rrmse_loss_threshold << std::endl; + return rrmseLoss <= FLAGS_rrmse_loss_threshold; +} + +bool testRRMSE(const TensorMap& outputs, const TensorMap& references, size_t batch_size = 1) { + if (batch_size != 1) { + throw std::runtime_error( + "The testcase 'rrmse' doesn't support any `override_model_batch_size` values besides 1 yet"); + } + + if (outputs.size() != references.size()) { + std::cout << "Actual and reference has different number of output blobs" << std::endl; + return false; + } + + for (const auto& [tensorName, output] : outputs) { + auto referencesIterator = references.find(tensorName); + OPENVINO_ASSERT(referencesIterator != references.end()); + + std::cout << "Compare " << tensorName << " with reference" << std::endl; + if (!computeRRMSE(output, referencesIterator->second)) { + return false; + } + } + + return true; +} + +// +// Normalized Mean Squared Error mode +// (using 'nrmse_loss_threshold' flag, with expected value in range [0.0 -> infinity)) +// e.g. 
'--mode nrmse --nrmse_loss_threshold 0.01' +// + +bool computeNRMSE(const ov::Tensor& output, const ov::Tensor& reference) { + if (output.get_shape() != reference.get_shape()) { + std::cout << "Output and reference tensors have different shapes" << std::endl; + return false; + } + + const auto size = reference.get_size(); + + if (size == 0) { + std::cout << "Empty output and reference tensors, NRMSE loss set to 0" << std::endl; + return true; + } + + const ov::Tensor outputFP32 = npu::utils::toFP32(output); + const ov::Tensor referenceFP32 = npu::utils::toFP32(reference); + + const auto outputBuffer = outputFP32.data(); + const auto referenceBuffer = referenceFP32.data(); + + double error = 0; + float maxOutput = 0, maxReference = 0, minOutput = 0, minReference = 0; + for (size_t i = 0; i < size; ++i) { + const auto diff = outputBuffer[i] - referenceBuffer[i]; + error += diff * diff; + maxOutput = std::max(outputBuffer[i], maxOutput); + maxReference = std::max(referenceBuffer[i], maxReference); + minOutput = std::min(outputBuffer[i], minOutput); + minReference = std::min(referenceBuffer[i], minReference); + } + + double nrmseLoss = + sqrt(error / size) / std::max(0.001f, std::max(maxOutput - minOutput, maxReference - minReference)); + + std::cout << "NRMSE loss : " << nrmseLoss << " NRMSE threshold : " << FLAGS_nrmse_loss_threshold << std::endl; + return nrmseLoss <= FLAGS_nrmse_loss_threshold; +} + +bool testNRMSE(const TensorMap& outputs, const TensorMap& references, size_t batch_size = 1) { + if (batch_size != 1) { + throw std::runtime_error( + "The testcase 'nrmse' doesn't support any `override_model_batch_size` values besides 1 yet"); + } + + if (outputs.size() != references.size()) { + std::cout << "Actual and reference has different number of output blobs" << std::endl; + return false; + } + + for (const auto& [tensorName, output] : outputs) { + auto referencesIterator = references.find(tensorName); + OPENVINO_ASSERT(referencesIterator != references.end()); 
+ + std::cout << "Compare " << tensorName << " with reference" << std::endl; + if (!computeNRMSE(output, referencesIterator->second)) { + return false; + } + } + + return true; +} + +// +// PSNR mode +// using psnr_reference and psnr_tolerance flags for validation +// e.g. '--mode psnr --psnr_reference --psnr_tolerance ' +// Direction of metric’s growth is higher-better. If the images are identical, the PSNR is infinite. +// + +bool testPSNR(const TensorMap& outputs, const TensorMap& references, const int dstHeight, const int dstWidth, + size_t batch_size = 1) { + if (batch_size != 1) { + throw std::runtime_error( + "The testcase 'psnr' doesn't support any `override_model_batch_size` values besides 1 yet"); + } + OPENVINO_ASSERT(outputs.size() == references.size(), + "Mismatch between the number of model outputs and the number of references"); + + int scaleBorder = FLAGS_scale_border; + bool normalizedImage = FLAGS_normalized_image; + + auto refOutput = npu::utils::parseTensorsAsFP32(references); + auto actOutput = npu::utils::parseTensorsAsFP32(outputs); + + auto result = utils::runPSNRMetric(actOutput, refOutput, dstHeight, dstWidth, scaleBorder, normalizedImage); + + if (std::fabs(result - FLAGS_psnr_reference) > FLAGS_psnr_tolerance) { + std::cout << "Absolute difference between actual value " << result << " and reference value " + << FLAGS_psnr_reference << " larger then tolerance " << FLAGS_psnr_tolerance << std::endl; + return false; + } + + return true; +} + +static void printPerformanceCountsAndLatency(size_t numberOfTestCase, const ProfVec& profilingData, + std::chrono::duration duration) { + auto durationMs = std::chrono::duration_cast(duration); + + if (!profilingData.empty()) { + std::cout << "Performance counts for " << numberOfTestCase << "-th infer request:" << std::endl; + printPerformanceCounts(profilingData, std::cout, FLAGS_device, false); + } + + std::cout << "Latency: " << std::fixed << std::setprecision(2) << durationMs.count() << " ms" << 
std::endl; +} + +bool compare_mean_IoU(std::vector> iou, float semSegThreshold, uint32_t classes) { + float threshold = semSegThreshold * 100; + float ma = 0.0f; + bool stateValue = true; + + if (FLAGS_sem_seg_ignore_label != std::numeric_limits::max()) { + classes--; + } + + size_t numberOfLabeledClasses = 0; + for (size_t i = 0; i < classes; i++) { + if (iou[i].first) { + numberOfLabeledClasses++; + if (FLAGS_dataset == "camVid12") { + std::cout << "mean_iou@" << camVid12[i].c_str() << ": " << std::fixed << std::setprecision(2) + << iou[i].second << "%" << std::endl; + } else { + std::cout << "mean_iou@class" << i << ": " << std::fixed << std::setprecision(2) << iou[i].second << "%" + << std::endl; + } + if (iou[i].second < threshold) { + std::cout << "Threshold smaller than " << threshold << "%" << std::endl; + stateValue = false; + } + ma += iou[i].second; + } else { + std::cout << "mean_iou@class" << i << ": no pixels labeled." << std::endl; + } + } + std::cout << "mean_iou@:mean " << std::fixed << std::setprecision(2) << (ma / numberOfLabeledClasses) << "%" + << std::endl; + + return stateValue; +} + +void setupOVCore(ov::Core& core) { + auto flagDevice = FLAGS_device; + + if (!FLAGS_log_level.empty()) { + core.set_property(flagDevice, {{ov::log::level.name(), FLAGS_log_level}}); + } + + if (FLAGS_device == "CPU") { + core.set_property(flagDevice, {{"LP_TRANSFORMS_MODE", "NO"}}); + } + + if (FLAGS_pc) { + core.set_property(flagDevice, {{ov::enable_profiling.name(), true}}); + } + + if (!FLAGS_config.empty()) { + const auto configs = parseConfigFile(); + core.set_property(flagDevice, {configs.begin(), configs.end()}); + } +} + +void nameIOTensors(std::shared_ptr model) { + auto inputInfo = model->inputs(); + for (std::size_t id = 0ul; id < inputInfo.size(); ++id) { + auto ii = inputInfo[id]; + if (ii.get_names().empty()) { + ii.add_names({"input_" + std::to_string(ii.get_index())}); + } + } + + auto outputInfo = model->outputs(); + for (std::size_t id = 0ul; 
id < outputInfo.size(); ++id) { + auto oi = outputInfo[id]; + if (oi.get_names().empty()) { + oi.add_names({"output_" + std::to_string(oi.get_index())}); + } + } +} + +std::pair runInfer(ov::InferRequest& inferRequest, ov::CompiledModel& compiledModel, + const TensorMap& inputs, const std::vector& dumpedInputsPaths) { + for (const auto& [tensorName, tensor] : inputs) { + inferRequest.set_tensor(tensorName, tensor); + } + + inferRequest.infer(); + + TensorMap out; + for (const auto& outputInfo : compiledModel.outputs()) { + const std::string layer_name = outputInfo.get_any_name(); + out.insert({layer_name, inferRequest.get_tensor(layer_name)}); + } + + ProfVec profData{}; + + if (FLAGS_pc) { + profData = inferRequest.get_profiling_info(); + } + + return std::make_pair(out, profData); +} + +void boundDynamicShape(std::shared_ptr& model) { + for (auto&& item : model->get_parameters()) { + auto shape = item->get_partial_shape(); + if (shape.is_static()) { + continue; + } + auto rank = shape.rank(); + if (rank.is_dynamic()) { + throw std::logic_error("Rank \"" + rank.to_string() + "\" of the shape \"" + shape.to_string() + + "\" is dynamic which is not supported by SIT"); + } + auto layout = item->get_layout(); + if (!ov::layout::has_batch(layout)) { + item->set_layout(ov::Layout(layout.to_string().insert(1, "N,"))); + layout = item->get_layout(); + } + if (shape[ov::layout::batch_idx(layout)].is_dynamic()) { + std::cout << "WARNING: Shape \"" + shape.to_string() + "\"" + + " has dynamic batch size which is not supported by SIT\n" + " Setting batch to 1 forcibly" + << std::endl; + ov::set_batch(model, 1); + } + shape = item->get_partial_shape(); + if (shape.is_dynamic()) { + throw std::logic_error("Model's input shape \"" + shape.to_string() + "\"" + + " is dynamic which is not supported by SIT"); + } + } +} + +void setModelBatch(std::shared_ptr& model, uint32_t batch) { + if (batch == 1) { + return; + } + + // New batch value is applicable if the model has non dynamic 
inputs/outputs only + // Amend layout by adding N if it has no batch dimension + for (auto&& item : model->get_parameters()) { + auto shape = item->get_partial_shape(); + auto rank = shape.rank(); + if (rank.is_dynamic()) { + throw std::logic_error("Rank \"" + rank.to_string() + "\" of the shape \"" + shape.to_string() + + "\" is dynamic which is not supported by SIT"); + } + auto layout = item->get_layout(); + if (!ov::layout::has_batch(layout)) { + item->set_layout(ov::Layout(layout.to_string().insert(1, "N,"))); + } + + shape = item->get_partial_shape(); + if (shape.is_dynamic()) { + throw std::logic_error("Model's input shape \"" + shape.to_string() + "\"" + + " is dynamic which is not supported by SIT"); + } + } + ov::set_batch(model, batch); +} + +// FIXME: User must provide layout explicitly. +// No "default" layout for IRv11 models. +static ov::Layout getLayoutByRank(const size_t rank) { + switch (rank) { + case 0: + return ov::Layout::scalar(); + case 1: + return ov::Layout("C"); + case 2: + return ov::Layout("NC"); + case 3: + return ov::Layout("CHW"); + case 4: + return ov::Layout("NCHW"); + case 5: + return ov::Layout("NCDHW"); + } + throw std::logic_error("Failed to get layout for rank equal to " + std::to_string(rank)); +} + +static std::string toString(const std::vector& vec) { + std::stringstream ss; + if (!vec.empty()) { + ss << "["; + for (size_t i = 0; i < vec.size() - 1; ++i) { + ss << vec[i] << ","; + } + ss << vec[vec.size() - 1]; + ss << "]"; + } else { + ss << "SCALAR"; + } + return ss.str(); +} + +bool testSSDDetection(const TensorMap& outputs, const TensorMap& references, + const TensorDescriptorMap& inputDescriptors, size_t batch_size = 1) { + if (batch_size != 1) { + throw std::runtime_error( + "The testcase 'ssd' doesn't support any `override_model_batch_size` values besides 1 yet"); + } + + OPENVINO_ASSERT(outputs.size() == 1 && references.size() == 1); + OPENVINO_ASSERT(!inputDescriptors.empty(), "No input descriptors received"); + + 
const ov::Tensor& output = outputs.begin()->second; + const ov::Tensor& reference = references.begin()->second; + const TensorDescriptor& inputDescriptor = inputDescriptors.begin()->second; + + const auto imgWidth = inputDescriptor.shape.at(ov::layout::width_idx(inputDescriptor.layout)); + const auto imgHeight = inputDescriptor.shape.at(ov::layout::height_idx(inputDescriptor.layout)); + + auto confThresh = FLAGS_confidence_threshold; + auto probTolerance = FLAGS_prob_tolerance; + auto boxTolerance = FLAGS_box_tolerance; + + auto parsedOutput = utils::parseSSDOutput(output, imgWidth, imgHeight, static_cast(confThresh)); + auto parsedReference = utils::parseSSDOutput(reference, imgWidth, imgHeight, static_cast(confThresh)); + + auto result = checkBBoxOutputs(parsedOutput, parsedReference, imgWidth, imgHeight, static_cast(boxTolerance), + static_cast(probTolerance)); + + return result; +} + +// +// Yolo V2 mode +// +bool testYoloV2(const TensorMap& outputs, const TensorMap& references, const TensorDescriptorMap& inputDescriptors, + size_t batch_size = 1) { + if (batch_size != 1) { + throw std::runtime_error( + "The testcase 'yolo_v2' doesn't support any `override_model_batch_size` values besides 1 yet"); + } + OPENVINO_ASSERT(inputDescriptors.size() == 1, "The YOLO v2 model accepts only a single input"); + OPENVINO_ASSERT(outputs.size() == 1, "The YOLO v2 model a single output"); + OPENVINO_ASSERT(outputs.size() == references.size(), + "Mismatch between the number of model outputs and the number of references"); + const ov::Tensor& output = outputs.begin()->second; + const ov::Tensor& reference = references.begin()->second; + + const TensorDescriptor& inputDescriptor = inputDescriptors.begin()->second; + + const auto imgWidth = inputDescriptor.shape.at(ov::layout::width_idx(inputDescriptor.layout)); + const auto imgHeight = inputDescriptor.shape.at(ov::layout::height_idx(inputDescriptor.layout)); + double confThresh = FLAGS_confidence_threshold; + double probTolerance 
= FLAGS_prob_tolerance; + double boxTolerance = FLAGS_box_tolerance; + bool isTiny = FLAGS_is_tiny_yolo; + + auto parsedOutput = utils::parseYoloOutput(npu::utils::toFP32(output), imgWidth, imgHeight, + static_cast(confThresh), isTiny); + auto parsedReference = utils::parseYoloOutput(npu::utils::toFP32(reference), imgWidth, imgHeight, + static_cast(confThresh), isTiny); + + bool result = checkBBoxOutputs(parsedOutput, parsedReference, imgWidth, imgHeight, static_cast(boxTolerance), + static_cast(probTolerance)); + return result; +} + +// +// Yolo V3 mode +// +bool testYoloV3(const TensorMap& outputs, const TensorMap& references, const TensorDescriptorMap& inputDescriptors, + const LayoutMap& outputLayouts, size_t batch_size = 1) { + if (batch_size != 1) { + throw std::runtime_error( + "The testcase 'yolo_v3' doesn't support any `override_model_batch_size` values besides 1 yet"); + } + OPENVINO_ASSERT(inputDescriptors.size() == 1, "The YOLO v3 model accepts only a single input"); + OPENVINO_ASSERT(outputs.size() == 3, "The YOLO v3 model has three outputs"); + OPENVINO_ASSERT(outputs.size() == references.size(), + "Mismatch between the number of model outputs and the number of references"); + + const TensorDescriptor& inputDescriptor = inputDescriptors.begin()->second; + const auto imgWidth = inputDescriptor.shape.at(ov::layout::width_idx(inputDescriptor.layout)); + const auto imgHeight = inputDescriptor.shape.at(ov::layout::height_idx(inputDescriptor.layout)); + + double confThresh = FLAGS_confidence_threshold; + double probTolerance = FLAGS_prob_tolerance; + double boxTolerance = FLAGS_box_tolerance; + int classes = FLAGS_classes; + int coords = FLAGS_coords; + int num = FLAGS_num; + std::vector anchors = {10.0, 13.0, 16.0, 30.0, 33.0, 23.0, 30.0, 61.0, 62.0, + 45.0, 59.0, 119.0, 116.0, 90.0, 156.0, 198.0, 373.0, 326.0}; + + auto parsedOutput = utils::parseYoloV3Output(outputs, imgWidth, imgHeight, classes, coords, num, anchors, + static_cast(confThresh), 
outputLayouts); + auto parsedReference = utils::parseYoloV3Output(references, imgWidth, imgHeight, classes, coords, num, anchors, + static_cast(confThresh), outputLayouts); + + bool result = checkBBoxOutputs(parsedOutput, parsedReference, imgWidth, imgHeight, static_cast(boxTolerance), + static_cast(probTolerance)); + return result; +} + +// +// Yolo V4 mode +// Ref link: https://docs.openvino.ai/latest/omz_models_model_yolo_v4_tiny_tf.html +// +bool testYoloV4(const TensorMap& outputs, const TensorMap& references, const TensorDescriptorMap& inputDescriptors, + const LayoutMap& outputLayouts, size_t batch_size = 1) { + if (batch_size != 1) { + throw std::runtime_error( + "The testcase 'yolo_v4' doesn't support any `override_model_batch_size` values besides 1 yet"); + } + + OPENVINO_ASSERT(inputDescriptors.size() == 1, "The YOLO v4 model accepts only a single input"); + OPENVINO_ASSERT(outputs.size() == 2, "The YOLO v4 model has two outputs"); + OPENVINO_ASSERT(outputs.size() == references.size(), + "Mismatch between the number of model outputs and the number of references"); + + const TensorDescriptor& inputDescriptor = inputDescriptors.begin()->second; + const auto imgWidth = inputDescriptor.shape.at(ov::layout::width_idx(inputDescriptor.layout)); + const auto imgHeight = inputDescriptor.shape.at(ov::layout::height_idx(inputDescriptor.layout)); + + double confThresh = FLAGS_confidence_threshold; + double probTolerance = FLAGS_prob_tolerance; + double boxTolerance = FLAGS_box_tolerance; + int classes = FLAGS_classes; + int coords = FLAGS_coords; + int num = FLAGS_num; + std::vector anchors = {10.0, 14.0, 23.0, 27.0, 37.0, 58.0, 81.0, 82.0, 135.0, 169.0, 344.0, 319.0}; + std::vector> anchor_mask{{3, 4, 5}, {1, 2, 3}}; + std::vector masked_anchors{}; + for (auto& it : anchor_mask) { + int index = 0; + for (auto& anchorIndex : it) { + if (index >= num) + break; + + index++; + masked_anchors.push_back(anchors[static_cast(2 * anchorIndex)]); + 
masked_anchors.push_back(anchors[static_cast(2 * anchorIndex + 1)]); + } + } + + auto refOutput = utils::parseYoloV4Output(references, imgWidth, imgHeight, classes, coords, num, masked_anchors, + static_cast(confThresh), outputLayouts); + auto actOutput = utils::parseYoloV4Output(outputs, imgWidth, imgHeight, classes, coords, num, masked_anchors, + static_cast(confThresh), outputLayouts); + bool result = checkBBoxOutputs(actOutput, refOutput, imgWidth, imgHeight, static_cast(boxTolerance), + static_cast(probTolerance)); + return result; +} + +// +// MeanIoU mode +// Using sem_seg_classes, sem_seg_threshold flags and optionally sem_seg_ignore_label and dataset flags for validation +// e.g. '--mode mean_iou --sem_seg_classes 12 --sem_seg_threshold 0.98 --sem_seg_ignore_label 11 --dataset camVid12' +// +bool testMeanIoU(const TensorMap& outputs, const TensorMap& references, const LayoutMap& outputLayouts, + size_t batch_size = 1) { + if (batch_size != 1) { + throw std::runtime_error( + "The testcase 'mean_iou' doesn't support any `override_model_batch_size` values besides 1 yet"); + } + + OPENVINO_ASSERT(outputs.size() == 1, "The metric accepts only a single output"); + OPENVINO_ASSERT(outputs.size() == references.size(), + "Mismatch between the number of model outputs and the number of references"); + OPENVINO_ASSERT(outputs.size() == outputLayouts.size(), + "Mismatch between the number of model outputs and their corresponding layout values"); + + unsigned int classes = FLAGS_sem_seg_classes; + auto semSegThreshold = static_cast(FLAGS_sem_seg_threshold); + + std::vector parsedReferences; + std::vector parsedOutputs; + std::vector> iou(classes, {false, 0.0f}); + + utils::argMax_channels(references.begin()->second, parsedReferences, outputLayouts.begin()->second); + utils::argMax_channels(outputs.begin()->second, parsedOutputs, outputLayouts.begin()->second); + + if (parsedReferences.size() != parsedOutputs.size()) { + std::cout << "Reference size and output size are 
different" << std::endl; + return false; + } + iou = utils::mean_IoU(parsedOutputs, parsedReferences, classes, FLAGS_sem_seg_ignore_label); + + return compare_mean_IoU(iou, semSegThreshold, classes); +} + +static int runSingleImageTest() { + std::cout << "Run single image test" << std::endl; + try { + const std::unordered_set allowedPrecision = {"U8", "I32", "I64", "FP16", "FP32"}; + if (!FLAGS_ip.empty()) { + // input precision is U8, I32, I64, FP16 or FP32 only + std::transform(FLAGS_ip.begin(), FLAGS_ip.end(), FLAGS_ip.begin(), ::toupper); + if (allowedPrecision.count(FLAGS_ip) == 0) + throw std::logic_error("Parameter -ip " + FLAGS_ip + " is not supported"); + } + if (!FLAGS_op.empty()) { + // output precision is U8, I32, I64, FP16 or FP32 only + std::transform(FLAGS_op.begin(), FLAGS_op.end(), FLAGS_op.begin(), ::toupper); + if (allowedPrecision.count(FLAGS_op) == 0) + throw std::logic_error("Parameter -op " + FLAGS_op + " is not supported"); + } + + ov::Layout inUserLayout(FLAGS_il); + ov::Layout outUserLayout(FLAGS_ol); + ov::Layout inModelLayout(FLAGS_iml); + ov::Layout outModelLayout(FLAGS_oml); + + std::vector inputFilesPerCase; + std::vector> inputFilesForOneInfer; + + inputFilesPerCase = splitStringList(FLAGS_input, ';'); + for (const auto& images : inputFilesPerCase) { + inputFilesForOneInfer.push_back(splitStringList(images, ',')); + } + + std::vector inputBinPrecisionStrPerCase; + std::vector> inputBinPrecisionForOneInfer(inputFilesForOneInfer.size()); + if (FLAGS_img_as_bin) { + for (std::size_t i = 0; i < inputFilesForOneInfer.size(); ++i) { + inputBinPrecisionForOneInfer[i] = + std::vector(inputFilesForOneInfer[i].size(), ov::element::undefined); + } + inputBinPrecisionStrPerCase = splitStringList(FLAGS_img_bin_precision, ';'); + std::size_t inferIdx = 0; + for (const auto& precisions : inputBinPrecisionStrPerCase) { + std::vector inputBinPrecisionsStrThisInfer = splitStringList(precisions, ','); + std::size_t precisionIdx = 0; + for (const auto& 
precision : inputBinPrecisionsStrThisInfer) { + if (strEq(precision, "FP32")) { + inputBinPrecisionForOneInfer[inferIdx][precisionIdx] = ov::element::f32; + } else if (strEq(precision, "FP16")) { + inputBinPrecisionForOneInfer[inferIdx][precisionIdx] = ov::element::f16; + } else if (strEq(precision, "I32")) { + inputBinPrecisionForOneInfer[inferIdx][precisionIdx] = ov::element::i32; + } else if (strEq(precision, "I64")) { + inputBinPrecisionForOneInfer[inferIdx][precisionIdx] = ov::element::i64; + } else if (strEq(precision, "U8")) { + inputBinPrecisionForOneInfer[inferIdx][precisionIdx] = ov::element::u8; + } else { + std::cout << "WARNING: Unhandled precision '" << precision + << "'! Only FP32, FP16, I32, I64 and U8 can be currently converted to the network's" + << "input tensor precision."; + } + ++precisionIdx; + } + ++inferIdx; + } + } + + if (FLAGS_network.empty()) { + std::cout << "Not enough parameters. Check help." << std::endl; + return EXIT_FAILURE; + } + + ov::Core core; + setupOVCore(core); + + ov::CompiledModel compiledModel; + if (hasLoadableExt(FLAGS_network)) { + std::cout << "Load network " << FLAGS_network << std::endl; + + auto model = core.read_model(FLAGS_network); + nameIOTensors(model); + + setModelBatch(model, FLAGS_override_model_batch_size); + if (FLAGS_device.find("NPU") != std::string::npos || + // FIXME: SIT on CPU also requires to bound dynamic shapes + FLAGS_device.find("CPU") != std::string::npos) { + boundDynamicShape(model); + } + + ov::preprocess::PrePostProcessor ppp(model); + + // Input precision + const auto inputInfo = model->inputs(); + if (!FLAGS_ip.empty()) { + ov::element::Type prc_in = ov::element::u8; + if (FLAGS_ip == "FP16") + prc_in = ov::element::f16; + else if (FLAGS_ip == "FP32") + prc_in = ov::element::f32; + else if (FLAGS_ip == "I32") + prc_in = ov::element::i32; + else if (FLAGS_ip == "I64") + prc_in = ov::element::i64; + else + prc_in = ov::element::u8; + + for (size_t i = 0; i < inputInfo.size(); ++i) { + 
ppp.input(i).tensor().set_element_type(prc_in); + } + } + + // Input layout + if (!inUserLayout.empty()) { + for (size_t i = 0; i < inputInfo.size(); ++i) { + ov::Layout inLayerModelLayout; + if (inModelLayout.empty()) { + const auto shape = inputInfo[i].get_shape(); + inLayerModelLayout = getLayoutByRank(shape.size()); + std::cout << "WARNING: Configuring preprocessing. Since --iml option isn't set, input model " + "layout for layer \"" + << inputInfo[i].get_any_name() << "\" is infered from shape: " << toString(shape) + << " rank (" << shape.size() << ") as " << inLayerModelLayout.to_string() + << std::endl; + } else { + inLayerModelLayout = inModelLayout; + } + ppp.input(i).model().set_layout(inLayerModelLayout); + ppp.input(i).tensor().set_layout(inUserLayout); + } + } + + // Input mean and scale if exist + if (!FLAGS_mean_values.empty() || !FLAGS_scale_values.empty()) { + auto means = parseMeanOrScale(FLAGS_mean_values, inputInfo); + auto scales = parseMeanOrScale(FLAGS_scale_values, inputInfo); + for (size_t i = 0; i < inputInfo.size(); ++i) { + if (!means[i].empty()) { + ppp.input(i).preprocess().convert_element_type(ov::element::f32).mean(means[i]); + } + if (!scales[i].empty()) { + ppp.input(i).preprocess().convert_element_type(ov::element::f32).scale(scales[i]); + } + } + } + + // Output precision + const auto outputInfo = model->outputs(); + if (!FLAGS_op.empty()) { + ov::element::Type prc_out = ov::element::u8; + if (FLAGS_op == "FP16") + prc_out = ov::element::f16; + else if (FLAGS_op == "FP32") + prc_out = ov::element::f32; + else if (FLAGS_op == "I32") + prc_out = ov::element::i32; + else if (FLAGS_op == "I64") + prc_out = ov::element::i64; + else + prc_out = ov::element::u8; + + for (size_t i = 0; i < outputInfo.size(); ++i) { + ppp.output(i).tensor().set_element_type(prc_out); + } + } + + // Output layout + if (!outUserLayout.empty()) { + for (size_t i = 0; i < outputInfo.size(); ++i) { + ov::Layout outLayerModelLayout; + if 
(outModelLayout.empty()) { + const auto shape = outputInfo[i].get_shape(); + outLayerModelLayout = getLayoutByRank(shape.size()); + std::cout << "WARNING: Configuring preprocessing. Since --oml option isn't set, output model " + "layout for layer \"" + << outputInfo[i].get_any_name() << "\" is infered from shape: " << toString(shape) + << " rank (" << shape.size() << ") as " << outLayerModelLayout.to_string() + << std::endl; + } else { + outLayerModelLayout = outModelLayout; + } + ppp.output(i).model().set_layout(outLayerModelLayout); + ppp.output(i).tensor().set_layout(outUserLayout); + } + } + + compiledModel = core.compile_model(ppp.build(), FLAGS_device); + } else { + std::cout << "Import network " << FLAGS_network << std::endl; + + if (!FLAGS_mean_values.empty() || !FLAGS_scale_values.empty()) { + throw std::runtime_error("--mean_values and --scale_values aren't supported for " + "compiled model.\n The values can be set via " + "model_optimizer while generating xml\n"); + } + + std::ifstream file(FLAGS_network, std::ios_base::in | std::ios_base::binary); + OPENVINO_ASSERT(file.is_open(), "Can't open file ", FLAGS_network, " for read"); + compiledModel = core.import_model(file, FLAGS_device); + } + + // store compiled model, if required + if (!FLAGS_compiled_blob.empty()) { + std::ofstream outputFile{FLAGS_compiled_blob, std::ios::out | std::ios::binary}; + if (!outputFile.is_open()) { + std::cerr << "Output file \"" << FLAGS_compiled_blob << "\" can't be opened for writing" << std::endl; + return EXIT_FAILURE; + } else { + compiledModel.export_model(outputFile); + } + } + + auto inferRequest = compiledModel.create_infer_request(); + + std::string netFileName; + { + auto startPos = FLAGS_network.rfind('/'); + if (startPos == std::string::npos) { + startPos = FLAGS_network.rfind('\\'); + if (startPos == std::string::npos) { + startPos = 0; + } + } + + auto endPos = FLAGS_network.rfind('.'); + if (endPos == std::string::npos) { + endPos = FLAGS_network.size(); + 
} + + OPENVINO_ASSERT(endPos > startPos); + netFileName = cleanName(FLAGS_network.substr(startPos, endPos - startPos)); + } + + for (size_t numberOfTestCase = 0; numberOfTestCase < inputFilesPerCase.size(); ++numberOfTestCase) { + const auto inputsInfo = compiledModel.inputs(); + const auto outputsInfo = compiledModel.outputs(); + std::vector inputFiles = inputFilesForOneInfer[numberOfTestCase]; + OPENVINO_ASSERT(inputFiles.size() == inputsInfo.size(), "Number of input files ", inputFiles.size(), + " doesn't match network configuration ", inputsInfo.size()); + + TensorMap inTensors; + size_t inputInd = 0; + std::vector dumpedInputsPaths; + TensorDescriptorMap inputDescriptors; // Several metrics require the input metadata + + // Load the input data + for (const auto& inputInfo : inputsInfo) { + const ov::Shape& shape = inputInfo.get_shape(); + const ov::element::Type& precision = inputInfo.get_element_type(); + + // Determine the input layout + ov::Layout inputLayout; + + if (!inUserLayout.empty()) { + inputLayout = inUserLayout; + } else if (!inModelLayout.empty()) { + inputLayout = inModelLayout; + } else { + inputLayout = getLayoutByRank(shape.size()); + std::cout << "WARNING: Loading input data. Since --iml option isn't set, input model layout for " + "layer \"" + << inputInfo.get_any_name() << "\" is infered from shape: " << toString(shape) + << " rank (" << shape.size() << ") as " << inputLayout.to_string() << std::endl; + } + + inputDescriptors.emplace(inputInfo.get_any_name(), TensorDescriptor{precision, shape, inputLayout}); + + std::cout << "Load input #" << inputInd << " from " << inputFiles[inputInd] << " as " << precision + << " " << inputLayout.to_string() << " " << shape << std::endl; + + const ov::Tensor tensor = + !FLAGS_img_as_bin + ? 
loadInput(precision, shape, inputLayout, inputFiles[inputInd], FLAGS_color_format) + : loadInput(precision, shape, inputLayout, inputFiles[inputInd], FLAGS_color_format, + inputBinPrecisionForOneInfer[numberOfTestCase][inputInd]); + std::ostringstream ostr; + ostr << netFileName << "_input_" << inputInd << "_case_" << numberOfTestCase << ".blob"; + const auto blobFileName = ostr.str(); + + std::cout << "Dump input #" << inputInd << "_case_" << numberOfTestCase << " to " << blobFileName + << std::endl; + dumpTensor(tensor, blobFileName); + + ++inputInd; + + dumpedInputsPaths.push_back(blobFileName); + + inTensors.emplace(inputInfo.get_any_name(), std::move(tensor)); + } + + std::cout << "Run inference on " << FLAGS_device << std::endl; + + const auto startTime = Time::now(); + const auto outInference = runInfer(inferRequest, compiledModel, inTensors, dumpedInputsPaths); + const auto endTime = Time::now(); + + const TensorMap& outputTensors = outInference.first; + + printPerformanceCountsAndLatency(numberOfTestCase, outInference.second, endTime - startTime); + + if (FLAGS_run_test) { + TensorMap referenceTensors; + size_t outputInd = 0; + LayoutMap outputLayouts; // Several metrics may require this + + // Load the reference data + for (const auto& [tensorName, tensor] : outputTensors) { + const ov::element::Type& precision = tensor.get_element_type(); + const ov::Shape& shape = tensor.get_shape(); + + std::ostringstream ostr; + ostr << netFileName << "_ref_out_" << outputInd << "_case_" << numberOfTestCase << ".blob"; + const auto blobFileName = ostr.str(); + + std::cout << "Load reference output #" << outputInd << " from " << blobFileName << " as " + << precision << std::endl; + + const ov::Tensor referenceTensor = loadTensor(precision, shape, blobFileName); + referenceTensors.emplace(tensorName, referenceTensor); + + // Determine the output layout + ov::Layout outputLayout; + + if (!outUserLayout.empty()) { + outputLayout = outUserLayout; + } else if 
(!outModelLayout.empty()) { + outputLayout = outModelLayout; + } else { + outputLayout = getLayoutByRank(shape.size()); + std::cout << "WARNING: Since --oml option isn't set, output model layout for layer \"" + << tensorName << "\" is infered from shape: " << toString(shape) << " rank (" + << shape.size() << ") as " << outputLayout.to_string() << std::endl; + } + + outputLayouts.emplace(tensorName, outputLayout); + + ++outputInd; + } + + outputInd = 0; + + // Dump the outputs obtained upon prediction + for (const auto& tensorEntry : outputTensors) { + std::ostringstream ostr; + ostr << netFileName << "_kmb_out_" << outputInd << "_case_" << numberOfTestCase << ".blob"; + const auto blobFileName = ostr.str(); + + std::cout << "Dump device output #" << outputInd << "_case_" << numberOfTestCase << " to " + << blobFileName << std::endl; + + dumpTensor(tensorEntry.second, blobFileName); + ++outputInd; + } + + // Compare the outputs with their references using the chosen metric + if (strEq(FLAGS_mode, "classification")) { + if (testClassification(outputTensors, referenceTensors, FLAGS_override_model_batch_size)) { + std::cout << "PASSED" << std::endl; + } else { + std::cout << "FAILED" << std::endl; + return EXIT_FAILURE; + } + } else if (strEq(FLAGS_mode, "raw")) { + if (testRAW(outputTensors, referenceTensors, FLAGS_override_model_batch_size)) { + std::cout << "PASSED" << std::endl; + } else { + std::cout << "FAILED" << std::endl; + return EXIT_FAILURE; + } + } else if (strEq(FLAGS_mode, "cosim")) { + if (testCoSim(outputTensors, referenceTensors, FLAGS_override_model_batch_size)) { + std::cout << "PASSED" << std::endl; + } else { + std::cout << "FAILED" << std::endl; + return EXIT_FAILURE; + } + } else if (strEq(FLAGS_mode, "rrmse")) { + if (testRRMSE(outputTensors, referenceTensors, FLAGS_override_model_batch_size)) { + std::cout << "PASSED" << std::endl; + } else { + std::cout << "FAILED" << std::endl; + return EXIT_FAILURE; + } + } else if (strEq(FLAGS_mode, 
"nrmse")) { + if (testNRMSE(outputTensors, referenceTensors, FLAGS_override_model_batch_size)) { + std::cout << "PASSED" << std::endl; + } else { + std::cout << "FAILED" << std::endl; + return EXIT_FAILURE; + } + } else if (strEq(FLAGS_mode, "ssd")) { + if (testSSDDetection(outputTensors, referenceTensors, inputDescriptors, + FLAGS_override_model_batch_size)) { + std::cout << "PASSED" << std::endl; + } else { + std::cout << "FAILED" << std::endl; + return EXIT_FAILURE; + } + } else if (strEq(FLAGS_mode, "yolo_v2")) { + if (testYoloV2(outputTensors, referenceTensors, inputDescriptors, + FLAGS_override_model_batch_size)) { + std::cout << "PASSED" << std::endl; + } else { + std::cout << "FAILED" << std::endl; + return EXIT_FAILURE; + } + } else if (strEq(FLAGS_mode, "yolo_v3")) { + if (testYoloV3(outputTensors, referenceTensors, inputDescriptors, outputLayouts, + FLAGS_override_model_batch_size)) { + std::cout << "PASSED" << std::endl; + } else { + std::cout << "FAILED" << std::endl; + return EXIT_FAILURE; + } + } else if (strEq(FLAGS_mode, "yolo_v4")) { + if (testYoloV4(outputTensors, referenceTensors, inputDescriptors, outputLayouts, + FLAGS_override_model_batch_size)) { + std::cout << "PASSED" << std::endl; + } else { + std::cout << "FAILED" << std::endl; + return EXIT_FAILURE; + } + } else if (strEq(FLAGS_mode, "psnr")) { + const auto& [firstOutputName, firstOutput] = *outputTensors.begin(); + const ov::Shape& shape = firstOutput.get_shape(); + const ov::Layout& outputLayout = outputLayouts.at(firstOutputName); + const size_t dstHeight = shape[ov::layout::height_idx(outputLayout)]; + const size_t dstWidth = shape[ov::layout::width_idx(outputLayout)]; + + if (testPSNR(outputTensors, referenceTensors, static_cast(dstHeight), + static_cast(dstWidth), FLAGS_override_model_batch_size)) { + std::cout << "PASSED" << std::endl; + } else { + std::cout << "FAILED" << std::endl; + return EXIT_FAILURE; + } + } else if (strEq(FLAGS_mode, "mean_iou")) { + if 
(testMeanIoU(outputTensors, referenceTensors, outputLayouts, FLAGS_override_model_batch_size)) { + std::cout << "PASSED" << std::endl; + } else { + std::cout << "FAILED" << std::endl; + return EXIT_FAILURE; + } + } else { + std::cout << "Unknown mode " << FLAGS_mode << std::endl; + return EXIT_FAILURE; + } + } else { + size_t outputInd = 0; + for (const auto& tensorEntry : outputTensors) { + std::ostringstream ostr; + ostr << netFileName << "_ref_out_" << outputInd << "_case_" << numberOfTestCase << ".blob"; + const auto blobFileName = ostr.str(); + + std::cout << "Dump reference output #" << outputInd << " to " << blobFileName << std::endl; + dumpTensor(tensorEntry.second, blobFileName); + + ++outputInd; + } + } + } + } // try + catch (const std::exception& ex) { + std::cerr << "exception: " << ex.what() << std::endl; + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +} + +// +// main +// + +int main(int argc, char* argv[]) { + parseCommandLine(argc, argv); + + return runSingleImageTest(); +} diff --git a/src/plugins/intel_npu/tools/single-image-test/semantic_segmentation_helpers.cpp b/src/plugins/intel_npu/tools/single-image-test/semantic_segmentation_helpers.cpp new file mode 100644 index 00000000000000..ec69a531e5cb37 --- /dev/null +++ b/src/plugins/intel_npu/tools/single-image-test/semantic_segmentation_helpers.cpp @@ -0,0 +1,41 @@ +// +// Copyright (C) 2022-2024 Intel Corporation. 
+// SPDX-License-Identifier: Apache 2.0 +// + +#include "semantic_segmentation_helpers.hpp" +#include "tensor_utils.hpp" + +void utils::argMax_channels(const ov::Tensor& tensor, std::vector& resultArgmax, const ov::Layout& layout) { + OPENVINO_ASSERT(layout == ov::Layout("NCHW") || layout == ov::Layout("NHWC"), + "Unsupported layout: ", layout.to_string()); + + const ov::Tensor tensorFP32 = npu::utils::toFP32(tensor); + const auto dataBuffer = tensorFP32.data(); + + const size_t C = tensorFP32.get_shape()[ov::layout::channels_idx(layout)]; + const size_t H = tensorFP32.get_shape()[ov::layout::height_idx(layout)]; + const size_t W = tensorFP32.get_shape()[ov::layout::width_idx(layout)]; + + for (size_t h = 0; h < H; h++) { + for (size_t w = 0; w < W; w++) { + float argMax = 0.0f; + uint8_t clsIdx = std::numeric_limits::max(); + for (size_t c = 0; c < C; c++) { + size_t offset; + + if (layout == ov::Layout("NCHW")) { + offset = c * H * W + h * W + w; + } else { + offset = h * W * C + w * C + c; + } + + if (argMax < dataBuffer[offset]) { + argMax = dataBuffer[offset]; + clsIdx = static_cast(c); + } + } + resultArgmax.push_back(clsIdx); + } + } +} diff --git a/src/plugins/intel_npu/tools/single-image-test/semantic_segmentation_helpers.hpp b/src/plugins/intel_npu/tools/single-image-test/semantic_segmentation_helpers.hpp new file mode 100644 index 00000000000000..abb86996c3c137 --- /dev/null +++ b/src/plugins/intel_npu/tools/single-image-test/semantic_segmentation_helpers.hpp @@ -0,0 +1,82 @@ +// +// Copyright (C) 2022-2024 Intel Corporation. 
+// SPDX-License-Identifier: Apache 2.0 +// + +// + +#pragma once + +#include +#include + +#include +#include + +namespace utils { + +void argMax_channels(const ov::Tensor& tensor, std::vector& resultArgmax, const ov::Layout& layout); + +template +std::vector> mean_IoU(std::vector actOutput, std::vector refOutput, uint32_t classes, + uint32_t ignoreLabel) { + std::vector output; + for (size_t i = 0; i < refOutput.size(); i++) { + auto mask = (refOutput[i] < classes) & (actOutput[i] < classes); + + if (mask == 1) { + output.push_back(classes * refOutput[i] + actOutput[i]); + } + } + + std::vector binC(classes * classes, 0); + for (size_t i = 0; i < output.size(); i++) { + binC[output[i]]++; + } + + std::vector> hist(classes, std::vector(classes)); + for (size_t i = 0; i < classes; i++) { + for (size_t j = 0; j < classes; j++) { + hist[i][j] = binC[i * classes + j]; + } + } + + if (ignoreLabel != std::numeric_limits::max()) { + // Sanity check + OPENVINO_ASSERT(ignoreLabel < classes); + for (size_t i = 0; i < classes; i++) { + hist[i][ignoreLabel] = 0; + hist[ignoreLabel][i] = 0; + } + } + + std::vector diagonal(classes, 0.0f); + std::vector sum0(classes, 0.0f); + std::vector sum1(classes, 0.0f); + for (size_t i = 0; i < classes; i++) { + for (size_t j = 0; j < classes; j++) { + if (i == j) { + diagonal[i] = static_cast(hist[i][j]); + } + sum0[j] += static_cast(hist[i][j]); + sum1[i] += static_cast(hist[i][j]); + } + } + + std::vector unionVect(classes, 0.0f); + for (size_t i = 0; i < sum0.size(); i++) { + unionVect[i] = sum1[i] + sum0[i] - diagonal[i]; + } + + std::vector> iou(classes, {false, 0.0f}); + for (size_t i = 0; i < diagonal.size() - 1; i++) { + if (unionVect[i] != 0) { + iou[i].first = true; + iou[i].second = (diagonal[i] / unionVect[i]) * 100.0f; + } + } + + return iou; +} + +} // namespace utils diff --git a/src/plugins/intel_npu/tools/single-image-test/yolo_helpers.cpp b/src/plugins/intel_npu/tools/single-image-test/yolo_helpers.cpp new file mode 
100644 index 00000000000000..6740b490d36c15 --- /dev/null +++ b/src/plugins/intel_npu/tools/single-image-test/yolo_helpers.cpp @@ -0,0 +1,476 @@ +// +// Copyright (C) 2022-2024 Intel Corporation. +// SPDX-License-Identifier: Apache 2.0 +// + +#include "yolo_helpers.hpp" +#include "data_type_converters.hpp" +#include "tensor_utils.hpp" + +#include + +static int entryIndex(int lw, int lh, int lcoords, int lclasses, int lnum, int batch, int location, int entry) { + int n = location / (lw * lh); + int loc = location % (lw * lh); + int loutputs = lh * lw * lnum * (lclasses + lcoords + 1); + return batch * loutputs + n * lw * lh * (lcoords + lclasses + 1) + entry * lw * lh + loc; +} + +static utils::Box getRegionBox(float* x, const std::vector& biases, int n, int index, int i, int j, int w, int h, + int stride) { + utils::Box b; + b.x = (i + x[index + 0 * stride]) / w; + b.y = (j + x[index + 1 * stride]) / h; + b.w = std::exp(x[index + 2 * stride]) * biases[2 * n] / w; + b.h = std::exp(x[index + 3 * stride]) * biases[2 * n + 1] / h; + + return b; +} + +static utils::Box getRegionBoxV3V4(const std::vector& predictions, const std::vector& anchor, + int anchor_offset, int n, int box_index, int col, int row, int imw, int imh, + int side, const std::function& transformationFunc) { + utils::Box b; + b.x = (col + transformationFunc(predictions[box_index + 0 * side * side])) / side; + b.y = (row + transformationFunc(predictions[box_index + 1 * side * side])) / side; + b.w = std::exp(predictions[box_index + 2 * side * side]) * anchor[anchor_offset + 2 * n] / imw; + b.h = std::exp(predictions[box_index + 3 * side * side]) * anchor[anchor_offset + 2 * n + 1] / imh; + + return b; +} + +static void correctRegionBoxes(std::vector& boxes, int n, int w, int h, int netw, int neth, int relative) { + int new_w = 0; + int new_h = 0; + if ((static_cast(netw) / w) < (static_cast(neth) / h)) { + new_w = netw; + new_h = (h * netw) / w; + } else { + new_h = neth; + new_w = (w * neth) / h; + } + 
+ OPENVINO_ASSERT(static_cast(boxes.size()) >= n); + for (int i = 0; i < n; ++i) { + utils::Box b = boxes[i]; + b.x = (b.x - (netw - new_w) / 2.0f / netw) / (static_cast(new_w) / netw); + b.y = (b.y - (neth - new_h) / 2.0f / neth) / (static_cast(new_h) / neth); + b.w *= static_cast(netw) / new_w; + b.h *= static_cast(neth) / new_h; + if (!relative) { + b.x *= w; + b.w *= w; + b.y *= h; + b.h *= h; + } + boxes[i] = b; + } +} + +static void getRegionBoxesV3V4(const std::vector>& predictions, int w, int h, int lclasses, + int lcoords, int lnum, const std::vector& anchors, + std::vector>& tensorWH, float thresh, + std::vector>& probs, std::vector& boxes, + const std::function& transformationFunc, + const std::function& anchorFunc) { + for (size_t iout = 0; iout < predictions.size(); ++iout) { + auto lw = static_cast(tensorWH[iout][0]); + auto lh = static_cast(tensorWH[iout][1]); + int anchorOffset = anchorFunc(static_cast(lnum), static_cast(iout)); + + for (int i = 0; i < lw * lh; ++i) { + int row = i / lw; + int col = i % lw; + + for (int n = 0; n < lnum; ++n) { + int obj_index = entryIndex(lw, lh, lcoords, lclasses, lnum, 0, n * lw * lh + i, lcoords); + float scale = predictions[iout][obj_index]; + float max = 0; + for (int j = 0; j < lclasses; ++j) { + int class_index = entryIndex(lw, lh, lcoords, lclasses, lnum, 0, n * lw * lh + i, lcoords + 1 + j); + float prob = transformationFunc(scale) * transformationFunc(predictions[iout][class_index]); + if (max < prob) + max = prob; + } + if (max < thresh) + continue; + + int box_index = entryIndex(lw, lh, lcoords, lclasses, lnum, 0, n * lw * lh + i, 0); + + boxes.push_back(getRegionBoxV3V4(predictions[iout], anchors, anchorOffset, n, box_index, col, row, w, h, + lw, transformationFunc)); + + std::vector prob(lclasses + 1, 0.0); + for (int j = 0; j < lclasses; ++j) { + int class_index = entryIndex(lw, lh, lcoords, lclasses, lnum, 0, n * lw * lh + i, lcoords + 1 + j); + float probability = transformationFunc(scale) * 
transformationFunc(predictions[iout][class_index]); + prob[j] = probability > thresh ? probability : 0; + } + prob[lclasses] = max; + probs.push_back(prob); + } + } + } +} + +static void getRegionBoxes(std::vector& predictions, int lw, int lh, int lcoords, int lclasses, int lnum, int w, + int h, int netw, int neth, float thresh, std::vector>& probs, + std::vector& boxes, int relative, const std::vector& anchors) { + for (int i = 0; i < lw * lh; ++i) { + int row = i / lw; + int col = i % lw; + for (int n = 0; n < lnum; ++n) { + int index = n * lw * lh + i; + int obj_index = entryIndex(lw, lh, lcoords, lclasses, lnum, 0, n * lw * lh + i, lcoords); + int box_index = entryIndex(lw, lh, lcoords, lclasses, lnum, 0, n * lw * lh + i, 0); + float scale = predictions[obj_index]; + + boxes[index] = getRegionBox(predictions.data(), anchors, n, box_index, col, row, lw, lh, lw * lh); + + float max = 0; + for (int j = 0; j < lclasses; ++j) { + int class_index = entryIndex(lw, lh, lcoords, lclasses, lnum, 0, n * lw * lh + i, lcoords + 1 + j); + float prob = scale * predictions[class_index]; + probs[index][j] = (prob > thresh) ? prob : 0; + if (prob > max) + max = prob; + } + probs[index][lclasses] = max; + } + } + correctRegionBoxes(boxes, lw * lh * lnum, w, h, netw, neth, relative); +} + +struct sortableYoloBBox { + int index; + int cclass; + std::vector> probs; + sortableYoloBBox(int index, int cclass, std::vector>& probs) + : index(index), cclass(cclass), probs(probs){} +}; + +static float overlap(float x1, float w1, float x2, float w2) { + const float l1 = x1 - w1 / 2; + const float l2 = x2 - w2 / 2; + const float left = l1 > l2 ? l1 : l2; + + const float r1 = x1 + w1 / 2; + const float r2 = x2 + w2 / 2; + const float right = r1 < r2 ? 
r1 : r2; + + return right - left; +} + +static float boxIntersection(const utils::Box& a, const utils::Box& b) { + const float w = overlap(a.x, a.w, b.x, b.w); + const float h = overlap(a.y, a.h, b.y, b.h); + + if (w < 0 || h < 0) { + return 0.0f; + } + + return w * h; +} + +static float boxUnion(const utils::Box& a, const utils::Box& b) { + const float i = boxIntersection(a, b); + return a.w * a.h + b.w * b.h - i; +} + +float utils::boxIntersectionOverUnion(const utils::Box& a, const utils::Box& b) { + return boxIntersection(a, b) / boxUnion(a, b); +} + +static void doNonMaximumSupressionSort(std::vector& boxes, std::vector>& probs, + int total, int classes, float thresh) { + std::vector boxCandidates; + + for (int i = 0; i < total; ++i) { + sortableYoloBBox candidate(i, 0, probs); + boxCandidates.push_back(candidate); + } + + for (int k = 0; k < classes; ++k) { + for (int i = 0; i < total; ++i) { + boxCandidates[i].cclass = k; + } + std::sort(boxCandidates.begin(), boxCandidates.end(), [](const sortableYoloBBox& a, const sortableYoloBBox& b) { + float diff = a.probs[a.index][b.cclass] - b.probs[b.index][b.cclass]; + return diff > 0; + }); + for (int i = 0; i < total; ++i) { + if (probs[boxCandidates[i].index][k] == 0) + continue; + utils::Box a = boxes[boxCandidates[i].index]; + for (int j = i + 1; j < total; ++j) { + utils::Box b = boxes[boxCandidates[j].index]; + if (utils::boxIntersectionOverUnion(a, b) > thresh) { + probs[boxCandidates[j].index][k] = 0; + } + } + } + } +} + +static size_t maxIndex(std::vector& a, int n) { + OPENVINO_ASSERT(n > 0, "Expected a positive number of classes, got {0}", n); + int max_i = 0; + float max = a[0]; + for (int i = 1; i < n; ++i) { + if (a[i] > max) { + max = a[i]; + max_i = i; + } + } + return npu::utils::convertValuePrecision(max_i); +} + +static float clampToImageSize(const float& valueToClamp, const float& low, const float& high) { + float result = valueToClamp; + if (valueToClamp > high) { + result = high; + } else if 
(valueToClamp < low) { + result = low; + } else { + result = valueToClamp; + } + + return result; +} + +static void getDetections(int imw, int imh, int num, float thresh, utils::Box* boxes, + std::vector>& probs, int classes, + std::vector& detect_result) { + for (int i = 0; i < num; ++i) { + auto idxClass = static_cast(maxIndex(probs[i], classes)); + float prob = probs[i][idxClass]; + + if (prob > thresh) { + utils::Box b = boxes[i]; + + float left = (b.x - b.w / 2.0f) * imw; + float right = (b.x + b.w / 2.0f) * imw; + float top = (b.y - b.h / 2.0f) * imh; + float bot = (b.y + b.h / 2.0f) * imh; + float clampedLeft = clampToImageSize(left, 0.0f, static_cast(imw)); + float clampedRight = clampToImageSize(right, 0.0f, static_cast(imw)); + float clampedTop = clampToImageSize(top, 0.0f, static_cast(imh)); + float clampedBottom = clampToImageSize(bot, 0.0f, static_cast(imh)); + + utils::BoundingBox bx(idxClass, clampedLeft, clampedTop, clampedRight, clampedBottom, prob); + detect_result.push_back(bx); + } + } +} + +static std::vector yolov2BoxExtractor(float threshold, std::vector& net_out, int imgWidth, + int imgHeight, int class_num, bool isTiny) { + int classes = class_num; + int coords = 4; + int num = 5; + std::vector boxes_result; + + std::vector TINY_YOLOV2_ANCHORS = {1.08f, 1.19f, 3.42f, 4.41f, 6.63f, 11.38f, 9.42f, 5.11f, 16.62f, 10.52f}; + std::vector YOLOV2_ANCHORS = {1.3221f, 1.73145f, 3.19275f, 4.00944f, 5.05587f, + 8.09892f, 9.47112f, 4.84053f, 11.2364f, 10.0071f}; + std::vector YOLOV2_ANCHORS_80_CLASSES = {0.57273f, 0.677385f, 1.87446f, 2.06253f, 3.33843f, + 5.47434f, 7.88282f, 3.52778f, 9.77052f, 9.16828f}; + + int imw = 416; + int imh = 416; + + int lw = 13; + int lh = 13; + float nms = 0.4f; + + std::vector boxes(lw * lh * num); + std::vector> probs(lw * lh * num, std::vector(classes + 1, 0.0f)); + + // TODO refactoring ticket S#37819 + std::vector* anchors = nullptr; + if (isTiny) { + anchors = &TINY_YOLOV2_ANCHORS; + } else { + anchors = 
&YOLOV2_ANCHORS; + if (class_num == 80) { + anchors = &YOLOV2_ANCHORS_80_CLASSES; + } + } + + getRegionBoxes(net_out, lw, lh, coords, classes, num, imgWidth, imgHeight, imw, imh, threshold, probs, boxes, 1, + *anchors); + + doNonMaximumSupressionSort(boxes, probs, lw * lh * num, classes, nms); + getDetections(imgWidth, imgHeight, lw * lh * num, threshold, boxes.data(), probs, classes, boxes_result); + + return boxes_result; +} + +static std::vector yolov3v4BoxExtractor( + std::vector>& net_out, int imgW, int imgH, int classes, int coords, int num, + const std::vector& anchors, std::vector>& tensorWH, float threshold, float nms, + const std::function& transformationFunc, + const std::function& anchorFunc) { + std::vector boxes_result; + std::vector boxes; + std::vector> probs; + + getRegionBoxesV3V4(net_out, imgW, imgH, classes, coords, num, anchors, tensorWH, threshold, probs, boxes, + transformationFunc, anchorFunc); + doNonMaximumSupressionSort(boxes, probs, static_cast(probs.size()), classes, nms); + getDetections(imgW, imgH, static_cast(probs.size()), threshold, boxes.data(), probs, classes, boxes_result); + + return boxes_result; +} + +static std::vector SSDBoxExtractor(const float threshold, std::vector& net_out, + const size_t imgWidth, const size_t imgHeight) { + std::vector boxes_result; + + if (net_out.empty()) { + return boxes_result; + } + size_t oneDetectionSize = 7; + + OPENVINO_ASSERT(net_out.size() % oneDetectionSize == 0); + + for (size_t i = 0; i < net_out.size() / oneDetectionSize; i++) { + if (net_out[i * oneDetectionSize + 2] > threshold) { + boxes_result.emplace_back(npu::utils::convertValuePrecision(net_out[i * oneDetectionSize + 1]), + net_out[i * oneDetectionSize + 3] * imgWidth, + net_out[i * oneDetectionSize + 4] * imgHeight, + net_out[i * oneDetectionSize + 5] * imgWidth, + net_out[i * oneDetectionSize + 6] * imgHeight, net_out[i * oneDetectionSize + 2]); + } + } + + return boxes_result; +} + +std::vector utils::parseYoloOutput(const 
ov::Tensor& tensor, const size_t imgWidth, + const size_t imgHeight, const float confThresh, + const bool isTiny) { + const auto dataBuffer = tensor.data(); + OPENVINO_ASSERT(dataBuffer != nullptr); + + std::vector results(tensor.get_size()); + for (size_t i = 0; i < tensor.get_size(); i++) { + results[i] = dataBuffer[i]; + } + + std::vector out; + int classes = 20; + out = yolov2BoxExtractor(confThresh, results, static_cast(imgWidth), static_cast(imgHeight), classes, + isTiny); + + return out; +} + +std::vector utils::parseYoloV3Output(const std::map& tensors, + const size_t imgWidth, const size_t imgHeight, + const int classes, const int coords, const int num, + const std::vector& anchors, const float confThresh, + const std::unordered_map& layouts) { + auto funcV3 = [](const float x) -> float { + return x; + }; + + auto anchorOffsetV3 = [&](const size_t iout, const int lnum) -> int { + return static_cast((tensors.size() - 1) * (lnum * (tensors.size() - 1 - iout))); + }; + + auto result = parseYoloV3V4Output(tensors, imgWidth, imgHeight, classes, coords, num, anchors, confThresh, layouts, + funcV3, anchorOffsetV3); + return result; +} + +std::vector utils::parseYoloV4Output(const std::map& tensors, + const size_t imgWidth, const size_t imgHeight, + const int classes, const int coords, const int num, + const std::vector& anchors, const float confThresh, + const std::unordered_map& layouts) { + auto funcV4 = [](const float x) -> float { + return 1 / (1 + std::exp(-x)); + }; + + auto anchorOffsetV4 = [&](const size_t iout, const int lnum) -> int { + return static_cast(lnum * 2 * iout); + }; + + auto result = parseYoloV3V4Output(tensors, imgWidth, imgHeight, classes, coords, num, anchors, confThresh, layouts, + funcV4, anchorOffsetV4); + return result; +} + +std::vector utils::parseYoloV3V4Output( + const std::map& tensors, const size_t imgWidth, const size_t imgHeight, + const int classes, const int coords, const int num, const std::vector& anchors, const float 
confThresh, + const std::unordered_map& layouts, + const std::function& transformationFunc, + const std::function& anchorFunc) { + std::vector> results; + std::vector> tensorWH; + + for (const auto& [tensorName, tensor] : tensors) { + const ov::Tensor tensorFP32 = npu::utils::toFP32(tensor); + const auto dataBuffer = tensorFP32.data(); + OPENVINO_ASSERT(dataBuffer != nullptr); + + const ov::Layout& layout = layouts.at(tensorName); + + const size_t H = tensorFP32.get_shape()[ov::layout::height_idx(layout)]; + const size_t W = tensorFP32.get_shape()[ov::layout::width_idx(layout)]; + + std::vector result(tensorFP32.get_size()); + if (layout == ov::Layout("NCHW")) { + for (size_t j = 0; j < tensorFP32.get_size(); j++) { + result[j] = dataBuffer[j]; + } + } else if (layout == ov::Layout("NHWC")) { + const size_t C = tensorFP32.get_shape()[ov::layout::channels_idx(layout)]; + + // TODO may be using copyTensor is good decision but can't find a way how include it + for (size_t c = 0; c < C; c++) { + for (size_t h = 0; h < H; h++) { + for (size_t w = 0; w < W; w++) { + result[c * H * W + h * W + w] = dataBuffer[h * W * C + w * C + c]; + } + } + } + } + results.push_back(result); + tensorWH.push_back(std::vector{W, H}); + } + + return yolov3v4BoxExtractor(results, static_cast(imgWidth), static_cast(imgHeight), classes, coords, num, + anchors, tensorWH, confThresh, 0.4f, transformationFunc, anchorFunc); +} + +std::vector utils::parseSSDOutput(const ov::Tensor& tensor, const size_t imgWidth, + const size_t imgHeight, const float confThresh) { + const ov::Tensor tensorFP32 = npu::utils::toFP32(tensor); + + const auto dataBuffer = tensorFP32.data(); + std::vector results(tensorFP32.get_size()); + std::copy_n(dataBuffer, tensorFP32.get_size(), results.begin()); + + std::vector out; + out = SSDBoxExtractor(confThresh, results, imgWidth, imgHeight); + return out; +} + +void utils::printDetectionBBoxOutputs(const std::vector& actualOutput, + std::ostringstream& outputStream, const 
std::vector& labels) { + outputStream << "Actual top:" << std::endl; + for (size_t i = 0; i < actualOutput.size(); ++i) { + const auto& bb = actualOutput[i]; + outputStream << i << " : "; + if (static_cast(labels.size()) < bb.idx) { + outputStream << bb.idx; + } else { + outputStream << labels.at(bb.idx); + } + outputStream << " : [(" << bb.left << " " << bb.top << "), (" << bb.right << " " << bb.bottom + << ")] : " << bb.prob * 100 << "%" << std::endl; + } +} diff --git a/src/plugins/intel_npu/tools/single-image-test/yolo_helpers.hpp b/src/plugins/intel_npu/tools/single-image-test/yolo_helpers.hpp new file mode 100644 index 00000000000000..3dc7132761f42b --- /dev/null +++ b/src/plugins/intel_npu/tools/single-image-test/yolo_helpers.hpp @@ -0,0 +1,56 @@ +// +// Copyright (C) 2022-2024 Intel Corporation. +// SPDX-License-Identifier: Apache 2.0 +// + +// + +#pragma once + +#include +#include + +#include + +namespace utils { +struct Box final { + float x, y, w, h; +}; + +struct BoundingBox final { + int idx; + float left, right, top, bottom; + float prob; + BoundingBox(int idx, float xmin, float ymin, float xmax, float ymax, float prob) + : idx(idx), left(xmin), right(xmax), top(ymin), bottom(ymax), prob(prob) { + } +}; + +std::vector parseYoloOutput(const ov::Tensor& tensor, const size_t imgWidth, const size_t imgHeight, + const float confThresh, const bool isTiny); + +std::vector parseYoloV3Output(const std::map& tensors, const size_t imgWidth, + const size_t imgHeight, const int classes, const int coords, const int num, + const std::vector& anchors, const float confThresh, + const std::unordered_map& layouts); + +std::vector parseYoloV4Output(const std::map& tensors, const size_t imgWidth, + const size_t imgHeight, const int classes, const int coords, const int num, + const std::vector& anchors, const float confThresh, + const std::unordered_map& layouts); + +std::vector parseYoloV3V4Output(const std::map& tensors, const size_t imgWidth, + const size_t imgHeight, 
const int classes, const int coords, const int num, + const std::vector& anchors, const float confThresh, + const std::unordered_map& layouts, + const std::function& transformationFunc, + const std::function& anchorFunc); + +std::vector parseSSDOutput(const ov::Tensor& tensor, const size_t imgWidth, const size_t imgHeight, + const float confThresh); + +void printDetectionBBoxOutputs(const std::vector& actualOutput, std::ostringstream& outputStream, + const std::vector& labels = {}); + +float boxIntersectionOverUnion(const Box& a, const Box& b); +} // namespace utils