7 changes: 7 additions & 0 deletions src/inference/dev_api/ie_system_conf.h
@@ -121,6 +121,13 @@ using ov::with_cpu_x86_avx512_core_vnni;
*/
using ov::with_cpu_x86_bfloat16;

/**
* @brief Checks whether CPU supports fp16 capability
* @ingroup ie_dev_api_system_conf
* @return `True` if AVX512_FP16 instructions are available, `false` otherwise
*/
using ov::with_cpu_x86_avx512_core_fp16;

/**
* @brief Checks whether CPU supports AMX int8 capability
* @ingroup ie_dev_api_system_conf
7 changes: 7 additions & 0 deletions src/inference/dev_api/openvino/runtime/system_conf.hpp
@@ -110,6 +110,13 @@ OPENVINO_RUNTIME_API bool with_cpu_x86_avx512_core_vnni();
*/
OPENVINO_RUNTIME_API bool with_cpu_x86_bfloat16();

/**
* @brief Checks whether CPU supports fp16 capability
* @ingroup ov_dev_api_system_conf
* @return `True` if AVX512_FP16 instructions are available, `false` otherwise
*/
OPENVINO_RUNTIME_API bool with_cpu_x86_avx512_core_fp16();

/**
* @brief Checks whether CPU supports AMX int8 capability
* @ingroup ov_dev_api_system_conf
7 changes: 7 additions & 0 deletions src/inference/src/system_conf.cpp
@@ -72,6 +72,10 @@ bool with_cpu_x86_bfloat16() {
return get_cpu_info().has(Xbyak::util::Cpu::tAVX512_BF16);
}

bool with_cpu_x86_avx512_core_fp16() {
return get_cpu_info().has(Xbyak::util::Cpu::tAVX512_FP16);
}

bool with_cpu_x86_avx512_core_amx_int8() {
return get_cpu_info().has(Xbyak::util::Cpu::tAMX_INT8);
}
@@ -107,6 +111,9 @@ bool with_cpu_x86_avx512_core_vnni() {
bool with_cpu_x86_bfloat16() {
return false;
}
bool with_cpu_x86_avx512_core_fp16() {
return false;
}
bool with_cpu_x86_avx512_core_amx_int8() {
return false;
}
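A minimal usage sketch (not part of this PR): callers can gate an fp16 code path on the new capability flag the same way the existing bf16 check is used. The include path and namespace are assumptions based on the dev API header changed above.

```cpp
#include "openvino/runtime/system_conf.hpp"  // assumed include root for the dev API header

// True only when the CPU reports AVX512_FP16 support; callers are expected to fall
// back to an fp32 (or bf16) path otherwise, mirroring the non-x86 stub that always
// returns false.
bool can_use_fp16_kernels() {
    return ov::with_cpu_x86_avx512_core_fp16();
}
```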
3 changes: 2 additions & 1 deletion src/plugins/intel_cpu/src/graph.cpp
@@ -311,7 +311,8 @@ void Graph::Replicate(const CNNNetwork &network) {
const auto childEdges = input.second->getChildEdgesAtPort(0);
for (size_t i = 0; i < childEdges.size(); i++) {
const auto child = childEdges[i]->getChild();
if (child->getOriginalInputPrecisionAtPort(childEdges[i]->getOutputNum()) != Precision::BF16 &&
if (!one_of(child->getOriginalInputPrecisionAtPort(childEdges[i]->getOutputNum()),
Precision::BF16, Precision::FP16) &&
// remove this WA when #78939 is resolved
!hasSubgraphConsumers(child))
child->setOriginalInputPrecisionAtPort(childEdges[i]->getOutputNum(), precToSet);
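The rewritten condition relies on the plugin's `one_of` helper. A hedged sketch of its assumed semantics, so the new check reads as "input precision is neither BF16 nor FP16":

```cpp
// Assumed behaviour of the intel_cpu one_of utility: true when the first argument
// compares equal to any of the remaining candidates (C++17 fold expression).
template <typename T, typename... Args>
constexpr bool one_of(const T& value, const Args&... candidates) {
    return ((value == candidates) || ...);
}
```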
63 changes: 53 additions & 10 deletions src/plugins/intel_cpu/src/nodes/reduce.cpp
@@ -108,7 +108,7 @@ bool ReduceKey::operator==(const ReduceKey &rhs) const {

// some utility functions
static inline bool isFloatCompatible(memory::data_type type) {
return memory::data_type::f32 == type || memory::data_type::bf16 == type;
return memory::data_type::f32 == type || memory::data_type::bf16 == type || memory::data_type::f16 == type;
}

template <cpu_isa_t isa>
@@ -590,6 +590,7 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene
}
break;
case memory::data_type::bf16:
case memory::data_type::f16:
case memory::data_type::s8:
case memory::data_type::u8:
pack_gathered_vector(vmm_src, vmm_idx, offset, jcp_.src_dt);
@@ -614,8 +615,9 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene
mov(ptr[rsp + i * sizeof(int)], reg_tmp_64.cvt32());
break;
case memory::data_type::bf16:
case memory::data_type::f16:
mov(reg_tmp_64.cvt16(), table_idx);
mov(ptr[rsp + i * sizeof(ov::intel_cpu::bfloat16_t)], reg_tmp_64.cvt16());
mov(ptr[rsp + i * 2], reg_tmp_64.cvt16());
break;
case memory::data_type::s8:
case memory::data_type::u8:
@@ -635,7 +637,10 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene
case memory::data_type::bf16:
uni_vpmovzxwd(vmm_val, ptr[rsp]);
uni_vpslld(vmm_val, vmm_val, 16);
break;
break;
case memory::data_type::f16:
vcvtph2ps(vmm_val, ptr[rsp]);
break;
case memory::data_type::s8:
uni_vpmovsxbd(vmm_val, ptr[rsp]);
break;
@@ -890,6 +895,9 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene
uni_vpmovzxwd(vmm_src, op);
uni_vpslld(vmm_src, vmm_src, 16);
break;
case memory::data_type::f16:
vcvtph2ps(vmm_src, op);
break;
case memory::data_type::s8:
uni_vpmovsxbd(vmm_src, op);
break;
@@ -914,6 +922,9 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene
uni_vpinsrw(xmm_src, xmm_src, op, 0x0);
uni_vpslld(xmm_src, xmm_src, 16);
break;
case memory::data_type::f16:
vcvtph2ps(xmm_src, op);
break;
case memory::data_type::s8:
movsx(reg_tmp_32, op);
uni_vmovq(xmm_src, reg_tmp_64);
Expand Down Expand Up @@ -948,6 +959,9 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene
uni_vcvtneps2bf16->emit_code({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(ymm_dst.getIdx())});
vmovdqu16(op, ymm_dst);
break;
case memory::data_type::f16:
vcvtps2ph(op, vmm_dst, 0x4);
break;
case memory::data_type::s8:
if (isa == cpu::x64::avx512_core) {
vpmovsdb(op, vmm_dst);
@@ -996,6 +1010,9 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene
uni_vpsrld(xmm_dst, xmm_dst, 16);
uni_vpextrw(op, xmm_dst, 0x0);
break;
case memory::data_type::f16:
vcvtps2ph(op, xmm_dst, 0x4);
break;
case memory::data_type::s8:
uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst);
uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
@@ -1540,6 +1557,9 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
uni_vpmovzxwd(vmm_src, op);
uni_vpslld(vmm_src, vmm_src, 16);
break;
case memory::data_type::f16:
vcvtph2ps(vmm_src, op);
break;
case memory::data_type::s8:
uni_vpmovsxbd(vmm_src, op);
break;
@@ -1564,6 +1584,9 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
uni_vpinsrw(xmm_src, xmm_src, op, 0x0);
uni_vpslld(xmm_src, xmm_src, 16);
break;
case memory::data_type::f16:
vcvtph2ps(xmm_src, op);
break;
case memory::data_type::s8:
movsx(reg_tmp_32, op);
uni_vmovq(xmm_src, reg_tmp_64);
Expand Down Expand Up @@ -1598,6 +1621,9 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
uni_vcvtneps2bf16->emit_code({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(ymm_dst.getIdx())});
vmovdqu16(op, ymm_dst);
break;
case memory::data_type::f16:
vcvtps2ph(op, vmm_dst, 0x4);
break;
case memory::data_type::s8:
if (isa == cpu::x64::avx512_core) {
vpmovsdb(op, vmm_dst);
@@ -1646,6 +1672,9 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
uni_vpsrld(xmm_dst, xmm_dst, 16);
uni_vpextrw(op, xmm_dst, 0x0);
break;
case memory::data_type::f16:
vcvtps2ph(op, xmm_dst, 0x4);
break;
case memory::data_type::s8:
uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst);
uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
@@ -1878,16 +1907,20 @@ void Reduce::initSupportedPrimitiveDescriptors() {

jit_mode = canApplyJIT(input_prec, output_prec);

auto is_precision_sensitive_reduce = [](const Algorithm &algorithm) {
return algorithm != Algorithm::ReduceAnd && algorithm != Algorithm::ReduceOr &&
algorithm != Algorithm::ReduceMin && algorithm != Algorithm::ReduceMax;
};

if (jit_mode) {
// Since in jit mode we use the output memory as an intermediate accumulator for certain reduce modes, we can't use BF16 output precision due to
// Since in jit mode we use the output memory as an intermediate accumulator for certain reduce modes, we can't use BF16/FP16 output precision due to
// the possible accuracy loss. Therefore, for such modes, we will change the output precision to FP32.
if (Precision::BF16 == output_prec) {
if (!mayiuse(avx512_core)) {
output_prec = Precision::FP32;
} else if (algorithm != Algorithm::ReduceAnd && algorithm != Algorithm::ReduceOr &&
algorithm != Algorithm::ReduceMin && algorithm != Algorithm::ReduceMax) {
output_prec = Precision::FP32;
}
if (!mayiuse(avx512_core) || is_precision_sensitive_reduce(algorithm))
output_prec = Precision::FP32;
} else if (Precision::FP16 == output_prec) {
if (!mayiuse(cpu::x64::avx2) || is_precision_sensitive_reduce(algorithm))
output_prec = Precision::FP32;
}
}
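A worked illustration of the accuracy concern behind the fallback above (a standalone sketch, not plugin code; assumes an x86 toolchain with F16C so the scalar `_cvtss_sh`/`_cvtsh_ss` intrinsics are available): once a running fp16 sum reaches 2048, adding 1 no longer changes it, which is why Sum/Prod-like modes keep an fp32 destination while And/Or/Min/Max can stay in the narrow type.

```cpp
#include <immintrin.h>
#include <cstdio>

int main() {
    float sum_fp32 = 0.f;
    unsigned short sum_fp16 = _cvtss_sh(0.f, _MM_FROUND_CUR_DIRECTION);
    for (int i = 0; i < 4096; ++i) {
        sum_fp32 += 1.f;
        // Emulate accumulating directly into fp16 memory: widen, add, narrow each step.
        sum_fp16 = _cvtss_sh(_cvtsh_ss(sum_fp16) + 1.f, _MM_FROUND_CUR_DIRECTION);
    }
    // Prints 4096 for fp32 but 2048 for the fp16 accumulator: 2048 + 1 rounds back to
    // 2048 in half precision (11-bit significand), so the sum stalls.
    std::printf("fp32: %.0f  fp16: %.0f\n", sum_fp32, _cvtsh_ss(sum_fp16));
    return 0;
}
```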

@@ -2862,6 +2895,9 @@ inline void Reduce::init_dst_data(uint8_t *out_ptr, size_t dst_size) {
} else if (output_prec == Precision::BF16) {
auto out_p = reinterpret_cast<bfloat16_t*>(out_ptr);
parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = static_cast<bfloat16_t>(1); });
} else if (output_prec == Precision::FP16) {
auto out_p = reinterpret_cast<ov::float16*>(out_ptr);
parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = static_cast<ov::float16>(1); });
} else if (output_prec == Precision::U8) {
auto out_p = reinterpret_cast<uint8_t *>(out_ptr);
parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = static_cast<uint8_t>(1); });
@@ -2880,6 +2916,9 @@ inline void Reduce::init_dst_data(uint8_t *out_ptr, size_t dst_size) {
} else if (output_prec == Precision::BF16) {
auto out_p = reinterpret_cast<bfloat16_t*>(out_ptr);
parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits<bfloat16_t>::lowest(); });
} else if (output_prec == Precision::FP16) {
auto out_p = reinterpret_cast<ov::float16*>(out_ptr);
parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits<ov::float16>::lowest(); });
} else if (output_prec == Precision::U8) {
auto out_p = reinterpret_cast<uint8_t *>(out_ptr);
parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits<uint8_t>::min(); });
@@ -2898,6 +2937,9 @@ inline void Reduce::init_dst_data(uint8_t *out_ptr, size_t dst_size) {
} else if (output_prec == Precision::BF16) {
auto out_p = reinterpret_cast<bfloat16_t*>(out_ptr);
parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits<bfloat16_t>::max(); });
} else if (output_prec == Precision::FP16) {
auto out_p = reinterpret_cast<ov::float16*>(out_ptr);
parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits<ov::float16>::max(); });
} else if (output_prec == Precision::U8) {
auto out_p = reinterpret_cast<uint8_t *>(out_ptr);
parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits<uint8_t>::max(); });
@@ -3268,6 +3310,7 @@ std::vector<int> Reduce::update_src_dims() {
bool Reduce::canApplyJIT(const Precision &input_prec, const Precision &output_prec) const {
static const Precision supportedPrecisions[] = {
Precision::FP32,
Precision::FP16,
Precision::BF16,
Precision::I32,
Precision::I8,
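For readers less familiar with the new kernel cases, here is a rough C++ intrinsics equivalent of the conversions the JIT code emits (a sketch assuming F16C/AVX support, not a drop-in for the generated code): `vcvtph2ps` widens packed halves to floats on load, `vcvtps2ph` narrows on store, and the `0x4` immediate used above selects rounding per the current MXCSR mode.

```cpp
#include <immintrin.h>
#include <cstdint>

// Load path: 8 fp16 values widened to fp32 (vcvtph2ps).
void widen_halves(const uint16_t* src, float* dst) {
    __m128i halves = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
    _mm256_storeu_ps(dst, _mm256_cvtph_ps(halves));
}

// Store path: 8 fp32 values narrowed back to fp16 (vcvtps2ph);
// _MM_FROUND_CUR_DIRECTION corresponds to the 0x4 immediate, i.e. round per MXCSR.
void narrow_floats(const float* src, uint16_t* dst) {
    __m128i halves = _mm256_cvtps_ph(_mm256_loadu_ps(src), _MM_FROUND_CUR_DIRECTION);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), halves);
}
```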
@@ -74,7 +74,7 @@ std::vector<std::string> disabledTestPatterns() {
R"(.*OVCompiledModelBaseTest.*(CanGetInputsInfoAndCheck|canSetConfigToCompiledModel).*)",
R"(.*Behavior.*CorrectConfigCheck.*(canSetConfigAndCheckGetConfig|canSetConfigTwiceAndCheckGetConfig).*CPU_BIND_THREAD=YES.*)",
// Issue: 72021 Unreasonable abs_threshold for comparing bf16 results
R"(.*smoke_Reduce.*type=(Prod|Min).*netPRC=(BF|bf)16.*)",
R"(.*smoke_Reduce.*type=(Prod|Min).*INFERENCE_PRECISION_HINT=(BF|bf)16.*)",
// TODO: 56520 Accuracy mismatch
R"(.*ReduceOpsLayerTest.*type=Mean_.*netPRC=(I64|I32).*)",
R"(.*ReduceOpsLayerTest.*type=Mean_.*netPRC=U64.*)",
@@ -246,6 +246,12 @@ std::vector<std::string> disabledTestPatterns() {
retVector.emplace_back(R"(.*Snippets.*MHA.*)");
retVector.emplace_back(R"(.*Snippets.*(MatMul|Matmul).*)");
}
#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
if (!InferenceEngine::with_cpu_x86_avx512_core_fp16()) {
// Skip fp16 tests for platforms that don't support fp16 precision
retVector.emplace_back(R"(.*INFERENCE_PRECISION_HINT=(F|f)16.*)");
}
#endif
if (!InferenceEngine::with_cpu_x86_avx512_core_vnni() && !InferenceEngine::with_cpu_x86_avx512_core_amx_int8()) {
// MatMul in Snippets uses BRGEMM that supports i8 only on platforms with VNNI or AMX instructions
retVector.emplace_back(R"(.*Snippets.*MatMulFQ.*)");
@@ -18,7 +18,8 @@ std::string ReduceCPULayerTest::getTestCaseName(testing::TestParamInfo<ReduceLay
basicReduceParams basicParams;
CPUSpecificParams cpuParams;
fusingSpecificParams fusingParams;
std::tie(basicParams, cpuParams, fusingParams) = obj.param;
std::map<std::string, ov::element::Type> additionalConfig;
std::tie(basicParams, cpuParams, fusingParams, additionalConfig) = obj.param;

std::vector<int> axes;
CommonTestUtils::OpType opType;
@@ -51,6 +52,13 @@ std::string ReduceCPULayerTest::getTestCaseName(testing::TestParamInfo<ReduceLay
result << "inPRC=" << inPrc << "_";
result << "outPRC=" << outPrc << "_";

if (!additionalConfig.empty()) {
result << "PluginConf";
for (auto& item : additionalConfig) {
result << "_" << item.first << "=" << item.second.get_type_name();
}
}

result << CPUTestsBase::getTestCaseName(cpuParams);
result << CpuTestWithFusing::getTestCaseName(fusingParams);

@@ -63,7 +71,8 @@ void ReduceCPULayerTest::SetUp() {
basicReduceParams basicParams;
CPUSpecificParams cpuParams;
fusingSpecificParams fusingParams;
std::tie(basicParams, cpuParams, fusingParams) = this->GetParam();
std::map<std::string, ov::element::Type> additionalConfig;
std::tie(basicParams, cpuParams, fusingParams, additionalConfig) = this->GetParam();

std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
std::tie(postOpMgrPtr, fusedOps) = fusingParams;
@@ -75,7 +84,18 @@ void ReduceCPULayerTest::SetUp() {
std::vector<InputShape> inputShapes;

std::tie(axes, opType, keepDims, reductionType, netPrecision, inPrc, outPrc, inputShapes) = basicParams;
inPrc = outPrc = netPrecision;
if (netPrecision == ElementType::boolean) {
inPrc = outPrc = netPrecision;
} else {
if (additionalConfig[ov::hint::inference_precision.name()] == ov::element::bf16) {
inPrc = outPrc = netPrecision = ElementType::bf16;
} else if (additionalConfig[ov::hint::inference_precision.name()] == ov::element::f16) {
inPrc = outPrc = netPrecision = ElementType::f16;
} else {
inPrc = outPrc = netPrecision;
}
}
configuration.insert(additionalConfig.begin(), additionalConfig.end());

init_input_shapes(inputShapes);

@@ -144,6 +164,11 @@ void ReduceCPULayerTest::generate_inputs(const std::vector<ngraph::Shape>& targe
for (size_t i = 0; i < tensor.get_size(); ++i) {
rawBlobDataPtr[i] /= 10.f;
}
} else if (netPrecision == ElementType::f16) {
auto *rawBlobDataPtr = static_cast<ngraph::float16 *>(tensor.data());
for (size_t i = 0; i < tensor.get_size(); ++i) {
rawBlobDataPtr[i] /= 10.f;
}
} else if (netPrecision == ElementType::bf16) {
auto* rawBlobDataPtr = static_cast<ngraph::bfloat16*>(tensor.data());
for (size_t i = 0; i < tensor.get_size(); ++i) {
@@ -222,10 +247,29 @@ const std::vector<ngraph::helpers::ReductionType>& reductionTypes() {
}

const std::vector<ElementType>& inpOutPrc() {
static const std::vector<ElementType> inpOutPrc = {ElementType::bf16, ElementType::f32};
static const std::vector<ElementType> inpOutPrc = {ElementType::f32};
return inpOutPrc;
}

const std::vector<std::map<std::string, ov::element::Type>> additionalConfig() {
static const std::vector<std::map<std::string, ov::element::Type>> additionalConfig = {
{{ov::hint::inference_precision.name(), ov::element::f32}},
{{ov::hint::inference_precision.name(), ov::element::bf16}},
// ARM doesn't support FP16 for now
#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
{{ov::hint::inference_precision.name(), ov::element::f16}},
#endif
};
return additionalConfig;
}

const std::vector<std::map<std::string, ov::element::Type>> additionalConfigFP32() {
static const std::vector<std::map<std::string, ov::element::Type>> additionalConfig = {
{{ov::hint::inference_precision.name(), ov::element::f32}}
};
return additionalConfig;
}

const std::vector<ngraph::helpers::ReductionType>& reductionTypesInt32() {
static const std::vector<ngraph::helpers::ReductionType> reductionTypesInt32 = {
ngraph::helpers::ReductionType::Sum,
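The new `additionalConfig` values exercise the same property an application would set to request fp16 execution on CPU; an illustrative snippet (assuming the plugin accepts the hint, as the test gating above implies):

```cpp
#include "openvino/openvino.hpp"

// Compile for CPU with the inference precision hint set to f16; on hardware without
// AVX512_FP16 the fp16-specific tests are skipped, so this is expected to be exercised
// only on capable platforms (assumption based on the skip patterns added in this PR).
ov::CompiledModel compile_fp16(ov::Core& core, const std::shared_ptr<ov::Model>& model) {
    return core.compile_model(model, "CPU", ov::hint::inference_precision(ov::element::f16));
}
```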
@@ -29,7 +29,8 @@ typedef std::tuple<
typedef std::tuple<
basicReduceParams,
CPUSpecificParams,
fusingSpecificParams> ReduceLayerCPUTestParamSet;
fusingSpecificParams,
std::map<std::string, ov::element::Type>> ReduceLayerCPUTestParamSet;

class ReduceCPULayerTest : public testing::WithParamInterface<ReduceLayerCPUTestParamSet>,
virtual public SubgraphBaseTest, public CpuTestWithFusing {
@@ -52,6 +53,8 @@ const std::vector<std::vector<int>>& axesND();
const std::vector<CommonTestUtils::OpType>& opTypes();
const std::vector<ngraph::helpers::ReductionType>& reductionTypes();
const std::vector<ElementType>& inpOutPrc();
const std::vector<std::map<std::string, ov::element::Type>> additionalConfig();
const std::vector<std::map<std::string, ov::element::Type>> additionalConfigFP32();
const std::vector<ngraph::helpers::ReductionType>& reductionTypesInt32();

} // namespace Reduce