Skip to content
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
9c659c1
fp16 code of tingqian
tiger100256-hu May 10, 2023
20068f6
update onednn
tiger100256-hu May 10, 2023
775744d
update onednn repo back to openvino/onednn
tiger100256-hu May 15, 2023
d7da7c5
Merge remote-tracking branch 'origin/master' into tq/fp16
usstq May 24, 2023
84ae48e
retrigger checks
usstq May 24, 2023
42d6b97
remove unwanted changes
usstq May 25, 2023
a56e433
code clean-up
usstq May 25, 2023
3636aaf
fix cc build
usstq May 29, 2023
d209a09
fix according to review comments
usstq Jun 1, 2023
8fab4c4
Merge remote-tracking branch 'origin/master' into tq/fp16
usstq Jun 1, 2023
87f4ee9
retrigger checks
usstq Jun 1, 2023
a58c377
fix according to review comment
usstq Jun 2, 2023
dbda7eb
address review comments
usstq Jun 20, 2023
0649ce7
Merge remote-tracking branch 'origin/master' into tq/fp16
usstq Jun 20, 2023
ed6c0c5
Merge remote-tracking branch 'origin/master' into tq/fp16
usstq Jun 20, 2023
816fa03
Merge branch 'tq/fp16' of https://github.com/usstq/openvino into tq/fp16
usstq Jun 20, 2023
c8ddf20
fix bug in EnforceInferPrcDebug
usstq Jun 21, 2023
d4341fe
Add negative pattern to EnforceInferPrcDebug
usstq Jun 21, 2023
85dc913
Fix BrgConv for f16
usstq Jun 21, 2023
ea82556
Address review comment
usstq Jun 21, 2023
3b8bf07
remove avx512_fp16 isa assert
usstq Jun 21, 2023
8a97deb
Squash onednn commits
usstq Jun 21, 2023
18866a2
MHA : unsupported precision falls back to FP32
usstq Jun 21, 2023
1a95b03
Eltwise: replace vcvtsh2ss/vcvtss2sh with vcvtph2ps/vcvtps2ph
usstq Jun 21, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 29 additions & 12 deletions src/plugins/intel_cpu/src/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,15 @@ Config::Config() {
}
#endif

if (!mayiuse(avx512_core_bf16))
enforceBF16 = false;
if (mayiuse(avx512_core_bf16)) {
inferencePrecision = ov::element::bf16;
} else if (mayiuse(avx512_core_amx_fp16)) {
inferencePrecision = ov::element::f16;
} else if (mayiuse(avx512_core_fp16)) {
inferencePrecision = ov::element::f16;
} else {
inferencePrecision = ov::element::f32;
}

CPU_DEBUG_CAP_ENABLE(applyDebugCapsProperties());

Expand Down Expand Up @@ -183,12 +190,12 @@ void Config::readProperties(const std::map<std::string, std::string> &prop) {
} else if (key == PluginConfigParams::KEY_ENFORCE_BF16) {
if (val == PluginConfigParams::YES) {
if (mayiuse(avx512_core)) {
enforceBF16 = true;
inferencePrecision = ov::element::bf16;
} else {
IE_THROW() << "Platform doesn't support BF16 format";
}
} else if (val == PluginConfigParams::NO) {
enforceBF16 = false;
inferencePrecision = ov::element::f32;
} else {
IE_THROW() << "Wrong value for property key " << PluginConfigParams::KEY_ENFORCE_BF16
<< ". Expected only YES/NO";
Expand All @@ -197,15 +204,21 @@ void Config::readProperties(const std::map<std::string, std::string> &prop) {
} else if (key == ov::hint::inference_precision.name()) {
if (val == "bf16") {
if (mayiuse(avx512_core)) {
enforceBF16 = true;
inferencePrecision = ov::element::bf16;
} else {
IE_THROW() << "Platform doesn't support BF16 format";
}
} else if (val == "f16") {
if (mayiuse(avx512_core_fp16) || mayiuse(avx512_core_amx_fp16)) {
inferencePrecision = ov::element::f16;
} else {
IE_THROW() << "Platform doesn't support FP16 format";
}
} else if (val == "f32") {
enforceBF16 = false;
inferencePrecision = ov::element::f32;
} else {
IE_THROW() << "Wrong value for property key " << ov::hint::inference_precision.name()
<< ". Supported values: bf16, f32";
<< ". Supported values: bf16, f16, f32";
}
inferencePrecisionSetExplicitly = true;
} else if (PluginConfigInternalParams::KEY_CPU_RUNTIME_CACHE_CAPACITY == key) {
Expand Down Expand Up @@ -256,10 +269,15 @@ void Config::readProperties(const std::map<std::string, std::string> &prop) {
// apply execution mode after all the params are handled to prevent possible conflicts
// when both execution_mode and inference_precision are specified
if (!inferencePrecisionSetExplicitly) {
if (executionMode == ov::hint::ExecutionMode::PERFORMANCE && (mayiuse(avx512_core_bf16))) {
enforceBF16 = true;
if (executionMode == ov::hint::ExecutionMode::PERFORMANCE) {
if (mayiuse(avx512_core_bf16))
inferencePrecision = ov::element::bf16;
else if (mayiuse(avx512_core_amx_fp16) || mayiuse(avx512_core_fp16))
inferencePrecision = ov::element::f16;
else
inferencePrecision = ov::element::f32;
} else {
enforceBF16 = false;
inferencePrecision = ov::element::f32;
}
}

Expand Down Expand Up @@ -325,8 +343,7 @@ void Config::updateProperties() {
IE_SUPPRESS_DEPRECATED_START
_config.insert({ PluginConfigParams::KEY_DUMP_EXEC_GRAPH_AS_DOT, dumpToDot });
IE_SUPPRESS_DEPRECATED_END;

if (enforceBF16) {
if (inferencePrecision == ov::element::bf16) {
_config.insert({ PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::YES });
} else {
_config.insert({ PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::NO });
Expand Down
6 changes: 3 additions & 3 deletions src/plugins/intel_cpu/src/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#include <openvino/runtime/properties.hpp>
#include <openvino/util/common_util.hpp>
#include "utils/debug_caps_config.h"
#include "openvino/runtime/properties.hpp"
#include <openvino/core/type/element_type.hpp>

#include <bitset>
#include <string>
Expand Down Expand Up @@ -63,11 +63,11 @@ struct Config {
bool changedHyperThreading = false;
#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
LPTransformsMode lpTransformsMode = LPTransformsMode::On;
bool enforceBF16 = true;
ov::element::Type inferencePrecision = ov::element::bf16;
#else
// Currently INT8 mode is not optimized on ARM / RISCV or other non-x86 platforms, fallback to FP32 mode.
LPTransformsMode lpTransformsMode = LPTransformsMode::Off;
bool enforceBF16 = false;
ov::element::Type inferencePrecision = ov::element::f32;
#endif
bool inferencePrecisionSetExplicitly = false;
ov::hint::ExecutionMode executionMode = ov::hint::ExecutionMode::PERFORMANCE;
Expand Down
6 changes: 6 additions & 0 deletions src/plugins/intel_cpu/src/dnnl_extension_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ uint8_t DnnlExtensionUtils::sizeOfDataType(dnnl::memory::data_type dataType) {
return 4;
case dnnl::memory::data_type::bf16:
return 2;
case dnnl::memory::data_type::f16:
return 2;
case dnnl::memory::data_type::s8:
return 1;
case dnnl::memory::data_type::u8:
Expand All @@ -47,6 +49,8 @@ memory::data_type DnnlExtensionUtils::IEPrecisionToDataType(const InferenceEngin
return memory::data_type::s32;
case InferenceEngine::Precision::BF16:
return memory::data_type::bf16;
case InferenceEngine::Precision::FP16:
return memory::data_type::f16;
case InferenceEngine::Precision::I8:
return memory::data_type::s8;
case InferenceEngine::Precision::U8:
Expand All @@ -70,6 +74,8 @@ InferenceEngine::Precision DnnlExtensionUtils::DataTypeToIEPrecision(memory::dat
return InferenceEngine::Precision::I32;
case memory::data_type::bf16:
return InferenceEngine::Precision::BF16;
case memory::data_type::f16:
return InferenceEngine::Precision::FP16;
case memory::data_type::s8:
return InferenceEngine::Precision::I8;
case memory::data_type::u8:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,11 @@ void jit_convert_truncation_emitter::emit_isa(const std::vector<size_t> &in_vec_
if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8))
h->uni_vcvttps2dq(vmm_dst, vmm_dst);
break;
case ov::element::f16:
// to be exact, vcvtph2ps belongs to AVX512VL/AVX512F
assert(dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_fp16));
h->vcvtph2ps(vmm_dst, Ymm(vmm_src.getIdx()));
break;
case ov::element::i8:
h->uni_vpmovsxbd(vmm_dst, vmm_src);
break;
Expand Down Expand Up @@ -222,6 +227,11 @@ void jit_convert_saturation_emitter::emit_isa(const std::vector<size_t> &in_vec_
if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8))
h->uni_vcvttps2dq(vmm_dst, vmm_dst);
break;
case ov::element::f16:
// to be exact, vcvtph2ps belongs to AVX512VL/AVX512F
assert(dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_fp16));
h->vcvtph2ps(vmm_dst, Ymm(vmm_src.getIdx()));
break;
case ov::element::i8:
h->uni_vpmovsxbd(vmm_dst, vmm_src);
break;
Expand All @@ -234,7 +244,7 @@ void jit_convert_saturation_emitter::emit_isa(const std::vector<size_t> &in_vec_

switch (output_type) {
case ov::element::f32:
if (!one_of(input_type, ov::element::i32, ov::element::bf16)) {
if (!one_of(input_type, ov::element::i32, ov::element::bf16, ov::element::f16)) {
h->uni_vcvtdq2ps(vmm_dst, vmm_dst);
}
break;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class jit_convert_emitter : public jit_emitter {
ov::element::f32,
ov::element::i32,
ov::element::bf16,
ov::element::f16,
ov::element::i8,
ov::element::u8
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -138,13 +138,10 @@ void jit_load_emitter::emit_isa(const Xbyak::Reg64 &reg_src, const int out_vec_i
load_bytes_to_dword_extension<Vmm>(Vmm(out_vec_idx), reg_src, offset, false, load_size_);
break;
case Precision::I16:
load_words_to_dword_extension<Vmm>(Vmm(out_vec_idx), reg_src, offset, false, true, load_size_);
break;
case Precision::U16:
load_words_to_dword_extension<Vmm>(Vmm(out_vec_idx), reg_src, offset, false, false, load_size_);
break;
case Precision::BF16:
load_words_to_dword_extension<Vmm>(Vmm(out_vec_idx), reg_src, offset, true, false, load_size_);
case Precision::FP16:
load_words_to_dword_extension<Vmm>(Vmm(out_vec_idx), reg_src, offset, src_prc_, load_size_);
break;
default:
IE_THROW() << "Load emitter in " << name_ << " has unsupported src precision to load.";
Expand All @@ -155,11 +152,11 @@ void jit_load_emitter::emit_isa(const Xbyak::Reg64 &reg_src, const int out_vec_i
if (src_prc_ != dst_prc_) {
switch (dst_prc_) {
case Precision::FP32:
if ((src_prc_ != Precision::FP32) && (src_prc_ != Precision::BF16))
if ((src_prc_ != Precision::FP32) && (src_prc_ != Precision::BF16) && (src_prc_ != Precision::FP16))
h->uni_vcvtdq2ps(Vmm(out_vec_idx), Vmm(out_vec_idx));
break;
case Precision::I32:
if ((src_prc_ == Precision::FP32) || (src_prc_ == Precision::BF16)) {
if ((src_prc_ == Precision::FP32) || (src_prc_ == Precision::BF16) || (src_prc_ == Precision::FP16)) {
h->uni_vcvtps2dq(Vmm(out_vec_idx), Vmm(out_vec_idx));
}
break;
Expand Down Expand Up @@ -447,7 +444,7 @@ void jit_load_emitter::load_bytes_to_dword_extension(const Vmm &vmm, const Xbyak
* [0.. 32] for ZMM version of the function. i.e. 16 words -> 16 * 32 bit == 512 bit
*/
template <typename Vmm>
void jit_load_emitter::load_words_to_dword_extension(const Vmm &vmm, const Xbyak::Reg64 &reg, int offset, bool is_bf16, bool is_signed, int load_size) const {
void jit_load_emitter::load_words_to_dword_extension(const Vmm &vmm, const Xbyak::Reg64 &reg, int offset, InferenceEngine::Precision prc, int load_size) const {
constexpr bool is_xmm = std::is_same<Vmm, Xbyak::Xmm>::value;
constexpr bool is_ymm = std::is_same<Vmm, Xbyak::Ymm>::value;
constexpr bool is_zmm = std::is_same<Vmm, Xbyak::Zmm>::value;
Expand All @@ -456,6 +453,13 @@ void jit_load_emitter::load_words_to_dword_extension(const Vmm &vmm, const Xbyak
MAYBE_UNUSED(is_ymm);
MAYBE_UNUSED(is_zmm);

bool is_bf16 = (prc == Precision::BF16);
bool is_f16 = (prc == Precision::FP16);
bool is_signed = (prc == Precision::I16);

if (is_f16 && !mayiuse(cpu::x64::avx512_core_fp16))
IE_THROW() << "Load emitter in " << name_ << " only support fp16 on platform with avx512_core_fp16.";

// Ensure extended double words fit inside Zmm (32/2(num) * 32 <= 512)
// For Ymm register, load capacity is halved (16/2(num) * 32 <= 128)
// For Xmm register, load capacity is halved again (8/2(num) * 32 <= 128)
Expand All @@ -477,6 +481,8 @@ void jit_load_emitter::load_words_to_dword_extension(const Vmm &vmm, const Xbyak
if (is_bf16) {
h->uni_vpmovzxwd(zmm, ptr[reg + offset]);
h->uni_vpslld(zmm, zmm, 16);
} else if (is_f16) {
h->vcvtph2ps(zmm, ptr[reg + offset]);
} else {
if (is_signed)
h->uni_vpmovsxwd(zmm, ptr[reg + offset]);
Expand All @@ -489,6 +495,8 @@ void jit_load_emitter::load_words_to_dword_extension(const Vmm &vmm, const Xbyak
if (is_bf16) {
h->uni_vpmovzxwd(ymm, ptr[reg + offset]);
h->uni_vpslld(ymm, ymm, 16);
} else if (is_f16) {
h->vcvtph2ps(ymm, ptr[reg + offset]);
} else {
if (is_signed)
h->uni_vpmovsxwd(ymm, ptr[reg + offset]);
Expand All @@ -501,6 +509,8 @@ void jit_load_emitter::load_words_to_dword_extension(const Vmm &vmm, const Xbyak
if (is_bf16) {
h->uni_vpmovzxwd(xmm, ptr[reg + offset]);
h->uni_vpslld(xmm, xmm, 16);
} else if (is_f16) {
h->vcvtph2ps(xmm, ptr[reg + offset]);
} else {
if (is_signed)
h->uni_vpmovsxwd(xmm, ptr[reg + offset]);
Expand All @@ -518,6 +528,8 @@ void jit_load_emitter::load_words_to_dword_extension(const Vmm &vmm, const Xbyak
if (is_bf16) {
h->uni_vpmovzxwd(vmm | k_mask | T_z, ptr[reg + offset]);
h->uni_vpslld(vmm, vmm, 16);
} else if (is_f16) {
h->vcvtph2ps(vmm | k_mask | T_z, ptr[reg + offset]);
} else {
if (is_signed)
h->uni_vpmovsxwd(vmm | k_mask | T_z, ptr[reg + offset]);
Expand All @@ -530,6 +542,8 @@ void jit_load_emitter::load_words_to_dword_extension(const Vmm &vmm, const Xbyak
if (is_bf16) {
h->uni_vpmovzxwd(vmm, xmm);
h->uni_vpslld(vmm, vmm, 16);
} else if (is_f16) {
h->vcvtph2ps(ymm, xmm);
} else {
if (is_signed)
h->uni_vpmovsxwd(vmm, xmm);
Expand Down Expand Up @@ -665,7 +679,7 @@ void jit_store_emitter::emit_isa(const int in_vec_idx, const Xbyak::Reg64 &reg_d
if (src_prc_ != dst_prc_) {
switch (src_prc_) {
case Precision::FP32:
if ((dst_prc_ != Precision::FP32) && (dst_prc_ != Precision::BF16)) {
if ((dst_prc_ != Precision::FP32) && (dst_prc_ != Precision::BF16) && (dst_prc_ != Precision::FP16)) {
if (is_saturation()) {
h->uni_vcvtps2dq(Vmm(aux_src_idx), Vmm(data_idx));
} else {
Expand All @@ -676,7 +690,7 @@ void jit_store_emitter::emit_isa(const int in_vec_idx, const Xbyak::Reg64 &reg_d
}
break;
case Precision::I32:
if ((dst_prc_ == Precision::FP32) || (dst_prc_ == Precision::BF16)) {
if ((dst_prc_ == Precision::FP32) || (dst_prc_ == Precision::BF16) || (dst_prc_ == Precision::FP16)) {
h->uni_vcvtdq2ps(Vmm(aux_src_idx), Vmm(data_idx));
data_idx = aux_src_idx;
data_reg_updated = true;
Expand All @@ -702,13 +716,10 @@ void jit_store_emitter::emit_isa(const int in_vec_idx, const Xbyak::Reg64 &reg_d
store_dword_to_byte_extension<Vmm>(reg_dst, offset, false, store_num_);
break;
case Precision::I16:
store_dword_to_word_extension<Vmm>(reg_dst, offset, false, true, store_num_);
break;
case Precision::U16:
store_dword_to_word_extension<Vmm>(reg_dst, offset, false, false, store_num_);
break;
case Precision::BF16:
store_dword_to_word_extension<Vmm>(reg_dst, offset, true, false, store_num_);
case Precision::FP16:
store_dword_to_word_extension<Vmm>(reg_dst, offset, dst_prc_, store_num_);
break;
default:
IE_THROW() << "Store emitter in " << name_ << " has unsupported dst precision to store.";
Expand Down Expand Up @@ -1037,7 +1048,11 @@ void jit_store_emitter::store_dword_to_byte_extension(const Xbyak::Reg64 &reg, i
*/
template <typename Vmm>
void jit_store_emitter::store_dword_to_word_extension(const Xbyak::Reg64 &reg,
int offset, bool is_bf16, bool is_signed, int store_num) const {
int offset, InferenceEngine::Precision precision, int store_num) const {
const bool is_bf16 = (precision == Precision::BF16);
const bool is_f16 = (precision == Precision::FP16);
const bool is_signed = (precision == Precision::I16);

constexpr bool is_xmm = std::is_same<Vmm, Xbyak::Xmm>::value;
constexpr bool is_ymm = std::is_same<Vmm, Xbyak::Ymm>::value;
constexpr bool is_zmm = std::is_same<Vmm, Xbyak::Zmm>::value;
Expand Down Expand Up @@ -1135,6 +1150,22 @@ void jit_store_emitter::store_dword_to_word_extension(const Xbyak::Reg64 &reg,
data_idx = static_cast<int>(xmm.getIdx());
store_bytes<Vmm>(reg, offset, store_num * 2);
}
} else if (is_f16) {
if (!mayiuse(cpu::x64::avx512_core_fp16))
IE_THROW() << "Store emitter in " << name_ << " only support fp16 on platform with avx512_core_fp16.";
// to avoid src vmm pollution
if (src_prc_ == Precision::FP32) {
// since avx512, zmm(fp32) => ymm(fp16)
ymm = Ymm(aux_vec_idxs[0]);
} // in I32 case, zmm&ymm is already in aux reg

h->vcvtps2ph(ymm, zmm, 0x4);
if (store_num == 16) {
h->vmovdqu16(ptr[reg + offset], ymm);
} else {
data_idx = static_cast<int>(ymm.getIdx());
store_bytes<Vmm>(reg, offset, store_num * 2);
}
} else {
switch (store_num) {
case 16:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ class jit_load_emitter : public jit_emitter {
void load_bytes_to_dword_extension(const Vmm &vmm, const Xbyak::Reg64 &reg, int offset, bool is_signed, int load_size) const;

template <typename Vmm>
void load_words_to_dword_extension(const Vmm &vmm, const Xbyak::Reg64 &reg, int offset, bool is_bf16, bool is_signed, int load_size) const;
void load_words_to_dword_extension(const Vmm &vmm, const Xbyak::Reg64 &reg, int offset, InferenceEngine::Precision prc, int load_size) const;

template <typename Vmm>
void fill_with_default(const Vmm &vmm, std::string fill_value, const int &load_num) const;
Expand Down Expand Up @@ -145,7 +145,7 @@ class jit_store_emitter : public jit_emitter {
void store_dword_to_byte_extension(const Xbyak::Reg64 &reg, int offset, bool is_signed, int store_size) const;

template <typename Vmm>
void store_dword_to_word_extension(const Xbyak::Reg64 &reg, int offset, bool is_bf16, bool is_signed, int store_size) const;
void store_dword_to_word_extension(const Xbyak::Reg64 &reg, int offset, InferenceEngine::Precision precision, int store_size) const;

void register_table_entries() override;

Expand Down
4 changes: 1 addition & 3 deletions src/plugins/intel_cpu/src/exec_network.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -346,9 +346,7 @@ InferenceEngine::Parameter ExecNetwork::GetMetric(const std::string &name) const
const bool perfCount = config.collectPerfCounters;
return decltype(ov::enable_profiling)::value_type(perfCount);
} else if (name == ov::hint::inference_precision) {
const auto enforceBF16 = config.enforceBF16;
const auto inference_precision = enforceBF16 ? ov::element::bf16 : ov::element::f32;
return decltype(ov::hint::inference_precision)::value_type(inference_precision);
return decltype(ov::hint::inference_precision)::value_type(config.inferencePrecision);
} else if (name == ov::hint::performance_mode) {
const auto perfHint = ov::util::from_string(config.perfHintsConfig.ovPerfHint, ov::hint::performance_mode);
return perfHint;
Expand Down
Loading