7 changes: 7 additions & 0 deletions src/inference/dev_api/ie_system_conf.h
@@ -121,6 +121,13 @@ using ov::with_cpu_x86_avx512_core_vnni;
*/
using ov::with_cpu_x86_bfloat16;

/**
* @brief Checks whether CPU supports fp16 capability
* @ingroup ie_dev_api_system_conf
* @return `True` if AVX512_FP16 instructions are available, `false` otherwise
*/
using ov::with_cpu_x86_avx512_core_fp16;

/**
* @brief Checks whether CPU supports AMX int8 capability
* @ingroup ie_dev_api_system_conf
7 changes: 7 additions & 0 deletions src/inference/dev_api/openvino/runtime/system_conf.hpp
@@ -110,6 +110,13 @@ OPENVINO_RUNTIME_API bool with_cpu_x86_avx512_core_vnni();
*/
OPENVINO_RUNTIME_API bool with_cpu_x86_bfloat16();

/**
* @brief Checks whether CPU supports fp16 capability
* @ingroup ov_dev_api_system_conf
* @return `True` if AVX512_FP16 instructions are available, `false` otherwise
*/
OPENVINO_RUNTIME_API bool with_cpu_x86_avx512_core_fp16();

/**
* @brief Checks whether CPU supports AMX int8 capability
* @ingroup ov_dev_api_system_conf
7 changes: 7 additions & 0 deletions src/inference/src/system_conf.cpp
@@ -72,6 +72,10 @@ bool with_cpu_x86_bfloat16() {
return get_cpu_info().has(Xbyak::util::Cpu::tAVX512_BF16);
}

bool with_cpu_x86_avx512_core_fp16() {
return get_cpu_info().has(Xbyak::util::Cpu::tAVX512_FP16);
}

bool with_cpu_x86_avx512_core_amx_int8() {
return get_cpu_info().has(Xbyak::util::Cpu::tAMX_INT8);
}
@@ -107,6 +111,9 @@ bool with_cpu_x86_avx512_core_vnni() {
bool with_cpu_x86_bfloat16() {
return false;
}
bool with_cpu_x86_avx512_core_fp16() {
return false;
}
bool with_cpu_x86_avx512_core_amx_int8() {
return false;
}
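A minimal usage sketch (not part of this PR): callers can gate an fp16 code path on the new capability flag the same way the existing bf16 check is used. The include path and namespace are assumptions based on the dev API header changed above.

```cpp
#include "openvino/runtime/system_conf.hpp"  // assumed include root for the dev API header

// True only when the CPU reports AVX512_FP16 support; callers are expected to fall
// back to an fp32 (or bf16) path otherwise, mirroring the non-x86 stub that always
// returns false.
bool can_use_fp16_kernels() {
    return ov::with_cpu_x86_avx512_core_fp16();
}
```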
3 changes: 2 additions & 1 deletion src/plugins/intel_cpu/src/graph.cpp
@@ -311,7 +311,8 @@ void Graph::Replicate(const CNNNetwork &network) {
const auto childEdges = input.second->getChildEdgesAtPort(0);
for (size_t i = 0; i < childEdges.size(); i++) {
const auto child = childEdges[i]->getChild();
if (child->getOriginalInputPrecisionAtPort(childEdges[i]->getOutputNum()) != Precision::BF16 &&
if (!one_of(child->getOriginalInputPrecisionAtPort(childEdges[i]->getOutputNum()),
Precision::BF16, Precision::FP16) &&
// remove this WA when #78939 is resolved
!hasSubgraphConsumers(child))
child->setOriginalInputPrecisionAtPort(childEdges[i]->getOutputNum(), precToSet);
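The rewritten condition relies on the plugin's `one_of` helper. A hedged sketch of its assumed semantics, so the new check reads as "input precision is neither BF16 nor FP16":

```cpp
// Assumed behaviour of the intel_cpu one_of utility: true when the first argument
// compares equal to any of the remaining candidates (C++17 fold expression).
template <typename T, typename... Args>
constexpr bool one_of(const T& value, const Args&... candidates) {
    return ((value == candidates) || ...);
}
```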
63 changes: 53 additions & 10 deletions src/plugins/intel_cpu/src/nodes/reduce.cpp
@@ -108,7 +108,7 @@ bool ReduceKey::operator==(const ReduceKey &rhs) const {

// some utility functions
static inline bool isFloatCompatible(memory::data_type type) {
return memory::data_type::f32 == type || memory::data_type::bf16 == type;
return memory::data_type::f32 == type || memory::data_type::bf16 == type || memory::data_type::f16 == type;
}

template <cpu_isa_t isa>
@@ -590,6 +590,7 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene
}
break;
case memory::data_type::bf16:
case memory::data_type::f16:
case memory::data_type::s8:
case memory::data_type::u8:
pack_gathered_vector(vmm_src, vmm_idx, offset, jcp_.src_dt);
@@ -614,8 +615,9 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene
mov(ptr[rsp + i * sizeof(int)], reg_tmp_64.cvt32());
break;
case memory::data_type::bf16:
case memory::data_type::f16:
mov(reg_tmp_64.cvt16(), table_idx);
mov(ptr[rsp + i * sizeof(ov::intel_cpu::bfloat16_t)], reg_tmp_64.cvt16());
mov(ptr[rsp + i * 2], reg_tmp_64.cvt16());
break;
case memory::data_type::s8:
case memory::data_type::u8:
@@ -635,7 +637,10 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene
case memory::data_type::bf16:
uni_vpmovzxwd(vmm_val, ptr[rsp]);
uni_vpslld(vmm_val, vmm_val, 16);
break;
break;
case memory::data_type::f16:
vcvtph2ps(vmm_val, ptr[rsp]);
break;
case memory::data_type::s8:
uni_vpmovsxbd(vmm_val, ptr[rsp]);
break;
@@ -890,6 +895,9 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene
uni_vpmovzxwd(vmm_src, op);
uni_vpslld(vmm_src, vmm_src, 16);
break;
case memory::data_type::f16:
vcvtph2ps(vmm_src, op);
break;
case memory::data_type::s8:
uni_vpmovsxbd(vmm_src, op);
break;
@@ -914,6 +922,9 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene
uni_vpinsrw(xmm_src, xmm_src, op, 0x0);
uni_vpslld(xmm_src, xmm_src, 16);
break;
case memory::data_type::f16:
vcvtph2ps(xmm_src, op);
break;
case memory::data_type::s8:
movsx(reg_tmp_32, op);
uni_vmovq(xmm_src, reg_tmp_64);
Expand Down Expand Up @@ -948,6 +959,9 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene
uni_vcvtneps2bf16->emit_code({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(ymm_dst.getIdx())});
vmovdqu16(op, ymm_dst);
break;
case memory::data_type::f16:
vcvtps2ph(op, vmm_dst, 0x4);
break;
case memory::data_type::s8:
if (isa == cpu::x64::avx512_core) {
vpmovsdb(op, vmm_dst);
@@ -996,6 +1010,9 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene
uni_vpsrld(xmm_dst, xmm_dst, 16);
uni_vpextrw(op, xmm_dst, 0x0);
break;
case memory::data_type::f16:
vcvtps2ph(op, xmm_dst, 0x4);
break;
case memory::data_type::s8:
uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst);
uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
@@ -1540,6 +1557,9 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
uni_vpmovzxwd(vmm_src, op);
uni_vpslld(vmm_src, vmm_src, 16);
break;
case memory::data_type::f16:
vcvtph2ps(vmm_src, op);
break;
case memory::data_type::s8:
uni_vpmovsxbd(vmm_src, op);
break;
@@ -1564,6 +1584,9 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
uni_vpinsrw(xmm_src, xmm_src, op, 0x0);
uni_vpslld(xmm_src, xmm_src, 16);
break;
case memory::data_type::f16:
vcvtph2ps(xmm_src, op);
break;
case memory::data_type::s8:
movsx(reg_tmp_32, op);
uni_vmovq(xmm_src, reg_tmp_64);
Expand Down Expand Up @@ -1598,6 +1621,9 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
uni_vcvtneps2bf16->emit_code({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(ymm_dst.getIdx())});
vmovdqu16(op, ymm_dst);
break;
case memory::data_type::f16:
vcvtps2ph(op, vmm_dst, 0x4);
break;
case memory::data_type::s8:
if (isa == cpu::x64::avx512_core) {
vpmovsdb(op, vmm_dst);
@@ -1646,6 +1672,9 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
uni_vpsrld(xmm_dst, xmm_dst, 16);
uni_vpextrw(op, xmm_dst, 0x0);
break;
case memory::data_type::f16:
vcvtps2ph(op, xmm_dst, 0x4);
break;
case memory::data_type::s8:
uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst);
uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
@@ -1878,16 +1907,20 @@ void Reduce::initSupportedPrimitiveDescriptors() {

jit_mode = canApplyJIT(input_prec, output_prec);

auto is_precision_sensitive_reduce = [](const Algorithm &algorithm) {
return algorithm != Algorithm::ReduceAnd && algorithm != Algorithm::ReduceOr &&
algorithm != Algorithm::ReduceMin && algorithm != Algorithm::ReduceMax;
};

if (jit_mode) {
// Since in jit mode we use the output memory as an intermediate accumulator for certain reduce modes, we can't use BF16 output precision due to
// Since in jit mode we use the output memory as an intermediate accumulator for certain reduce modes, we can't use BF16/FP16 output precision due to
// the possible accuracy loss. Therefore, for such modes, we will change the output precision to FP32.
if (Precision::BF16 == output_prec) {
if (!mayiuse(avx512_core)) {
output_prec = Precision::FP32;
} else if (algorithm != Algorithm::ReduceAnd && algorithm != Algorithm::ReduceOr &&
algorithm != Algorithm::ReduceMin && algorithm != Algorithm::ReduceMax) {
output_prec = Precision::FP32;
}
if (!mayiuse(avx512_core) || is_precision_sensitive_reduce(algorithm))
output_prec = Precision::FP32;
} else if (Precision::FP16 == output_prec) {
if (!mayiuse(cpu::x64::avx2) || is_precision_sensitive_reduce(algorithm))
output_prec = Precision::FP32;
}
}
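A worked illustration of the accuracy concern behind the fallback above (a standalone sketch, not plugin code; assumes an x86 toolchain with F16C so the scalar `_cvtss_sh`/`_cvtsh_ss` intrinsics are available): once a running fp16 sum reaches 2048, adding 1 no longer changes it, which is why Sum/Prod-like modes keep an fp32 destination while And/Or/Min/Max can stay in the narrow type.

```cpp
#include <immintrin.h>
#include <cstdio>

int main() {
    float sum_fp32 = 0.f;
    unsigned short sum_fp16 = _cvtss_sh(0.f, _MM_FROUND_CUR_DIRECTION);
    for (int i = 0; i < 4096; ++i) {
        sum_fp32 += 1.f;
        // Emulate accumulating directly into fp16 memory: widen, add, narrow each step.
        sum_fp16 = _cvtss_sh(_cvtsh_ss(sum_fp16) + 1.f, _MM_FROUND_CUR_DIRECTION);
    }
    // Prints 4096 for fp32 but 2048 for the fp16 accumulator: 2048 + 1 rounds back to
    // 2048 in half precision (11-bit significand), so the sum stalls.
    std::printf("fp32: %.0f  fp16: %.0f\n", sum_fp32, _cvtsh_ss(sum_fp16));
    return 0;
}
```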

@@ -2862,6 +2895,9 @@ inline void Reduce::init_dst_data(uint8_t *out_ptr, size_t dst_size) {
} else if (output_prec == Precision::BF16) {
auto out_p = reinterpret_cast<bfloat16_t*>(out_ptr);
parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = static_cast<bfloat16_t>(1); });
} else if (output_prec == Precision::FP16) {
auto out_p = reinterpret_cast<ov::float16*>(out_ptr);
parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = static_cast<ov::float16>(1); });
} else if (output_prec == Precision::U8) {
auto out_p = reinterpret_cast<uint8_t *>(out_ptr);
parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = static_cast<uint8_t>(1); });
@@ -2880,6 +2916,9 @@ inline void Reduce::init_dst_data(uint8_t *out_ptr, size_t dst_size) {
} else if (output_prec == Precision::BF16) {
auto out_p = reinterpret_cast<bfloat16_t*>(out_ptr);
parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits<bfloat16_t>::lowest(); });
} else if (output_prec == Precision::FP16) {
auto out_p = reinterpret_cast<ov::float16*>(out_ptr);
parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits<ov::float16>::lowest(); });
} else if (output_prec == Precision::U8) {
auto out_p = reinterpret_cast<uint8_t *>(out_ptr);
parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits<uint8_t>::min(); });
@@ -2898,6 +2937,9 @@ inline void Reduce::init_dst_data(uint8_t *out_ptr, size_t dst_size) {
} else if (output_prec == Precision::BF16) {
auto out_p = reinterpret_cast<bfloat16_t*>(out_ptr);
parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits<bfloat16_t>::max(); });
} else if (output_prec == Precision::FP16) {
auto out_p = reinterpret_cast<ov::float16*>(out_ptr);
parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits<ov::float16>::max(); });
} else if (output_prec == Precision::U8) {
auto out_p = reinterpret_cast<uint8_t *>(out_ptr);
parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits<uint8_t>::max(); });
@@ -3268,6 +3310,7 @@ std::vector<int> Reduce::update_src_dims() {
bool Reduce::canApplyJIT(const Precision &input_prec, const Precision &output_prec) const {
static const Precision supportedPrecisions[] = {
Precision::FP32,
Precision::FP16,
Precision::BF16,
Precision::I32,
Precision::I8,
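For readers less familiar with the new kernel cases, here is a rough C++ intrinsics equivalent of the conversions the JIT code emits (a sketch assuming F16C/AVX support, not a drop-in for the generated code): `vcvtph2ps` widens packed halves to floats on load, `vcvtps2ph` narrows on store, and the `0x4` immediate used above selects rounding per the current MXCSR mode.

```cpp
#include <immintrin.h>
#include <cstdint>

// Load path: 8 fp16 values widened to fp32 (vcvtph2ps).
void widen_halves(const uint16_t* src, float* dst) {
    __m128i halves = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
    _mm256_storeu_ps(dst, _mm256_cvtph_ps(halves));
}

// Store path: 8 fp32 values narrowed back to fp16 (vcvtps2ph);
// _MM_FROUND_CUR_DIRECTION corresponds to the 0x4 immediate, i.e. round per MXCSR.
void narrow_floats(const float* src, uint16_t* dst) {
    __m128i halves = _mm256_cvtps_ph(_mm256_loadu_ps(src), _MM_FROUND_CUR_DIRECTION);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), halves);
}
```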
@@ -74,7 +74,7 @@ std::vector<std::string> disabledTestPatterns() {
R"(.*OVCompiledModelBaseTest.*(CanGetInputsInfoAndCheck|canSetConfigToCompiledModel).*)",
R"(.*Behavior.*CorrectConfigCheck.*(canSetConfigAndCheckGetConfig|canSetConfigTwiceAndCheckGetConfig).*CPU_BIND_THREAD=YES.*)",
// Issue: 72021 Unreasonable abs_threshold for comparing bf16 results
R"(.*smoke_Reduce.*type=(Prod|Min).*netPRC=(BF|bf)16.*)",
R"(.*smoke_Reduce.*type=(Prod|Min).*INFERENCE_PRECISION_HINT=(BF|bf)16.*)",
// TODO: 56520 Accuracy mismatch
R"(.*ReduceOpsLayerTest.*type=Mean_.*netPRC=(I64|I32).*)",
R"(.*ReduceOpsLayerTest.*type=Mean_.*netPRC=U64.*)",
@@ -246,6 +246,12 @@ std::vector<std::string> disabledTestPatterns() {
retVector.emplace_back(R"(.*Snippets.*MHA.*)");
retVector.emplace_back(R"(.*Snippets.*(MatMul|Matmul).*)");
}
#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
if (!InferenceEngine::with_cpu_x86_avx512_core_fp16()) {
// Skip fp16 tests for platforms that don't support fp16 precision
retVector.emplace_back(R"(.*INFERENCE_PRECISION_HINT=(F|f)16.*)");
}
#endif
if (!InferenceEngine::with_cpu_x86_avx512_core_vnni() && !InferenceEngine::with_cpu_x86_avx512_core_amx_int8()) {
// MatMul in Snippets uses BRGEMM that supports i8 only on platforms with VNNI or AMX instructions
retVector.emplace_back(R"(.*Snippets.*MatMulFQ.*)");
@@ -18,7 +18,8 @@ std::string ReduceCPULayerTest::getTestCaseName(testing::TestParamInfo<ReduceLay
basicReduceParams basicParams;
CPUSpecificParams cpuParams;
fusingSpecificParams fusingParams;
std::tie(basicParams, cpuParams, fusingParams) = obj.param;
std::map<std::string, ov::element::Type> additionalConfig;
std::tie(basicParams, cpuParams, fusingParams, additionalConfig) = obj.param;

std::vector<int> axes;
CommonTestUtils::OpType opType;
@@ -51,6 +52,13 @@ std::string ReduceCPULayerTest::getTestCaseName(testing::TestParamInfo<ReduceLay
result << "inPRC=" << inPrc << "_";
result << "outPRC=" << outPrc << "_";

if (!additionalConfig.empty()) {
result << "PluginConf";
for (auto& item : additionalConfig) {
result << "_" << item.first << "=" << item.second.get_type_name();
}
}

result << CPUTestsBase::getTestCaseName(cpuParams);
result << CpuTestWithFusing::getTestCaseName(fusingParams);

@@ -63,7 +71,8 @@ void ReduceCPULayerTest::SetUp() {
basicReduceParams basicParams;
CPUSpecificParams cpuParams;
fusingSpecificParams fusingParams;
std::tie(basicParams, cpuParams, fusingParams) = this->GetParam();
std::map<std::string, ov::element::Type> additionalConfig;
std::tie(basicParams, cpuParams, fusingParams, additionalConfig) = this->GetParam();

std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
std::tie(postOpMgrPtr, fusedOps) = fusingParams;
@@ -75,7 +84,18 @@ void ReduceCPULayerTest::SetUp() {
std::vector<InputShape> inputShapes;

std::tie(axes, opType, keepDims, reductionType, netPrecision, inPrc, outPrc, inputShapes) = basicParams;
inPrc = outPrc = netPrecision;
if (netPrecision == ElementType::boolean) {
inPrc = outPrc = netPrecision;
} else {
if (additionalConfig[ov::hint::inference_precision.name()] == ov::element::bf16) {
inPrc = outPrc = netPrecision = ElementType::bf16;
} else if (additionalConfig[ov::hint::inference_precision.name()] == ov::element::f16) {
inPrc = outPrc = netPrecision = ElementType::f16;
} else {
inPrc = outPrc = netPrecision;
}
}
configuration.insert(additionalConfig.begin(), additionalConfig.end());

init_input_shapes(inputShapes);

@@ -144,6 +164,11 @@ void ReduceCPULayerTest::generate_inputs(const std::vector<ngraph::Shape>& targe
for (size_t i = 0; i < tensor.get_size(); ++i) {
rawBlobDataPtr[i] /= 10.f;
}
} else if (netPrecision == ElementType::f16) {
auto *rawBlobDataPtr = static_cast<ngraph::float16 *>(tensor.data());
for (size_t i = 0; i < tensor.get_size(); ++i) {
rawBlobDataPtr[i] /= 10.f;
}
} else if (netPrecision == ElementType::bf16) {
auto* rawBlobDataPtr = static_cast<ngraph::bfloat16*>(tensor.data());
for (size_t i = 0; i < tensor.get_size(); ++i) {
@@ -222,10 +247,29 @@ const std::vector<ngraph::helpers::ReductionType>& reductionTypes() {
}

const std::vector<ElementType>& inpOutPrc() {
static const std::vector<ElementType> inpOutPrc = {ElementType::bf16, ElementType::f32};
static const std::vector<ElementType> inpOutPrc = {ElementType::f32};
return inpOutPrc;
}

const std::vector<std::map<std::string, ov::element::Type>> additionalConfig() {
static const std::vector<std::map<std::string, ov::element::Type>> additionalConfig = {
{{ov::hint::inference_precision.name(), ov::element::f32}},
{{ov::hint::inference_precision.name(), ov::element::bf16}},
// ARM doesn't support FP16 for now
#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
{{ov::hint::inference_precision.name(), ov::element::f16}},
#endif
};
return additionalConfig;
}

const std::vector<std::map<std::string, ov::element::Type>> additionalConfigFP32() {
static const std::vector<std::map<std::string, ov::element::Type>> additionalConfig = {
{{ov::hint::inference_precision.name(), ov::element::f32}}
};
return additionalConfig;
}

const std::vector<ngraph::helpers::ReductionType>& reductionTypesInt32() {
static const std::vector<ngraph::helpers::ReductionType> reductionTypesInt32 = {
ngraph::helpers::ReductionType::Sum,
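The new `additionalConfig` values exercise the same property an application would set to request fp16 execution on CPU; an illustrative snippet (assuming the plugin accepts the hint, as the test gating above implies):

```cpp
#include "openvino/openvino.hpp"

// Compile for CPU with the inference precision hint set to f16; on hardware without
// AVX512_FP16 the fp16-specific tests are skipped, so this is expected to be exercised
// only on capable platforms (assumption based on the skip patterns added in this PR).
ov::CompiledModel compile_fp16(ov::Core& core, const std::shared_ptr<ov::Model>& model) {
    return core.compile_model(model, "CPU", ov::hint::inference_precision(ov::element::f16));
}
```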
@@ -29,7 +29,8 @@ typedef std::tuple<
typedef std::tuple<
basicReduceParams,
CPUSpecificParams,
fusingSpecificParams> ReduceLayerCPUTestParamSet;
fusingSpecificParams,
std::map<std::string, ov::element::Type>> ReduceLayerCPUTestParamSet;

class ReduceCPULayerTest : public testing::WithParamInterface<ReduceLayerCPUTestParamSet>,
virtual public SubgraphBaseTest, public CpuTestWithFusing {
@@ -52,6 +53,8 @@ const std::vector<std::vector<int>>& axesND();
const std::vector<CommonTestUtils::OpType>& opTypes();
const std::vector<ngraph::helpers::ReductionType>& reductionTypes();
const std::vector<ElementType>& inpOutPrc();
const std::vector<std::map<std::string, ov::element::Type>> additionalConfig();
const std::vector<std::map<std::string, ov::element::Type>> additionalConfigFP32();
const std::vector<ngraph::helpers::ReductionType>& reductionTypesInt32();

} // namespace Reduce