Skip to content
Merged
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
5e1d1f5
[CPU][ARM] Initial FP16 support
May 24, 2023
fb7115f
fp16 support - additional changes
alvoron Jul 5, 2023
5c8f685
mvn node - avoid fp32 force
alvoron Jul 7, 2023
c5527c1
added FP16 case into dumpAsTxt
alvoron Jul 7, 2023
bdec6f3
fixed comments
alvoron Jul 12, 2023
1a8f01c
Merge branch 'master' into alvoron_arm_fp16_func
alvoron Jul 12, 2023
47834f8
mvn fp16 tests added
alvoron Jul 12, 2023
25997e6
enabled fp16 eltwise tests
alvoron Jul 13, 2023
bb65558
skip failed mvn tests and removed eltwise experiments
alvoron Jul 13, 2023
0cd85d2
Merge branch 'master' into alvoron_arm_fp16_func
alvoron Jul 18, 2023
91c5799
disable acl fp16 transpose
alvoron Jul 18, 2023
781cead
eltwise - fp16 for acl only
alvoron Jul 24, 2023
4714b63
BlobDumper fp16 support
alvoron Jul 24, 2023
5853643
enable fp16 tests for transpose
alvoron Jul 24, 2023
a135aa8
Merge branch 'master' into alvoron_arm_fp16_func
alvoron Jul 24, 2023
ea3b96e
enable fp16 mvn and transpose tests on arm only
alvoron Jul 24, 2023
fbd8d88
enable fp16 eltwise tests on arm only
alvoron Jul 24, 2023
2427369
wrap fp16 changes by arm fp16 define
alvoron Jul 25, 2023
55e6126
Merge branch 'master' into alvoron_arm_fp16_func
alvoron Jul 25, 2023
be75803
REG_FAST_DIRECT_COPY_F16_F16 adedd to oneDNN
alvoron Jul 25, 2023
e9d15a3
disable FuseFullyConnectedAndSimpleOperation is ACL is used
alvoron Jul 25, 2023
a1fd3f2
fullyconnected - force weights and bias to fp16
alvoron Jul 25, 2023
10be069
FC - weightsDataType fp16 force for arm only
alvoron Jul 25, 2023
c3ba7b3
fix comments
alvoron Jul 26, 2023
18fdcb3
Merge branch 'master' into alvoron_arm_fp16_func
alvoron Jul 26, 2023
4cbdbe3
updated mvn and transpose tests
alvoron Jul 26, 2023
f8bf113
fix additional_config type in Eltwise tests
alvoron Jul 26, 2023
51be938
fix arm fp16 define in Eltwise tests
alvoron Jul 26, 2023
ce64f8d
removed FuseFullyConnectedAndSimpleOperation limitation
alvoron Jul 27, 2023
982dd76
enable fp16 in eltwise tests
alvoron Jul 27, 2023
2811c2a
Merge branch 'master' into alvoron_arm_fp16_func
alvoron Jul 27, 2023
ebc6d67
fix eltwise slt tests
alvoron Jul 27, 2023
0376376
fix eltwise and mvn tests
alvoron Jul 27, 2023
aca2c69
fix tests
alvoron Jul 27, 2023
5713603
force precision to fp32 for integer types
alvoron Jul 27, 2023
a3d9c1d
fix eltwise tests
alvoron Jul 27, 2023
5904b32
fix transpose tests and allow nhwc transpose for fp16
alvoron Jul 27, 2023
213a9ca
created OV_CPU_ARM_TARGET_ARCH
alvoron Jul 28, 2023
f31baf0
removed OV_CPU_ARM_TARGET_GENERIC_ARCHS
alvoron Jul 28, 2023
062eda1
Merge branch 'master' into alvoron_arm_fp16_func
alvoron Jul 28, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/plugins/intel_cpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ if(DNNL_USE_ACL)
set(OV_CPU_WITH_ACL ON)
endif()

if (OV_CPU_WITH_ACL AND ARM_COMPUTE_ENABLE_FP16)
set(OV_CPU_WITH_ACL_FP16 ON)
endif()

if(OV_CPU_WITH_ACL)
set(CMAKE_CXX_STANDARD 14)
endif()
Expand Down
6 changes: 6 additions & 0 deletions src/plugins/intel_cpu/src/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,10 +179,16 @@ void Config::readProperties(const std::map<std::string, std::string> &prop) {
inferencePrecisionSetExplicitly = true;
}
} else if (val == "f16") {
#if defined(OPENVINO_ARCH_X86_64)
if (mayiuse(avx512_core_fp16) || mayiuse(avx512_core_amx_fp16)) {
inferencePrecision = ov::element::f16;
inferencePrecisionSetExplicitly = true;
}
#elif defined(OV_CPU_WITH_ACL_FP16)
// TODO: add runtime FP16 feature support check for ARM
inferencePrecision = ov::element::f16;
inferencePrecisionSetExplicitly = true;
#endif
} else if (val == "f32") {
inferencePrecision = ov::element::f32;
inferencePrecisionSetExplicitly = true;
Expand Down
4 changes: 2 additions & 2 deletions src/plugins/intel_cpu/src/graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -311,8 +311,8 @@ void Graph::Replicate(const CNNNetwork &network) {
const auto childEdges = input.second->getChildEdgesAtPort(0);
for (size_t i = 0; i < childEdges.size(); i++) {
const auto child = childEdges[i]->getChild();
if (!one_of(child->getOriginalInputPrecisionAtPort(childEdges[i]->getOutputNum()),
Precision::BF16, Precision::FP16) &&
const auto child_prec = child->getOriginalInputPrecisionAtPort(childEdges[i]->getOutputNum());
if (!one_of(child_prec, Precision::BF16, Precision::FP16) &&
// remove this WA when #78939 is resolved
!hasSubgraphConsumers(child))
child->setOriginalInputPrecisionAtPort(childEdges[i]->getOutputNum(), precToSet);
Expand Down
2 changes: 2 additions & 0 deletions src/plugins/intel_cpu/src/graph_optimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,9 +139,11 @@ void GraphOptimizer::ApplyCommonGraphOptimizations(Graph &graph) {
FuseConvolutionAndSimpleOperation(graph);
graph.RemoveDroppedNodes();

#if !defined(OV_CPU_WITH_ACL)
OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseFullyConnectedAndSimpleOperation");
FuseFullyConnectedAndSimpleOperation(graph);
graph.RemoveDroppedNodes();
#endif

OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseMatMulAndSimpleOperation");
FuseMatMulAndSimpleOperation(graph);
Expand Down
5 changes: 5 additions & 0 deletions src/plugins/intel_cpu/src/nodes/conv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -876,7 +876,12 @@ void Convolution::createDescriptor(const std::vector<MemoryDescPtr>& inputDesc,
dnnl::memory::desc biasDnnlDesc;

if (withBiases) {
//oneDNN ARM Convolution primitive supports only identical in/out data types
#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
memory::data_type bdt = outDnnlDesc.get_data_type();
#else
memory::data_type bdt = memory::data_type::f32;
#endif
biasDnnlDesc = dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(expectedBiasDims), bdt, memory::format_tag::any);
}

Expand Down
20 changes: 20 additions & 0 deletions src/plugins/intel_cpu/src/nodes/eltwise.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2021,6 +2021,25 @@ void Eltwise::initSupportedPrimitiveDescriptors() {
IE_THROW() << "Eltwise node with name `" << getName() << "` doesn't support BF16 precision on this target.";
}

#if defined(OV_CPU_WITH_ACL)
Precision forcedPrec;
//ACL implementation supports only identical precisions on inputs/outputs so they are aligned it to highest one
if (AclEltwiseExecutor::isEltwiseAlgorithmSupported(getAlgorithm())) {
for (size_t i = 0; i < getParentEdges().size(); i++) {
if (!getParentEdgeAt(i)->getParent()->isConstant()) {
if (!forcedPrec || getOriginalInputPrecisionAtPort(i).size() > forcedPrec.size()) {
forcedPrec = getOriginalInputPrecisionAtPort(i);
}
}
}
} else {
forcedPrec = Precision::FP32;
}
for (size_t i = 0; i < inputPrecisions.size(); i++) {
inputPrecisions[i] = forcedPrec;
}
outputPrecision = forcedPrec;
#else
auto filterPrecision = [&](Precision& prc) {
if (implType == EltwiseImplType::reference) {
return Precision(Precision::FP32);
Expand All @@ -2039,6 +2058,7 @@ void Eltwise::initSupportedPrimitiveDescriptors() {
inputPrecisions[i] = filterPrecision(inputPrecisions[i]);
}
outputPrecision = filterPrecision(outputPrecision);
#endif

// TODO: delete after new LPT (ngraph based) is merged
// WA is needed to handle bug in LPT that produces wrong precision after average pooling (I8/U8 instead of FP32)
Expand Down
36 changes: 35 additions & 1 deletion src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,40 @@ inline VectorDims reshape_sizes(VectorDims dims) {
return result_dims;
}

bool AclEltwiseExecutor::isEltwiseAlgorithmSupported(Algorithm algorithm) {
if (one_of(algorithm, Algorithm::EltwiseSqrt,
Algorithm::EltwiseDivide,
Algorithm::EltwiseRelu,
#ifdef OPENVINO_ARCH_ARM64
Algorithm::EltwiseGeluErf,
#endif
Algorithm::EltwiseElu,
Algorithm::EltwiseTanh,
Algorithm::EltwiseSigmoid,
Algorithm::EltwiseSoftRelu,
Algorithm::EltwiseClamp,
Algorithm::EltwiseSwish,
Algorithm::EltwisePrelu,
Algorithm::EltwiseHswish,
Algorithm::EltwiseAbs,
Algorithm::EltwiseExp,
Algorithm::EltwiseLog,
Algorithm::EltwiseMaximum,
Algorithm::EltwiseMinimum,
Algorithm::EltwiseSquaredDifference,
Algorithm::EltwiseAdd,
Algorithm::EltwiseSubtract,
Algorithm::EltwiseMultiply,
Algorithm::EltwiseEqual,
Algorithm::EltwiseNotEqual,
Algorithm::EltwiseGreater,
Algorithm::EltwiseGreaterEqual,
Algorithm::EltwiseLess,
Algorithm::EltwiseLessEqual)) {
return true;
}
return false;
}

bool AclEltwiseExecutorBuilder::isSupported(const EltwiseAttrs& eltwiseAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
Expand All @@ -50,7 +84,7 @@ bool AclEltwiseExecutorBuilder::isSupported(const EltwiseAttrs& eltwiseAttrs,
case Algorithm::EltwiseSigmoid:
case Algorithm::EltwiseSoftRelu:
case Algorithm::EltwiseClamp:
case Algorithm::EltwiseSwish: // TODO: CVS-109354: efficientdet-d0 accuracy drops if ACL Swish is used
case Algorithm::EltwiseSwish:
case Algorithm::EltwisePrelu:
case Algorithm::EltwiseHswish:
if (!(checkPrecision({Precision::FP16, Precision::FP16}, Precision::FP16) ||
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ using namespace InferenceEngine;
class AclEltwiseExecutor : public EltwiseExecutor {
public:
explicit AclEltwiseExecutor(const ExecutorContext::CPtr context);
static bool isEltwiseAlgorithmSupported(Algorithm algorithm);

bool init(const EltwiseAttrs& eltwiseAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,15 @@ class ACLTransposeExecutorBuilder : public TransposeExecutorBuilder {
srcDescs[0]->getShape().getRank());
return false;
}
if (srcDescs[0]->getPrecision() != dstDescs[0]->getPrecision()) {
DEBUG_LOG("NEPermute requires the same input and output precisions");
return false;
}
if (srcDescs[0]->getPrecision() != InferenceEngine::Precision::FP32 &&
srcDescs[0]->getPrecision() != InferenceEngine::Precision::I8) {
DEBUG_LOG("NEPermute supports 1, 2, 4 bytes data types. FP16 implementation is disabled due to performance issues");
return false;
}
return true;
}

Expand Down
8 changes: 8 additions & 0 deletions src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,9 @@ void FullyConnected::getSupportedDescriptors() {
if (!one_of(outputDataType , memory::data_type::f32, memory::data_type::f16)) {
outputDataType = memory::data_type::f16;
}
#if defined(OV_CPU_WITH_ACL)
weightsDataType = memory::data_type::f16;
#endif
} else if (one_of(inputDataType, memory::data_type::u8, memory::data_type::s8)) {
if (weightsDataType != memory::data_type::s8) {
// weight has to be s8 for INT8 mode, otherwise fallback to
Expand Down Expand Up @@ -715,7 +718,12 @@ void FullyConnected::createDescriptorInternal(const dnnl::memory::desc &inputDes
dnnl::memory::data_type bdt = outdt;

if (one_of(indt, dnnl::memory::data_type::bf16, dnnl::memory::data_type::f16)) {
//oneDNN ARM InnerProduct primitive supports only identical in/out data types
#if defined(OPENVINO_ARCH_X86_64)
bdt = dnnl::memory::data_type::f32;
#else
bdt = dnnl::memory::data_type::f16;
#endif
} else if (indt == dnnl::memory::data_type::u8 || indt == dnnl::memory::data_type::s8) {
wdt = memory::data_type::s8;
if (withBiases)
Expand Down
4 changes: 2 additions & 2 deletions src/plugins/intel_cpu/src/nodes/mvn.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1859,12 +1859,12 @@ void MVN::initSupportedPrimitiveDescriptors() {
}
}
}

#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
// ref with float planar and no fusion
if (!mayiuse(cpu::x64::sse41)) {
inputPrecision = outputPrecision = Precision::FP32;
}

#endif
// TODO [DS]: inplace
bool canBeInplace = !isDynamicNode() && (inputPrecision.size() == outputPrecision.size()) &&
(getParentEdgeAt(0)->getParent()->getChildEdges().size() == 1) &&
Expand Down
17 changes: 15 additions & 2 deletions src/plugins/intel_cpu/src/utils/blob_dump.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,13 @@ void BlobDumper::prepare_plain_data(const MemoryPtr &memory, std::vector<uint8_t
pln_blob_ptr[i] = blob_ptr[desc.getElementOffset(i)];
break;
}
case Precision::FP16: {
auto *pln_blob_ptr = reinterpret_cast<float16 *>(data.data());
auto *blob_ptr = reinterpret_cast<const float16 *>(ptr);
for (size_t i = 0; i < data_size; i++)
pln_blob_ptr[i] = blob_ptr[desc.getElementOffset(i)];
break;
}
case Precision::I8:
case Precision::U8: {
auto *pln_blob_ptr = reinterpret_cast<int8_t*>(data.data());
Expand Down Expand Up @@ -172,6 +179,12 @@ void BlobDumper::dumpAsTxt(std::ostream &stream) const {
stream << blob_ptr[desc.getElementOffset(i)] << std::endl;
break;
}
case Precision::I32: {
auto *blob_ptr = reinterpret_cast<const int32_t*>(ptr);
for (size_t i = 0; i < data_size; i++)
stream << blob_ptr[desc.getElementOffset(i)] << std::endl;
break;
}
case Precision::BF16: {
auto *blob_ptr = reinterpret_cast<const bfloat16_t*>(ptr);
for (size_t i = 0; i < data_size; i++) {
Expand All @@ -180,8 +193,8 @@ void BlobDumper::dumpAsTxt(std::ostream &stream) const {
}
break;
}
case Precision::I32: {
auto *blob_ptr = reinterpret_cast<const int32_t*>(ptr);
case Precision::FP16: {
auto *blob_ptr = reinterpret_cast<const float16*>(ptr);
for (size_t i = 0; i < data_size; i++)
stream << blob_ptr[desc.getElementOffset(i)] << std::endl;
break;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ void EltwiseLayerCPUTest::SetUp() {
ElementType netType;
ngraph::helpers::InputLayerType secondaryInputType;
CommonTestUtils::OpType opType;
Config additional_config;
std::map<std::string, ov::element::Type> additional_config;
std::tie(shapes, eltwiseType, secondaryInputType, opType, netType, inType, outType, targetDevice, configuration) = basicParamsSet;

if (ElementType::bf16 == netType) {
Expand Down Expand Up @@ -161,9 +161,15 @@ TEST_P(EltwiseLayerCPUTest, CompareWithRefs) {

namespace Eltwise {

const ov::AnyMap& additional_config() {
static const ov::AnyMap additional_config;
return additional_config;
const std::vector<std::map<std::string, ov::element::Type>>& additional_config() {
static const std::vector<std::map<std::string, ov::element::Type>> additionalConfig = {
{{ov::hint::inference_precision.name(), ov::element::f32}},
// x86 doesn't support FP16 for now
#if defined(OV_CPU_WITH_ACL_FP16)
{{ov::hint::inference_precision.name(), ov::element::f16}},
#endif
};
return additionalConfig;
}

const std::vector<ElementType>& netType() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ class EltwiseLayerCPUTest : public testing::WithParamInterface<EltwiseLayerCPUTe

namespace Eltwise {

const ov::AnyMap& additional_config();
const std::vector<std::map<std::string, ov::element::Type>>& additional_config();

const std::vector<ElementType>& netType();
const std::vector<CommonTestUtils::OpType>& opTypes();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ std::string MvnLayerCPUTest::getTestCaseName(testing::TestParamInfo<MvnLayerCPUT
CPUSpecificParams cpuParams;
fusingSpecificParams fusingParams;
ElementType inputPrecision, outputPrecision;
std::tie(basicParamsSet, cpuParams, fusingParams, inputPrecision, outputPrecision) = obj.param;
std::map<std::string, ov::element::Type> additionalConfig;
std::tie(basicParamsSet, cpuParams, fusingParams, inputPrecision, outputPrecision, additionalConfig) = obj.param;

InputShape inputShapes;
ElementType netPrecision;
Expand Down Expand Up @@ -46,6 +47,13 @@ std::string MvnLayerCPUTest::getTestCaseName(testing::TestParamInfo<MvnLayerCPUT
result << "_"
<< "CNNOutPrc=" << outputPrecision;

if (!additionalConfig.empty()) {
result << "PluginConf";
for (auto& item : additionalConfig) {
result << "_" << item.first << "=" << item.second.get_type_name();
}
}

result << CPUTestsBase::getTestCaseName(cpuParams);

result << CpuTestWithFusing::getTestCaseName(fusingParams);
Expand All @@ -72,7 +80,8 @@ void MvnLayerCPUTest::SetUp() {
fusingSpecificParams fusingParams;
ElementType inPrc;
ElementType outPrc;
std::tie(basicParamsSet, cpuParams, fusingParams, inPrc, outPrc) = this->GetParam();
std::map<std::string, ov::element::Type> additionalConfig;
std::tie(basicParamsSet, cpuParams, fusingParams, inPrc, outPrc, additionalConfig) = this->GetParam();

std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
std::tie(postOpMgrPtr, fusedOps) = fusingParams;
Expand Down Expand Up @@ -102,6 +111,13 @@ void MvnLayerCPUTest::SetUp() {
selectedType = makeSelectedTypeStr(selectedType, netPrecision);

rel_threshold = 0.015f;
if (additionalConfig[ov::hint::inference_precision.name()] == ov::element::f16) {
inPrc = outPrc = netPrecision = ElementType::f16;
//FIXME: ref and acl mvn implementation has accuracy issues on fp16 (#116344)
abs_threshold = .05f;
rel_threshold = 250.f;
}
configuration.insert(additionalConfig.begin(), additionalConfig.end());
function = makeNgraphFunction(netPrecision, param, mvn, "mvn");
}

Expand All @@ -111,6 +127,17 @@ TEST_P(MvnLayerCPUTest, CompareWithRefs) {
}

namespace MVN {
const std::vector<std::map<std::string, ov::element::Type>>& additionalConfig() {
static const std::vector<std::map<std::string, ov::element::Type>> additionalConfig = {
{{ov::hint::inference_precision.name(), ov::element::f32}},
// x86 doesn't support FP16 for now
#if defined(OV_CPU_WITH_ACL_FP16)
{{ov::hint::inference_precision.name(), ov::element::f16}},
#endif
};
return additionalConfig;
}

const std::vector<InputShape>& inputShapes_1D() {
static const std::vector<InputShape> inputShapes_1D = {
{ {}, {{5}}},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,5 +61,6 @@ namespace MVN {
const std::vector<bool>& acrossChannels();
const std::vector<double>& epsilon();

const std::vector<std::map<std::string, ov::element::Type>>& additionalConfig();
} // namespace MVN
} // namespace CPULayerTestsDefinitions
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ const auto params_4D_emptyCPUSpec = ::testing::Combine(
::testing::ValuesIn(eltwiseOpTypesDiffInp()),
::testing::ValuesIn(secondaryInputTypes()),
::testing::ValuesIn(opTypes()),
::testing::ValuesIn(netType()),
::testing::Values(ElementType::f32), //FIXME: FP16 tests crashes on this test
::testing::Values(ov::element::undefined),
::testing::Values(ov::element::undefined),
::testing::Values(CommonTestUtils::DEVICE_CPU),
Expand Down Expand Up @@ -69,7 +69,7 @@ const auto params_5D_emptyCPUSpec = ::testing::Combine(
::testing::ValuesIn(eltwiseOpTypesDiffInp()),
::testing::ValuesIn(secondaryInputTypes()),
::testing::ValuesIn(opTypes()),
::testing::ValuesIn(netType()),
::testing::Values(ElementType::f32), //FIXME: FP16 tests crashes on this test
::testing::Values(ov::element::undefined),
::testing::Values(ov::element::undefined),
::testing::Values(CommonTestUtils::DEVICE_CPU),
Expand Down
Loading