-
Notifications
You must be signed in to change notification settings - Fork 2.9k
[GPU] Recognize parameters as valid inputs for compressed weights #32276
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 11 commits
f557a05
1c45696
a714620
53d9074
ea48b32
29d0d89
e32bf59
8f8becf
30dd0ef
d80ce82
515f0d0
83a0b55
d8d9e22
6f49a0d
9831c47
9f75ca4
ae5c996
9f96b7f
d443c08
b510b91
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -463,6 +463,147 @@ TEST_F(TransformationTestsF, ConvertFCToCompressed10) { | |
| } | ||
| } | ||
|
|
||
| TEST_F(TransformationTestsF, ConvertFCToCompressed11) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [random spot] Please add functional accuracy tests.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Attempted this. The tests take a single input-precision parameter, which prevents proper creation of weight parameters in compressed types.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tests added with a |
||
| { | ||
| auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 16}); | ||
| auto weights_param = std::make_shared<ov::op::v0::Parameter>(ov::element::u4, ov::Shape{32, 16}); | ||
| auto convert = std::make_shared<ov::op::v0::Convert>(weights_param, ov::element::f16); | ||
| auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{32, 1}, {1}); | ||
| auto scale = std::make_shared<ov::op::v1::Multiply>(convert, scale_const); | ||
| auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>(); | ||
|
|
||
| auto fc = std::make_shared<ov::intel_gpu::op::FullyConnected>(input1, scale, no_bias); | ||
|
|
||
| model = std::make_shared<ov::Model>(ov::OutputVector{fc}, ov::ParameterVector{input1, weights_param}); | ||
| manager.register_pass<ConvertFullyConnectedToFullyConnectedCompressed>(); | ||
| } | ||
| { | ||
| auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 16}); | ||
| auto weights_param = std::make_shared<ov::op::v0::Parameter>(ov::element::u4, ov::Shape{32, 16}); | ||
| auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>(); | ||
| auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{32, 1}, {1}); | ||
|
|
||
| auto fc_compressed = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input1, weights_param, no_bias, scale_const); | ||
|
|
||
| model_ref = std::make_shared<ov::Model>(ov::OutputVector{fc_compressed}, ov::ParameterVector{input1, weights_param}); | ||
| } | ||
| } | ||
|
|
||
| TEST_F(TransformationTestsF, ConvertFCToCompressed12) { | ||
| { | ||
| auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 16}); | ||
| auto weights_param = std::make_shared<ov::op::v0::Parameter>(ov::element::u4, ov::Shape{4, 4, 32}); | ||
| auto reshape_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {16, 32}); | ||
| auto reshape = std::make_shared<ov::op::v1::Reshape>(weights_param, reshape_const, false); | ||
| auto convert = std::make_shared<ov::op::v0::Convert>(reshape, ov::element::f16); | ||
| auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1, 32}, {1}); | ||
| auto scale = std::make_shared<ov::op::v1::Multiply>(convert, scale_const); | ||
| auto transpose_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {1, 0}); | ||
| auto transpose = std::make_shared<ov::op::v1::Transpose>(scale, transpose_const); | ||
| auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>(); | ||
|
|
||
| auto fc = std::make_shared<ov::intel_gpu::op::FullyConnected>(input1, transpose, no_bias); | ||
|
|
||
| model = std::make_shared<ov::Model>(ov::OutputVector{fc}, ov::ParameterVector{input1, weights_param}); | ||
| manager.register_pass<ConvertFullyConnectedToFullyConnectedCompressed>(); | ||
| } | ||
| { | ||
| auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 16}); | ||
| auto weights_param = std::make_shared<ov::op::v0::Parameter>(ov::element::u4, ov::Shape{4, 4, 32}); | ||
| auto reshape_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {16, 32}); | ||
| auto reshape = std::make_shared<ov::op::v1::Reshape>(weights_param, reshape_const, false); | ||
| auto transpose_weights_param = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {1, 0}); | ||
| auto transpose_weights = std::make_shared<ov::op::v1::Transpose>(reshape, transpose_weights_param); | ||
| auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>(); | ||
| auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1, 32}, {1}); | ||
| auto transpose_scale_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {1, 0}); | ||
| auto transpose_scale = std::make_shared<ov::op::v1::Transpose>(scale_const, transpose_scale_const); | ||
|
|
||
| auto fc_compressed = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input1, transpose_weights, no_bias, transpose_scale); | ||
|
|
||
| model_ref = std::make_shared<ov::Model>(ov::OutputVector{fc_compressed}, ov::ParameterVector{input1, weights_param}); | ||
| } | ||
| } | ||
|
|
||
| TEST_F(TransformationTestsF, ConvertFCToCompressed13) { | ||
| { | ||
| auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 16}); | ||
| auto weights_param = std::make_shared<ov::op::v0::Parameter>(ov::element::u4, ov::Shape{4, 4, 32}); | ||
| auto convert = std::make_shared<ov::op::v0::Convert>(weights_param, ov::element::f16); | ||
| auto reshape_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {16, 32}); | ||
| auto reshape = std::make_shared<ov::op::v1::Reshape>(convert, reshape_const, false); | ||
| auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1, 32}, {1}); | ||
| auto scale = std::make_shared<ov::op::v1::Multiply>(reshape, scale_const); | ||
| auto transpose_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {1, 0}); | ||
| auto transpose = std::make_shared<ov::op::v1::Transpose>(scale, transpose_const); | ||
| auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>(); | ||
|
|
||
| auto fc = std::make_shared<ov::intel_gpu::op::FullyConnected>(input1, transpose, no_bias); | ||
|
|
||
| model = std::make_shared<ov::Model>(ov::OutputVector{fc}, ov::ParameterVector{input1, weights_param}); | ||
| manager.register_pass<ConvertFullyConnectedToFullyConnectedCompressed>(); | ||
| } | ||
| { | ||
| auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 16}); | ||
| auto weights_param = std::make_shared<ov::op::v0::Parameter>(ov::element::u4, ov::Shape{4, 4, 32}); | ||
| auto reshape_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {16, 32}); | ||
| auto reshape = std::make_shared<ov::op::v1::Reshape>(weights_param, reshape_const, false); | ||
| auto transpose_weights_param = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {1, 0}); | ||
| auto transpose_weights = std::make_shared<ov::op::v1::Transpose>(reshape, transpose_weights_param); | ||
| auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>(); | ||
| auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1, 32}, {1}); | ||
| auto transpose_scale_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {1, 0}); | ||
| auto transpose_scale = std::make_shared<ov::op::v1::Transpose>(scale_const, transpose_scale_const); | ||
|
|
||
| auto fc_compressed = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input1, transpose_weights, no_bias, transpose_scale); | ||
|
|
||
| model_ref = std::make_shared<ov::Model>(ov::OutputVector{fc_compressed}, ov::ParameterVector{input1, weights_param}); | ||
| } | ||
| } | ||
|
|
||
| TEST_F(TransformationTestsF, ConvertFCToCompressed14) { | ||
| { | ||
| auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 16}); | ||
| auto weights_param = std::make_shared<ov::op::v0::Parameter>(ov::element::u4, ov::Shape{4, 4, 32}); | ||
| auto convert = std::make_shared<ov::op::v0::Convert>(weights_param, ov::element::f16); | ||
| auto zp_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{4, 1, 32}, {1}); | ||
| auto zp_convert = std::make_shared<ov::op::v0::Convert>(zp_const, ov::element::f16); | ||
| auto sub = std::make_shared<ov::op::v1::Subtract>(convert, zp_convert); | ||
| auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{4, 1, 32}, {1}); | ||
| auto scale = std::make_shared<ov::op::v1::Multiply>(sub, scale_const); | ||
| auto reshape_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {16, 32}); | ||
| auto reshape = std::make_shared<ov::op::v1::Reshape>(scale, reshape_const, false); | ||
| auto transpose_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {1, 0}); | ||
| auto transpose = std::make_shared<ov::op::v1::Transpose>(reshape, transpose_const); | ||
| auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>(); | ||
|
|
||
| auto fc = std::make_shared<ov::intel_gpu::op::FullyConnected>(input1, transpose, no_bias); | ||
|
|
||
| model = std::make_shared<ov::Model>(ov::OutputVector{fc}, ov::ParameterVector{input1, weights_param}); | ||
| manager.register_pass<ConvertFullyConnectedToFullyConnectedCompressed>(); | ||
| } | ||
| { | ||
| auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 16}); | ||
| auto weights_param = std::make_shared<ov::op::v0::Parameter>(ov::element::u4, ov::Shape{4, 4, 32}); | ||
| auto reshape_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {16, 32}); | ||
| auto reshape = std::make_shared<ov::op::v1::Reshape>(weights_param, reshape_const, false); | ||
| auto transpose_weights_param = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {1, 0}); | ||
| auto transpose_weights = std::make_shared<ov::op::v1::Transpose>(reshape, transpose_weights_param); | ||
| auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>(); | ||
| auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{4, 32}, {1}); | ||
| auto transpose_scale_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {1, 0}); | ||
| auto transpose_scale = std::make_shared<ov::op::v1::Transpose>(scale_const, transpose_scale_const); | ||
| auto zp_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{4, 32}, {1}); | ||
| auto zp_convert = std::make_shared<ov::op::v0::Convert>(zp_const, ov::element::f16); | ||
| auto transpose_zp_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {1, 0}); | ||
| auto transpose_zp = std::make_shared<ov::op::v1::Transpose>(zp_const, transpose_zp_const); | ||
|
|
||
| auto fc_compressed = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input1, transpose_weights, no_bias, transpose_scale, transpose_zp); | ||
|
|
||
| model_ref = std::make_shared<ov::Model>(ov::OutputVector{fc_compressed}, ov::ParameterVector{input1, weights_param}); | ||
| } | ||
| } | ||
|
|
||
| } // namespace intel_gpu | ||
| } // namespace test | ||
| } // namespace ov | ||
Uh oh!
There was an error while loading. Please reload this page.