Skip to content
Open
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,24 @@ using namespace ov::pass::pattern;
((in_ps.size() == 3 && out_ps.size() == 2) || (in_ps.size() == 4 && out_ps.size() == 3));\
};\
\
auto compressed_weights_m = wrap_type<ov::op::v0::Constant>(compressed_constant);\
auto weights_const_m = wrap_type<ov::op::v0::Constant>(compressed_constant);\
auto weights_param_m = wrap_type<ov::op::v0::Parameter>(compressed_constant);\
auto weights_param_reshape_m = wrap_type<ov::op::v1::Reshape>({weights_param_m, any_input()});\
auto compressed_weights_m = std::make_shared<ov::pass::pattern::op::Or>(OutputVector{weights_const_m, weights_param_m, weights_param_reshape_m});\
auto convert_m = wrap_type<ov::op::v0::Convert>({compressed_weights_m});\
auto weights_param_convert_m = wrap_type<ov::op::v0::Convert>({weights_param_m});\
auto weights_convert_reshape_m = wrap_type<ov::op::v1::Reshape>({weights_param_convert_m, any_input()});\
auto decompressed_weights_m = std::make_shared<ov::pass::pattern::op::Or>(OutputVector{convert_m, weights_convert_reshape_m});\
\
auto sub_const_m = wrap_type<ov::op::v0::Constant>();\
auto sub_convert_const_m = wrap_type<ov::op::v0::Convert>({sub_const_m});\
auto sub_with_convert_m = wrap_type<ov::op::v1::Subtract>({convert_m, sub_convert_const_m});\
auto sub_no_convert_m = wrap_type<ov::op::v1::Subtract>({convert_m, sub_const_m});\
auto sub_with_convert_m = wrap_type<ov::op::v1::Subtract>({decompressed_weights_m, sub_convert_const_m});\
auto sub_no_convert_m = wrap_type<ov::op::v1::Subtract>({decompressed_weights_m, sub_const_m});\
auto subtract_m = std::make_shared<ov::pass::pattern::op::Or>(OutputVector{sub_with_convert_m, sub_no_convert_m});\
\
auto mul_const_m = wrap_type<ov::op::v0::Constant>();\
auto mul_with_sub_m = wrap_type<ov::op::v1::Multiply>({subtract_m, mul_const_m});\
auto mul_no_sub_m = wrap_type<ov::op::v1::Multiply>({convert_m, mul_const_m});\
auto mul_no_sub_m = wrap_type<ov::op::v1::Multiply>({decompressed_weights_m, mul_const_m});\
auto mul_m = std::make_shared<ov::pass::pattern::op::Or>(OutputVector{mul_with_sub_m, mul_no_sub_m});\
\
auto reshape_const_m = wrap_type<ov::op::v0::Constant>();\
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,8 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon
const auto& pattern_map = m.get_pattern_value_map();
OPENVINO_ASSERT(pattern_map.count(fully_connected_m));
OPENVINO_ASSERT(pattern_map.count(mul_const_m));
OPENVINO_ASSERT(pattern_map.count(compressed_weights_m));
OPENVINO_ASSERT(pattern_map.count(decompressed_weights_m));
OPENVINO_ASSERT(pattern_map.count(bias_m));
OPENVINO_ASSERT(pattern_map.count(convert_m));
auto fc = ov::as_type_ptr<op::FullyConnected>(pattern_map.at(fully_connected_m).get_node_shared_ptr());
if (!fc || transformation_callback(fc)) {
return false;
Expand All @@ -55,8 +54,9 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon
auto weight_shape = fc->get_input_shape(1);
bool is_weight_3d = (std::count_if(weight_shape.begin(), weight_shape.end(), [](size_t d) { return d > 1; }) == 3);

auto weight_ptr = ov::as_type_ptr<ov::op::v0::Constant>(pattern_map.at(compressed_weights_m).get_node_shared_ptr());
bool weight_u8 = false;
std::shared_ptr<ov::Node> weight_ptr =
pattern_map.count(weights_const_m) ? pattern_map.at(weights_const_m).get_node_shared_ptr() : pattern_map.at(weights_param_m).get_node_shared_ptr();
if (weight_ptr->get_element_type() == ov::element::u8 || weight_ptr->get_element_type() == ov::element::i8)
weight_u8 = true;

Expand Down Expand Up @@ -102,7 +102,6 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon
return result;
};


const ov::Output<Node>& fc_input_a = fc->input(0).get_source_output();
const auto& scale = reshape_const(pattern_map.at(mul_const_m).get_node_shared_ptr());
std::shared_ptr<ov::Node> optional_zero_point = nullptr;
Expand All @@ -112,12 +111,28 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon
optional_zero_point = convert_const_to_u8(reshape_const(pattern_map.at(sub_const_m).get_node_shared_ptr()));
}

std::shared_ptr<ov::Node> fc_input_b = reshape_const(pattern_map.at(compressed_weights_m).get_node_shared_ptr());
std::shared_ptr<ov::Node> fc_input_b =
pattern_map.count(weights_const_m) ? reshape_const(pattern_map.at(weights_const_m).get_node_shared_ptr())
: (pattern_map.count(weights_param_reshape_m) ? pattern_map.at(weights_param_reshape_m).get_node_shared_ptr()
: pattern_map.at(weights_param_m).get_node_shared_ptr());
std::shared_ptr<ov::Node> fc_input_scale = scale;
std::shared_ptr<ov::Node> fc_input_zp = optional_zero_point;
std::shared_ptr<ov::Node> fc_input_bias = pattern_map.at(bias_m).get_node_shared_ptr();
std::vector<std::shared_ptr<ov::Node>> result_nodes = {};

if (fc_input_b->get_output_partial_shape(0).size() != weight_shape.size()) {
OPENVINO_ASSERT(weight_shape.size() < 3);
if (has_transpose) {
OPENVINO_ASSERT(weight_shape.size() == 2);
std::swap(weight_shape[0], weight_shape[1]);
}
std::shared_ptr<ov::Node> weight_shape_const =
std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{weight_shape.size()}, weight_shape);
fc_input_b = std::make_shared<ov::op::v1::Reshape>(fc_input_b, weight_shape_const, false);
result_nodes.push_back(weight_shape_const);
result_nodes.push_back(fc_input_b);
}

if (has_transpose) {
const auto& transpose = pattern_map.at(transpose_m).get_node_shared_ptr();
std::shared_ptr<ov::Node> transpose_const = pattern_map.at(transpose_const_m).get_node_shared_ptr();
Expand All @@ -128,16 +143,16 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon
transpose_const = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{new_order.size()}, new_order);
}

fc_input_b = transpose->clone_with_new_inputs({ fc_input_b->output(0), transpose_const });
fc_input_b = transpose->clone_with_new_inputs({fc_input_b->output(0), transpose_const});
result_nodes.push_back(fc_input_b);

if (ov::shape_size(scale->output(0).get_shape()) > 1) {
fc_input_scale = transpose->clone_with_new_inputs({ scale->output(0), transpose_const });
fc_input_scale = transpose->clone_with_new_inputs({scale->output(0), transpose_const});
result_nodes.push_back(fc_input_scale);
}

if (with_zero_point && ov::shape_size(optional_zero_point->output(0).get_shape()) > 1) {
fc_input_zp = transpose->clone_with_new_inputs({ optional_zero_point->output(0), transpose_const });
fc_input_zp = transpose->clone_with_new_inputs({optional_zero_point->output(0), transpose_const});
result_nodes.push_back(fc_input_zp);
}
}
Expand All @@ -149,18 +164,10 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon

std::shared_ptr<ov::Node> new_fc = nullptr;
if (with_zero_point) {
new_fc = std::make_shared<op::FullyConnectedCompressed>(fc_input_a,
fc_input_b,
fc_input_bias,
fc_input_scale,
fc_input_zp,
fc->get_output_type());
new_fc =
std::make_shared<op::FullyConnectedCompressed>(fc_input_a, fc_input_b, fc_input_bias, fc_input_scale, fc_input_zp, fc->get_output_type());
} else {
new_fc = std::make_shared<op::FullyConnectedCompressed>(fc_input_a,
fc_input_b,
fc_input_bias,
fc_input_scale,
fc->get_output_type());
new_fc = std::make_shared<op::FullyConnectedCompressed>(fc_input_a, fc_input_b, fc_input_bias, fc_input_scale, fc->get_output_type());
}

result_nodes.push_back(new_fc);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ ConvertMatMulToFullyConnected::ConvertMatMulToFullyConnected(bool supports_immad
};
auto weights_path = [&static_rank_gt_1](const ov::Output<ov::Node>& output) {
const auto& pshape = output.get_partial_shape();
return ov::op::util::is_on_path<ov::op::v0::Constant>(output) &&
return ov::op::util::is_on_path<ov::op::v0::Constant, ov::op::v0::Parameter>(output) &&
static_rank_gt_1(output) &&
pshape.is_static();
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,147 @@ TEST_F(TransformationTestsF, ConvertFCToCompressed10) {
}
}

TEST_F(TransformationTestsF, ConvertFCToCompressed11) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[random spot] Please add functional accuracy tests
It looks like you can extend an existing test with additional parameters
src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Attempted this. Those tests take a single parameter for the input precision, which prevents creating the weight parameters in the compressed types properly.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tests added with a configure_model() override to ensure weight parameters are provided in the appropriate type. The tests currently fail because u4 transposition is not yet supported; a fix is in progress.

    // Pattern model: FC whose u4 weights come from a Parameter (not a Constant):
    // weights_param -> Convert(f16) -> Multiply(scale) -> FullyConnected.
    {
        auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 16});
        auto weights_param = std::make_shared<ov::op::v0::Parameter>(ov::element::u4, ov::Shape{32, 16});
        auto convert = std::make_shared<ov::op::v0::Convert>(weights_param, ov::element::f16);
        auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{32, 1}, {1});
        auto scale = std::make_shared<ov::op::v1::Multiply>(convert, scale_const);
        // Placeholder marks the FC as bias-free.
        auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();

        auto fc = std::make_shared<ov::intel_gpu::op::FullyConnected>(input1, scale, no_bias);

        model = std::make_shared<ov::Model>(ov::OutputVector{fc}, ov::ParameterVector{input1, weights_param});
        manager.register_pass<ConvertFullyConnectedToFullyConnectedCompressed>();
    }
    // Reference model: the decompression subgraph is folded into
    // FullyConnectedCompressed, which consumes the raw u4 parameter directly.
    {
        auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 16});
        auto weights_param = std::make_shared<ov::op::v0::Parameter>(ov::element::u4, ov::Shape{32, 16});
        auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
        auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{32, 1}, {1});

        auto fc_compressed = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input1, weights_param, no_bias, scale_const);

        model_ref = std::make_shared<ov::Model>(ov::OutputVector{fc_compressed}, ov::ParameterVector{input1, weights_param});
    }

TEST_F(TransformationTestsF, ConvertFCToCompressed12) {
    // Pattern model: u4 weight Parameter reshaped (3D -> 2D) *before* the
    // decompression Convert, then scaled and transposed into the FC.
    {
        auto data = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 16});
        auto weights = std::make_shared<ov::op::v0::Parameter>(ov::element::u4, ov::Shape{4, 4, 32});
        auto target_shape = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {16, 32});
        auto weights_2d = std::make_shared<ov::op::v1::Reshape>(weights, target_shape, false);
        auto decompressed = std::make_shared<ov::op::v0::Convert>(weights_2d, ov::element::f16);
        auto scale = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1, 32}, {1});
        auto scaled = std::make_shared<ov::op::v1::Multiply>(decompressed, scale);
        auto order = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {1, 0});
        auto transposed = std::make_shared<ov::op::v1::Transpose>(scaled, order);
        auto bias = std::make_shared<ov::intel_gpu::op::Placeholder>();

        auto fc = std::make_shared<ov::intel_gpu::op::FullyConnected>(data, transposed, bias);

        model = std::make_shared<ov::Model>(ov::OutputVector{fc}, ov::ParameterVector{data, weights});
        manager.register_pass<ConvertFullyConnectedToFullyConnectedCompressed>();
    }
    // Reference model: compressed FC fed by the u4 parameter reshaped and
    // transposed in the low-precision domain, plus a transposed scale input.
    {
        auto data = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 16});
        auto weights = std::make_shared<ov::op::v0::Parameter>(ov::element::u4, ov::Shape{4, 4, 32});
        auto target_shape = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {16, 32});
        auto weights_2d = std::make_shared<ov::op::v1::Reshape>(weights, target_shape, false);
        auto weights_order = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {1, 0});
        auto weights_t = std::make_shared<ov::op::v1::Transpose>(weights_2d, weights_order);
        auto bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
        auto scale = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1, 32}, {1});
        auto scale_order = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {1, 0});
        auto scale_t = std::make_shared<ov::op::v1::Transpose>(scale, scale_order);

        auto fc_compressed = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(data, weights_t, bias, scale_t);

        model_ref = std::make_shared<ov::Model>(ov::OutputVector{fc_compressed}, ov::ParameterVector{data, weights});
    }
}

TEST_F(TransformationTestsF, ConvertFCToCompressed13) {
    // Pattern model: like ConvertFCToCompressed12, but the u4 Parameter is
    // Convert-ed to f16 first and reshaped (3D -> 2D) *after* the Convert.
    {
        auto data = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 16});
        auto weights = std::make_shared<ov::op::v0::Parameter>(ov::element::u4, ov::Shape{4, 4, 32});
        auto decompressed = std::make_shared<ov::op::v0::Convert>(weights, ov::element::f16);
        auto target_shape = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {16, 32});
        auto weights_2d = std::make_shared<ov::op::v1::Reshape>(decompressed, target_shape, false);
        auto scale = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1, 32}, {1});
        auto scaled = std::make_shared<ov::op::v1::Multiply>(weights_2d, scale);
        auto order = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {1, 0});
        auto transposed = std::make_shared<ov::op::v1::Transpose>(scaled, order);
        auto bias = std::make_shared<ov::intel_gpu::op::Placeholder>();

        auto fc = std::make_shared<ov::intel_gpu::op::FullyConnected>(data, transposed, bias);

        model = std::make_shared<ov::Model>(ov::OutputVector{fc}, ov::ParameterVector{data, weights});
        manager.register_pass<ConvertFullyConnectedToFullyConnectedCompressed>();
    }
    // Reference model: identical to the ConvertFCToCompressed12 reference —
    // the Reshape is moved onto the raw u4 parameter, then transposed.
    {
        auto data = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 16});
        auto weights = std::make_shared<ov::op::v0::Parameter>(ov::element::u4, ov::Shape{4, 4, 32});
        auto target_shape = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {16, 32});
        auto weights_2d = std::make_shared<ov::op::v1::Reshape>(weights, target_shape, false);
        auto weights_order = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {1, 0});
        auto weights_t = std::make_shared<ov::op::v1::Transpose>(weights_2d, weights_order);
        auto bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
        auto scale = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1, 32}, {1});
        auto scale_order = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {1, 0});
        auto scale_t = std::make_shared<ov::op::v1::Transpose>(scale, scale_order);

        auto fc_compressed = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(data, weights_t, bias, scale_t);

        model_ref = std::make_shared<ov::Model>(ov::OutputVector{fc_compressed}, ov::ParameterVector{data, weights});
    }
}

TEST_F(TransformationTestsF, ConvertFCToCompressed14) {
    // Pattern model: full decompression chain on a u4 weight Parameter:
    // Convert(f16) -> Subtract(zero point) -> Multiply(scale) ->
    // Reshape (3D -> 2D) -> Transpose -> FullyConnected.
    {
        auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 16});
        auto weights_param = std::make_shared<ov::op::v0::Parameter>(ov::element::u4, ov::Shape{4, 4, 32});
        auto convert = std::make_shared<ov::op::v0::Convert>(weights_param, ov::element::f16);
        auto zp_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{4, 1, 32}, {1});
        auto zp_convert = std::make_shared<ov::op::v0::Convert>(zp_const, ov::element::f16);
        auto sub = std::make_shared<ov::op::v1::Subtract>(convert, zp_convert);
        auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{4, 1, 32}, {1});
        auto scale = std::make_shared<ov::op::v1::Multiply>(sub, scale_const);
        auto reshape_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {16, 32});
        auto reshape = std::make_shared<ov::op::v1::Reshape>(scale, reshape_const, false);
        auto transpose_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {1, 0});
        auto transpose = std::make_shared<ov::op::v1::Transpose>(reshape, transpose_const);
        auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();

        auto fc = std::make_shared<ov::intel_gpu::op::FullyConnected>(input1, transpose, no_bias);

        model = std::make_shared<ov::Model>(ov::OutputVector{fc}, ov::ParameterVector{input1, weights_param});
        manager.register_pass<ConvertFullyConnectedToFullyConnectedCompressed>();
    }
    // Reference model: scale and zero point are reshaped to 2D and transposed.
    // The zero point stays in u8 and feeds the Transpose directly — no f16
    // Convert here (the previous revision created an unused zp Convert node).
    {
        auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 16});
        auto weights_param = std::make_shared<ov::op::v0::Parameter>(ov::element::u4, ov::Shape{4, 4, 32});
        auto reshape_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {16, 32});
        auto reshape = std::make_shared<ov::op::v1::Reshape>(weights_param, reshape_const, false);
        auto transpose_weights_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {1, 0});
        auto transpose_weights = std::make_shared<ov::op::v1::Transpose>(reshape, transpose_weights_const);
        auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
        auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{4, 32}, {1});
        auto transpose_scale_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {1, 0});
        auto transpose_scale = std::make_shared<ov::op::v1::Transpose>(scale_const, transpose_scale_const);
        auto zp_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{4, 32}, {1});
        auto transpose_zp_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {1, 0});
        auto transpose_zp = std::make_shared<ov::op::v1::Transpose>(zp_const, transpose_zp_const);

        auto fc_compressed = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input1, transpose_weights, no_bias, transpose_scale, transpose_zp);

        model_ref = std::make_shared<ov::Model>(ov::OutputVector{fc_compressed}, ov::ParameterVector{input1, weights_param});
    }
}

} // namespace intel_gpu
} // namespace test
} // namespace ov
Loading