From 9eeaac7d230695dc75dafc826c43307eb1dcbf72 Mon Sep 17 00:00:00 2001 From: eaidova Date: Thu, 13 Jul 2023 10:27:10 +0400 Subject: [PATCH 01/38] switch on pytorch frontend --- optimum/intel/openvino/export.py | 286 +++++++++++++++++++ optimum/intel/openvino/modeling_base.py | 8 +- optimum/intel/openvino/modeling_decoder.py | 8 +- optimum/intel/openvino/modeling_diffusion.py | 3 +- optimum/intel/openvino/trainer.py | 2 +- 5 files changed, 298 insertions(+), 9 deletions(-) create mode 100644 optimum/intel/openvino/export.py diff --git a/optimum/intel/openvino/export.py b/optimum/intel/openvino/export.py new file mode 100644 index 0000000000..24636f7266 --- /dev/null +++ b/optimum/intel/openvino/export.py @@ -0,0 +1,286 @@ +import os +import logging +import inspect +from inspect import signature +from itertools import chain +from pathlib import Path +from typing import Dict, Iterable, List, Optional, Tuple, Union +import time + +import numpy as np +from transformers.utils import is_tf_available, is_torch_available + +from optimum.utils import TORCH_MINIMUM_VERSION, is_diffusers_available, is_torch_onnx_support_available, logging +from optimum.exporters.onnx.base import OnnxConfig +from optimum.exporters.onnx.convert import export_tensorflow, check_dummy_inputs_are_allowed + +from openvino.tools import mo +from openvino.runtime import serialize, PartialShape +from openvino.runtime.utils.types import get_element_type +from .utils import OV_XML_FILE_NAME + +if is_torch_available(): + import torch + import torch.nn as nn + from transformers.modeling_utils import PreTrainedModel + from transformers.pytorch_utils import is_torch_less_than_1_11 + +if is_diffusers_available(): + from diffusers import ModelMixin + +if is_tf_available(): + from transformers.modeling_tf_utils import TFPreTrainedModel + +def is_torch_model(model): + if not is_torch_available(): + return False + return isinstance(model, nn.Module) + +def export( + model: Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"], + config: OnnxConfig, + output: Path, + opset: Optional[int] = None, + device: str = "cpu", + input_shapes: Optional[Dict] = None, +) -> Tuple[List[str], List[str]]: + """ + Exports a Pytorch or TensorFlow model to an ONNX Intermediate Representation. + + Args: + model ([`PreTrainedModel`] or [`TFPreTrainedModel`]): + The model to export. + config ([`~exporters.onnx.config.OnnxConfig`]): + The ONNX configuration associated with the exported model. + output (`Path`): + Directory to store the exported ONNX model. + opset (`Optional[int]`, defaults to `None`): + The version of the ONNX operator set to use. + device (`str`, *optional*, defaults to `cpu`): + The device on which the ONNX model will be exported. Either `cpu` or `cuda`. Only PyTorch is supported for + export on CUDA devices. + input_shapes (`Optional[Dict]`, defaults to `None`): + If specified, allows to use specific shapes for the example input provided to the ONNX exporter. + + Returns: + `Tuple[List[str], List[str]]`: A tuple with an ordered list of the model's inputs, and the named inputs from + the ONNX configuration. + """ + if not (is_torch_available() or is_tf_available()): + raise ImportError( + "Cannot convert because neither PyTorch nor TensorFlow are installed. " + "Please install torch or tensorflow first." 
+ ) + + if "diffusers" in str(model.__class__) and not is_diffusers_available(): + raise ImportError("The pip package `diffusers` is required to export stable diffusion models to ONNX.") + + if is_torch_available() and isinstance(model, nn.Module): + return export_pytorch(model, config, opset, output, device=device, input_shapes=input_shapes) + + elif is_tf_available() and issubclass(type(model), TFPreTrainedModel): + output.parent.mkdir(parents=True, exist_ok=True) + if opset is None: + opset = config.DEFAULT_ONNX_OPSET + if device == "cuda": + raise RuntimeError("`tf2onnx` does not support export on CUDA device.") + if input_shapes is not None: + print("`input_shapes` argument is not supported by the Tensorflow ONNX export and will be ignored.") + return export_tensorflow(model, config, opset, output) + + else: + raise RuntimeError( + "You either provided a PyTorch model with only TensorFlow installed, or a TensorFlow model with only PyTorch installed." + ) + + +def export_pytorch( + model: Union["PreTrainedModel", "ModelMixin"], + config: OnnxConfig, + opset: int, + output: Path, + device: str = "cpu", + input_shapes: Optional[Dict] = None, +) -> Tuple[List[str], List[str]]: + """ + Exports a PyTorch model to an ONNX Intermediate Representation. + + Args: + model ([`PreTrainedModel`]): + The model to export. + config ([`~exporters.onnx.config.OnnxConfig`]): + The ONNX configuration associated with the exported model. + opset (`int`): + The version of the ONNX operator set to use. + output (`Path`): + Directory to store the exported ONNX model. + device (`str`, defaults to `"cpu"`): + The device on which the ONNX model will be exported. Either `cpu` or `cuda`. Only PyTorch is supported for + export on CUDA devices. + input_shapes (`optional[Dict]`, defaults to `None`): + If specified, allows to use specific shapes for the example input provided to the ONNX exporter. + + Returns: + `Tuple[List[str], List[str]]`: A tuple with an ordered list of the model's inputs, and the named inputs from + the ONNX configuration. 
+ """ + import torch + from torch.onnx import export as onnx_export + from torch.utils._pytree import tree_map + + print(f"Using framework PyTorch: {torch.__version__}") + + with torch.no_grad(): + model.config.return_dict = True + model.config.torchscript = True + model.eval() + + # Check if we need to override certain configuration item + if config.values_override is not None: + print(f"Overriding {len(config.values_override)} configuration item(s)") + for override_config_key, override_config_value in config.values_override.items(): + print(f"\t- {override_config_key} -> {override_config_value}") + setattr(model.config, override_config_key, override_config_value) + + if input_shapes is None: + input_shapes = {} # will use the defaults from DEFAULT_DUMMY_SHAPES + + # Check that inputs match, and order them properly + dummy_inputs = config.generate_dummy_inputs(framework="pt", **input_shapes) + device = torch.device(device) + if device.type == "cuda" and torch.cuda.is_available(): + model.to(device) + dummy_inputs = tree_map( + lambda value: value.to(device) if isinstance(value, torch.Tensor) else value, dummy_inputs + ) + check_dummy_inputs_are_allowed(model, dummy_inputs) + inputs = config.ordered_inputs(model) + input_names = list(inputs.keys()) + output_names = list(config.outputs.keys()) + + if hasattr(config, "patch_ops"): + config.patch_ops() + + if hasattr(model, "forward"): + sig = inspect.signature(model.forward) + else: + sig = inspect.signature(model.call) + + input_info = get_input_shapes(dummy_inputs, inputs) + start0 = time.perf_counter() + ov_model = mo.convert_model(model, example_input=dummy_inputs, input=input_info) + end0 = time.perf_counter() + print(f"Convert model took {end0 - start0}s") + ordered_dummy_inputs = {param: dummy_inputs[param] for param in sig.parameters if param in dummy_inputs} + ordered_input_names = list(inputs) + flatten_inputs = flattenize_inputs(ordered_dummy_inputs.values()) + + for idx, inp_tensor in enumerate(ov_model.inputs): + input_name = ordered_input_names[idx] + inp_tensor.get_tensor().set_names({input_name}) + inp_data = flatten_inputs[idx] + static_shape = PartialShape(inp_data.shape) + dims = inputs[input_name] + + for dim in dims: + static_shape[dim] = -1 + inp_tensor.get_node().set_partial_shape(static_shape) + inp_tensor.get_node().set_element_type(get_element_type(inp_data.cpu().numpy().dtype)) + + for idx, out_tensor in enumerate(ov_model.outputs): + if idx < len(output_names): + out_tensor.get_tensor().set_names({output_names[idx]}) + ov_model.validate_nodes_and_infer_types() + start1 = time.perf_counter() + serialize(ov_model, output.parent / OV_XML_FILE_NAME) + end1 = time.perf_counter() + print(f"Serailize model took {end1 - start1}s") + if hasattr(config, "restore_ops"): + config.restore_ops() + + return input_names, output_names + + +def export_models( + models_and_onnx_configs: Dict[ + str, Tuple[Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"], "OnnxConfig"] + ], + output_dir: Path, + opset: Optional[int] = None, + output_names: Optional[List[str]] = None, + device: str = "cpu", + input_shapes: Optional[Dict] = None, +) -> Tuple[List[List[str]], List[List[str]]]: + """ + Exports a Pytorch or TensorFlow encoder decoder model to an ONNX Intermediate Representation. + The following method exports the encoder and decoder components of the model as separate + ONNX files. 
+ + Args: + models_and_onnx_configs (`Dict[str, Tuple[Union[`PreTrainedModel`, `TFPreTrainedModel`], `OnnxConfig`]]): + A dictionnary containing the models to export and their corresponding onnx configs. + output_dir (`Path`): + Output directory to store the exported ONNX models. + opset (`Optional[int]`, defaults to `None`): + The version of the ONNX operator set to use. + output_names (`Optional[List[str]]`, defaults to `None`): + The names to use for the exported ONNX files. The order must be the same as the order of submodels in the ordered dict `models_and_onnx_configs`. + If None, will use the keys from `models_and_onnx_configs` as names. + device (`str`, defaults to `"cpu"`): + The device on which the ONNX model will be exported. Either `cpu` or `cuda`. Only PyTorch is supported for + export on CUDA devices. + input_shapes (`Optional[Dict]`, defaults to `None`): + If specified, allows to use specific shapes for the example input provided to the ONNX exporter. + Returns: + `Tuple[List[List[str]], List[List[str]]]`: A tuple with an ordered list of the model's inputs, and the named + inputs from the ONNX configuration. + """ + outputs = [] + + if output_names is not None and len(output_names) != len(models_and_onnx_configs): + raise ValueError( + f"Provided custom names {output_names} for the export of {len(models_and_onnx_configs)} models. Please provide the same number of names as models to export." + ) + + for i, model_name in enumerate(models_and_onnx_configs.keys()): + submodel, sub_onnx_config = models_and_onnx_configs[model_name] + output_name = output_names[i] if output_names is not None else Path(model_name + ".xml") + output_path = output_dir / output_name + output_path.parent.mkdir(parents=True, exist_ok=True) + outputs.append( + export( + model=submodel, + config=sub_onnx_config, + output=output_path, + opset=opset, + device=device, + input_shapes=input_shapes, + ) + ) + + outputs = list(map(list, zip(*outputs))) + return outputs + + +def flattenize_inputs(inputs): + flatten_inputs = [] + for input_data in inputs: + if isinstance(input_data, (list, tuple)): + flatten_inputs.extend(flattenize_inputs(input_data)) + else: + flatten_inputs.append(input_data) + return flatten_inputs + + +def get_input_shapes(dummy_inputs, inputs): + input_info = [] + for input_name, data in dummy_inputs.items(): + if isinstance(data, (tuple, list)): + return None + static_shape = PartialShape(data.shape) + if input_name in inputs: + dynamic_dims = inputs[input_name] + for dim in dynamic_dims: + static_shape[dim] = -1 + input_info.append((input_name, static_shape)) + return input_info \ No newline at end of file diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 14ac76137f..bc1bf6cbbd 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -21,12 +21,14 @@ import openvino from huggingface_hub import hf_hub_download from openvino._offline_transformations import apply_moc_transformations, compress_model_transformation +from openvino.tools import mo from openvino.runtime import Core from transformers import PretrainedConfig from transformers.file_utils import add_start_docstrings -from optimum.exporters.onnx import OnnxConfig, export +from optimum.exporters.onnx import OnnxConfig from optimum.exporters.tasks import TasksManager +from .export import export, is_torch_model from optimum.modeling_base import OptimizedModel from ..utils.import_utils import is_transformers_version @@ -130,7 +132,7 @@ def 
fix_op_names_duplicates(model: openvino.runtime.Model): file_name = Path(file_name) bin_file_name = file_name.with_suffix(".bin") if file_name.suffix == ".xml" else None - model = core.read_model(file_name, bin_file_name) + model = core.read_model(file_name, bin_file_name) if not file_name.suffix == ".onnx" else mo.convert_model(file_name) if file_name.suffix == ".onnx": model = fix_op_names_duplicates(model) # should be called during model conversion to IR @@ -315,7 +317,7 @@ def _to_onnx_to_load( return cls._from_pretrained( model_id=save_dir_path, config=config, - from_onnx=True, + from_onnx=not is_torch_model(model), use_auth_token=use_auth_token, revision=revision, force_download=force_download, diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 16cf6c20d3..01c3ffae26 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -27,8 +27,8 @@ from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward from transformers.modeling_outputs import CausalLMOutputWithPast +from optimum.exporters import TasksManager from optimum.exporters.onnx import export -from optimum.exporters.tasks import TasksManager from optimum.utils import NormalizedConfigManager from ..utils.import_utils import is_transformers_version @@ -234,7 +234,7 @@ def _from_transformers( # TODO : create ModelPatcher to patch each architecture if config.model_type == "bloom": model.transformer._prepare_attn_mask = _prepare_attn_mask - elif config.model_type == "llama": + elif config.model_type in {"llama", "longllama"}: model.model._prepare_decoder_attention_mask = _prepare_decoder_attention_mask elif config.model_type in {"blenderbot-small", "blenderbot", "opt", "pegasus", "bart"}: model.model.decoder._prepare_decoder_attention_mask = _prepare_decoder_attention_mask @@ -245,12 +245,12 @@ def _from_transformers( return cls._from_pretrained( model_id=save_dir_path, config=config, - from_onnx=True, + from_onnx=not is_torch_model(model), use_auth_token=use_auth_token, revision=revision, force_download=force_download, cache_dir=cache_dir, - file_name=ONNX_WEIGHTS_NAME, + file_name=ONNX_WEIGHTS_NAME if not is_torch_model(model) else OV_XML_FILE_NAME, local_files_only=local_files_only, use_cache=use_cache, **kwargs, diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 73ec66d473..4c54e69614 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -51,6 +51,7 @@ ) from .loaders import OVTextualInversionLoaderMixin +from .export import export_models from .modeling_base import OVBaseModel from .utils import ONNX_WEIGHTS_NAME, OV_TO_NP_TYPE, OV_XML_FILE_NAME @@ -312,7 +313,7 @@ def _from_transformers( return cls._from_pretrained( model_id=save_dir_path, config=config, - from_onnx=True, + from_onnx=False, use_auth_token=use_auth_token, revision=revision, force_download=force_download, diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py index 811309806a..091d28047a 100644 --- a/optimum/intel/openvino/trainer.py +++ b/optimum/intel/openvino/trainer.py @@ -41,7 +41,7 @@ from nncf.torch.quantization.algo import QuantizationController from openvino._offline_transformations import compress_quantize_weights_transformation from openvino.runtime import Core, PartialShape, serialize -from openvino.tools.mo.back.offline_transformations import ( +from 
openvino.tools.ovc.moc_frontend.offline_transformations import ( apply_fused_names_cleanup, apply_moc_transformations, apply_user_transformations, From 680e383cd8c1c9e6b5453e926b75f9f75e458ecd Mon Sep 17 00:00:00 2001 From: eaidova Date: Wed, 19 Jul 2023 11:41:07 +0400 Subject: [PATCH 02/38] fixes for seq2seq --- optimum/intel/openvino/export.py | 74 +++++++++++++------ optimum/intel/openvino/modeling_base.py | 5 +- .../intel/openvino/modeling_base_seq2seq.py | 13 ++-- optimum/intel/openvino/modeling_decoder.py | 5 ++ optimum/intel/openvino/modeling_seq2seq.py | 1 + 5 files changed, 67 insertions(+), 31 deletions(-) diff --git a/optimum/intel/openvino/export.py b/optimum/intel/openvino/export.py index 24636f7266..a2012954bf 100644 --- a/optimum/intel/openvino/export.py +++ b/optimum/intel/openvino/export.py @@ -1,29 +1,24 @@ -import os -import logging import inspect -from inspect import signature -from itertools import chain from pathlib import Path -from typing import Dict, Iterable, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union, Any +import functools import time -import numpy as np from transformers.utils import is_tf_available, is_torch_available -from optimum.utils import TORCH_MINIMUM_VERSION, is_diffusers_available, is_torch_onnx_support_available, logging +from optimum.utils import is_diffusers_available from optimum.exporters.onnx.base import OnnxConfig from optimum.exporters.onnx.convert import export_tensorflow, check_dummy_inputs_are_allowed +from optimum.exporters.onnx.convert import export_pytorch as export_pytorch_to_onnx -from openvino.tools import mo +from openvino.tools.mo import convert_model from openvino.runtime import serialize, PartialShape from openvino.runtime.utils.types import get_element_type from .utils import OV_XML_FILE_NAME if is_torch_available(): - import torch import torch.nn as nn from transformers.modeling_utils import PreTrainedModel - from transformers.pytorch_utils import is_torch_less_than_1_11 if is_diffusers_available(): from diffusers import ModelMixin @@ -101,6 +96,7 @@ def export_pytorch( output: Path, device: str = "cpu", input_shapes: Optional[Dict] = None, + model_kwargs: Optional[Dict[str, Any]] = None, ) -> Tuple[List[str], List[str]]: """ Exports a PyTorch model to an ONNX Intermediate Representation. 
@@ -132,7 +128,8 @@ def export_pytorch( with torch.no_grad(): model.config.return_dict = True - model.config.torchscript = True + custom_patcher = type(config).patch_model_for_export != OnnxConfig.patch_model_for_export + model.config.torchscript = not custom_patcher model.eval() # Check if we need to override certain configuration item @@ -157,23 +154,42 @@ def export_pytorch( inputs = config.ordered_inputs(model) input_names = list(inputs.keys()) output_names = list(config.outputs.keys()) - - if hasattr(config, "patch_ops"): - config.patch_ops() - if hasattr(model, "forward"): sig = inspect.signature(model.forward) else: sig = inspect.signature(model.call) + dummy_inputs = remove_none_from_dummy_inputs(dummy_inputs) input_info = get_input_shapes(dummy_inputs, inputs) start0 = time.perf_counter() - ov_model = mo.convert_model(model, example_input=dummy_inputs, input=input_info) + try: + if custom_patcher: + patcher = config.patch_model_for_export(model, model_kwargs=model_kwargs) + patched_forward = patcher.patched_forward + @functools.wraps(patched_forward) + def ts_patched_forward(*args, **kwargs): + outputs = patched_forward(*args, **kwargs) + return tuple(outputs.values()) + patcher.patched_forward = ts_patched_forward + with patcher: + ov_model = convert_model(model, example_input=dummy_inputs, input=input_info) + else: + ov_model = convert_model(model, example_input=dummy_inputs, input=input_info) + except Exception: + onnx_output = output.with_suffix(".onnx") + input_names, output_names = export_pytorch_to_onnx(model, config, opset, onnx_output, device, input_shapes, model_kwargs) + ov_model = convert_model(onnx_output) + serialize(ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output) + return input_names, output_names + end0 = time.perf_counter() print(f"Convert model took {end0 - start0}s") ordered_dummy_inputs = {param: dummy_inputs[param] for param in sig.parameters if param in dummy_inputs} ordered_input_names = list(inputs) flatten_inputs = flattenize_inputs(ordered_dummy_inputs.values()) + for idx, out_tensor in enumerate(ov_model.outputs): + if idx < len(output_names): + out_tensor.get_tensor().set_names({output_names[idx]}) for idx, inp_tensor in enumerate(ov_model.inputs): input_name = ordered_input_names[idx] @@ -186,18 +202,11 @@ def export_pytorch( static_shape[dim] = -1 inp_tensor.get_node().set_partial_shape(static_shape) inp_tensor.get_node().set_element_type(get_element_type(inp_data.cpu().numpy().dtype)) - - for idx, out_tensor in enumerate(ov_model.outputs): - if idx < len(output_names): - out_tensor.get_tensor().set_names({output_names[idx]}) ov_model.validate_nodes_and_infer_types() start1 = time.perf_counter() - serialize(ov_model, output.parent / OV_XML_FILE_NAME) + serialize(ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output) end1 = time.perf_counter() print(f"Serailize model took {end1 - start1}s") - if hasattr(config, "restore_ops"): - config.restore_ops() - return input_names, output_names @@ -265,6 +274,8 @@ def export_models( def flattenize_inputs(inputs): flatten_inputs = [] for input_data in inputs: + if input_data is None: + continue if isinstance(input_data, (list, tuple)): flatten_inputs.extend(flattenize_inputs(input_data)) else: @@ -272,6 +283,21 @@ def flattenize_inputs(inputs): return flatten_inputs +def remove_none_from_dummy_inputs(dummy_inputs): + def remove_none_from_list_tuple(item): + new_item = [i for i in item if i is not None] + return type(item)(new_item) + + upd_dummy = {} + 
for k, v in dummy_inputs.items(): + if v is None: + continue + if isinstance(v, (tuple, list)): + upd_dummy[k] = remove_none_from_list_tuple(v) + continue + upd_dummy[k] = v + return upd_dummy + def get_input_shapes(dummy_inputs, inputs): input_info = [] for input_name, data in dummy_inputs.items(): diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index bc1bf6cbbd..71e74b154e 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -17,6 +17,7 @@ from pathlib import Path from tempfile import TemporaryDirectory from typing import Dict, Optional, Union +import time import openvino from huggingface_hub import hf_hub_download @@ -131,8 +132,10 @@ def fix_op_names_duplicates(model: openvino.runtime.Model): if isinstance(file_name, str): file_name = Path(file_name) bin_file_name = file_name.with_suffix(".bin") if file_name.suffix == ".xml" else None - + s = time.perf_counter() model = core.read_model(file_name, bin_file_name) if not file_name.suffix == ".onnx" else mo.convert_model(file_name) + e = time.perf_counter() + print(f"Read model took {e - s}s") if file_name.suffix == ".onnx": model = fix_op_names_duplicates(model) # should be called during model conversion to IR diff --git a/optimum/intel/openvino/modeling_base_seq2seq.py b/optimum/intel/openvino/modeling_base_seq2seq.py index a8ce3d0bf5..af5e2388f8 100644 --- a/optimum/intel/openvino/modeling_base_seq2seq.py +++ b/optimum/intel/openvino/modeling_base_seq2seq.py @@ -24,8 +24,9 @@ from transformers import PretrainedConfig from transformers.file_utils import add_start_docstrings -from optimum.exporters.onnx import export_models, get_encoder_decoder_models_for_export -from optimum.exporters.tasks import TasksManager +from optimum.exporters import TasksManager +from optimum.exporters.onnx import get_encoder_decoder_models_for_export +from .export import export_models from ..utils.import_utils import is_transformers_version from .modeling_base import OVBaseModel @@ -243,9 +244,6 @@ def _from_transformers( kwargs (`Dict`, *optional*): kwargs will be passed to the model during initialization """ - encoder_file_name = os.path.join("encoder", ONNX_ENCODER_NAME) - decoder_file_name = os.path.join("decoder", ONNX_DECODER_NAME) - decoder_with_past_file_name = os.path.join("decoder_with_past", ONNX_DECODER_WITH_PAST_NAME) task = task or cls.export_feature save_dir = TemporaryDirectory() @@ -265,6 +263,9 @@ def _from_transformers( onnx_config_constructor = TasksManager.get_exporter_config_constructor(model=model, exporter="onnx", task=task) onnx_config = onnx_config_constructor(model.config, use_past=use_cache) models_and_onnx_configs = get_encoder_decoder_models_for_export(model, onnx_config) + encoder_file_name = os.path.join("encoder", OV_ENCODER_NAME) + decoder_file_name = os.path.join("decoder", OV_DECODER_NAME) + decoder_with_past_file_name = os.path.join("decoder_with_past", OV_DECODER_WITH_PAST_NAME) output_names = [encoder_file_name, decoder_file_name] if use_cache is True: @@ -281,7 +282,7 @@ def _from_transformers( model_id=save_dir_path, config=config, use_cache=use_cache, - from_onnx=True, + from_onnx=False, use_auth_token=use_auth_token, revision=revision, force_download=force_download, diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 01c3ffae26..cf2100bac3 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -17,6 +17,7 @@ from 
pathlib import Path from tempfile import TemporaryDirectory from typing import Dict, Optional, Tuple, Union +import time import numpy as np import openvino @@ -30,6 +31,7 @@ from optimum.exporters import TasksManager from optimum.exporters.onnx import export from optimum.utils import NormalizedConfigManager +#from optimum.exporters.onnx import export from ..utils.import_utils import is_transformers_version from ..utils.modeling_utils import _prepare_attn_mask, _prepare_decoder_attention_mask @@ -225,7 +227,10 @@ def _from_transformers( "force_download": force_download, "trust_remote_code": trust_remote_code, } + start0 = time.perf_counter() model = TasksManager.get_model_from_task(task, model_id, **model_kwargs) + end0 = time.perf_counter() + print(f"Reading PT model took {end0 - start0}") config.is_decoder = True config.is_encoder_decoder = False onnx_config_constructor = TasksManager.get_exporter_config_constructor(model=model, exporter="onnx", task=task) diff --git a/optimum/intel/openvino/modeling_seq2seq.py b/optimum/intel/openvino/modeling_seq2seq.py index 0f52335639..9994b70d64 100644 --- a/optimum/intel/openvino/modeling_seq2seq.py +++ b/optimum/intel/openvino/modeling_seq2seq.py @@ -413,6 +413,7 @@ def forward( if "encoder_hidden_states" in self.input_names and encoder_hidden_states is not None: inputs["encoder_hidden_states"] = encoder_hidden_states + print(self.model) # Run inference self.request.start_async(inputs, shared_memory=True) self.request.wait() From a4d7d265dca4525256a38cdce5374e1a80bce5f7 Mon Sep 17 00:00:00 2001 From: eaidova Date: Fri, 28 Jul 2023 08:33:38 +0400 Subject: [PATCH 03/38] wip --- optimum/intel/openvino/export.py | 328 ++++++++++++++++++- optimum/intel/openvino/modeling_decoder.py | 3 +- optimum/intel/openvino/modeling_diffusion.py | 2 +- 3 files changed, 324 insertions(+), 9 deletions(-) diff --git a/optimum/intel/openvino/export.py b/optimum/intel/openvino/export.py index a2012954bf..bf5cc7b933 100644 --- a/optimum/intel/openvino/export.py +++ b/optimum/intel/openvino/export.py @@ -1,15 +1,20 @@ import inspect +import os from pathlib import Path -from typing import Dict, List, Optional, Tuple, Union, Any +from typing import Dict, List, Optional, Tuple, Union, Any, Callable import functools import time from transformers.utils import is_tf_available, is_torch_available +from transformers import AutoTokenizer -from optimum.utils import is_diffusers_available -from optimum.exporters.onnx.base import OnnxConfig +from optimum.utils import is_diffusers_available, DEFAULT_DUMMY_SHAPES +from optimum.exporters import TasksManager +from optimum.exporters.onnx.base import OnnxConfig, OnnxConfigWithPast from optimum.exporters.onnx.convert import export_tensorflow, check_dummy_inputs_are_allowed from optimum.exporters.onnx.convert import export_pytorch as export_pytorch_to_onnx +from optimum.utils.save_utils import maybe_save_preprocessors +from optimum.exporters.onnx import __main__ from openvino.tools.mo import convert_model from openvino.runtime import serialize, PartialShape @@ -38,6 +43,7 @@ def export( opset: Optional[int] = None, device: str = "cpu", input_shapes: Optional[Dict] = None, + model_kwargs: Optional[Dict[str, Any]] = None, ) -> Tuple[List[str], List[str]]: """ Exports a Pytorch or TensorFlow model to an ONNX Intermediate Representation. 
@@ -71,7 +77,7 @@ def export( raise ImportError("The pip package `diffusers` is required to export stable diffusion models to ONNX.") if is_torch_available() and isinstance(model, nn.Module): - return export_pytorch(model, config, opset, output, device=device, input_shapes=input_shapes) + return export_pytorch(model, config, opset, output, device=device, input_shapes=input_shapes, model_kwargs=model_kwargs) elif is_tf_available() and issubclass(type(model), TFPreTrainedModel): output.parent.mkdir(parents=True, exist_ok=True) @@ -219,6 +225,7 @@ def export_models( output_names: Optional[List[str]] = None, device: str = "cpu", input_shapes: Optional[Dict] = None, + model_kwargs: Optional[Dict[str, Any]] = None, ) -> Tuple[List[List[str]], List[List[str]]]: """ Exports a Pytorch or TensorFlow encoder decoder model to an ONNX Intermediate Representation. @@ -264,6 +271,7 @@ def export_models( opset=opset, device=device, input_shapes=input_shapes, + model_kwargs=model_kwargs, ) ) @@ -292,6 +300,10 @@ def remove_none_from_list_tuple(item): for k, v in dummy_inputs.items(): if v is None: continue + if isinstance(v, dict): + for kk, vv in v.items(): + upd_dummy[kk] = vv + continue if isinstance(v, (tuple, list)): upd_dummy[k] = remove_none_from_list_tuple(v) continue @@ -301,7 +313,7 @@ def remove_none_from_list_tuple(item): def get_input_shapes(dummy_inputs, inputs): input_info = [] for input_name, data in dummy_inputs.items(): - if isinstance(data, (tuple, list)): + if isinstance(data, (tuple, list, dict)): return None static_shape = PartialShape(data.shape) if input_name in inputs: @@ -309,4 +321,308 @@ def get_input_shapes(dummy_inputs, inputs): for dim in dynamic_dims: static_shape[dim] = -1 input_info.append((input_name, static_shape)) - return input_info \ No newline at end of file + return input_info + + +def main_export( + model_name_or_path: str, + output: Union[str, Path], + task: str = "auto", + device: str = "cpu", + fp16: Optional[bool] = False, + optimize: Optional[str] = None, + monolith: bool = False, + framework: Optional[str] = None, + cache_dir: Optional[str] = None, + trust_remote_code: bool = False, + pad_token_id: Optional[int] = None, + subfolder: str = "", + revision: str = "main", + force_download: bool = False, + local_files_only: bool = False, + use_auth_token: Optional[Union[bool, str]] = None, + for_ort: bool = False, + model_kwargs: Optional[Dict[str, Any]] = None, + custom_onnx_configs: Optional[Dict[str, "OnnxConfig"]] = None, + fn_get_submodels: Optional[Callable] = None, + **kwargs_shapes, +): + """ + Full-suite ONNX export. + + Args: + > Required parameters + + model_name_or_path (`str`): + Model ID on huggingface.co or path on disk to the model repository to export. + output (`Union[str, Path]`): + Path indicating the directory where to store the generated ONNX model. + + > Optional parameters + + task (`Optional[str]`, defaults to `None`): + The task to export the model for. If not specified, the task will be auto-inferred based on the model. For decoder models, + use `xxx-with-past` to export the model using past key values in the decoder. + opset (`Optional[int]`, defaults to `None`): + If specified, ONNX opset version to export the model with. Otherwise, the default opset for the given model architecture + will be used. + device (`str`, defaults to `"cpu"`): + The device to use to do the export. Defaults to "cpu". + fp16 (`Optional[bool]`, defaults to `"False"`): + Use half precision during the export. PyTorch-only, requires `device="cuda"`. 
+ optimize (`Optional[str]`, defaults to `None`): + Allows to run ONNX Runtime optimizations directly during the export. Some of these optimizations are specific to + ONNX Runtime, and the resulting ONNX will not be usable with other runtime as OpenVINO or TensorRT. + Available options: `"O1", "O2", "O3", "O4"`. Reference: [`~optimum.onnxruntime.AutoOptimizationConfig`] + monolith (`bool`, defaults to `False`): + Forces to export the model as a single ONNX file. + no_post_process (`bool`, defaults to `False`): + Allows to disable any post-processing done by default on the exported ONNX models. + framework (`Optional[str]`, defaults to `None`): + The framework to use for the ONNX export (`"pt"` or `"tf"`). If not provided, will attempt to automatically detect + the framework for the checkpoint. + atol (`Optional[float]`, defaults to `None`): + If specified, the absolute difference tolerance when validating the model. Otherwise, the default atol for the model will be used. + cache_dir (`Optional[str]`, defaults to `None`): + Path indicating where to store cache. The default Hugging Face cache path will be used by default. + trust_remote_code (`bool`, defaults to `False`): + Allows to use custom code for the modeling hosted in the model repository. This option should only be set for repositories + you trust and in which you have read the code, as it will execute on your local machine arbitrary code present in the + model repository. + pad_token_id (`Optional[int]`, defaults to `None`): + This is needed by some models, for some tasks. If not provided, will attempt to use the tokenizer to guess it. + subfolder (`str`, defaults to `""`): + In case the relevant files are located inside a subfolder of the model repo either locally or on huggingface.co, you can + specify the folder name here. + revision (`str`, defaults to `"main"`): + Revision is the specific model version to use. It can be a branch name, a tag name, or a commit id. + force_download (`bool`, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + local_files_only (`Optional[bool]`, defaults to `False`): + Whether or not to only look at local files (i.e., do not try to download the model). + use_auth_token (`Optional[str]`, defaults to `None`): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `transformers-cli login` (stored in `~/.huggingface`). + model_kwargs (`Optional[Dict[str, Any]]`, defaults to `None`): + Experimental usage: keyword arguments to pass to the model during + the export. This argument should be used along the `custom_onnx_configs` argument + in case, for example, the model inputs/outputs are changed (for example, if + `model_kwargs={"output_attentions": True}` is passed). + custom_onnx_configs (`Optional[Dict[str, OnnxConfig]]`, defaults to `None`): + Experimental usage: override the default ONNX config used for the given model. This argument may be useful for advanced users that desire a finer-grained control on the export. An example is available [here](https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model). + fn_get_submodels (`Optional[Callable]`, defaults to `None`): + Experimental usage: Override the default submodels that are used at the export. This is + especially useful when exporting a custom architecture that needs to split the ONNX (e.g. encoder-decoder). 
If unspecified with custom models, optimum will try to use the default submodels used for the given task, with no guarantee of success. + use_subprocess (`bool`): + Do the ONNX exported model validation in subprocesses. This is especially useful when + exporting on CUDA device, where ORT does not release memory at inference session + destruction. When set to `True`, the `main_export` call should be guarded in + `if __name__ == "__main__":` block. + **kwargs_shapes (`Dict`): + Shapes to use during inference. This argument allows to override the default shapes used during the ONNX export. + + Example usage: + ```python + >>> from optimum.exporters.onnx import main_export + + >>> main_export("gpt2", output="gpt2_onnx/") + ``` + """ + if optimize == "O4" and device != "cuda": + raise ValueError( + "Requested O4 optimization, but this optimization requires to do the export on GPU." + " Please pass the argument `--device cuda`." + ) + + if (framework == "tf" and fp16 is True) or not is_torch_available(): + raise ValueError("The --fp16 option is supported only for PyTorch.") + + if fp16 is True and device == "cpu": + raise ValueError( + "The --fp16 option is supported only when exporting on GPU. Please pass the option `--device cuda`." + ) + + output = Path(output) + if not output.exists(): + output.mkdir(parents=True) + + if for_ort: + logger.warning( + "The option --for-ort was passed, but its behavior is now the default in the ONNX exporter" + " and passing it is not required anymore." + ) + + original_task = task + task = TasksManager.map_from_synonym(task) + + framework = TasksManager.determine_framework(model_name_or_path, subfolder=subfolder, framework=framework) + + # get the shapes to be used to generate dummy inputs + input_shapes = {} + for input_name in DEFAULT_DUMMY_SHAPES.keys(): + input_shapes[input_name] = ( + kwargs_shapes[input_name] if input_name in kwargs_shapes else DEFAULT_DUMMY_SHAPES[input_name] + ) + + torch_dtype = None if fp16 is False else torch.float16 + + if task == "auto": + try: + task = TasksManager.infer_task_from_model(model_name_or_path) + except KeyError as e: + raise KeyError( + f"The task could not be automatically inferred. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}" + ) + except RequestsConnectionError as e: + raise RequestsConnectionError( + f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}" + ) + + model = TasksManager.get_model_from_task( + task, + model_name_or_path, + subfolder=subfolder, + revision=revision, + cache_dir=cache_dir, + use_auth_token=use_auth_token, + local_files_only=local_files_only, + force_download=force_download, + trust_remote_code=trust_remote_code, + framework=framework, + torch_dtype=torch_dtype, + device=device, + ) + + custom_architecture = False + is_stable_diffusion = "stable-diffusion" in task + model_type = "stable-diffusion" if is_stable_diffusion else model.config.model_type.replace("_", "-") + + if not is_stable_diffusion: + if model_type in TasksManager._UNSUPPORTED_CLI_MODEL_TYPE: + raise ValueError( + f"{model_type} is not supported yet. Only {TasksManager._SUPPORTED_CLI_MODEL_TYPE} are supported. " + f"If you want to support {model_type} please propose a PR or open up an issue." 
+ ) + if model.config.model_type.replace("-", "_") not in TasksManager.get_supported_model_type_for_task( + task, exporter="onnx" + ): + custom_architecture = True + + # TODO: support onnx_config.py in the model repo + if custom_architecture and custom_onnx_configs is None: + raise ValueError( + "Trying to export a model with a custom architecture, but no custom onnx configuration was passed as `custom_onnx_configs`. Please refer to https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#custom-export-of-transformers-models for an example on how to export custom models." + ) + + if custom_architecture and original_task == "auto": + raise ValueError( + f'Automatic task detection is not supported with custom architectures. Please specify the `task` argument. Suggestion: task="{task}" (or task="{task}-with-past" if the model is decoder-based and supports KV cache)' + ) + + if ( + not custom_architecture + and not is_stable_diffusion + and task + "-with-past" in TasksManager.get_supported_tasks_for_model_type(model_type, "onnx") + ): + if original_task == "auto": # Make -with-past the default if --task was not explicitely specified + task = task + "-with-past" + else: + print( + f"The task `{task}` was manually specified, and past key values will not be reused in the decoding." + f" if needed, please pass `--task {task}-with-past` to export using the past key values." + ) + + if task.endswith("-with-past") and monolith is True: + task_non_past = task.replace("-with-past", "") + raise ValueError( + f"The task {task} is not compatible with the --monolith argument. Please either use" + f" `--task {task_non_past} --monolith`, or `--task {task}` without the monolith argument." + ) + + if original_task == "auto": + synonyms_for_task = sorted(TasksManager.synonyms_for_task(task)) + if synonyms_for_task: + synonyms_for_task = ", ".join(synonyms_for_task) + possible_synonyms = f" (possible synonyms are: {synonyms_for_task})" + else: + possible_synonyms = "" + print(f"Automatic task detection to {task}{possible_synonyms}.") + + onnx_config, models_and_onnx_configs = __main__._get_submodels_and_onnx_configs( + model=model, + task=task, + monolith=monolith, + custom_onnx_configs=custom_onnx_configs if custom_onnx_configs is not None else {}, + custom_architecture=custom_architecture, + fn_get_submodels=fn_get_submodels, + ) + + if not is_stable_diffusion: + needs_pad_token_id = ( + isinstance(onnx_config, OnnxConfigWithPast) + and getattr(model.config, "pad_token_id", None) is None + and task in ["text-classification"] + ) + if needs_pad_token_id: + if pad_token_id is not None: + model.config.pad_token_id = pad_token_id + else: + try: + tok = AutoTokenizer.from_pretrained(model_name_or_path) + model.config.pad_token_id = tok.pad_token_id + except Exception: + raise ValueError( + "Could not infer the pad token id, which is needed in this case, please provide it with the --pad_token_id argument" + ) + # Saving the model config and preprocessor as this is needed sometimes. + model.config.save_pretrained(output) + generation_config = getattr(model, "generation_config", None) + if generation_config is not None: + generation_config.save_pretrained(output) + maybe_save_preprocessors(model_name_or_path, output) + + if model.config.is_encoder_decoder and task.startswith("text-generation"): + raise ValueError( + f"model.config.is_encoder_decoder is True and task is `{task}`, which are incompatible. 
If the task was auto-inferred, please fill a bug report" + f"at https://github.com/huggingface/optimum, if --task was explicitely passed, make sure you selected the right task for the model," + f" referring to `optimum.exporters.tasks.TaskManager`'s `_TASKS_TO_AUTOMODELS`." + ) + + files_subpaths = None + else: + # save the subcomponent configuration + for model_name in models_and_onnx_configs: + subcomponent = models_and_onnx_configs[model_name][0] + if hasattr(subcomponent, "save_config"): + subcomponent.save_config(output / model_name) + elif hasattr(subcomponent, "config") and hasattr(subcomponent.config, "save_pretrained"): + subcomponent.config.save_pretrained(output / model_name) + + files_subpaths = [os.path.join(name_dir, OV_XML_FILE_NAME) for name_dir in models_and_onnx_configs] + + # Saving the additional components needed to perform inference. + model.scheduler.save_pretrained(output.joinpath("scheduler")) + + feature_extractor = getattr(model, "feature_extractor", None) + if feature_extractor is not None: + feature_extractor.save_pretrained(output.joinpath("feature_extractor")) + + tokenizer = getattr(model, "tokenizer", None) + if tokenizer is not None: + tokenizer.save_pretrained(output.joinpath("tokenizer")) + + tokenizer_2 = getattr(model, "tokenizer_2", None) + if tokenizer_2 is not None: + tokenizer_2.save_pretrained(output.joinpath("tokenizer_2")) + + model.save_config(output) + + export_models( + models_and_onnx_configs=models_and_onnx_configs, + output_dir=output, + output_names=files_subpaths, + input_shapes=input_shapes, + device=device, + model_kwargs=model_kwargs, + ) \ No newline at end of file diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index cf2100bac3..bcdc077a3d 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -29,14 +29,13 @@ from transformers.modeling_outputs import CausalLMOutputWithPast from optimum.exporters import TasksManager -from optimum.exporters.onnx import export from optimum.utils import NormalizedConfigManager -#from optimum.exporters.onnx import export from ..utils.import_utils import is_transformers_version from ..utils.modeling_utils import _prepare_attn_mask, _prepare_decoder_attention_mask from .modeling import _TOKENIZER_FOR_DOC, INPUTS_DOCSTRING, MODEL_START_DOCSTRING, OVModel from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, STR_TO_OV_TYPE +from .export import export, is_torch_model if is_transformers_version("<", "4.25.0"): diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 4c54e69614..c0d0870db3 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -36,7 +36,7 @@ from openvino.runtime import Core from transformers import CLIPFeatureExtractor, CLIPTokenizer -from optimum.exporters.onnx import main_export +from .export import main_export from optimum.pipelines.diffusers.pipeline_stable_diffusion import StableDiffusionPipelineMixin from optimum.pipelines.diffusers.pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipelineMixin from optimum.pipelines.diffusers.pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipelineMixin From 323776375d1d01b443bcecc8fffe141322f6e9d5 Mon Sep 17 00:00:00 2001 From: eaidova Date: Wed, 2 Aug 2023 13:57:17 +0400 Subject: [PATCH 04/38] cleanup --- optimum/intel/openvino/export.py | 181 ++++----------------- 
optimum/intel/openvino/modeling_base.py | 7 +- optimum/intel/openvino/modeling_decoder.py | 4 - optimum/intel/openvino/modeling_seq2seq.py | 2 - optimum/intel/openvino/quantization.py | 49 +++--- optimum/intel/openvino/trainer.py | 2 +- setup.py | 4 +- 7 files changed, 65 insertions(+), 184 deletions(-) diff --git a/optimum/intel/openvino/export.py b/optimum/intel/openvino/export.py index bf5cc7b933..ba12f53e26 100644 --- a/optimum/intel/openvino/export.py +++ b/optimum/intel/openvino/export.py @@ -1,9 +1,9 @@ +import logging import inspect import os from pathlib import Path from typing import Dict, List, Optional, Tuple, Union, Any, Callable import functools -import time from transformers.utils import is_tf_available, is_torch_available from transformers import AutoTokenizer @@ -16,11 +16,13 @@ from optimum.utils.save_utils import maybe_save_preprocessors from optimum.exporters.onnx import __main__ -from openvino.tools.mo import convert_model +from openvino.tools.mo import convert_model from openvino.runtime import serialize, PartialShape from openvino.runtime.utils.types import get_element_type from .utils import OV_XML_FILE_NAME +logger = logging.getLogger(__name__) + if is_torch_available(): import torch.nn as nn from transformers.modeling_utils import PreTrainedModel @@ -31,11 +33,13 @@ if is_tf_available(): from transformers.modeling_tf_utils import TFPreTrainedModel + def is_torch_model(model): if not is_torch_available(): return False return isinstance(model, nn.Module) + def export( model: Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"], config: OnnxConfig, @@ -77,7 +81,9 @@ def export( raise ImportError("The pip package `diffusers` is required to export stable diffusion models to ONNX.") if is_torch_available() and isinstance(model, nn.Module): - return export_pytorch(model, config, opset, output, device=device, input_shapes=input_shapes, model_kwargs=model_kwargs) + return export_pytorch( + model, config, opset, output, device=device, input_shapes=input_shapes, model_kwargs=model_kwargs + ) elif is_tf_available() and issubclass(type(model), TFPreTrainedModel): output.parent.mkdir(parents=True, exist_ok=True) @@ -86,7 +92,7 @@ def export( if device == "cuda": raise RuntimeError("`tf2onnx` does not support export on CUDA device.") if input_shapes is not None: - print("`input_shapes` argument is not supported by the Tensorflow ONNX export and will be ignored.") + logger.info("`input_shapes` argument is not supported by the Tensorflow ONNX export and will be ignored.") return export_tensorflow(model, config, opset, output) else: @@ -127,10 +133,10 @@ def export_pytorch( the ONNX configuration. 
""" import torch - from torch.onnx import export as onnx_export from torch.utils._pytree import tree_map - print(f"Using framework PyTorch: {torch.__version__}") + logger.info(f"Using framework PyTorch: {torch.__version__}") + output = Path(output) with torch.no_grad(): model.config.return_dict = True @@ -140,9 +146,9 @@ def export_pytorch( # Check if we need to override certain configuration item if config.values_override is not None: - print(f"Overriding {len(config.values_override)} configuration item(s)") + logger.info(f"Overriding {len(config.values_override)} configuration item(s)") for override_config_key, override_config_value in config.values_override.items(): - print(f"\t- {override_config_key} -> {override_config_value}") + logger.info(f"\t- {override_config_key} -> {override_config_value}") setattr(model.config, override_config_key, override_config_value) if input_shapes is None: @@ -167,36 +173,39 @@ def export_pytorch( dummy_inputs = remove_none_from_dummy_inputs(dummy_inputs) input_info = get_input_shapes(dummy_inputs, inputs) - start0 = time.perf_counter() try: if custom_patcher: patcher = config.patch_model_for_export(model, model_kwargs=model_kwargs) patched_forward = patcher.patched_forward + @functools.wraps(patched_forward) def ts_patched_forward(*args, **kwargs): outputs = patched_forward(*args, **kwargs) return tuple(outputs.values()) + patcher.patched_forward = ts_patched_forward with patcher: ov_model = convert_model(model, example_input=dummy_inputs, input=input_info) else: ov_model = convert_model(model, example_input=dummy_inputs, input=input_info) except Exception: + model.config.torchscript = False + model.config.return_dict = True onnx_output = output.with_suffix(".onnx") - input_names, output_names = export_pytorch_to_onnx(model, config, opset, onnx_output, device, input_shapes, model_kwargs) + input_names, output_names = export_pytorch_to_onnx( + model, config, opset, onnx_output, device, input_shapes, model_kwargs + ) ov_model = convert_model(onnx_output) serialize(ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output) - return input_names, output_names + return input_names, output_names, True - end0 = time.perf_counter() - print(f"Convert model took {end0 - start0}s") ordered_dummy_inputs = {param: dummy_inputs[param] for param in sig.parameters if param in dummy_inputs} ordered_input_names = list(inputs) flatten_inputs = flattenize_inputs(ordered_dummy_inputs.values()) for idx, out_tensor in enumerate(ov_model.outputs): if idx < len(output_names): out_tensor.get_tensor().set_names({output_names[idx]}) - + for idx, inp_tensor in enumerate(ov_model.inputs): input_name = ordered_input_names[idx] inp_tensor.get_tensor().set_names({input_name}) @@ -205,15 +214,12 @@ def ts_patched_forward(*args, **kwargs): dims = inputs[input_name] for dim in dims: - static_shape[dim] = -1 + static_shape[dim] = -1 inp_tensor.get_node().set_partial_shape(static_shape) inp_tensor.get_node().set_element_type(get_element_type(inp_data.cpu().numpy().dtype)) ov_model.validate_nodes_and_infer_types() - start1 = time.perf_counter() serialize(ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output) - end1 = time.perf_counter() - print(f"Serailize model took {end1 - start1}s") - return input_names, output_names + return input_names, output_names, False def export_models( @@ -227,30 +233,6 @@ def export_models( input_shapes: Optional[Dict] = None, model_kwargs: Optional[Dict[str, Any]] = None, ) -> Tuple[List[List[str]], 
List[List[str]]]: - """ - Exports a Pytorch or TensorFlow encoder decoder model to an ONNX Intermediate Representation. - The following method exports the encoder and decoder components of the model as separate - ONNX files. - - Args: - models_and_onnx_configs (`Dict[str, Tuple[Union[`PreTrainedModel`, `TFPreTrainedModel`], `OnnxConfig`]]): - A dictionnary containing the models to export and their corresponding onnx configs. - output_dir (`Path`): - Output directory to store the exported ONNX models. - opset (`Optional[int]`, defaults to `None`): - The version of the ONNX operator set to use. - output_names (`Optional[List[str]]`, defaults to `None`): - The names to use for the exported ONNX files. The order must be the same as the order of submodels in the ordered dict `models_and_onnx_configs`. - If None, will use the keys from `models_and_onnx_configs` as names. - device (`str`, defaults to `"cpu"`): - The device on which the ONNX model will be exported. Either `cpu` or `cuda`. Only PyTorch is supported for - export on CUDA devices. - input_shapes (`Optional[Dict]`, defaults to `None`): - If specified, allows to use specific shapes for the example input provided to the ONNX exporter. - Returns: - `Tuple[List[List[str]], List[List[str]]]`: A tuple with an ordered list of the model's inputs, and the named - inputs from the ONNX configuration. - """ outputs = [] if output_names is not None and len(output_names) != len(models_and_onnx_configs): @@ -296,7 +278,7 @@ def remove_none_from_list_tuple(item): new_item = [i for i in item if i is not None] return type(item)(new_item) - upd_dummy = {} + upd_dummy = {} for k, v in dummy_inputs.items(): if v is None: continue @@ -310,6 +292,7 @@ def remove_none_from_list_tuple(item): upd_dummy[k] = v return upd_dummy + def get_input_shapes(dummy_inputs, inputs): input_info = [] for input_name, data in dummy_inputs.items(): @@ -330,7 +313,6 @@ def main_export( task: str = "auto", device: str = "cpu", fp16: Optional[bool] = False, - optimize: Optional[str] = None, monolith: bool = False, framework: Optional[str] = None, cache_dir: Optional[str] = None, @@ -341,118 +323,15 @@ def main_export( force_download: bool = False, local_files_only: bool = False, use_auth_token: Optional[Union[bool, str]] = None, - for_ort: bool = False, model_kwargs: Optional[Dict[str, Any]] = None, custom_onnx_configs: Optional[Dict[str, "OnnxConfig"]] = None, fn_get_submodels: Optional[Callable] = None, **kwargs_shapes, ): - """ - Full-suite ONNX export. - - Args: - > Required parameters - - model_name_or_path (`str`): - Model ID on huggingface.co or path on disk to the model repository to export. - output (`Union[str, Path]`): - Path indicating the directory where to store the generated ONNX model. - - > Optional parameters - - task (`Optional[str]`, defaults to `None`): - The task to export the model for. If not specified, the task will be auto-inferred based on the model. For decoder models, - use `xxx-with-past` to export the model using past key values in the decoder. - opset (`Optional[int]`, defaults to `None`): - If specified, ONNX opset version to export the model with. Otherwise, the default opset for the given model architecture - will be used. - device (`str`, defaults to `"cpu"`): - The device to use to do the export. Defaults to "cpu". - fp16 (`Optional[bool]`, defaults to `"False"`): - Use half precision during the export. PyTorch-only, requires `device="cuda"`. 
- optimize (`Optional[str]`, defaults to `None`): - Allows to run ONNX Runtime optimizations directly during the export. Some of these optimizations are specific to - ONNX Runtime, and the resulting ONNX will not be usable with other runtime as OpenVINO or TensorRT. - Available options: `"O1", "O2", "O3", "O4"`. Reference: [`~optimum.onnxruntime.AutoOptimizationConfig`] - monolith (`bool`, defaults to `False`): - Forces to export the model as a single ONNX file. - no_post_process (`bool`, defaults to `False`): - Allows to disable any post-processing done by default on the exported ONNX models. - framework (`Optional[str]`, defaults to `None`): - The framework to use for the ONNX export (`"pt"` or `"tf"`). If not provided, will attempt to automatically detect - the framework for the checkpoint. - atol (`Optional[float]`, defaults to `None`): - If specified, the absolute difference tolerance when validating the model. Otherwise, the default atol for the model will be used. - cache_dir (`Optional[str]`, defaults to `None`): - Path indicating where to store cache. The default Hugging Face cache path will be used by default. - trust_remote_code (`bool`, defaults to `False`): - Allows to use custom code for the modeling hosted in the model repository. This option should only be set for repositories - you trust and in which you have read the code, as it will execute on your local machine arbitrary code present in the - model repository. - pad_token_id (`Optional[int]`, defaults to `None`): - This is needed by some models, for some tasks. If not provided, will attempt to use the tokenizer to guess it. - subfolder (`str`, defaults to `""`): - In case the relevant files are located inside a subfolder of the model repo either locally or on huggingface.co, you can - specify the folder name here. - revision (`str`, defaults to `"main"`): - Revision is the specific model version to use. It can be a branch name, a tag name, or a commit id. - force_download (`bool`, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - local_files_only (`Optional[bool]`, defaults to `False`): - Whether or not to only look at local files (i.e., do not try to download the model). - use_auth_token (`Optional[str]`, defaults to `None`): - The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `transformers-cli login` (stored in `~/.huggingface`). - model_kwargs (`Optional[Dict[str, Any]]`, defaults to `None`): - Experimental usage: keyword arguments to pass to the model during - the export. This argument should be used along the `custom_onnx_configs` argument - in case, for example, the model inputs/outputs are changed (for example, if - `model_kwargs={"output_attentions": True}` is passed). - custom_onnx_configs (`Optional[Dict[str, OnnxConfig]]`, defaults to `None`): - Experimental usage: override the default ONNX config used for the given model. This argument may be useful for advanced users that desire a finer-grained control on the export. An example is available [here](https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model). - fn_get_submodels (`Optional[Callable]`, defaults to `None`): - Experimental usage: Override the default submodels that are used at the export. This is - especially useful when exporting a custom architecture that needs to split the ONNX (e.g. encoder-decoder). 
If unspecified with custom models, optimum will try to use the default submodels used for the given task, with no guarantee of success. - use_subprocess (`bool`): - Do the ONNX exported model validation in subprocesses. This is especially useful when - exporting on CUDA device, where ORT does not release memory at inference session - destruction. When set to `True`, the `main_export` call should be guarded in - `if __name__ == "__main__":` block. - **kwargs_shapes (`Dict`): - Shapes to use during inference. This argument allows to override the default shapes used during the ONNX export. - - Example usage: - ```python - >>> from optimum.exporters.onnx import main_export - - >>> main_export("gpt2", output="gpt2_onnx/") - ``` - """ - if optimize == "O4" and device != "cuda": - raise ValueError( - "Requested O4 optimization, but this optimization requires to do the export on GPU." - " Please pass the argument `--device cuda`." - ) - - if (framework == "tf" and fp16 is True) or not is_torch_available(): - raise ValueError("The --fp16 option is supported only for PyTorch.") - - if fp16 is True and device == "cpu": - raise ValueError( - "The --fp16 option is supported only when exporting on GPU. Please pass the option `--device cuda`." - ) - output = Path(output) if not output.exists(): output.mkdir(parents=True) - if for_ort: - logger.warning( - "The option --for-ort was passed, but its behavior is now the default in the ONNX exporter" - " and passing it is not required anymore." - ) - original_task = task task = TasksManager.map_from_synonym(task) @@ -528,7 +407,7 @@ def main_export( if original_task == "auto": # Make -with-past the default if --task was not explicitely specified task = task + "-with-past" else: - print( + logger.info( f"The task `{task}` was manually specified, and past key values will not be reused in the decoding." f" if needed, please pass `--task {task}-with-past` to export using the past key values." 
) @@ -547,7 +426,7 @@ def main_export( possible_synonyms = f" (possible synonyms are: {synonyms_for_task})" else: possible_synonyms = "" - print(f"Automatic task detection to {task}{possible_synonyms}.") + logger.info(f"Automatic task detection to {task}{possible_synonyms}.") onnx_config, models_and_onnx_configs = __main__._get_submodels_and_onnx_configs( model=model, @@ -625,4 +504,4 @@ def main_export( input_shapes=input_shapes, device=device, model_kwargs=model_kwargs, - ) \ No newline at end of file + ) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 71e74b154e..cf8e94c0c7 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -133,9 +133,12 @@ def fix_op_names_duplicates(model: openvino.runtime.Model): file_name = Path(file_name) bin_file_name = file_name.with_suffix(".bin") if file_name.suffix == ".xml" else None s = time.perf_counter() - model = core.read_model(file_name, bin_file_name) if not file_name.suffix == ".onnx" else mo.convert_model(file_name) + model = ( + core.read_model(file_name, bin_file_name) + if not file_name.suffix == ".onnx" + else mo.convert_model(file_name) + ) e = time.perf_counter() - print(f"Read model took {e - s}s") if file_name.suffix == ".onnx": model = fix_op_names_duplicates(model) # should be called during model conversion to IR diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index bcdc077a3d..cf81437b01 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -17,7 +17,6 @@ from pathlib import Path from tempfile import TemporaryDirectory from typing import Dict, Optional, Tuple, Union -import time import numpy as np import openvino @@ -226,10 +225,7 @@ def _from_transformers( "force_download": force_download, "trust_remote_code": trust_remote_code, } - start0 = time.perf_counter() model = TasksManager.get_model_from_task(task, model_id, **model_kwargs) - end0 = time.perf_counter() - print(f"Reading PT model took {end0 - start0}") config.is_decoder = True config.is_encoder_decoder = False onnx_config_constructor = TasksManager.get_exporter_config_constructor(model=model, exporter="onnx", task=task) diff --git a/optimum/intel/openvino/modeling_seq2seq.py b/optimum/intel/openvino/modeling_seq2seq.py index 9994b70d64..4d5f4e2934 100644 --- a/optimum/intel/openvino/modeling_seq2seq.py +++ b/optimum/intel/openvino/modeling_seq2seq.py @@ -412,8 +412,6 @@ def forward( # Add the encoder_hidden_states inputs when needed if "encoder_hidden_states" in self.input_names and encoder_hidden_states is not None: inputs["encoder_hidden_states"] = encoder_hidden_states - - print(self.model) # Run inference self.request.start_async(inputs, shared_memory=True) self.request.wait() diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 99e22e72f5..6c7db722e2 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -33,7 +33,8 @@ from torch.utils.data import DataLoader, RandomSampler, TensorDataset from transformers import DataCollator, PreTrainedModel, default_data_collator -from optimum.exporters.onnx import export +from optimum.exporters.onnx import export as onnx_export +from .export import export from optimum.exporters.tasks import TasksManager from optimum.quantization_base import OptimumQuantizer @@ -384,28 +385,32 @@ def data_collator(batch): else: onnx_config = 
onnx_config_class(model.config) - onnx_path = save_directory / ONNX_WEIGHTS_NAME - - # Export the model to the ONNX format - opset = min(onnx_config.DEFAULT_ONNX_OPSET, MAX_ONNX_OPSET) - opset = max(opset, MIN_ONNX_QDQ_OPSET) - export( - model=compressed_model, - config=onnx_config, - opset=opset, - output=onnx_path, - ) + model_path = save_directory / (ONNX_WEIGHTS_NAME if quantization_config.save_onnx_model else OV_XML_FILE_NAME) + if quantization_config.save_onnx_model: + # Export the model to the ONNX format + opset = min(onnx_config.DEFAULT_ONNX_OPSET, MAX_ONNX_OPSET) + opset = max(opset, MIN_ONNX_QDQ_OPSET) + onnx_export( + model=compressed_model, + config=onnx_config, + opset=opset, + output=model_path, + ) - # Load and save the compressed model - model = core.read_model(onnx_path) - self._save_pretrained(model, output_path) - quantization_config.save_pretrained(save_directory) - if not quantization_config.save_onnx_model: - os.remove(onnx_path) - try: - os.remove(f"{onnx_path}_data") - except FileNotFoundError: - pass + # Load and save the compressed model + model = core.read_model(model_path) + self._save_pretrained(model, output_path) + else: + _, _, is_onnx = export(model=compressed_model, config=onnx_config, output=output_path) + if is_onnx: + onnx_path = output_path.replace(".xml", ".onnx") + model = core.read_model(onnx_path) + self._save_pretrained(model, output_path) + os.remove(onnx_path) + try: + os.remove(f"{onnx_path}_data") + except FileNotFoundError: + pass @staticmethod def _save_pretrained(model: openvino.runtime.Model, output_path: str): diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py index 091d28047a..811309806a 100644 --- a/optimum/intel/openvino/trainer.py +++ b/optimum/intel/openvino/trainer.py @@ -41,7 +41,7 @@ from nncf.torch.quantization.algo import QuantizationController from openvino._offline_transformations import compress_quantize_weights_transformation from openvino.runtime import Core, PartialShape, serialize -from openvino.tools.ovc.moc_frontend.offline_transformations import ( +from openvino.tools.mo.back.offline_transformations import ( apply_fused_names_cleanup, apply_moc_transformations, apply_user_transformations, diff --git a/setup.py b/setup.py index c35640226d..6ddc9fdd6e 100644 --- a/setup.py +++ b/setup.py @@ -41,8 +41,8 @@ "onnx", "onnxruntime<1.15.0", ], - "openvino": ["openvino>=2023.0.0", "onnx", "onnxruntime"], - "nncf": ["nncf>=2.5.0", "openvino-dev>=2023.0.0"], + "openvino": ["openvino==2023.1.0.dev20230728", "onnx", "onnxruntime"], + "nncf": ["nncf>=2.5.0", "openvino-dev==2023.1.0.dev20230728"], "ipex": ["transformers<4.32.0", "intel-extension-for-pytorch", "onnx"], "diffusers": ["diffusers", "invisible-watermark>=0.2.0"], "quality": QUALITY_REQUIRE, From 58acfb59434887b8049f39d24667d49856db288f Mon Sep 17 00:00:00 2001 From: eaidova Date: Wed, 2 Aug 2023 14:23:41 +0400 Subject: [PATCH 05/38] fix style --- optimum/intel/openvino/export.py | 23 +++++++++++-------- optimum/intel/openvino/modeling_base.py | 7 ++---- .../intel/openvino/modeling_base_seq2seq.py | 2 +- optimum/intel/openvino/modeling_decoder.py | 2 +- optimum/intel/openvino/modeling_diffusion.py | 3 +-- optimum/intel/openvino/quantization.py | 2 +- 6 files changed, 19 insertions(+), 20 deletions(-) diff --git a/optimum/intel/openvino/export.py b/optimum/intel/openvino/export.py index ba12f53e26..143f725fc7 100644 --- a/optimum/intel/openvino/export.py +++ b/optimum/intel/openvino/export.py @@ -1,29 +1,32 @@ -import logging +import 
functools import inspect +import logging import os from pathlib import Path -from typing import Dict, List, Optional, Tuple, Union, Any, Callable -import functools +from typing import Any, Callable, Dict, List, Optional, Tuple, Union -from transformers.utils import is_tf_available, is_torch_available +from openvino.runtime import PartialShape, serialize +from openvino.runtime.utils.types import get_element_type +from openvino.tools.mo import convert_model +from requests.exceptions import ConnectionError as RequestsConnectionError from transformers import AutoTokenizer +from transformers.utils import is_tf_available, is_torch_available -from optimum.utils import is_diffusers_available, DEFAULT_DUMMY_SHAPES from optimum.exporters import TasksManager +from optimum.exporters.onnx import __main__ from optimum.exporters.onnx.base import OnnxConfig, OnnxConfigWithPast -from optimum.exporters.onnx.convert import export_tensorflow, check_dummy_inputs_are_allowed +from optimum.exporters.onnx.convert import check_dummy_inputs_are_allowed, export_tensorflow from optimum.exporters.onnx.convert import export_pytorch as export_pytorch_to_onnx +from optimum.utils import DEFAULT_DUMMY_SHAPES, is_diffusers_available from optimum.utils.save_utils import maybe_save_preprocessors -from optimum.exporters.onnx import __main__ -from openvino.tools.mo import convert_model -from openvino.runtime import serialize, PartialShape -from openvino.runtime.utils.types import get_element_type from .utils import OV_XML_FILE_NAME + logger = logging.getLogger(__name__) if is_torch_available(): + import torch import torch.nn as nn from transformers.modeling_utils import PreTrainedModel diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index cf8e94c0c7..1388497b55 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -17,22 +17,21 @@ from pathlib import Path from tempfile import TemporaryDirectory from typing import Dict, Optional, Union -import time import openvino from huggingface_hub import hf_hub_download from openvino._offline_transformations import apply_moc_transformations, compress_model_transformation -from openvino.tools import mo from openvino.runtime import Core +from openvino.tools import mo from transformers import PretrainedConfig from transformers.file_utils import add_start_docstrings from optimum.exporters.onnx import OnnxConfig from optimum.exporters.tasks import TasksManager -from .export import export, is_torch_model from optimum.modeling_base import OptimizedModel from ..utils.import_utils import is_transformers_version +from .export import export, is_torch_model from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME @@ -132,13 +131,11 @@ def fix_op_names_duplicates(model: openvino.runtime.Model): if isinstance(file_name, str): file_name = Path(file_name) bin_file_name = file_name.with_suffix(".bin") if file_name.suffix == ".xml" else None - s = time.perf_counter() model = ( core.read_model(file_name, bin_file_name) if not file_name.suffix == ".onnx" else mo.convert_model(file_name) ) - e = time.perf_counter() if file_name.suffix == ".onnx": model = fix_op_names_duplicates(model) # should be called during model conversion to IR diff --git a/optimum/intel/openvino/modeling_base_seq2seq.py b/optimum/intel/openvino/modeling_base_seq2seq.py index af5e2388f8..5a5e195845 100644 --- a/optimum/intel/openvino/modeling_base_seq2seq.py +++ b/optimum/intel/openvino/modeling_base_seq2seq.py @@ -26,9 +26,9 @@ from 
optimum.exporters import TasksManager from optimum.exporters.onnx import get_encoder_decoder_models_for_export -from .export import export_models from ..utils.import_utils import is_transformers_version +from .export import export_models from .modeling_base import OVBaseModel from .utils import ( ONNX_DECODER_NAME, diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index cf81437b01..7cc2b34f2c 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -32,9 +32,9 @@ from ..utils.import_utils import is_transformers_version from ..utils.modeling_utils import _prepare_attn_mask, _prepare_decoder_attention_mask +from .export import export, is_torch_model from .modeling import _TOKENIZER_FOR_DOC, INPUTS_DOCSTRING, MODEL_START_DOCSTRING, OVModel from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, STR_TO_OV_TYPE -from .export import export, is_torch_model if is_transformers_version("<", "4.25.0"): diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index c0d0870db3..c807f61b3f 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -36,7 +36,6 @@ from openvino.runtime import Core from transformers import CLIPFeatureExtractor, CLIPTokenizer -from .export import main_export from optimum.pipelines.diffusers.pipeline_stable_diffusion import StableDiffusionPipelineMixin from optimum.pipelines.diffusers.pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipelineMixin from optimum.pipelines.diffusers.pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipelineMixin @@ -51,7 +50,7 @@ ) from .loaders import OVTextualInversionLoaderMixin -from .export import export_models +from .export import main_export from .modeling_base import OVBaseModel from .utils import ONNX_WEIGHTS_NAME, OV_TO_NP_TYPE, OV_XML_FILE_NAME diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 6c7db722e2..94b809bd39 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -34,12 +34,12 @@ from transformers import DataCollator, PreTrainedModel, default_data_collator from optimum.exporters.onnx import export as onnx_export -from .export import export from optimum.exporters.tasks import TasksManager from optimum.quantization_base import OptimumQuantizer from ..utils.constant import _TASK_ALIASES from .configuration import INT8_WEIGHT_COMPRESSION_CONFIG, OVConfig +from .export import export from .modeling_base import OVBaseModel from .modeling_decoder import OVBaseDecoderModel from .utils import ( From bf94ecc60eec265d1d33df4885b24cf18323d516 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Wed, 2 Aug 2023 14:14:43 +0400 Subject: [PATCH 06/38] revert changes not related to pr --- optimum/intel/openvino/modeling_decoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 7cc2b34f2c..b5cbc2ba48 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -234,7 +234,7 @@ def _from_transformers( # TODO : create ModelPatcher to patch each architecture if config.model_type == "bloom": model.transformer._prepare_attn_mask = _prepare_attn_mask - elif config.model_type in {"llama", "longllama"}: + elif config.model_type == "llama": 
model.model._prepare_decoder_attention_mask = _prepare_decoder_attention_mask elif config.model_type in {"blenderbot-small", "blenderbot", "opt", "pegasus", "bart"}: model.model.decoder._prepare_decoder_attention_mask = _prepare_decoder_attention_mask From 744c2b643c0e882a617daec3bf6836b6c9976a71 Mon Sep 17 00:00:00 2001 From: eaidova Date: Wed, 2 Aug 2023 16:19:35 +0400 Subject: [PATCH 07/38] clear ts registry: --- optimum/intel/openvino/export.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/optimum/intel/openvino/export.py b/optimum/intel/openvino/export.py index 143f725fc7..bb637a5353 100644 --- a/optimum/intel/openvino/export.py +++ b/optimum/intel/openvino/export.py @@ -1,4 +1,5 @@ import functools +import gc import inspect import logging import os @@ -201,7 +202,7 @@ def ts_patched_forward(*args, **kwargs): ov_model = convert_model(onnx_output) serialize(ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output) return input_names, output_names, True - + clear_class_registry() ordered_dummy_inputs = {param: dummy_inputs[param] for param in sig.parameters if param in dummy_inputs} ordered_input_names = list(inputs) flatten_inputs = flattenize_inputs(ordered_dummy_inputs.values()) @@ -222,9 +223,17 @@ def ts_patched_forward(*args, **kwargs): inp_tensor.get_node().set_element_type(get_element_type(inp_data.cpu().numpy().dtype)) ov_model.validate_nodes_and_infer_types() serialize(ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output) + del model + gc.collect() return input_names, output_names, False +def clear_class_registry(): + torch._C._jit_clear_class_registry() + torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() + torch.jit._state._clear_class_state() + + def export_models( models_and_onnx_configs: Dict[ str, Tuple[Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"], "OnnxConfig"] From 1ca1edb6b6a9f1253d986911be5f41253ea7c720 Mon Sep 17 00:00:00 2001 From: eaidova Date: Wed, 2 Aug 2023 18:00:44 +0400 Subject: [PATCH 08/38] remove ov dev from deps --- optimum/intel/openvino/export.py | 7 +++---- optimum/intel/openvino/modeling_base.py | 7 ++----- optimum/intel/openvino/quantization.py | 1 + optimum/intel/openvino/trainer.py | 12 ++++++------ setup.py | 2 +- 5 files changed, 13 insertions(+), 16 deletions(-) diff --git a/optimum/intel/openvino/export.py b/optimum/intel/openvino/export.py index bb637a5353..5852195d63 100644 --- a/optimum/intel/openvino/export.py +++ b/optimum/intel/openvino/export.py @@ -6,9 +6,8 @@ from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Tuple, Union -from openvino.runtime import PartialShape, serialize +from openvino import PartialShape, convert_model, save_model from openvino.runtime.utils.types import get_element_type -from openvino.tools.mo import convert_model from requests.exceptions import ConnectionError as RequestsConnectionError from transformers import AutoTokenizer from transformers.utils import is_tf_available, is_torch_available @@ -200,7 +199,7 @@ def ts_patched_forward(*args, **kwargs): model, config, opset, onnx_output, device, input_shapes, model_kwargs ) ov_model = convert_model(onnx_output) - serialize(ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output) + save_model(ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output) return input_names, output_names, True clear_class_registry() ordered_dummy_inputs = {param: 
dummy_inputs[param] for param in sig.parameters if param in dummy_inputs} @@ -222,7 +221,7 @@ def ts_patched_forward(*args, **kwargs): inp_tensor.get_node().set_partial_shape(static_shape) inp_tensor.get_node().set_element_type(get_element_type(inp_data.cpu().numpy().dtype)) ov_model.validate_nodes_and_infer_types() - serialize(ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output) + save_model(ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output) del model gc.collect() return input_names, output_names, False diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 1388497b55..4c99bef09f 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -20,9 +20,8 @@ import openvino from huggingface_hub import hf_hub_download +from openvino import Core, convert_model from openvino._offline_transformations import apply_moc_transformations, compress_model_transformation -from openvino.runtime import Core -from openvino.tools import mo from transformers import PretrainedConfig from transformers.file_utils import add_start_docstrings @@ -132,9 +131,7 @@ def fix_op_names_duplicates(model: openvino.runtime.Model): file_name = Path(file_name) bin_file_name = file_name.with_suffix(".bin") if file_name.suffix == ".xml" else None model = ( - core.read_model(file_name, bin_file_name) - if not file_name.suffix == ".onnx" - else mo.convert_model(file_name) + core.read_model(file_name, bin_file_name) if not file_name.suffix == ".onnx" else convert_model(file_name) ) if file_name.suffix == ".onnx": model = fix_op_names_duplicates(model) # should be called during model conversion to IR diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 94b809bd39..311ad211f9 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -411,6 +411,7 @@ def data_collator(batch): os.remove(f"{onnx_path}_data") except FileNotFoundError: pass + quantization_config.save_pretrained(save_directory) @staticmethod def _save_pretrained(model: openvino.runtime.Model, output_path: str): diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py index 811309806a..22d402c80f 100644 --- a/optimum/intel/openvino/trainer.py +++ b/optimum/intel/openvino/trainer.py @@ -39,13 +39,13 @@ from nncf.torch.compression_method_api import PTCompressionAlgorithmController from nncf.torch.nncf_network import NNCFNetwork from nncf.torch.quantization.algo import QuantizationController -from openvino._offline_transformations import compress_quantize_weights_transformation -from openvino.runtime import Core, PartialShape, serialize -from openvino.tools.mo.back.offline_transformations import ( +from openvino._offline_transformations import ( apply_fused_names_cleanup, apply_moc_transformations, - apply_user_transformations, + apply_pruning_transformation, + compress_quantize_weights_transformation, ) +from openvino.runtime import Core, PartialShape, save_model from torch.onnx import export as onnx_export from torch.utils._pytree import tree_map from torch.utils.data import DataLoader, Dataset, RandomSampler @@ -755,7 +755,7 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None): apply_moc_transformations(ov_model) if self._get_compression_controller_by_cls(QuantizationController) is not None: compress_quantize_weights_transformation(ov_model) - apply_user_transformations(ov_model, [("Pruning", {})]) 
+ apply_pruning_transformation(ov_model) apply_fused_names_cleanup(ov_model) # Reshape back to dynamic shape IR ov_model = self._reshape_ir(ov_model, static_shape=False) @@ -772,7 +772,7 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None): compress_quantize_weights_transformation(ov_model) # Serialize IR xml and bin - serialize(ov_model, output_path) + save_model(ov_model, output_path) def _get_compression_controller_by_cls( self, controller_cls: Type[PTCompressionAlgorithmController] diff --git a/setup.py b/setup.py index 6ddc9fdd6e..7bdb9c062e 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ "onnxruntime<1.15.0", ], "openvino": ["openvino==2023.1.0.dev20230728", "onnx", "onnxruntime"], - "nncf": ["nncf>=2.5.0", "openvino-dev==2023.1.0.dev20230728"], + "nncf": ["nncf>=2.5.0"], "ipex": ["transformers<4.32.0", "intel-extension-for-pytorch", "onnx"], "diffusers": ["diffusers", "invisible-watermark>=0.2.0"], "quality": QUALITY_REQUIRE, From 3430ab024f433d541dbc1fce8a5286c721543910 Mon Sep 17 00:00:00 2001 From: eaidova Date: Wed, 2 Aug 2023 18:14:35 +0400 Subject: [PATCH 09/38] update tests --- tests/openvino/test_modeling.py | 4 ++++ tests/openvino/test_quantization.py | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index b56b7766e7..2ffbbd6fba 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -257,6 +257,7 @@ def test_pipeline(self, model_arch): self.assertTrue(not model.is_dynamic) self.assertGreaterEqual(outputs[0]["score"], 0.0) self.assertIsInstance(outputs[0]["label"], str) + del model gc.collect() @@ -293,6 +294,8 @@ def test_compare_to_transformers(self, model_arch): self.assertTrue( torch.allclose(torch.Tensor(ov_outputs.end_logits), transformers_outputs.end_logits, atol=1e-4) ) + del ov_model + del transformers_model gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -307,6 +310,7 @@ def test_pipeline(self, model_arch): self.assertEqual(pipe.device, model.device) self.assertGreaterEqual(outputs["score"], 0.0) self.assertIsInstance(outputs["answer"], str) + del model gc.collect() def test_metric(self): diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index da9ba3b25a..4a2f96447c 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -64,7 +64,7 @@ def get_num_quantized_nodes(ov_model): class OVQuantizerTest(unittest.TestCase): # TODO : add models SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = ( - (OVModelForSequenceClassification, "hf-internal-testing/tiny-random-bert", 42, 32), + (OVModelForSequenceClassification, "hf-internal-testing/tiny-random-bert", 32, 32), (OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", 41, 21), ) @@ -146,7 +146,7 @@ def preprocess_function(examples, tokenizer): class OVWeightCompressionTest(unittest.TestCase): # TODO : add models SUPPORTED_ARCHITECTURES_WITH_EXPECTED_COMPRESSED_MATMULS = ( - (OVModelForSequenceClassification, "hf-internal-testing/tiny-random-bert", 39), + (OVModelForSequenceClassification, "hf-internal-testing/tiny-random-bert", 35), (OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", 5), ) From 661980b6db4f5a45d8618cdc257ae44786dbc40c Mon Sep 17 00:00:00 2001 From: eaidova Date: Wed, 2 Aug 2023 18:46:46 +0400 Subject: [PATCH 10/38] return serialize back --- optimum/intel/openvino/export.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/optimum/intel/openvino/export.py b/optimum/intel/openvino/export.py index 5852195d63..375df18ac4 100644 --- a/optimum/intel/openvino/export.py +++ b/optimum/intel/openvino/export.py @@ -6,7 +6,7 @@ from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Tuple, Union -from openvino import PartialShape, convert_model, save_model +from openvino import PartialShape, convert_model, serialize from openvino.runtime.utils.types import get_element_type from requests.exceptions import ConnectionError as RequestsConnectionError from transformers import AutoTokenizer @@ -199,7 +199,7 @@ def ts_patched_forward(*args, **kwargs): model, config, opset, onnx_output, device, input_shapes, model_kwargs ) ov_model = convert_model(onnx_output) - save_model(ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output) + serialize(ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output) return input_names, output_names, True clear_class_registry() ordered_dummy_inputs = {param: dummy_inputs[param] for param in sig.parameters if param in dummy_inputs} @@ -221,7 +221,7 @@ def ts_patched_forward(*args, **kwargs): inp_tensor.get_node().set_partial_shape(static_shape) inp_tensor.get_node().set_element_type(get_element_type(inp_data.cpu().numpy().dtype)) ov_model.validate_nodes_and_infer_types() - save_model(ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output) + serialize(ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output) del model gc.collect() return input_names, output_names, False From ddd98e8e8bca8b9e5dde984d8622e6ca95fe8a04 Mon Sep 17 00:00:00 2001 From: eaidova Date: Thu, 13 Jul 2023 10:27:10 +0400 Subject: [PATCH 11/38] switch on pytorch frontend --- optimum/intel/openvino/export.py | 124 +++++++++++++++++++++ optimum/intel/openvino/modeling_base.py | 2 + optimum/intel/openvino/modeling_decoder.py | 3 +- 3 files changed, 128 insertions(+), 1 deletion(-) diff --git a/optimum/intel/openvino/export.py b/optimum/intel/openvino/export.py index 375df18ac4..d736016cf9 100644 --- a/optimum/intel/openvino/export.py +++ b/optimum/intel/openvino/export.py @@ -29,6 +29,10 @@ import torch import torch.nn as nn from transformers.modeling_utils import PreTrainedModel +<<<<<<< HEAD +======= + from transformers.pytorch_utils import is_torch_less_than_1_11 +>>>>>>> switch on pytorch frontend if is_diffusers_available(): from diffusers import ModelMixin @@ -36,13 +40,19 @@ if is_tf_available(): from transformers.modeling_tf_utils import TFPreTrainedModel +<<<<<<< HEAD +======= +>>>>>>> switch on pytorch frontend def is_torch_model(model): if not is_torch_available(): return False return isinstance(model, nn.Module) +<<<<<<< HEAD +======= +>>>>>>> switch on pytorch frontend def export( model: Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"], config: OnnxConfig, @@ -50,7 +60,10 @@ def export( opset: Optional[int] = None, device: str = "cpu", input_shapes: Optional[Dict] = None, +<<<<<<< HEAD model_kwargs: Optional[Dict[str, Any]] = None, +======= +>>>>>>> switch on pytorch frontend ) -> Tuple[List[str], List[str]]: """ Exports a Pytorch or TensorFlow model to an ONNX Intermediate Representation. 
@@ -84,9 +97,13 @@ def export( raise ImportError("The pip package `diffusers` is required to export stable diffusion models to ONNX.") if is_torch_available() and isinstance(model, nn.Module): +<<<<<<< HEAD return export_pytorch( model, config, opset, output, device=device, input_shapes=input_shapes, model_kwargs=model_kwargs ) +======= + return export_pytorch(model, config, opset, output, device=device, input_shapes=input_shapes) +>>>>>>> switch on pytorch frontend elif is_tf_available() and issubclass(type(model), TFPreTrainedModel): output.parent.mkdir(parents=True, exist_ok=True) @@ -95,7 +112,11 @@ def export( if device == "cuda": raise RuntimeError("`tf2onnx` does not support export on CUDA device.") if input_shapes is not None: +<<<<<<< HEAD logger.info("`input_shapes` argument is not supported by the Tensorflow ONNX export and will be ignored.") +======= + print("`input_shapes` argument is not supported by the Tensorflow ONNX export and will be ignored.") +>>>>>>> switch on pytorch frontend return export_tensorflow(model, config, opset, output) else: @@ -111,7 +132,10 @@ def export_pytorch( output: Path, device: str = "cpu", input_shapes: Optional[Dict] = None, +<<<<<<< HEAD model_kwargs: Optional[Dict[str, Any]] = None, +======= +>>>>>>> switch on pytorch frontend ) -> Tuple[List[str], List[str]]: """ Exports a PyTorch model to an ONNX Intermediate Representation. @@ -136,6 +160,7 @@ def export_pytorch( the ONNX configuration. """ import torch +<<<<<<< HEAD from torch.utils._pytree import tree_map logger.info(f"Using framework PyTorch: {torch.__version__}") @@ -145,13 +170,29 @@ def export_pytorch( model.config.return_dict = True custom_patcher = type(config).patch_model_for_export != OnnxConfig.patch_model_for_export model.config.torchscript = not custom_patcher +======= + from torch.onnx import export as onnx_export + from torch.utils._pytree import tree_map + + print(f"Using framework PyTorch: {torch.__version__}") + + with torch.no_grad(): + model.config.return_dict = True + model.config.torchscript = True +>>>>>>> switch on pytorch frontend model.eval() # Check if we need to override certain configuration item if config.values_override is not None: +<<<<<<< HEAD logger.info(f"Overriding {len(config.values_override)} configuration item(s)") for override_config_key, override_config_value in config.values_override.items(): logger.info(f"\t- {override_config_key} -> {override_config_value}") +======= + print(f"Overriding {len(config.values_override)} configuration item(s)") + for override_config_key, override_config_value in config.values_override.items(): + print(f"\t- {override_config_key} -> {override_config_value}") +>>>>>>> switch on pytorch frontend setattr(model.config, override_config_key, override_config_value) if input_shapes is None: @@ -169,11 +210,19 @@ def export_pytorch( inputs = config.ordered_inputs(model) input_names = list(inputs.keys()) output_names = list(config.outputs.keys()) +<<<<<<< HEAD +======= + + if hasattr(config, "patch_ops"): + config.patch_ops() + +>>>>>>> switch on pytorch frontend if hasattr(model, "forward"): sig = inspect.signature(model.forward) else: sig = inspect.signature(model.call) +<<<<<<< HEAD dummy_inputs = remove_none_from_dummy_inputs(dummy_inputs) input_info = get_input_shapes(dummy_inputs, inputs) try: @@ -209,6 +258,17 @@ def ts_patched_forward(*args, **kwargs): if idx < len(output_names): out_tensor.get_tensor().set_names({output_names[idx]}) +======= + input_info = get_input_shapes(dummy_inputs, inputs) + start0 = 
time.perf_counter() + ov_model = mo.convert_model(model, example_input=dummy_inputs, input=input_info) + end0 = time.perf_counter() + print(f"Convert model took {end0 - start0}s") + ordered_dummy_inputs = {param: dummy_inputs[param] for param in sig.parameters if param in dummy_inputs} + ordered_input_names = list(inputs) + flatten_inputs = flattenize_inputs(ordered_dummy_inputs.values()) + +>>>>>>> switch on pytorch frontend for idx, inp_tensor in enumerate(ov_model.inputs): input_name = ordered_input_names[idx] inp_tensor.get_tensor().set_names({input_name}) @@ -217,6 +277,7 @@ def ts_patched_forward(*args, **kwargs): dims = inputs[input_name] for dim in dims: +<<<<<<< HEAD static_shape[dim] = -1 inp_tensor.get_node().set_partial_shape(static_shape) inp_tensor.get_node().set_element_type(get_element_type(inp_data.cpu().numpy().dtype)) @@ -231,6 +292,24 @@ def clear_class_registry(): torch._C._jit_clear_class_registry() torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() torch.jit._state._clear_class_state() +======= + static_shape[dim] = -1 + inp_tensor.get_node().set_partial_shape(static_shape) + inp_tensor.get_node().set_element_type(get_element_type(inp_data.cpu().numpy().dtype)) + + for idx, out_tensor in enumerate(ov_model.outputs): + if idx < len(output_names): + out_tensor.get_tensor().set_names({output_names[idx]}) + ov_model.validate_nodes_and_infer_types() + start1 = time.perf_counter() + serialize(ov_model, output.parent / OV_XML_FILE_NAME) + end1 = time.perf_counter() + print(f"Serailize model took {end1 - start1}s") + if hasattr(config, "restore_ops"): + config.restore_ops() + + return input_names, output_names +>>>>>>> switch on pytorch frontend def export_models( @@ -242,8 +321,36 @@ def export_models( output_names: Optional[List[str]] = None, device: str = "cpu", input_shapes: Optional[Dict] = None, +<<<<<<< HEAD model_kwargs: Optional[Dict[str, Any]] = None, ) -> Tuple[List[List[str]], List[List[str]]]: +======= +) -> Tuple[List[List[str]], List[List[str]]]: + """ + Exports a Pytorch or TensorFlow encoder decoder model to an ONNX Intermediate Representation. + The following method exports the encoder and decoder components of the model as separate + ONNX files. + + Args: + models_and_onnx_configs (`Dict[str, Tuple[Union[`PreTrainedModel`, `TFPreTrainedModel`], `OnnxConfig`]]): + A dictionnary containing the models to export and their corresponding onnx configs. + output_dir (`Path`): + Output directory to store the exported ONNX models. + opset (`Optional[int]`, defaults to `None`): + The version of the ONNX operator set to use. + output_names (`Optional[List[str]]`, defaults to `None`): + The names to use for the exported ONNX files. The order must be the same as the order of submodels in the ordered dict `models_and_onnx_configs`. + If None, will use the keys from `models_and_onnx_configs` as names. + device (`str`, defaults to `"cpu"`): + The device on which the ONNX model will be exported. Either `cpu` or `cuda`. Only PyTorch is supported for + export on CUDA devices. + input_shapes (`Optional[Dict]`, defaults to `None`): + If specified, allows to use specific shapes for the example input provided to the ONNX exporter. + Returns: + `Tuple[List[List[str]], List[List[str]]]`: A tuple with an ordered list of the model's inputs, and the named + inputs from the ONNX configuration. 
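The conflicted hunk above is the heart of this patch series: the PyTorch module is handed straight to OpenVINO's `convert_model` with an `example_input`, instead of being routed through an ONNX file first. A stripped-down sketch of that direct path, using a toy module and an arbitrary output name rather than the real transformers model:

    import torch
    from openvino.tools.mo import convert_model
    from openvino.runtime import serialize

    class TinyModel(torch.nn.Module):
        # Toy module standing in for the transformers model being exported.
        def forward(self, input_ids):
            return input_ids.float() * 2

    model = TinyModel().eval()
    example = torch.zeros(1, 16, dtype=torch.long)

    # The PyTorch frontend traces the module with the example input; no ONNX file is produced.
    ov_model = convert_model(model, example_input=example)
    serialize(ov_model, "tiny_model.xml")  # writes tiny_model.xml + tiny_model.bin
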
+ """ +>>>>>>> switch on pytorch frontend outputs = [] if output_names is not None and len(output_names) != len(models_and_onnx_configs): @@ -264,7 +371,10 @@ def export_models( opset=opset, device=device, input_shapes=input_shapes, +<<<<<<< HEAD model_kwargs=model_kwargs, +======= +>>>>>>> switch on pytorch frontend ) ) @@ -275,8 +385,11 @@ def export_models( def flattenize_inputs(inputs): flatten_inputs = [] for input_data in inputs: +<<<<<<< HEAD if input_data is None: continue +======= +>>>>>>> switch on pytorch frontend if isinstance(input_data, (list, tuple)): flatten_inputs.extend(flattenize_inputs(input_data)) else: @@ -284,6 +397,7 @@ def flattenize_inputs(inputs): return flatten_inputs +<<<<<<< HEAD def remove_none_from_dummy_inputs(dummy_inputs): def remove_none_from_list_tuple(item): new_item = [i for i in item if i is not None] @@ -308,6 +422,12 @@ def get_input_shapes(dummy_inputs, inputs): input_info = [] for input_name, data in dummy_inputs.items(): if isinstance(data, (tuple, list, dict)): +======= +def get_input_shapes(dummy_inputs, inputs): + input_info = [] + for input_name, data in dummy_inputs.items(): + if isinstance(data, (tuple, list)): +>>>>>>> switch on pytorch frontend return None static_shape = PartialShape(data.shape) if input_name in inputs: @@ -315,6 +435,7 @@ def get_input_shapes(dummy_inputs, inputs): for dim in dynamic_dims: static_shape[dim] = -1 input_info.append((input_name, static_shape)) +<<<<<<< HEAD return input_info @@ -516,3 +637,6 @@ def main_export( device=device, model_kwargs=model_kwargs, ) +======= + return input_info +>>>>>>> switch on pytorch frontend diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 4c99bef09f..b5d3ae438b 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -22,6 +22,8 @@ from huggingface_hub import hf_hub_download from openvino import Core, convert_model from openvino._offline_transformations import apply_moc_transformations, compress_model_transformation +from openvino.tools import mo +from openvino.runtime import Core from transformers import PretrainedConfig from transformers.file_utils import add_start_docstrings diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index b5cbc2ba48..c8a7210642 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -28,6 +28,7 @@ from transformers.modeling_outputs import CausalLMOutputWithPast from optimum.exporters import TasksManager +from optimum.exporters.onnx import export from optimum.utils import NormalizedConfigManager from ..utils.import_utils import is_transformers_version @@ -234,7 +235,7 @@ def _from_transformers( # TODO : create ModelPatcher to patch each architecture if config.model_type == "bloom": model.transformer._prepare_attn_mask = _prepare_attn_mask - elif config.model_type == "llama": + elif config.model_type in {"llama", "longllama"}: model.model._prepare_decoder_attention_mask = _prepare_decoder_attention_mask elif config.model_type in {"blenderbot-small", "blenderbot", "opt", "pegasus", "bart"}: model.model.decoder._prepare_decoder_attention_mask = _prepare_decoder_attention_mask From e10b087ad4fb916d1a993763b4d6a7acbca47139 Mon Sep 17 00:00:00 2001 From: eaidova Date: Wed, 19 Jul 2023 11:41:07 +0400 Subject: [PATCH 12/38] fixes for seq2seq --- optimum/intel/openvino/export.py | 71 ++++++++++++++++--- optimum/intel/openvino/modeling_base.py | 1 + 
.../intel/openvino/modeling_base_seq2seq.py | 1 + optimum/intel/openvino/modeling_decoder.py | 5 ++ 4 files changed, 68 insertions(+), 10 deletions(-) diff --git a/optimum/intel/openvino/export.py b/optimum/intel/openvino/export.py index d736016cf9..aa370e5f1f 100644 --- a/optimum/intel/openvino/export.py +++ b/optimum/intel/openvino/export.py @@ -26,13 +26,15 @@ logger = logging.getLogger(__name__) if is_torch_available(): - import torch import torch.nn as nn from transformers.modeling_utils import PreTrainedModel <<<<<<< HEAD +<<<<<<< HEAD ======= from transformers.pytorch_utils import is_torch_less_than_1_11 >>>>>>> switch on pytorch frontend +======= +>>>>>>> fixes for seq2seq if is_diffusers_available(): from diffusers import ModelMixin @@ -132,10 +134,14 @@ def export_pytorch( output: Path, device: str = "cpu", input_shapes: Optional[Dict] = None, +<<<<<<< HEAD <<<<<<< HEAD model_kwargs: Optional[Dict[str, Any]] = None, ======= >>>>>>> switch on pytorch frontend +======= + model_kwargs: Optional[Dict[str, Any]] = None, +>>>>>>> fixes for seq2seq ) -> Tuple[List[str], List[str]]: """ Exports a PyTorch model to an ONNX Intermediate Representation. @@ -178,8 +184,13 @@ def export_pytorch( with torch.no_grad(): model.config.return_dict = True +<<<<<<< HEAD model.config.torchscript = True >>>>>>> switch on pytorch frontend +======= + custom_patcher = type(config).patch_model_for_export != OnnxConfig.patch_model_for_export + model.config.torchscript = not custom_patcher +>>>>>>> fixes for seq2seq model.eval() # Check if we need to override certain configuration item @@ -211,17 +222,21 @@ def export_pytorch( input_names = list(inputs.keys()) output_names = list(config.outputs.keys()) <<<<<<< HEAD +<<<<<<< HEAD ======= if hasattr(config, "patch_ops"): config.patch_ops() >>>>>>> switch on pytorch frontend +======= +>>>>>>> fixes for seq2seq if hasattr(model, "forward"): sig = inspect.signature(model.forward) else: sig = inspect.signature(model.call) +<<<<<<< HEAD <<<<<<< HEAD dummy_inputs = remove_none_from_dummy_inputs(dummy_inputs) input_info = get_input_shapes(dummy_inputs, inputs) @@ -259,14 +274,39 @@ def ts_patched_forward(*args, **kwargs): out_tensor.get_tensor().set_names({output_names[idx]}) ======= +======= + dummy_inputs = remove_none_from_dummy_inputs(dummy_inputs) +>>>>>>> fixes for seq2seq input_info = get_input_shapes(dummy_inputs, inputs) start0 = time.perf_counter() - ov_model = mo.convert_model(model, example_input=dummy_inputs, input=input_info) + try: + if custom_patcher: + patcher = config.patch_model_for_export(model, model_kwargs=model_kwargs) + patched_forward = patcher.patched_forward + @functools.wraps(patched_forward) + def ts_patched_forward(*args, **kwargs): + outputs = patched_forward(*args, **kwargs) + return tuple(outputs.values()) + patcher.patched_forward = ts_patched_forward + with patcher: + ov_model = convert_model(model, example_input=dummy_inputs, input=input_info) + else: + ov_model = convert_model(model, example_input=dummy_inputs, input=input_info) + except Exception: + onnx_output = output.with_suffix(".onnx") + input_names, output_names = export_pytorch_to_onnx(model, config, opset, onnx_output, device, input_shapes, model_kwargs) + ov_model = convert_model(onnx_output) + serialize(ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output) + return input_names, output_names + end0 = time.perf_counter() print(f"Convert model took {end0 - start0}s") ordered_dummy_inputs = {param: dummy_inputs[param] for param in sig.parameters 
if param in dummy_inputs} ordered_input_names = list(inputs) flatten_inputs = flattenize_inputs(ordered_dummy_inputs.values()) + for idx, out_tensor in enumerate(ov_model.outputs): + if idx < len(output_names): + out_tensor.get_tensor().set_names({output_names[idx]}) >>>>>>> switch on pytorch frontend for idx, inp_tensor in enumerate(ov_model.inputs): @@ -296,18 +336,11 @@ def clear_class_registry(): static_shape[dim] = -1 inp_tensor.get_node().set_partial_shape(static_shape) inp_tensor.get_node().set_element_type(get_element_type(inp_data.cpu().numpy().dtype)) - - for idx, out_tensor in enumerate(ov_model.outputs): - if idx < len(output_names): - out_tensor.get_tensor().set_names({output_names[idx]}) ov_model.validate_nodes_and_infer_types() start1 = time.perf_counter() - serialize(ov_model, output.parent / OV_XML_FILE_NAME) + serialize(ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output) end1 = time.perf_counter() print(f"Serailize model took {end1 - start1}s") - if hasattr(config, "restore_ops"): - config.restore_ops() - return input_names, output_names >>>>>>> switch on pytorch frontend @@ -385,11 +418,16 @@ def export_models( def flattenize_inputs(inputs): flatten_inputs = [] for input_data in inputs: +<<<<<<< HEAD <<<<<<< HEAD if input_data is None: continue ======= >>>>>>> switch on pytorch frontend +======= + if input_data is None: + continue +>>>>>>> fixes for seq2seq if isinstance(input_data, (list, tuple)): flatten_inputs.extend(flattenize_inputs(input_data)) else: @@ -398,11 +436,15 @@ def flattenize_inputs(inputs): <<<<<<< HEAD +<<<<<<< HEAD +======= +>>>>>>> fixes for seq2seq def remove_none_from_dummy_inputs(dummy_inputs): def remove_none_from_list_tuple(item): new_item = [i for i in item if i is not None] return type(item)(new_item) +<<<<<<< HEAD upd_dummy = {} for k, v in dummy_inputs.items(): if v is None: @@ -411,18 +453,27 @@ def remove_none_from_list_tuple(item): for kk, vv in v.items(): upd_dummy[kk] = vv continue +======= + upd_dummy = {} + for k, v in dummy_inputs.items(): + if v is None: + continue +>>>>>>> fixes for seq2seq if isinstance(v, (tuple, list)): upd_dummy[k] = remove_none_from_list_tuple(v) continue upd_dummy[k] = v return upd_dummy +<<<<<<< HEAD def get_input_shapes(dummy_inputs, inputs): input_info = [] for input_name, data in dummy_inputs.items(): if isinstance(data, (tuple, list, dict)): ======= +======= +>>>>>>> fixes for seq2seq def get_input_shapes(dummy_inputs, inputs): input_info = [] for input_name, data in dummy_inputs.items(): diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index b5d3ae438b..ec480a18a3 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -17,6 +17,7 @@ from pathlib import Path from tempfile import TemporaryDirectory from typing import Dict, Optional, Union +import time import openvino from huggingface_hub import hf_hub_download diff --git a/optimum/intel/openvino/modeling_base_seq2seq.py b/optimum/intel/openvino/modeling_base_seq2seq.py index 5a5e195845..f80cd58030 100644 --- a/optimum/intel/openvino/modeling_base_seq2seq.py +++ b/optimum/intel/openvino/modeling_base_seq2seq.py @@ -26,6 +26,7 @@ from optimum.exporters import TasksManager from optimum.exporters.onnx import get_encoder_decoder_models_for_export +from .export import export_models from ..utils.import_utils import is_transformers_version from .export import export_models diff --git a/optimum/intel/openvino/modeling_decoder.py 
b/optimum/intel/openvino/modeling_decoder.py index c8a7210642..d7e678f16c 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -17,6 +17,7 @@ from pathlib import Path from tempfile import TemporaryDirectory from typing import Dict, Optional, Tuple, Union +import time import numpy as np import openvino @@ -30,6 +31,7 @@ from optimum.exporters import TasksManager from optimum.exporters.onnx import export from optimum.utils import NormalizedConfigManager +#from optimum.exporters.onnx import export from ..utils.import_utils import is_transformers_version from ..utils.modeling_utils import _prepare_attn_mask, _prepare_decoder_attention_mask @@ -226,7 +228,10 @@ def _from_transformers( "force_download": force_download, "trust_remote_code": trust_remote_code, } + start0 = time.perf_counter() model = TasksManager.get_model_from_task(task, model_id, **model_kwargs) + end0 = time.perf_counter() + print(f"Reading PT model took {end0 - start0}") config.is_decoder = True config.is_encoder_decoder = False onnx_config_constructor = TasksManager.get_exporter_config_constructor(model=model, exporter="onnx", task=task) From c1b73e10d0341065e9f98e5d12c8cb7dab54a48f Mon Sep 17 00:00:00 2001 From: eaidova Date: Fri, 28 Jul 2023 08:33:38 +0400 Subject: [PATCH 13/38] wip --- optimum/intel/openvino/export.py | 170 ++++++++++++++++++++- optimum/intel/openvino/modeling_decoder.py | 3 +- 2 files changed, 166 insertions(+), 7 deletions(-) diff --git a/optimum/intel/openvino/export.py b/optimum/intel/openvino/export.py index aa370e5f1f..063fb8b467 100644 --- a/optimum/intel/openvino/export.py +++ b/optimum/intel/openvino/export.py @@ -1,24 +1,26 @@ import functools import gc import inspect -import logging import os from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union, Any, Callable +import functools +import time from openvino import PartialShape, convert_model, serialize from openvino.runtime.utils.types import get_element_type from requests.exceptions import ConnectionError as RequestsConnectionError from transformers import AutoTokenizer from transformers.utils import is_tf_available, is_torch_available +from transformers import AutoTokenizer +from optimum.utils import is_diffusers_available, DEFAULT_DUMMY_SHAPES from optimum.exporters import TasksManager -from optimum.exporters.onnx import __main__ from optimum.exporters.onnx.base import OnnxConfig, OnnxConfigWithPast -from optimum.exporters.onnx.convert import check_dummy_inputs_are_allowed, export_tensorflow +from optimum.exporters.onnx.convert import export_tensorflow, check_dummy_inputs_are_allowed from optimum.exporters.onnx.convert import export_pytorch as export_pytorch_to_onnx -from optimum.utils import DEFAULT_DUMMY_SHAPES, is_diffusers_available from optimum.utils.save_utils import maybe_save_preprocessors +from optimum.exporters.onnx import __main__ from .utils import OV_XML_FILE_NAME @@ -62,10 +64,14 @@ def export( opset: Optional[int] = None, device: str = "cpu", input_shapes: Optional[Dict] = None, +<<<<<<< HEAD <<<<<<< HEAD model_kwargs: Optional[Dict[str, Any]] = None, ======= >>>>>>> switch on pytorch frontend +======= + model_kwargs: Optional[Dict[str, Any]] = None, +>>>>>>> wip ) -> Tuple[List[str], List[str]]: """ Exports a Pytorch or TensorFlow model to an ONNX Intermediate Representation. 
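The try/except introduced a couple of hunks back falls back to an ONNX round-trip whenever direct tracing through the PyTorch frontend fails. Reduced to its essentials, the pattern looks like the sketch below; `convert_with_fallback` and its arguments are illustrative names, not part of the exporter.

    from pathlib import Path

    import torch
    from openvino.runtime import serialize
    from openvino.tools.mo import convert_model

    def convert_with_fallback(model: torch.nn.Module, example_input: torch.Tensor, output: Path):
        # Prefer OpenVINO's PyTorch frontend; if tracing fails, export a temporary
        # ONNX file with torch.onnx and convert that file instead.
        try:
            ov_model = convert_model(model, example_input=example_input)
        except Exception:
            onnx_path = output.with_suffix(".onnx")
            torch.onnx.export(model, (example_input,), str(onnx_path))
            ov_model = convert_model(str(onnx_path))
        serialize(ov_model, str(output))
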
@@ -99,6 +105,7 @@ def export( raise ImportError("The pip package `diffusers` is required to export stable diffusion models to ONNX.") if is_torch_available() and isinstance(model, nn.Module): +<<<<<<< HEAD <<<<<<< HEAD return export_pytorch( model, config, opset, output, device=device, input_shapes=input_shapes, model_kwargs=model_kwargs @@ -106,6 +113,9 @@ def export( ======= return export_pytorch(model, config, opset, output, device=device, input_shapes=input_shapes) >>>>>>> switch on pytorch frontend +======= + return export_pytorch(model, config, opset, output, device=device, input_shapes=input_shapes, model_kwargs=model_kwargs) +>>>>>>> wip elif is_tf_available() and issubclass(type(model), TFPreTrainedModel): output.parent.mkdir(parents=True, exist_ok=True) @@ -354,10 +364,14 @@ def export_models( output_names: Optional[List[str]] = None, device: str = "cpu", input_shapes: Optional[Dict] = None, +<<<<<<< HEAD <<<<<<< HEAD model_kwargs: Optional[Dict[str, Any]] = None, ) -> Tuple[List[List[str]], List[List[str]]]: ======= +======= + model_kwargs: Optional[Dict[str, Any]] = None, +>>>>>>> wip ) -> Tuple[List[List[str]], List[List[str]]]: """ Exports a Pytorch or TensorFlow encoder decoder model to an ONNX Intermediate Representation. @@ -404,10 +418,14 @@ def export_models( opset=opset, device=device, input_shapes=input_shapes, +<<<<<<< HEAD <<<<<<< HEAD model_kwargs=model_kwargs, ======= >>>>>>> switch on pytorch frontend +======= + model_kwargs=model_kwargs, +>>>>>>> wip ) ) @@ -458,7 +476,14 @@ def remove_none_from_list_tuple(item): for k, v in dummy_inputs.items(): if v is None: continue +<<<<<<< HEAD >>>>>>> fixes for seq2seq +======= + if isinstance(v, dict): + for kk, vv in v.items(): + upd_dummy[kk] = vv + continue +>>>>>>> wip if isinstance(v, (tuple, list)): upd_dummy[k] = remove_none_from_list_tuple(v) continue @@ -477,8 +502,12 @@ def get_input_shapes(dummy_inputs, inputs): def get_input_shapes(dummy_inputs, inputs): input_info = [] for input_name, data in dummy_inputs.items(): +<<<<<<< HEAD if isinstance(data, (tuple, list)): >>>>>>> switch on pytorch frontend +======= + if isinstance(data, (tuple, list, dict)): +>>>>>>> wip return None static_shape = PartialShape(data.shape) if input_name in inputs: @@ -487,6 +516,9 @@ def get_input_shapes(dummy_inputs, inputs): static_shape[dim] = -1 input_info.append((input_name, static_shape)) <<<<<<< HEAD +<<<<<<< HEAD +======= +>>>>>>> wip return input_info @@ -496,6 +528,10 @@ def main_export( task: str = "auto", device: str = "cpu", fp16: Optional[bool] = False, +<<<<<<< HEAD +======= + optimize: Optional[str] = None, +>>>>>>> wip monolith: bool = False, framework: Optional[str] = None, cache_dir: Optional[str] = None, @@ -506,15 +542,127 @@ def main_export( force_download: bool = False, local_files_only: bool = False, use_auth_token: Optional[Union[bool, str]] = None, +<<<<<<< HEAD +======= + for_ort: bool = False, +>>>>>>> wip model_kwargs: Optional[Dict[str, Any]] = None, custom_onnx_configs: Optional[Dict[str, "OnnxConfig"]] = None, fn_get_submodels: Optional[Callable] = None, **kwargs_shapes, ): +<<<<<<< HEAD +======= + """ + Full-suite ONNX export. + + Args: + > Required parameters + + model_name_or_path (`str`): + Model ID on huggingface.co or path on disk to the model repository to export. + output (`Union[str, Path]`): + Path indicating the directory where to store the generated ONNX model. + + > Optional parameters + + task (`Optional[str]`, defaults to `None`): + The task to export the model for. 
If not specified, the task will be auto-inferred based on the model. For decoder models, + use `xxx-with-past` to export the model using past key values in the decoder. + opset (`Optional[int]`, defaults to `None`): + If specified, ONNX opset version to export the model with. Otherwise, the default opset for the given model architecture + will be used. + device (`str`, defaults to `"cpu"`): + The device to use to do the export. Defaults to "cpu". + fp16 (`Optional[bool]`, defaults to `"False"`): + Use half precision during the export. PyTorch-only, requires `device="cuda"`. + optimize (`Optional[str]`, defaults to `None`): + Allows to run ONNX Runtime optimizations directly during the export. Some of these optimizations are specific to + ONNX Runtime, and the resulting ONNX will not be usable with other runtime as OpenVINO or TensorRT. + Available options: `"O1", "O2", "O3", "O4"`. Reference: [`~optimum.onnxruntime.AutoOptimizationConfig`] + monolith (`bool`, defaults to `False`): + Forces to export the model as a single ONNX file. + no_post_process (`bool`, defaults to `False`): + Allows to disable any post-processing done by default on the exported ONNX models. + framework (`Optional[str]`, defaults to `None`): + The framework to use for the ONNX export (`"pt"` or `"tf"`). If not provided, will attempt to automatically detect + the framework for the checkpoint. + atol (`Optional[float]`, defaults to `None`): + If specified, the absolute difference tolerance when validating the model. Otherwise, the default atol for the model will be used. + cache_dir (`Optional[str]`, defaults to `None`): + Path indicating where to store cache. The default Hugging Face cache path will be used by default. + trust_remote_code (`bool`, defaults to `False`): + Allows to use custom code for the modeling hosted in the model repository. This option should only be set for repositories + you trust and in which you have read the code, as it will execute on your local machine arbitrary code present in the + model repository. + pad_token_id (`Optional[int]`, defaults to `None`): + This is needed by some models, for some tasks. If not provided, will attempt to use the tokenizer to guess it. + subfolder (`str`, defaults to `""`): + In case the relevant files are located inside a subfolder of the model repo either locally or on huggingface.co, you can + specify the folder name here. + revision (`str`, defaults to `"main"`): + Revision is the specific model version to use. It can be a branch name, a tag name, or a commit id. + force_download (`bool`, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + local_files_only (`Optional[bool]`, defaults to `False`): + Whether or not to only look at local files (i.e., do not try to download the model). + use_auth_token (`Optional[str]`, defaults to `None`): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `transformers-cli login` (stored in `~/.huggingface`). + model_kwargs (`Optional[Dict[str, Any]]`, defaults to `None`): + Experimental usage: keyword arguments to pass to the model during + the export. This argument should be used along the `custom_onnx_configs` argument + in case, for example, the model inputs/outputs are changed (for example, if + `model_kwargs={"output_attentions": True}` is passed). 
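`main_export` documented here starts from a model id and downloads the checkpoint itself; when the model object is already in memory (as quantization.py does in this series after compression), the lower-level `export()` from this file is called instead. A hedged sketch of that call, with a tiny test checkpoint and an arbitrary output path; the task string may need adjusting to the installed optimum version:

    from pathlib import Path
    from transformers import AutoModelForSequenceClassification
    from optimum.exporters.tasks import TasksManager
    from optimum.intel.openvino.export import export

    model = AutoModelForSequenceClassification.from_pretrained("hf-internal-testing/tiny-random-bert")
    onnx_config_class = TasksManager.get_exporter_config_constructor(
        model=model, exporter="onnx", task="text-classification"
    )
    onnx_config = onnx_config_class(model.config)

    output_dir = Path("ov_bert")
    output_dir.mkdir(parents=True, exist_ok=True)
    # `output` may point at the target .xml file; the exporter writes the IR next to it.
    export(model=model, config=onnx_config, output=output_dir / "openvino_model.xml")
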
+ custom_onnx_configs (`Optional[Dict[str, OnnxConfig]]`, defaults to `None`): + Experimental usage: override the default ONNX config used for the given model. This argument may be useful for advanced users that desire a finer-grained control on the export. An example is available [here](https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model). + fn_get_submodels (`Optional[Callable]`, defaults to `None`): + Experimental usage: Override the default submodels that are used at the export. This is + especially useful when exporting a custom architecture that needs to split the ONNX (e.g. encoder-decoder). If unspecified with custom models, optimum will try to use the default submodels used for the given task, with no guarantee of success. + use_subprocess (`bool`): + Do the ONNX exported model validation in subprocesses. This is especially useful when + exporting on CUDA device, where ORT does not release memory at inference session + destruction. When set to `True`, the `main_export` call should be guarded in + `if __name__ == "__main__":` block. + **kwargs_shapes (`Dict`): + Shapes to use during inference. This argument allows to override the default shapes used during the ONNX export. + + Example usage: + ```python + >>> from optimum.exporters.onnx import main_export + + >>> main_export("gpt2", output="gpt2_onnx/") + ``` + """ + if optimize == "O4" and device != "cuda": + raise ValueError( + "Requested O4 optimization, but this optimization requires to do the export on GPU." + " Please pass the argument `--device cuda`." + ) + + if (framework == "tf" and fp16 is True) or not is_torch_available(): + raise ValueError("The --fp16 option is supported only for PyTorch.") + + if fp16 is True and device == "cpu": + raise ValueError( + "The --fp16 option is supported only when exporting on GPU. Please pass the option `--device cuda`." + ) + +>>>>>>> wip output = Path(output) if not output.exists(): output.mkdir(parents=True) +<<<<<<< HEAD +======= + if for_ort: + logger.warning( + "The option --for-ort was passed, but its behavior is now the default in the ONNX exporter" + " and passing it is not required anymore." + ) + +>>>>>>> wip original_task = task task = TasksManager.map_from_synonym(task) @@ -590,7 +738,11 @@ def main_export( if original_task == "auto": # Make -with-past the default if --task was not explicitely specified task = task + "-with-past" else: +<<<<<<< HEAD logger.info( +======= + print( +>>>>>>> wip f"The task `{task}` was manually specified, and past key values will not be reused in the decoding." f" if needed, please pass `--task {task}-with-past` to export using the past key values." 
) @@ -609,7 +761,11 @@ def main_export( possible_synonyms = f" (possible synonyms are: {synonyms_for_task})" else: possible_synonyms = "" +<<<<<<< HEAD logger.info(f"Automatic task detection to {task}{possible_synonyms}.") +======= + print(f"Automatic task detection to {task}{possible_synonyms}.") +>>>>>>> wip onnx_config, models_and_onnx_configs = __main__._get_submodels_and_onnx_configs( model=model, @@ -687,7 +843,11 @@ def main_export( input_shapes=input_shapes, device=device, model_kwargs=model_kwargs, +<<<<<<< HEAD ) ======= return input_info >>>>>>> switch on pytorch frontend +======= + ) +>>>>>>> wip diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index d7e678f16c..3dcd74722a 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -29,15 +29,14 @@ from transformers.modeling_outputs import CausalLMOutputWithPast from optimum.exporters import TasksManager -from optimum.exporters.onnx import export from optimum.utils import NormalizedConfigManager -#from optimum.exporters.onnx import export from ..utils.import_utils import is_transformers_version from ..utils.modeling_utils import _prepare_attn_mask, _prepare_decoder_attention_mask from .export import export, is_torch_model from .modeling import _TOKENIZER_FOR_DOC, INPUTS_DOCSTRING, MODEL_START_DOCSTRING, OVModel from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, STR_TO_OV_TYPE +from .export import export, is_torch_model if is_transformers_version("<", "4.25.0"): From 042dd530b5f716094a561de60c325307847ac9b8 Mon Sep 17 00:00:00 2001 From: eaidova Date: Wed, 2 Aug 2023 13:57:17 +0400 Subject: [PATCH 14/38] cleanup --- optimum/intel/openvino/export.py | 210 ++------------------- optimum/intel/openvino/modeling_decoder.py | 4 - optimum/intel/openvino/quantization.py | 1 + 3 files changed, 15 insertions(+), 200 deletions(-) diff --git a/optimum/intel/openvino/export.py b/optimum/intel/openvino/export.py index 063fb8b467..c10c9d985f 100644 --- a/optimum/intel/openvino/export.py +++ b/optimum/intel/openvino/export.py @@ -1,11 +1,9 @@ -import functools -import gc +import logging import inspect import os from pathlib import Path from typing import Dict, List, Optional, Tuple, Union, Any, Callable import functools -import time from openvino import PartialShape, convert_model, serialize from openvino.runtime.utils.types import get_element_type @@ -22,21 +20,16 @@ from optimum.utils.save_utils import maybe_save_preprocessors from optimum.exporters.onnx import __main__ +from openvino.tools.mo import convert_model +from openvino.runtime import serialize, PartialShape +from openvino.runtime.utils.types import get_element_type from .utils import OV_XML_FILE_NAME - logger = logging.getLogger(__name__) if is_torch_available(): import torch.nn as nn from transformers.modeling_utils import PreTrainedModel -<<<<<<< HEAD -<<<<<<< HEAD -======= - from transformers.pytorch_utils import is_torch_less_than_1_11 ->>>>>>> switch on pytorch frontend -======= ->>>>>>> fixes for seq2seq if is_diffusers_available(): from diffusers import ModelMixin @@ -44,19 +37,12 @@ if is_tf_available(): from transformers.modeling_tf_utils import TFPreTrainedModel -<<<<<<< HEAD - -======= ->>>>>>> switch on pytorch frontend def is_torch_model(model): if not is_torch_available(): return False return isinstance(model, nn.Module) -<<<<<<< HEAD -======= ->>>>>>> switch on pytorch frontend def export( model: Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"], config: 
OnnxConfig, @@ -64,14 +50,7 @@ def export( opset: Optional[int] = None, device: str = "cpu", input_shapes: Optional[Dict] = None, -<<<<<<< HEAD -<<<<<<< HEAD - model_kwargs: Optional[Dict[str, Any]] = None, -======= ->>>>>>> switch on pytorch frontend -======= model_kwargs: Optional[Dict[str, Any]] = None, ->>>>>>> wip ) -> Tuple[List[str], List[str]]: """ Exports a Pytorch or TensorFlow model to an ONNX Intermediate Representation. @@ -105,17 +84,9 @@ def export( raise ImportError("The pip package `diffusers` is required to export stable diffusion models to ONNX.") if is_torch_available() and isinstance(model, nn.Module): -<<<<<<< HEAD -<<<<<<< HEAD return export_pytorch( model, config, opset, output, device=device, input_shapes=input_shapes, model_kwargs=model_kwargs ) -======= - return export_pytorch(model, config, opset, output, device=device, input_shapes=input_shapes) ->>>>>>> switch on pytorch frontend -======= - return export_pytorch(model, config, opset, output, device=device, input_shapes=input_shapes, model_kwargs=model_kwargs) ->>>>>>> wip elif is_tf_available() and issubclass(type(model), TFPreTrainedModel): output.parent.mkdir(parents=True, exist_ok=True) @@ -124,11 +95,7 @@ def export( if device == "cuda": raise RuntimeError("`tf2onnx` does not support export on CUDA device.") if input_shapes is not None: -<<<<<<< HEAD logger.info("`input_shapes` argument is not supported by the Tensorflow ONNX export and will be ignored.") -======= - print("`input_shapes` argument is not supported by the Tensorflow ONNX export and will be ignored.") ->>>>>>> switch on pytorch frontend return export_tensorflow(model, config, opset, output) else: @@ -144,14 +111,7 @@ def export_pytorch( output: Path, device: str = "cpu", input_shapes: Optional[Dict] = None, -<<<<<<< HEAD -<<<<<<< HEAD - model_kwargs: Optional[Dict[str, Any]] = None, -======= ->>>>>>> switch on pytorch frontend -======= model_kwargs: Optional[Dict[str, Any]] = None, ->>>>>>> fixes for seq2seq ) -> Tuple[List[str], List[str]]: """ Exports a PyTorch model to an ONNX Intermediate Representation. @@ -176,7 +136,6 @@ def export_pytorch( the ONNX configuration. 
""" import torch -<<<<<<< HEAD from torch.utils._pytree import tree_map logger.info(f"Using framework PyTorch: {torch.__version__}") @@ -186,34 +145,22 @@ def export_pytorch( model.config.return_dict = True custom_patcher = type(config).patch_model_for_export != OnnxConfig.patch_model_for_export model.config.torchscript = not custom_patcher -======= - from torch.onnx import export as onnx_export from torch.utils._pytree import tree_map - print(f"Using framework PyTorch: {torch.__version__}") + logger.info(f"Using framework PyTorch: {torch.__version__}") + output = Path(output) with torch.no_grad(): model.config.return_dict = True -<<<<<<< HEAD - model.config.torchscript = True ->>>>>>> switch on pytorch frontend -======= custom_patcher = type(config).patch_model_for_export != OnnxConfig.patch_model_for_export model.config.torchscript = not custom_patcher ->>>>>>> fixes for seq2seq model.eval() # Check if we need to override certain configuration item if config.values_override is not None: -<<<<<<< HEAD logger.info(f"Overriding {len(config.values_override)} configuration item(s)") for override_config_key, override_config_value in config.values_override.items(): logger.info(f"\t- {override_config_key} -> {override_config_value}") -======= - print(f"Overriding {len(config.values_override)} configuration item(s)") - for override_config_key, override_config_value in config.values_override.items(): - print(f"\t- {override_config_key} -> {override_config_value}") ->>>>>>> switch on pytorch frontend setattr(model.config, override_config_key, override_config_value) if input_shapes is None: @@ -231,23 +178,10 @@ def export_pytorch( inputs = config.ordered_inputs(model) input_names = list(inputs.keys()) output_names = list(config.outputs.keys()) -<<<<<<< HEAD -<<<<<<< HEAD -======= - - if hasattr(config, "patch_ops"): - config.patch_ops() - ->>>>>>> switch on pytorch frontend -======= ->>>>>>> fixes for seq2seq if hasattr(model, "forward"): sig = inspect.signature(model.forward) else: sig = inspect.signature(model.call) - -<<<<<<< HEAD -<<<<<<< HEAD dummy_inputs = remove_none_from_dummy_inputs(dummy_inputs) input_info = get_input_shapes(dummy_inputs, inputs) try: @@ -283,42 +217,39 @@ def ts_patched_forward(*args, **kwargs): if idx < len(output_names): out_tensor.get_tensor().set_names({output_names[idx]}) -======= -======= - dummy_inputs = remove_none_from_dummy_inputs(dummy_inputs) ->>>>>>> fixes for seq2seq input_info = get_input_shapes(dummy_inputs, inputs) - start0 = time.perf_counter() try: if custom_patcher: patcher = config.patch_model_for_export(model, model_kwargs=model_kwargs) patched_forward = patcher.patched_forward + @functools.wraps(patched_forward) def ts_patched_forward(*args, **kwargs): outputs = patched_forward(*args, **kwargs) return tuple(outputs.values()) + patcher.patched_forward = ts_patched_forward with patcher: ov_model = convert_model(model, example_input=dummy_inputs, input=input_info) else: ov_model = convert_model(model, example_input=dummy_inputs, input=input_info) except Exception: + model.config.torchscript = False + model.config.return_dict = True onnx_output = output.with_suffix(".onnx") - input_names, output_names = export_pytorch_to_onnx(model, config, opset, onnx_output, device, input_shapes, model_kwargs) + input_names, output_names = export_pytorch_to_onnx( + model, config, opset, onnx_output, device, input_shapes, model_kwargs + ) ov_model = convert_model(onnx_output) serialize(ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output) 
- return input_names, output_names + return input_names, output_names, True - end0 = time.perf_counter() - print(f"Convert model took {end0 - start0}s") ordered_dummy_inputs = {param: dummy_inputs[param] for param in sig.parameters if param in dummy_inputs} ordered_input_names = list(inputs) flatten_inputs = flattenize_inputs(ordered_dummy_inputs.values()) for idx, out_tensor in enumerate(ov_model.outputs): if idx < len(output_names): out_tensor.get_tensor().set_names({output_names[idx]}) - ->>>>>>> switch on pytorch frontend for idx, inp_tensor in enumerate(ov_model.inputs): input_name = ordered_input_names[idx] inp_tensor.get_tensor().set_names({input_name}) @@ -327,7 +258,6 @@ def ts_patched_forward(*args, **kwargs): dims = inputs[input_name] for dim in dims: -<<<<<<< HEAD static_shape[dim] = -1 inp_tensor.get_node().set_partial_shape(static_shape) inp_tensor.get_node().set_element_type(get_element_type(inp_data.cpu().numpy().dtype)) @@ -342,18 +272,6 @@ def clear_class_registry(): torch._C._jit_clear_class_registry() torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() torch.jit._state._clear_class_state() -======= - static_shape[dim] = -1 - inp_tensor.get_node().set_partial_shape(static_shape) - inp_tensor.get_node().set_element_type(get_element_type(inp_data.cpu().numpy().dtype)) - ov_model.validate_nodes_and_infer_types() - start1 = time.perf_counter() - serialize(ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output) - end1 = time.perf_counter() - print(f"Serailize model took {end1 - start1}s") - return input_names, output_names ->>>>>>> switch on pytorch frontend - def export_models( models_and_onnx_configs: Dict[ @@ -364,14 +282,7 @@ def export_models( output_names: Optional[List[str]] = None, device: str = "cpu", input_shapes: Optional[Dict] = None, -<<<<<<< HEAD -<<<<<<< HEAD model_kwargs: Optional[Dict[str, Any]] = None, -) -> Tuple[List[List[str]], List[List[str]]]: -======= -======= - model_kwargs: Optional[Dict[str, Any]] = None, ->>>>>>> wip ) -> Tuple[List[List[str]], List[List[str]]]: """ Exports a Pytorch or TensorFlow encoder decoder model to an ONNX Intermediate Representation. @@ -397,7 +308,6 @@ def export_models( `Tuple[List[List[str]], List[List[str]]]`: A tuple with an ordered list of the model's inputs, and the named inputs from the ONNX configuration. 
""" ->>>>>>> switch on pytorch frontend outputs = [] if output_names is not None and len(output_names) != len(models_and_onnx_configs): @@ -418,14 +328,7 @@ def export_models( opset=opset, device=device, input_shapes=input_shapes, -<<<<<<< HEAD -<<<<<<< HEAD - model_kwargs=model_kwargs, -======= ->>>>>>> switch on pytorch frontend -======= model_kwargs=model_kwargs, ->>>>>>> wip ) ) @@ -436,16 +339,8 @@ def export_models( def flattenize_inputs(inputs): flatten_inputs = [] for input_data in inputs: -<<<<<<< HEAD -<<<<<<< HEAD if input_data is None: continue -======= ->>>>>>> switch on pytorch frontend -======= - if input_data is None: - continue ->>>>>>> fixes for seq2seq if isinstance(input_data, (list, tuple)): flatten_inputs.extend(flattenize_inputs(input_data)) else: @@ -453,16 +348,11 @@ def flattenize_inputs(inputs): return flatten_inputs -<<<<<<< HEAD -<<<<<<< HEAD -======= ->>>>>>> fixes for seq2seq def remove_none_from_dummy_inputs(dummy_inputs): def remove_none_from_list_tuple(item): new_item = [i for i in item if i is not None] return type(item)(new_item) -<<<<<<< HEAD upd_dummy = {} for k, v in dummy_inputs.items(): if v is None: @@ -471,43 +361,17 @@ def remove_none_from_list_tuple(item): for kk, vv in v.items(): upd_dummy[kk] = vv continue -======= - upd_dummy = {} - for k, v in dummy_inputs.items(): - if v is None: - continue -<<<<<<< HEAD ->>>>>>> fixes for seq2seq -======= - if isinstance(v, dict): - for kk, vv in v.items(): - upd_dummy[kk] = vv - continue ->>>>>>> wip if isinstance(v, (tuple, list)): upd_dummy[k] = remove_none_from_list_tuple(v) continue upd_dummy[k] = v return upd_dummy -<<<<<<< HEAD def get_input_shapes(dummy_inputs, inputs): input_info = [] for input_name, data in dummy_inputs.items(): if isinstance(data, (tuple, list, dict)): -======= -======= ->>>>>>> fixes for seq2seq -def get_input_shapes(dummy_inputs, inputs): - input_info = [] - for input_name, data in dummy_inputs.items(): -<<<<<<< HEAD - if isinstance(data, (tuple, list)): ->>>>>>> switch on pytorch frontend -======= - if isinstance(data, (tuple, list, dict)): ->>>>>>> wip return None static_shape = PartialShape(data.shape) if input_name in inputs: @@ -515,10 +379,6 @@ def get_input_shapes(dummy_inputs, inputs): for dim in dynamic_dims: static_shape[dim] = -1 input_info.append((input_name, static_shape)) -<<<<<<< HEAD -<<<<<<< HEAD -======= ->>>>>>> wip return input_info @@ -528,10 +388,6 @@ def main_export( task: str = "auto", device: str = "cpu", fp16: Optional[bool] = False, -<<<<<<< HEAD -======= - optimize: Optional[str] = None, ->>>>>>> wip monolith: bool = False, framework: Optional[str] = None, cache_dir: Optional[str] = None, @@ -542,17 +398,11 @@ def main_export( force_download: bool = False, local_files_only: bool = False, use_auth_token: Optional[Union[bool, str]] = None, -<<<<<<< HEAD -======= - for_ort: bool = False, ->>>>>>> wip model_kwargs: Optional[Dict[str, Any]] = None, custom_onnx_configs: Optional[Dict[str, "OnnxConfig"]] = None, fn_get_submodels: Optional[Callable] = None, **kwargs_shapes, ): -<<<<<<< HEAD -======= """ Full-suite ONNX export. @@ -635,12 +485,6 @@ def main_export( >>> main_export("gpt2", output="gpt2_onnx/") ``` """ - if optimize == "O4" and device != "cuda": - raise ValueError( - "Requested O4 optimization, but this optimization requires to do the export on GPU." - " Please pass the argument `--device cuda`." 
- ) - if (framework == "tf" and fp16 is True) or not is_torch_available(): raise ValueError("The --fp16 option is supported only for PyTorch.") @@ -649,20 +493,9 @@ def main_export( "The --fp16 option is supported only when exporting on GPU. Please pass the option `--device cuda`." ) ->>>>>>> wip output = Path(output) if not output.exists(): output.mkdir(parents=True) - -<<<<<<< HEAD -======= - if for_ort: - logger.warning( - "The option --for-ort was passed, but its behavior is now the default in the ONNX exporter" - " and passing it is not required anymore." - ) - ->>>>>>> wip original_task = task task = TasksManager.map_from_synonym(task) @@ -738,11 +571,7 @@ def main_export( if original_task == "auto": # Make -with-past the default if --task was not explicitely specified task = task + "-with-past" else: -<<<<<<< HEAD logger.info( -======= - print( ->>>>>>> wip f"The task `{task}` was manually specified, and past key values will not be reused in the decoding." f" if needed, please pass `--task {task}-with-past` to export using the past key values." ) @@ -761,11 +590,7 @@ def main_export( possible_synonyms = f" (possible synonyms are: {synonyms_for_task})" else: possible_synonyms = "" -<<<<<<< HEAD logger.info(f"Automatic task detection to {task}{possible_synonyms}.") -======= - print(f"Automatic task detection to {task}{possible_synonyms}.") ->>>>>>> wip onnx_config, models_and_onnx_configs = __main__._get_submodels_and_onnx_configs( model=model, @@ -843,11 +668,4 @@ def main_export( input_shapes=input_shapes, device=device, model_kwargs=model_kwargs, -<<<<<<< HEAD - ) -======= - return input_info ->>>>>>> switch on pytorch frontend -======= ) ->>>>>>> wip diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 3dcd74722a..b60564c382 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -17,7 +17,6 @@ from pathlib import Path from tempfile import TemporaryDirectory from typing import Dict, Optional, Tuple, Union -import time import numpy as np import openvino @@ -227,10 +226,7 @@ def _from_transformers( "force_download": force_download, "trust_remote_code": trust_remote_code, } - start0 = time.perf_counter() model = TasksManager.get_model_from_task(task, model_id, **model_kwargs) - end0 = time.perf_counter() - print(f"Reading PT model took {end0 - start0}") config.is_decoder = True config.is_encoder_decoder = False onnx_config_constructor = TasksManager.get_exporter_config_constructor(model=model, exporter="onnx", task=task) diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 311ad211f9..fdbd12dd60 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -34,6 +34,7 @@ from transformers import DataCollator, PreTrainedModel, default_data_collator from optimum.exporters.onnx import export as onnx_export +from .export import export from optimum.exporters.tasks import TasksManager from optimum.quantization_base import OptimumQuantizer From 1dd4478970c88936164cf9f8cab4268a4faa4d9a Mon Sep 17 00:00:00 2001 From: eaidova Date: Wed, 2 Aug 2023 14:23:41 +0400 Subject: [PATCH 15/38] fix style --- optimum/intel/openvino/export.py | 18 +++++++++--------- optimum/intel/openvino/modeling_base.py | 3 +-- .../intel/openvino/modeling_base_seq2seq.py | 1 - optimum/intel/openvino/modeling_decoder.py | 1 - optimum/intel/openvino/quantization.py | 1 - 5 files changed, 10 insertions(+), 14 deletions(-) diff --git 
a/optimum/intel/openvino/export.py b/optimum/intel/openvino/export.py index c10c9d985f..4937e09946 100644 --- a/optimum/intel/openvino/export.py +++ b/optimum/intel/openvino/export.py @@ -1,9 +1,9 @@ -import logging +import functools import inspect +import logging import os from pathlib import Path -from typing import Dict, List, Optional, Tuple, Union, Any, Callable -import functools +from typing import Any, Callable, Dict, List, Optional, Tuple, Union from openvino import PartialShape, convert_model, serialize from openvino.runtime.utils.types import get_element_type @@ -11,23 +11,23 @@ from transformers import AutoTokenizer from transformers.utils import is_tf_available, is_torch_available from transformers import AutoTokenizer +from transformers.utils import is_tf_available, is_torch_available -from optimum.utils import is_diffusers_available, DEFAULT_DUMMY_SHAPES from optimum.exporters import TasksManager +from optimum.exporters.onnx import __main__ from optimum.exporters.onnx.base import OnnxConfig, OnnxConfigWithPast -from optimum.exporters.onnx.convert import export_tensorflow, check_dummy_inputs_are_allowed +from optimum.exporters.onnx.convert import check_dummy_inputs_are_allowed, export_tensorflow from optimum.exporters.onnx.convert import export_pytorch as export_pytorch_to_onnx +from optimum.utils import DEFAULT_DUMMY_SHAPES, is_diffusers_available from optimum.utils.save_utils import maybe_save_preprocessors -from optimum.exporters.onnx import __main__ -from openvino.tools.mo import convert_model -from openvino.runtime import serialize, PartialShape -from openvino.runtime.utils.types import get_element_type from .utils import OV_XML_FILE_NAME + logger = logging.getLogger(__name__) if is_torch_available(): + import torch import torch.nn as nn from transformers.modeling_utils import PreTrainedModel diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index ec480a18a3..ef14f8c340 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -17,14 +17,13 @@ from pathlib import Path from tempfile import TemporaryDirectory from typing import Dict, Optional, Union -import time import openvino from huggingface_hub import hf_hub_download from openvino import Core, convert_model from openvino._offline_transformations import apply_moc_transformations, compress_model_transformation -from openvino.tools import mo from openvino.runtime import Core +from openvino.tools import mo from transformers import PretrainedConfig from transformers.file_utils import add_start_docstrings diff --git a/optimum/intel/openvino/modeling_base_seq2seq.py b/optimum/intel/openvino/modeling_base_seq2seq.py index f80cd58030..5a5e195845 100644 --- a/optimum/intel/openvino/modeling_base_seq2seq.py +++ b/optimum/intel/openvino/modeling_base_seq2seq.py @@ -26,7 +26,6 @@ from optimum.exporters import TasksManager from optimum.exporters.onnx import get_encoder_decoder_models_for_export -from .export import export_models from ..utils.import_utils import is_transformers_version from .export import export_models diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index b60564c382..7cc2b34f2c 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -35,7 +35,6 @@ from .export import export, is_torch_model from .modeling import _TOKENIZER_FOR_DOC, INPUTS_DOCSTRING, MODEL_START_DOCSTRING, OVModel from .utils import ONNX_WEIGHTS_NAME, 
OV_XML_FILE_NAME, STR_TO_OV_TYPE -from .export import export, is_torch_model if is_transformers_version("<", "4.25.0"): diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index fdbd12dd60..311ad211f9 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -34,7 +34,6 @@ from transformers import DataCollator, PreTrainedModel, default_data_collator from optimum.exporters.onnx import export as onnx_export -from .export import export from optimum.exporters.tasks import TasksManager from optimum.quantization_base import OptimumQuantizer From 13d71b23736e8ab4d662adcb7bff9030f40fe32b Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Wed, 2 Aug 2023 14:14:43 +0400 Subject: [PATCH 16/38] revert changes not related to pr --- optimum/intel/openvino/modeling_decoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 7cc2b34f2c..b5cbc2ba48 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -234,7 +234,7 @@ def _from_transformers( # TODO : create ModelPatcher to patch each architecture if config.model_type == "bloom": model.transformer._prepare_attn_mask = _prepare_attn_mask - elif config.model_type in {"llama", "longllama"}: + elif config.model_type == "llama": model.model._prepare_decoder_attention_mask = _prepare_decoder_attention_mask elif config.model_type in {"blenderbot-small", "blenderbot", "opt", "pegasus", "bart"}: model.model.decoder._prepare_decoder_attention_mask = _prepare_decoder_attention_mask From 891ec4aeb66c698db32e4a7f0a0b62558efc1666 Mon Sep 17 00:00:00 2001 From: eaidova Date: Wed, 2 Aug 2023 16:19:35 +0400 Subject: [PATCH 17/38] clear ts registry: --- optimum/intel/openvino/export.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/optimum/intel/openvino/export.py b/optimum/intel/openvino/export.py index 4937e09946..e82d3a66c6 100644 --- a/optimum/intel/openvino/export.py +++ b/optimum/intel/openvino/export.py @@ -1,4 +1,5 @@ import functools +import gc import inspect import logging import os @@ -273,6 +274,7 @@ def clear_class_registry(): torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() torch.jit._state._clear_class_state() + def export_models( models_and_onnx_configs: Dict[ str, Tuple[Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"], "OnnxConfig"] From d24a509873eeb01f1f0317462ed56f3189d53f74 Mon Sep 17 00:00:00 2001 From: eaidova Date: Wed, 2 Aug 2023 18:00:44 +0400 Subject: [PATCH 18/38] remove ov dev from deps --- optimum/intel/openvino/export.py | 6 +++--- optimum/intel/openvino/modeling_base.py | 2 -- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/optimum/intel/openvino/export.py b/optimum/intel/openvino/export.py index e82d3a66c6..218eeb235f 100644 --- a/optimum/intel/openvino/export.py +++ b/optimum/intel/openvino/export.py @@ -6,7 +6,7 @@ from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Tuple, Union -from openvino import PartialShape, convert_model, serialize +from openvino import PartialShape, convert_model, save_model from openvino.runtime.utils.types import get_element_type from requests.exceptions import ConnectionError as RequestsConnectionError from transformers import AutoTokenizer @@ -208,7 +208,7 @@ def ts_patched_forward(*args, **kwargs): model, config, opset, onnx_output, device, input_shapes, model_kwargs ) 
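Note: the import switch above (and its revert in the following patch, "return serialize back") toggles between `openvino.runtime.serialize` and the newer `openvino.save_model`. The relevant difference here is that `save_model` compresses FP32 weights to FP16 by default, which is why the series later settles on `save_model` with an explicit `compress_to_fp16=False`. A short sketch, assuming `ov_model` is an already converted OpenVINO model:

```python
# Sketch: ov_model is assumed to come from openvino.convert_model(...).
from openvino import save_model, serialize

serialize(ov_model, "model.xml")                           # writes weights as-is
save_model(ov_model, "model.xml", compress_to_fp16=False)  # equivalent, flag made explicit
save_model(ov_model, "model.xml")                          # default compresses FP32 weights to FP16
```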
ov_model = convert_model(onnx_output) - serialize(ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output) + save_model(ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output) return input_names, output_names, True clear_class_registry() ordered_dummy_inputs = {param: dummy_inputs[param] for param in sig.parameters if param in dummy_inputs} @@ -263,7 +263,7 @@ def ts_patched_forward(*args, **kwargs): inp_tensor.get_node().set_partial_shape(static_shape) inp_tensor.get_node().set_element_type(get_element_type(inp_data.cpu().numpy().dtype)) ov_model.validate_nodes_and_infer_types() - serialize(ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output) + save_model(ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output) del model gc.collect() return input_names, output_names, False diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index ef14f8c340..4c99bef09f 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -22,8 +22,6 @@ from huggingface_hub import hf_hub_download from openvino import Core, convert_model from openvino._offline_transformations import apply_moc_transformations, compress_model_transformation -from openvino.runtime import Core -from openvino.tools import mo from transformers import PretrainedConfig from transformers.file_utils import add_start_docstrings From 740351b569d65a5db0e2d1c99c222a4f4926ef7c Mon Sep 17 00:00:00 2001 From: eaidova Date: Wed, 2 Aug 2023 18:46:46 +0400 Subject: [PATCH 19/38] return serialize back --- optimum/intel/openvino/export.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/optimum/intel/openvino/export.py b/optimum/intel/openvino/export.py index 218eeb235f..e82d3a66c6 100644 --- a/optimum/intel/openvino/export.py +++ b/optimum/intel/openvino/export.py @@ -6,7 +6,7 @@ from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Tuple, Union -from openvino import PartialShape, convert_model, save_model +from openvino import PartialShape, convert_model, serialize from openvino.runtime.utils.types import get_element_type from requests.exceptions import ConnectionError as RequestsConnectionError from transformers import AutoTokenizer @@ -208,7 +208,7 @@ def ts_patched_forward(*args, **kwargs): model, config, opset, onnx_output, device, input_shapes, model_kwargs ) ov_model = convert_model(onnx_output) - save_model(ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output) + serialize(ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output) return input_names, output_names, True clear_class_registry() ordered_dummy_inputs = {param: dummy_inputs[param] for param in sig.parameters if param in dummy_inputs} @@ -263,7 +263,7 @@ def ts_patched_forward(*args, **kwargs): inp_tensor.get_node().set_partial_shape(static_shape) inp_tensor.get_node().set_element_type(get_element_type(inp_data.cpu().numpy().dtype)) ov_model.validate_nodes_and_infer_types() - save_model(ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output) + serialize(ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output) del model gc.collect() return input_names, output_names, False From eaab783c21094c9fa43efb72806a6f32b7f7052a Mon Sep 17 00:00:00 2001 From: Alexander Date: Thu, 3 Aug 2023 11:01:33 +0400 Subject: [PATCH 20/38] Added weights compression --- 
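Note: the "Added weights compression" patch below drops the dummy-dataset path that previously fed NNCF's `create_compressed_model` for weight-only quantization and instead calls `nncf.compress_weights` on the loaded PyTorch model, registering transformers' `Conv1D` so GPT-2-style layers are covered. A minimal sketch of that API using one of the tiny test models referenced in the patch; treat it as an illustration rather than the exact quantizer flow:

```python
# Sketch: weights-only INT8 compression with NNCF before the OpenVINO export.
from nncf import compress_weights
from nncf.torch import register_module
from transformers import AutoModelForCausalLM
from transformers.pytorch_utils import Conv1D

register_module(ignored_algorithms=[])(Conv1D)  # let NNCF see GPT-2's Conv1D layers

model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2")
model = compress_weights(model)  # still a torch.nn.Module, ready for the usual export path
```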
optimum/intel/openvino/quantization.py | 54 +++++++++++++------------- tests/openvino/test_quantization.py | 7 ++-- 2 files changed, 29 insertions(+), 32 deletions(-) diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 311ad211f9..388f2cdfdf 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -24,14 +24,15 @@ import transformers from accelerate.data_loader import DataLoaderStateMixin from datasets import Dataset, load_dataset -from nncf import NNCFConfig -from nncf.torch import create_compressed_model, register_default_init_args +from nncf import NNCFConfig, compress_weights +from nncf.torch import create_compressed_model, register_default_init_args, register_module from nncf.torch.dynamic_graph.io_handling import wrap_nncf_model_inputs_with_objwalk from nncf.torch.initialization import PTInitializingDataLoader from openvino._offline_transformations import compress_quantize_weights_transformation from openvino.runtime import Core, Tensor from torch.utils.data import DataLoader, RandomSampler, TensorDataset from transformers import DataCollator, PreTrainedModel, default_data_collator +from transformers.pytorch_utils import Conv1D from optimum.exporters.onnx import export as onnx_export from optimum.exporters.tasks import TasksManager @@ -50,6 +51,8 @@ ) +register_module(ignored_algorithms=[])(Conv1D) + core = Core() logger = logging.getLogger(__name__) @@ -345,36 +348,31 @@ def _quantize_torchmodel( model_type=model_type, ) - if weights_only: - calibration_dataset = TensorDataset(torch.tensor([0.0, 1.0])) - calibration_dataset.column_names = [] - remove_unused_columns = False - onnx_config = onnx_config_class(self.model.config) - - def data_collator(batch): - return onnx_config.generate_dummy_inputs(framework="pt") - - calibration_dataloader = self._get_calibration_dataloader( - calibration_dataset=calibration_dataset, - batch_size=batch_size, - remove_unused_columns=remove_unused_columns, - data_collator=data_collator, - ) - if quantization_config is None: logger.info( "No configuration describing the quantization process was provided, a default OVConfig will be generated." 
) - quantization_config = OVConfig(compression=INT8_WEIGHT_COMPRESSION_CONFIG) if weights_only else OVConfig() - - model_inputs = next(iter(calibration_dataloader)) - quantization_config.add_input_info(model_inputs) - nncf_config = NNCFConfig.from_dict(quantization_config.__dict__) - nncf_config = register_default_init_args(nncf_config, calibration_dataloader) - controller, compressed_model = create_compressed_model( - self.model, nncf_config, wrap_inputs_fn=wrap_nncf_model_inputs_with_objwalk - ) - compressed_model = controller.strip(do_copy=False) + quantization_config = OVConfig() + + if weights_only: + compressed_model = compress_weights(self.model) + self.model = compressed_model + else: + calibration_dataloader = self._get_calibration_dataloader( + calibration_dataset=calibration_dataset, + batch_size=batch_size, + remove_unused_columns=remove_unused_columns, + data_collator=data_collator, + ) + + model_inputs = next(iter(calibration_dataloader)) + quantization_config.add_input_info(model_inputs) + nncf_config = NNCFConfig.from_dict(quantization_config.__dict__) + nncf_config = register_default_init_args(nncf_config, calibration_dataloader) + controller, compressed_model = create_compressed_model( + self.model, nncf_config, wrap_inputs_fn=wrap_nncf_model_inputs_with_objwalk + ) + compressed_model = controller.strip(do_copy=False) task = self.task model = self.model diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 4a2f96447c..51dfe98507 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -146,8 +146,8 @@ def preprocess_function(examples, tokenizer): class OVWeightCompressionTest(unittest.TestCase): # TODO : add models SUPPORTED_ARCHITECTURES_WITH_EXPECTED_COMPRESSED_MATMULS = ( - (OVModelForSequenceClassification, "hf-internal-testing/tiny-random-bert", 35), - (OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", 5), + (OVModelForSequenceClassification, "hf-internal-testing/tiny-random-bert", 70), + (OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", 45), ) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_COMPRESSED_MATMULS) @@ -173,9 +173,8 @@ def test_automodel_weight_compression(self, model_cls, model_name, expected_int8 self.assertTrue("logits" in outputs) # Verify that that the configuration is correctly saved and loaded - expected_config = OVConfig(compression=INT8_WEIGHT_COMPRESSION_CONFIG) loaded_config = OVConfig.from_pretrained(tmp_dir) - self.assertEqual(expected_config.to_dict()["compression"], loaded_config.to_dict()["compression"]) + self.assertIsNotNone(loaded_config) class OVQuantizerQATest(unittest.TestCase): From 4d8628465c9ddb9bde0fb23d9b19ced10faef499 Mon Sep 17 00:00:00 2001 From: Alexander Date: Thu, 3 Aug 2023 11:12:43 +0400 Subject: [PATCH 21/38] Changed NNCF version to develop --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7bdb9c062e..020d4e8826 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ "onnxruntime<1.15.0", ], "openvino": ["openvino==2023.1.0.dev20230728", "onnx", "onnxruntime"], - "nncf": ["nncf>=2.5.0"], + "nncf": ["git+https://github.com/openvinotoolkit/nncf.git"], "ipex": ["transformers<4.32.0", "intel-extension-for-pytorch", "onnx"], "diffusers": ["diffusers", "invisible-watermark>=0.2.0"], "quality": QUALITY_REQUIRE, From 69794aeb1742a5d10bdafc9c583e4fbe52edcc0c Mon Sep 17 00:00:00 2001 From: eaidova Date: Wed, 9 Aug 2023 22:01:57 +0400 Subject: [PATCH 
22/38] resolve dictionary as input --- optimum/exporters/openvino/__main__.py | 227 ++++++++++++++ optimum/exporters/openvino/convert.py | 290 ++++++++++++++++++ optimum/exporters/openvino/utils.py | 81 +++++ optimum/intel/openvino/modeling_base.py | 4 +- .../intel/openvino/modeling_base_seq2seq.py | 2 +- optimum/intel/openvino/modeling_decoder.py | 2 +- optimum/intel/openvino/modeling_diffusion.py | 2 +- optimum/intel/openvino/quantization.py | 2 +- optimum/intel/openvino/trainer.py | 2 +- 9 files changed, 605 insertions(+), 7 deletions(-) create mode 100644 optimum/exporters/openvino/__main__.py create mode 100644 optimum/exporters/openvino/convert.py create mode 100644 optimum/exporters/openvino/utils.py diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py new file mode 100644 index 0000000000..2c3428aa0c --- /dev/null +++ b/optimum/exporters/openvino/__main__.py @@ -0,0 +1,227 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +from pathlib import Path +from typing import Any, Callable, Dict, Optional, Union + +from requests.exceptions import ConnectionError as RequestsConnectionError +from transformers import AutoTokenizer +from transformers.utils import is_torch_available + +from optimum.exporters import TasksManager +from optimum.exporters.onnx import __main__ as optimum_main +from optimum.exporters.onnx.base import OnnxConfig, OnnxConfigWithPast +from optimum.utils import DEFAULT_DUMMY_SHAPES +from optimum.utils.save_utils import maybe_save_preprocessors + +from ...intel.openvino.utils import OV_XML_FILE_NAME +from .convert import export_models + + +logger = logging.getLogger(__name__) + +if is_torch_available(): + import torch + + +def main_export( + model_name_or_path: str, + output: Union[str, Path], + task: str = "auto", + device: str = "cpu", + fp16: Optional[bool] = False, + framework: Optional[str] = None, + cache_dir: Optional[str] = None, + trust_remote_code: bool = False, + pad_token_id: Optional[int] = None, + subfolder: str = "", + revision: str = "main", + force_download: bool = False, + local_files_only: bool = False, + use_auth_token: Optional[Union[bool, str]] = None, + model_kwargs: Optional[Dict[str, Any]] = None, + custom_onnx_configs: Optional[Dict[str, "OnnxConfig"]] = None, + fn_get_submodels: Optional[Callable] = None, + **kwargs_shapes, +): + output = Path(output) + if not output.exists(): + output.mkdir(parents=True) + + original_task = task + task = TasksManager.map_from_synonym(task) + + framework = TasksManager.determine_framework(model_name_or_path, subfolder=subfolder, framework=framework) + + # get the shapes to be used to generate dummy inputs + input_shapes = {} + for input_name in DEFAULT_DUMMY_SHAPES.keys(): + input_shapes[input_name] = ( + kwargs_shapes[input_name] if input_name in kwargs_shapes else DEFAULT_DUMMY_SHAPES[input_name] + ) + + torch_dtype = None if fp16 is False else torch.float16 + + if task == "auto": + 
try: + task = TasksManager.infer_task_from_model(model_name_or_path) + except KeyError as e: + raise KeyError( + f"The task could not be automatically inferred. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}" + ) + except RequestsConnectionError as e: + raise RequestsConnectionError( + f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}" + ) + + model = TasksManager.get_model_from_task( + task, + model_name_or_path, + subfolder=subfolder, + revision=revision, + cache_dir=cache_dir, + use_auth_token=use_auth_token, + local_files_only=local_files_only, + force_download=force_download, + trust_remote_code=trust_remote_code, + framework=framework, + torch_dtype=torch_dtype, + device=device, + ) + + custom_architecture = False + is_stable_diffusion = "stable-diffusion" in task + model_type = "stable-diffusion" if is_stable_diffusion else model.config.model_type.replace("_", "-") + + if not is_stable_diffusion: + if model_type in TasksManager._UNSUPPORTED_CLI_MODEL_TYPE: + raise ValueError( + f"{model_type} is not supported yet. Only {TasksManager._SUPPORTED_CLI_MODEL_TYPE} are supported. " + f"If you want to support {model_type} please propose a PR or open up an issue." + ) + if model.config.model_type.replace("-", "_") not in TasksManager.get_supported_model_type_for_task( + task, exporter="onnx" + ): + custom_architecture = True + + if custom_architecture and custom_onnx_configs is None: + raise ValueError( + "Trying to export a model with a custom architecture, but no custom onnx configuration was passed as `custom_onnx_configs`. Please refer to https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#custom-export-of-transformers-models for an example on how to export custom models." + ) + + if custom_architecture and original_task == "auto": + raise ValueError( + f'Automatic task detection is not supported with custom architectures. Please specify the `task` argument. Suggestion: task="{task}" (or task="{task}-with-past" if the model is decoder-based and supports KV cache)' + ) + + if ( + not custom_architecture + and not is_stable_diffusion + and task + "-with-past" in TasksManager.get_supported_tasks_for_model_type(model_type, "onnx") + ): + if original_task == "auto": # Make -with-past the default if --task was not explicitely specified + task = task + "-with-past" + else: + logger.info( + f"The task `{task}` was manually specified, and past key values will not be reused in the decoding." + f" if needed, please pass `--task {task}-with-past` to export using the past key values." 
+ ) + + if original_task == "auto": + synonyms_for_task = sorted(TasksManager.synonyms_for_task(task)) + if synonyms_for_task: + synonyms_for_task = ", ".join(synonyms_for_task) + possible_synonyms = f" (possible synonyms are: {synonyms_for_task})" + else: + possible_synonyms = "" + logger.info(f"Automatic task detection to {task}{possible_synonyms}.") + onnx_config, models_and_onnx_configs = optimum_main._get_submodels_and_onnx_configs( + model=model, + task=task, + monolith=False, + custom_onnx_configs=custom_onnx_configs if custom_onnx_configs is not None else {}, + custom_architecture=custom_architecture, + fn_get_submodels=fn_get_submodels, + ) + + if not is_stable_diffusion: + needs_pad_token_id = ( + isinstance(onnx_config, OnnxConfigWithPast) + and getattr(model.config, "pad_token_id", None) is None + and task in ["text-classification"] + ) + if needs_pad_token_id: + if pad_token_id is not None: + model.config.pad_token_id = pad_token_id + else: + try: + tok = AutoTokenizer.from_pretrained(model_name_or_path) + model.config.pad_token_id = tok.pad_token_id + except Exception: + raise ValueError( + "Could not infer the pad token id, which is needed in this case, please provide it with the --pad_token_id argument" + ) + # Saving the model config and preprocessor as this is needed sometimes. + model.config.save_pretrained(output) + generation_config = getattr(model, "generation_config", None) + if generation_config is not None: + generation_config.save_pretrained(output) + maybe_save_preprocessors(model_name_or_path, output) + + if model.config.is_encoder_decoder and task.startswith("text-generation"): + raise ValueError( + f"model.config.is_encoder_decoder is True and task is `{task}`, which are incompatible. If the task was auto-inferred, please fill a bug report" + f"at https://github.com/huggingface/optimum, if --task was explicitely passed, make sure you selected the right task for the model," + f" referring to `optimum.exporters.tasks.TaskManager`'s `_TASKS_TO_AUTOMODELS`." + ) + + files_subpaths = None + else: + # save the subcomponent configuration + for model_name in models_and_onnx_configs: + subcomponent = models_and_onnx_configs[model_name][0] + if hasattr(subcomponent, "save_config"): + subcomponent.save_config(output / model_name) + elif hasattr(subcomponent, "config") and hasattr(subcomponent.config, "save_pretrained"): + subcomponent.config.save_pretrained(output / model_name) + + files_subpaths = [os.path.join(name_dir, OV_XML_FILE_NAME) for name_dir in models_and_onnx_configs] + + # Saving the additional components needed to perform inference. 
+ model.scheduler.save_pretrained(output.joinpath("scheduler")) + + feature_extractor = getattr(model, "feature_extractor", None) + if feature_extractor is not None: + feature_extractor.save_pretrained(output.joinpath("feature_extractor")) + + tokenizer = getattr(model, "tokenizer", None) + if tokenizer is not None: + tokenizer.save_pretrained(output.joinpath("tokenizer")) + + tokenizer_2 = getattr(model, "tokenizer_2", None) + if tokenizer_2 is not None: + tokenizer_2.save_pretrained(output.joinpath("tokenizer_2")) + + model.save_config(output) + + export_models( + models_and_onnx_configs=models_and_onnx_configs, + output_dir=output, + output_names=files_subpaths, + input_shapes=input_shapes, + device=device, + model_kwargs=model_kwargs, + ) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py new file mode 100644 index 0000000000..86c4ed7725 --- /dev/null +++ b/optimum/exporters/openvino/convert.py @@ -0,0 +1,290 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import gc +import inspect +import logging +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union + +from transformers.utils import is_tf_available, is_torch_available + +from openvino.runtime import PartialShape, save_model +from openvino.runtime.utils.types import get_element_type +from openvino.tools.ovc import convert_model +from optimum.exporters.onnx.base import OnnxConfig +from optimum.exporters.onnx.convert import check_dummy_inputs_are_allowed, export_tensorflow +from optimum.exporters.onnx.convert import export_pytorch as export_pytorch_to_onnx +from optimum.utils import is_diffusers_available + +from ...intel.openvino.utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME +from .utils import ( + clear_class_registry, + flattenize_inputs, + get_input_shapes, + remove_none_from_dummy_inputs, +) + + +logger = logging.getLogger(__name__) + +if is_torch_available(): + import torch.nn as nn + from transformers.modeling_utils import PreTrainedModel + +if is_diffusers_available(): + from diffusers import ModelMixin + +if is_tf_available(): + from transformers.modeling_tf_utils import TFPreTrainedModel + + +def export( + model: Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"], + config: OnnxConfig, + output: Path, + opset: Optional[int] = None, + device: str = "cpu", + input_shapes: Optional[Dict] = None, + model_kwargs: Optional[Dict[str, Any]] = None, +) -> Tuple[List[str], List[str]]: + """ + Exports a Pytorch or TensorFlow model to an OpenVINO Intermediate Representation. + + Args: + model ([`PreTrainedModel`] or [`TFPreTrainedModel`]): + The model to export. + config ([`~exporters.onnx.config.OnnxConfig`]): + The ONNX configuration associated with the exported model. + output (`Path`): + Directory to store the exported ONNX model. + opset (`Optional[int]`, defaults to `None`): + The version of the ONNX operator set to use. 
+ device (`str`, *optional*, defaults to `cpu`): + The device on which the ONNX model will be exported. Either `cpu` or `cuda`. Only PyTorch is supported for + export on CUDA devices. + input_shapes (`Optional[Dict]`, defaults to `None`): + If specified, allows to use specific shapes for the example input provided to the ONNX exporter. + + Returns: + `Tuple[List[str], List[str]]`: A tuple with an ordered list of the model's inputs, and the named inputs from + the ONNX configuration. + """ + if not (is_torch_available() or is_tf_available()): + raise ImportError( + "Cannot convert because neither PyTorch nor TensorFlow are installed. " + "Please install torch or tensorflow first." + ) + + if "diffusers" in str(model.__class__) and not is_diffusers_available(): + raise ImportError("The pip package `diffusers` is required to export stable diffusion models to ONNX.") + + if is_torch_available() and isinstance(model, nn.Module): + return export_pytorch( + model, config, opset, output, device=device, input_shapes=input_shapes, model_kwargs=model_kwargs + ) + + elif is_tf_available() and issubclass(type(model), TFPreTrainedModel): + output.parent.mkdir(parents=True, exist_ok=True) + if opset is None: + opset = config.DEFAULT_ONNX_OPSET + if device == "cuda": + raise RuntimeError("`tf2onnx` does not support export on CUDA device.") + if input_shapes is not None: + logger.info("`input_shapes` argument is not supported by the Tensorflow ONNX export and will be ignored.") + return export_tensorflow(model, config, opset, output) + + else: + raise RuntimeError( + "You either provided a PyTorch model with only TensorFlow installed, or a TensorFlow model with only PyTorch installed." + ) + + +def export_pytorch( + model: Union["PreTrainedModel", "ModelMixin"], + config: OnnxConfig, + opset: int, + output: Path, + device: str = "cpu", + input_shapes: Optional[Dict] = None, + model_kwargs: Optional[Dict[str, Any]] = None, +) -> Tuple[List[str], List[str]]: + """ + Exports a PyTorch model to an OpenVINO Intermediate Representation. + + Args: + model ([`PreTrainedModel`]): + The model to export. + config ([`~exporters.onnx.config.OnnxConfig`]): + The ONNX configuration associated with the exported model. + opset (`int`): + The version of the ONNX operator set to use. + output (`Path`): + Directory to store the exported ONNX model. + device (`str`, defaults to `"cpu"`): + The device on which the ONNX model will be exported. Either `cpu` or `cuda`. Only PyTorch is supported for + export on CUDA devices. + input_shapes (`optional[Dict]`, defaults to `None`): + If specified, allows to use specific shapes for the example input provided to the ONNX exporter. + + Returns: + `Tuple[List[str], List[str]]`: A tuple with an ordered list of the model's inputs, and the named inputs from + the ONNX configuration. 
+ """ + import torch + from torch.utils._pytree import tree_map + + logger.info(f"Using framework PyTorch: {torch.__version__}") + output = Path(output) + + with torch.no_grad(): + model.config.return_dict = True + model.eval() + + # Check if we need to override certain configuration item + if config.values_override is not None: + logger.info(f"Overriding {len(config.values_override)} configuration item(s)") + for override_config_key, override_config_value in config.values_override.items(): + logger.info(f"\t- {override_config_key} -> {override_config_value}") + setattr(model.config, override_config_key, override_config_value) + + if input_shapes is None: + input_shapes = {} # will use the defaults from DEFAULT_DUMMY_SHAPES + + # Check that inputs match, and order them properly + dummy_inputs = config.generate_dummy_inputs(framework="pt", **input_shapes) + device = torch.device(device) + if device.type == "cuda" and torch.cuda.is_available(): + model.to(device) + dummy_inputs = tree_map( + lambda value: value.to(device) if isinstance(value, torch.Tensor) else value, dummy_inputs + ) + check_dummy_inputs_are_allowed(model, dummy_inputs) + inputs = config.ordered_inputs(model) + input_names = list(inputs.keys()) + output_names = list(config.outputs.keys()) + if hasattr(model, "forward"): + sig = inspect.signature(model.forward) + else: + sig = inspect.signature(model.call) + + dummy_inputs, dict_inputs = remove_none_from_dummy_inputs(dummy_inputs) + input_info = get_input_shapes(dummy_inputs, inputs) + try: + patcher = config.patch_model_for_export(model, model_kwargs=model_kwargs) + patched_forward = patcher.patched_forward + + @functools.wraps(patched_forward) + def ts_patched_forward(*args, **kwargs): + for i in range(len(dict_inputs)): + input_name = dict_inputs[i][0] + keys = dict_inputs[i][1] + tuple_input = kwargs[input_name] + input_dict = dict(zip(keys, tuple_input)) + kwargs[input_name] = input_dict + outputs = patched_forward(*args, **kwargs) + return tuple(outputs.values()) + + patcher.patched_forward = ts_patched_forward + with patcher: + ov_model = convert_model(model, example_input=dummy_inputs, input=input_info) + except Exception: + orig_torch_onnx_export = torch.onnx.export + + torch.onnx.export = functools.partial(orig_torch_onnx_export, do_constant_folding=True) + model.config.torchscript = False + model.config.return_dict = True + onnx_output = ( + output.with_suffix(".onnx") + if not output.name != OV_XML_FILE_NAME + else output.parent / ONNX_WEIGHTS_NAME + ) + input_names, output_names = export_pytorch_to_onnx( + model, config, opset, onnx_output, device, input_shapes, model_kwargs + ) + torch.onnx.export = orig_torch_onnx_export + ov_model = convert_model(str(onnx_output)) + save_model( + ov_model, + output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output, + compress_to_fp16=False, + ) + return input_names, output_names, True + clear_class_registry() + ordered_dummy_inputs = {param: dummy_inputs[param] for param in sig.parameters if param in dummy_inputs} + ordered_input_names = list(inputs) + flatten_inputs = flattenize_inputs(ordered_dummy_inputs.values()) + ov_model.validate_nodes_and_infer_types() + for idx, out_tensor in enumerate(ov_model.outputs): + if idx < len(output_names): + out_tensor.get_tensor().set_names({output_names[idx]}) + + for idx, inp_tensor in enumerate(ov_model.inputs): + input_name = ordered_input_names[idx] + inp_tensor.get_tensor().set_names({input_name}) + inp_data = flatten_inputs[idx] + static_shape = 
PartialShape(inp_data.shape) + dims = inputs[input_name] + + for dim in dims: + static_shape[dim] = -1 + inp_tensor.get_node().set_partial_shape(static_shape) + inp_tensor.get_node().set_element_type(get_element_type(inp_data.cpu().numpy().dtype)) + ov_model.validate_nodes_and_infer_types() + save_model( + ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output, compress_to_fp16=False + ) + del model + gc.collect() + return input_names, output_names, False + + +def export_models( + models_and_onnx_configs: Dict[ + str, Tuple[Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"], "OnnxConfig"] + ], + output_dir: Path, + opset: Optional[int] = None, + output_names: Optional[List[str]] = None, + device: str = "cpu", + input_shapes: Optional[Dict] = None, + model_kwargs: Optional[Dict[str, Any]] = None, +) -> Tuple[List[List[str]], List[List[str]]]: + outputs = [] + + if output_names is not None and len(output_names) != len(models_and_onnx_configs): + raise ValueError( + f"Provided custom names {output_names} for the export of {len(models_and_onnx_configs)} models. Please provide the same number of names as models to export." + ) + + for i, model_name in enumerate(models_and_onnx_configs.keys()): + submodel, sub_onnx_config = models_and_onnx_configs[model_name] + output_name = output_names[i] if output_names is not None else Path(model_name + ".xml") + output_path = output_dir / output_name + output_path.parent.mkdir(parents=True, exist_ok=True) + outputs.append( + export( + model=submodel, + config=sub_onnx_config, + output=output_path, + opset=opset, + device=device, + input_shapes=input_shapes, + model_kwargs=model_kwargs, + ) + ) + + outputs = list(map(list, zip(*outputs))) + return outputs diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py new file mode 100644 index 0000000000..9b1867ba83 --- /dev/null +++ b/optimum/exporters/openvino/utils.py @@ -0,0 +1,81 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
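Note: the new `optimum/exporters/openvino/utils.py` below holds the helpers used by `export_pytorch` in `convert.py` above. The core idea of this "resolve dictionary as input" patch is that dict-valued dummy inputs cannot be handed to tracing directly, so `remove_none_from_dummy_inputs` flattens each dict into a tuple while remembering its keys, and `ts_patched_forward` rebuilds the dicts before calling the real forward. A rough standalone illustration of that round trip with toy data (not the helpers themselves):

```python
# Toy illustration of the dict flatten/rebuild round trip used by export_pytorch above.
import torch

dummy_inputs = {
    "input_ids": torch.ones(1, 8, dtype=torch.long),
    "encoder_outputs": {"last_hidden_state": torch.zeros(1, 8, 16)},  # dict-valued input
}

# Flatten: remember which inputs were dicts and replace each dict with a tuple of its values.
dict_inputs, flat_inputs = [], {}
for name, value in dummy_inputs.items():
    if isinstance(value, dict):
        dict_inputs.append((name, list(value.keys())))
        flat_inputs[name] = tuple(value.values())
    else:
        flat_inputs[name] = value

# Rebuild: inside the patched forward, turn the tuples back into dicts before calling the model.
def rebuild(kwargs):
    for name, keys in dict_inputs:
        kwargs[name] = dict(zip(keys, kwargs[name]))
    return kwargs

restored = rebuild(dict(flat_inputs))
assert isinstance(restored["encoder_outputs"], dict)
```

When any tuple, list, or dict input remains, `get_input_shapes` returns `None` so `convert_model` is left to infer shapes from the example input.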
+ +from transformers.utils import is_torch_available + +from openvino.runtime import PartialShape + + +if is_torch_available(): + import torch + import torch.nn as nn + + +def is_torch_model(model): + if not is_torch_available(): + return False + return isinstance(model, nn.Module) + + +def flattenize_inputs(inputs): + flatten_inputs = [] + for input_data in inputs: + if input_data is None: + continue + if isinstance(input_data, (list, tuple)): + flatten_inputs.extend(flattenize_inputs(input_data)) + else: + flatten_inputs.append(input_data) + return flatten_inputs + + +def remove_none_from_dummy_inputs(dummy_inputs): + def remove_none_from_list_tuple(item): + new_item = [i for i in item if i is not None] + return type(item)(new_item) + + upd_dummy = {} + dict_dummy = [] + for k, v in dummy_inputs.items(): + if v is None: + continue + if isinstance(v, dict): + dict_dummy.append((k, list(v.keys()))) + upd_dummy[k] = remove_none_from_list_tuple(tuple(v.values())) + continue + if isinstance(v, (tuple, list)): + upd_dummy[k] = remove_none_from_list_tuple(v) + continue + upd_dummy[k] = v + return upd_dummy, dict_dummy + + +def get_input_shapes(dummy_inputs, inputs): + input_info = [] + for input_name, data in dummy_inputs.items(): + if isinstance(data, (tuple, list, dict)): + return None + static_shape = PartialShape(data.shape) + if input_name in inputs: + dynamic_dims = inputs[input_name] + for dim in dynamic_dims: + static_shape[dim] = -1 + input_info.append((input_name, static_shape)) + return input_info + + +def clear_class_registry(): + torch._C._jit_clear_class_registry() + torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() + torch.jit._state._clear_class_state() diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 4c99bef09f..fda0e9eb5a 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -148,7 +148,7 @@ def _save_pretrained(self, save_directory: Union[str, Path]): The directory where to save the model files. 
""" dst_path = os.path.join(save_directory, OV_XML_FILE_NAME) - openvino.runtime.serialize(self.model, dst_path) + openvino.save_model(self.model, dst_path, compress_to_fp16=False) @classmethod def _from_pretrained( @@ -199,7 +199,7 @@ def _from_pretrained( model_save_dir = model_id # Download the model from the hub else: - model_file_names = [file_name] + model_file_names = [file_name] if from_onnx else [] # If not ONNX then OpenVINO IR if not from_onnx: model_file_names.append(file_name.replace(".xml", ".bin")) diff --git a/optimum/intel/openvino/modeling_base_seq2seq.py b/optimum/intel/openvino/modeling_base_seq2seq.py index 5a5e195845..eca3f661a8 100644 --- a/optimum/intel/openvino/modeling_base_seq2seq.py +++ b/optimum/intel/openvino/modeling_base_seq2seq.py @@ -105,7 +105,7 @@ def _save_pretrained(self, save_directory: Union[str, Path]): for src_file, dst_file_name in zip(src_files, dst_file_names): dst_path = os.path.join(save_directory, dst_file_name) - openvino.runtime.serialize(src_file, dst_path) + openvino.save_model(src_file, dst_path, compress_to_fp16=False) @classmethod def _from_pretrained( diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index b5cbc2ba48..85d5f8230f 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -190,7 +190,7 @@ def _save_pretrained(self, save_directory: Union[str, Path]): """ model_to_save = self.model if self._pkv_precision == Type.f32 else self._original_model dst_path = os.path.join(save_directory, OV_XML_FILE_NAME) - openvino.runtime.serialize(model_to_save, dst_path) + openvino.save_model(model_to_save, dst_path, compress_to_fp16=False) @classmethod def _from_transformers( diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index c807f61b3f..21e8323394 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -156,7 +156,7 @@ def _save_pretrained(self, save_directory: Union[str, Path]): if ov_model is not None: dst_path = save_directory / dst_path / OV_XML_FILE_NAME dst_path.parent.mkdir(parents=True, exist_ok=True) - openvino.runtime.serialize(ov_model.model, dst_path) + openvino.save_model(ov_model.model, dst_path, compress_to_fp16=False) model_dir = ov_model.config.get("_name_or_path", None) or ov_model._model_dir / ov_model._model_name config_path = Path(model_dir) / ov_model.CONFIG_NAME if config_path.is_file(): diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 388f2cdfdf..0fb8d9044d 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -414,7 +414,7 @@ def _quantize_torchmodel( @staticmethod def _save_pretrained(model: openvino.runtime.Model, output_path: str): compress_quantize_weights_transformation(model) - openvino.runtime.serialize(model, output_path) + openvino.save_model(model, output_path, compress_to_fp16=False) def _set_task(self): if self.task is None: diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py index 22d402c80f..e09293739f 100644 --- a/optimum/intel/openvino/trainer.py +++ b/optimum/intel/openvino/trainer.py @@ -772,7 +772,7 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None): compress_quantize_weights_transformation(ov_model) # Serialize IR xml and bin - save_model(ov_model, output_path) + save_model(ov_model, output_path, compress_to_fp16=False) def 
_get_compression_controller_by_cls( self, controller_cls: Type[PTCompressionAlgorithmController] From 989432c5eb39ef675672346958abcdbaf54fc6b3 Mon Sep 17 00:00:00 2001 From: eaidova Date: Sun, 13 Aug 2023 16:02:44 +0400 Subject: [PATCH 23/38] fix llama export in quantization flow --- optimum/exporters/openvino/convert.py | 70 +++++++++++++++------- optimum/intel/openvino/modeling_decoder.py | 11 ++-- optimum/intel/openvino/quantization.py | 43 +++++++------ optimum/intel/utils/modeling_utils.py | 10 ++++ 4 files changed, 87 insertions(+), 47 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 86c4ed7725..49b54346e5 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -59,6 +59,7 @@ def export( device: str = "cpu", input_shapes: Optional[Dict] = None, model_kwargs: Optional[Dict[str, Any]] = None, + from_onnx: bool = False, ) -> Tuple[List[str], List[str]]: """ Exports a Pytorch or TensorFlow model to an OpenVINO Intermediate Representation. @@ -93,7 +94,15 @@ def export( if is_torch_available() and isinstance(model, nn.Module): return export_pytorch( - model, config, opset, output, device=device, input_shapes=input_shapes, model_kwargs=model_kwargs + model, + config, + opset, + output, + device=device, + input_shapes=input_shapes, + model_kwargs=model_kwargs, + opset=opset, + from_onnx=from_onnx, ) elif is_tf_available() and issubclass(type(model), TFPreTrainedModel): @@ -112,6 +121,37 @@ def export( ) +def export_pytorch_via_onnx( + model: Union["PreTrainedModel", "ModelMixin"], + config: OnnxConfig, + opset: int, + output: Path, + device: str = "cpu", + input_shapes: Optional[Dict] = None, + model_kwargs: Optional[Dict[str, Any]] = None, +): + import torch + + orig_torch_onnx_export = torch.onnx.export + torch.onnx.export = functools.partial(orig_torch_onnx_export, do_constant_folding=False) + model.config.torchscript = False + model.config.return_dict = True + onnx_output = ( + output.with_suffix(".onnx") if not output.name != OV_XML_FILE_NAME else output.parent / ONNX_WEIGHTS_NAME + ) + input_names, output_names = export_pytorch_to_onnx( + model, config, opset, onnx_output, device, input_shapes, model_kwargs + ) + torch.onnx.export = orig_torch_onnx_export + ov_model = convert_model(str(onnx_output)) + save_model( + ov_model, + output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output, + compress_to_fp16=False, + ) + return input_names, output_names, True + + def export_pytorch( model: Union["PreTrainedModel", "ModelMixin"], config: OnnxConfig, @@ -120,6 +160,7 @@ def export_pytorch( device: str = "cpu", input_shapes: Optional[Dict] = None, model_kwargs: Optional[Dict[str, Any]] = None, + from_onnx: bool = False, ) -> Tuple[List[str], List[str]]: """ Exports a PyTorch model to an OpenVINO Intermediate Representation. 
@@ -148,6 +189,8 @@ def export_pytorch( logger.info(f"Using framework PyTorch: {torch.__version__}") output = Path(output) + if from_onnx: + return export_pytorch_via_onnx(model, config, opset, output, device, input_shapes, model_kwargs) with torch.no_grad(): model.config.return_dict = True @@ -200,28 +243,9 @@ def ts_patched_forward(*args, **kwargs): patcher.patched_forward = ts_patched_forward with patcher: ov_model = convert_model(model, example_input=dummy_inputs, input=input_info) - except Exception: - orig_torch_onnx_export = torch.onnx.export - - torch.onnx.export = functools.partial(orig_torch_onnx_export, do_constant_folding=True) - model.config.torchscript = False - model.config.return_dict = True - onnx_output = ( - output.with_suffix(".onnx") - if not output.name != OV_XML_FILE_NAME - else output.parent / ONNX_WEIGHTS_NAME - ) - input_names, output_names = export_pytorch_to_onnx( - model, config, opset, onnx_output, device, input_shapes, model_kwargs - ) - torch.onnx.export = orig_torch_onnx_export - ov_model = convert_model(str(onnx_output)) - save_model( - ov_model, - output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output, - compress_to_fp16=False, - ) - return input_names, output_names, True + except Exception as ex: + logger.warning(f"Export model to OpenVINO directly failed with: \n{ex}.\nModel will be exported to ONNX") + return export_pytorch_via_onnx(model, config, opset, output, device, input_shapes, model_kwargs) clear_class_registry() ordered_dummy_inputs = {param: dummy_inputs[param] for param in sig.parameters if param in dummy_inputs} ordered_input_names = list(inputs) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 85d5f8230f..83a05c4d26 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -31,8 +31,12 @@ from optimum.utils import NormalizedConfigManager from ..utils.import_utils import is_transformers_version +<<<<<<< HEAD from ..utils.modeling_utils import _prepare_attn_mask, _prepare_decoder_attention_mask from .export import export, is_torch_model +======= +from ..utils.modeling_utils import patch_decoder_attention_mask +>>>>>>> fix llama export in quantization flow from .modeling import _TOKENIZER_FOR_DOC, INPUTS_DOCSTRING, MODEL_START_DOCSTRING, OVModel from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, STR_TO_OV_TYPE @@ -232,12 +236,7 @@ def _from_transformers( onnx_config = onnx_config_constructor(model.config, use_past=use_cache) # TODO : create ModelPatcher to patch each architecture - if config.model_type == "bloom": - model.transformer._prepare_attn_mask = _prepare_attn_mask - elif config.model_type == "llama": - model.model._prepare_decoder_attention_mask = _prepare_decoder_attention_mask - elif config.model_type in {"blenderbot-small", "blenderbot", "opt", "pegasus", "bart"}: - model.model.decoder._prepare_decoder_attention_mask = _prepare_decoder_attention_mask + model = patch_decoder_attention_mask(model) # Export the model to the ONNX format export(model=model, config=onnx_config, output=save_dir_path / ONNX_WEIGHTS_NAME) diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 0fb8d9044d..f9c0524747 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -34,13 +34,12 @@ from transformers import DataCollator, PreTrainedModel, default_data_collator from transformers.pytorch_utils import Conv1D -from optimum.exporters.onnx import 
export as onnx_export from optimum.exporters.tasks import TasksManager from optimum.quantization_base import OptimumQuantizer from ..utils.constant import _TASK_ALIASES -from .configuration import INT8_WEIGHT_COMPRESSION_CONFIG, OVConfig -from .export import export +from ..utils.modeling_utils import patch_decoder_attention_mask +from .configuration import OVConfig from .modeling_base import OVBaseModel from .modeling_decoder import OVBaseDecoderModel from .utils import ( @@ -336,8 +335,9 @@ def _quantize_torchmodel( self._set_task() save_directory = Path(save_directory) save_directory.mkdir(parents=True, exist_ok=True) - file_name = file_name if file_name is not None else OV_XML_FILE_NAME - output_path = save_directory.joinpath(file_name) + ov_file_name = file_name if file_name is not None else OV_XML_FILE_NAME + onnx_file_name = Path(file_name).with_suffix(".onnx") if file_name is not None else ONNX_WEIGHTS_NAME + output_path = save_directory.joinpath(ov_file_name) output_path = output_path.with_suffix(".xml").as_posix() model_type = self.model.config.model_type.replace("_", "-") @@ -379,36 +379,43 @@ def _quantize_torchmodel( self.model.config.save_pretrained(save_directory) if task == "text-generation": + model = patch_decoder_attention_mask(model) onnx_config = onnx_config_class(model.config, use_past=model.config.use_cache) else: onnx_config = onnx_config_class(model.config) - model_path = save_directory / (ONNX_WEIGHTS_NAME if quantization_config.save_onnx_model else OV_XML_FILE_NAME) - if quantization_config.save_onnx_model: - # Export the model to the ONNX format - opset = min(onnx_config.DEFAULT_ONNX_OPSET, MAX_ONNX_OPSET) - opset = max(opset, MIN_ONNX_QDQ_OPSET) - onnx_export( - model=compressed_model, - config=onnx_config, - opset=opset, - output=model_path, - ) - + model_path = save_directory / onnx_file_name if quantization_config.save_onnx_model else ov_file_name + onnx_path = save_directory / onnx_file_name + opset = min(onnx_config.DEFAULT_ONNX_OPSET, MAX_ONNX_OPSET) + opset = max(opset, MIN_ONNX_QDQ_OPSET) + _, _, is_onnx = export( + model=compressed_model, + config=onnx_config, + output=model_path, + opset=opset, + from_onnx=quantization_config.save_onnx_model, + ) + if is_onnx: # Load and save the compressed model - model = core.read_model(model_path) + model = core.read_model(onnx_path) self._save_pretrained(model, output_path) +<<<<<<< HEAD else: _, _, is_onnx = export(model=compressed_model, config=onnx_config, output=output_path) if is_onnx: onnx_path = output_path.replace(".xml", ".onnx") model = core.read_model(onnx_path) self._save_pretrained(model, output_path) +======= + # if onnx conversion happens as fallback for pytorch conversion, remove onnx model + if not quantization_config.save_onnx_model: +>>>>>>> fix llama export in quantization flow os.remove(onnx_path) try: os.remove(f"{onnx_path}_data") except FileNotFoundError: pass + quantization_config.save_pretrained(save_directory) @staticmethod diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py index c7be049990..924c65d10a 100644 --- a/optimum/intel/utils/modeling_utils.py +++ b/optimum/intel/utils/modeling_utils.py @@ -89,3 +89,13 @@ def _prepare_decoder_attention_mask(attention_mask, input_shape, inputs_embeds, ) return combined_attention_mask + + +def patch_decoder_attention_mask(model): + if model.config.model_type == "bloom": + model.transformer._prepare_attn_mask = _prepare_attn_mask + elif model.config.model_type == "llama": + 
model.model._prepare_decoder_attention_mask = _prepare_decoder_attention_mask + elif model.config.model_type in {"blenderbot-small", "blenderbot", "opt", "pegasus", "bart"}: + model.model.decoder._prepare_decoder_attention_mask = _prepare_decoder_attention_mask + return model From b88a92b24b71a6ebd24d231f9bd1bd4710af1d73 Mon Sep 17 00:00:00 2001 From: eaidova Date: Sun, 13 Aug 2023 17:36:24 +0400 Subject: [PATCH 24/38] rebase with fixes --- optimum/exporters/openvino/__init__.py | 5 + optimum/exporters/openvino/convert.py | 1 - optimum/intel/openvino/export.py | 673 ------------------ optimum/intel/openvino/modeling_base.py | 11 +- .../intel/openvino/modeling_base_seq2seq.py | 2 +- optimum/intel/openvino/modeling_decoder.py | 7 +- optimum/intel/openvino/modeling_diffusion.py | 2 +- optimum/intel/openvino/quantization.py | 21 +- setup.py | 2 +- 9 files changed, 21 insertions(+), 703 deletions(-) create mode 100644 optimum/exporters/openvino/__init__.py delete mode 100644 optimum/intel/openvino/export.py diff --git a/optimum/exporters/openvino/__init__.py b/optimum/exporters/openvino/__init__.py new file mode 100644 index 0000000000..9dc8b1833d --- /dev/null +++ b/optimum/exporters/openvino/__init__.py @@ -0,0 +1,5 @@ +from .__main__ import main_export +from .convert import export, export_models + + +__all__ = ["main_export", "export", "export_models"] diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 49b54346e5..800d0742f4 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -101,7 +101,6 @@ def export( device=device, input_shapes=input_shapes, model_kwargs=model_kwargs, - opset=opset, from_onnx=from_onnx, ) diff --git a/optimum/intel/openvino/export.py b/optimum/intel/openvino/export.py deleted file mode 100644 index e82d3a66c6..0000000000 --- a/optimum/intel/openvino/export.py +++ /dev/null @@ -1,673 +0,0 @@ -import functools -import gc -import inspect -import logging -import os -from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Tuple, Union - -from openvino import PartialShape, convert_model, serialize -from openvino.runtime.utils.types import get_element_type -from requests.exceptions import ConnectionError as RequestsConnectionError -from transformers import AutoTokenizer -from transformers.utils import is_tf_available, is_torch_available -from transformers import AutoTokenizer -from transformers.utils import is_tf_available, is_torch_available - -from optimum.exporters import TasksManager -from optimum.exporters.onnx import __main__ -from optimum.exporters.onnx.base import OnnxConfig, OnnxConfigWithPast -from optimum.exporters.onnx.convert import check_dummy_inputs_are_allowed, export_tensorflow -from optimum.exporters.onnx.convert import export_pytorch as export_pytorch_to_onnx -from optimum.utils import DEFAULT_DUMMY_SHAPES, is_diffusers_available -from optimum.utils.save_utils import maybe_save_preprocessors - -from .utils import OV_XML_FILE_NAME - - -logger = logging.getLogger(__name__) - -if is_torch_available(): - import torch - import torch.nn as nn - from transformers.modeling_utils import PreTrainedModel - -if is_diffusers_available(): - from diffusers import ModelMixin - -if is_tf_available(): - from transformers.modeling_tf_utils import TFPreTrainedModel - -def is_torch_model(model): - if not is_torch_available(): - return False - return isinstance(model, nn.Module) - - -def export( - model: Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"], 
- config: OnnxConfig, - output: Path, - opset: Optional[int] = None, - device: str = "cpu", - input_shapes: Optional[Dict] = None, - model_kwargs: Optional[Dict[str, Any]] = None, -) -> Tuple[List[str], List[str]]: - """ - Exports a Pytorch or TensorFlow model to an ONNX Intermediate Representation. - - Args: - model ([`PreTrainedModel`] or [`TFPreTrainedModel`]): - The model to export. - config ([`~exporters.onnx.config.OnnxConfig`]): - The ONNX configuration associated with the exported model. - output (`Path`): - Directory to store the exported ONNX model. - opset (`Optional[int]`, defaults to `None`): - The version of the ONNX operator set to use. - device (`str`, *optional*, defaults to `cpu`): - The device on which the ONNX model will be exported. Either `cpu` or `cuda`. Only PyTorch is supported for - export on CUDA devices. - input_shapes (`Optional[Dict]`, defaults to `None`): - If specified, allows to use specific shapes for the example input provided to the ONNX exporter. - - Returns: - `Tuple[List[str], List[str]]`: A tuple with an ordered list of the model's inputs, and the named inputs from - the ONNX configuration. - """ - if not (is_torch_available() or is_tf_available()): - raise ImportError( - "Cannot convert because neither PyTorch nor TensorFlow are installed. " - "Please install torch or tensorflow first." - ) - - if "diffusers" in str(model.__class__) and not is_diffusers_available(): - raise ImportError("The pip package `diffusers` is required to export stable diffusion models to ONNX.") - - if is_torch_available() and isinstance(model, nn.Module): - return export_pytorch( - model, config, opset, output, device=device, input_shapes=input_shapes, model_kwargs=model_kwargs - ) - - elif is_tf_available() and issubclass(type(model), TFPreTrainedModel): - output.parent.mkdir(parents=True, exist_ok=True) - if opset is None: - opset = config.DEFAULT_ONNX_OPSET - if device == "cuda": - raise RuntimeError("`tf2onnx` does not support export on CUDA device.") - if input_shapes is not None: - logger.info("`input_shapes` argument is not supported by the Tensorflow ONNX export and will be ignored.") - return export_tensorflow(model, config, opset, output) - - else: - raise RuntimeError( - "You either provided a PyTorch model with only TensorFlow installed, or a TensorFlow model with only PyTorch installed." - ) - - -def export_pytorch( - model: Union["PreTrainedModel", "ModelMixin"], - config: OnnxConfig, - opset: int, - output: Path, - device: str = "cpu", - input_shapes: Optional[Dict] = None, - model_kwargs: Optional[Dict[str, Any]] = None, -) -> Tuple[List[str], List[str]]: - """ - Exports a PyTorch model to an ONNX Intermediate Representation. - - Args: - model ([`PreTrainedModel`]): - The model to export. - config ([`~exporters.onnx.config.OnnxConfig`]): - The ONNX configuration associated with the exported model. - opset (`int`): - The version of the ONNX operator set to use. - output (`Path`): - Directory to store the exported ONNX model. - device (`str`, defaults to `"cpu"`): - The device on which the ONNX model will be exported. Either `cpu` or `cuda`. Only PyTorch is supported for - export on CUDA devices. - input_shapes (`optional[Dict]`, defaults to `None`): - If specified, allows to use specific shapes for the example input provided to the ONNX exporter. - - Returns: - `Tuple[List[str], List[str]]`: A tuple with an ordered list of the model's inputs, and the named inputs from - the ONNX configuration. 
- """ - import torch - from torch.utils._pytree import tree_map - - logger.info(f"Using framework PyTorch: {torch.__version__}") - output = Path(output) - - with torch.no_grad(): - model.config.return_dict = True - custom_patcher = type(config).patch_model_for_export != OnnxConfig.patch_model_for_export - model.config.torchscript = not custom_patcher - from torch.utils._pytree import tree_map - - logger.info(f"Using framework PyTorch: {torch.__version__}") - output = Path(output) - - with torch.no_grad(): - model.config.return_dict = True - custom_patcher = type(config).patch_model_for_export != OnnxConfig.patch_model_for_export - model.config.torchscript = not custom_patcher - model.eval() - - # Check if we need to override certain configuration item - if config.values_override is not None: - logger.info(f"Overriding {len(config.values_override)} configuration item(s)") - for override_config_key, override_config_value in config.values_override.items(): - logger.info(f"\t- {override_config_key} -> {override_config_value}") - setattr(model.config, override_config_key, override_config_value) - - if input_shapes is None: - input_shapes = {} # will use the defaults from DEFAULT_DUMMY_SHAPES - - # Check that inputs match, and order them properly - dummy_inputs = config.generate_dummy_inputs(framework="pt", **input_shapes) - device = torch.device(device) - if device.type == "cuda" and torch.cuda.is_available(): - model.to(device) - dummy_inputs = tree_map( - lambda value: value.to(device) if isinstance(value, torch.Tensor) else value, dummy_inputs - ) - check_dummy_inputs_are_allowed(model, dummy_inputs) - inputs = config.ordered_inputs(model) - input_names = list(inputs.keys()) - output_names = list(config.outputs.keys()) - if hasattr(model, "forward"): - sig = inspect.signature(model.forward) - else: - sig = inspect.signature(model.call) - dummy_inputs = remove_none_from_dummy_inputs(dummy_inputs) - input_info = get_input_shapes(dummy_inputs, inputs) - try: - if custom_patcher: - patcher = config.patch_model_for_export(model, model_kwargs=model_kwargs) - patched_forward = patcher.patched_forward - - @functools.wraps(patched_forward) - def ts_patched_forward(*args, **kwargs): - outputs = patched_forward(*args, **kwargs) - return tuple(outputs.values()) - - patcher.patched_forward = ts_patched_forward - with patcher: - ov_model = convert_model(model, example_input=dummy_inputs, input=input_info) - else: - ov_model = convert_model(model, example_input=dummy_inputs, input=input_info) - except Exception: - model.config.torchscript = False - model.config.return_dict = True - onnx_output = output.with_suffix(".onnx") - input_names, output_names = export_pytorch_to_onnx( - model, config, opset, onnx_output, device, input_shapes, model_kwargs - ) - ov_model = convert_model(onnx_output) - serialize(ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output) - return input_names, output_names, True - clear_class_registry() - ordered_dummy_inputs = {param: dummy_inputs[param] for param in sig.parameters if param in dummy_inputs} - ordered_input_names = list(inputs) - flatten_inputs = flattenize_inputs(ordered_dummy_inputs.values()) - for idx, out_tensor in enumerate(ov_model.outputs): - if idx < len(output_names): - out_tensor.get_tensor().set_names({output_names[idx]}) - - input_info = get_input_shapes(dummy_inputs, inputs) - try: - if custom_patcher: - patcher = config.patch_model_for_export(model, model_kwargs=model_kwargs) - patched_forward = patcher.patched_forward - - 
@functools.wraps(patched_forward) - def ts_patched_forward(*args, **kwargs): - outputs = patched_forward(*args, **kwargs) - return tuple(outputs.values()) - - patcher.patched_forward = ts_patched_forward - with patcher: - ov_model = convert_model(model, example_input=dummy_inputs, input=input_info) - else: - ov_model = convert_model(model, example_input=dummy_inputs, input=input_info) - except Exception: - model.config.torchscript = False - model.config.return_dict = True - onnx_output = output.with_suffix(".onnx") - input_names, output_names = export_pytorch_to_onnx( - model, config, opset, onnx_output, device, input_shapes, model_kwargs - ) - ov_model = convert_model(onnx_output) - serialize(ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output) - return input_names, output_names, True - - ordered_dummy_inputs = {param: dummy_inputs[param] for param in sig.parameters if param in dummy_inputs} - ordered_input_names = list(inputs) - flatten_inputs = flattenize_inputs(ordered_dummy_inputs.values()) - for idx, out_tensor in enumerate(ov_model.outputs): - if idx < len(output_names): - out_tensor.get_tensor().set_names({output_names[idx]}) - for idx, inp_tensor in enumerate(ov_model.inputs): - input_name = ordered_input_names[idx] - inp_tensor.get_tensor().set_names({input_name}) - inp_data = flatten_inputs[idx] - static_shape = PartialShape(inp_data.shape) - dims = inputs[input_name] - - for dim in dims: - static_shape[dim] = -1 - inp_tensor.get_node().set_partial_shape(static_shape) - inp_tensor.get_node().set_element_type(get_element_type(inp_data.cpu().numpy().dtype)) - ov_model.validate_nodes_and_infer_types() - serialize(ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output) - del model - gc.collect() - return input_names, output_names, False - - -def clear_class_registry(): - torch._C._jit_clear_class_registry() - torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() - torch.jit._state._clear_class_state() - - -def export_models( - models_and_onnx_configs: Dict[ - str, Tuple[Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"], "OnnxConfig"] - ], - output_dir: Path, - opset: Optional[int] = None, - output_names: Optional[List[str]] = None, - device: str = "cpu", - input_shapes: Optional[Dict] = None, - model_kwargs: Optional[Dict[str, Any]] = None, -) -> Tuple[List[List[str]], List[List[str]]]: - """ - Exports a Pytorch or TensorFlow encoder decoder model to an ONNX Intermediate Representation. - The following method exports the encoder and decoder components of the model as separate - ONNX files. - - Args: - models_and_onnx_configs (`Dict[str, Tuple[Union[`PreTrainedModel`, `TFPreTrainedModel`], `OnnxConfig`]]): - A dictionnary containing the models to export and their corresponding onnx configs. - output_dir (`Path`): - Output directory to store the exported ONNX models. - opset (`Optional[int]`, defaults to `None`): - The version of the ONNX operator set to use. - output_names (`Optional[List[str]]`, defaults to `None`): - The names to use for the exported ONNX files. The order must be the same as the order of submodels in the ordered dict `models_and_onnx_configs`. - If None, will use the keys from `models_and_onnx_configs` as names. - device (`str`, defaults to `"cpu"`): - The device on which the ONNX model will be exported. Either `cpu` or `cuda`. Only PyTorch is supported for - export on CUDA devices. 
- input_shapes (`Optional[Dict]`, defaults to `None`): - If specified, allows to use specific shapes for the example input provided to the ONNX exporter. - Returns: - `Tuple[List[List[str]], List[List[str]]]`: A tuple with an ordered list of the model's inputs, and the named - inputs from the ONNX configuration. - """ - outputs = [] - - if output_names is not None and len(output_names) != len(models_and_onnx_configs): - raise ValueError( - f"Provided custom names {output_names} for the export of {len(models_and_onnx_configs)} models. Please provide the same number of names as models to export." - ) - - for i, model_name in enumerate(models_and_onnx_configs.keys()): - submodel, sub_onnx_config = models_and_onnx_configs[model_name] - output_name = output_names[i] if output_names is not None else Path(model_name + ".xml") - output_path = output_dir / output_name - output_path.parent.mkdir(parents=True, exist_ok=True) - outputs.append( - export( - model=submodel, - config=sub_onnx_config, - output=output_path, - opset=opset, - device=device, - input_shapes=input_shapes, - model_kwargs=model_kwargs, - ) - ) - - outputs = list(map(list, zip(*outputs))) - return outputs - - -def flattenize_inputs(inputs): - flatten_inputs = [] - for input_data in inputs: - if input_data is None: - continue - if isinstance(input_data, (list, tuple)): - flatten_inputs.extend(flattenize_inputs(input_data)) - else: - flatten_inputs.append(input_data) - return flatten_inputs - - -def remove_none_from_dummy_inputs(dummy_inputs): - def remove_none_from_list_tuple(item): - new_item = [i for i in item if i is not None] - return type(item)(new_item) - - upd_dummy = {} - for k, v in dummy_inputs.items(): - if v is None: - continue - if isinstance(v, dict): - for kk, vv in v.items(): - upd_dummy[kk] = vv - continue - if isinstance(v, (tuple, list)): - upd_dummy[k] = remove_none_from_list_tuple(v) - continue - upd_dummy[k] = v - return upd_dummy - - -def get_input_shapes(dummy_inputs, inputs): - input_info = [] - for input_name, data in dummy_inputs.items(): - if isinstance(data, (tuple, list, dict)): - return None - static_shape = PartialShape(data.shape) - if input_name in inputs: - dynamic_dims = inputs[input_name] - for dim in dynamic_dims: - static_shape[dim] = -1 - input_info.append((input_name, static_shape)) - return input_info - - -def main_export( - model_name_or_path: str, - output: Union[str, Path], - task: str = "auto", - device: str = "cpu", - fp16: Optional[bool] = False, - monolith: bool = False, - framework: Optional[str] = None, - cache_dir: Optional[str] = None, - trust_remote_code: bool = False, - pad_token_id: Optional[int] = None, - subfolder: str = "", - revision: str = "main", - force_download: bool = False, - local_files_only: bool = False, - use_auth_token: Optional[Union[bool, str]] = None, - model_kwargs: Optional[Dict[str, Any]] = None, - custom_onnx_configs: Optional[Dict[str, "OnnxConfig"]] = None, - fn_get_submodels: Optional[Callable] = None, - **kwargs_shapes, -): - """ - Full-suite ONNX export. - - Args: - > Required parameters - - model_name_or_path (`str`): - Model ID on huggingface.co or path on disk to the model repository to export. - output (`Union[str, Path]`): - Path indicating the directory where to store the generated ONNX model. - - > Optional parameters - - task (`Optional[str]`, defaults to `None`): - The task to export the model for. If not specified, the task will be auto-inferred based on the model. 
For decoder models, - use `xxx-with-past` to export the model using past key values in the decoder. - opset (`Optional[int]`, defaults to `None`): - If specified, ONNX opset version to export the model with. Otherwise, the default opset for the given model architecture - will be used. - device (`str`, defaults to `"cpu"`): - The device to use to do the export. Defaults to "cpu". - fp16 (`Optional[bool]`, defaults to `"False"`): - Use half precision during the export. PyTorch-only, requires `device="cuda"`. - optimize (`Optional[str]`, defaults to `None`): - Allows to run ONNX Runtime optimizations directly during the export. Some of these optimizations are specific to - ONNX Runtime, and the resulting ONNX will not be usable with other runtime as OpenVINO or TensorRT. - Available options: `"O1", "O2", "O3", "O4"`. Reference: [`~optimum.onnxruntime.AutoOptimizationConfig`] - monolith (`bool`, defaults to `False`): - Forces to export the model as a single ONNX file. - no_post_process (`bool`, defaults to `False`): - Allows to disable any post-processing done by default on the exported ONNX models. - framework (`Optional[str]`, defaults to `None`): - The framework to use for the ONNX export (`"pt"` or `"tf"`). If not provided, will attempt to automatically detect - the framework for the checkpoint. - atol (`Optional[float]`, defaults to `None`): - If specified, the absolute difference tolerance when validating the model. Otherwise, the default atol for the model will be used. - cache_dir (`Optional[str]`, defaults to `None`): - Path indicating where to store cache. The default Hugging Face cache path will be used by default. - trust_remote_code (`bool`, defaults to `False`): - Allows to use custom code for the modeling hosted in the model repository. This option should only be set for repositories - you trust and in which you have read the code, as it will execute on your local machine arbitrary code present in the - model repository. - pad_token_id (`Optional[int]`, defaults to `None`): - This is needed by some models, for some tasks. If not provided, will attempt to use the tokenizer to guess it. - subfolder (`str`, defaults to `""`): - In case the relevant files are located inside a subfolder of the model repo either locally or on huggingface.co, you can - specify the folder name here. - revision (`str`, defaults to `"main"`): - Revision is the specific model version to use. It can be a branch name, a tag name, or a commit id. - force_download (`bool`, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - local_files_only (`Optional[bool]`, defaults to `False`): - Whether or not to only look at local files (i.e., do not try to download the model). - use_auth_token (`Optional[str]`, defaults to `None`): - The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `transformers-cli login` (stored in `~/.huggingface`). - model_kwargs (`Optional[Dict[str, Any]]`, defaults to `None`): - Experimental usage: keyword arguments to pass to the model during - the export. This argument should be used along the `custom_onnx_configs` argument - in case, for example, the model inputs/outputs are changed (for example, if - `model_kwargs={"output_attentions": True}` is passed). - custom_onnx_configs (`Optional[Dict[str, OnnxConfig]]`, defaults to `None`): - Experimental usage: override the default ONNX config used for the given model. 
This argument may be useful for advanced users that desire a finer-grained control on the export. An example is available [here](https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model). - fn_get_submodels (`Optional[Callable]`, defaults to `None`): - Experimental usage: Override the default submodels that are used at the export. This is - especially useful when exporting a custom architecture that needs to split the ONNX (e.g. encoder-decoder). If unspecified with custom models, optimum will try to use the default submodels used for the given task, with no guarantee of success. - use_subprocess (`bool`): - Do the ONNX exported model validation in subprocesses. This is especially useful when - exporting on CUDA device, where ORT does not release memory at inference session - destruction. When set to `True`, the `main_export` call should be guarded in - `if __name__ == "__main__":` block. - **kwargs_shapes (`Dict`): - Shapes to use during inference. This argument allows to override the default shapes used during the ONNX export. - - Example usage: - ```python - >>> from optimum.exporters.onnx import main_export - - >>> main_export("gpt2", output="gpt2_onnx/") - ``` - """ - if (framework == "tf" and fp16 is True) or not is_torch_available(): - raise ValueError("The --fp16 option is supported only for PyTorch.") - - if fp16 is True and device == "cpu": - raise ValueError( - "The --fp16 option is supported only when exporting on GPU. Please pass the option `--device cuda`." - ) - - output = Path(output) - if not output.exists(): - output.mkdir(parents=True) - original_task = task - task = TasksManager.map_from_synonym(task) - - framework = TasksManager.determine_framework(model_name_or_path, subfolder=subfolder, framework=framework) - - # get the shapes to be used to generate dummy inputs - input_shapes = {} - for input_name in DEFAULT_DUMMY_SHAPES.keys(): - input_shapes[input_name] = ( - kwargs_shapes[input_name] if input_name in kwargs_shapes else DEFAULT_DUMMY_SHAPES[input_name] - ) - - torch_dtype = None if fp16 is False else torch.float16 - - if task == "auto": - try: - task = TasksManager.infer_task_from_model(model_name_or_path) - except KeyError as e: - raise KeyError( - f"The task could not be automatically inferred. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}" - ) - except RequestsConnectionError as e: - raise RequestsConnectionError( - f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}" - ) - - model = TasksManager.get_model_from_task( - task, - model_name_or_path, - subfolder=subfolder, - revision=revision, - cache_dir=cache_dir, - use_auth_token=use_auth_token, - local_files_only=local_files_only, - force_download=force_download, - trust_remote_code=trust_remote_code, - framework=framework, - torch_dtype=torch_dtype, - device=device, - ) - - custom_architecture = False - is_stable_diffusion = "stable-diffusion" in task - model_type = "stable-diffusion" if is_stable_diffusion else model.config.model_type.replace("_", "-") - - if not is_stable_diffusion: - if model_type in TasksManager._UNSUPPORTED_CLI_MODEL_TYPE: - raise ValueError( - f"{model_type} is not supported yet. Only {TasksManager._SUPPORTED_CLI_MODEL_TYPE} are supported. 
" - f"If you want to support {model_type} please propose a PR or open up an issue." - ) - if model.config.model_type.replace("-", "_") not in TasksManager.get_supported_model_type_for_task( - task, exporter="onnx" - ): - custom_architecture = True - - # TODO: support onnx_config.py in the model repo - if custom_architecture and custom_onnx_configs is None: - raise ValueError( - "Trying to export a model with a custom architecture, but no custom onnx configuration was passed as `custom_onnx_configs`. Please refer to https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#custom-export-of-transformers-models for an example on how to export custom models." - ) - - if custom_architecture and original_task == "auto": - raise ValueError( - f'Automatic task detection is not supported with custom architectures. Please specify the `task` argument. Suggestion: task="{task}" (or task="{task}-with-past" if the model is decoder-based and supports KV cache)' - ) - - if ( - not custom_architecture - and not is_stable_diffusion - and task + "-with-past" in TasksManager.get_supported_tasks_for_model_type(model_type, "onnx") - ): - if original_task == "auto": # Make -with-past the default if --task was not explicitely specified - task = task + "-with-past" - else: - logger.info( - f"The task `{task}` was manually specified, and past key values will not be reused in the decoding." - f" if needed, please pass `--task {task}-with-past` to export using the past key values." - ) - - if task.endswith("-with-past") and monolith is True: - task_non_past = task.replace("-with-past", "") - raise ValueError( - f"The task {task} is not compatible with the --monolith argument. Please either use" - f" `--task {task_non_past} --monolith`, or `--task {task}` without the monolith argument." - ) - - if original_task == "auto": - synonyms_for_task = sorted(TasksManager.synonyms_for_task(task)) - if synonyms_for_task: - synonyms_for_task = ", ".join(synonyms_for_task) - possible_synonyms = f" (possible synonyms are: {synonyms_for_task})" - else: - possible_synonyms = "" - logger.info(f"Automatic task detection to {task}{possible_synonyms}.") - - onnx_config, models_and_onnx_configs = __main__._get_submodels_and_onnx_configs( - model=model, - task=task, - monolith=monolith, - custom_onnx_configs=custom_onnx_configs if custom_onnx_configs is not None else {}, - custom_architecture=custom_architecture, - fn_get_submodels=fn_get_submodels, - ) - - if not is_stable_diffusion: - needs_pad_token_id = ( - isinstance(onnx_config, OnnxConfigWithPast) - and getattr(model.config, "pad_token_id", None) is None - and task in ["text-classification"] - ) - if needs_pad_token_id: - if pad_token_id is not None: - model.config.pad_token_id = pad_token_id - else: - try: - tok = AutoTokenizer.from_pretrained(model_name_or_path) - model.config.pad_token_id = tok.pad_token_id - except Exception: - raise ValueError( - "Could not infer the pad token id, which is needed in this case, please provide it with the --pad_token_id argument" - ) - # Saving the model config and preprocessor as this is needed sometimes. 
- model.config.save_pretrained(output) - generation_config = getattr(model, "generation_config", None) - if generation_config is not None: - generation_config.save_pretrained(output) - maybe_save_preprocessors(model_name_or_path, output) - - if model.config.is_encoder_decoder and task.startswith("text-generation"): - raise ValueError( - f"model.config.is_encoder_decoder is True and task is `{task}`, which are incompatible. If the task was auto-inferred, please fill a bug report" - f"at https://github.com/huggingface/optimum, if --task was explicitely passed, make sure you selected the right task for the model," - f" referring to `optimum.exporters.tasks.TaskManager`'s `_TASKS_TO_AUTOMODELS`." - ) - - files_subpaths = None - else: - # save the subcomponent configuration - for model_name in models_and_onnx_configs: - subcomponent = models_and_onnx_configs[model_name][0] - if hasattr(subcomponent, "save_config"): - subcomponent.save_config(output / model_name) - elif hasattr(subcomponent, "config") and hasattr(subcomponent.config, "save_pretrained"): - subcomponent.config.save_pretrained(output / model_name) - - files_subpaths = [os.path.join(name_dir, OV_XML_FILE_NAME) for name_dir in models_and_onnx_configs] - - # Saving the additional components needed to perform inference. - model.scheduler.save_pretrained(output.joinpath("scheduler")) - - feature_extractor = getattr(model, "feature_extractor", None) - if feature_extractor is not None: - feature_extractor.save_pretrained(output.joinpath("feature_extractor")) - - tokenizer = getattr(model, "tokenizer", None) - if tokenizer is not None: - tokenizer.save_pretrained(output.joinpath("tokenizer")) - - tokenizer_2 = getattr(model, "tokenizer_2", None) - if tokenizer_2 is not None: - tokenizer_2.save_pretrained(output.joinpath("tokenizer_2")) - - model.save_config(output) - - export_models( - models_and_onnx_configs=models_and_onnx_configs, - output_dir=output, - output_names=files_subpaths, - input_shapes=input_shapes, - device=device, - model_kwargs=model_kwargs, - ) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index fda0e9eb5a..b06670dffa 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -29,8 +29,9 @@ from optimum.exporters.tasks import TasksManager from optimum.modeling_base import OptimizedModel +from ...exporters.openvino import export +from ...exporters.openvino.utils import is_torch_model from ..utils.import_utils import is_transformers_version -from .export import export, is_torch_model from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME @@ -129,10 +130,7 @@ def fix_op_names_duplicates(model: openvino.runtime.Model): if isinstance(file_name, str): file_name = Path(file_name) - bin_file_name = file_name.with_suffix(".bin") if file_name.suffix == ".xml" else None - model = ( - core.read_model(file_name, bin_file_name) if not file_name.suffix == ".onnx" else convert_model(file_name) - ) + model = core.read_model(file_name) if not file_name.suffix == ".onnx" else convert_model(file_name) if file_name.suffix == ".onnx": model = fix_op_names_duplicates(model) # should be called during model conversion to IR @@ -199,8 +197,9 @@ def _from_pretrained( model_save_dir = model_id # Download the model from the hub else: - model_file_names = [file_name] if from_onnx else [] + model_file_names = [file_name] # If not ONNX then OpenVINO IR + if not from_onnx: model_file_names.append(file_name.replace(".xml", ".bin")) file_names = 
[] diff --git a/optimum/intel/openvino/modeling_base_seq2seq.py b/optimum/intel/openvino/modeling_base_seq2seq.py index eca3f661a8..f8e09b2c91 100644 --- a/optimum/intel/openvino/modeling_base_seq2seq.py +++ b/optimum/intel/openvino/modeling_base_seq2seq.py @@ -27,8 +27,8 @@ from optimum.exporters import TasksManager from optimum.exporters.onnx import get_encoder_decoder_models_for_export +from ...exporters.openvino import export_models from ..utils.import_utils import is_transformers_version -from .export import export_models from .modeling_base import OVBaseModel from .utils import ( ONNX_DECODER_NAME, diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 83a05c4d26..966bec52f4 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -30,13 +30,10 @@ from optimum.exporters import TasksManager from optimum.utils import NormalizedConfigManager +from ...exporters.openvino import export +from ...exporters.openvino.utils import is_torch_model from ..utils.import_utils import is_transformers_version -<<<<<<< HEAD -from ..utils.modeling_utils import _prepare_attn_mask, _prepare_decoder_attention_mask -from .export import export, is_torch_model -======= from ..utils.modeling_utils import patch_decoder_attention_mask ->>>>>>> fix llama export in quantization flow from .modeling import _TOKENIZER_FOR_DOC, INPUTS_DOCSTRING, MODEL_START_DOCSTRING, OVModel from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, STR_TO_OV_TYPE diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 21e8323394..8de9ead5f5 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -49,8 +49,8 @@ DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, ) +from ...exporters.openvino import main_export from .loaders import OVTextualInversionLoaderMixin -from .export import main_export from .modeling_base import OVBaseModel from .utils import ONNX_WEIGHTS_NAME, OV_TO_NP_TYPE, OV_XML_FILE_NAME diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index f9c0524747..63083c6d89 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -30,13 +30,14 @@ from nncf.torch.initialization import PTInitializingDataLoader from openvino._offline_transformations import compress_quantize_weights_transformation from openvino.runtime import Core, Tensor -from torch.utils.data import DataLoader, RandomSampler, TensorDataset +from torch.utils.data import DataLoader, RandomSampler from transformers import DataCollator, PreTrainedModel, default_data_collator from transformers.pytorch_utils import Conv1D from optimum.exporters.tasks import TasksManager from optimum.quantization_base import OptimumQuantizer +from ...exporters.openvino import export from ..utils.constant import _TASK_ALIASES from ..utils.modeling_utils import patch_decoder_attention_mask from .configuration import OVConfig @@ -353,7 +354,7 @@ def _quantize_torchmodel( "No configuration describing the quantization process was provided, a default OVConfig will be generated." 
) quantization_config = OVConfig() - + if weights_only: compressed_model = compress_weights(self.model) self.model = compressed_model @@ -377,19 +378,18 @@ def _quantize_torchmodel( task = self.task model = self.model self.model.config.save_pretrained(save_directory) - + model = patch_decoder_attention_mask(model) if task == "text-generation": - model = patch_decoder_attention_mask(model) onnx_config = onnx_config_class(model.config, use_past=model.config.use_cache) else: onnx_config = onnx_config_class(model.config) - model_path = save_directory / onnx_file_name if quantization_config.save_onnx_model else ov_file_name + model_path = save_directory / (onnx_file_name if quantization_config.save_onnx_model else ov_file_name) onnx_path = save_directory / onnx_file_name opset = min(onnx_config.DEFAULT_ONNX_OPSET, MAX_ONNX_OPSET) opset = max(opset, MIN_ONNX_QDQ_OPSET) _, _, is_onnx = export( - model=compressed_model, + model=model, config=onnx_config, output=model_path, opset=opset, @@ -399,17 +399,8 @@ def _quantize_torchmodel( # Load and save the compressed model model = core.read_model(onnx_path) self._save_pretrained(model, output_path) -<<<<<<< HEAD - else: - _, _, is_onnx = export(model=compressed_model, config=onnx_config, output=output_path) - if is_onnx: - onnx_path = output_path.replace(".xml", ".onnx") - model = core.read_model(onnx_path) - self._save_pretrained(model, output_path) -======= # if onnx conversion happens as fallback for pytorch conversion, remove onnx model if not quantization_config.save_onnx_model: ->>>>>>> fix llama export in quantization flow os.remove(onnx_path) try: os.remove(f"{onnx_path}_data") diff --git a/setup.py b/setup.py index 020d4e8826..2dd7510e96 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ "onnxruntime<1.15.0", ], "openvino": ["openvino==2023.1.0.dev20230728", "onnx", "onnxruntime"], - "nncf": ["git+https://github.com/openvinotoolkit/nncf.git"], + "nncf": ["nncf @ git+https://github.com/openvinotoolkit/nncf.git"], "ipex": ["transformers<4.32.0", "intel-extension-for-pytorch", "onnx"], "diffusers": ["diffusers", "invisible-watermark>=0.2.0"], "quality": QUALITY_REQUIRE, From 62634d1aeafa54e2c47014e5505b9a129f9b4651 Mon Sep 17 00:00:00 2001 From: eaidova Date: Fri, 18 Aug 2023 13:50:19 +0400 Subject: [PATCH 25/38] update prerelease package --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2dd7510e96..4812ede91a 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ "onnx", "onnxruntime<1.15.0", ], - "openvino": ["openvino==2023.1.0.dev20230728", "onnx", "onnxruntime"], + "openvino": ["openvino==2023.1.0.dev20230811", "onnx", "onnxruntime"], "nncf": ["nncf @ git+https://github.com/openvinotoolkit/nncf.git"], "ipex": ["transformers<4.32.0", "intel-extension-for-pytorch", "onnx"], "diffusers": ["diffusers", "invisible-watermark>=0.2.0"], From c54010fdf79894fdde36a2f075b7ea6d00ab598e Mon Sep 17 00:00:00 2001 From: eaidova Date: Fri, 18 Aug 2023 14:27:41 +0400 Subject: [PATCH 26/38] fix onnx name issues --- optimum/exporters/openvino/convert.py | 44 ++++++++++++++------------ optimum/intel/openvino/quantization.py | 7 ++-- tests/openvino/test_quantization.py | 4 +-- 3 files changed, 31 insertions(+), 24 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 800d0742f4..eb8f20f6c9 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -29,7 
+29,7 @@ from optimum.exporters.onnx.convert import export_pytorch as export_pytorch_to_onnx from optimum.utils import is_diffusers_available -from ...intel.openvino.utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME +from ...intel.openvino.utils import OV_XML_FILE_NAME from .utils import ( clear_class_registry, flattenize_inputs, @@ -135,9 +135,7 @@ def export_pytorch_via_onnx( torch.onnx.export = functools.partial(orig_torch_onnx_export, do_constant_folding=False) model.config.torchscript = False model.config.return_dict = True - onnx_output = ( - output.with_suffix(".onnx") if not output.name != OV_XML_FILE_NAME else output.parent / ONNX_WEIGHTS_NAME - ) + onnx_output = output.with_suffix(".onnx") input_names, output_names = export_pytorch_to_onnx( model, config, opset, onnx_output, device, input_shapes, model_kwargs ) @@ -192,6 +190,7 @@ def export_pytorch( return export_pytorch_via_onnx(model, config, opset, output, device, input_shapes, model_kwargs) with torch.no_grad(): + model.config.torchscript = False model.config.return_dict = True model.eval() @@ -224,23 +223,28 @@ def export_pytorch( dummy_inputs, dict_inputs = remove_none_from_dummy_inputs(dummy_inputs) input_info = get_input_shapes(dummy_inputs, inputs) + custom_patcher = type(config).patch_model_for_export != OnnxConfig.patch_model_for_export try: - patcher = config.patch_model_for_export(model, model_kwargs=model_kwargs) - patched_forward = patcher.patched_forward - - @functools.wraps(patched_forward) - def ts_patched_forward(*args, **kwargs): - for i in range(len(dict_inputs)): - input_name = dict_inputs[i][0] - keys = dict_inputs[i][1] - tuple_input = kwargs[input_name] - input_dict = dict(zip(keys, tuple_input)) - kwargs[input_name] = input_dict - outputs = patched_forward(*args, **kwargs) - return tuple(outputs.values()) - - patcher.patched_forward = ts_patched_forward - with patcher: + if custom_patcher or dict_inputs: + patcher = config.patch_model_for_export(model, model_kwargs=model_kwargs) + patched_forward = patcher.patched_forward + + @functools.wraps(patched_forward) + def ts_patched_forward(*args, **kwargs): + for i in range(len(dict_inputs)): + input_name = dict_inputs[i][0] + keys = dict_inputs[i][1] + tuple_input = kwargs[input_name] + input_dict = dict(zip(keys, tuple_input)) + kwargs[input_name] = input_dict + outputs = patched_forward(*args, **kwargs) + return tuple(outputs.values()) + + patcher.patched_forward = ts_patched_forward + with patcher: + ov_model = convert_model(model, example_input=dummy_inputs, input=input_info) + else: + model.config.torchscript = True ov_model = convert_model(model, example_input=dummy_inputs, input=input_info) except Exception as ex: logger.warning(f"Export model to OpenVINO directly failed with: \n{ex}.\nModel will be exported to ONNX") diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 63083c6d89..708be320d4 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -337,7 +337,6 @@ def _quantize_torchmodel( save_directory = Path(save_directory) save_directory.mkdir(parents=True, exist_ok=True) ov_file_name = file_name if file_name is not None else OV_XML_FILE_NAME - onnx_file_name = Path(file_name).with_suffix(".onnx") if file_name is not None else ONNX_WEIGHTS_NAME output_path = save_directory.joinpath(ov_file_name) output_path = output_path.with_suffix(".xml").as_posix() @@ -354,7 +353,11 @@ def _quantize_torchmodel( "No configuration describing the quantization process was 
provided, a default OVConfig will be generated." ) quantization_config = OVConfig() - + onnx_file_name = ( + ONNX_WEIGHTS_NAME + if file_name is None and quantization_config.save_onnx_model + else Path(ov_file_name).with_suffix(".onnx") + ) if weights_only: compressed_model = compress_weights(self.model) self.model = compressed_model diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 51dfe98507..369ad0f836 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -64,8 +64,8 @@ def get_num_quantized_nodes(ov_model): class OVQuantizerTest(unittest.TestCase): # TODO : add models SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = ( - (OVModelForSequenceClassification, "hf-internal-testing/tiny-random-bert", 32, 32), - (OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", 41, 21), + (OVModelForSequenceClassification, "hf-internal-testing/tiny-random-bert", 32, 35), + (OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", 41, 22), ) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) From 2e003c15421abc7572a2f1c82643aa478d3c3e1d Mon Sep 17 00:00:00 2001 From: eaidova Date: Mon, 21 Aug 2023 11:05:13 +0400 Subject: [PATCH 27/38] experiments with tests --- .github/workflows/test_openvino.yml | 6 ++- tests/openvino/test_modeling.py | 72 +++++++++++++++++++++++++++-- 2 files changed, 74 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index cb58f412a6..80ab12c2f0 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -35,4 +35,8 @@ jobs: pip install .[openvino,nncf,tests,diffusers] - name: Test with Pytest run: | - pytest tests/openvino/ --ignore test_modeling_basic + pytest tests/openvino/test_modeling.py + pytest tests/openvino/test_quantization.py + pytest tests/openvino/test_stable_diffusion.py + pytest tests/openvino/test_training_examples.py + pytest tests/openvino/test_training.py diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 2ffbbd6fba..c2d54893bc 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -117,6 +117,9 @@ def test_load_from_hub_and_save_model(self): outputs = model(**tokens) self.assertTrue(torch.equal(loaded_model_outputs.logits, outputs.logits)) + del loaded_model + del model + gc.collect() def test_load_from_hub_and_save_decoder_model(self): tokenizer = AutoTokenizer.from_pretrained(self.OV_DECODER_MODEL_ID) @@ -134,6 +137,9 @@ def test_load_from_hub_and_save_decoder_model(self): outputs = model(**tokens) self.assertTrue(torch.equal(loaded_model_outputs.logits, outputs.logits)) + del loaded_model + del model + gc.collect() def test_load_from_hub_and_save_seq2seq_model(self): tokenizer = AutoTokenizer.from_pretrained(self.OV_SEQ2SEQ_MODEL_ID) @@ -153,6 +159,9 @@ def test_load_from_hub_and_save_seq2seq_model(self): outputs = model.generate(**tokens) self.assertTrue(torch.equal(loaded_model_outputs, outputs)) + del loaded_model + del model + gc.collect() @require_diffusers def test_load_from_hub_and_save_stable_diffusion_model(self): @@ -186,6 +195,8 @@ def test_load_from_hub_and_save_stable_diffusion_model(self): np.random.seed(0) outputs = pipeline(**inputs).images self.assertTrue(np.array_equal(pipeline_outputs, outputs)) + del pipeline + gc.collect() class OVModelForSequenceClassificationIntegrationTest(unittest.TestCase): @@ -228,6 +239,9 @@ def 
test_compare_to_transformers(self, model_arch): self.assertIsInstance(ov_outputs.logits, TENSOR_ALIAS_TO_TYPE[input_type]) # Compare tensor outputs self.assertTrue(torch.allclose(torch.Tensor(ov_outputs.logits), transformers_outputs.logits, atol=1e-4)) + del transformers_model + del ov_model + gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_pipeline(self, model_arch): @@ -258,6 +272,7 @@ def test_pipeline(self, model_arch): self.assertGreaterEqual(outputs[0]["score"], 0.0) self.assertIsInstance(outputs[0]["label"], str) del model + del pipe gc.collect() @@ -327,6 +342,10 @@ def test_metric(self): ov_metric = task_evaluator.compute(model_or_pipeline=ov_pipe, data=data, metric="squad") self.assertEqual(ov_metric["exact_match"], transformers_metric["exact_match"]) self.assertEqual(ov_metric["f1"], transformers_metric["f1"]) + del transformers_pipe + del transformers_model + del ov_pipe + del ov_model gc.collect() @@ -356,6 +375,8 @@ def test_compare_to_transformers(self, model_arch): self.assertIsInstance(ov_outputs.logits, TENSOR_ALIAS_TO_TYPE[input_type]) # Compare tensor outputs self.assertTrue(torch.allclose(torch.Tensor(ov_outputs.logits), transformers_outputs.logits, atol=1e-4)) + del transformers_model + del ov_model gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -367,6 +388,8 @@ def test_pipeline(self, model_arch): outputs = pipe("My Name is Arthur and I live in Lyon.") self.assertEqual(pipe.device, model.device) self.assertTrue(all(item["score"] > 0.0 for item in outputs)) + del model + del pipe gc.collect() @@ -400,6 +423,8 @@ def test_compare_to_transformers(self, model_arch): torch.Tensor(ov_outputs.last_hidden_state), transformers_outputs.last_hidden_state, atol=1e-4 ) ) + del transformers_model + del ov_model gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -411,6 +436,8 @@ def test_pipeline(self, model_arch): outputs = pipe("My Name is Arthur and I live in Lyon.") self.assertEqual(pipe.device, model.device) self.assertTrue(all(all(isinstance(item, float) for item in row) for row in outputs[0])) + del pipe + del model gc.collect() @@ -451,6 +478,8 @@ def test_compare_to_transformers(self, model_arch): transformers_outputs = transformers_model(**tokens) # Compare tensor outputs self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4)) + del transformers_model + del ov_model gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -466,6 +495,8 @@ def test_pipeline(self, model_arch): outputs = pipe("This is a sample", max_length=10) self.assertEqual(pipe.device, model.device) self.assertTrue(all("This is a sample" in item["generated_text"] for item in outputs)) + del pipe + del model gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -481,6 +512,8 @@ def test_multiple_inputs(self, model_arch): outputs = model.generate(**tokens, generation_config=generation_config) self.assertIsInstance(outputs, torch.Tensor) self.assertEqual(outputs.shape[0], 3) + del model + gc.collect() def test_model_and_decoder_same_device(self): model_id = MODEL_NAMES["gpt2"] @@ -489,6 +522,8 @@ def test_model_and_decoder_same_device(self): self.assertEqual(model._device, "TEST") # Verify that request is being reset self.assertEqual(model.request, None) + del model + gc.collect() def test_compare_with_and_without_past_key_values(self): model_id = MODEL_NAMES["gpt2"] @@ -518,6 +553,9 @@ def test_compare_with_and_without_past_key_values(self): f"With pkv latency: {with_pkv_timer.elapsed:.3f} ms, without pkv 
latency: {without_pkv_timer.elapsed:.3f} ms," f" speedup: {without_pkv_timer.elapsed / with_pkv_timer.elapsed:.3f}", ) + del model_with_pkv + del model_without_pkv + gc.collect() class OVModelForMaskedLMIntegrationTest(unittest.TestCase): @@ -527,7 +565,7 @@ class OVModelForMaskedLMIntegrationTest(unittest.TestCase): # "camembert", # "convbert", # "data2vec_text", - "deberta", + # "deberta", # "deberta_v2", "distilbert", "electra", @@ -538,7 +576,7 @@ class OVModelForMaskedLMIntegrationTest(unittest.TestCase): "roformer", "squeezebert", "xlm", - # "xlm_roberta", + "xlm_roberta", ) @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -560,6 +598,8 @@ def test_compare_to_transformers(self, model_arch): self.assertIsInstance(ov_outputs.logits, TENSOR_ALIAS_TO_TYPE[input_type]) # Compare tensor outputs self.assertTrue(torch.allclose(torch.Tensor(ov_outputs.logits), transformers_outputs.logits, atol=1e-4)) + del transformers_model + del ov_model gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -571,6 +611,8 @@ def test_pipeline(self, model_arch): outputs = pipe(f"This is a {tokenizer.mask_token}.") self.assertEqual(pipe.device, model.device) self.assertTrue(all(item["score"] > 0.0 for item in outputs)) + del pipe + del model gc.collect() @@ -613,6 +655,8 @@ def test_compare_to_transformers(self, model_arch): self.assertIsInstance(ov_outputs.logits, TENSOR_ALIAS_TO_TYPE[input_type]) # Compare tensor outputs self.assertTrue(torch.allclose(torch.Tensor(ov_outputs.logits), transformers_outputs.logits, atol=1e-4)) + del transformers_model + del ov_model gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -625,6 +669,8 @@ def test_pipeline(self, model_arch): self.assertEqual(pipe.device, model.device) self.assertGreaterEqual(outputs[0]["score"], 0.0) self.assertTrue(isinstance(outputs[0]["label"], str)) + del model + del pipe gc.collect() @parameterized.expand(TIMM_MODELS) @@ -706,6 +752,8 @@ def test_compare_to_transformers(self, model_arch): transformers_outputs = transformers_model(**tokens, **decoder_inputs) # Compare tensor outputs self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4)) + del transformers_model + del ov_model gc.collect() @@ -738,7 +786,8 @@ def test_pipeline(self, model_arch): outputs = pipe(text) self.assertEqual(pipe.device, model.device) self.assertIsInstance(outputs[0]["translation_text"], str) - + del pipe + del model gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -758,6 +807,7 @@ def test_generate_utils(self, model_arch): outputs = model.generate(input_ids=tokens["input_ids"]) outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True) self.assertIsInstance(outputs[0], str) + del model gc.collect() @@ -789,6 +839,9 @@ def test_compare_with_and_without_past_key_values(self): f"With pkv latency: {with_pkv_timer.elapsed:.3f} ms, without pkv latency: {without_pkv_timer.elapsed:.3f} ms," f" speedup: {without_pkv_timer.elapsed / with_pkv_timer.elapsed:.3f}", ) + del model_with_pkv + del model_without_pkv + gc.collect() class OVModelForAudioClassificationIntegrationTest(unittest.TestCase): @@ -834,6 +887,10 @@ def test_compare_to_transformers(self, model_arch): # Compare tensor outputs self.assertTrue(torch.allclose(torch.Tensor(ov_outputs.logits), transformers_outputs.logits, atol=1e-3)) + del transformers_model + del ov_model + gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] @@ -843,6 +900,9 @@ def 
test_pipeline(self, model_arch): outputs = pipe([np.random.random(16000)]) self.assertEqual(pipe.device, model.device) self.assertTrue(all(item["score"] > 0.0 for item in outputs[0])) + del pipe + del model + gc.collect() class OVModelForCTCIntegrationTest(unittest.TestCase): @@ -896,6 +956,8 @@ def test_compare_to_transformers(self, model_arch): # compare tensor outputs self.assertTrue(torch.allclose(torch.Tensor(ov_outputs.logits), transformers_outputs.logits, atol=1e-4)) + del transformers_model + del ov_model gc.collect() @@ -948,6 +1010,8 @@ def test_compare_to_transformers(self, model_arch): torch.allclose(torch.Tensor(ov_outputs.embeddings), transformers_outputs.embeddings, atol=1e-4) ) + del transformers_model + del ov_model gc.collect() @@ -997,4 +1061,6 @@ def test_compare_to_transformers(self, model_arch): # compare tensor outputs self.assertTrue(torch.allclose(torch.Tensor(ov_outputs.logits), transformers_outputs.logits, atol=1e-4)) + del transformers_model + del ov_model gc.collect() From d29a0c1385eb1780b565c3b8053dcb3823c25af2 Mon Sep 17 00:00:00 2001 From: eaidova Date: Wed, 23 Aug 2023 08:45:00 +0400 Subject: [PATCH 28/38] better workaround for nncf patch torch ops and apply review comments --- .github/workflows/test_openvino.yml | 6 +----- optimum/exporters/openvino/convert.py | 21 ++++++++++++++------- optimum/exporters/openvino/utils.py | 6 ++++++ 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 80ab12c2f0..cb58f412a6 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -35,8 +35,4 @@ jobs: pip install .[openvino,nncf,tests,diffusers] - name: Test with Pytest run: | - pytest tests/openvino/test_modeling.py - pytest tests/openvino/test_quantization.py - pytest tests/openvino/test_stable_diffusion.py - pytest tests/openvino/test_training_examples.py - pytest tests/openvino/test_training.py + pytest tests/openvino/ --ignore test_modeling_basic diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index eb8f20f6c9..1a0d77b357 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -70,14 +70,16 @@ def export( config ([`~exporters.onnx.config.OnnxConfig`]): The ONNX configuration associated with the exported model. output (`Path`): - Directory to store the exported ONNX model. + Directory to store the exported model. opset (`Optional[int]`, defaults to `None`): The version of the ONNX operator set to use. device (`str`, *optional*, defaults to `cpu`): - The device on which the ONNX model will be exported. Either `cpu` or `cuda`. Only PyTorch is supported for + The device on which the model will be exported. Either `cpu` or `cuda`. Only PyTorch is supported for export on CUDA devices. input_shapes (`Optional[Dict]`, defaults to `None`): - If specified, allows to use specific shapes for the example input provided to the ONNX exporter. + If specified, allows to use specific shapes for the example input provided to the exporter. + from_onnx (`bool`, defaults to False): + If set to True, model will be converted vie exporting to ONNX. Returns: `Tuple[List[str], List[str]]`: A tuple with an ordered list of the model's inputs, and the named inputs from @@ -166,16 +168,18 @@ def export_pytorch( model ([`PreTrainedModel`]): The model to export. config ([`~exporters.onnx.config.OnnxConfig`]): - The ONNX configuration associated with the exported model. 
+ The configuration associated with the exported model. opset (`int`): The version of the ONNX operator set to use. output (`Path`): - Directory to store the exported ONNX model. + Directory to store the exported model. device (`str`, defaults to `"cpu"`): - The device on which the ONNX model will be exported. Either `cpu` or `cuda`. Only PyTorch is supported for + The device on which the model will be exported. Either `cpu` or `cuda`. Only PyTorch is supported for export on CUDA devices. input_shapes (`optional[Dict]`, defaults to `None`): - If specified, allows to use specific shapes for the example input provided to the ONNX exporter. + If specified, allows to use specific shapes for the example input provided to the exporter. + from_onnx (`bool`, defaults to False): + If set to True, model will be converted vie exporting to ONNX. Returns: `Tuple[List[str], List[str]]`: A tuple with an ordered list of the model's inputs, and the named inputs from @@ -225,6 +229,9 @@ def export_pytorch( input_info = get_input_shapes(dummy_inputs, inputs) custom_patcher = type(config).patch_model_for_export != OnnxConfig.patch_model_for_export try: + # TorchScript used behaind OpenVINO conversion. Optimum supports only return_dict=True models for patching, + # while TorchScript do not support dictionary with values of mixed types (e.g. Tensor and None) in model input/output + # To handle it, additional wrapper on patcher forward applied. if custom_patcher or dict_inputs: patcher = config.patch_model_for_export(model, model_kwargs=model_kwargs) patched_forward = patcher.patched_forward diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index 9b1867ba83..eafb8da62f 100644 --- a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -16,6 +16,8 @@ from openvino.runtime import PartialShape +from ...intel.utils.import_utils import is_nncf_available + if is_torch_available(): import torch @@ -79,3 +81,7 @@ def clear_class_registry(): torch._C._jit_clear_class_registry() torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() torch.jit._state._clear_class_state() + if is_nncf_available(): + from nncf.torch import patch_torch_operators + + patch_torch_operators() From 6ccb6d77be5299f4ccd5448f8edad39ac544c610 Mon Sep 17 00:00:00 2001 From: eaidova Date: Wed, 23 Aug 2023 08:55:07 +0400 Subject: [PATCH 29/38] remove flag from_onnx --- optimum/exporters/openvino/__init__.py | 2 +- optimum/exporters/openvino/convert.py | 10 +--------- optimum/intel/openvino/quantization.py | 11 +++-------- optimum/intel/openvino/trainer.py | 2 +- 4 files changed, 6 insertions(+), 19 deletions(-) diff --git a/optimum/exporters/openvino/__init__.py b/optimum/exporters/openvino/__init__.py index 9dc8b1833d..d87d8dda9e 100644 --- a/optimum/exporters/openvino/__init__.py +++ b/optimum/exporters/openvino/__init__.py @@ -1,5 +1,5 @@ from .__main__ import main_export -from .convert import export, export_models +from .convert import export, export_models, export_pytorch_via_onnx __all__ = ["main_export", "export", "export_models"] diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 1a0d77b357..54482d3dc1 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -59,7 +59,6 @@ def export( device: str = "cpu", input_shapes: Optional[Dict] = None, model_kwargs: Optional[Dict[str, Any]] = None, - from_onnx: bool = False, ) -> Tuple[List[str], List[str]]: """ Exports a Pytorch 
or TensorFlow model to an OpenVINO Intermediate Representation. @@ -78,8 +77,6 @@ def export( export on CUDA devices. input_shapes (`Optional[Dict]`, defaults to `None`): If specified, allows to use specific shapes for the example input provided to the exporter. - from_onnx (`bool`, defaults to False): - If set to True, model will be converted vie exporting to ONNX. Returns: `Tuple[List[str], List[str]]`: A tuple with an ordered list of the model's inputs, and the named inputs from @@ -103,7 +100,6 @@ def export( device=device, input_shapes=input_shapes, model_kwargs=model_kwargs, - from_onnx=from_onnx, ) elif is_tf_available() and issubclass(type(model), TFPreTrainedModel): @@ -133,6 +129,7 @@ def export_pytorch_via_onnx( ): import torch + output = Path(output) orig_torch_onnx_export = torch.onnx.export torch.onnx.export = functools.partial(orig_torch_onnx_export, do_constant_folding=False) model.config.torchscript = False @@ -159,7 +156,6 @@ def export_pytorch( device: str = "cpu", input_shapes: Optional[Dict] = None, model_kwargs: Optional[Dict[str, Any]] = None, - from_onnx: bool = False, ) -> Tuple[List[str], List[str]]: """ Exports a PyTorch model to an OpenVINO Intermediate Representation. @@ -178,8 +174,6 @@ def export_pytorch( export on CUDA devices. input_shapes (`optional[Dict]`, defaults to `None`): If specified, allows to use specific shapes for the example input provided to the exporter. - from_onnx (`bool`, defaults to False): - If set to True, model will be converted vie exporting to ONNX. Returns: `Tuple[List[str], List[str]]`: A tuple with an ordered list of the model's inputs, and the named inputs from @@ -190,8 +184,6 @@ def export_pytorch( logger.info(f"Using framework PyTorch: {torch.__version__}") output = Path(output) - if from_onnx: - return export_pytorch_via_onnx(model, config, opset, output, device, input_shapes, model_kwargs) with torch.no_grad(): model.config.torchscript = False diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 708be320d4..c758675c97 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -37,7 +37,7 @@ from optimum.exporters.tasks import TasksManager from optimum.quantization_base import OptimumQuantizer -from ...exporters.openvino import export +from ...exporters.openvino import export, export_pytorch_via_onnx from ..utils.constant import _TASK_ALIASES from ..utils.modeling_utils import patch_decoder_attention_mask from .configuration import OVConfig @@ -389,15 +389,10 @@ def _quantize_torchmodel( model_path = save_directory / (onnx_file_name if quantization_config.save_onnx_model else ov_file_name) onnx_path = save_directory / onnx_file_name + export_fn = export if not quantization_config.save_onnx_model else export_pytorch_via_onnx opset = min(onnx_config.DEFAULT_ONNX_OPSET, MAX_ONNX_OPSET) opset = max(opset, MIN_ONNX_QDQ_OPSET) - _, _, is_onnx = export( - model=model, - config=onnx_config, - output=model_path, - opset=opset, - from_onnx=quantization_config.save_onnx_model, - ) + _, _, is_onnx = export_fn(model=model, config=onnx_config, output=model_path, opset=opset) if is_onnx: # Load and save the compressed model model = core.read_model(onnx_path) diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py index e09293739f..2935c20dbf 100644 --- a/optimum/intel/openvino/trainer.py +++ b/optimum/intel/openvino/trainer.py @@ -752,7 +752,7 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None): try: # OpenVINO 
IR pruning requires static-shaped input ov_model = self._reshape_ir(ov_model, static_shape=True) - apply_moc_transformations(ov_model) + apply_moc_transformations(ov_model, cf=False) if self._get_compression_controller_by_cls(QuantizationController) is not None: compress_quantize_weights_transformation(ov_model) apply_pruning_transformation(ov_model) From 8775ab2ce8fc469023cdf9f4d692ab605cd885d5 Mon Sep 17 00:00:00 2001 From: Aidova Date: Wed, 30 Aug 2023 17:44:03 +0400 Subject: [PATCH 30/38] refactoring --- optimum/exporters/openvino/convert.py | 23 ++++++++++++++++------ optimum/exporters/openvino/utils.py | 4 ---- optimum/intel/openvino/modeling_base.py | 8 ++++---- optimum/intel/openvino/modeling_decoder.py | 11 +++++------ optimum/intel/openvino/quantization.py | 1 + 5 files changed, 27 insertions(+), 20 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 54482d3dc1..8ec6576796 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -25,7 +25,7 @@ from openvino.runtime.utils.types import get_element_type from openvino.tools.ovc import convert_model from optimum.exporters.onnx.base import OnnxConfig -from optimum.exporters.onnx.convert import check_dummy_inputs_are_allowed, export_tensorflow +from optimum.exporters.onnx.convert import check_dummy_inputs_are_allowed, export_tensorflow as export_tensorflow_onnx from optimum.exporters.onnx.convert import export_pytorch as export_pytorch_to_onnx from optimum.utils import is_diffusers_available @@ -118,6 +118,18 @@ def export( ) +def export_tensorflow(model: Union["PreTrainedModel", "ModelMixin"], config: OnnxConfig, opset: int, output: Path): + onnx_path = Path(output).with_suffix(".onnx") + input_names, output_names = export_tensorflow_onnx(model, config, opset, onnx_path) + ov_model = convert_model(str(onnx_path)) + save_model( + ov_model, + output.parent / output, + compress_to_fp16=False, + ) + return input_names, output_names, True + + def export_pytorch_via_onnx( model: Union["PreTrainedModel", "ModelMixin"], config: OnnxConfig, @@ -221,9 +233,10 @@ def export_pytorch( input_info = get_input_shapes(dummy_inputs, inputs) custom_patcher = type(config).patch_model_for_export != OnnxConfig.patch_model_for_export try: - # TorchScript used behaind OpenVINO conversion. Optimum supports only return_dict=True models for patching, + # TorchScript used behind OpenVINO conversion. Optimum supports only return_dict=True models for patching, # while TorchScript do not support dictionary with values of mixed types (e.g. Tensor and None) in model input/output # To handle it, additional wrapper on patcher forward applied. 
+ # model.config.torchscript = True cannot be used for patching, because it overrides return_dict to False if custom_patcher or dict_inputs: patcher = config.patch_model_for_export(model, model_kwargs=model_kwargs) patched_forward = patcher.patched_forward @@ -248,7 +261,6 @@ def ts_patched_forward(*args, **kwargs): except Exception as ex: logger.warning(f"Export model to OpenVINO directly failed with: \n{ex}.\nModel will be exported to ONNX") return export_pytorch_via_onnx(model, config, opset, output, device, input_shapes, model_kwargs) - clear_class_registry() ordered_dummy_inputs = {param: dummy_inputs[param] for param in sig.parameters if param in dummy_inputs} ordered_input_names = list(inputs) flatten_inputs = flattenize_inputs(ordered_dummy_inputs.values()) @@ -269,9 +281,8 @@ def ts_patched_forward(*args, **kwargs): inp_tensor.get_node().set_partial_shape(static_shape) inp_tensor.get_node().set_element_type(get_element_type(inp_data.cpu().numpy().dtype)) ov_model.validate_nodes_and_infer_types() - save_model( - ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output, compress_to_fp16=False - ) + save_model(ov_model, output, compress_to_fp16=False) + clear_class_registry() del model gc.collect() return input_names, output_names, False diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index eafb8da62f..af9951a8f4 100644 --- a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -81,7 +81,3 @@ def clear_class_registry(): torch._C._jit_clear_class_registry() torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() torch.jit._state._clear_class_state() - if is_nncf_available(): - from nncf.torch import patch_torch_operators - - patch_torch_operators() diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index b06670dffa..0fba2e8d3e 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -27,10 +27,10 @@ from optimum.exporters.onnx import OnnxConfig from optimum.exporters.tasks import TasksManager +from optimum.exporters.onnx.base import OnnxConfig from optimum.modeling_base import OptimizedModel from ...exporters.openvino import export -from ...exporters.openvino.utils import is_torch_model from ..utils.import_utils import is_transformers_version from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME @@ -305,18 +305,18 @@ def _to_onnx_to_load( save_dir = TemporaryDirectory() save_dir_path = Path(save_dir.name) - # Export the model to the ONNX format + # Export the model to the OpenVINO IR format export( model=model, config=onnx_config, opset=onnx_config.DEFAULT_ONNX_OPSET, - output=save_dir_path / ONNX_WEIGHTS_NAME, + output=save_dir_path / OV_XML_FILE_NAME, ) return cls._from_pretrained( model_id=save_dir_path, config=config, - from_onnx=not is_torch_model(model), + from_onnx=False, use_auth_token=use_auth_token, revision=revision, force_download=force_download, diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 966bec52f4..a9cd8e309b 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -31,11 +31,10 @@ from optimum.utils import NormalizedConfigManager from ...exporters.openvino import export -from ...exporters.openvino.utils import is_torch_model from ..utils.import_utils import is_transformers_version from ..utils.modeling_utils import patch_decoder_attention_mask from
.modeling import _TOKENIZER_FOR_DOC, INPUTS_DOCSTRING, MODEL_START_DOCSTRING, OVModel -from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, STR_TO_OV_TYPE +from .utils import OV_XML_FILE_NAME, STR_TO_OV_TYPE if is_transformers_version("<", "4.25.0"): @@ -235,18 +234,18 @@ def _from_transformers( # TODO : create ModelPatcher to patch each architecture model = patch_decoder_attention_mask(model) - # Export the model to the ONNX format - export(model=model, config=onnx_config, output=save_dir_path / ONNX_WEIGHTS_NAME) + # Export the model to the OpenVINO IR format + export(model=model, config=onnx_config, output=save_dir_path / OV_XML_FILE_NAME) return cls._from_pretrained( model_id=save_dir_path, config=config, - from_onnx=not is_torch_model(model), + from_onnx=False, use_auth_token=use_auth_token, revision=revision, force_download=force_download, cache_dir=cache_dir, - file_name=ONNX_WEIGHTS_NAME if not is_torch_model(model) else OV_XML_FILE_NAME, + file_name=OV_XML_FILE_NAME, local_files_only=local_files_only, use_cache=use_cache, **kwargs, diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index c758675c97..a56f5222ba 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -396,6 +396,7 @@ def _quantize_torchmodel( if is_onnx: # Load and save the compressed model model = core.read_model(onnx_path) + # The model requires a second save to apply the weight compression transformations self._save_pretrained(model, output_path) # if onnx conversion happens as fallback for pytorch conversion, remove onnx model if not quantization_config.save_onnx_model: From 0cd1c028338809a0b9f822f8e64f544fd44da497 Mon Sep 17 00:00:00 2001 From: Aidova Date: Wed, 30 Aug 2023 20:29:05 +0400 Subject: [PATCH 31/38] docstrings and typehints --- optimum/exporters/openvino/convert.py | 69 ++++++++++++++++++++++-- optimum/exporters/openvino/utils.py | 70 ++++++++++++++++++++++--- optimum/intel/openvino/modeling_base.py | 8 +-- optimum/intel/utils/modeling_utils.py | 12 ++++- setup.py | 2 +- 5 files changed, 145 insertions(+), 16 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 8ec6576796..5bcbf95088 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -25,8 +25,9 @@ from openvino.runtime.utils.types import get_element_type from openvino.tools.ovc import convert_model from optimum.exporters.onnx.base import OnnxConfig -from optimum.exporters.onnx.convert import check_dummy_inputs_are_allowed, export_tensorflow as export_tensorflow_onnx +from optimum.exporters.onnx.convert import check_dummy_inputs_are_allowed from optimum.exporters.onnx.convert import export_pytorch as export_pytorch_to_onnx +from optimum.exporters.onnx.convert import export_tensorflow as export_tensorflow_onnx from optimum.utils import is_diffusers_available @@ -119,6 +120,20 @@ def export( def export_tensorflow(model: Union["PreTrainedModel", "ModelMixin"], config: OnnxConfig, opset: int, output: Path): + """ + Export the TensorFlow model to OpenVINO format. + + Args: + model (Union["PreTrainedModel", "ModelMixin"]): The model to export. + config (OnnxConfig): The configuration of the model. + opset (int): The ONNX opset version to use. + output (Path): The path to save the model.
+ + Returns: + input_names: list of input names from ONNX configuration + output_names: list of output names from ONNX configuration + bool: True if the model was exported successfully. + """ onnx_path = Path(output).with_suffix(".onnx") input_names, output_names = export_tensorflow_onnx(model, config, opset, onnx_path) ov_model = convert_model(str(onnx_path)) @@ -139,6 +154,30 @@ def export_pytorch_via_onnx( input_shapes: Optional[Dict] = None, model_kwargs: Optional[Dict[str, Any]] = None, ): + """ + Exports a PyTorch model to an OpenVINO Intermediate Representation via ONNX export. + + Args: + model ([`PreTrainedModel`]): + The model to export. + config ([`~exporters.onnx.config.OnnxConfig`]): + The configuration associated with the exported model. + opset (`int`): + The version of the ONNX operator set to use. + output (`Path`): + Directory to store the exported model. + device (`str`, defaults to `"cpu"`): + The device on which the model will be exported. Either `cpu` or `cuda`. Only PyTorch is supported for + export on CUDA devices. + input_shapes (`optional[Dict]`, defaults to `None`): + If specified, allows to use specific shapes for the example input provided to the exporter. + model_kwargs (optional[Dict[str, Any]], defaults to `None`): + Additional kwargs for model export + + Returns: + `Tuple[List[str], List[str], bool]`: A tuple with an ordered list of the model's inputs, the named inputs from + the ONNX configuration, and a boolean flag indicating whether the legacy ONNX export path was applied to the model. + """ import torch output = Path(output) @@ -186,10 +225,12 @@ def export_pytorch( export on CUDA devices. input_shapes (`optional[Dict]`, defaults to `None`): If specified, allows to use specific shapes for the example input provided to the exporter. + model_kwargs (optional[Dict[str, Any]], defaults to `None`): + Additional kwargs for model export Returns: - `Tuple[List[str], List[str]]`: A tuple with an ordered list of the model's inputs, and the named inputs from - the ONNX configuration. + `Tuple[List[str], List[str], bool]`: A tuple with an ordered list of the model's inputs, the named inputs from + the ONNX configuration, and a boolean flag indicating whether the legacy ONNX export path was applied to the model. """ import torch from torch.utils._pytree import tree_map @@ -299,6 +340,28 @@ def export_models( input_shapes: Optional[Dict] = None, model_kwargs: Optional[Dict[str, Any]] = None, ) -> Tuple[List[List[str]], List[List[str]]]: + """ + Export the models to OpenVINO IR format. + + Args: + models_and_onnx_configs (Dict[str, Tuple[Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"], "OnnxConfig"]]): + output_dir (Path): output directory for saving models + opset (Optional[int], optional, Defaults to None): ONNX export opset + output_names (Optional[List[str]], optional, Defaults to None): model output names + device (str, optional, Defaults to "cpu"): + The device on which the model will be exported. Either `cpu` or `cuda`. Only PyTorch is supported for + export on CUDA devices. + input_shapes (Optional[Dict], optional, Defaults to None): + If specified, allows to use specific shapes for the example input provided to the exporter.
+ model_kwargs (Optional[Dict[str, Any]], optional): + Additional kwargs for model export + + Raises: + ValueError: if custom names set not equal of number of models + + Returns: + list of input_names and output_names from ONNX configuration + """ outputs = [] if output_names is not None and len(output_names) != len(models_and_onnx_configs): diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index af9951a8f4..ebd7ec646e 100644 --- a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -12,25 +12,48 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Any, Dict, List, Tuple, Union + from transformers.utils import is_torch_available from openvino.runtime import PartialShape - -from ...intel.utils.import_utils import is_nncf_available +from optimum.utils import is_diffusers_available if is_torch_available(): import torch import torch.nn as nn + from transformers.modeling_utils import PreTrainedModel + +if is_diffusers_available(): + from diffusers import ModelMixin + +def is_torch_model(model: Union["PreTrainedModel", "ModelMixin"]): + """ + Checks whether the model is a torch model. -def is_torch_model(model): + Args: + model (Union[PretrainedModel, ModelMixin]): The model to check. + + Returns: + bool: True if the model is a torch model. + """ if not is_torch_available(): return False return isinstance(model, nn.Module) -def flattenize_inputs(inputs): +def flattenize_inputs(inputs: List[Any]): + """ + Flatten the inputs into a list. + + Args: + inputs (List[Any]): The inputs to flatten. + + Returns: + List[Any]: The flattened inputs. + """ flatten_inputs = [] for input_data in inputs: if input_data is None: @@ -42,8 +65,27 @@ def flattenize_inputs(inputs): return flatten_inputs -def remove_none_from_dummy_inputs(dummy_inputs): - def remove_none_from_list_tuple(item): +def remove_none_from_dummy_inputs(dummy_inputs: Dict[str, Any]): + """ + Removes None values from the dictionary. + + Args: + dummy_inputs (Dict[str, Any]): Dictionary with None values. + Returns: + upd_dummy (Dict[str, Any]): updated dictionary with removed None values + dict_dummy (List[Tuple[str, List[str]]]): list of inputs represented as dictionary provided as pair name and list of nested keys + """ + + def remove_none_from_list_tuple(item: Union[List[Any], Tuple[Any]]): + """ + Removes None values from a list or tuple. + + Args: + item (list or tuple): The list or tuple to remove None values from. + + Returns: + list or tuple: The list or tuple with None values removed. + """ new_item = [i for i in item if i is not None] return type(item)(new_item) @@ -63,7 +105,18 @@ def remove_none_from_list_tuple(item): return upd_dummy, dict_dummy -def get_input_shapes(dummy_inputs, inputs): +def get_input_shapes(dummy_inputs: Dict[str, Any], inputs: Dict[str, Any]): + """ + Resolves input shapes based on dynamic axes from input config and dummy input shapes + + Args: + dummy_inputs (Dict[str, Any]): A dictionary of dummy inputs. + inputs (Dict[str, Any]): A dictionary of input tensors. 
+ + Returns: + input_info: List of input info for conversion + + """ input_info = [] for input_name, data in dummy_inputs.items(): if isinstance(data, (tuple, list, dict)): @@ -78,6 +131,9 @@ def get_input_shapes(dummy_inputs, inputs): def clear_class_registry(): + """ + Removes Torchscript cached modules + """ torch._C._jit_clear_class_registry() torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() torch.jit._state._clear_class_state() diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 0fba2e8d3e..b06670dffa 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -27,10 +27,10 @@ from optimum.exporters.onnx import OnnxConfig from optimum.exporters.tasks import TasksManager -from optimum.exporters.onnx.base import OnnxConfig from optimum.modeling_base import OptimizedModel from ...exporters.openvino import export +from ...exporters.openvino.utils import is_torch_model from ..utils.import_utils import is_transformers_version from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME @@ -305,18 +305,18 @@ def _to_onnx_to_load( save_dir = TemporaryDirectory() save_dir_path = Path(save_dir.name) - # Export the model to the OpenVINO IR format + # Export the model to the ONNX format export( model=model, config=onnx_config, opset=onnx_config.DEFAULT_ONNX_OPSET, - output=save_dir_path / OV_XML_FILE_NAME, + output=save_dir_path / ONNX_WEIGHTS_NAME, ) return cls._from_pretrained( model_id=save_dir_path, config=config, - from_onnx=False, + from_onnx=not is_torch_model(model), use_auth_token=use_auth_token, revision=revision, force_download=force_download, diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py index 924c65d10a..f11aadd806 100644 --- a/optimum/intel/utils/modeling_utils.py +++ b/optimum/intel/utils/modeling_utils.py @@ -15,6 +15,7 @@ from typing import Tuple import torch +from transformers.modeling_utils import PreTrainedModel # Modified from transformers.models.bloom.modeling_bloom._make_causal_mask @@ -91,7 +92,16 @@ def _prepare_decoder_attention_mask(attention_mask, input_shape, inputs_embeds, return combined_attention_mask -def patch_decoder_attention_mask(model): +def patch_decoder_attention_mask(model: "PreTrainedModel"): + """ + Apply patch on decoder with past model forward to resolve first inference based on model architecture + + Args: + model (PretrainedModel): The model to patch. 
+ + Returns: + model with applied patch + """ if model.config.model_type == "bloom": model.transformer._prepare_attn_mask = _prepare_attn_mask elif model.config.model_type == "llama": diff --git a/setup.py b/setup.py index 4812ede91a..cee7315781 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ "onnxruntime<1.15.0", ], "openvino": ["openvino==2023.1.0.dev20230811", "onnx", "onnxruntime"], - "nncf": ["nncf @ git+https://github.com/openvinotoolkit/nncf.git"], + "nncf": ["nncf @ git+https://github.com/openvinotoolkit/nncf.git", "transformers<4.32.0"], "ipex": ["transformers<4.32.0", "intel-extension-for-pytorch", "onnx"], "diffusers": ["diffusers", "invisible-watermark>=0.2.0"], "quality": QUALITY_REQUIRE, From 1985e21165c420d67673c930042918bbe95f9bbc Mon Sep 17 00:00:00 2001 From: eaidova Date: Mon, 4 Sep 2023 20:34:58 +0400 Subject: [PATCH 32/38] small fixes --- optimum/intel/openvino/modeling_base.py | 8 ++++---- optimum/intel/openvino/quantization.py | 3 +++ optimum/intel/openvino/trainer.py | 4 ++-- setup.py | 2 +- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index b06670dffa..5ed250ff8d 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -278,7 +278,7 @@ def _from_transformers( onnx_config = onnx_config_class(model.config) - return cls._to_onnx_to_load( + return cls._to_load( model=model, config=config, onnx_config=onnx_config, @@ -290,7 +290,7 @@ def _from_transformers( ) @classmethod - def _to_onnx_to_load( + def _to_load( cls, model: PreTrainedModel, config: PretrainedConfig, @@ -310,13 +310,13 @@ def _to_onnx_to_load( model=model, config=onnx_config, opset=onnx_config.DEFAULT_ONNX_OPSET, - output=save_dir_path / ONNX_WEIGHTS_NAME, + output=save_dir_path / OV_XML_FILE_NAME, ) return cls._from_pretrained( model_id=save_dir_path, config=config, - from_onnx=not is_torch_model(model), + from_onnx=False, use_auth_token=use_auth_token, revision=revision, force_download=force_download, diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index a56f5222ba..3349ce142f 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -359,6 +359,9 @@ def _quantize_torchmodel( else Path(ov_file_name).with_suffix(".onnx") ) if weights_only: + if getattr(self.model.config, "tie_word_embeddings", True): + # to fix problem with shared embedding weights in nncf compress_weights() + self.model.tie_weights() compressed_model = compress_weights(self.model) self.model = compressed_model else: diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py index 2935c20dbf..0bba054ad3 100644 --- a/optimum/intel/openvino/trainer.py +++ b/optimum/intel/openvino/trainer.py @@ -134,7 +134,7 @@ def remap(value): with torch.no_grad(): model.eval() # Disable node additions to be exported in the graph - model.disable_dynamic_graph_building() + model.nncf.disable_dynamic_graph_building() onnx_export( model, model_inputs, @@ -145,7 +145,7 @@ def remap(value): do_constant_folding=True, opset_version=opset, ) - model.enable_dynamic_graph_building() + model.nncf.enable_dynamic_graph_building() class OVTrainer(Trainer): diff --git a/setup.py b/setup.py index cee7315781..9d596442dd 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ "onnxruntime<1.15.0", ], "openvino": ["openvino==2023.1.0.dev20230811", "onnx", "onnxruntime"], - 
"nncf": ["nncf @ git+https://github.com/openvinotoolkit/nncf.git", "transformers<4.32.0"], + "nncf": ["nncf @ git+https://github.com/openvinotoolkit/nncf.gitt@release_v260"], "ipex": ["transformers<4.32.0", "intel-extension-for-pytorch", "onnx"], "diffusers": ["diffusers", "invisible-watermark>=0.2.0"], "quality": QUALITY_REQUIRE, From 6857029fcb042191472f23d62253fc0e8b754a44 Mon Sep 17 00:00:00 2001 From: eaidova Date: Mon, 4 Sep 2023 20:40:07 +0400 Subject: [PATCH 33/38] add docstring to main_export --- optimum/exporters/openvino/__main__.py | 64 +++++++++++++++++++++++++ optimum/intel/openvino/modeling_base.py | 1 - setup.py | 2 +- 3 files changed, 65 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 2c3428aa0c..d6dae040de 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -57,6 +57,70 @@ def main_export( fn_get_submodels: Optional[Callable] = None, **kwargs_shapes, ): + """ + Full-suite OpenVINO export. + + Args: + > Required parameters + + model_name_or_path (`str`): + Model ID on huggingface.co or path on disk to the model repository to export. + output (`Union[str, Path]`): + Path indicating the directory where to store the generated ONNX model. + + > Optional parameters + + task (`Optional[str]`, defaults to `None`): + The task to export the model for. If not specified, the task will be auto-inferred based on the model. For decoder models, + use `xxx-with-past` to export the model using past key values in the decoder. + device (`str`, defaults to `"cpu"`): + The device to use to do the export. Defaults to "cpu". + fp16 (`Optional[bool]`, defaults to `"False"`): + Use half precision during the export. PyTorch-only, requires `device="cuda"`. + framework (`Optional[str]`, defaults to `None`): + The framework to use for the ONNX export (`"pt"` or `"tf"`). If not provided, will attempt to automatically detect + the framework for the checkpoint. + cache_dir (`Optional[str]`, defaults to `None`): + Path indicating where to store cache. The default Hugging Face cache path will be used by default. + trust_remote_code (`bool`, defaults to `False`): + Allows to use custom code for the modeling hosted in the model repository. This option should only be set for repositories + you trust and in which you have read the code, as it will execute on your local machine arbitrary code present in the + model repository. + pad_token_id (`Optional[int]`, defaults to `None`): + This is needed by some models, for some tasks. If not provided, will attempt to use the tokenizer to guess it. + subfolder (`str`, defaults to `""`): + In case the relevant files are located inside a subfolder of the model repo either locally or on huggingface.co, you can + specify the folder name here. + revision (`str`, defaults to `"main"`): + Revision is the specific model version to use. It can be a branch name, a tag name, or a commit id. + force_download (`bool`, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + local_files_only (`Optional[bool]`, defaults to `False`): + Whether or not to only look at local files (i.e., do not try to download the model). + use_auth_token (`Optional[str]`, defaults to `None`): + The token to use as HTTP bearer authorization for remote files. 
If `True`, will use the token generated + when running `transformers-cli login` (stored in `~/.huggingface`). + model_kwargs (`Optional[Dict[str, Any]]`, defaults to `None`): + Experimental usage: keyword arguments to pass to the model during + the export. This argument should be used along the `custom_onnx_configs` argument + in case, for example, the model inputs/outputs are changed (for example, if + `model_kwargs={"output_attentions": True}` is passed). + custom_onnx_configs (`Optional[Dict[str, OnnxConfig]]`, defaults to `None`): + Experimental usage: override the default ONNX config used for the given model. This argument may be useful for advanced users that desire a finer-grained control on the export. An example is available [here](https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model). + fn_get_submodels (`Optional[Callable]`, defaults to `None`): + Experimental usage: Override the default submodels that are used at the export. This is + especially useful when exporting a custom architecture that needs to split the ONNX (e.g. encoder-decoder). If unspecified with custom models, optimum will try to use the default submodels used for the given task, with no guarantee of success. + **kwargs_shapes (`Dict`): + Shapes to use during inference. This argument allows to override the default shapes used during the ONNX export. + + Example usage: + ```python + >>> from optimum.exporters.openvino import main_export + + >>> main_export("gpt2", output="gpt2_onnx/") + ``` + """ output = Path(output) if not output.exists(): output.mkdir(parents=True) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 5ed250ff8d..8f8aa1526a 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -30,7 +30,6 @@ from optimum.modeling_base import OptimizedModel from ...exporters.openvino import export -from ...exporters.openvino.utils import is_torch_model from ..utils.import_utils import is_transformers_version from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME diff --git a/setup.py b/setup.py index 9d596442dd..e8efff3e54 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ "onnxruntime<1.15.0", ], "openvino": ["openvino==2023.1.0.dev20230811", "onnx", "onnxruntime"], - "nncf": ["nncf @ git+https://github.com/openvinotoolkit/nncf.gitt@release_v260"], + "nncf": ["nncf @ git+https://github.com/openvinotoolkit/nncf.git@release_v260"], "ipex": ["transformers<4.32.0", "intel-extension-for-pytorch", "onnx"], "diffusers": ["diffusers", "invisible-watermark>=0.2.0"], "quality": QUALITY_REQUIRE, From a127e4309af8b8fb67fa5f3fd3d2c39cff7431a0 Mon Sep 17 00:00:00 2001 From: Aidova Date: Tue, 5 Sep 2023 14:07:41 +0400 Subject: [PATCH 34/38] fix timm models --- optimum/intel/openvino/modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/openvino/modeling.py b/optimum/intel/openvino/modeling.py index 1cea230429..95fb0aca8b 100644 --- a/optimum/intel/openvino/modeling.py +++ b/optimum/intel/openvino/modeling.py @@ -549,7 +549,7 @@ def from_pretrained( model = TimmForImageClassification.from_pretrained(model_id, **kwargs) onnx_config = TimmOnnxConfig(model.config) - return cls._to_onnx_to_load( + return cls._to_load( model=model, config=config, onnx_config=onnx_config, From d96bf756c663fd6ac0c05dc0d1a744389bd095d5 Mon Sep 17 00:00:00 2001 From: eaidova Date: Wed, 13 Sep 2023 11:44:28 +0400 Subject: [PATCH 35/38] fix circular 
imports --- optimum/exporters/openvino/__main__.py | 4 +++- optimum/exporters/openvino/convert.py | 2 +- optimum/exporters/openvino/utils.py | 3 +++ optimum/intel/utils/modeling_utils.py | 2 +- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index d6dae040de..5cf0adb176 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -27,10 +27,11 @@ from optimum.utils import DEFAULT_DUMMY_SHAPES from optimum.utils.save_utils import maybe_save_preprocessors -from ...intel.openvino.utils import OV_XML_FILE_NAME from .convert import export_models +OV_XML_FILE_NAME = "openvino_model.xml" + logger = logging.getLogger(__name__) if is_torch_available(): @@ -219,6 +220,7 @@ def main_export( custom_onnx_configs=custom_onnx_configs if custom_onnx_configs is not None else {}, custom_architecture=custom_architecture, fn_get_submodels=fn_get_submodels, + _variant="default", ) if not is_stable_diffusion: diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 5bcbf95088..ab688f92fa 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -30,8 +30,8 @@ from optimum.exporters.onnx.convert import export_tensorflow as export_tensorflow_onnx from optimum.utils import is_diffusers_available -from ...intel.openvino.utils import OV_XML_FILE_NAME from .utils import ( + OV_XML_FILE_NAME, clear_class_registry, flattenize_inputs, get_input_shapes, diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index ebd7ec646e..f0d5366526 100644 --- a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -29,6 +29,9 @@ from diffusers import ModelMixin +OV_XML_FILE_NAME = "openvino_model.xml" + + def is_torch_model(model: Union["PreTrainedModel", "ModelMixin"]): """ Checks whether the model is a torch model. 
diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py index f11aadd806..17abf1059e 100644 --- a/optimum/intel/utils/modeling_utils.py +++ b/optimum/intel/utils/modeling_utils.py @@ -102,7 +102,7 @@ def patch_decoder_attention_mask(model: "PreTrainedModel"): Returns: model with applied patch """ - if model.config.model_type == "bloom": + if model.config.model_type in {"bloom", "mpt"}: model.transformer._prepare_attn_mask = _prepare_attn_mask elif model.config.model_type == "llama": model.model._prepare_decoder_attention_mask = _prepare_decoder_attention_mask From ab8be3ff38e53f036530fe0f19e6a539787f869b Mon Sep 17 00:00:00 2001 From: eaidova Date: Mon, 18 Sep 2023 14:15:49 +0400 Subject: [PATCH 36/38] update ov version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 69117feb22..d0f232e7d8 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ "onnx", "onnxruntime<1.15.0", ], - "openvino": ["openvino==2023.1.0.dev20230811", "onnx", "onnxruntime"], + "openvino": ["openvino>=2023.1.0", "onnx", "onnxruntime"], "nncf": ["nncf @ git+https://github.com/openvinotoolkit/nncf.git@release_v260"], "ipex": ["transformers<4.32.0", "intel-extension-for-pytorch", "onnx"], "diffusers": ["diffusers"], From 20d99e9c6b95ee4d615db095b7dae5a9adbf39e7 Mon Sep 17 00:00:00 2001 From: eaidova Date: Mon, 18 Sep 2023 14:26:19 +0400 Subject: [PATCH 37/38] revert excluding deberta --- tests/openvino/test_modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index f9fba267f8..a4bf9b38e0 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -566,7 +566,7 @@ class OVModelForMaskedLMIntegrationTest(unittest.TestCase): # "camembert", # "convbert", # "data2vec_text", - # "deberta", + "deberta", # "deberta_v2", "distilbert", "electra", From cce69f606e6cec3c46436dfa1c12adf33238e230 Mon Sep 17 00:00:00 2001 From: eaidova Date: Tue, 19 Sep 2023 08:03:58 +0400 Subject: [PATCH 38/38] update nncf on package --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d0f232e7d8..6d81b98b2a 100644 --- a/setup.py +++ b/setup.py @@ -43,7 +43,7 @@ "onnxruntime<1.15.0", ], "openvino": ["openvino>=2023.1.0", "onnx", "onnxruntime"], - "nncf": ["nncf @ git+https://github.com/openvinotoolkit/nncf.git@release_v260"], + "nncf": ["nncf>=2.6.0"], "ipex": ["transformers<4.32.0", "intel-extension-for-pytorch", "onnx"], "diffusers": ["diffusers"], "quality": QUALITY_REQUIRE,
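To tie the series together, here is a minimal usage sketch of the export entry point these commits converge on. It is illustrative only, not part of any patch above: it assumes the `main_export` signature documented in PATCH 33/38, assumes that a single-model export is written to `openvino_model.xml` (the `OV_XML_FILE_NAME` constant from PATCH 35/38), and uses a placeholder output directory together with the tiny test checkpoint referenced in the test suite above.

    # Sketch only: export a Transformers checkpoint directly to OpenVINO IR
    # (no intermediate ONNX file on the default path) and load it back.
    from pathlib import Path

    from openvino.runtime import Core
    from optimum.exporters.openvino import main_export

    output_dir = Path("bert_ov")  # placeholder output location
    # "hf-internal-testing/tiny-random-bert" is reused from the tests above;
    # the task is auto-inferred when not given explicitly.
    main_export("hf-internal-testing/tiny-random-bert", output=output_dir)

    core = Core()
    # File name assumed from the OV_XML_FILE_NAME default ("openvino_model.xml").
    ov_model = core.read_model(str(output_dir / "openvino_model.xml"))
    compiled = core.compile_model(ov_model, "CPU")
    print([inp.get_any_name() for inp in compiled.inputs])

If the direct TorchScript-based conversion fails for a given architecture, the series keeps the ONNX route as a fallback (the `export_pytorch_via_onnx` path guarded by the try/except in convert.py), so the same call is still expected to succeed, only with a temporary ONNX file in between.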