 from nncf.torch.initialization import PTInitializingDataLoader
 from openvino._offline_transformations import compress_quantize_weights_transformation
 from openvino.runtime import Core, Tensor
-from torch.utils.data import DataLoader, RandomSampler, TensorDataset
+from torch.utils.data import DataLoader, RandomSampler
 from transformers import DataCollator, PreTrainedModel, default_data_collator
 from transformers.pytorch_utils import Conv1D
 
 from optimum.exporters.tasks import TasksManager
 from optimum.quantization_base import OptimumQuantizer
 
+from ...exporters.openvino import export
 from ..utils.constant import _TASK_ALIASES
 from ..utils.modeling_utils import patch_decoder_attention_mask
 from .configuration import OVConfig
@@ -353,7 +354,7 @@ def _quantize_torchmodel(
                 "No configuration describing the quantization process was provided, a default OVConfig will be generated."
             )
             quantization_config = OVConfig()
-
+
         if weights_only:
             compressed_model = compress_weights(self.model)
             self.model = compressed_model
@@ -377,19 +378,18 @@ def _quantize_torchmodel(
         task = self.task
         model = self.model
         self.model.config.save_pretrained(save_directory)
-
+        model = patch_decoder_attention_mask(model)
         if task == "text-generation":
-            model = patch_decoder_attention_mask(model)
             onnx_config = onnx_config_class(model.config, use_past=model.config.use_cache)
         else:
             onnx_config = onnx_config_class(model.config)
 
-        model_path = save_directory / onnx_file_name if quantization_config.save_onnx_model else ov_file_name
+        model_path = save_directory / (onnx_file_name if quantization_config.save_onnx_model else ov_file_name)
         onnx_path = save_directory / onnx_file_name
         opset = min(onnx_config.DEFAULT_ONNX_OPSET, MAX_ONNX_OPSET)
         opset = max(opset, MIN_ONNX_QDQ_OPSET)
         _, _, is_onnx = export(
-            model=compressed_model,
+            model=model,
             config=onnx_config,
             output=model_path,
             opset=opset,
@@ -399,17 +399,8 @@ def _quantize_torchmodel(
             # Load and save the compressed model
             model = core.read_model(onnx_path)
             self._save_pretrained(model, output_path)
-<<<<<<< HEAD
-        else:
-            _, _, is_onnx = export(model=compressed_model, config=onnx_config, output=output_path)
-            if is_onnx:
-                onnx_path = output_path.replace(".xml", ".onnx")
-                model = core.read_model(onnx_path)
-                self._save_pretrained(model, output_path)
-=======
             # if onnx conversion happens as fallback for pytorch conversion, remove onnx model
             if not quantization_config.save_onnx_model:
->>>>>>> fix llama export in quantization flow
                 os.remove(onnx_path)
                 try:
                     os.remove(f"{onnx_path}_data")
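
For reference, a minimal sketch of how the fixed path would typically be exercised end to end. The `OVQuantizer.from_pretrained` / `quantize(..., weights_only=True)` entry points and the placeholder model path are assumptions here, not part of this diff; what the patch itself changes is the `export` call, the `patch_decoder_attention_mask` placement, and the ONNX cleanup branch.

```python
# Minimal sketch (assumed public entry points, placeholder model path — not shown in this diff):
# weight-only compression of a decoder model, exported to OpenVINO IR through _quantize_torchmodel.
from optimum.intel import OVQuantizer
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("path/to/llama-checkpoint")  # placeholder path
quantizer = OVQuantizer.from_pretrained(model, task="text-generation")

# weights_only=True takes the compress_weights() branch shown above, so no calibration dataset is needed.
quantizer.quantize(save_directory="llama-ov", weights_only=True)
```

When `quantization_config.save_onnx_model` is false, the intermediate ONNX file produced by the fallback export is removed once the IR has been written; that is the branch the last hunk untangles.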