From 1afee404b50070abd2a43253762aba476181c11a Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Thu, 14 Sep 2023 15:56:18 +0200
Subject: [PATCH 01/12] Added weight compression for Dolly 2.0

---
 .../240-dolly-2-instruction-following.ipynb | 192 +++++++++++++-----
 .../README.md                               |   1 +
 2 files changed, 138 insertions(+), 55 deletions(-)

diff --git a/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb b/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb
index 238d15aecf9..3600c7b868a 100644
--- a/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb
+++ b/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb
@@ -38,16 +38,17 @@
    "metadata": {},
    "source": [
     "### Table of content:\n",
-    "- [Prerequisites](#Prerequisites-Uparrow)\n",
-    " - [Select inference device](#Select-inference-device-Uparrow)\n",
-    "- [Download and Convert Model](#Download-and-Convert-Model-Uparrow)\n",
-    "- [Create an instruction-following inference pipeline](#Create-an-instruction-following-inference-pipeline-Uparrow)\n",
-    " - [Setup imports](#Setup-imports-Uparrow)\n",
-    " - [Prepare template for user prompt](#Prepare-template-for-user-prompt-Uparrow)\n",
-    " - [Helpers for output parsing](#Helpers-for-output-parsing-Uparrow)\n",
-    " - [Main generation function](#Main-generation-function-Uparrow)\n",
-    " - [Helpers for application](#Helpers-for-application-Uparrow)\n",
-    "- [Run instruction-following pipeline](#Run-instruction-following-pipeline-Uparrow)"
+    "- [Prerequisites](#Prerequisites-$\Uparrow$)\n",
+    " - [Select inference device](#Select-inference-device-$\Uparrow$)\n",
+    "- [Download and Convert Model](#Download-and-Convert-Model-$\Uparrow$)\n",
+    "- [NNCF model weights compression](#NNCF-model-weights-compression-$\Uparrow$)\n",
+    "- [Create an instruction-following inference pipeline](#Create-an-instruction-following-inference-pipeline-$\Uparrow$)\n",
+    " - [Setup imports](#Setup-imports-$\Uparrow$)\n",
+    " - [Prepare template for user prompt](#Prepare-template-for-user-prompt-$\Uparrow$)\n",
+    " - [Helpers for output parsing](#Helpers-for-output-parsing-$\Uparrow$)\n",
+    " - [Main generation function](#Main-generation-function-$\Uparrow$)\n",
+    " - [Helpers for application](#Helpers-for-application-$\Uparrow$)\n",
+    "- [Run instruction-following pipeline](#Run-instruction-following-pipeline-$\Uparrow$)"
    ]
   },
   {
@@ -63,26 +64,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "id": "4421fc85-bed6-4a62-b8fa-19c7ba474891",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m23.1.2\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m23.2\u001B[0m\n",
-      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\n",
-      "\n",
-      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m23.1.2\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m23.2\u001B[0m\n",
-      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "!pip install -q \"diffusers>=0.16.1\" \"transformers>=4.28.0\"\n",
-    "!pip install -q \"git+https://github.com/huggingface/optimum-intel.git\" datasets onnx onnxruntime gradio"
+    "!pip install -q \"git+https://github.com/huggingface/optimum-intel.git\" datasets onnx onnxruntime gradio\n",
+    "!pip install -q \"git+https://github.com/openvinotoolkit/nncf.git@release_v260\"\n",
+    "!pip install -q \"openvino==2023.1.0.dev20230811\" \"openvino_dev==2023.1.0.dev20230811\""
    ]
   },
   {
@@ -97,22 +87,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "id": "6ddd57de-9f41-403c-bccc-8d3118654a24",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "5bc9f8fc615a4cf7af5cb987afd0211d",
+       "model_id": "c940eca7b64742dbae2fcaf98667af98",
        "version_major": 2,
        "version_minor": 0
       },
       "text/plain": [
-       "Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO')"
+       "Dropdown(description='Device:', options=('CPU', 'GPU.0', 'GPU.1', 'GPU.2', 'AUTO'), value='AUTO')"
       ]
      },
-     "execution_count": 2,
+     "execution_count": 3,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -160,20 +150,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "id": "91f42296-627d-44ff-a1cb-936bb6f87992",
    "metadata": {},
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2023-07-17 14:47:00.308996: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
-      "2023-07-17 14:47:00.348466: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
-      "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
-      "2023-07-17 14:47:01.039895: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
-     ]
-    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino\n"
      ]
     },
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'\n",
-      "comet_ml is installed but `COMET_API_KEY` is not set.\n",
+      "No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda-11.7'\n",
+      "2023-09-14 15:39:32.055450: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
+      "2023-09-14 15:39:32.089487: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
+      "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+      "2023-09-14 15:39:32.706748: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
+      "/home/nsavel/venvs/ov_notebooks_tmp/lib/python3.8/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations\n",
+      "  warnings.warn(\n",
       "The argument `from_transformers` is deprecated, and will be removed in optimum 2.0. Use `export` instead\n",
       "Framework not specified. Using pt to export to ONNX.\n",
       "Using framework PyTorch: 1.13.1+cpu\n",
       "Overriding 1 configuration item(s)\n",
       "\t- use_cache -> True\n",
-      "/home/ea/work/notebooks_convert/notebooks_conv_env/lib/python3.8/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:504: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
+      "/home/nsavel/venvs/ov_notebooks_tmp/lib/python3.8/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:594: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
       "  assert batch_size > 0, \"batch_size has to be defined and > 0\"\n",
-      "/home/ea/work/notebooks_convert/notebooks_conv_env/lib/python3.8/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:270: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
+      "/home/nsavel/venvs/ov_notebooks_tmp/lib/python3.8/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:314: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
       "  if seq_len > self.max_seq_len_cached:\n",
+      "/home/nsavel/venvs/ov_notebooks_tmp/lib/python3.8/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:239: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
+      "  if key_length > self.bias.shape[-1]:\n",
-      "/home/ea/work/notebooks_convert/notebooks_conv_env/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py:74: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n",
+      "/home/nsavel/venvs/ov_notebooks_tmp/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py:74: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n",
       "  op1 = operator(*args, **kwargs)\n",
       "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n",
       "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n",
       "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n",
       "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n",
       "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n",
       "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n",
       "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n",
       "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n",
       "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n",
       "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n",
       "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n",
       "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n",
       "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n",
       "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n",
       "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n",
       "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n",
       "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n",
       "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n",
       "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n",
       "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n",
       "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n",
       "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n",
       "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n",
       "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n",
       "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n",
       "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n",
       "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n",
       "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n",
       "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n",
       "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n",
@@ -232,7 +219,7 @@
       "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n",
       "Saving external data to one file...\n",
       "Compiling the model...\n",
-      "Set CACHE_DIR to /tmp/tmpndw8_20n/model_cache\n"
+      "Set CACHE_DIR to /tmp/tmp3vew161f/model_cache\n"
      ]
     }
    ],
    "source": [
@@ -255,6 +242,101 @@
     "    ov_model.save_pretrained(model_path)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "5b1238c8-dcc9-4495-aeff-1ecbd8bd5082",
+   "metadata": {},
+   "source": [
+    "### NNCF model weights compression [$\Uparrow$](#Table-of-content:)\n",
+    "\n",
+    "NNCF [Weights Compression algorithm](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/CompressWeights.md) compresses weights of a model to `INT8`. This is an alternative to [Quantization algorithm](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/post_training/Quantization.md) that compresses both weights and activations. Weight compression is effective in optimizing footprint and performance of large models where the size of weights is significantly larger than the size of activations, for example, in Large Language Models (LLMs) such as Dolly 2.0. Additionally, Weight Compression usually leads to almost no accuracy drop."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "8e5c9e68-3772-432f-b231-f1163442357d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "6be9ab974c06454e81077fe735e7cb37",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Dropdown(description='Compression:', index=1, options=('Disable', 'Enable'), value='Enable')"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "to_compress = widgets.Dropdown(\n",
+    "    options=['Disable', 'Enable'],\n",
+    "    value='Enable',\n",
+    "    description='Compression:',\n",
+    "    disabled=False,\n",
+    ")\n",
+    "to_compress"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "392940e3-01da-4876-a9d1-2475ed3da882",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "* Original IR model size: 10590.42 MB\n",
+      "* Compressed IR model size: 2660.28 MB\n",
+      "* Model compression rate: 3.981\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Compiling the model...\n",
+      "Set CACHE_DIR to dolly-v2-3b_compressed/model_cache\n"
+     ]
+    }
+   ],
+   "source": [
+    "import nncf\n",
+    "import shutil\n",
+    "import openvino.runtime as ov\n",
+    "\n",
+    "compressed_model_path = Path(f'{model_path}_compressed') / 'openvino_model.xml'\n",
+    "\n",
+    "def compress_model(model):\n",
+    "    if not compressed_model_path.exists():\n",
+    "        if not compressed_model_path.parent.exists():\n",
+    "            compressed_model_path.parent.mkdir()\n",
+    "        compressed_model = nncf.compress_weights(model)\n",
+    "        ov.serialize(compressed_model, compressed_model_path)\n",
+    "        shutil.copy(model_path / 'config.json', compressed_model_path.parent / 'config.json') # Copy config.json manually\n",
+    "        del compressed_model\n",
+    "\n",
+    "def calculate_compression_rate(model_path_ov, model_path_ov_compressed):\n",
+    "    model_size_original = model_path_ov.with_suffix(\".bin\").stat().st_size / 2 ** 20\n",
+    "    model_size_compressed = model_path_ov_compressed.with_suffix(\".bin\").stat().st_size / 2 ** 20\n",
+    "    print(f\"* Original IR model size: {model_size_original:.2f} MB\")\n",
+    "    print(f\"* Compressed IR model size: {model_size_compressed:.2f} MB\")\n",
+    "    print(f\"* Model compression rate: {model_size_original / model_size_compressed:.3f}\")\n",
+    "\n",
+    "if to_compress.value == 'Enable':\n",
+    "    compress_model(ov_model.model)\n",
+    "    calculate_compression_rate(model_path / 'openvino_model.xml', compressed_model_path)\n",
+    "    ov_model = OVModelForCausalLM.from_pretrained(compressed_model_path.parent, device=current_device)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "b6d9c4a5-ef75-4076-9f1c-f45a2259ec46",
    "metadata": {},
@@ -306,7 +388,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 7,
    "id": "6f976094-8603-42c4-8f18-a32ba6d7192e",
    "metadata": {},
    "outputs": [],
@@ -331,7 +413,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 8,
    "id": "52ac10a5-3141-4227-8f0b-0617acd027c8",
    "metadata": {},
    "outputs": [],
@@ -371,7 +453,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 9,
    "id": "524e72f4-8750-48ff-b002-e558d03b3302",
    "metadata": {},
    "outputs": [],
@@ -421,7 +503,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 10,
    "id": "67fb4f9d-5877-48d8-8eff-c30ff6974d7a",
    "metadata": {},
    "outputs": [],
@@ -490,7 +572,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 11,
    "id": "f114944f-c060-44ba-ba59-02cb2516554c",
    "metadata": {},
    "outputs": [],
@@ -571,7 +653,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 12,
    "id": "a00c2293-15b1-4734-b9b4-1abb524bb8d6",
    "metadata": {
     "tags": []
@@ -581,7 +663,7 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "/tmp/ipykernel_1272681/896135151.py:57: GradioDeprecationWarning: The `enable_queue` parameter has been deprecated. Please use the `.queue()` method instead.\n",
+     "/tmp/ipykernel_3967369/3994661578.py:57: GradioDeprecationWarning: The `enable_queue` parameter has been deprecated. Please use the `.queue()` method instead.\n",
      "  demo.launch(enable_queue=True, share=False, height=800)\n"
     ]
    },
@@ -734,4 +816,4 @@
  },
  "nbformat": 4,
 "nbformat_minor": 5
-}
\ No newline at end of file
+}
diff --git a/notebooks/240-dolly-2-instruction-following/README.md b/notebooks/240-dolly-2-instruction-following/README.md
index 95ea986b7e1..de04bd3eff4 100644
--- a/notebooks/240-dolly-2-instruction-following/README.md
+++ b/notebooks/240-dolly-2-instruction-following/README.md
@@ -19,6 +19,7 @@ The tutorial consists of the following steps:
 
 - Install prerequisites
 - Download and convert the model from a public source using the [OpenVINO integration with Hugging Face Optimum](https://huggingface.co/blog/openvino).
+- Compress model weights to INT8 with [OpenVINO NNCF](https://github.com/openvinotoolkit/nncf)
 - Create an instruction-following inference pipeline
 - Run instruction-following pipeline

From 128917469c5ba54edf8fc7f45730ac20ae2dae00 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Thu, 14 Sep 2023 16:41:12 +0200
Subject: [PATCH 02/12] Extra space

---
 .../240-dolly-2-instruction-following.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb b/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb
index 3600c7b868a..a77343d0fdf 100644
--- a/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb
+++ b/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb
@@ -321,7 +321,7 @@
     "            compressed_model_path.parent.mkdir()\n",
     "        compressed_model = nncf.compress_weights(model)\n",
     "        ov.serialize(compressed_model, compressed_model_path)\n",
-    "        shutil.copy(model_path / 'config.json', compressed_model_path.parent / 'config.json') # Copy config.json manually\n",
+    "        shutil.copy(model_path / 'config.json', compressed_model_path.parent / 'config.json')  # Copy config.json manually\n",
     "        del compressed_model\n",
     "\n",
     "def calculate_compression_rate(model_path_ov, model_path_ov_compressed):\n",

From d1206eae71684d5eeec8401941864e4076af99d2 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Thu, 14 Sep 2023 17:03:19 +0200
Subject: [PATCH 03/12] Tweak

---
 .../240-dolly-2-instruction-following.ipynb | 1 +
 1 file changed, 1 insertion(+)

diff --git a/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb b/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb
index a77343d0fdf..57efa82d626 100644
--- a/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb
+++ b/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb
@@ -18,6 +18,7 @@
     "\n",
     "- Install prerequisites\n",
     "- Download and convert the model from a public source using the [OpenVINO integration with Hugging Face Optimum](https://huggingface.co/blog/openvino).\n",
+    "- Compress model weights to INT8 with [OpenVINO NNCF](https://github.com/openvinotoolkit/nncf)\n",
     "- Create an instruction-following inference pipeline\n",
     "- Run instruction-following pipeline\n",
     "\n",

From 54d1bc8a7cfedafe7c16f9a6c83b7d0c60315b52 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Fri, 15 Sep 2023 10:21:14 +0200
Subject: [PATCH 04/12] Replaced by-hand compression with optimum

---
 .../240-dolly-2-instruction-following.ipynb | 168 +++++++-----------
 1 file changed, 66 insertions(+), 102 deletions(-)

diff --git a/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb b/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb
index 57efa82d626..bc79d7a9256 100644
--- a/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb
+++ b/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb
@@ -70,10 +70,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!pip install -q \"diffusers>=0.16.1\" \"transformers>=4.28.0\"\n",
-    "!pip install -q \"git+https://github.com/huggingface/optimum-intel.git\" datasets onnx onnxruntime gradio\n",
-    "!pip install -q 
\"git+https://github.com/openvinotoolkit/nncf.git@release_v260\"\n", - "!pip install -q \"openvino==2023.1.0.dev20230811\" \"openvino_dev==2023.1.0.dev20230811\"" + "%pip install -q \"diffusers>=0.16.1\" \"transformers>=4.28.0\"\n", + "%pip install -q \"git+https://github.com/huggingface/optimum-intel.git\" datasets onnx onnxruntime gradio\n", + "%pip install -q \"git+https://github.com/openvinotoolkit/nncf.git@release_v260\"\n", + "%pip install -q \"openvino==2023.1.0.dev20230811\" \"openvino_dev==2023.1.0.dev20230811\"" ] }, { @@ -88,22 +88,22 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "id": "6ddd57de-9f41-403c-bccc-8d3118654a24", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "c940eca7b64742dbae2fcaf98667af98", + "model_id": "18b43cd3ea0f4d30b0973918023f3b12", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Dropdown(description='Device:', options=('CPU', 'GPU.0', 'GPU.1', 'GPU.2', 'AUTO'), value=' AUTO')" + "Dropdown(description='Device:', options=('CPU', 'GPU.0', 'GPU.1', 'GPU.2', 'AUTO'), value='CPU')" ] }, - "execution_count": 3, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -116,7 +116,7 @@ "\n", "device = widgets.Dropdown(\n", " options=core.available_devices + [\"AUTO\"],\n", - " value='AUTO',\n", + " value='CPU',\n", " description='Device:',\n", " disabled=False,\n", ")\n", @@ -151,41 +151,19 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "id": "91f42296-627d-44ff-a1cb-936bb6f87992", "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino\n" - ] - }, { "name": "stderr", "output_type": "stream", "text": [ - "No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda-11.7'\n", - "2023-09-14 15:39:32.055450: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", - "2023-09-14 15:39:32.089487: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", - "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2023-09-14 15:39:32.706748: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", - "/home/nsavel/venvs/ov_notebooks_tmp/lib/python3.8/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations\n", - " warnings.warn(\n", "The argument `from_transformers` is deprecated, and will be removed in optimum 2.0. Use `export` instead\n", "Framework not specified. 
Using pt to export to ONNX.\n", - "Using framework PyTorch: 1.13.1+cpu\n", + "Using framework PyTorch: 2.0.1+cu117\n", "Overriding 1 configuration item(s)\n", "\t- use_cache -> True\n", - "/home/nsavel/venvs/ov_notebooks_tmp/lib/python3.8/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:594: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", - " assert batch_size > 0, \"batch_size has to be defined and > 0\"\n", - "/home/nsavel/venvs/ov_notebooks_tmp/lib/python3.8/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:314: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", - " if seq_len > self.max_seq_len_cached:\n", - "/home/nsavel/venvs/ov_notebooks_tmp/lib/python3.8/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:239: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", - " if key_length > self.bias.shape[-1]:\n", - "/home/nsavel/venvs/ov_notebooks_tmp/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py:74: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n", - " op1 = operator(*args, **kwargs)\n", "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", @@ -218,9 +196,25 @@ "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", "In-place op on output of tensor.shape. 
See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "Saving external data to one file...\n", + "Saving external data to one file...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============= Diagnostic Run torch.onnx.export version 2.0.1+cu117 =============\n", + "verbose: False, log level: Level.ERROR\n", + "======================= 0 NONE 0 NOTE 0 WARNING 0 ERROR ========================\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ "Compiling the model...\n", - "Set CACHE_DIR to /tmp/tmp3vew161f/model_cache\n" + "Set CACHE_DIR to /tmp/tmpdbawql7m/model_cache\n" ] } ], @@ -255,14 +249,14 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "id": "8e5c9e68-3772-432f-b231-f1163442357d", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6be9ab974c06454e81077fe735e7cb37", + "model_id": "66698b98b669482c97f1a29db1e38a66", "version_major": 2, "version_minor": 0 }, @@ -270,7 +264,7 @@ "Dropdown(description='Compression:', index=1, options=('Disable', 'Enable'), value='Enable')" ] }, - "execution_count": 5, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -287,43 +281,31 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 10, "id": "392940e3-01da-4876-a9d1-2475ed3da882", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "* Original IR model size: 10590.42 MB\n", - "* Compressed IR model size: 2660.28 MB\n", - "* Model compression rate: 3.981\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Compiling the model...\n", - "Set CACHE_DIR to dolly-v2-3b_compressed/model_cache\n" + "ename": "ValueError", + "evalue": "`weights_only` currently not supported for `OVModels`, only available for torch.nn.Module.", + "output_type": "error", + "traceback": [ + "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[0;31mValueError\u001B[0m Traceback (most recent call last)", + "Cell \u001B[0;32mIn[10], line 18\u001B[0m\n\u001B[1;32m 16\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m compressed_model_path\u001B[38;5;241m.\u001B[39mexists():\n\u001B[1;32m 17\u001B[0m quantizer \u001B[38;5;241m=\u001B[39m OVQuantizer\u001B[38;5;241m.\u001B[39mfrom_pretrained(ov_model)\n\u001B[0;32m---> 18\u001B[0m \u001B[43mquantizer\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mquantize\u001B[49m\u001B[43m(\u001B[49m\u001B[43msave_directory\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mcompressed_model_path\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mweights_only\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;28;43;01mTrue\u001B[39;49;00m\u001B[43m)\u001B[49m\n\u001B[1;32m 19\u001B[0m \u001B[38;5;28;01mdel\u001B[39;00m quantizer\n\u001B[1;32m 20\u001B[0m gc\u001B[38;5;241m.\u001B[39mcollect()\n", + "File \u001B[0;32m~/venvs/ov_notebooks_tmp/lib/python3.8/site-packages/optimum/intel/openvino/quantization.py:167\u001B[0m, in \u001B[0;36mOVQuantizer.quantize\u001B[0;34m(self, calibration_dataset, save_directory, quantization_config, file_name, batch_size, data_collator, remove_unused_columns, weights_only, **kwargs)\u001B[0m\n\u001B[1;32m 165\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m weights_only:\n\u001B[1;32m 166\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m 
\u001B[38;5;28misinstance\u001B[39m(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mmodel, OVBaseModel):\n\u001B[0;32m--> 167\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\n\u001B[1;32m 168\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m`weights_only` currently not supported for `OVModels`, only available for torch.nn.Module.\u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[1;32m 169\u001B[0m )\n\u001B[1;32m 170\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m calibration_dataset \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[1;32m 171\u001B[0m logger\u001B[38;5;241m.\u001B[39mwarning(\n\u001B[1;32m 172\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m`calibration_dataset` was provided but will not be used as `weights_only` is set to `True`.\u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[1;32m 173\u001B[0m )\n", + "\u001B[0;31mValueError\u001B[0m: `weights_only` currently not supported for `OVModels`, only available for torch.nn.Module." ] } ], "source": [ + "import gc\n", "import nncf\n", "import shutil\n", "import openvino.runtime as ov\n", + "from optimum.intel import OVQuantizer\n", "\n", - "compressed_model_path = Path(f'{model_path}_compressed') / 'openvino_model.xml'\n", - "\n", - "def compress_model(model):\n", - " if not compressed_model_path.exists():\n", - " if not compressed_model_path.parent.exists():\n", - " compressed_model_path.parent.mkdir()\n", - " compressed_model = nncf.compress_weights(model)\n", - " ov.serialize(compressed_model, compressed_model_path)\n", - " shutil.copy(model_path / 'config.json', compressed_model_path.parent / 'config.json') # Copy config.json manually\n", - " del compressed_model\n", + "compressed_model_path = Path(f'{model_path}_compressed')\n", "\n", "def calculate_compression_rate(model_path_ov, model_path_ov_compressed):\n", " model_size_original = model_path_ov.with_suffix(\".bin\").stat().st_size / 2 ** 20\n", @@ -333,8 +315,12 @@ " print(f\"* Model compression rate: {model_size_original / model_size_compressed:.3f}\")\n", "\n", "if to_compress.value == 'Enable':\n", - " compress_model(ov_model.model)\n", - " calculate_compression_rate(model_path / 'openvino_model.xml', compressed_model_path)\n", + " if not compressed_model_path.exists():\n", + " quantizer = OVQuantizer.from_pretrained(ov_model)\n", + " quantizer.quantize(save_directory=compressed_model_path, weights_only=True)\n", + " del quantizer\n", + " gc.collect()\n", + " calculate_compression_rate(model_path / 'openvino_model.xml', compressed_model_path / 'openvino_model.xml')\n", " ov_model = OVModelForCausalLM.from_pretrained(compressed_model_path.parent, device=current_device)" ] }, @@ -389,7 +375,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "6f976094-8603-42c4-8f18-a32ba6d7192e", "metadata": {}, "outputs": [], @@ -414,7 +400,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "52ac10a5-3141-4227-8f0b-0617acd027c8", "metadata": {}, "outputs": [], @@ -454,7 +440,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "524e72f4-8750-48ff-b002-e558d03b3302", "metadata": {}, "outputs": [], @@ -504,7 +490,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "67fb4f9d-5877-48d8-8eff-c30ff6974d7a", "metadata": {}, "outputs": [], @@ -573,7 +559,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, 
"id": "f114944f-c060-44ba-ba59-02cb2516554c", "metadata": {}, "outputs": [], @@ -654,42 +640,12 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "a00c2293-15b1-4734-b9b4-1abb524bb8d6", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_3967369/3994661578.py:57: GradioDeprecationWarning: The `enable_queue` parameter has been deprecated. Please use the `.queue()` method instead.\n", - " demo.launch(enable_queue=True, share=False, height=800)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Running on local URL: http://127.0.0.1:7860\n", - "\n", - "To create a public link, set `share=True` in `launch()`.\n" - ] - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "available_devices = Core().available_devices + [\"AUTO\"]\n", "\n", @@ -751,6 +707,14 @@ " except Exception:\n", " demo.launch(enable_queue=True, share=True, height=800)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af0c8d9d-693d-48d5-8039-54788049236d", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From 4c2eb573a5503c982a99614959f913b8ea092164 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 21 Sep 2023 13:37:43 +0200 Subject: [PATCH 05/12] Tweaks --- .../240-dolly-2-instruction-following.ipynb | 270 +++++++++++------- 1 file changed, 174 insertions(+), 96 deletions(-) diff --git a/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb b/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb index 1efde44ce40..d684c344a38 100644 --- a/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb +++ b/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb @@ -64,13 +64,18 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "4421fc85-bed6-4a62-b8fa-19c7ba474891", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2023-09-21T09:09:14.949341Z", + "start_time": "2023-09-21T09:09:14.896141Z" + } + }, "outputs": [], "source": [ - "%pip install -q \"diffusers>=0.16.1\" \"transformers>=4.28.0\" \"openvino>=2023.1.0\"\n", - "%pip install -q \"git+https://github.com/huggingface/optimum-intel.git\" datasets onnx onnxruntime gradio " + "# %pip install -q \"diffusers>=0.16.1\" \"transformers>=4.28.0\" \"openvino>=2023.1.0\"\n", + "# %pip install -q \"git+https://github.com/huggingface/optimum-intel.git\" datasets onnx onnxruntime gradio" ] }, { @@ -85,22 +90,25 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "id": "6ddd57de-9f41-403c-bccc-8d3118654a24", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2023-09-21T09:09:15.251947200Z", + "start_time": "2023-09-21T09:09:14.943640100Z" + } + }, "outputs": [ { "data": { + "text/plain": "Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU')", "application/vnd.jupyter.widget-view+json": { - "model_id": "18b43cd3ea0f4d30b0973918023f3b12", "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Dropdown(description='Device:', options=('CPU', 'GPU.0', 'GPU.1', 'GPU.2', 'AUTO'), value='CPU')" - ] + "version_minor": 0, + "model_id": "7309bb25a65248ffa55393931e38f9e1" + } }, - "execution_count": 5, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -148,70 +156,61 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "id": "91f42296-627d-44ff-a1cb-936bb6f87992", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2023-09-21T09:10:32.362607300Z", + "start_time": "2023-09-21T09:09:15.240383100Z" + } + }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino\n" + ] + }, { "name": "stderr", "output_type": "stream", "text": [ - "The argument `from_transformers` is deprecated, and will be removed in optimum 2.0. 
Use `export` instead\n", + "No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda-11.7'\n", + "2023-09-21 11:09:45.834374: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2023-09-21 11:09:45.869407: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2023-09-21 11:09:46.543041: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", + "/home/nsavel/venvs/ov_notebooks_tmp/lib/python3.8/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations\n", + " warnings.warn(\n", "Framework not specified. Using pt to export to ONNX.\n", - "Using framework PyTorch: 2.0.1+cu117\n", + "Using framework PyTorch: 1.13.1+cpu\n", "Overriding 1 configuration item(s)\n", - "\t- use_cache -> True\n", - "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "In-place op on output of tensor.shape. 
See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", - "Saving external data to one file...\n" + "\t- use_cache -> True\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "============= Diagnostic Run torch.onnx.export version 2.0.1+cu117 =============\n", - "verbose: False, log level: Level.ERROR\n", - "======================= 0 NONE 0 NOTE 0 WARNING 0 ERROR ========================\n", - "\n" + "WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.base has been moved to tensorflow.python.trackable.base. 
The old module will be deleted in version 2.11.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ + "[ WARNING ] Please fix your imports. Module %s has been moved to %s. The old module will be deleted in version %s.\n", + "/home/nsavel/venvs/ov_notebooks_tmp/lib/python3.8/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:594: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " assert batch_size > 0, \"batch_size has to be defined and > 0\"\n", + "/home/nsavel/venvs/ov_notebooks_tmp/lib/python3.8/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:314: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " if seq_len > self.max_seq_len_cached:\n", + "/home/nsavel/venvs/ov_notebooks_tmp/lib/python3.8/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:239: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " if key_length > self.bias.shape[-1]:\n", + "/home/nsavel/venvs/ov_notebooks_tmp/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py:74: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. 
In any other case, this might cause the trace to be incorrect.\n", + " op1 = operator(*args, **kwargs)\n", "Compiling the model...\n", - "Set CACHE_DIR to /tmp/tmpdbawql7m/model_cache\n" + "Set CACHE_DIR to /tmp/tmpyx58dss6/model_cache\n" ] } ], @@ -247,22 +246,25 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "id": "8e5c9e68-3772-432f-b231-f1163442357d", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2023-09-21T09:10:32.363705700Z", + "start_time": "2023-09-21T09:10:32.347047700Z" + } + }, "outputs": [ { "data": { + "text/plain": "Dropdown(description='Compression:', index=1, options=('Disable', 'Enable'), value='Enable')", "application/vnd.jupyter.widget-view+json": { - "model_id": "66698b98b669482c97f1a29db1e38a66", "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Dropdown(description='Compression:', index=1, options=('Disable', 'Enable'), value='Enable')" - ] + "version_minor": 0, + "model_id": "3f11d95141814b69a72f8386b946bcfe" + } }, - "execution_count": 7, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -279,28 +281,35 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 6, "id": "392940e3-01da-4876-a9d1-2475ed3da882", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2023-09-21T09:30:40.630080700Z", + "start_time": "2023-09-21T09:29:31.843362200Z" + } + }, "outputs": [ { - "ename": "ValueError", - "evalue": "`weights_only` currently not supported for `OVModels`, only available for torch.nn.Module.", - "output_type": "error", - "traceback": [ - "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[0;31mValueError\u001B[0m Traceback (most recent call last)", - "Cell \u001B[0;32mIn[10], line 18\u001B[0m\n\u001B[1;32m 16\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m compressed_model_path\u001B[38;5;241m.\u001B[39mexists():\n\u001B[1;32m 17\u001B[0m quantizer \u001B[38;5;241m=\u001B[39m OVQuantizer\u001B[38;5;241m.\u001B[39mfrom_pretrained(ov_model)\n\u001B[0;32m---> 18\u001B[0m \u001B[43mquantizer\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mquantize\u001B[49m\u001B[43m(\u001B[49m\u001B[43msave_directory\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mcompressed_model_path\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mweights_only\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;28;43;01mTrue\u001B[39;49;00m\u001B[43m)\u001B[49m\n\u001B[1;32m 19\u001B[0m \u001B[38;5;28;01mdel\u001B[39;00m quantizer\n\u001B[1;32m 20\u001B[0m gc\u001B[38;5;241m.\u001B[39mcollect()\n", - "File \u001B[0;32m~/venvs/ov_notebooks_tmp/lib/python3.8/site-packages/optimum/intel/openvino/quantization.py:167\u001B[0m, in \u001B[0;36mOVQuantizer.quantize\u001B[0;34m(self, calibration_dataset, save_directory, quantization_config, file_name, batch_size, data_collator, remove_unused_columns, weights_only, **kwargs)\u001B[0m\n\u001B[1;32m 165\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m weights_only:\n\u001B[1;32m 166\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mmodel, OVBaseModel):\n\u001B[0;32m--> 167\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\n\u001B[1;32m 168\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m`weights_only` currently not supported for `OVModels`, only available for 
torch.nn.Module.\u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[1;32m 169\u001B[0m )\n\u001B[1;32m 170\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m calibration_dataset \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[1;32m 171\u001B[0m logger\u001B[38;5;241m.\u001B[39mwarning(\n\u001B[1;32m 172\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m`calibration_dataset` was provided but will not be used as `weights_only` is set to `True`.\u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[1;32m 173\u001B[0m )\n", - "\u001B[0;31mValueError\u001B[0m: `weights_only` currently not supported for `OVModels`, only available for torch.nn.Module." + "name": "stdout", + "output_type": "stream", + "text": [ + "* Original IR model size: 5297.21 MB\n", + "* Compressed IR model size: 2654.90 MB\n", + "* Model compression rate: 1.995\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Compiling the model...\n", + "Set CACHE_DIR to dolly-v2-3b_compressed/model_cache\n" ] } ], "source": [ "import gc\n", - "import nncf\n", - "import shutil\n", - "import openvino.runtime as ov\n", "from optimum.intel import OVQuantizer\n", "\n", "compressed_model_path = Path(f'{model_path}_compressed')\n", @@ -319,7 +328,7 @@ " del quantizer\n", " gc.collect()\n", " calculate_compression_rate(model_path / 'openvino_model.xml', compressed_model_path / 'openvino_model.xml')\n", - " ov_model = OVModelForCausalLM.from_pretrained(compressed_model_path.parent, device=current_device)" + " ov_model = OVModelForCausalLM.from_pretrained(compressed_model_path, device=current_device)" ] }, { @@ -373,9 +382,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "6f976094-8603-42c4-8f18-a32ba6d7192e", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2023-09-21T09:30:41.286735600Z", + "start_time": "2023-09-21T09:30:40.635196Z" + } + }, "outputs": [], "source": [ "from threading import Thread\n", @@ -398,9 +412,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "52ac10a5-3141-4227-8f0b-0617acd027c8", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2023-09-21T09:30:41.287734300Z", + "start_time": "2023-09-21T09:30:41.287234Z" + } + }, "outputs": [], "source": [ "INSTRUCTION_KEY = \"### Instruction:\"\n", @@ -438,9 +457,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "524e72f4-8750-48ff-b002-e558d03b3302", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2023-09-21T09:30:41.291303900Z", + "start_time": "2023-09-21T09:30:41.287734300Z" + } + }, "outputs": [], "source": [ "def get_special_token_id(tokenizer: AutoTokenizer, key: str) -> int:\n", @@ -488,9 +512,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "67fb4f9d-5877-48d8-8eff-c30ff6974d7a", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2023-09-21T09:30:41.292303800Z", + "start_time": "2023-09-21T09:30:41.287734300Z" + } + }, "outputs": [], "source": [ "def run_generation(user_text:str, top_p:float, temperature:float, top_k:int, max_new_tokens:int, perf_text:str):\n", @@ -557,9 +586,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "f114944f-c060-44ba-ba59-02cb2516554c", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2023-09-21T09:30:41.292303800Z", + "start_time": "2023-09-21T09:30:41.288234500Z" + } + }, "outputs": 
[], "source": [ "def estimate_latency(current_time:float, current_perf_text:str, new_gen_text:str, per_token_time:List[float], num_tokens:int):\n", @@ -638,12 +672,42 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "a00c2293-15b1-4734-b9b4-1abb524bb8d6", "metadata": { - "tags": [] + "tags": [], + "ExecuteTime": { + "end_time": "2023-09-21T09:30:41.451355700Z", + "start_time": "2023-09-21T09:30:41.288733100Z" + } }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_31372/3268577018.py:57: GradioDeprecationWarning: The `enable_queue` parameter has been deprecated. Please use the `.queue()` method instead.\n", + " demo.launch(enable_queue=True, share=False, height=800, server_port=7860)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running on local URL: http://127.0.0.1:7860\n", + "\n", + "To create a public link, set `share=True` in `launch()`.\n" + ] + }, + { + "data": { + "text/plain": "", + "text/html": "
" + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "available_devices = Core().available_devices + [\"AUTO\"]\n", "\n", @@ -701,18 +765,32 @@ "\n", "if __name__ == \"__main__\":\n", " try:\n", - " demo.launch(enable_queue=True, share=False, height=800)\n", + " demo.launch(enable_queue=True, share=False, height=800, server_port=7860)\n", " except Exception:\n", " demo.launch(enable_queue=True, share=True, height=800)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "af0c8d9d-693d-48d5-8039-54788049236d", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2023-09-21T09:30:41.452610300Z", + "start_time": "2023-09-21T09:30:41.397250900Z" + } + }, "outputs": [], "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false + } } ], "metadata": { From 4f97d86f4eaf51319597fd1d4d8d2c2a74769616 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 22 Sep 2023 10:53:42 +0200 Subject: [PATCH 06/12] Added workaround for weights compression --- .../240-dolly-2-instruction-following.ipynb | 202 ++++++++---------- 1 file changed, 86 insertions(+), 116 deletions(-) diff --git a/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb b/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb index d684c344a38..ffdd3092b53 100644 --- a/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb +++ b/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb @@ -66,16 +66,11 @@ "cell_type": "code", "execution_count": 1, "id": "4421fc85-bed6-4a62-b8fa-19c7ba474891", - "metadata": { - "ExecuteTime": { - "end_time": "2023-09-21T09:09:14.949341Z", - "start_time": "2023-09-21T09:09:14.896141Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "# %pip install -q \"diffusers>=0.16.1\" \"transformers>=4.28.0\" \"openvino>=2023.1.0\"\n", - "# %pip install -q \"git+https://github.com/huggingface/optimum-intel.git\" datasets onnx onnxruntime gradio" + "# %pip install -q \"git+https://github.com/huggingface/optimum-intel.git\" datasets onnx onnxruntime gradio " ] }, { @@ -92,21 +87,18 @@ "cell_type": "code", "execution_count": 2, "id": "6ddd57de-9f41-403c-bccc-8d3118654a24", - "metadata": { - "ExecuteTime": { - "end_time": "2023-09-21T09:09:15.251947200Z", - "start_time": "2023-09-21T09:09:14.943640100Z" - } - }, + "metadata": {}, "outputs": [ { "data": { - "text/plain": "Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU')", "application/vnd.jupyter.widget-view+json": { + "model_id": "7f17a47330e74340a8a5b01d99d44652", "version_major": 2, - "version_minor": 0, - "model_id": "7309bb25a65248ffa55393931e38f9e1" - } + "version_minor": 0 + }, + "text/plain": [ + "Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU')" + ] }, "execution_count": 2, "metadata": {}, @@ -158,12 +150,7 @@ "cell_type": "code", "execution_count": 3, "id": "91f42296-627d-44ff-a1cb-936bb6f87992", - "metadata": { - "ExecuteTime": { - "end_time": "2023-09-21T09:10:32.362607300Z", - "start_time": "2023-09-21T09:09:15.240383100Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -177,11 +164,11 @@ "output_type": "stream", "text": [ "No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda-11.7'\n", - "2023-09-21 11:09:45.834374: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. 
You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", - "2023-09-21 11:09:45.869407: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "2023-09-22 10:38:02.161912: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2023-09-22 10:38:02.196302: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2023-09-21 11:09:46.543041: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", - "/home/nsavel/venvs/ov_notebooks_tmp/lib/python3.8/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations\n", + "2023-09-22 10:38:02.850895: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", + "/home/nsavel/venvs/ov_notebooks/lib/python3.8/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations\n", " warnings.warn(\n", "Framework not specified. Using pt to export to ONNX.\n", "Using framework PyTorch: 1.13.1+cpu\n", @@ -201,16 +188,16 @@ "output_type": "stream", "text": [ "[ WARNING ] Please fix your imports. Module %s has been moved to %s. The old module will be deleted in version %s.\n", - "/home/nsavel/venvs/ov_notebooks_tmp/lib/python3.8/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:594: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + "/home/nsavel/venvs/ov_notebooks/lib/python3.8/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:594: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", " assert batch_size > 0, \"batch_size has to be defined and > 0\"\n", - "/home/nsavel/venvs/ov_notebooks_tmp/lib/python3.8/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:314: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
This means that the trace might not generalize to other inputs!\n", + "/home/nsavel/venvs/ov_notebooks/lib/python3.8/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:314: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", " if seq_len > self.max_seq_len_cached:\n", - "/home/nsavel/venvs/ov_notebooks_tmp/lib/python3.8/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:239: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + "/home/nsavel/venvs/ov_notebooks/lib/python3.8/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:239: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", " if key_length > self.bias.shape[-1]:\n", - "/home/nsavel/venvs/ov_notebooks_tmp/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py:74: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n", + "/home/nsavel/venvs/ov_notebooks/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py:74: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n", " op1 = operator(*args, **kwargs)\n", "Compiling the model...\n", - "Set CACHE_DIR to /tmp/tmpyx58dss6/model_cache\n" + "Set CACHE_DIR to /tmp/tmpl4eosgvf/model_cache\n" ] } ], @@ -241,28 +228,26 @@ "source": [ "### NNCF model weights compression [$\\Uparrow$](#Table-of-content:)\n", "\n", - "NNCF [Weights Compression algorithm](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/CompressWeights.md) compresses weights of a model to `INT8`. This is an alternative to [Quantization algorithm](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/post_training/Quantization.md) that compresses both weights and activations. Weight compression is effective in optimizing footprint and performance of large models where the size of weights is significantly larger than the size of activations, for example, in Large Language Models (LLMs) such as Dolly 2.0. Additionaly, Weight Compression usually leads to almost no accuracy drop." + "NNCF [Weights Compression algorithm](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/CompressWeights.md) compresses weights of a model to `INT8`. 
This is an alternative to [Quantization algorithm](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/post_training/Quantization.md) that compresses both weights and activations. Weight compression is effective in optimizing footprint and performance of large models where the size of weights is significantly larger than the size of activations, for example, in Large Language Models (LLMs) such as Dolly 2.0. Additionaly, Weight Compression usually leads to almost no accuracy drop.\n", + ">Note: Starting from OpenVINO 2023.2 weight compression will also have an effect when run on a GPU." ] }, { "cell_type": "code", "execution_count": 4, "id": "8e5c9e68-3772-432f-b231-f1163442357d", - "metadata": { - "ExecuteTime": { - "end_time": "2023-09-21T09:10:32.363705700Z", - "start_time": "2023-09-21T09:10:32.347047700Z" - } - }, + "metadata": {}, "outputs": [ { "data": { - "text/plain": "Dropdown(description='Compression:', index=1, options=('Disable', 'Enable'), value='Enable')", "application/vnd.jupyter.widget-view+json": { + "model_id": "3a774ab747fa4f95b5bd76aaab8c0691", "version_major": 2, - "version_minor": 0, - "model_id": "3f11d95141814b69a72f8386b946bcfe" - } + "version_minor": 0 + }, + "text/plain": [ + "Dropdown(description='Compression:', index=1, options=('Disable', 'Enable'), value='Enable')" + ] }, "execution_count": 4, "metadata": {}, @@ -272,7 +257,7 @@ "source": [ "to_compress = widgets.Dropdown(\n", " options=['Disable', 'Enable'],\n", - " value='Enable',\n", + " value='Disable',\n", " description='Compression:',\n", " disabled=False,\n", ")\n", @@ -281,22 +266,37 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "392940e3-01da-4876-a9d1-2475ed3da882", - "metadata": { - "ExecuteTime": { - "end_time": "2023-09-21T09:30:40.630080700Z", - "start_time": "2023-09-21T09:29:31.843362200Z" - } - }, + "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Framework not specified. Using pt to export to ONNX.\n", + "Using framework PyTorch: 1.13.1+cpu\n", + "Overriding 1 configuration item(s)\n", + "\t- use_cache -> True\n", + "/home/nsavel/venvs/ov_notebooks/lib/python3.8/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:594: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " assert batch_size > 0, \"batch_size has to be defined and > 0\"\n", + "/home/nsavel/venvs/ov_notebooks/lib/python3.8/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:314: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " if seq_len > self.max_seq_len_cached:\n", + "/home/nsavel/venvs/ov_notebooks/lib/python3.8/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:239: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
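[Editor's note] The size report printed a little further down in this hunk comes from the notebook's `calculate_compression_rate` helper, which is defined in an earlier cell that this diff does not show. A plausible sketch of what such a helper does, assuming it compares the `.bin` weight files that sit next to the IR `.xml` files (not the verbatim implementation):

```python
from pathlib import Path

def calculate_compression_rate(model_path_ov: Path, model_path_ov_compressed: Path) -> None:
    # OpenVINO IR weights live in the .bin file accompanying each .xml file.
    size_orig = model_path_ov.with_suffix(".bin").stat().st_size / 1024 ** 2
    size_int8 = model_path_ov_compressed.with_suffix(".bin").stat().st_size / 1024 ** 2
    print(f"* Original IR model size: {size_orig:.2f} MB")
    print(f"* Compressed IR model size: {size_int8:.2f} MB")
    print(f"* Model compression rate: {size_orig / size_int8:.3f}")
```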
This means that the trace might not generalize to other inputs!\n", + " if key_length > self.bias.shape[-1]:\n", + "/home/nsavel/venvs/ov_notebooks/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py:74: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n", + " op1 = operator(*args, **kwargs)\n", + "Compiling the model...\n", + "Set CACHE_DIR to /tmp/tmpelmw8467/model_cache\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ "* Original IR model size: 5297.21 MB\n", - "* Compressed IR model size: 2654.90 MB\n", - "* Model compression rate: 1.995\n" + "* Compressed IR model size: 2660.29 MB\n", + "* Model compression rate: 1.991\n" ] }, { @@ -311,6 +311,7 @@ "source": [ "import gc\n", "from optimum.intel import OVQuantizer\n", + "from transformers import AutoModelForCausalLM\n", "\n", "compressed_model_path = Path(f'{model_path}_compressed')\n", "\n", @@ -323,10 +324,18 @@ "\n", "if to_compress.value == 'Enable':\n", " if not compressed_model_path.exists():\n", + " # Weight compression can't yet be applied after FP16 was applied to FP32 OV model.\n", + " # Because of this we convert the original model to FP16 first.\n", + " model = AutoModelForCausalLM.from_pretrained(model_id)\n", + " model.half()\n", + " model.save_pretrained(compressed_model_path)\n", + " ov_model = OVModelForCausalLM.from_pretrained(compressed_model_path, device=current_device, export=True)\n", + " \n", " quantizer = OVQuantizer.from_pretrained(ov_model)\n", " quantizer.quantize(save_directory=compressed_model_path, weights_only=True)\n", " del quantizer\n", " gc.collect()\n", + " \n", " calculate_compression_rate(model_path / 'openvino_model.xml', compressed_model_path / 'openvino_model.xml')\n", " ov_model = OVModelForCausalLM.from_pretrained(compressed_model_path, device=current_device)" ] @@ -382,14 +391,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "6f976094-8603-42c4-8f18-a32ba6d7192e", - "metadata": { - "ExecuteTime": { - "end_time": "2023-09-21T09:30:41.286735600Z", - "start_time": "2023-09-21T09:30:40.635196Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "from threading import Thread\n", @@ -412,14 +416,9 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "52ac10a5-3141-4227-8f0b-0617acd027c8", - "metadata": { - "ExecuteTime": { - "end_time": "2023-09-21T09:30:41.287734300Z", - "start_time": "2023-09-21T09:30:41.287234Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "INSTRUCTION_KEY = \"### Instruction:\"\n", @@ -457,14 +456,9 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "524e72f4-8750-48ff-b002-e558d03b3302", - "metadata": { - "ExecuteTime": { - "end_time": "2023-09-21T09:30:41.291303900Z", - "start_time": "2023-09-21T09:30:41.287734300Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "def get_special_token_id(tokenizer: AutoTokenizer, key: str) -> int:\n", @@ -512,14 +506,9 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "id": "67fb4f9d-5877-48d8-8eff-c30ff6974d7a", - "metadata": { - "ExecuteTime": { - "end_time": "2023-09-21T09:30:41.292303800Z", - "start_time": "2023-09-21T09:30:41.287734300Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "def 
run_generation(user_text:str, top_p:float, temperature:float, top_k:int, max_new_tokens:int, perf_text:str):\n", @@ -586,14 +575,9 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "id": "f114944f-c060-44ba-ba59-02cb2516554c", - "metadata": { - "ExecuteTime": { - "end_time": "2023-09-21T09:30:41.292303800Z", - "start_time": "2023-09-21T09:30:41.288234500Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "def estimate_latency(current_time:float, current_perf_text:str, new_gen_text:str, per_token_time:List[float], num_tokens:int):\n", @@ -672,22 +656,18 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "id": "a00c2293-15b1-4734-b9b4-1abb524bb8d6", "metadata": { - "tags": [], - "ExecuteTime": { - "end_time": "2023-09-21T09:30:41.451355700Z", - "start_time": "2023-09-21T09:30:41.288733100Z" - } + "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_31372/3268577018.py:57: GradioDeprecationWarning: The `enable_queue` parameter has been deprecated. Please use the `.queue()` method instead.\n", - " demo.launch(enable_queue=True, share=False, height=800, server_port=7860)\n" + "/tmp/ipykernel_47554/3994661578.py:57: GradioDeprecationWarning: The `enable_queue` parameter has been deprecated. Please use the `.queue()` method instead.\n", + " demo.launch(enable_queue=True, share=False, height=800)\n" ] }, { @@ -701,8 +681,12 @@ }, { "data": { - "text/plain": "", - "text/html": "
" + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] }, "metadata": {}, "output_type": "display_data" @@ -765,32 +749,18 @@ "\n", "if __name__ == \"__main__\":\n", " try:\n", - " demo.launch(enable_queue=True, share=False, height=800, server_port=7860)\n", + " demo.launch(enable_queue=True, share=False, height=800)\n", " except Exception:\n", " demo.launch(enable_queue=True, share=True, height=800)" ] }, - { - "cell_type": "code", - "execution_count": 12, - "id": "af0c8d9d-693d-48d5-8039-54788049236d", - "metadata": { - "ExecuteTime": { - "end_time": "2023-09-21T09:30:41.452610300Z", - "start_time": "2023-09-21T09:30:41.397250900Z" - } - }, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": null, + "id": "1454e6b8-b055-4349-b087-c84a59a3e75f", + "metadata": {}, "outputs": [], - "source": [], - "metadata": { - "collapsed": false - } + "source": [] } ], "metadata": { From 1d730d95ba59c863ca10600989fef49a8d6f00e3 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 22 Sep 2023 11:00:52 +0200 Subject: [PATCH 07/12] Fixed typo --- .../240-dolly-2-instruction-following.ipynb | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb b/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb index ffdd3092b53..44ec80eb840 100644 --- a/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb +++ b/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb @@ -56,7 +56,7 @@ "id": "08aa16b1-d2f6-4a3a-abfb-5ec278133c80", "metadata": {}, "source": [ - "## Prerequisites [$\\Uparrow$](#Table-of-content:)\n", + "## Prerequisites [$\\Uparrow$](#Table-of-contents:)\n", "\n", "First, we should install the [Hugging Face Optimum](https://huggingface.co/docs/optimum/installation) library accelerated by OpenVINO integration.\n", "The Hugging Face Optimum Intel API is a high-level API that enables us to convert and quantize models from the Hugging Face Transformers library to the OpenVINO™ IR format. For more details, refer to the [Hugging Face Optimum Intel documentation](https://huggingface.co/docs/optimum/intel/inference)." @@ -78,7 +78,7 @@ "id": "367f84f8-33e8-4ad6-bd40-e6fd41d2d703", "metadata": {}, "source": [ - "### Select inference device [$\\Uparrow$](#Table-of-content:)\n", + "### Select inference device [$\\Uparrow$](#Table-of-contents:)\n", "\n", "select device from dropdown list for running inference using OpenVINO" ] @@ -126,7 +126,7 @@ "id": "93fec698-344d-48aa-8899-6821bf3e16bf", "metadata": {}, "source": [ - "## Download and Convert Model [$\\Uparrow$](#Table-of-content:)\n", + "## Download and Convert Model [$\\Uparrow$](#Table-of-contents:)\n", "\n", "Optimum Intel can be used to load optimized models from the [Hugging Face Hub](https://huggingface.co/docs/optimum/intel/hf.co/models) and create pipelines to run an inference with OpenVINO Runtime using Hugging Face APIs. The Optimum Inference models are API compatible with Hugging Face Transformers models. 
This means we just need to replace `AutoModelForXxx` class with the corresponding `OVModelForXxx` class.\n", "\n", @@ -226,7 +226,7 @@ "id": "5b1238c8-dcc9-4495-aeff-1ecbd8bd5082", "metadata": {}, "source": [ - "### NNCF model weights compression [$\\Uparrow$](#Table-of-content:)\n", + "### NNCF model weights compression [$\\Uparrow$](#Table-of-contents:)\n", "\n", "NNCF [Weights Compression algorithm](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/CompressWeights.md) compresses weights of a model to `INT8`. This is an alternative to [Quantization algorithm](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/post_training/Quantization.md) that compresses both weights and activations. Weight compression is effective in optimizing footprint and performance of large models where the size of weights is significantly larger than the size of activations, for example, in Large Language Models (LLMs) such as Dolly 2.0. Additionaly, Weight Compression usually leads to almost no accuracy drop.\n", ">Note: Starting from OpenVINO 2023.2 weight compression will also have an effect when run on a GPU." @@ -345,7 +345,7 @@ "id": "b6d9c4a5-ef75-4076-9f1c-f45a2259ec46", "metadata": {}, "source": [ - "## Create an instruction-following inference pipeline [$\\Uparrow$](#Table-of-content:)\n", + "## Create an instruction-following inference pipeline [$\\Uparrow$](#Table-of-contents:)\n", " \n", " The `run_generation` function accepts user-provided text input, tokenizes it, and runs the generation process. Text generation is an iterative process, where each next token depends on previously generated until a maximum number of tokens or stop generation condition is not reached. To obtain intermediate generation results without waiting until when generation is finished, we will use [`TextIteratorStreamer`](https://huggingface.co/docs/transformers/main/en/internal/generation_utils#transformers.TextIteratorStreamer), provided as part of HuggingFace [Streaming API](https://huggingface.co/docs/transformers/main/en/generation_strategies#streaming).\n", " \n", @@ -386,7 +386,7 @@ "id": "b9b5da4d-d2fd-440b-b204-7fbc6966dd1f", "metadata": {}, "source": [ - "### Setup imports [$\\Uparrow$](#Table-of-content:)\n" + "### Setup imports [$\\Uparrow$](#Table-of-contents:)\n" ] }, { @@ -409,7 +409,7 @@ "id": "c58611d6-0a91-4efd-976e-4221acbb43cd", "metadata": {}, "source": [ - "### Prepare template for user prompt [$\\Uparrow$](#Table-of-content:)\n", + "### Prepare template for user prompt [$\\Uparrow$](#Table-of-contents:)\n", "\n", "For effective generation, model expects to have input in specific format. The code below prepare template for passing user instruction into model with providing additional context." ] @@ -449,7 +449,7 @@ "id": "27a01739-1363-42ef-927f-6a340bdbe7ba", "metadata": {}, "source": [ - "### Helpers for output parsing [$\\Uparrow$](#Table-of-content:)\n", + "### Helpers for output parsing [$\\Uparrow$](#Table-of-contents:)\n", "\n", "Model was retrained to finish generation using special token `### End` the code below find its id for using it as generation stop-criteria." 
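[Editor's note] The `get_special_token_id` helper referenced in this section is defined in an earlier hunk of the patch. Its body is most likely along these lines (a sketch reconstructed from context; it assumes the marker encodes to a single token, which holds for the special tokens the Dolly tokenizer adds):

```python
from transformers import AutoTokenizer

def get_special_token_id(tokenizer: AutoTokenizer, key: str) -> int:
    """Return the vocabulary id of a special marker such as '### End'."""
    token_ids = tokenizer.encode(key)
    if len(token_ids) > 1:
        raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}")
    return token_ids[0]

# Usage: the resulting id serves as the stop criterion during generation.
tokenizer = AutoTokenizer.from_pretrained("databricks/dolly-v2-3b")
end_key_token_id = get_special_token_id(tokenizer, "### End")
```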
] @@ -499,7 +499,7 @@ "id": "583202d2-6d29-4729-af2e-232d3ee0bc2c", "metadata": {}, "source": [ - "### Main generation function [$\\Uparrow$](#Table-of-content:)\n", + "### Main generation function [$\\Uparrow$](#Table-of-contents:)\n", "\n", "As it was discussed above, `run_generation` function is the entry point for starting generation. It gets provided input instruction as parameter and returns model response." ] @@ -568,7 +568,7 @@ "id": "562f2dcf-75ef-4554-85e3-e04f486776cc", "metadata": {}, "source": [ - "### Helpers for application [$\\Uparrow$](#Table-of-content:)\n", + "### Helpers for application [$\\Uparrow$](#Table-of-contents:)\n", "\n", "For making interactive user interface we will use Gradio library. The code bellow provides useful functions used for communication with UI elements." ] @@ -643,7 +643,7 @@ "id": "50d918a9-1cbe-49a5-85ad-5e370c8af7f5", "metadata": {}, "source": [ - "## Run instruction-following pipeline [$\\Uparrow$](#Table-of-content:)\n", + "## Run instruction-following pipeline [$\\Uparrow$](#Table-of-contents:)\n", "\n", "Now, we are ready to explore model capabilities. This demo provides a simple interface that allows communication with a model using text instruction. Type your instruction into the `User instruction` field or select one from predefined examples and click on the `Submit` button to start generation. Additionally, you can modify advanced generation parameters:\n", "\n", From 66ab81941dd6fded6b26f6ab28d78d66bc947806 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 22 Sep 2023 11:06:24 +0200 Subject: [PATCH 08/12] Tweak --- .../240-dolly-2-instruction-following.ipynb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb b/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb index 44ec80eb840..d3f058da7b9 100644 --- a/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb +++ b/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb @@ -37,7 +37,7 @@ "id": "f97c435a", "metadata": {}, "source": [ - "### Table of content:\n", + "### Table of contents:\n", "- [Prerequisites](#Prerequisites-$\\Uparrow$)\n", " - [Select inference device](#Select-inference-device-$\\Uparrow$)\n", "- [Download and Convert Model](#Download-and-Convert-Model-$\\Uparrow$)\n", @@ -69,8 +69,8 @@ "metadata": {}, "outputs": [], "source": [ - "# %pip install -q \"diffusers>=0.16.1\" \"transformers>=4.28.0\" \"openvino>=2023.1.0\"\n", - "# %pip install -q \"git+https://github.com/huggingface/optimum-intel.git\" datasets onnx onnxruntime gradio " + "%pip install -q \"diffusers>=0.16.1\" \"transformers>=4.28.0\" \"openvino>=2023.1.0\"\n", + "%pip install -q \"git+https://github.com/huggingface/optimum-intel.git\" datasets onnx onnxruntime gradio " ] }, { From b2137d8f94f4ef1bdd7ec714cdc5c9525994272a Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 22 Sep 2023 11:15:19 +0200 Subject: [PATCH 09/12] Fixed typo --- .../240-dolly-2-instruction-following.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb b/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb index d3f058da7b9..a926f2d40b1 100644 --- a/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb +++ 
b/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb @@ -228,7 +228,7 @@ "source": [ "### NNCF model weights compression [$\\Uparrow$](#Table-of-contents:)\n", "\n", - "NNCF [Weights Compression algorithm](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/CompressWeights.md) compresses weights of a model to `INT8`. This is an alternative to [Quantization algorithm](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/post_training/Quantization.md) that compresses both weights and activations. Weight compression is effective in optimizing footprint and performance of large models where the size of weights is significantly larger than the size of activations, for example, in Large Language Models (LLMs) such as Dolly 2.0. Additionaly, Weight Compression usually leads to almost no accuracy drop.\n", + "NNCF [Weights Compression algorithm](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/CompressWeights.md) compresses weights of a model to `INT8`. This is an alternative to [Quantization algorithm](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/post_training/Quantization.md) that compresses both weights and activations. Weight compression is effective in optimizing footprint and performance of large models where the size of weights is significantly larger than the size of activations, for example, in Large Language Models (LLMs) such as Dolly 2.0. Additionally, Weight Compression usually leads to almost no accuracy drop.\n", ">Note: Starting from OpenVINO 2023.2 weight compression will also have an effect when run on a GPU." ] }, From e4eec8bca1f56feefb99b14e2c7592c8adb03209 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 22 Sep 2023 14:09:24 +0200 Subject: [PATCH 10/12] Removed unnecessary changes --- .../240-dolly-2-instruction-following.ipynb | 82 ++++++------------- 1 file changed, 24 insertions(+), 58 deletions(-) diff --git a/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb b/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb index a926f2d40b1..5c327c1eeaa 100644 --- a/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb +++ b/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb @@ -107,9 +107,9 @@ ], "source": [ "import ipywidgets as widgets\n", - "from openvino.runtime import Core\n", + "import openvino as ov\n", "\n", - "core = Core()\n", + "core = ov.Core()\n", "\n", "device = widgets.Dropdown(\n", " options=core.available_devices + [\"AUTO\"],\n", @@ -163,41 +163,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda-11.7'\n", - "2023-09-22 10:38:02.161912: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. 
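[Editor's note] Besides dropping stale outputs, this commit migrates the notebook to the consolidated OpenVINO 2023.1 Python API, where the top-level `openvino` namespace replaces imports from `openvino.runtime`. A minimal before/after illustration (the printed device list is an example):

```python
# New-style import adopted by this commit:
import openvino as ov

core = ov.Core()
print(core.available_devices)  # e.g. ['CPU', 'GPU'], depending on the machine

# Legacy spelling removed by this commit (still functional, just discouraged):
# from openvino.runtime import Core
# core = Core()
```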
To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", - "2023-09-22 10:38:02.196302: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'\n", + "2023-09-19 18:04:40.914286: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2023-09-19 18:04:40.953392: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2023-09-22 10:38:02.850895: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", - "/home/nsavel/venvs/ov_notebooks/lib/python3.8/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations\n", + "2023-09-19 18:04:41.581851: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", + "/home/ea/work/ov_venv/lib/python3.8/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations\n", " warnings.warn(\n", - "Framework not specified. Using pt to export to ONNX.\n", - "Using framework PyTorch: 1.13.1+cpu\n", - "Overriding 1 configuration item(s)\n", - "\t- use_cache -> True\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.base has been moved to tensorflow.python.trackable.base. The old module will be deleted in version 2.11.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[ WARNING ] Please fix your imports. Module %s has been moved to %s. The old module will be deleted in version %s.\n", - "/home/nsavel/venvs/ov_notebooks/lib/python3.8/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:594: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", - " assert batch_size > 0, \"batch_size has to be defined and > 0\"\n", - "/home/nsavel/venvs/ov_notebooks/lib/python3.8/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:314: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", - " if seq_len > self.max_seq_len_cached:\n", - "/home/nsavel/venvs/ov_notebooks/lib/python3.8/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:239: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. 
We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", - " if key_length > self.bias.shape[-1]:\n", - "/home/nsavel/venvs/ov_notebooks/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py:74: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n", - " op1 = operator(*args, **kwargs)\n", "Compiling the model...\n", - "Set CACHE_DIR to /tmp/tmpl4eosgvf/model_cache\n" + "Set CACHE_DIR to dolly-v2-3b/model_cache\n" ] } ], @@ -666,7 +640,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_47554/3994661578.py:57: GradioDeprecationWarning: The `enable_queue` parameter has been deprecated. Please use the `.queue()` method instead.\n", + "/tmp/ipykernel_177871/2332051390.py:57: GradioDeprecationWarning: The `enable_queue` parameter has been deprecated. Please use the `.queue()` method instead.\n", " demo.launch(enable_queue=True, share=False, height=800)\n" ] }, @@ -693,7 +667,7 @@ } ], "source": [ - "available_devices = Core().available_devices + [\"AUTO\"]\n", + "available_devices = ov.Core().available_devices + [\"AUTO\"]\n", "\n", "examples = [\n", " \"Give me recipe for pizza with pineapple\",\n", @@ -753,19 +727,11 @@ " except Exception:\n", " demo.launch(enable_queue=True, share=True, height=800)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1454e6b8-b055-4349-b087-c84a59a3e75f", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -789,7 +755,15 @@ "widgets": { "application/vnd.jupyter.widget-state+json": { "state": { - "5bc9f8fc615a4cf7af5cb987afd0211d": { + "0685b60213644b5faa887230d63d2f9d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "5fe94d76fb364dd4ae8e6e39abe65cd7": { "model_module": "@jupyter-widgets/controls", "model_module_version": "2.0.0", "model_name": "DropdownModel", @@ -800,24 +774,16 @@ "AUTO" ], "description": "Device:", - "index": 2, - "layout": "IPY_MODEL_f16aa744c427462abd7b791b52a88676", - "style": "IPY_MODEL_ffce97ef6b86423d96b13e91a6dd913d" + "index": 0, + "layout": "IPY_MODEL_7cd80d2c20ad4e7384ee8c822ff61d04", + "style": "IPY_MODEL_0685b60213644b5faa887230d63d2f9d" } }, - "f16aa744c427462abd7b791b52a88676": { + "7cd80d2c20ad4e7384ee8c822ff61d04": { "model_module": "@jupyter-widgets/base", "model_module_version": "2.0.0", "model_name": "LayoutModel", "state": {} - }, - "ffce97ef6b86423d96b13e91a6dd913d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } } }, "version_major": 2, From 9b8e4e48519086ef33c9a50cea53772aa6b6731e Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 25 Sep 2023 10:27:49 +0200 Subject: [PATCH 11/12] Fixed a note --- .../240-dolly-2-instruction-following.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb 
b/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb index 5c327c1eeaa..b60ad1d117b 100644 --- a/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb +++ b/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb @@ -203,7 +203,7 @@ "### NNCF model weights compression [$\\Uparrow$](#Table-of-contents:)\n", "\n", "NNCF [Weights Compression algorithm](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/CompressWeights.md) compresses weights of a model to `INT8`. This is an alternative to [Quantization algorithm](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/post_training/Quantization.md) that compresses both weights and activations. Weight compression is effective in optimizing footprint and performance of large models where the size of weights is significantly larger than the size of activations, for example, in Large Language Models (LLMs) such as Dolly 2.0. Additionally, Weight Compression usually leads to almost no accuracy drop.\n", - ">Note: Starting from OpenVINO 2023.2 weight compression will also have an effect when run on a GPU." + ">Note: In OpenVINO 2023.1 release weight comprssion is supported only for CPU. When GPU device is selected, it is recommended to disable weight compression. " ] }, { From a6187a0ce9258790a8529c7e16f76b49aad67b5b Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 25 Sep 2023 10:29:53 +0200 Subject: [PATCH 12/12] Fixed typo --- .../240-dolly-2-instruction-following.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb b/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb index b60ad1d117b..e26b8333861 100644 --- a/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb +++ b/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb @@ -203,7 +203,7 @@ "### NNCF model weights compression [$\\Uparrow$](#Table-of-contents:)\n", "\n", "NNCF [Weights Compression algorithm](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/CompressWeights.md) compresses weights of a model to `INT8`. This is an alternative to [Quantization algorithm](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/post_training/Quantization.md) that compresses both weights and activations. Weight compression is effective in optimizing footprint and performance of large models where the size of weights is significantly larger than the size of activations, for example, in Large Language Models (LLMs) such as Dolly 2.0. Additionally, Weight Compression usually leads to almost no accuracy drop.\n", - ">Note: In OpenVINO 2023.1 release weight comprssion is supported only for CPU. When GPU device is selected, it is recommended to disable weight compression. " + ">Note: In OpenVINO 2023.1 release weight compression is supported only for CPU. When GPU device is selected, it is recommended to disable weight compression. " ] }, {
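[Editor's note] To act on the corrected note in code, one could default the compression dropdown off whenever a GPU device is selected. This is a hypothetical guard, not part of this patch; `device` refers to the dropdown widget created in the Prerequisites section of the notebook:

```python
import ipywidgets as widgets

# Weight compression only takes effect on CPU in OpenVINO 2023.1, so
# default to 'Disable' whenever the selected inference device is a GPU.
use_compression = "GPU" not in device.value
to_compress = widgets.Dropdown(
    options=["Disable", "Enable"],
    value="Enable" if use_compression else "Disable",
    description="Compression:",
    disabled=False,
)
```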