
Commit 88a4184

Merge branch 'main' into ch05-qwen3
2 parents 4f87edf + 9df9e69

File tree: 11 files changed, +214 -80 lines


.github/workflows/check-links.yml

Lines changed: 6 additions & 0 deletions
@@ -27,12 +27,18 @@ jobs:
           uv add pytest-check-links
 
       - name: Check links
+        env:
+          CHECK_LINKS_TIMEOUT: "10"
        run: |
          source .venv/bin/activate
          pytest --check-links ./ \
            --check-links-ignore "https://platform.openai.com/*" \
            --check-links-ignore "https://openai.com/*" \
            --check-links-ignore "https://arena.lmsys.org" \
+           --check-links-ignore "https?://localhost(:\\d+)?/.*" \
+           --check-links-ignore "https?://127[.]0[.]0[.]1(:\\d+)?/.*" \
+           --check-links-ignore "https://mng\\.bz/.*" \
+           --check-links-ignore "https://github\\.com/.*" \
            --check-links-ignore "https://unsloth.ai/blog/gradient" \
            --check-links-ignore "https://www.reddit.com/r/*" \
            --check-links-ignore "https://code.visualstudio.com/*" \
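For reference, the four new ignore entries are regular expressions (hence the double-escaped \\d and \\. required by YAML) rather than the glob-style patterns used in the neighboring entries. The snippet below is only an illustration of what those regexes match: it exercises the patterns with Python's re module, does not reproduce pytest-check-links' internal matching logic, and the example URLs are made up.

import re

# The four newly added ignore patterns, with the YAML double-escapes
# ("\\d", "\\.") reduced to single backslashes.
ignore_patterns = [
    r"https?://localhost(:\d+)?/.*",
    r"https?://127[.]0[.]0[.]1(:\d+)?/.*",
    r"https://mng\.bz/.*",
    r"https://github\.com/.*",
]

# Hypothetical URLs of the kind the link checker should now skip.
example_urls = [
    "http://localhost:8888/notebooks/ch02.ipynb",
    "http://127.0.0.1:6006/",
    "https://mng.bz/placeholder",
    "https://github.com/rasbt/LLMs-from-scratch",
]

for url in example_urls:
    skipped = any(re.fullmatch(p, url) for p in ignore_patterns)
    print(f"{url} -> {'skipped' if skipped else 'checked'}")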

appendix-A/01_main-chapter-code/code-part1.ipynb

Lines changed: 20 additions & 20 deletions
@@ -46,7 +46,7 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"2.4.0\n"
+"2.9.1\n"
 ]
 }
 ],
@@ -658,13 +658,13 @@
 "output_type": "stream",
 "text": [
 "Parameter containing:\n",
-"tensor([[ 0.1182, 0.0606, -0.1292, ..., -0.1126, 0.0735, -0.0597],\n",
-" [-0.0249, 0.0154, -0.0476, ..., -0.1001, -0.1288, 0.1295],\n",
-" [ 0.0641, 0.0018, -0.0367, ..., -0.0990, -0.0424, -0.0043],\n",
+"tensor([[ 0.0979, 0.0412, 0.1005, ..., -0.0544, -0.0804, 0.0842],\n",
+" [-0.0115, 0.0382, -0.0261, ..., 0.0573, 0.1094, 0.1364],\n",
+" [ 0.0162, -0.0050, 0.0752, ..., 0.1298, 0.1250, -0.0117],\n",
 " ...,\n",
-" [ 0.0618, 0.0867, 0.1361, ..., -0.0254, 0.0399, 0.1006],\n",
-" [ 0.0842, -0.0512, -0.0960, ..., -0.1091, 0.1242, -0.0428],\n",
-" [ 0.0518, -0.1390, -0.0923, ..., -0.0954, -0.0668, -0.0037]],\n",
+" [-0.0312, 0.1319, -0.0954, ..., -0.1066, -0.0970, -0.0373],\n",
+" [ 0.0563, -0.1373, -0.1226, ..., 0.0154, -0.0969, 0.0113],\n",
+" [-0.0872, -0.0098, 0.0322, ..., -0.0108, 0.1091, -0.1043]],\n",
 " requires_grad=True)\n"
 ]
 }
@@ -1002,12 +1002,12 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"Epoch: 001/003 | Batch 000/002 | Train/Val Loss: 0.75\n",
-"Epoch: 001/003 | Batch 001/002 | Train/Val Loss: 0.65\n",
-"Epoch: 002/003 | Batch 000/002 | Train/Val Loss: 0.44\n",
-"Epoch: 002/003 | Batch 001/002 | Train/Val Loss: 0.13\n",
-"Epoch: 003/003 | Batch 000/002 | Train/Val Loss: 0.03\n",
-"Epoch: 003/003 | Batch 001/002 | Train/Val Loss: 0.00\n"
+"Epoch: 001/003 | Batch 001/002 | Train/Val Loss: 0.75\n",
+"Epoch: 001/003 | Batch 002/002 | Train/Val Loss: 0.65\n",
+"Epoch: 002/003 | Batch 001/002 | Train/Val Loss: 0.44\n",
+"Epoch: 002/003 | Batch 002/002 | Train/Val Loss: 0.13\n",
+"Epoch: 003/003 | Batch 001/002 | Train/Val Loss: 0.03\n",
+"Epoch: 003/003 | Batch 002/002 | Train/Val Loss: 0.00\n"
 ]
 }
 ],
@@ -1036,7 +1036,7 @@
 " \n",
 " ### LOGGING\n",
 " print(f\"Epoch: {epoch+1:03d}/{num_epochs:03d}\"\n",
-" f\" | Batch {batch_idx:03d}/{len(train_loader):03d}\"\n",
+" f\" | Batch {batch_idx+1:03d}/{len(train_loader):03d}\"\n",
 " f\" | Train/Val Loss: {loss:.2f}\")\n",
 "\n",
 " model.eval()\n",
@@ -1080,11 +1080,11 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"tensor([[ 0.9991, 0.0009],\n",
-" [ 0.9982, 0.0018],\n",
-" [ 0.9949, 0.0051],\n",
-" [ 0.0491, 0.9509],\n",
-" [ 0.0307, 0.9693]])\n",
+"tensor([[0.9991, 0.0009],\n",
+" [0.9982, 0.0018],\n",
+" [0.9949, 0.0051],\n",
+" [0.0491, 0.9509],\n",
+" [0.0307, 0.9693]])\n",
 "tensor([0, 0, 0, 1, 1])\n"
 ]
 }
@@ -1340,7 +1340,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.11.4"
+"version": "3.11.11"
 }
 },
 "nbformat": 4,

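The only code change in this notebook's diff is the switch from batch_idx to batch_idx+1 in the logging line, which makes the batch counter 1-based and consistent with the 1-based epoch counter; the refreshed cell outputs above reflect that. A minimal standalone sketch with dummy loss values (not the notebook's actual model or data loader) reproduces the new format:

# Dummy stand-ins: two "batches" per epoch and hard-coded loss values,
# just to show the 1-based batch numbering in the log line.
num_epochs = 3
train_loader = [None, None]
dummy_losses = iter([0.75, 0.65, 0.44, 0.13, 0.03, 0.00])

for epoch in range(num_epochs):
    for batch_idx, _ in enumerate(train_loader):
        loss = next(dummy_losses)
        print(f"Epoch: {epoch+1:03d}/{num_epochs:03d}"
              f" | Batch {batch_idx+1:03d}/{len(train_loader):03d}"
              f" | Train/Val Loss: {loss:.2f}")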
appendix-A/01_main-chapter-code/code-part2.ipynb

Lines changed: 9 additions & 9 deletions
@@ -301,7 +301,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 10,
+"execution_count": null,
 "metadata": {
 "colab": {
 "base_uri": "https://localhost:8080/"
@@ -314,12 +314,12 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"Epoch: 001/003 | Batch 000/002 | Train/Val Loss: 0.75\n",
-"Epoch: 001/003 | Batch 001/002 | Train/Val Loss: 0.65\n",
-"Epoch: 002/003 | Batch 000/002 | Train/Val Loss: 0.44\n",
-"Epoch: 002/003 | Batch 001/002 | Train/Val Loss: 0.13\n",
-"Epoch: 003/003 | Batch 000/002 | Train/Val Loss: 0.03\n",
-"Epoch: 003/003 | Batch 001/002 | Train/Val Loss: 0.00\n"
+"Epoch: 001/003 | Batch 001/002 | Train/Val Loss: 0.75\n",
+"Epoch: 001/003 | Batch 002/002 | Train/Val Loss: 0.65\n",
+"Epoch: 002/003 | Batch 001/002 | Train/Val Loss: 0.44\n",
+"Epoch: 002/003 | Batch 002/002 | Train/Val Loss: 0.13\n",
+"Epoch: 003/003 | Batch 001/002 | Train/Val Loss: 0.03\n",
+"Epoch: 003/003 | Batch 002/002 | Train/Val Loss: 0.00\n"
 ]
 }
 ],
@@ -355,7 +355,7 @@
 "\n",
 " ### LOGGING\n",
 " print(f\"Epoch: {epoch+1:03d}/{num_epochs:03d}\"\n",
-" f\" | Batch {batch_idx:03d}/{len(train_loader):03d}\"\n",
+" f\" | Batch {batch_idx+1:03d}/{len(train_loader):03d}\"\n",
 " f\" | Train/Val Loss: {loss:.2f}\")\n",
 "\n",
 " model.eval()\n",
@@ -493,7 +493,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.10.16"
+"version": "3.11.11"
 }
 },
 "nbformat": 4,

ch02/05_bpe-from-scratch/bpe-from-scratch-simple.ipynb

Lines changed: 2 additions & 2 deletions
@@ -36,7 +36,7 @@
 "- This is a standalone notebook implementing the popular byte pair encoding (BPE) tokenization algorithm, which is used in models like GPT-2 to GPT-4, Llama 3, etc., from scratch for educational purposes\n",
 "- For more details about the purpose of tokenization, please refer to [Chapter 2](https://github.com/rasbt/LLMs-from-scratch/blob/main/ch02/01_main-chapter-code/ch02.ipynb); this code here is bonus material explaining the BPE algorithm\n",
 "- The original BPE tokenizer that OpenAI implemented for training the original GPT models can be found [here](https://github.com/openai/gpt-2/blob/master/src/encoder.py)\n",
-"- The BPE algorithm was originally described in 1994: \"[A New Algorithm for Data Compression](http://www.pennelynn.com/Documents/CUJ/HTML/94HTML/19940045.HTM)\" by Philip Gage\n",
+"- The BPE algorithm was originally described in 1994: \"[A New Algorithm for Data Compression](https://github.com/tpn/pdfs/blob/master/A%20New%20Algorithm%20for%20Data%20Compression%20(1994).pdf)\" by Philip Gage\n",
 "- Most projects, including Llama 3, nowadays use OpenAI's open-source [tiktoken library](https://github.com/openai/tiktoken) due to its computational performance; it allows loading pretrained GPT-2 and GPT-4 tokenizers, for example (the Llama 3 models were trained using the GPT-4 tokenizer as well)\n",
 "- The difference between the implementations above and my implementation in this notebook, besides it being is that it also includes a function for training the tokenizer (for educational purposes)\n",
 "- There's also an implementation called [minBPE](https://github.com/karpathy/minbpe) with training support, which is maybe more performant (my implementation here is focused on educational purposes); in contrast to `minbpe` my implementation additionally allows loading the original OpenAI tokenizer vocabulary and merges"
@@ -253,7 +253,7 @@
 "id": "8c0d4420-a4c7-4813-916a-06f4f46bc3f0",
 "metadata": {},
 "source": [
-"- The BPE algorithm was originally described in 1994: \"[A New Algorithm for Data Compression](http://www.pennelynn.com/Documents/CUJ/HTML/94HTML/19940045.HTM)\" by Philip Gage\n",
+"- The BPE algorithm was originally described in 1994: \"[A New Algorithm for Data Compression](https://github.com/tpn/pdfs/blob/master/A%20New%20Algorithm%20for%20Data%20Compression%20(1994).pdf)\" by Philip Gage\n",
 "- Before we get to the actual code implementation, the form that is used for LLM tokenizers today can be summarized as follows:"
 ]
 },

ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb

Lines changed: 2 additions & 2 deletions
@@ -36,7 +36,7 @@
 "- This is a standalone notebook implementing the popular byte pair encoding (BPE) tokenization algorithm, which is used in models like GPT-2 to GPT-4, Llama 3, etc., from scratch for educational purposes\n",
 "- For more details about the purpose of tokenization, please refer to [Chapter 2](https://github.com/rasbt/LLMs-from-scratch/blob/main/ch02/01_main-chapter-code/ch02.ipynb); this code here is bonus material explaining the BPE algorithm\n",
 "- The original BPE tokenizer that OpenAI implemented for training the original GPT models can be found [here](https://github.com/openai/gpt-2/blob/master/src/encoder.py)\n",
-"- The BPE algorithm was originally described in 1994: \"[A New Algorithm for Data Compression](http://www.pennelynn.com/Documents/CUJ/HTML/94HTML/19940045.HTM)\" by Philip Gage\n",
+"- The BPE algorithm was originally described in 1994: \"[A New Algorithm for Data Compression](https://github.com/tpn/pdfs/blob/master/A%20New%20Algorithm%20for%20Data%20Compression%20(1994).pdf)\" by Philip Gage\n",
 "- Most projects, including Llama 3, nowadays use OpenAI's open-source [tiktoken library](https://github.com/openai/tiktoken) due to its computational performance; it allows loading pretrained GPT-2 and GPT-4 tokenizers, for example (the Llama 3 models were trained using the GPT-4 tokenizer as well)\n",
 "- The difference between the implementations above and my implementation in this notebook, besides it being is that it also includes a function for training the tokenizer (for educational purposes)\n",
 "- There's also an implementation called [minBPE](https://github.com/karpathy/minbpe) with training support, which is maybe more performant (my implementation here is focused on educational purposes); in contrast to `minbpe` my implementation additionally allows loading the original OpenAI tokenizer vocabulary and BPE \"merges\" (additionally, Hugging Face tokenizers are also capable of training and loading various tokenizers; see [this GitHub discussion](https://github.com/rasbt/LLMs-from-scratch/discussions/485) by a reader who trained a BPE tokenizer on the Nepali language for more info)"
@@ -245,7 +245,7 @@
 "id": "8c0d4420-a4c7-4813-916a-06f4f46bc3f0",
 "metadata": {},
 "source": [
-"- The BPE algorithm was originally described in 1994: \"[A New Algorithm for Data Compression](http://www.pennelynn.com/Documents/CUJ/HTML/94HTML/19940045.HTM)\" by Philip Gage\n",
+"- The BPE algorithm was originally described in 1994: \"[A New Algorithm for Data Compression](https://github.com/tpn/pdfs/blob/master/A%20New%20Algorithm%20for%20Data%20Compression%20(1994).pdf)\" by Philip Gage\n",
 "- Before we get to the actual code implementation, the form that is used for LLM tokenizers today can be summarized as described in the following sections."
 ]
 },

ch05/13_olmo3/standalone-olmo3-plus-kv-cache.ipynb

Lines changed: 55 additions & 17 deletions
@@ -206,25 +206,60 @@
 },
 "outputs": [],
 "source": [
-"def compute_rope_params(head_dim, theta_base=10_000, context_length=4096, attention_factor=1.0, rope_type=\"default\", rope_factor=1.0, rope_orig_max=8192, dtype=torch.float32):\n",
+"import math\n",
+"\n",
+"\n",
+"def compute_rope_params(head_dim, theta_base=10_000, context_length=4096, attention_factor=1.0, rope_type=\"default\", rope_factor=1.0, rope_orig_max=8192, beta_fast=32.0, beta_slow=1.0, dtype=torch.float32):\n",
 " assert head_dim % 2 == 0, \"Embedding dimension must be even\"\n",
 "\n",
-" # Compute the inverse frequencies\n",
-" inv_freq = 1.0 / (\n",
-" theta_base ** (\n",
-" torch.arange(0, head_dim, 2, dtype=dtype)[: head_dim // 2].float()\n",
-" / head_dim\n",
+" if rope_type == \"yarn\":\n",
+" # Compute YaRN-style frequency scaling (as per https://huggingface.co/papers/2309.00071)\n",
+"\n",
+" def find_correction_dim(num_rotations, dim, base, max_position_embeddings):\n",
+" \"\"\"Inverse dimension formula to find the dimension based on the number of rotations\"\"\"\n",
+" return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base))\n",
+"\n",
+" def find_correction_range(low_rot, high_rot, dim, base, max_position_embeddings):\n",
+" \"\"\"Find dimension range bounds based on rotations\"\"\"\n",
+" low = find_correction_dim(low_rot, dim, base, max_position_embeddings)\n",
+" high = find_correction_dim(high_rot, dim, base, max_position_embeddings)\n",
+" low = math.floor(low)\n",
+" high = math.ceil(high)\n",
+" return max(low, 0), min(high, dim - 1)\n",
+"\n",
+" def linear_ramp_factor(min_val, max_val, dim):\n",
+" if min_val == max_val:\n",
+" max_val += 0.001 # Prevent singularity\n",
+" linear_func = (torch.arange(dim, dtype=torch.float32) - min_val) / (max_val - min_val)\n",
+" ramp_func = torch.clamp(linear_func, 0, 1)\n",
+" return ramp_func\n",
+"\n",
+" # Base frequencies\n",
+" pos_freqs = theta_base ** (torch.arange(0, head_dim, 2, dtype=dtype) / head_dim)\n",
+" inv_freq_extrapolation = 1.0 / pos_freqs # No scaling (extrapolation)\n",
+" inv_freq_interpolation = 1.0 / (rope_factor * pos_freqs) # With scaling (interpolation)\n",
+"\n",
+" # Find the range where we blend between interpolation and extrapolation\n",
+" low, high = find_correction_range(beta_fast, beta_slow, head_dim, theta_base, rope_orig_max)\n",
+"\n",
+" # Get n-dimensional rotational scaling corrected for extrapolation\n",
+" inv_freq_extrapolation_factor = 1 - linear_ramp_factor(low, high, head_dim // 2).to(dtype=dtype)\n",
+" inv_freq = (\n",
+" inv_freq_interpolation * (1 - inv_freq_extrapolation_factor)\n",
+" + inv_freq_extrapolation * inv_freq_extrapolation_factor\n",
+" )\n",
+" else:\n",
+" # Default RoPE\n",
+" inv_freq = 1.0 / (\n",
+" theta_base ** (\n",
+" torch.arange(0, head_dim, 2, dtype=dtype)[: head_dim // 2].float()\n",
+" / head_dim\n",
+" )\n",
 " )\n",
-" )\n",
 "\n",
 " # Generate position indices\n",
 " positions = torch.arange(context_length, dtype=dtype)\n",
 "\n",
-" # Optional YaRN scaling\n",
-" if rope_type == \"yarn\":\n",
-" positions = positions / rope_factor\n",
-" positions = torch.clamp(positions, max=rope_orig_max - 1)\n",
-"\n",
 " # Compute the base angles (shape: [context_length, head_dim // 2])\n",
 " angles = positions.unsqueeze(1) * inv_freq.unsqueeze(0)\n",
 "\n",
@@ -642,6 +677,8 @@
 " \"rope_type\": \"yarn\",\n",
 " \"rope_factor\": 8.0,\n",
 " \"rope_orig_max\": 8_192,\n",
+" \"beta_fast\": 32.0,\n",
+" \"beta_slow\": 1.0,\n",
 " \"rms_norm_eps\": 1e-6,\n",
 " \"dtype\": torch.bfloat16,\n",
 " \"eos_token_id\": 100_257,\n",
@@ -727,6 +764,8 @@
 " \"rope_type\": \"yarn\",\n",
 " \"rope_factor\": 8.0,\n",
 " \"rope_orig_max\": 8_192,\n",
+" \"beta_fast\": 32.0,\n",
+" \"beta_slow\": 1.0,\n",
 " \"rms_norm_eps\": 1e-6,\n",
 " \"dtype\": torch.bfloat16,\n",
 " \"eos_token_id\": 100_257,\n",
@@ -810,9 +849,9 @@
 {
 "data": {
 "text/plain": [
-"tensor([[[ 0.3594, -0.6289, -0.2754, ..., 1.1016, 0.4219, 0.0381],\n",
-" [ 1.1719, 0.0283, 0.6055, ..., 0.4863, -0.1953, 0.2246],\n",
-" [ 0.4902, -0.0425, 0.6758, ..., 0.3730, -0.5781, -0.1670]]],\n",
+"tensor([[[ 0.3867, -0.6328, -0.2734, ..., 1.1484, 0.4258, 0.0400],\n",
+" [ 1.2734, 0.0040, 0.5000, ..., 0.5625, -0.2383, 0.1855],\n",
+" [ 0.5859, -0.0540, 0.7930, ..., 0.3262, -0.5430, -0.1494]]],\n",
 " dtype=torch.bfloat16, grad_fn=<UnsafeViewBackward0>)"
 ]
 },
@@ -1202,8 +1241,7 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"Sure! Here’s a brief introduction to large language models: \n",
-"Large models are advanced AI systems trained to process vast neural networks capable of understanding and generating text, learning from vast amounts of data, learning language, performing diverse tasks, assisting in many applications, and adapting various tasks.\n",
+"Large language models are advanced AI systems trained on vast amounts of text to understand and generate human-like language. They can perform a wide range of tasks, from answering questions to writing essays or code. These models have transformed natural language processing and are now foundational in many modern AI applications.\n",
 "\n",
 "GPU memory used: 13.71 GB\n"
 ]
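The main change in this notebook is the full YaRN frequency computation inside compute_rope_params, replacing the earlier position-rescaling shortcut. The condensed sketch below pulls just the inverse-frequency blend out of the hunk above into a self-contained helper (the name yarn_inv_freq and the final print lines are illustrative additions, not part of the notebook); it shows how fast-rotating (low-index) dimensions keep their original frequencies while slow-rotating ones are divided by rope_factor.

import math
import torch


def yarn_inv_freq(head_dim=128, theta_base=10_000, rope_factor=8.0,
                  rope_orig_max=8_192, beta_fast=32.0, beta_slow=1.0,
                  dtype=torch.float32):
    # Condensed from the YaRN branch added to compute_rope_params above.
    def find_correction_dim(num_rotations, dim, base, max_position_embeddings):
        # Dimension index at which a given number of full rotations occurs
        return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base))

    pos_freqs = theta_base ** (torch.arange(0, head_dim, 2, dtype=dtype) / head_dim)
    inv_freq_extrapolation = 1.0 / pos_freqs                  # original (unscaled) frequencies
    inv_freq_interpolation = 1.0 / (rope_factor * pos_freqs)  # position-interpolated frequencies

    # Dimension range over which we blend between the two regimes
    low = max(math.floor(find_correction_dim(beta_fast, head_dim, theta_base, rope_orig_max)), 0)
    high = min(math.ceil(find_correction_dim(beta_slow, head_dim, theta_base, rope_orig_max)), head_dim - 1)
    if low == high:
        high += 0.001  # prevent division by zero in the ramp

    ramp = torch.clamp((torch.arange(head_dim // 2, dtype=torch.float32) - low) / (high - low), 0, 1)
    extrapolation_factor = 1 - ramp.to(dtype)

    # Low (fast-rotating) dims keep original freqs; high (slow) dims get interpolated ones
    return (inv_freq_interpolation * (1 - extrapolation_factor)
            + inv_freq_extrapolation * extrapolation_factor)


inv_freq = yarn_inv_freq()
print(inv_freq[:3])   # ~1/theta^(2i/d): unchanged for the fastest dimensions
print(inv_freq[-3:])  # ~1/(rope_factor * theta^(2i/d)): scaled down for the slowest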
