From 6e3abf86ff3e82b8036fc37b73aad32709a88298 Mon Sep 17 00:00:00 2001
From: Tarun Karuturi <tkaruturi@fb.com>
Date: Mon, 3 Feb 2025 13:33:59 -0800
Subject: [PATCH] Adding instructions on how to run the Deepseek R1 Distill
 Llama 8B model

---
 .../deepseek-r1-distill-llama-8B/README.md    | 72 +++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 examples/models/deepseek-r1-distill-llama-8B/README.md

diff --git a/examples/models/deepseek-r1-distill-llama-8B/README.md b/examples/models/deepseek-r1-distill-llama-8B/README.md
new file mode 100644
index 00000000000..3a7a723c73b
--- /dev/null
+++ b/examples/models/deepseek-r1-distill-llama-8B/README.md
@@ -0,0 +1,72 @@
+# Summary
+This example demonstrates how to run the [DeepSeek R1 Distill Llama 8B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B) model via ExecuTorch. The architecture of this distilled model is exactly the same as Llama, so all the instructions in the [Llama README](../llama/README.md) apply as is.
+
+# Instructions
+## Step 1: Setup
+1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. For the installation, run `./install_executorch.sh`.
+
+2. Run the installation step for Llama specific requirements
+```
+./examples/models/llama/install_requirements.sh
+```
+
+## Step 2: Prepare and run the model
+1. Download the model
+```
+pip install -U "huggingface_hub[cli]"
+huggingface-cli download deepseek-ai/DeepSeek-R1-Distill-Llama-8B --local-dir /target_dir/DeepSeek-R1-Distill-Llama-8B --local-dir-use-symlinks False
+```
+
+2. Download the [tokenizer.model](https://huggingface.co/meta-llama/Llama-3.1-8B/blob/main/original/tokenizer.model) from the Llama 3.1 repo. It will be needed later when running the model with the runtime.
+
+3. Convert the model to a `.pth` file.
+```
+pip install torchtune
+```
+
+Run this Python code:
+```
+from torchtune.models import convert_weights
+from torchtune.training import FullModelHFCheckpointer
+import torch
+
+# Convert from safetensors to TorchTune. Suppose the model has been downloaded from Hugging Face
+checkpointer = FullModelHFCheckpointer(
+    checkpoint_dir='/target_dir/DeepSeek-R1-Distill-Llama-8B',
+    checkpoint_files=['model-00001-of-000002.safetensors', 'model-00002-of-000002.safetensors'],
+    output_dir='/tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/',
+    model_type='LLAMA3' # or other types that TorchTune supports
+)
+
+print("loading checkpoint")
+sd = checkpointer.load_checkpoint()
+
+# Convert from TorchTune to Meta (PyTorch native)
+sd = convert_weights.tune_to_meta(sd['model'])
+
+print("saving checkpoint")
+torch.save(sd, "/tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth")
+```
+
+4. Download and save the params.json file
+```
+wget https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/resolve/main/original/params.json -O /tmp/params.json
+```
+
+5. Generate a PTE file for use with the Llama runner.
+```
+python -m examples.models.llama.export_llama \
+    --checkpoint /tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth \
+	-p /tmp/params.json \
+	-kv \
+	--use_sdpa_with_kv_cache \
+	-X \
+	-qmode 8da4w \
+	--group_size 128 \
+	-d fp16 \
+	--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+	--embedding-quantize 4,32 \
+	--output_name="DeepSeek-R1-Distill-Llama-8B.pte"
+```
+
+6. Run the model on your desktop for validation or integrate with iOS/Android apps. Instructions for these are available in the Llama [README](../llama/README.md) starting at Step 3.