forked from karpathy/llama2.c
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrain.yaml
More file actions
50 lines (37 loc) · 1.17 KB
/
train.yaml
File metadata and controls
50 lines (37 loc) · 1.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# Pretrain a Llama2-like model on SkyPilot.
# Launch with: sky launch -c train train.yaml
name: train

resources:
  # Single V100 GPU; quote to keep the "TYPE:COUNT" form unambiguous.
  accelerators: "V100:1"

num_nodes: 1

# Values are injected from the local environment / CLI at launch time
# (e.g. sky launch --env HF_TOKEN=...). Kept as quoted strings so YAML
# never retypes them.
envs:
  HF_TOKEN: ""
  WANDB_API_KEY: ""
  GIT_USERNAME: ""
  GIT_EMAIL: ""
  CONFIG: "softmax1-15m"

# Sync the current directory to ~/sky_workdir on the remote node.
workdir: .

setup: |
  # Reuse the "agi" conda env across restarts; create it on first boot.
  conda activate agi
  if [ $? -ne 0 ]; then
    conda create -n agi python=3.9 -y
    conda activate agi
  fi
  git config --global credential.helper store
  # Quote expansions: user names / emails may contain spaces.
  git config --global user.name "$GIT_USERNAME"
  git config --global user.email "$GIT_EMAIL"
  # -y: setup runs non-interactively; an unanswered apt prompt would hang it.
  sudo apt install -y git-lfs
  git lfs install
  # Convenience alias for interactive SSH sessions only.
  echo 'alias skyy="cd ~/sky_workdir && conda activate agi"' >> ~/.bashrc
  source ~/.bashrc
  cd ~/sky_workdir
  pip install -r requirements.txt
  python tinystories.py download
  python tinystories.py pretokenize
  python login.py  # Load from .env: export $(cat .env | xargs)

run: |
  set -e  # Exit if any command failed.
  # Inline the "skyy" alias: aliases from ~/.bashrc are not expanded in
  # non-interactive shells, so calling `skyy` here would fail under set -e.
  cd ~/sky_workdir
  conda activate agi
  export OMP_NUM_THREADS=4  # N CPU threads per distributed process: https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html#utilize-openmp
  python train.py config/$CONFIG.py
  # On local: rsync -Pavz train:/home/gcpuser/sky_workdir/out ./out
  # multigpu: tmux -> torchrun --standalone --nproc_per_node=N train.py config/$CONFIG.py