forked from karpathy/llama2.c
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrain.yaml
More file actions
50 lines (37 loc) · 1.17 KB
/
train.yaml
File metadata and controls
50 lines (37 loc) · 1.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# Pretrain a Llama2-like model on SkyPilot.
# Launch with: sky launch -c train train.yaml
name: train

resources:
  # Single V100 GPU; quote to keep the "TYPE:COUNT" form unambiguous.
  accelerators: "V100:1"

num_nodes: 1

# Values are injected from the local environment / CLI at launch time
# (e.g. sky launch --env HF_TOKEN=...). Kept as quoted strings so YAML
# never retypes them.
envs:
  HF_TOKEN: ""
  WANDB_API_KEY: ""
  GIT_USERNAME: ""
  GIT_EMAIL: ""
  CONFIG: "softmax1-15m"

# Sync the current directory to ~/sky_workdir on the remote node.
workdir: .

setup: |
  # Reuse the "agi" conda env across restarts; create it on first boot.
  conda activate agi
  if [ $? -ne 0 ]; then
    conda create -n agi python=3.9 -y
    conda activate agi
  fi
  git config --global credential.helper store
  # Quote expansions: user names / emails may contain spaces.
  git config --global user.name "$GIT_USERNAME"
  git config --global user.email "$GIT_EMAIL"
  # -y: setup runs non-interactively; an unanswered apt prompt would hang it.
  sudo apt install -y git-lfs
  git lfs install
  # Convenience alias for interactive SSH sessions only.
  echo 'alias skyy="cd ~/sky_workdir && conda activate agi"' >> ~/.bashrc
  source ~/.bashrc
  cd ~/sky_workdir
  pip install -r requirements.txt
  python tinystories.py download
  python tinystories.py pretokenize
  python login.py  # Load from .env: export $(cat .env | xargs)

run: |
  set -e  # Exit if any command failed.
  # Inline the "skyy" alias: aliases from ~/.bashrc are not expanded in
  # non-interactive shells, so calling `skyy` here would fail under set -e.
  cd ~/sky_workdir
  conda activate agi
  export OMP_NUM_THREADS=4  # N CPU threads per distributed process: https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html#utilize-openmp
  python train.py config/$CONFIG.py
  # On local: rsync -Pavz train:/home/gcpuser/sky_workdir/out ./out
  # multigpu: tmux -> torchrun --standalone --nproc_per_node=N train.py config/$CONFIG.py