Skip to content

Commit 41e2d14

Browse files
committed
feat(template): complete cifar10 classification (#118)
* feat(template): complete cifar10 classification * feat: add log_every_iters in config.yaml and omegaconf req * chore: use idist.Parallel, spawn_kwargs -> kwargs * feat: add support for argparse and hydra Add configuration library selection in tab Templates * fix: merge config in json file * fix!: keep yaml/hydra combo, add logger pkg in requirements.txt remove json/argparse combo since json.load will result an error for plain `int` and argparse will fail to call string "int" for given inputs. * fix: fix cmd run script in readme, add node_rank * fix: disable hydra logging and outputs path
1 parent 3afba9e commit 41e2d14

21 files changed

+554
-193
lines changed

.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,5 @@ dist-ssr
55
*.local
66
__pycache__
77
*.log
8-
.vscode
8+
.vscode
9+
*.tar.gz

scripts/check_copies.py

+21
Original file line numberDiff line numberDiff line change
@@ -21,5 +21,26 @@ def check_utils():
2121
print(red, "Unmatched", file, reset)
2222

2323

24+
def check_readme():
25+
red = "\033[31m"
26+
green = "\033[32m"
27+
reset = "\033[0m"
28+
29+
with open("./src/templates/template-common/README.md", "r") as f:
30+
common_utils = f.read()
31+
32+
path = Path("./src/templates/")
33+
34+
for file in path.rglob("**/README.md"):
35+
utils = file.read_text("utf-8")
36+
if utils.find(common_utils) > -1:
37+
print(green, "Matched", file, reset)
38+
else:
39+
print(red, "Unmatched", file, reset)
40+
41+
2442
if __name__ == "__main__":
2543
check_utils()
44+
print()
45+
check_readme()
46+
print()

src/components/CodeBlock.vue

+2
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
<script>
1919
import { highlight, languages } from 'prismjs'
2020
import 'prismjs/components/prism-json'
21+
import 'prismjs/components/prism-yaml'
2122
import 'prismjs/components/prism-python'
2223
import 'prismjs/components/prism-markdown'
2324
import 'prismjs/themes/prism-tomorrow.css'
@@ -162,6 +163,7 @@ div[class~='language-bash']::before {
162163
content: 'sh';
163164
}
164165
166+
div[class~='language-yml']::before,
165167
div[class~='language-yaml']::before {
166168
content: 'yaml';
167169
}

src/components/NavBar.vue

+4-1
Original file line numberDiff line numberDiff line change
@@ -95,16 +95,19 @@ import { ref } from 'vue'
9595
export default {
9696
components: { IconDiscord, IconDownload, IconGitHub, IconTwitter },
9797
setup() {
98-
let zip = new JSZip()
9998
const showDownloadMsg = ref(false)
10099
const currentCommit = __COMMIT__ /* from vite.config.js */
101100
102101
const downloadProject = () => {
102+
const zip = new JSZip()
103103
if (store.code && Object.keys(store.code).length) {
104104
msg.color = '#ff0000'
105105
if (!store.config.output_dir) {
106106
msg.showMsg = true
107107
msg.content = `Output directory is required. Please input in Loggers tab.`
108+
} else if (!store.config.log_every_iters) {
109+
msg.showMsg = true
110+
msg.content = `Logging interval is required. Please input in Loggers tab.`
108111
} else {
109112
for (const filename in store.code) {
110113
zip.file(filename, store.code[filename])

src/components/PaneRight.vue

+8-10
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
<template>
2-
<div v-if="tabs">
2+
<div v-if="tabs()">
33
<div class="right-pane-tabs">
44
<div
5-
v-for="tab in tabs"
5+
v-for="tab in tabs()"
66
:key="tab"
77
class="right-pane-tab"
88
:class="{ active: currentTab === tab }"
@@ -38,22 +38,20 @@ export default {
3838
components: { CodeBlock, Instruction },
3939
setup() {
4040
const currentTab = ref('README.md')
41-
const tabs = computed(() => {
41+
const tabs = () => {
4242
if (store.config.template) {
43-
const tabsArr = Object.keys(templates[store.config.template])
44-
if (import.meta.env.DEV) {
45-
tabsArr.push(__DEV_CONFIG_FILE__)
46-
}
47-
return tabsArr
43+
return Object.keys(store.code)
4844
}
49-
})
45+
}
5046
// search more file types mapping on
5147
// https://icones.js.org/collection/vscode-icons
5248
const fileTypes = {
5349
py: 'python',
5450
md: 'markdown',
5551
json: 'json',
56-
txt: 'text'
52+
txt: 'text',
53+
yml: 'yaml',
54+
yaml: 'yaml'
5755
}
5856
5957
const getFileType = (tab) => {

src/components/TabHandlers.vue

+5
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,11 @@
1515
:saveKey="filename_prefix.name"
1616
:type="filename_prefix.type"
1717
/>
18+
<FormInput
19+
:label="save_every_iters.description"
20+
:saveKey="save_every_iters.name"
21+
:type="save_every_iters.type"
22+
/>
1823
<FormInput
1924
:label="n_saved.description"
2025
:saveKey="n_saved.name"

src/components/TabLoggers.vue

+6
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@
77
:saveKey="output_dir.name"
88
required
99
/>
10+
<FormInput
11+
type="number"
12+
:label="log_every_iters.description"
13+
:saveKey="log_every_iters.name"
14+
required
15+
/>
1016
<FormSelect
1117
:label="logger.description"
1218
:options="logger.options"

src/components/TabTemplates.vue

+5-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,11 @@ export default {
2626
2727
const downloadTemplates = () => fetchTemplates(store.config.template)
2828
29-
return { templateLabel, templateOptions, downloadTemplates }
29+
return {
30+
templateLabel,
31+
templateOptions,
32+
downloadTemplates
33+
}
3034
}
3135
}
3236
</script>

src/metadata/metadata.json

+18-7
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
"launch": {
99
"name": "launch",
1010
"type": "radio",
11-
"description": "Run the training with torch.distributed.launch"
11+
"description": "Run the training with torch.distributed.launch (recommended)"
1212
},
1313
"spawn": {
1414
"name": "spawn",
@@ -18,13 +18,13 @@
1818
"nproc_per_node": {
1919
"name": "nproc_per_node",
2020
"type": "number",
21-
"description": "Number of processes to launch on each node",
21+
"description": "Number of processes to launch on each node (mandatory for single node, multi gpus distributed training)",
2222
"min": 1
2323
},
2424
"nnodes": {
2525
"name": "nnodes",
2626
"type": "number",
27-
"description": "Number of nodes to use for distributed training",
27+
"description": "Number of nodes to use for distributed training (mandatory for multi nodes, multi gpus distributed training)",
2828
"min": 1
2929
},
3030
"master_addr": {
@@ -43,7 +43,7 @@
4343
"save_training": {
4444
"name": "save_training",
4545
"type": "checkbox",
46-
"description": "Save the training state by every save_every_iters."
46+
"description": "Save the training state (models, optimizers, trainers, ...) by every save_every_iters."
4747
},
4848
"save_evaluation": {
4949
"name": "save_evaluation",
@@ -69,18 +69,24 @@
6969
"name": "filename_prefix",
7070
"type": "text",
7171
"value": "checkpointing",
72-
"description": "What prefix would you like to put in front of saved checkpoint file?"
72+
"description": "What prefix would you like to put in front of saved checkpoint file? (mandatory for saving training states)"
73+
},
74+
"save_every_iters": {
75+
"name": "save_every_iters",
76+
"type": "number",
77+
"value": "checkpointing",
78+
"description": "Iteration interval for saving training states (mandatory for saving training states)"
7379
},
7480
"n_saved": {
7581
"name": "n_saved",
7682
"type": "number",
7783
"value": "checkpointing",
78-
"description": "How many checkpoint file would you like to keep on disk?"
84+
"description": "How many checkpoint file would you like to keep on disk? (mandatory for saving both training and evaluation)"
7985
},
8086
"limit_sec": {
8187
"name": "limit_sec",
8288
"type": "number",
83-
"description": "How long do you want to run for the training and then terminate?"
89+
"description": "How long do you want to run for the training and then terminate? (in seconds)"
8490
}
8591
},
8692
"loggers": {
@@ -89,6 +95,11 @@
8995
"type": "text",
9096
"description": "Directory to save all outputs"
9197
},
98+
"log_every_iters": {
99+
"name": "log_every_iters",
100+
"type": "number",
101+
"description": "Logging interval for training statistics"
102+
},
92103
"logger": {
93104
"name": "logger",
94105
"type": "array",

src/store.js

+7-5
Original file line numberDiff line numberDiff line change
@@ -57,15 +57,17 @@ export function saveConfig(key, value) {
5757
}
5858

5959
// render the code if there are fetched files for current selected template
60-
export async function genCode() {
60+
export function genCode() {
6161
const currentFiles = files[store.config.template]
6262
if (currentFiles && Object.keys(currentFiles).length) {
6363
for (const file in currentFiles) {
64-
store.code[file] = ejs.render(currentFiles[file], store.config)
64+
store.code[file] = ejs
65+
.render(currentFiles[file], store.config)
66+
.replaceAll(/(\n\n\n\n)+/gi, '\n')
67+
}
68+
if (isDev) {
69+
store.code[__DEV_CONFIG_FILE__] = JSON.stringify(store.config, null, 2)
6570
}
66-
}
67-
if (isDev) {
68-
store.code[__DEV_CONFIG_FILE__] = JSON.stringify(store.config, null, 2)
6971
}
7072
}
7173

+125
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
#::: if (it.dist === 'launch') { :::#
2+
#::: if (it.nproc_per_node) { :::#
3+
#::: if (it.nnodes && it.master_addr && it.master_port) { :::#
4+
5+
### Multi Node, Multi GPU Training (`torch.distributed.launch`) (recommended)
6+
7+
- Execute on master node
8+
9+
```sh
10+
python -m torch.distributed.launch \
11+
--nproc_per_node #:::= it.nproc_per_node :::# \
12+
--nnodes #:::= it.nnodes :::# \
13+
--node_rank 0 \
14+
--master_addr #:::= it.master_addr :::# \
15+
--master_port #:::= it.master_port :::# \
16+
--use_env main.py backend=nccl \
17+
hydra.run.dir=. \
18+
hydra.output_subdir=null \
19+
hydra/job_logging=disabled \
20+
hydra/hydra_logging=disabled
21+
```
22+
23+
- Execute on worker nodes
24+
25+
```sh
26+
python -m torch.distributed.launch \
27+
--nproc_per_node #:::= it.nproc_per_node :::# \
28+
--nnodes #:::= it.nnodes :::# \
29+
--node_rank <node_rank> \
30+
--master_addr #:::= it.master_addr :::# \
31+
--master_port #:::= it.master_port :::# \
32+
--use_env main.py backend=nccl \
33+
hydra.run.dir=. \
34+
hydra.output_subdir=null \
35+
hydra/job_logging=disabled \
36+
hydra/hydra_logging=disabled
37+
```
38+
39+
#::: } else { :::#
40+
41+
### Multi GPU Training (`torch.distributed.launch`) (recommended)
42+
43+
```sh
44+
python -m torch.distributed.launch \
45+
--nproc_per_node #:::= it.nproc_per_node :::# \
46+
--use_env main.py backend=nccl \
47+
hydra.run.dir=. \
48+
hydra.output_subdir=null \
49+
hydra/job_logging=disabled \
50+
hydra/hydra_logging=disabled
51+
```
52+
53+
#::: } :::#
54+
#::: } :::#
55+
#::: } :::#
56+
57+
#::: if (it.dist === 'spawn') { :::#
58+
#::: if (it.nproc_per_node) { :::#
59+
#::: if (it.nnodes && it.master_addr && it.master_port) { :::#
60+
61+
### Multi Node, Multi GPU Training (`torch.multiprocessing.spawn`)
62+
63+
- Execute on master node
64+
65+
```sh
66+
python main.py \
67+
nproc_per_node=#:::= it.nproc_per_node :::# \
68+
nnodes=#:::= it.nnodes :::# \
69+
node_rank=0 \
70+
master_addr=#:::= it.master_addr :::# \
71+
master_port=#:::= it.master_port :::# \
72+
backend=nccl \
73+
hydra.run.dir=. \
74+
hydra.output_subdir=null \
75+
hydra/job_logging=disabled \
76+
hydra/hydra_logging=disabled
77+
```
78+
79+
- Execute on worker nodes
80+
81+
```sh
82+
python main.py \
83+
nproc_per_node=#:::= it.nproc_per_node :::# \
84+
nnodes=#:::= it.nnodes :::# \
85+
node_rank=<node_rank> \
86+
master_addr=#:::= it.master_addr :::# \
87+
master_port=#:::= it.master_port :::# \
88+
backend=nccl \
89+
hydra.run.dir=. \
90+
hydra.output_subdir=null \
91+
hydra/job_logging=disabled \
92+
hydra/hydra_logging=disabled
93+
```
94+
95+
#::: } else { :::#
96+
97+
### Multi GPU Training (`torch.multiprocessing.spawn`)
98+
99+
```sh
100+
python main.py \
101+
nproc_per_node=#:::= it.nproc_per_node :::# \
102+
backend=nccl \
103+
hydra.run.dir=. \
104+
hydra.output_subdir=null \
105+
hydra/job_logging=disabled \
106+
hydra/hydra_logging=disabled
107+
```
108+
109+
#::: } :::#
110+
#::: } :::#
111+
#::: } :::#
112+
113+
#::: if (!it.nproc_per_node) { :::#
114+
115+
### 1 GPU Training
116+
117+
```sh
118+
python main.py \
119+
hydra.run.dir=. \
120+
hydra.output_subdir=null \
121+
hydra/job_logging=disabled \
122+
hydra/hydra_logging=disabled
123+
```
124+
125+
#::: } :::#

0 commit comments

Comments
 (0)