Skip to content

Commit 9e93105

Browse files
committed
add demo about Click-Through-Rate distributed training with PaddlePaddle on Volcano
1 parent a8fb05c commit 9e93105

File tree

2 files changed

+206
-0
lines changed

2 files changed

+206
-0
lines changed
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Click-Through-Rate Distributed Training with PaddlePaddle on Volcano
2+
3+
This is an example of running Click-Through-Rate (CTR) distributed training with PaddlePaddle on Volcano. The source code
4+
is taken from PaddlePaddle EDL team's example [here](https://github.com/PaddlePaddle/edl/tree/develop/example/ctr).
5+
6+
The directory contains the following files:
7+
* ctr-paddlepaddle-on-volcano.yaml: The Volcano Job spec.
8+
9+
To run the example, edit `ctr-paddlepaddle-on-volcano.yaml` to set your image's name and version. Then run
10+
```
11+
kubectl apply -f ctr-paddlepaddle-on-volcano.yaml -n ${NAMESPACE}
12+
```
13+
to create the job.
14+
15+
Then use
16+
```
17+
kubectl -n ${NAMESPACE} describe job.batch.volcano.sh ctr-volcano
18+
```
19+
to see the status.
Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
---
# Volcano Job for the PaddlePaddle Click-Through-Rate (CTR) demo.
# It defines two tasks, "pserver" and "trainer", with two replicas each.
# Both tasks run the same image and command; they differ in labels, resource
# requests, and the PADDLE_TRAINING_ROLE / TRAINING_ROLE environment values.
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  name: ctr-volcano
spec:
  # Equals the total replica count of the tasks below (2 pserver + 2 trainer).
  minAvailable: 4
  schedulerName: volcano
  # Job-level policies: restart the whole job when a pod is evicted or fails.
  policies:
    - event: PodEvicted
      action: RestartJob
    - event: PodFailed
      action: RestartJob
  tasks:
    - replicas: 2
      name: pserver
      template:
        metadata:
          labels:
            paddle-job-pserver: fluid-ctr
        spec:
          imagePullSecrets:
            - name: default-secret
          volumes:
            # Host directory exposed to the container at /mnt/seqdata.
            - hostPath:
                path: /home/work/
                type: ""
              name: seqdata
          containers:
            # Edit the image name and tag to point at your own build.
            - image: volcanosh/edlctr:v1
              command:
                - paddle_k8s
                - start_fluid
              imagePullPolicy: IfNotPresent
              name: pserver
              volumeMounts:
                - mountPath: /mnt/seqdata
                  name: seqdata
              resources:
                limits:
                  cpu: 10
                  memory: 30Gi
                  ephemeral-storage: 10Gi
                requests:
                  cpu: 1
                  # NOTE(review): decimal suffix (100M), unlike the binary
                  # Gi suffix used in limits -- confirm this is intended.
                  memory: 100M
                  ephemeral-storage: 1Gi
              env:
                - name: GLOG_v
                  value: "0"
                - name: GLOG_logtostderr
                  value: "1"
                - name: TOPOLOGY
                  value: ""
                - name: TRAINER_PACKAGE
                  value: /workspace
                # NOTE(review): hard-coded NIC name; presumably must match
                # the interface available inside the pod -- confirm.
                - name: PADDLE_INIT_NICS
                  value: eth2
                # The next four variables are filled in from the pod's own
                # metadata/status via the downward API (fieldRef).
                - name: NAMESPACE
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.namespace
                - name: POD_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                - name: POD_NAME
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.name
                - name: PADDLE_CURRENT_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                # Same value as the paddle-job* labels on both tasks.
                - name: PADDLE_JOB_NAME
                  value: fluid-ctr
                - name: PADDLE_IS_LOCAL
                  value: "0"
                # NOTE(review): presumably these two counts must match the
                # replicas of the trainer and pserver tasks -- confirm.
                - name: PADDLE_TRAINERS_NUM
                  value: "2"
                - name: PADDLE_PSERVERS_NUM
                  value: "2"
                - name: FLAGS_rpc_deadline
                  value: "36000000"
                - name: ENTRY
                  value: cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1
                - name: PADDLE_PORT
                  value: "30236"
                - name: LD_LIBRARY_PATH
                  value: /usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind
                - name: PADDLE_TRAINING_ROLE
                  value: PSERVER
                - name: TRAINING_ROLE
                  value: PSERVER
          restartPolicy: OnFailure
    - replicas: 2
      # Task-level policy: complete the whole job once this task completes.
      policies:
        - event: TaskCompleted
          action: CompleteJob
      name: trainer
      template:
        metadata:
          labels:
            paddle-job: fluid-ctr
        spec:
          imagePullSecrets:
            - name: default-secret
          volumes:
            # Host directory exposed to the container at /mnt/seqdata.
            - hostPath:
                path: /home/work/
                type: ""
              name: seqdata
          containers:
            # Edit the image name and tag to point at your own build.
            - image: volcanosh/edlctr:v1
              command:
                - paddle_k8s
                - start_fluid
              imagePullPolicy: IfNotPresent
              name: trainer
              volumeMounts:
                - mountPath: /mnt/seqdata
                  name: seqdata
              resources:
                limits:
                  cpu: 10
                  memory: 30Gi
                  ephemeral-storage: 10Gi
                requests:
                  cpu: 1
                  # NOTE(review): decimal suffix (100M), unlike the binary
                  # Gi suffix used in limits -- confirm this is intended.
                  memory: 100M
                  ephemeral-storage: 10Gi
              env:
                - name: GLOG_v
                  value: "0"
                - name: GLOG_logtostderr
                  value: "1"
                # Explicit empty string, consistent with the pserver task.
                - name: TOPOLOGY
                  value: ""
                - name: TRAINER_PACKAGE
                  value: /workspace
                # NOTE(review): hard-coded NIC name; presumably must match
                # the interface available inside the pod -- confirm.
                - name: PADDLE_INIT_NICS
                  value: eth2
                - name: CPU_NUM
                  value: "2"
                # The next four variables are filled in from the pod's own
                # metadata/status via the downward API (fieldRef).
                - name: NAMESPACE
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.namespace
                - name: POD_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                - name: POD_NAME
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.name
                - name: PADDLE_CURRENT_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                # Same value as the paddle-job* labels on both tasks.
                - name: PADDLE_JOB_NAME
                  value: fluid-ctr
                - name: PADDLE_IS_LOCAL
                  value: "0"
                - name: FLAGS_rpc_deadline
                  value: "36000000"
                - name: PADDLE_PORT
                  value: "30236"
                # NOTE(review): presumably these two counts must match the
                # replicas of the pserver and trainer tasks -- confirm.
                - name: PADDLE_PSERVERS_NUM
                  value: "2"
                - name: PADDLE_TRAINERS_NUM
                  value: "2"
                - name: PADDLE_TRAINING_ROLE
                  value: TRAINER
                - name: TRAINING_ROLE
                  value: TRAINER
                - name: LD_LIBRARY_PATH
                  value: /usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind
                - name: ENTRY
                  value: cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1
          restartPolicy: OnFailure

0 commit comments

Comments
 (0)