Skip to content

Commit 18750d1

Browse files
Merge pull request #434 from sivanzcw/paddlepaddle
Add demo about Click-Through-Rate distributed training with PaddlePad…
2 parents 8004f88 + 3d10085 commit 18750d1

2 files changed

Lines changed: 202 additions & 0 deletions

File tree

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Click-Through-Rate Distributed Training with PaddlePaddle on Volcano
2+
3+
This is an example of running Click-Through-Rate (CTR) distributed training with PaddlePaddle on Volcano. The source code
4+
is taken from the PaddlePaddle EDL team's example [here](https://github.com/PaddlePaddle/edl/tree/develop/example/ctr).
5+
6+
The directory contains the following files:
7+
* ctr-paddlepaddle-on-volcano.yaml: The Volcano Job spec.
8+
9+
To run the example, edit `ctr-paddlepaddle-on-volcano.yaml` to set the name and version of your image. Then run
10+
```
11+
kubectl apply -f ctr-paddlepaddle-on-volcano.yaml -n ${NAMESPACE}
12+
```
13+
to create the job.
14+
15+
Then use
16+
```
17+
kubectl -n ${NAMESPACE} describe job.batch.volcano.sh ctr-volcano
18+
```
19+
to see the status.
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
---
# Volcano Job that runs the PaddlePaddle CTR demo with two task groups:
# "pserver" (2 replicas) and "trainer" (2 replicas). Both groups use the same
# image and entry command; they differ in their role env vars and labels.
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  name: ctr-volcano
spec:
  # Equals the total replica count of the two tasks below (2 + 2).
  minAvailable: 4
  schedulerName: volcano
  # Job-level policies: restart the whole job when any pod is evicted or fails.
  policies:
    - event: PodEvicted
      action: RestartJob
    - event: PodFailed
      action: RestartJob
  tasks:
    # ---------------------------------------------------------------- pserver
    - replicas: 2
      name: pserver
      template:
        metadata:
          labels:
            paddle-job-pserver: fluid-ctr
        spec:
          imagePullSecrets:
            - name: default-secret
          volumes:
            # Host directory exposed to the container at /mnt/seqdata.
            - hostPath:
                path: /home/work/
                type: ""
              name: seqdata
          containers:
            - image: volcanosh/edlctr:v1
              command:
                - paddle_k8s
                - start_fluid
              imagePullPolicy: IfNotPresent
              name: pserver
              volumeMounts:
                - mountPath: /mnt/seqdata
                  name: seqdata
              resources:
                limits:
                  cpu: 10
                  memory: 30Gi
                  ephemeral-storage: 10Gi
                requests:
                  cpu: 1
                  memory: 100M
                  ephemeral-storage: 1Gi
              env:
                - name: GLOG_v
                  value: "0"
                - name: GLOG_logtostderr
                  value: "1"
                - name: TOPOLOGY
                  value: ""
                - name: TRAINER_PACKAGE
                  value: /workspace
                # The next four values are injected from the pod's own
                # metadata/status via the downward API (fieldRef).
                - name: NAMESPACE
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.namespace
                - name: POD_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                - name: POD_NAME
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.name
                - name: PADDLE_CURRENT_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                # Same string as the paddle-job* label values in this file.
                - name: PADDLE_JOB_NAME
                  value: fluid-ctr
                - name: PADDLE_IS_LOCAL
                  value: "0"
                # These two counts match the `replicas` of the trainer and
                # pserver tasks; presumably they must be kept in sync when
                # scaling -- TODO confirm against paddle_k8s.
                - name: PADDLE_TRAINERS_NUM
                  value: "2"
                - name: PADDLE_PSERVERS_NUM
                  value: "2"
                - name: FLAGS_rpc_deadline
                  value: "36000000"
                - name: ENTRY
                  value: cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1
                - name: PADDLE_PORT
                  value: "30236"
                - name: LD_LIBRARY_PATH
                  value: /usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind
                - name: PADDLE_TRAINING_ROLE
                  value: PSERVER
                - name: TRAINING_ROLE
                  value: PSERVER
          restartPolicy: OnFailure
    # ---------------------------------------------------------------- trainer
    - replicas: 2
      # Task-level policy: the job is marked complete once this task completes.
      policies:
        - event: TaskCompleted
          action: CompleteJob
      name: trainer
      template:
        metadata:
          labels:
            paddle-job: fluid-ctr
        spec:
          imagePullSecrets:
            - name: default-secret
          volumes:
            # Host directory exposed to the container at /mnt/seqdata.
            - hostPath:
                path: /home/work/
                type: ""
              name: seqdata
          containers:
            - image: volcanosh/edlctr:v1
              command:
                - paddle_k8s
                - start_fluid
              imagePullPolicy: IfNotPresent
              name: trainer
              volumeMounts:
                - mountPath: /mnt/seqdata
                  name: seqdata
              resources:
                limits:
                  cpu: 10
                  memory: 30Gi
                  ephemeral-storage: 10Gi
                requests:
                  cpu: 1
                  memory: 100M
                  # NOTE(review): the pserver task requests 1Gi here; 10Gi
                  # (equal to the limit) may be unintentional -- confirm.
                  ephemeral-storage: 10Gi
              env:
                - name: GLOG_v
                  value: "0"
                - name: GLOG_logtostderr
                  value: "1"
                # Explicit empty string, matching the pserver task's entry.
                - name: TOPOLOGY
                  value: ""
                - name: TRAINER_PACKAGE
                  value: /workspace
                - name: CPU_NUM
                  value: "2"
                # The next four values are injected from the pod's own
                # metadata/status via the downward API (fieldRef).
                - name: NAMESPACE
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.namespace
                - name: POD_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                - name: POD_NAME
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.name
                - name: PADDLE_CURRENT_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                # Same string as the paddle-job* label values in this file.
                - name: PADDLE_JOB_NAME
                  value: fluid-ctr
                - name: PADDLE_IS_LOCAL
                  value: "0"
                - name: FLAGS_rpc_deadline
                  value: "36000000"
                - name: PADDLE_PORT
                  value: "30236"
                # These two counts match the `replicas` of the pserver and
                # trainer tasks; presumably they must be kept in sync when
                # scaling -- TODO confirm against paddle_k8s.
                - name: PADDLE_PSERVERS_NUM
                  value: "2"
                - name: PADDLE_TRAINERS_NUM
                  value: "2"
                - name: PADDLE_TRAINING_ROLE
                  value: TRAINER
                - name: TRAINING_ROLE
                  value: TRAINER
                - name: LD_LIBRARY_PATH
                  value: /usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind
                - name: ENTRY
                  value: cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1
          restartPolicy: OnFailure

0 commit comments

Comments
 (0)