-
Notifications
You must be signed in to change notification settings - Fork 51
Expand file tree
/
Copy pathpserver.yaml
More file actions
53 lines (53 loc) · 1.89 KB
/
pserver.yaml
File metadata and controls
53 lines (53 loc) · 1.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
---
# ReplicaSet that keeps two "pserver" pods running for the fluid-ctr job.
# NOTE(review): the env names suggest these are PaddlePaddle Fluid parameter
# servers for a CTR training job -- confirm against the paddle_k8s entrypoint.
#
# apps/v1 replaces extensions/v1beta1, which is no longer served for
# ReplicaSet as of Kubernetes v1.16.
apiVersion: apps/v1
kind: ReplicaSet
metadata:
  name: fluid-ctr-pserver
spec:
  replicas: 2
  # Required by apps/v1; it must match the pod template labels below.
  selector:
    matchLabels:
      paddle-job-pserver: fluid-ctr
  template:
    metadata:
      labels:
        paddle-job-pserver: fluid-ctr
    spec:
      # Pods use the node's network namespace instead of a pod network.
      # NOTE(review): if the process binds PADDLE_PORT, two replicas placed on
      # the same node would collide -- confirm how scheduling avoids that.
      hostNetwork: true
      imagePullSecrets:
        - name: regcred
      containers:
        - name: pserver
          # No tag is given, so the registry's default tag is pulled on every
          # start (imagePullPolicy: Always).
          image: wopeizl/paddle_ctr_distribute
          imagePullPolicy: Always
          command: [paddle_k8s, start_fluid]
          env:
            - name: GLOG_v
              value: '0'
            - name: GLOG_logtostderr
              value: '1'
            - name: TOPOLOGY
              value: ''
            - name: TRAINER_PACKAGE
              value: /workspace
            - name: PADDLE_INIT_NICS
              value: eth2
            # The next four values are injected from the pod's own metadata
            # and status via the Downward API.
            - name: NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            - name: POD_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.podIP
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: PADDLE_CURRENT_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.podIP
            - name: PADDLE_JOB_NAME
              value: fluid-ctr
            - name: PADDLE_IS_LOCAL
              value: '0'
            - name: PADDLE_TRAINERS_NUM
              value: '2'
            # NOTE(review): presumably has to agree with spec.replicas above
            # -- confirm and keep the two in sync.
            - name: PADDLE_PSERVERS_NUM
              value: '2'
            - name: FLAGS_rpc_deadline
              value: '36000000'
            - name: ENTRY
              value: 'cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1'
            - name: PADDLE_PORT
              value: '30236'
            - name: LD_LIBRARY_PATH
              value: '/usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind'
            # The role is set under two variable names.
            # NOTE(review): presumably for compatibility between Paddle
            # versions -- confirm before removing either one.
            - name: PADDLE_TRAINING_ROLE
              value: PSERVER
            - name: TRAINING_ROLE
              value: PSERVER
          resources:
            limits:
              cpu: '10'
              memory: 30Gi
            # NOTE(review): the memory request (100M) is far below the limit
            # (30Gi) -- confirm this gap is intended.
            requests:
              cpu: '1'
              memory: 100M
          volumeMounts:
            - name: seqdata
              mountPath: /mnt/seqdata
      volumes:
        # Exposes the node directory /home/work/ to the container at
        # /mnt/seqdata.
        - name: seqdata
          hostPath:
            path: /home/work/