-[](https://travis-ci.org/alibaba/flink-ai-extended)
+# Deep Learning on Flink

-# deep-learning-on-flink
+Deep Learning on Flink aims to integrate Flink and deep learning frameworks
+(e.g. TensorFlow, PyTorch, etc) to enable distributed deep learning training and
+inference on a Flink cluster.

-Deep Learning on Flink aims to integrate Flink and deep learning frameworks (e.g. TensorFlow, PyTorch, etc).
-It runs the deep learning tasks inside a Flink operator, so that Flink can help establish a distributed environment,
-manage the resource, read/write the records and handle the failures.
+It runs the deep learning tasks inside a Flink operator so that Flink can help
+establish a distributed environment, manage the resource, read/write the data
+with the rich connectors in Flink and handle the failures.

Currently, Deep Learning on Flink supports TensorFlow and PyTorch.

-**contents**
-
-- [TensorFlow support](#tensorflow-support)
-  * [Support Version](#support-version)
-  * [Quick Start](#quick-start)
-    + [Setup](#setup)
-    + [Build From Source](#build-from-source)
-    + [Build Source in virtual environment](#build-source-in-virtual-environment)
-    + [Example](#example)
-  * [Distributed Running](#distributed-running)
-    + [Deployment](#deployment)
-    + [Running Distributed Programs](#running-distributed-programs)
-  * [Distributed Running Example](#distributed-running-example)
-    + [Setup & Build](#setup---build)
-    + [Start Service](#start-service)
-    + [Prepare data & code](#prepare-data---code)
-    + [Submit train job](#submit-train-job)
-    + [Visit Flink Cluster](#visit-flink-cluster)
-    + [Stop all docker containers](#stop-all-docker-containers)
-    + [Summary](#summary)
-  * [Optional Tools](#optional-tools)
-    + [Build framework and tensorflow python package Independently](#build-framework-and-tensorflow-python-package-independently)
-    + [Build custom virtual environment package](#build-custom-virtual-environment-package)
-- [Structure](#structure)
-- [For More Information](#for-more-information)
-- [License](#license)
-
-# TensorFlow support
-TensorFlow is an open-source deep learning system developed by Google that is widely used in the field of deep learning. Native TensorFlow is inconvenient for distributed use and resource management, and it cannot integrate with the existing, widely used big data processing frameworks.
-
-Flink is a data processing framework. It is widely used in data extraction, feature preprocessing and data cleaning.
-
-This project combines TensorFlow with Flink and provides users with more convenient and useful tools.
-**Currently, the Flink job code can be written either in Java with the Flink Java API or in Python with PyFlink. The algorithm code is written in Python.**
-
## Support Version
-TensorFlow: 1.15.0 & 2.3.1
-
-Flink: 1.11.x
+TensorFlow: 1.15.x & 2.3.x
+PyTorch: 1.x
+Flink: 1.14.x

-## Quick Start
+## Getting Started
+
+To get your hands dirty, you can follow the [quick start](doc/quick_start.md)
+to submit an example job to a local standalone Flink cluster.
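A minimal sketch of that flow, assuming a standard Flink standalone distribution
and a PyFlink-based example script (the script path below is a placeholder; the
authoritative steps are in [quick start](doc/quick_start.md)):

```shell
# From the root of the Flink distribution, start a local standalone cluster.
./bin/start-cluster.sh

# Submit the example as a PyFlink job; replace the path with the example
# script referenced in the quick start guide.
./bin/flink run -py /path/to/example_job.py

# Stop the local cluster once the job has finished.
./bin/stop-cluster.sh
```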
+
+## Build

### Setup

@@ -165,140 +137,6 @@ mvn clean install
```shell
deactivate
```
-
-### Example
-
-1. tensorflow add example
- **<p>python code:</p>**
-
-```python
-import tensorflow as tf
-import time
-import sys
-from flink_ml_tensorflow.tensorflow_context import TFContext
-
-def build_graph():
-    global a
-    i = 1
-    a = tf.placeholder(tf.float32, shape=None, name="a")
-    b = tf.reduce_mean(a, name="b")
-    r_list = []
-    v = tf.Variable(dtype=tf.float32, initial_value=tf.constant(1.0), name="v_" + str(i))
-    c = tf.add(b, v, name="c_" + str(i))
-    add = tf.assign(v, c, name="assign_" + str(i))
-    sum = tf.summary.scalar(name="sum_" + str(i), tensor=c)
-    r_list.append(add)
-    global_step = tf.contrib.framework.get_or_create_global_step()
-    global_step_inc = tf.assign_add(global_step, 1)
-    r_list.append(global_step_inc)
-    return r_list
-
-def map_func(context):
-    tf_context = TFContext(context)
-    job_name = tf_context.get_role_name()
-    index = tf_context.get_index()
-    cluster_json = tf_context.get_tf_cluster()
-
-    cluster = tf.train.ClusterSpec(cluster=cluster_json)
-    server = tf.train.Server(cluster, job_name=job_name, task_index=index)
-    sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False,
-                                 device_filters=["/job:ps", "/job:worker/task:%d" % index])
-    t = time.time()
-    if 'ps' == job_name:
-        from time import sleep
-        while True:
-            sleep(1)
-    else:
-        with tf.device(tf.train.replica_device_setter(worker_device='/job:worker/task:' + str(index), cluster=cluster)):
-            train_ops = build_graph()
-            hooks = [tf.train.StopAtStepHook(last_step=2)]
-            with tf.train.MonitoredTrainingSession(master=server.target, config=sess_config,
-                                                   checkpoint_dir="./target/tmp/s1/" + str(t),
-                                                   hooks=hooks) as mon_sess:
-                while not mon_sess.should_stop():
-                    print(mon_sess.run(train_ops, feed_dict={a: [1.0, 2.0, 3.0]}))
-                    sys.stdout.flush()
-
-```
- **<p>java code:</p>**
- add maven dependencies
-```xml
-<?xml version="1.0" encoding="UTF-8"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0"
-         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-    <modelVersion>4.0.0</modelVersion>
-
-    <groupId>org.flinkextended</groupId>
-    <artifactId>flink-ai-extended-examples</artifactId>
-    <version>0.3.0</version>
-    <packaging>jar</packaging>
-    <dependencies>
-        <dependency>
-            <groupId>org.flinkextended</groupId>
-            <artifactId>flink-ml-tensorflow</artifactId>
-            <version>0.3.0</version>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.curator</groupId>
-            <artifactId>curator-framework</artifactId>
-            <version>2.7.1</version>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.curator</groupId>
-            <artifactId>curator-test</artifactId>
-            <version>2.7.1</version>
-            <exclusions>
-                <exclusion>
-                    <groupId>com.google.guava</groupId>
-                    <artifactId>guava</artifactId>
-                </exclusion>
-            </exclusions>
-        </dependency>
-        <dependency>
-            <groupId>com.google.guava</groupId>
-            <artifactId>guava</artifactId>
-            <version>20.0</version>
-        </dependency>
-    </dependencies>
-
-    <build>
-        <plugins>
-            <plugin>
-                <groupId>org.apache.maven.plugins</groupId>
-                <artifactId>maven-compiler-plugin</artifactId>
-                <version>3.1</version>
-                <configuration>
-                    <source>1.8</source>
-                    <target>1.8</target>
-                </configuration>
-            </plugin>
-        </plugins>
-    </build>
-</project>
-```
-*You can refer to the following POM*
-
-[example pom.xml](flink-ml-examples/pom.xml)
-
-```java
-class Add {
-    public static void main(String[] args) throws Exception {
-        // local zookeeper server.
-        TestingServer server = new TestingServer(2181, true);
-        String script = "./add.py";
-        StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
-        // if zookeeper has other address
-        Map<String, String> prop = new HashMap<>();
-        prop.put(MLConstants.CONFIG_STORAGE_TYPE, MLConstants.STORAGE_ZOOKEEPER);
-        prop.put(MLConstants.CONFIG_ZOOKEEPER_CONNECT_STR, "localhost:2181");
-        TFConfig config = new TFConfig(2, 1, prop, script, "map_func", null);
-        TFUtils.train(streamEnv, null, config);
-        JobExecutionResult result = streamEnv.execute();
-        server.stop();
-    }
-}
-```

## Distributed Running
### Deployment