For the code below (modified slightly from test_recognize_digits), the CPU version gives identical results across runs with the seed value 100.
The random seed is set for two things:
- initial weights: `fluid.default_startup_program().random_seed = 100`
- shuffle for the reader: `random.seed(100)`
However, when `use_cuda = True`, the results are non-deterministic for a given seed value. Note that this code uses the mlp version and does not employ any conv kernels (which probably rules out cuDNN issues). I have run into a similar issue with other book chapters (such as sentiment analysis) as well.
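For reference, this is a minimal sketch of the seeding I have tried; it assumes that the `random_seed` attribute on both the startup and main programs plus NumPy's global RNG cover everything under fluid's control, and even then it does not appear to make the GPU runs repeatable:

```python
import random

import numpy
import paddle.fluid as fluid

SEED = 100

random.seed(SEED)                                   # reader shuffling
numpy.random.seed(SEED)                             # any NumPy-side randomness
fluid.default_startup_program().random_seed = SEED  # initial weights
fluid.default_main_program().random_seed = SEED     # ops that draw seeds at run time (e.g. dropout)
```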
```python
from __future__ import print_function

import argparse
import paddle.fluid as fluid
import paddle
import sys
import numpy
import unittest
import math
import os
import random

BATCH_SIZE = 64


def loss_net(hidden, label):
    prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
    loss = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_loss = fluid.layers.mean(loss)
    acc = fluid.layers.accuracy(input=prediction, label=label)
    return prediction, avg_loss, acc


def mlp(img, label):
    hidden = fluid.layers.fc(input=img, size=200, act='tanh')
    hidden = fluid.layers.fc(input=hidden, size=200, act='tanh')
    return loss_net(hidden, label)


def conv_net(img, label):
    conv_pool_1 = fluid.nets.simple_img_conv_pool(
        input=img,
        filter_size=5,
        num_filters=20,
        pool_size=2,
        pool_stride=2,
        act="relu")
    conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
    conv_pool_2 = fluid.nets.simple_img_conv_pool(
        input=conv_pool_1,
        filter_size=5,
        num_filters=50,
        pool_size=2,
        pool_stride=2,
        act="relu")
    return loss_net(conv_pool_2, label)


def train(nn_type,
          use_cuda,
          parallel,
          save_dirname=None,
          model_filename=None,
          params_filename=None,
          is_local=True):
    random.seed(100)
    fluid.default_startup_program().random_seed = 100
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return

    img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    if nn_type == 'mlp':
        net_conf = mlp
    else:
        net_conf = conv_net

    if parallel:
        print("Not supported")
        return
    else:
        prediction, avg_loss, acc = net_conf(img, label)

    test_program = fluid.default_main_program().clone(for_test=True)

    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
    optimize_ops, params_grads = optimizer.minimize(avg_loss)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)

    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.mnist.train(), buf_size=500),
        batch_size=BATCH_SIZE)
    test_reader = paddle.batch(
        paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)
    feeder = fluid.DataFeeder(feed_list=[img, label], place=place)

    def train_loop(main_program):
        exe.run(fluid.default_startup_program())

        PASS_NUM = 100
        for pass_id in range(PASS_NUM):
            for batch_id, data in enumerate(train_reader()):
                # train a mini-batch, fetch nothing
                exe.run(main_program, feed=feeder.feed(data))
                if (batch_id + 1) % 10 == 0:
                    acc_set = []
                    avg_loss_set = []
                    for test_data in test_reader():
                        acc_np, avg_loss_np = exe.run(
                            program=test_program,
                            feed=feeder.feed(test_data),
                            fetch_list=[acc, avg_loss])
                        acc_set.append(float(acc_np))
                        avg_loss_set.append(float(avg_loss_np))
                    # get test acc and loss
                    acc_val = numpy.array(acc_set).mean()
                    avg_loss_val = numpy.array(avg_loss_set).mean()
                    if float(acc_val) > 0.8:  # Smaller value to increase CI speed
                        if save_dirname is not None:
                            fluid.io.save_inference_model(
                                save_dirname, ["img"], [prediction],
                                exe,
                                model_filename=model_filename,
                                params_filename=params_filename)
                        return
                    else:
                        print((pass_id, batch_id + 1,
                               float(avg_loss_val), float(acc_val)))
                        if math.isnan(float(avg_loss_val)):
                            sys.exit("got NaN loss, training failed.")
        raise AssertionError("Loss of recognize digits is too large")

    if is_local:
        train_loop(fluid.default_main_program())
    else:
        print("Not supported")


def main(use_cuda, parallel, nn_type, combine):
    save_dirname = None
    model_filename = None
    params_filename = None
    if not use_cuda and not parallel:
        save_dirname = "recognize_digits_" + nn_type + ".inference.model"
        if combine == True:
            model_filename = "__model_combined__"
            params_filename = "__params_combined__"

    train(
        nn_type=nn_type,
        use_cuda=use_cuda,
        parallel=parallel,
        save_dirname=save_dirname,
        model_filename=model_filename,
        params_filename=params_filename)


if __name__ == '__main__':
    # CPU version
    main(False, False, 'mlp', False)
    # GPU
    # main(True, False, 'mlp', False)
```
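To narrow down where two GPU runs diverge, it may help to checksum every parameter right after the startup program runs and again after a few mini-batches: if the first checksums already differ, initialization itself is non-deterministic; otherwise the divergence only appears during the Adam updates. The helper below is just a sketch, `param_checksums` is a hypothetical name, and it assumes the parameters live in `fluid.global_scope()` once `exe.run(fluid.default_startup_program())` has been called.

```python
import hashlib

import numpy
import paddle.fluid as fluid


def param_checksums(program, scope=None):
    """Hash every parameter tensor so two runs can be diffed quickly."""
    scope = scope or fluid.global_scope()
    sums = {}
    for param in program.global_block().all_parameters():
        tensor = numpy.array(scope.find_var(param.name).get_tensor())
        sums[param.name] = hashlib.md5(tensor.tobytes()).hexdigest()
    return sums


# Call once right after exe.run(fluid.default_startup_program()) and again
# after a handful of training mini-batches, then compare the printed dicts
# from two separate GPU runs.
# print(param_checksums(fluid.default_main_program()))
```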