
Commit 00eb42a

Merge branch 'develop' into refine_seq2seq
2 parents 555e089 + 1aaee80

15 files changed: 310 additions & 261 deletions

.travis.yml

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
+group: deprecated-2017Q2
 language: cpp
 cache: ccache
 sudo: required

.travis/unittest.sh

Lines changed: 2 additions & 2 deletions
@@ -8,8 +8,8 @@ abort(){
 
 unittest(){
     cd $1 > /dev/null
-    if [ -f "requirements.txt" ]; then
-        pip install -r requirements.txt
+    if [ -f "setup.sh" ]; then
+        sh setup.sh
     fi
     if [ $? != 0 ]; then
         exit 1

deep_speech_2/README.md

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 Please replace `$PADDLE_INSTALL_DIR` with your own paddle installation directory.
 
 ```
-pip install -r requirements.txt
+sh setup.sh
 export LD_LIBRARY_PATH=$PADDLE_INSTALL_DIR/Paddle/third_party/install/warpctc/lib:$LD_LIBRARY_PATH
 ```
 

deep_speech_2/data_utils/data.py

Lines changed: 12 additions & 3 deletions
@@ -7,6 +7,7 @@
 
 import random
 import numpy as np
+import multiprocessing
 import paddle.v2 as paddle
 from data_utils import utils
 from data_utils.augmentor.augmentation import AugmentationPipeline
@@ -44,6 +45,8 @@ class DataGenerator(object):
     :types max_freq: None|float
     :param specgram_type: Specgram feature type. Options: 'linear'.
     :type specgram_type: str
+    :param num_threads: Number of CPU threads for processing data.
+    :type num_threads: int
     :param random_seed: Random seed.
     :type random_seed: int
     """
@@ -58,6 +61,7 @@ def __init__(self,
                  window_ms=20.0,
                  max_freq=None,
                  specgram_type='linear',
+                 num_threads=multiprocessing.cpu_count(),
                  random_seed=0):
         self._max_duration = max_duration
         self._min_duration = min_duration
@@ -70,6 +74,7 @@
             stride_ms=stride_ms,
             window_ms=window_ms,
             max_freq=max_freq)
+        self._num_threads = num_threads
         self._rng = random.Random(random_seed)
         self._epoch = 0
 
@@ -207,10 +212,14 @@ def _instance_reader_creator(self, manifest):
 
         def reader():
             for instance in manifest:
-                yield self._process_utterance(instance["audio_filepath"],
-                                              instance["text"])
+                yield instance
 
-        return reader
+        def mapper(instance):
+            return self._process_utterance(instance["audio_filepath"],
+                                           instance["text"])
+
+        return paddle.reader.xmap_readers(
+            mapper, reader, self._num_threads, 1024, order=True)
 
     def _padding_batch(self, batch, padding_to=-1, flatten=False):
         """

deep_speech_2/data_utils/speech.py

Lines changed: 1 addition & 1 deletion
@@ -94,7 +94,7 @@ def concatenate(cls, *segments):
         return cls(samples, sample_rate, transcripts)
 
     @classmethod
-    def slice_from_file(cls, filepath, start=None, end=None, transcript):
+    def slice_from_file(cls, filepath, transcript, start=None, end=None):
         """Loads a small section of an speech without having to load
         the entire file into the memory which can be incredibly wasteful.
 
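Note that the original signature was not merely awkward, it was invalid Python: a parameter without a default (`transcript`) may not follow parameters with defaults (`start=None, end=None`), so the old definition raised `SyntaxError: non-default argument follows default argument` at import time. Moving `transcript` forward fixes the definition; callers that pass these arguments by keyword are unaffected.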

deep_speech_2/infer.py

Lines changed: 8 additions & 1 deletion
@@ -6,6 +6,7 @@
 import argparse
 import gzip
 import distutils.util
+import multiprocessing
 import paddle.v2 as paddle
 from data_utils.data import DataGenerator
 from model import deep_speech2
@@ -38,6 +39,11 @@
     default=True,
     type=distutils.util.strtobool,
     help="Use gpu or not. (default: %(default)s)")
+parser.add_argument(
+    "--num_threads_data",
+    default=multiprocessing.cpu_count(),
+    type=int,
+    help="Number of cpu threads for preprocessing data. (default: %(default)s)")
 parser.add_argument(
     "--mean_std_filepath",
     default='mean_std.npz',
@@ -67,7 +73,8 @@ def infer():
     data_generator = DataGenerator(
         vocab_filepath=args.vocab_filepath,
         mean_std_filepath=args.mean_std_filepath,
-        augmentation_config='{}')
+        augmentation_config='{}',
+        num_threads=args.num_threads_data)
 
     # create network config
     # paddle.data_type.dense_array is used for variable batch input.
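The new flag makes preprocessing parallelism tunable at inference time. A hypothetical invocation (the thread count is illustrative; all other flags keep their defaults):

```
python infer.py --num_threads_data 8
```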

deep_speech_2/requirements.txt

Lines changed: 1 addition & 2 deletions
@@ -1,4 +1,3 @@
 SoundFile==0.9.0.post1
 wget==3.2
-scikits.samplerate==0.3.3
-scipy==0.13.0b1
+scipy==0.13.1
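`scikits.samplerate` leaves requirements.txt because pip alone cannot install it: the package needs the native libsamplerate library. The new setup.sh below builds libsamplerate from source and only then pip-installs the Python binding.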

deep_speech_2/setup.sh

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# install python dependencies
+if [ -f 'requirements.txt' ]; then
+    pip install -r requirements.txt
+fi
+if [ $? != 0 ]; then
+    echo "Install python dependencies failed !!!"
+    exit 1
+fi
+
+# install scikits.samplerate
+curl -O "http://www.mega-nerd.com/SRC/libsamplerate-0.1.9.tar.gz"
+if [ $? != 0 ]; then
+    echo "Download libsamplerate-0.1.9.tar.gz failed !!!"
+    exit 1
+fi
+tar -xvf libsamplerate-0.1.9.tar.gz
+cd libsamplerate-0.1.9
+./configure && make && make install
+cd -
+rm -rf libsamplerate-0.1.9
+rm libsamplerate-0.1.9.tar.gz
+pip install scikits.samplerate==0.3.3
+if [ $? != 0 ]; then
+    echo "Install scikits.samplerate failed !!!"
+    exit 1
+fi
+
+echo "Install all dependencies successfully."
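Per the updated README, this script replaces the old `pip install -r requirements.txt` step:

```
cd deep_speech_2
sh setup.sh
```

Note that `make install` typically writes under /usr/local, so the script may need root privileges depending on your environment.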

deep_speech_2/train.py

Lines changed: 22 additions & 1 deletion
@@ -9,6 +9,7 @@
 import gzip
 import time
 import distutils.util
+import multiprocessing
 import paddle.v2 as paddle
 from model import deep_speech2
 from data_utils.data import DataGenerator
@@ -52,6 +53,18 @@
     default=True,
     type=distutils.util.strtobool,
     help="Use sortagrad or not. (default: %(default)s)")
+parser.add_argument(
+    "--max_duration",
+    default=100.0,
+    type=float,
+    help="Audios with duration larger than this will be discarded. "
+    "(default: %(default)s)")
+parser.add_argument(
+    "--min_duration",
+    default=0.0,
+    type=float,
+    help="Audios with duration smaller than this will be discarded. "
+    "(default: %(default)s)")
 parser.add_argument(
     "--shuffle_method",
     default='instance_shuffle',
@@ -63,6 +76,11 @@
     default=4,
     type=int,
     help="Trainer number. (default: %(default)s)")
+parser.add_argument(
+    "--num_threads_data",
+    default=multiprocessing.cpu_count(),
+    type=int,
+    help="Number of cpu threads for preprocessing data. (default: %(default)s)")
 parser.add_argument(
     "--mean_std_filepath",
     default='mean_std.npz',
@@ -107,7 +125,10 @@ def data_generator():
         return DataGenerator(
             vocab_filepath=args.vocab_filepath,
             mean_std_filepath=args.mean_std_filepath,
-            augmentation_config=args.augmentation_config)
+            augmentation_config=args.augmentation_config,
+            max_duration=args.max_duration,
+            min_duration=args.min_duration,
+            num_threads=args.num_threads_data)
 
     train_generator = data_generator()
     test_generator = data_generator()
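Together, the three new flags bound utterance duration (outliers are discarded) and control preprocessing parallelism. A hypothetical run with illustrative values (all other flags keep their defaults):

```
python train.py --max_duration 27.0 --min_duration 1.0 --num_threads_data 8
```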

language_model/network_conf.py

Lines changed: 18 additions & 33 deletions
@@ -51,56 +51,41 @@ def rnn_lm(vocab_size, emb_dim, rnn_type, hidden_size, num_layer):
     return cost, output
 
 
-def ngram_lm(vocab_size, emb_dim, hidden_size, num_layer):
+def ngram_lm(vocab_size, emb_dim, hidden_size, num_layer, gram_num=4):
     """
     N-Gram language model definition.
 
     :param vocab_size: size of vocab.
     :param emb_dim: embedding vector's dimension.
     :param hidden_size: size of unit.
-    :param num_layer: layer number.
+    :param num_layer: number of hidden layers.
+    :param gram_num: gram number in the n-gram method.
     :return: cost and output layer of model.
     """
 
     assert emb_dim > 0 and hidden_size > 0 and vocab_size > 0 and num_layer > 0
 
-    def wordemb(inlayer):
-        wordemb = paddle.layer.table_projection(
-            input=inlayer,
-            size=emb_dim,
-            param_attr=paddle.attr.Param(
-                name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0))
-        return wordemb
-
     # input layers
-    first_word = paddle.layer.data(
-        name="first_word", type=paddle.data_type.integer_value(vocab_size))
-    second_word = paddle.layer.data(
-        name="second_word", type=paddle.data_type.integer_value(vocab_size))
-    third_word = paddle.layer.data(
-        name="third_word", type=paddle.data_type.integer_value(vocab_size))
-    fourth_word = paddle.layer.data(
-        name="fourth_word", type=paddle.data_type.integer_value(vocab_size))
+    emb_layers = []
+    for i in range(gram_num):
+        word = paddle.layer.data(
+            name="__word%02d__" % (i + 1),
+            type=paddle.data_type.integer_value(vocab_size))
+        emb = paddle.layer.embedding(
+            input=word,
+            size=emb_dim,
+            param_attr=paddle.attr.Param(name="_proj", initial_std=1e-3))
+        emb_layers.append(emb)
     next_word = paddle.layer.data(
-        name="next_word", type=paddle.data_type.integer_value(vocab_size))
-
-    # embedding layer
-    first_emb = wordemb(first_word)
-    second_emb = wordemb(second_word)
-    third_emb = wordemb(third_word)
-    fourth_emb = wordemb(fourth_word)
-
-    context_emb = paddle.layer.concat(
-        input=[first_emb, second_emb, third_emb, fourth_emb])
+        name="__next_word__", type=paddle.data_type.integer_value(vocab_size))
 
     # hidden layer
-    hidden = paddle.layer.fc(
-        input=context_emb, size=hidden_size, act=paddle.activation.Relu())
-    for _ in range(num_layer - 1):
+    for i in range(num_layer):
         hidden = paddle.layer.fc(
-            input=hidden, size=hidden_size, act=paddle.activation.Relu())
+            input=hidden if i else paddle.layer.concat(input=emb_layers),
+            size=hidden_size,
+            act=paddle.activation.Relu())
 
-    # fc(full connected) and output layer
     predict_word = paddle.layer.fc(
         input=[hidden], size=vocab_size, act=paddle.activation.Softmax())
 
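This rewrite collapses four hard-coded word/embedding pairs into a loop over `gram_num` context words; `input=hidden if i else paddle.layer.concat(input=emb_layers)` feeds the concatenated embeddings to the first of the `num_layer` hidden layers and the previous hidden layer to each one after it. A hypothetical call with illustrative sizes (per the docstring, the function returns the cost and output layers):

```
cost, output = ngram_lm(
    vocab_size=10000, emb_dim=256, hidden_size=256, num_layer=2, gram_num=4)
```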