@@ -1,4 +1,3 @@
- #! /usr/bin/python
# -*- coding: utf-8 -*-

import collections
@@ -15,6 +14,7 @@
import tensorflow as tf
from six.moves import urllib, xrange
from tensorflow.python.platform import gfile
+ from . import _logging as logging

# Iteration functions

@@ -127,17 +127,17 @@ def sample(a=[], temperature=1.0):
return np.argmax(np.random.multinomial(1, a, 1))
except:
# np.set_printoptions(threshold=np.nan)
- # print(a)
- # print(np.sum(a))
- # print(np.max(a))
- # print(np.min(a))
+ # logging.info(a)
+ # logging.info(np.sum(a))
+ # logging.info(np.max(a))
+ # logging.info(np.min(a))
# exit()
message = "For large vocabulary_size, choice a higher temperature\
to avoid log error. Hint : use ``sample_top``. "

warnings.warn(message, Warning)
- # print(a)
- # print(b)
+ # logging.info(a)
+ # logging.info(b)

return np.argmax(np.random.multinomial(1, b, 1))

@@ -153,7 +153,7 @@ def sample_top(a=[], top_k=10):
"""
idx = np.argpartition(a, -top_k)[-top_k:]
probs = a[idx]
- # print("new", probs)
+ # logging.info("new", probs)
probs = probs / np.sum(probs)
choice = np.random.choice(idx, p=probs)
return choice
@@ -163,7 +163,7 @@ def sample_top(a=[], top_k=10):
# idx = idx[:top_k]
# # a = a[idx]
# probs = a[idx]
- # print("prev", probs)
+ # logging.info("prev", probs)
# # probs = probs / np.sum(probs)
# # choice = np.random.choice(idx, p=probs)
# # return choice
@@ -253,8 +253,8 @@ def __init__(self, vocab_file, start_word="<S>", end_word="</S>", unk_word="<UNK

vocab = dict([(x, y) for (y, x) in enumerate(reverse_vocab)])

- print(" [TL] Vocabulary from %s : %s %s %s" % (vocab_file, start_word, end_word, unk_word))
- print(" vocabulary with %d words (includes start_word, end_word, unk_word)" % len(vocab))
+ logging.info(" Vocabulary from %s : %s %s %s" % (vocab_file, start_word, end_word, unk_word))
+ logging.info(" vocabulary with %d words (includes start_word, end_word, unk_word)" % len(vocab))
# tf.logging.info(" vocabulary with %d words" % len(vocab))

self.vocab = vocab  # vocab[word] = id
@@ -265,10 +265,10 @@ def __init__(self, vocab_file, start_word="<S>", end_word="</S>", unk_word="<UNK
self.end_id = vocab[end_word]
self.unk_id = vocab[unk_word]
self.pad_id = vocab[pad_word]
- print(" start_id: %d" % self.start_id)
- print(" end_id: %d" % self.end_id)
- print(" unk_id: %d" % self.unk_id)
- print(" pad_id: %d" % self.pad_id)
+ logging.info(" start_id: %d" % self.start_id)
+ logging.info(" end_id: %d" % self.end_id)
+ logging.info(" unk_id: %d" % self.unk_id)
+ logging.info(" pad_id: %d" % self.pad_id)

def word_to_id(self, word):
"""Returns the integer word id of a word string."""
@@ -359,7 +359,7 @@ def create_vocab(sentences, word_counts_output_file, min_word_count=1):
...[['<S>', 'one', 'two', ',', 'three', '</S>'], ['<S>', 'four', 'five', 'five', '</S>']]

>>> tl.nlp.create_vocab(processed_capts, word_counts_output_file='vocab.txt', min_word_count=1)
- ... [TL] Creating vocabulary.
+ ... Creating vocabulary.
... Total words: 8
... Words in vocabulary: 8
... Wrote vocabulary file: vocab.txt
@@ -373,24 +373,24 @@ def create_vocab(sentences, word_counts_output_file, min_word_count=1):
... pad_id: 0
"""
from collections import Counter
- print(" [TL] Creating vocabulary.")
+ logging.info(" Creating vocabulary.")
counter = Counter()
for c in sentences:
counter.update(c)
- # print('c',c)
- print(" Total words: %d" % len(counter))
+ # logging.info('c',c)
+ logging.info(" Total words: %d" % len(counter))

# Filter uncommon words and sort by descending count.
word_counts = [x for x in counter.items() if x[1] >= min_word_count]
word_counts.sort(key=lambda x: x[1], reverse=True)
word_counts = [("<PAD>", 0)] + word_counts  # 1st id should be reserved for padding
- # print(word_counts)
- print(" Words in vocabulary: %d" % len(word_counts))
+ # logging.info(word_counts)
+ logging.info(" Words in vocabulary: %d" % len(word_counts))

# Write out the word counts file.
with tf.gfile.FastGFile(word_counts_output_file, "w") as f:
f.write("\n".join(["%s %d" % (w, c) for w, c in word_counts]))
- print(" Wrote vocabulary file: %s" % word_counts_output_file)
+ logging.info(" Wrote vocabulary file: %s" % word_counts_output_file)

# Create the vocabulary dictionary.
reverse_vocab = [x[0] for x in word_counts]
@@ -506,9 +506,9 @@ def read_analogies_file(eval_file='questions-words.txt', word2id={}):
questions_skipped += 1
else:
questions.append(np.array(ids))
- print("Eval analogy file: ", eval_file)
- print("Questions: ", len(questions))
- print("Skipped: ", questions_skipped)
+ logging.info("Eval analogy file: ", eval_file)
+ logging.info("Questions: ", len(questions))
+ logging.info("Skipped: ", questions_skipped)
analogy_questions = np.array(questions, dtype=np.int32)
return analogy_questions

@@ -541,13 +541,13 @@ def build_vocab(data):
"""
# data = _read_words(filename)
counter = collections.Counter(data)
- # print('counter', counter) # dictionary for the occurrence number of each word, e.g. 'banknote': 1, 'photography': 1, 'kia': 1
+ # logging.info('counter', counter) # dictionary for the occurrence number of each word, e.g. 'banknote': 1, 'photography': 1, 'kia': 1
count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
- # print('count_pairs',count_pairs) # convert dictionary to list of tuple, e.g. ('ssangyong', 1), ('swapo', 1), ('wachter', 1)
+ # logging.info('count_pairs',count_pairs) # convert dictionary to list of tuple, e.g. ('ssangyong', 1), ('swapo', 1), ('wachter', 1)
words, _ = list(zip(*count_pairs))
word_to_id = dict(zip(words, range(len(words))))
- # print(words) # list of words
- # print(word_to_id) # dictionary for word to id, e.g. 'campbell': 2587, 'atlantic': 2247, 'aoun': 6746
+ # logging.info(words) # list of words
+ # logging.info(word_to_id) # dictionary for word to id, e.g. 'campbell': 2587, 'atlantic': 2247, 'aoun': 6746
return word_to_id


@@ -627,8 +627,8 @@ def build_words_dataset(words=[], vocabulary_size=50000, printable=True, unk_key
count[0][1] = unk_count
reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
if printable:
- print('Real vocabulary size %d' % len(collections.Counter(words).keys()))
- print('Limited vocabulary size {}'.format(vocabulary_size))
+ logging.info('Real vocabulary size %d' % len(collections.Counter(words).keys()))
+ logging.info('Limited vocabulary size {}'.format(vocabulary_size))
assert len(collections.Counter(words).keys()) >= vocabulary_size, \
"the limited vocabulary_size must be less than or equal to the read vocabulary_size"
return data, count, dictionary, reverse_dictionary
@@ -670,10 +670,10 @@ def words_to_word_ids(data=[], word_to_id={}, unk_key='UNK'):
- `tensorflow.models.rnn.ptb.reader <https://github.com/tensorflow/tensorflow/tree/master/tensorflow/models/rnn/ptb>`_
"""
# if isinstance(data[0], six.string_types):
- # print(type(data[0]))
+ # logging.info(type(data[0]))
# # exit()
- # print(data[0])
- # print(word_to_id)
+ # logging.info(data[0])
+ # logging.info(word_to_id)
# return [word_to_id[str(word)] for word in data]
# else:

@@ -687,11 +687,11 @@ def words_to_word_ids(data=[], word_to_id={}, unk_key='UNK'):
# return [word_to_id[word] for word in data] # this one

# if isinstance(data[0], str):
- # # print('is a string object')
+ # # logging.info('is a string object')
# return [word_to_id[word] for word in data]
# else:#if isinstance(s, bytes):
- # # print('is a unicode object')
- # # print(data[0])
+ # # logging.info('is a unicode object')
+ # # logging.info(data[0])
# return [word_to_id[str(word)] f


@@ -749,7 +749,7 @@ def save_vocab(count=[], name='vocab.txt'):
with open(os.path.join(pwd, name), "w") as f:
for i in xrange(vocabulary_size):
f.write("%s %d\n" % (tf.compat.as_text(count[i][0]), count[i][1]))
- print("%d vocab saved to %s in %s" % (vocabulary_size, name, pwd))
+ logging.info("%d vocab saved to %s in %s" % (vocabulary_size, name, pwd))


# Functions for translation
@@ -772,7 +772,7 @@ def basic_tokenizer(sentence, _WORD_SPLIT=re.compile(b"([.,!?\"':;)(])")):
>>> with gfile.GFile(train_path + ".en", mode="rb") as f:
>>> for line in f:
>>> tokens = tl.nlp.basic_tokenizer(line)
- >>> print(tokens)
+ >>> logging.info(tokens)
>>> exit()
... [b'Changing', b'Lives', b'|', b'Changing', b'Society', b'|', b'How',
... b'It', b'Works', b'|', b'Technology', b'Drives', b'Change', b'Home',
@@ -821,14 +821,14 @@ def create_vocabulary(vocabulary_path,
- Code from ``/tensorflow/models/rnn/translation/data_utils.py``
"""
if not gfile.Exists(vocabulary_path):
- print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
+ logging.info("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
vocab = {}
with gfile.GFile(data_path, mode="rb") as f:
counter = 0
for line in f:
counter += 1
if counter % 100000 == 0:
- print(" processing line %d" % counter)
+ logging.info(" processing line %d" % counter)
tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
for w in tokens:
word = re.sub(_DIGIT_RE, b"0", w) if normalize_digits else w
@@ -843,7 +843,7 @@ def create_vocabulary(vocabulary_path,
for w in vocab_list:
vocab_file.write(w + b"\n")
else:
- print("Vocabulary %s from data %s exists" % (vocabulary_path, data_path))
+ logging.info("Vocabulary %s from data %s exists" % (vocabulary_path, data_path))


def initialize_vocabulary(vocabulary_path):
@@ -948,19 +948,19 @@ def data_to_token_ids(data_path, target_path, vocabulary_path, tokenizer=None, n
- Code from ``/tensorflow/models/rnn/translation/data_utils.py``
"""
if not gfile.Exists(target_path):
- print("Tokenizing data in %s" % data_path)
+ logging.info("Tokenizing data in %s" % data_path)
vocab, _ = initialize_vocabulary(vocabulary_path)
with gfile.GFile(data_path, mode="rb") as data_file:
with gfile.GFile(target_path, mode="w") as tokens_file:
counter = 0
for line in data_file:
counter += 1
if counter % 100000 == 0:
- print(" tokenizing line %d" % counter)
+ logging.info(" tokenizing line %d" % counter)
token_ids = sentence_to_token_ids(line, vocab, tokenizer, normalize_digits, UNK_ID=UNK_ID, _DIGIT_RE=_DIGIT_RE)
tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
else:
- print("Target path %s exists" % target_path)
+ logging.info("Target path %s exists" % target_path)


def moses_multi_bleu(hypotheses, references, lowercase=False):  # tl.nlp
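Aside: the diff imports `from . import _logging as logging` and then routes all former `print` calls through `logging.info(...)`. The `_logging` module itself is not shown in this excerpt; the sketch below is a hypothetical minimal shim, assuming it simply configures and re-exports Python's standard `logging` module (the real `_logging.py` in this pull request may differ).

# _logging.py -- hypothetical minimal shim, not taken from this diff
import logging

# One logger shared by the package, so applications can silence or redirect it.
logger = logging.getLogger("tensorlayer")
logger.setLevel(logging.INFO)

if not logger.handlers:
    # Default to a stream handler with a simple format; callers may replace it.
    _handler = logging.StreamHandler()
    _handler.setFormatter(logging.Formatter("[TL] %(message)s"))
    logger.addHandler(_handler)

# Re-export the functions used throughout nlp.py, e.g. logging.info(...).
info = logger.info
warning = logger.warning
error = logger.error

With a shim along these lines, every `logging.info(...)` call in `nlp.py` goes through a single configurable logger instead of bare `print`, which appears to be the intent of this change.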