-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMyTopicModeling.py
More file actions
60 lines (46 loc) · 1.85 KB
/
MyTopicModeling.py
File metadata and controls
60 lines (46 loc) · 1.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# -*- coding: utf-8 -*-
"""
Created on Sun Feb 04 22:03:17 2018
@author: Mendez Vasquez
"""
import os
import numpy as np # a conventional alias
import sklearn.feature_extraction.text as text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import unicodedata
import codecs
from googletrans import Translator
def load_codecs(file_name):
    """Read *file_name* as latin-1 text, ignoring undecodable bytes.

    Returns the file's lines as a list of strings (trailing newlines kept).
    """
    handle = codecs.open(file_name, "r", encoding='latin_1', errors='ignore')
    try:
        return handle.readlines()
    finally:
        handle.close()
def elimina_tildes(s):
    """Return *s* with combining accent marks (tildes/diacritics) removed.

    Decomposes the string to NFD form so each accented character splits into
    a base character plus combining marks (Unicode category 'Mn'), then drops
    the marks, e.g. 'canción' -> 'cancion'.
    """
    # Bug fix: the original ended with `return s.decode()`, a Python 2
    # leftover that raises AttributeError on Python 3 (str has no .decode).
    # The joined string is already the desired result.
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')
def load_file(filename):
    """Read *filename* (platform default encoding) and return its contents
    as a single string with every newline character removed."""
    with open(filename, 'r') as fh:
        contents = fh.read()
    return contents.replace('\n', '')
def display_topics(model, feature_names, no_top_words):
    """Print the *no_top_words* highest-weighted terms of each topic.

    *model* must expose ``components_`` (one weight vector per topic, e.g. a
    fitted sklearn NMF/LDA); *feature_names* maps a column index to its term.
    """
    for idx, weights in enumerate(model.components_):
        print("Topic %d:" % idx)
        # Indices of the largest weights, best first.
        top_indices = weights.argsort()[::-1][:no_top_words]
        print(" ".join(feature_names[i] for i in top_indices))
# --- Script: translate a Spanish text corpus and extract topics with NMF ---
# Directory holding the corpus files (relative to the working directory).
CORPUS_PATH = os.path.join('texto')
# googletrans client; each translate() call goes over the network.
translator = Translator()
# Sorted paths of every file in the corpus directory.
filenames = sorted([os.path.join(CORPUS_PATH, fn) for fn in os.listdir(CORPUS_PATH)])
# One list of latin-1 lines per file (list of lists).
texto = [load_codecs(txt_file) for txt_file in filenames]
# NOTE(review): only the FIRST file's lines are kept; each LINE then acts as
# one "document" below — confirm this is intentional and not meant to span
# all files in the directory.
texto = texto[0]
# Translate each line (googletrans auto-detects source, defaults to English).
texto_trad = [translator.translate(txt_file).text for txt_file in texto]
# TF-IDF features: drop terms in >95% of documents or in fewer than 2,
# plus English stop words (the corpus is English after translation).
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(texto_trad)
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2
# (replaced by get_feature_names_out()) — this script targets an old sklearn.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
# Number of latent topics to extract.
no_topics = 5
# NMF with deterministic NNDSVD init; `alpha`/`l1_ratio` regularization
# arguments were deprecated and later removed in newer scikit-learn.
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
# Words printed per topic.
no_top_words = 10
display_topics(nmf, tfidf_feature_names, no_top_words)
# Dead code kept as a module-level string literal: an alternative pipeline
# using raw term counts (CountVectorizer) instead of TF-IDF.
"""
svectorizer = text.CountVectorizer(input='filename', min_df=20)
dtm = vectorizer.fit_transform(filenames).toarray()
vocab = np.array(vectorizer.get_feature_names())
"""