-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmake_lsd.py
More file actions
executable file
·77 lines (67 loc) · 3.09 KB
/
make_lsd.py
File metadata and controls
executable file
·77 lines (67 loc) · 3.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# Author: Sara Derakhshani
# This script is only necessary when downloading the original Lexicoder Sentiment Dictionary ("LSDaug2015") from the tool's website: https://www.snsoroka.com/data-lexicoder/
# The UMUC repository already contains the unzipped and preprocessed files in /Code/lexicoder_sentiment_scores/LSData. Therefore, for sentiment extraction with lexicoder, it is not necessary to run this script.
from csv import writer
from re import sub
import configparser
def extract_polarity(polarity: str) -> str:
    """Extract the polarity label from a category header line.

    The first character of the line (the "+" marker that the caller checks
    for) is dropped, anything from the first "#" onwards is discarded, and
    surrounding whitespace is removed.

    Args:
        polarity: A raw header line, e.g. "+positive" followed by a newline.

    Returns:
        The bare polarity label without marker, trailing "#" part or
        surrounding whitespace.
    """
    # Cut at "#" *before* stripping so that whitespace between the label and
    # the "#" does not survive in the returned label.
    label, _, _ = polarity[1:].partition("#")
    return label.strip()
def create_lex_entry(pattern: str) -> list:
    """Create an entry with meta info for the new LSD.

    Args:
        pattern: A raw lexicon line; surrounding whitespace is ignored.

    Returns:
        A three-element list ``[entry, n_tokens, is_prefix]`` where ``entry``
        is the stripped pattern (with asterisks removed if it is a prefix
        pattern), ``n_tokens`` is the number of whitespace-separated tokens
        and ``is_prefix`` is 1 if the pattern ended with "*", otherwise 0.
    """
    pattern = pattern.strip()
    # Tokens are counted before the wildcard is removed; "*" is attached to a
    # token, so the count is the same either way.
    n_tokens = len(pattern.split())
    is_prefix = pattern.endswith("*")
    if is_prefix:
        # Plain string replacement: no regex (and no invalid "\*" escape in a
        # non-raw string literal) is needed to drop a literal character.
        pattern = pattern.replace("*", "")
    return [pattern, n_tokens, int(is_prefix)]
def main(unprocessed_lsd, unprocessed_lsd_neg, lsd):
    """Build the tab-separated lexicon file from the two raw LSD files.

    Both input files are read line by line. A line starting with "+" sets the
    polarity for all following entries; every other line becomes one output
    row ``lexEntry, nrOfTokens, isPrefix, polarity``. Lines containing the
    substring "unite" are skipped (carried over from the original script; the
    reason is not documented here), as are blank lines.

    Args:
        unprocessed_lsd: Path of the raw LSD file.
        unprocessed_lsd_neg: Path of the raw negated LSD file.
        lsd: Path of the lexicon file to write (overwritten if it exists).
    """
    polarity = None
    # Open the inputs first so that a missing input file raises before the
    # output file is created/truncated. All files are closed by the ``with``.
    with open(unprocessed_lsd, "r", encoding="utf-8") as lex_f, \
            open(unprocessed_lsd_neg, "r", encoding="utf-8") as lex_neg_f, \
            open(lsd, "w", encoding="utf-8", newline="") as out_f:
        # newline="" lets the csv writer control line endings itself, as the
        # csv module documentation requires.
        out_writer = writer(out_f, delimiter='\t')
        out_writer.writerow(["lexEntry", "nrOfTokens", "isPrefix", "polarity"])
        # The regular and the negated lexicon are processed identically, and
        # the current polarity carries over from one file to the next.
        for source in (lex_f, lex_neg_f):
            for line in source:
                # Get polarity
                if line.startswith("+"):
                    polarity = extract_polarity(line)
                    continue
                # Skip this entry; also skip blank lines, which would
                # otherwise be written as empty lexicon entries.
                if "unite" in line or not line.strip():
                    continue
                new_entry = create_lex_entry(line)
                new_entry.append(polarity)
                out_writer.writerow(new_entry)
    print(f"\n Lexicon successfully created and saved in {lsd} \n")
if __name__ == "__main__":
    # Read the three file paths from the LEXICODER section of config.ini
    # (looked up relative to the current working directory) and build the
    # lexicon from them.
    parser = configparser.ConfigParser()
    parser.read("config.ini")
    lexicoder_cfg = parser["LEXICODER"]
    main(
        lexicoder_cfg["UNPROCESSED_LSD"],
        lexicoder_cfg["UNPROCESSED_LSD_NEG"],
        lexicoder_cfg["LSD"],
    )