Skip to content

Commit 2e6627e

Browse files
committed
Fixed a bug that caused a loading error if a mention couldn't be found inside a tweet. Fixed the problem that some mentions are listed without hashtags although they contain hashtags inside the real tweet. Added a test for this type of 'search with ignored hashtags'. refs #81
1 parent a543e9b commit 2e6627e

File tree

4 files changed

+196
-64
lines changed

4 files changed

+196
-64
lines changed

src/main/java/org/aksw/gerbil/bat/datasets/Microposts2014Dataset.java

Lines changed: 106 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import it.acubelab.batframework.utils.AnnotationException;
3131
import it.acubelab.batframework.utils.ProblemReduction;
3232
import it.acubelab.batframework.utils.WikipediaApiInterface;
33+
import it.unimi.dsi.fastutil.ints.IntArrayList;
3334
import it.unimi.dsi.lang.MutableString;
3435

3536
import java.io.BufferedReader;
@@ -48,13 +49,17 @@
4849
import javax.xml.parsers.ParserConfigurationException;
4950
import javax.xml.xpath.XPathExpressionException;
5051

52+
import org.slf4j.Logger;
53+
import org.slf4j.LoggerFactory;
5154
import org.xml.sax.SAXException;
5255

53-
/**
56+
/**
5457
* @author Giuseppe Rizzo <giuse.rizzo@gmail.com>
5558
*/
5659
public class Microposts2014Dataset implements A2WDataset {
5760

61+
private static final Logger LOGGER = LoggerFactory.getLogger(Microposts2014Dataset.class);
62+
5863
private List<HashSet<Annotation>> annotations = new Vector<HashSet<Annotation>>();
5964
private List<MutableString> tweets = new Vector<MutableString>();
6065
private Pattern dbpediaUrlPattern = Pattern.compile("http://dbpedia.org/resource/(.*)");
@@ -89,33 +94,52 @@ public Microposts2014Dataset(String file, WikipediaApiInterface wikiApi)
8994
if (mTweet.matches())
9095
{
9196
// current tweet
92-
String tweet = mTweet.group(1);
97+
String tweet = mTweet.group(1);
9398
tweets.add(new MutableString(tweet));
9499

95100
String pairs = mRecord.group(4);
96101
if (pairs != null && !pairs.equals(""))
97102
{
98103
String[] tAnn = pairs.split("\t");
99-
for (int i = 0; i < tAnn.length; i = i + 2)
100-
{
104+
for (int i = 0; i < tAnn.length; i = i + 2)
105+
{
101106
// fetch the DBpedia name
102107
// TODO: naive assumption that all DBpedia resources have the corresponding Wikipedia ones
103108
// better to be verified
104109
Matcher mDBpedia = dbpediaUrlPattern.matcher(tAnn[i + 1]);
105-
if (mDBpedia.matches())
110+
if (mDBpedia.matches())
106111
{
107-
String mention = tAnn[i];
108-
112+
String mention = tAnn[i];
113+
114+
// Let's start getting the title
115+
currentTitle = mDBpedia.group(1);
116+
currentTitle = URLDecoder.decode(currentTitle, "utf-8");
117+
118+
// Try to create a Microposts2014Annotation object by searching the mention inside the
119+
// tweet
120+
Microposts2014Annotation annotation = null;
109121
int offset = indexMentionAlreadySpotted(mention, currentAnns);
110122
int currentPos = tweet.indexOf(mention, offset);
111-
112-
currentTitle = mDBpedia.group(1);
113-
currentTitle = URLDecoder.decode(currentTitle, "utf-8");
114-
currentAnns.add(new Microposts2014Annotation(mention,currentPos, mention.length(), currentTitle));
115-
116-
System.out.println(mention + " " + currentPos + " " + mention.length() + " " + currentTitle);
117-
118-
titlesToPrefetch.add(currentTitle);
123+
if (currentPos >= 0) {
124+
annotation = new Microposts2014Annotation(mention, currentPos, mention.length(),
125+
currentTitle);
126+
}
127+
if (annotation == null) {
128+
// Micha: In some cases the mention is not exactly the same as the part of the text.
129+
// For now, we only can try to remove hash tags and search again.
130+
annotation = findMentionInsideTweetIgnoringHashes(tweet, mention, offset,
131+
currentTitle);
132+
}
133+
if (annotation == null) {
134+
LOGGER.error(
135+
"Couldn't find mention=\"{}\" inside the tweet=\"{}\" (should be there after the offset {}). Ignoring this mention.",
136+
mention, tweet, offset);
137+
} else {
138+
currentAnns.add(annotation);
139+
// System.out.println(mention + " " + currentPos + " " + mention.length() + " "
140+
// + currentTitle);
141+
titlesToPrefetch.add(currentTitle);
142+
}
119143
}
120144

121145
}
@@ -141,7 +165,7 @@ public Microposts2014Dataset(String file, WikipediaApiInterface wikiApi)
141165
for (Microposts2014Annotation aA : s) {
142166
int wid = wikiApi.getIdByTitle(aA.title);
143167
if (wid == -1)
144-
System.out.println("ERROR: Dataset is malformed: Wikipedia API could not find page " + aA.title);
168+
LOGGER.warn("Dataset is malformed: Wikipedia API could not find page " + aA.title);
145169
else
146170
sA.add(new Annotation(aA.position, aA.length, wid));
147171
}
@@ -150,6 +174,61 @@ public Microposts2014Dataset(String file, WikipediaApiInterface wikiApi)
150174
}
151175
}
152176

177+
/**
178+
* A very simple workaround to search for a mention without hashes. Note that this only works, if the mention
179+
* couldn't be found because the tweets contains hash tags that should be part of the mentions.
180+
*
181+
* @param tweet
182+
* the tweet
183+
* @param mention
184+
* the mention that couldn't be found directly inside the tweet
185+
* @param offset
186+
* the position from which the search should start
187+
* @param wikiTitle
188+
* the title of the entity inside the Wikipedia
189+
*
190+
* @return
191+
*/
192+
protected static Microposts2014Annotation findMentionInsideTweetIgnoringHashes(String tweet, String mention,
193+
int offset, String wikiTitle) {
194+
IntArrayList hashes = new IntArrayList();
195+
int pos = tweet.indexOf('#');
196+
while (pos >= 0) {
197+
hashes.add(pos);
198+
pos = tweet.indexOf('#', pos + 1);
199+
}
200+
// There are no hashes --> the problem of finding the mention can't be solved by removing the hashes
201+
if (hashes.size() == 0) {
202+
return null;
203+
}
204+
// The offset might have been moved through the removing of the hashes.
205+
int newOffset = 0;
206+
for (int i = 0; (i < hashes.size() && (hashes.get(i) < newOffset)); ++i) {
207+
--newOffset;
208+
}
209+
String newTweet = tweet.replaceAll("#", "");
210+
pos = newTweet.indexOf(mention, newOffset);
211+
// if the mention couldn't be found
212+
if (pos < 0) {
213+
return null;
214+
}
215+
// find the start and end positions of the mention inside the original tweet by looking at the list of hashes
216+
int startPos = pos;
217+
int endPos = pos + mention.length();
218+
for (int i = 0; i < hashes.size(); ++i) {
219+
if (hashes.get(i) < endPos) {
220+
++endPos;
221+
if (hashes.get(i) < startPos) {
222+
++startPos;
223+
}
224+
}
225+
}
226+
String newMention = new String(tweet.substring(startPos, endPos));
227+
LOGGER.debug("Couldn't find \"{}\" but found \"{}\" instead.", mention, newMention);
228+
return new Microposts2014Annotation(newMention, startPos, newMention.length(),
229+
wikiTitle);
230+
}
231+
153232
@Override
154233
public int getSize() {
155234
return annotations.size();
@@ -199,22 +278,22 @@ public String getName() {
199278

200279
private int indexMentionAlreadySpotted(String mention, List<Microposts2014Annotation> currentAnns)
201280
{
202-
int result = 0;
203-
for (Microposts2014Annotation a : currentAnns) {
204-
if(a.mention.equals(mention))
205-
result = a.position + mention.length(); //if many, then we get the last
206-
}
207-
return result;
208-
}
209-
210-
private class Microposts2014Annotation {
281+
int result = 0;
282+
for (Microposts2014Annotation a : currentAnns) {
283+
if (a.mention.equals(mention))
284+
result = a.position + mention.length(); // if many, then we get the last
285+
}
286+
return result;
287+
}
288+
289+
protected static class Microposts2014Annotation {
211290
public Microposts2014Annotation(String mention, int position, int length, String title) {
212-
this.mention = mention;
291+
this.mention = mention;
213292
this.position = position;
214293
this.title = title;
215294
this.length = length;
216295
}
217-
296+
218297
public String mention;
219298
public String title;
220299
public int position;

src/main/java/org/aksw/gerbil/datasets/Microposts2014Config.java

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -33,29 +33,29 @@
3333
import org.aksw.gerbil.datatypes.ExperimentType;
3434

3535
/**
36-
* ...
37-
*
36+
* Configuration class that is able to load the Micropost2014 datasets (train and test).
37+
* The datasets are distinguished using the {@link Microposts2014Chunk} enum.
38+
*
3839
* @author Giuseppe Rizzo <giuse.rizzo@gmail.com>
3940
*/
4041
public class Microposts2014Config extends AbstractDatasetConfiguration {
4142

42-
public static final String DATASET_NAME_START = "Microposts2014";
43+
public static final String DATASET_NAME_START = "Microposts2014";
4344
private static final String DATASET_FILE_PROPERTY_NAME = "org.aksw.gerbil.datasets.Microposts2014DatasetConfig";
44-
45+
4546
private Microposts2014Chunk chunk;
4647
private WikipediaApiInterface wikiApi;
4748

4849
public static enum Microposts2014Chunk {
4950
TRAIN, TEST
5051
}
51-
52-
public Microposts2014Config(
53-
Microposts2014Chunk chunk,
54-
WikipediaApiInterface wikiApi
55-
)
56-
{
57-
super(DATASET_NAME_START, true, ExperimentType.Sa2KB);
58-
this.chunk = chunk;
52+
53+
public Microposts2014Config(
54+
Microposts2014Chunk chunk,
55+
WikipediaApiInterface wikiApi)
56+
{
57+
super(DATASET_NAME_START, true, ExperimentType.Sa2KB);
58+
this.chunk = chunk;
5959
this.wikiApi = wikiApi;
6060
// Set the correct name
6161
switch (chunk) {
@@ -68,27 +68,27 @@ public Microposts2014Config(
6868
break;
6969
}
7070
}
71-
}
71+
}
7272

73-
@Override
74-
protected TopicDataset loadDataset() throws Exception {
73+
@Override
74+
protected TopicDataset loadDataset() throws Exception {
7575
switch (chunk) {
7676
case TRAIN: {
77-
String file = GerbilConfiguration.getInstance().getString(DATASET_FILE_PROPERTY_NAME.concat(".Train"));
77+
String file = GerbilConfiguration.getInstance().getString(DATASET_FILE_PROPERTY_NAME.concat(".Train"));
7878
if (file == null) {
7979
throw new IOException("Couldn't load needed Property \"" + DATASET_FILE_PROPERTY_NAME + "\".");
8080
}
8181
return new Microposts2014Dataset(file, wikiApi);
8282
}
8383
case TEST: {
84-
String file = GerbilConfiguration.getInstance().getString(DATASET_FILE_PROPERTY_NAME.concat(".Test"));
84+
String file = GerbilConfiguration.getInstance().getString(DATASET_FILE_PROPERTY_NAME.concat(".Test"));
8585
if (file == null) {
8686
throw new IOException("Couldn't load needed Property \"" + DATASET_FILE_PROPERTY_NAME + "\".");
87-
}
87+
}
8888
return new Microposts2014Dataset(file, wikiApi);
8989
}
9090
}
9191
return null;
92-
}
92+
}
9393

9494
}

src/test/java/org/aksw/gerbil/Microposts2014Test.java

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -36,30 +36,30 @@
3636
import org.junit.Ignore;
3737

3838
/**
39-
* ...
40-
*
39+
* Class for testing the microposts dataset.
40+
*
4141
* @author Giuseppe Rizzo <giuse.rizzo@gmail.com>
4242
*/
4343
@Ignore
4444
public class Microposts2014Test {
4545

46-
public static void main(String[] args) {
47-
48-
WikipediaApiInterface wikiAPI = SingletonWikipediaApi.getInstance();
49-
ExperimentTaskConfiguration taskConfigs[] =
50-
new ExperimentTaskConfiguration[]
51-
{
52-
new ExperimentTaskConfiguration(
53-
new NERDAnnotatorConfig(wikiAPI),
54-
new Microposts2014Config(Microposts2014Chunk.TRAIN, SingletonWikipediaApi.getInstance()),
55-
ExperimentType.D2KB,
56-
Matching.STRONG_ANNOTATION_MATCH) };
57-
Experimenter experimenter = new Experimenter(wikiAPI,
58-
new SimpleLoggingDAO4Debugging(),
59-
taskConfigs,
60-
"NERD_TEST");
46+
public static void main(String[] args) {
47+
48+
WikipediaApiInterface wikiAPI = SingletonWikipediaApi.getInstance();
49+
ExperimentTaskConfiguration taskConfigs[] =
50+
new ExperimentTaskConfiguration[]
51+
{
52+
new ExperimentTaskConfiguration(
53+
new NERDAnnotatorConfig(wikiAPI),
54+
new Microposts2014Config(Microposts2014Chunk.TRAIN, SingletonWikipediaApi.getInstance()),
55+
ExperimentType.D2KB,
56+
Matching.STRONG_ANNOTATION_MATCH) };
57+
Experimenter experimenter = new Experimenter(wikiAPI,
58+
new SimpleLoggingDAO4Debugging(),
59+
taskConfigs,
60+
"MICROPOSTS_TEST");
6161
experimenter.run();
6262

63-
}
63+
}
6464

6565
}
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
package org.aksw.gerbil.bat.datasets;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import org.aksw.gerbil.bat.datasets.Microposts2014Dataset.Microposts2014Annotation;
import org.junit.Assert;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.junit.runners.Parameterized.Parameters;

/**
 * Parameterized test for
 * {@link Microposts2014Dataset#findMentionInsideTweetIgnoringHashes(String, String, int, String)}: every test
 * case searches a hash-free mention inside a tweet whose words carry hash tags and checks that the mention
 * found inside the tweet still contains the hashes.
 */
@RunWith(Parameterized.class)
public class Microposts2014DatasetMentionSearchTest {

    @Parameters
    public static Collection<Object[]> data() {
        List<Object[]> testConfigs = new ArrayList<Object[]>();
        // { mention as given by the dataset, tweet text, expected mention as it occurs inside the tweet }
        addTestCase(testConfigs,
                "NOTW phone hacking",
                "Rupert #Murdoch, asked who was responsible for #NOTW phone #hacking? 'The people I trusted & maybe the people they trusted'",
                "#NOTW phone #hacking");
        addTestCase(testConfigs,
                "Amy Winehouse",
                "#Amy #Winehouse Is #Dead After a Suspected Drug Overdose http://t.co/9KBWCeN via @YahooNews",
                "#Amy #Winehouse");
        addTestCase(testConfigs,
                "White Sox",
                "#MLB Live Score Update #White #Sox (4) - #Indians (2) Final Play By Play Click link: http://rotoinfo.com/gameview?310724105",
                "#White #Sox");
        return testConfigs;
    }

    // small helper that keeps the single test cases readable
    private static void addTestCase(List<Object[]> configs, String mention, String tweet, String expectedMention) {
        configs.add(new Object[] { mention, tweet, expectedMention });
    }

    private final String mention;
    private final String tweet;
    private final String expectedMention;

    public Microposts2014DatasetMentionSearchTest(String mention, String tweet, String expectedMention) {
        this.mention = mention;
        this.tweet = tweet;
        this.expectedMention = expectedMention;
    }

    @Test
    public void test() {
        Microposts2014Annotation annotation = Microposts2014Dataset.findMentionInsideTweetIgnoringHashes(tweet,
                mention, 0, null);
        Assert.assertNotNull(annotation);
        Assert.assertEquals(expectedMention, annotation.mention);
    }
}

0 commit comments

Comments
 (0)