3030import it .acubelab .batframework .utils .AnnotationException ;
3131import it .acubelab .batframework .utils .ProblemReduction ;
3232import it .acubelab .batframework .utils .WikipediaApiInterface ;
33+ import it .unimi .dsi .fastutil .ints .IntArrayList ;
3334import it .unimi .dsi .lang .MutableString ;
3435
3536import java .io .BufferedReader ;
4849import javax .xml .parsers .ParserConfigurationException ;
4950import javax .xml .xpath .XPathExpressionException ;
5051
52+ import org .slf4j .Logger ;
53+ import org .slf4j .LoggerFactory ;
5154import org .xml .sax .SAXException ;
5255
53- /**
56+ /**
5457 * @author Giuseppe Rizzo <giuse.rizzo@gmail.com>
5558 */
5659public class Microposts2014Dataset implements A2WDataset {
5760
61+ private static final Logger LOGGER = LoggerFactory .getLogger (Microposts2014Dataset .class );
62+
5863 private List <HashSet <Annotation >> annotations = new Vector <HashSet <Annotation >>();
5964 private List <MutableString > tweets = new Vector <MutableString >();
6065 private Pattern dbpediaUrlPattern = Pattern .compile ("http://dbpedia.org/resource/(.*)" );
@@ -89,33 +94,52 @@ public Microposts2014Dataset(String file, WikipediaApiInterface wikiApi)
8994 if (mTweet .matches ())
9095 {
9196 // current tweet
92- String tweet = mTweet .group (1 );
97+ String tweet = mTweet .group (1 );
9398 tweets .add (new MutableString (tweet ));
9499
95100 String pairs = mRecord .group (4 );
96101 if (pairs != null && !pairs .equals ("" ))
97102 {
98103 String [] tAnn = pairs .split ("\t " );
99- for (int i = 0 ; i < tAnn .length ; i = i + 2 )
100- {
104+ for (int i = 0 ; i < tAnn .length ; i = i + 2 )
105+ {
101106 // fetch the DBpedia name
102107 // TODO: naive assumption that all DBpedia resources have the corresponding Wikipedia ones
103108 // better to be verified
104109 Matcher mDBpedia = dbpediaUrlPattern .matcher (tAnn [i + 1 ]);
105- if (mDBpedia .matches ())
110+ if (mDBpedia .matches ())
106111 {
107- String mention = tAnn [i ];
108-
112+ String mention = tAnn [i ];
113+
114+ // Let's start getting the title
115+ currentTitle = mDBpedia .group (1 );
116+ currentTitle = URLDecoder .decode (currentTitle , "utf-8" );
117+
118+ // Try to create a Microposts2014Annotation object by searching the mention inside the
119+ // tweet
120+ Microposts2014Annotation annotation = null ;
109121 int offset = indexMentionAlreadySpotted (mention , currentAnns );
110122 int currentPos = tweet .indexOf (mention , offset );
111-
112- currentTitle = mDBpedia .group (1 );
113- currentTitle = URLDecoder .decode (currentTitle , "utf-8" );
114- currentAnns .add (new Microposts2014Annotation (mention ,currentPos , mention .length (), currentTitle ));
115-
116- System .out .println (mention + " " + currentPos + " " + mention .length () + " " + currentTitle );
117-
118- titlesToPrefetch .add (currentTitle );
123+ if (currentPos >= 0 ) {
124+ annotation = new Microposts2014Annotation (mention , currentPos , mention .length (),
125+ currentTitle );
126+ }
127+ if (annotation == null ) {
128+ // Micha: In some cases the mention is not exactly the same as the part of the text.
129+ // For now, we only can try to remove hash tags and search again.
130+ annotation = findMentionInsideTweetIgnoringHashes (tweet , mention , offset ,
131+ currentTitle );
132+ }
133+ if (annotation == null ) {
134+ LOGGER .error (
135+ "Couldn't find mention=\" {}\" inside the tweet=\" {}\" (should be there after the offset {}). Ignoring this mention." ,
136+ mention , tweet , offset );
137+ } else {
138+ currentAnns .add (annotation );
139+ // System.out.println(mention + " " + currentPos + " " + mention.length() + " "
140+ // + currentTitle);
141+ titlesToPrefetch .add (currentTitle );
142+ }
119143 }
120144
121145 }
@@ -141,7 +165,7 @@ public Microposts2014Dataset(String file, WikipediaApiInterface wikiApi)
141165 for (Microposts2014Annotation aA : s ) {
142166 int wid = wikiApi .getIdByTitle (aA .title );
143167 if (wid == -1 )
144- System . out . println ( "ERROR: Dataset is malformed: Wikipedia API could not find page " + aA .title );
168+ LOGGER . warn ( " Dataset is malformed: Wikipedia API could not find page " + aA .title );
145169 else
146170 sA .add (new Annotation (aA .position , aA .length , wid ));
147171 }
@@ -150,6 +174,61 @@ public Microposts2014Dataset(String file, WikipediaApiInterface wikiApi)
150174 }
151175 }
152176
177+ /**
178+ * A very simple workaround to search for a mention without hashes. Note that this only works, if the mention
179+ * couldn't be found because the tweets contains hash tags that should be part of the mentions.
180+ *
181+ * @param tweet
182+ * the tweet
183+ * @param mention
184+ * the mention that couldn't be found directly inside the tweet
185+ * @param offset
186+ * the position from which the search should start
187+ * @param wikiTitle
188+ * the title of the entity inside the Wikipedia
189+ *
190+ * @return
191+ */
192+ protected static Microposts2014Annotation findMentionInsideTweetIgnoringHashes (String tweet , String mention ,
193+ int offset , String wikiTitle ) {
194+ IntArrayList hashes = new IntArrayList ();
195+ int pos = tweet .indexOf ('#' );
196+ while (pos >= 0 ) {
197+ hashes .add (pos );
198+ pos = tweet .indexOf ('#' , pos + 1 );
199+ }
200+ // There are no hashes --> the problem of finding the mention can't be solved by removing the hashes
201+ if (hashes .size () == 0 ) {
202+ return null ;
203+ }
204+ // The offset might have been moved through the removing of the hashes.
205+ int newOffset = 0 ;
206+ for (int i = 0 ; (i < hashes .size () && (hashes .get (i ) < newOffset )); ++i ) {
207+ --newOffset ;
208+ }
209+ String newTweet = tweet .replaceAll ("#" , "" );
210+ pos = newTweet .indexOf (mention , newOffset );
211+ // if the mention couldn't be found
212+ if (pos < 0 ) {
213+ return null ;
214+ }
215+ // find the start and end positions of the mention inside the original tweet by looking at the list of hashes
216+ int startPos = pos ;
217+ int endPos = pos + mention .length ();
218+ for (int i = 0 ; i < hashes .size (); ++i ) {
219+ if (hashes .get (i ) < endPos ) {
220+ ++endPos ;
221+ if (hashes .get (i ) < startPos ) {
222+ ++startPos ;
223+ }
224+ }
225+ }
226+ String newMention = new String (tweet .substring (startPos , endPos ));
227+ LOGGER .debug ("Couldn't find \" {}\" but found \" {}\" instead." , mention , newMention );
228+ return new Microposts2014Annotation (newMention , startPos , newMention .length (),
229+ wikiTitle );
230+ }
231+
153232 @ Override
154233 public int getSize () {
155234 return annotations .size ();
@@ -199,22 +278,22 @@ public String getName() {
199278
200279 private int indexMentionAlreadySpotted (String mention , List <Microposts2014Annotation > currentAnns )
201280 {
202- int result = 0 ;
203- for (Microposts2014Annotation a : currentAnns ) {
204- if (a .mention .equals (mention ))
205- result = a .position + mention .length (); //if many, then we get the last
206- }
207- return result ;
208- }
209-
210- private class Microposts2014Annotation {
281+ int result = 0 ;
282+ for (Microposts2014Annotation a : currentAnns ) {
283+ if (a .mention .equals (mention ))
284+ result = a .position + mention .length (); // if many, then we get the last
285+ }
286+ return result ;
287+ }
288+
289+ protected static class Microposts2014Annotation {
211290 public Microposts2014Annotation (String mention , int position , int length , String title ) {
212- this .mention = mention ;
291+ this .mention = mention ;
213292 this .position = position ;
214293 this .title = title ;
215294 this .length = length ;
216295 }
217-
296+
218297 public String mention ;
219298 public String title ;
220299 public int position ;
0 commit comments