Skip to content

Commit cdaec15

Browse files
committed
Bumped version to 1.8.3 after adding a detokenization rule for English "n't" contractions (e.g. "don't") and upgrading the kaflib-naf dependency.
1 parent 0de227a commit cdaec15

File tree

4 files changed

+13
-21
lines changed

4 files changed

+13
-21
lines changed

dependency-reduced-pom.xml

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
<groupId>eus.ixa</groupId>
55
<artifactId>ixa-pipe-tok</artifactId>
66
<name>ixa-pipe-tok</name>
7-
<version>1.8.2</version>
7+
<version>1.8.3</version>
88
<description>The ixa pipes tokenizer (ixa2.si.ehu.es/ixa-pipes)</description>
99
<url>http://ixa2.si.ehu.es/ixa-pipes</url>
1010
<prerequisites>
@@ -154,12 +154,6 @@
154154
</build>
155155
</profile>
156156
</profiles>
157-
<repositories>
158-
<repository>
159-
<id>apache opennlp snapshots</id>
160-
<url>https://repository.apache.org/content/repositories/snapshots/</url>
161-
</repository>
162-
</repositories>
163157
<dependencies>
164158
<dependency>
165159
<groupId>junit</groupId>

pom.xml

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
<groupId>eus.ixa</groupId>
55
<artifactId>ixa-pipe-tok</artifactId>
66
<packaging>jar</packaging>
7-
<version>1.8.2</version>
7+
<version>1.8.3</version>
88
<name>ixa-pipe-tok</name>
99
<description>The ixa pipes tokenizer (ixa2.si.ehu.es/ixa-pipes)</description>
1010
<licenses>
@@ -30,16 +30,12 @@
3030
</prerequisites>
3131
<url>http://ixa2.si.ehu.es/ixa-pipes</url>
3232
<repositories>
33-
<repository>
34-
<id>apache opennlp snapshots</id>
35-
<url>https://repository.apache.org/content/repositories/snapshots/</url>
36-
</repository>
3733
</repositories>
3834
<dependencies>
3935
<dependency>
4036
<groupId>com.github.ixa-ehu</groupId>
4137
<artifactId>kaflib-naf</artifactId>
42-
<version>1.0.3</version>
38+
<version>1.1.12</version>
4339
</dependency>
4440
<dependency>
4541
<groupId>junit</groupId>

src/main/java/eus/ixa/ixa/pipe/tok/Annotate.java

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,7 @@ public void tokenizeToKAF(final KAFDocument kaf) throws IOException {
7777
if (token.getTokenValue().equals(RuleBasedSegmenter.PARAGRAPH)) {
7878
++noParas;
7979
} else {
80-
final WF wf = kaf.newWF(token.getTokenValue(), token.startOffset(),
81-
noSents);
80+
final WF wf = kaf.newWF(token.startOffset(), token.getTokenValue(), noSents);
8281
wf.setLength(token.tokenLength());
8382
wf.setPara(noParas);
8483
}
@@ -180,9 +179,9 @@ public static void tokensToKAF(final Reader breader, final KAFDocument kaf)
180179
}
181180
} else {
182181
// TODO add offset
183-
final WF wf = kaf.newWF(token);
182+
final WF wf = kaf.newWF(0, token, noSents);
184183
wf.setPara(noParas);
185-
wf.setSent(noSents);
184+
//wf.setSent(noSents);
186185
}
187186
}
188187
}

src/main/java/eus/ixa/ixa/pipe/tok/RuleBasedTokenizer.java

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,10 @@ public class RuleBasedTokenizer implements Tokenizer {
165165
*/
166166
public static Pattern endOfSentenceApos = Pattern.compile("([^\\p{Alpha}])("
167167
+ Normalizer.TO_ASCII_SINGLE_QUOTE + ")$");
168+
/**
169+
* Detokenize wrongly tokenized n't English contractions.
170+
*/
171+
public static Pattern deTokenEnglishNegation = Pattern.compile("([n])(" + Normalizer.TO_ASCII_SINGLE_QUOTE + ")\\s+([t])", Pattern.UNICODE_CHARACTER_CLASS);
168172
/**
169173
* De-tokenize paragraph marks.
170174
*/
@@ -277,7 +281,7 @@ private String[] getTokens(String line) {
277281
line = digitCommaNoDigit.matcher(line).replaceAll("$1 $2 $3");
278282
line = noDigitCommaDigit.matcher(line).replaceAll("$1 $2 $3");
279283

280-
// contractions it's, l'agila, c'est
284+
// contractions it's, l'agila, c'est, don't
281285
line = treatContractions(line);
282286
// exceptions for period tokenization
283287
line = nonBreaker.TokenizerNonBreaker(line);
@@ -358,10 +362,9 @@ private String treatContractions(String line) {
358362
line = englishApos.matcher(line).replaceAll("$1 $2$3");
359363
line = yearApos.matcher(line).replaceAll("$1 $2$3");
360364
// romance tokenization of apostrophes c' l'
361-
if (!lang.equalsIgnoreCase("en")) {
362-
line = AlphaAposAlpha.matcher(line).replaceAll("$1$2 $3");
363-
}
365+
line = AlphaAposAlpha.matcher(line).replaceAll("$1$2 $3");
364366
line = endOfSentenceApos.matcher(line).replaceAll("$1 $2");
367+
line = deTokenEnglishNegation.matcher(line).replaceAll("$1$2$3");
365368
return line;
366369
}
367370

0 commit comments

Comments
 (0)