Skip to content

Commit cdaec15

Browse files
committed
Bumped version to 1.8.3 after adding a detokenization rule for English "n't" contractions (e.g. "don't") and upgrading the kaflib-naf dependency.
1 parent 0de227a commit cdaec15

File tree

4 files changed

+13
-21
lines changed

4 files changed

+13
-21
lines changed

dependency-reduced-pom.xml

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
<groupId>eus.ixa</groupId>
55
<artifactId>ixa-pipe-tok</artifactId>
66
<name>ixa-pipe-tok</name>
7-
<version>1.8.2</version>
7+
<version>1.8.3</version>
88
<description>The ixa pipes tokenizer (ixa2.si.ehu.es/ixa-pipes)</description>
99
<url>http://ixa2.si.ehu.es/ixa-pipes</url>
1010
<prerequisites>
@@ -154,12 +154,6 @@
154154
</build>
155155
</profile>
156156
</profiles>
157-
<repositories>
158-
<repository>
159-
<id>apache opennlp snapshots</id>
160-
<url>https://repository.apache.org/content/repositories/snapshots/</url>
161-
</repository>
162-
</repositories>
163157
<dependencies>
164158
<dependency>
165159
<groupId>junit</groupId>

pom.xml

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
<groupId>eus.ixa</groupId>
55
<artifactId>ixa-pipe-tok</artifactId>
66
<packaging>jar</packaging>
7-
<version>1.8.2</version>
7+
<version>1.8.3</version>
88
<name>ixa-pipe-tok</name>
99
<description>The ixa pipes tokenizer (ixa2.si.ehu.es/ixa-pipes)</description>
1010
<licenses>
@@ -30,16 +30,12 @@
3030
</prerequisites>
3131
<url>http://ixa2.si.ehu.es/ixa-pipes</url>
3232
<repositories>
33-
<repository>
34-
<id>apache opennlp snapshots</id>
35-
<url>https://repository.apache.org/content/repositories/snapshots/</url>
36-
</repository>
3733
</repositories>
3834
<dependencies>
3935
<dependency>
4036
<groupId>com.github.ixa-ehu</groupId>
4137
<artifactId>kaflib-naf</artifactId>
42-
<version>1.0.3</version>
38+
<version>1.1.12</version>
4339
</dependency>
4440
<dependency>
4541
<groupId>junit</groupId>

src/main/java/eus/ixa/ixa/pipe/tok/Annotate.java

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,7 @@ public void tokenizeToKAF(final KAFDocument kaf) throws IOException {
7777
if (token.getTokenValue().equals(RuleBasedSegmenter.PARAGRAPH)) {
7878
++noParas;
7979
} else {
80-
final WF wf = kaf.newWF(token.getTokenValue(), token.startOffset(),
81-
noSents);
80+
final WF wf = kaf.newWF(token.startOffset(), token.getTokenValue(), noSents);
8281
wf.setLength(token.tokenLength());
8382
wf.setPara(noParas);
8483
}
@@ -180,9 +179,9 @@ public static void tokensToKAF(final Reader breader, final KAFDocument kaf)
180179
}
181180
} else {
182181
// TODO add offset
183-
final WF wf = kaf.newWF(token);
182+
final WF wf = kaf.newWF(0, token, noSents);
184183
wf.setPara(noParas);
185-
wf.setSent(noSents);
184+
//wf.setSent(noSents);
186185
}
187186
}
188187
}

src/main/java/eus/ixa/ixa/pipe/tok/RuleBasedTokenizer.java

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,10 @@ public class RuleBasedTokenizer implements Tokenizer {
165165
*/
166166
public static Pattern endOfSentenceApos = Pattern.compile("([^\\p{Alpha}])("
167167
+ Normalizer.TO_ASCII_SINGLE_QUOTE + ")$");
168+
/**
169+
* Detokenize wrongly tokenized n't English contractions.
170+
*/
171+
public static Pattern deTokenEnglishNegation = Pattern.compile("([n])(" + Normalizer.TO_ASCII_SINGLE_QUOTE + ")\\s+([t])", Pattern.UNICODE_CHARACTER_CLASS);
168172
/**
169173
* De-tokenize paragraph marks.
170174
*/
@@ -277,7 +281,7 @@ private String[] getTokens(String line) {
277281
line = digitCommaNoDigit.matcher(line).replaceAll("$1 $2 $3");
278282
line = noDigitCommaDigit.matcher(line).replaceAll("$1 $2 $3");
279283

280-
// contractions it's, l'agila, c'est
284+
// contractions it's, l'agila, c'est, don't
281285
line = treatContractions(line);
282286
// exceptions for period tokenization
283287
line = nonBreaker.TokenizerNonBreaker(line);
@@ -358,10 +362,9 @@ private String treatContractions(String line) {
358362
line = englishApos.matcher(line).replaceAll("$1 $2$3");
359363
line = yearApos.matcher(line).replaceAll("$1 $2$3");
360364
// romance tokenization of apostrophes c' l'
361-
if (!lang.equalsIgnoreCase("en")) {
362-
line = AlphaAposAlpha.matcher(line).replaceAll("$1$2 $3");
363-
}
365+
line = AlphaAposAlpha.matcher(line).replaceAll("$1$2 $3");
364366
line = endOfSentenceApos.matcher(line).replaceAll("$1 $2");
367+
line = deTokenEnglishNegation.matcher(line).replaceAll("$1$2$3");
365368
return line;
366369
}
367370

0 commit comments

Comments
 (0)