Skip to content

Commit be558fe

Browse files
committed
Revert "Switch from tika-parsers to tika-core (#5217)"
This reverts commit 29cf4f2
1 parent c34f1d3 commit be558fe

File tree

5 files changed

+9
-66
lines changed

5 files changed

+9
-66
lines changed

build.gradle

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ dependencies {
122122
compile 'org.apache.pdfbox:fontbox:2.0.16'
123123
compile 'org.apache.pdfbox:xmpbox:2.0.16'
124124

125-
compile group: 'org.apache.tika', name: 'tika-core', version: '1.22'
125+
compile group: 'org.apache.tika', name: 'tika-parsers', version: '1.22'
126126

127127
// required for reading write-protected PDFs - see https://github.com/JabRef/jabref/pull/942#issuecomment-209252635
128128
compile 'org.bouncycastle:bcprov-jdk15on:1.62'

docs/adr/0005-fully-support-utf8-only-for-latex-files.md

Lines changed: 0 additions & 44 deletions
This file was deleted.

docs/adr/index.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ This log lists the architectural decisions for JabRef.
99
- [ADR-0002](0002-use-slf4j-for-logging.md) - Use slf4j together with log4j2 for logging
1010
- [ADR-0003](0003-use-gradle-as-build-tool.md) - Use Gradle as build tool
1111
- [ADR-0004](0004-use-mariadb-connector.md) - Use MariaDB Connector
12-
- [ADR-0005](0005-fully-support-utf8-only-for-latex-files.md) - Fully Support UTF-8 Only For LaTeX Files
1312

1413
<!-- adrlogstop -->
1514

src/main/java/org/jabref/logic/texparser/DefaultTexParser.java

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,10 @@
11
package org.jabref.logic.texparser;
22

33
import java.io.IOException;
4-
import java.io.InputStream;
5-
import java.io.InputStreamReader;
64
import java.io.LineNumberReader;
75
import java.io.Reader;
86
import java.io.UncheckedIOException;
97
import java.nio.channels.ClosedChannelException;
10-
import java.nio.charset.StandardCharsets;
118
import java.nio.file.Files;
129
import java.nio.file.Path;
1310
import java.nio.file.Paths;
@@ -20,6 +17,7 @@
2017
import org.jabref.model.texparser.TexParser;
2118
import org.jabref.model.texparser.TexParserResult;
2219

20+
import org.apache.tika.parser.txt.CharsetDetector;
2321
import org.slf4j.Logger;
2422
import org.slf4j.LoggerFactory;
2523

@@ -84,8 +82,7 @@ public TexParserResult parse(List<Path> texFiles) {
8482
}
8583

8684
try (
87-
InputStream inputStream = Files.newInputStream(file);
88-
Reader reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8);
85+
Reader reader = new CharsetDetector().setText(Files.readAllBytes(file)).detect().getReader();
8986
LineNumberReader lineNumberReader = new LineNumberReader(reader)) {
9087
for (String line = lineNumberReader.readLine(); line != null; line = lineNumberReader.readLine()) {
9188
// Skip comments and blank lines.

src/test/java/org/jabref/logic/texparser/DefaultTexParserTest.java

Lines changed: 6 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
import static org.junit.jupiter.api.Assertions.assertEquals;
1313

1414
public class DefaultTexParserTest {
15-
1615
private final static String DARWIN = "Darwin1888";
1716
private final static String EINSTEIN = "Einstein1920";
1817
private final static String NEWTON = "Newton1999";
@@ -89,9 +88,7 @@ public void testFileEncodingIso88591() throws URISyntaxException {
8988
TexParserResult expectedParserResult = new TexParserResult();
9089

9190
expectedParserResult.getFileList().add(texFile);
92-
// The character � is on purpose - we cannot use Apache Tika's CharsetDetector - see ADR-0005
93-
expectedParserResult
94-
.addKey("anykey", texFile, 1, 32, 45, "Danach wir anschlie�end mittels \\cite{anykey}.");
91+
expectedParserResult.addKey("anykey", texFile, 1, 32, 45, "Danach wir anschließend mittels \\cite{anykey}.");
9592

9693
assertEquals(expectedParserResult, parserResult);
9794
}
@@ -104,9 +101,7 @@ public void testFileEncodingIso885915() throws URISyntaxException {
104101
TexParserResult expectedParserResult = new TexParserResult();
105102

106103
expectedParserResult.getFileList().add(texFile);
107-
// The character � is on purpose - we cannot use Apache Tika's CharsetDetector - see ADR-0005
108-
expectedParserResult
109-
.addKey("anykey", texFile, 1, 32, 45, "Danach wir anschlie�end mittels \\cite{anykey}.");
104+
expectedParserResult.addKey("anykey", texFile, 1, 32, 45, "Danach wir anschließend mittels \\cite{anykey}.");
110105

111106
assertEquals(expectedParserResult, parserResult);
112107
}
@@ -117,17 +112,13 @@ public void testFileEncodingForThreeFiles() throws URISyntaxException {
117112
Path texFile2 = Paths.get(DefaultTexParserTest.class.getResource("iso-8859-1.tex").toURI());
118113
Path texFile3 = Paths.get(DefaultTexParserTest.class.getResource("iso-8859-15.tex").toURI());
119114

120-
TexParserResult parserResult = new DefaultTexParser()
121-
.parse(Arrays.asList(texFile, texFile2, texFile3));
115+
TexParserResult parserResult = new DefaultTexParser().parse(Arrays.asList(texFile, texFile2, texFile3));
122116
TexParserResult expectedParserResult = new TexParserResult();
123117

124118
expectedParserResult.getFileList().addAll(Arrays.asList(texFile, texFile2, texFile3));
125-
expectedParserResult
126-
.addKey("anykey", texFile, 1, 32, 45, "Danach wir anschließend mittels \\cite{anykey}.");
127-
expectedParserResult
128-
.addKey("anykey", texFile2, 1, 32, 45, "Danach wir anschlie�end mittels \\cite{anykey}.");
129-
expectedParserResult
130-
.addKey("anykey", texFile3, 1, 32, 45, "Danach wir anschlie�end mittels \\cite{anykey}.");
119+
expectedParserResult.addKey("anykey", texFile, 1, 32, 45, "Danach wir anschließend mittels \\cite{anykey}.");
120+
expectedParserResult.addKey("anykey", texFile2, 1, 32, 45, "Danach wir anschließend mittels \\cite{anykey}.");
121+
expectedParserResult.addKey("anykey", texFile3, 1, 32, 45, "Danach wir anschließend mittels \\cite{anykey}.");
131122

132123
assertEquals(expectedParserResult, parserResult);
133124
}

0 commit comments

Comments
 (0)