Skip to content

Changed Stanford CoreNLP to Stanza #64

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 0 additions & 6 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ RUN mkdir -p /usr/share/man/man1 && \
build-essential \
cifs-utils \
curl \
default-jdk \
dialog \
dos2unix \
git \
Expand All @@ -24,11 +23,6 @@ RUN pip install --user -r requirements.txt --no-warn-script-location && \
# Cache the pretrained BERT model
RUN python -c "from transformers import BertModel; BertModel.from_pretrained('bert-large-uncased-whole-word-masking')"

# Download & cache StanfordNLP
RUN mkdir -p /app/third_party && \
cd /app/third_party && \
curl https://download.cs.stanford.edu/nlp/software/stanford-corenlp-full-2018-10-05.zip | jar xv

# Now copy the rest of the app
COPY . /app/

Expand Down
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ If you use RAT-SQL in your work, please cite it as follows:

## Changelog

**2021-12-09:**
- Removed the dependency on Java and Stanford CoreNLP (replaced with the Python-native package 'Stanza').

**2020-08-14:**
- The Docker image now inherits from a CUDA-enabled base image.
- Clarified memory and dataset requirements on the image.
Expand Down Expand Up @@ -72,7 +75,7 @@ By default, [Docker Desktop for Mac](https://hub.docker.com/editions/community/d
The `-m4g` switch overrides it; alternatively, you can increase the default limit in the Docker Desktop settings.

> If you prefer to set up and run the codebase without Docker, follow the steps in `Dockerfile` one by one.
> Note that this repository requires Python 3.7 or higher and a JVM to run [Stanford CoreNLP](https://stanfordnlp.github.io/CoreNLP/).
> Note that this repository requires Python 3.7 or higher.

### Step 3: Run the experiments

Expand Down
4 changes: 2 additions & 2 deletions ratsql/models/spider/spider_enc.py
Original file line number Diff line number Diff line change
Expand Up @@ -600,8 +600,8 @@ def normalize_toks(self):
# lemmatize "abc"
normalized_toks = []
for i, tok in enumerate(new_toks):
ann = corenlp.annotate(tok, annotators=['tokenize', 'ssplit', 'lemma'])
lemmas = [tok.lemma.lower() for sent in ann.sentence for tok in sent.token]
ann = corenlp.annotate(tok, annotators=['tokenize', 'lemma'])
lemmas = [token.to_dict()[0]["lemma"].lower() for sentence in ann.sentences for token in sentence.tokens]
lemma_word = " ".join(lemmas)
normalized_toks.append(lemma_word)

Expand Down
42 changes: 3 additions & 39 deletions ratsql/resources/corenlp.py
Original file line number Diff line number Diff line change
@@ -1,46 +1,10 @@
import os
import sys

import corenlp
import requests


class CoreNLP:
def __init__(self):
if not os.environ.get('CORENLP_HOME'):
os.environ['CORENLP_HOME'] = os.path.abspath(
os.path.join(
os.path.dirname(__file__),
'../../third_party/stanford-corenlp-full-2018-10-05'))
if not os.path.exists(os.environ['CORENLP_HOME']):
raise Exception(
f'''Please install Stanford CoreNLP and put it at {os.environ['CORENLP_HOME']}.

Direct URL: http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip
Landing page: https://stanfordnlp.github.io/CoreNLP/''')
self.client = corenlp.CoreNLPClient()

def __del__(self):
self.client.stop()

def annotate(self, text, annotators=None, output_format=None, properties=None):
try:
result = self.client.annotate(text, annotators, output_format, properties)
except (corenlp.client.PermanentlyFailedException,
requests.exceptions.ConnectionError) as e:
print('\nWARNING: CoreNLP connection timeout. Recreating the server...', file=sys.stderr)
self.client.stop()
self.client.start()
result = self.client.annotate(text, annotators, output_format, properties)

return result

import stanza

_singleton = None


def annotate(text, annotators=None, output_format=None, properties=None):
global _singleton
if not _singleton:
_singleton = CoreNLP()
return _singleton.annotate(text, annotators, output_format, properties)
_singleton = stanza.Pipeline('en', processors=','.join(annotators))
return _singleton(text)
19 changes: 9 additions & 10 deletions ratsql/resources/pretrained_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import time

import bpemb
import corenlp
import torch
import torchtext

Expand Down Expand Up @@ -50,26 +49,26 @@ def __init__(self, kind, lemmatize=False):
self.dim = self.glove.dim
self.vectors = self.glove.vectors
self.lemmatize = lemmatize
self.corenlp_annotators = ['tokenize', 'ssplit']
self.corenlp_annotators = ['tokenize']
if lemmatize:
self.corenlp_annotators.append('lemma')

@functools.lru_cache(maxsize=1024)
def tokenize(self, text):
ann = corenlp.annotate(text, self.corenlp_annotators)
annotation = corenlp.annotate(text, self.corenlp_annotators)
if self.lemmatize:
return [tok.lemma.lower() for sent in ann.sentence for tok in sent.token]
return [token.to_dict()[0]["lemma"].lower() for sentence in annotation.sentences for token in sentence.tokens]
else:
return [tok.word.lower() for sent in ann.sentence for tok in sent.token]
return [token.to_dict()[0]["text"].lower() for sentence in annotation.sentences for token in sentence.tokens]

@functools.lru_cache(maxsize=1024)
def tokenize_for_copying(self, text):
ann = corenlp.annotate(text, self.corenlp_annotators)
text_for_copying = [tok.originalText.lower() for sent in ann.sentence for tok in sent.token]
annotation = corenlp.annotate(text, self.corenlp_annotators)
text_for_copying = [token.to_dict()[0]["text"].lower() for sentence in annotation.sentences for token in sentence.tokens]
if self.lemmatize:
text = [tok.lemma.lower() for sent in ann.sentence for tok in sent.token]
text = [token.to_dict()[0]["lemma"].lower() for sentence in annotation.sentences for token in sentence.tokens]
else:
text = [tok.word.lower() for sent in ann.sentence for tok in sent.token]
text = [token.to_dict()[0]["text"].lower() for sentence in annotation.sentences for token in sentence.tokens]
return text, text_for_copying

def untokenize(self, tokens):
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
'pyrsistent~=0.14.9',
'pytest~=5.3.2',
'records~=0.5.3',
'stanford-corenlp~=3.9.2',
'stanza~=1.3.0',
'tabulate~=0.8.6',
'torch~=1.3.1',
'torchtext~=0.3.1',
Expand Down