diff --git a/Dockerfile b/Dockerfile
index 0103070..6a2c163 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -8,7 +8,6 @@ RUN mkdir -p /usr/share/man/man1 && \
     build-essential \
     cifs-utils \
     curl \
-    default-jdk \
     dialog \
     dos2unix \
     git \
@@ -24,11 +23,9 @@ RUN pip install --user -r requirements.txt --no-warn-script-location && \
 
 # Cache the pretrained BERT model
 RUN python -c "from transformers import BertModel; BertModel.from_pretrained('bert-large-uncased-whole-word-masking')"
 
-# Download & cache StanfordNLP
-RUN mkdir -p /app/third_party && \
-    cd /app/third_party && \
-    curl https://download.cs.stanford.edu/nlp/software/stanford-corenlp-full-2018-10-05.zip | jar xv
-
+# Cache the Stanza English models
+RUN python -c "import stanza; stanza.download('en')"
+
 # Now copy the rest of the app
 COPY . /app/
diff --git a/README.md b/README.md
index 52fb281..83d14fa 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,9 @@ If you use RAT-SQL in your work, please cite it as follows:
 
 ## Changelog
 
+**2021-12-09:**
+- Removed the dependency on Java and Stanford CoreNLP; switched to the pure-Python NLP package [Stanza](https://stanfordnlp.github.io/stanza/).
+
 **2020-08-14:**
 - The Docker image now inherits from a CUDA-enabled base image.
 - Clarified memory and dataset requirements on the image.
@@ -72,7 +75,7 @@
 By default, [Docker Desktop for Mac](https://hub.docker.com/editions/community/docker-ce-desktop-mac) limits the amount of memory available to containers.
 The `-m4g` switch overrides it; alternatively, you can increase the default limit in the Docker Desktop settings.
 
 > If you prefer to set up and run the codebase without Docker, follow the steps in `Dockerfile` one by one.
-> Note that this repository requires Python 3.7 or higher and a JVM to run [Stanford CoreNLP](https://stanfordnlp.github.io/CoreNLP/).
+> Note that this repository requires Python 3.7 or higher.
 
 ### Step 3: Run the experiments
diff --git a/ratsql/models/spider/spider_enc.py b/ratsql/models/spider/spider_enc.py
index fe6b63d..aaa23c0 100644
--- a/ratsql/models/spider/spider_enc.py
+++ b/ratsql/models/spider/spider_enc.py
@@ -600,8 +600,8 @@ def normalize_toks(self):
         # lemmatize "abc"
         normalized_toks = []
         for i, tok in enumerate(new_toks):
-            ann = corenlp.annotate(tok, annotators=['tokenize', 'ssplit', 'lemma'])
-            lemmas = [tok.lemma.lower() for sent in ann.sentence for tok in sent.token]
+            ann = corenlp.annotate(tok, annotators=['tokenize', 'lemma'])
+            lemmas = [word.lemma.lower() for sentence in ann.sentences for word in sentence.words]
             lemma_word = " ".join(lemmas)
             normalized_toks.append(lemma_word)
 
diff --git a/ratsql/resources/corenlp.py b/ratsql/resources/corenlp.py
index 9e81b6a..345d8fc 100644
--- a/ratsql/resources/corenlp.py
+++ b/ratsql/resources/corenlp.py
@@ -1,40 +1,4 @@
-import os
-import sys
-
-import corenlp
-import requests
-
-
-class CoreNLP:
-    def __init__(self):
-        if not os.environ.get('CORENLP_HOME'):
-            os.environ['CORENLP_HOME'] = os.path.abspath(
-                os.path.join(
-                    os.path.dirname(__file__),
-                    '../../third_party/stanford-corenlp-full-2018-10-05'))
-        if not os.path.exists(os.environ['CORENLP_HOME']):
-            raise Exception(
-                f'''Please install Stanford CoreNLP and put it at {os.environ['CORENLP_HOME']}.
-
-                Direct URL: http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip
-                Landing page: https://stanfordnlp.github.io/CoreNLP/''')
-        self.client = corenlp.CoreNLPClient()
-
-    def __del__(self):
-        self.client.stop()
-
-    def annotate(self, text, annotators=None, output_format=None, properties=None):
-        try:
-            result = self.client.annotate(text, annotators, output_format, properties)
-        except (corenlp.client.PermanentlyFailedException,
-                requests.exceptions.ConnectionError) as e:
-            print('\nWARNING: CoreNLP connection timeout. Recreating the server...', file=sys.stderr)
-            self.client.stop()
-            self.client.start()
-            result = self.client.annotate(text, annotators, output_format, properties)
-
-        return result
-
+import stanza
 
 _singleton = None
 
@@ -42,5 +6,8 @@ def annotate(self, text, annotators=None, output_format=None, properties=None):
 def annotate(text, annotators=None, output_format=None, properties=None):
     global _singleton
     if not _singleton:
-        _singleton = CoreNLP()
-    return _singleton.annotate(text, annotators, output_format, properties)
+        # Stanza folds sentence splitting into 'tokenize' and its lemmatizer
+        # requires POS tags, so build one pipeline that covers every caller;
+        # 'annotators' is kept only for call-site compatibility.
+        _singleton = stanza.Pipeline('en', processors='tokenize,pos,lemma')
+    return _singleton(text)
diff --git a/ratsql/resources/pretrained_embeddings.py b/ratsql/resources/pretrained_embeddings.py
index 8d04383..b7e360d 100644
--- a/ratsql/resources/pretrained_embeddings.py
+++ b/ratsql/resources/pretrained_embeddings.py
@@ -4,7 +4,6 @@
 import time
 
 import bpemb
-import corenlp
 import torch
 import torchtext
 
@@ -50,26 +49,26 @@ def __init__(self, kind, lemmatize=False):
         self.dim = self.glove.dim
         self.vectors = self.glove.vectors
         self.lemmatize = lemmatize
-        self.corenlp_annotators = ['tokenize', 'ssplit']
+        self.corenlp_annotators = ['tokenize']
         if lemmatize:
             self.corenlp_annotators.append('lemma')
 
     @functools.lru_cache(maxsize=1024)
     def tokenize(self, text):
-        ann = corenlp.annotate(text, self.corenlp_annotators)
+        annotation = corenlp.annotate(text, self.corenlp_annotators)
         if self.lemmatize:
-            return [tok.lemma.lower() for sent in ann.sentence for tok in sent.token]
+            return [word.lemma.lower() for sentence in annotation.sentences for word in sentence.words]
         else:
-            return [tok.word.lower() for sent in ann.sentence for tok in sent.token]
-
+            return [word.text.lower() for sentence in annotation.sentences for word in sentence.words]
+
     @functools.lru_cache(maxsize=1024)
     def tokenize_for_copying(self, text):
-        ann = corenlp.annotate(text, self.corenlp_annotators)
-        text_for_copying = [tok.originalText.lower() for sent in ann.sentence for tok in sent.token]
+        annotation = corenlp.annotate(text, self.corenlp_annotators)
+        text_for_copying = [token.text.lower() for sentence in annotation.sentences for token in sentence.tokens]
         if self.lemmatize:
-            text = [tok.lemma.lower() for sent in ann.sentence for tok in sent.token]
+            text = [word.lemma.lower() for sentence in annotation.sentences for word in sentence.words]
         else:
-            text = [tok.word.lower() for sent in ann.sentence for tok in sent.token]
+            text = [word.text.lower() for sentence in annotation.sentences for word in sentence.words]
         return text, text_for_copying
 
     def untokenize(self, tokens):
diff --git a/setup.py b/setup.py
index 0131946..4172deb 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@
         'pyrsistent~=0.14.9',
         'pytest~=5.3.2',
         'records~=0.5.3',
-        'stanford-corenlp~=3.9.2',
+        'stanza~=1.3.0',
         'tabulate~=0.8.6',
         'torch~=1.3.1',
         'torchtext~=0.3.1',
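For reviewers: a minimal sketch of what the Stanza pipeline built in `ratsql/resources/corenlp.py` produces, and how the lookups in the tokenizers above read it. It assumes the English models were already cached (e.g. via the `stanza.download('en')` step in the Dockerfile); the sample sentence is illustrative.

```python
import stanza

# Same configuration as the singleton in ratsql/resources/corenlp.py:
# 'tokenize' also performs sentence splitting, and 'pos' feeds the lemmatizer.
nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma')

doc = nlp("The cats were sitting on the mats.")
for sentence in doc.sentences:
    for word in sentence.words:
        # word.text is the surface form and word.lemma the lemma, i.e. the
        # two fields the GloVe tokenize()/tokenize_for_copying() methods use.
        print(word.text.lower(), word.lemma.lower())
# e.g. "cats" -> "cat", "were" -> "be", "sitting" -> "sit"
```

Unlike the old CoreNLP client, no background server is involved, so the reconnect-and-retry logic deleted from `corenlp.py` has no Stanza equivalent.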