Skip to content

Changed Stanford CoreNLP to Stanza #64

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 0 additions & 6 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ RUN mkdir -p /usr/share/man/man1 && \
build-essential \
cifs-utils \
curl \
default-jdk \
dialog \
dos2unix \
git \
Expand All @@ -24,11 +23,6 @@ RUN pip install --user -r requirements.txt --no-warn-script-location && \
# Cache the pretrained BERT model
RUN python -c "from transformers import BertModel; BertModel.from_pretrained('bert-large-uncased-whole-word-masking')"

# Download & cache StanfordNLP
RUN mkdir -p /app/third_party && \
cd /app/third_party && \
curl https://download.cs.stanford.edu/nlp/software/stanford-corenlp-full-2018-10-05.zip | jar xv

# Now copy the rest of the app
COPY . /app/

Expand Down
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ If you use RAT-SQL in your work, please cite it as follows:

## Changelog

**2021-12-09:**
- Removed the dependency on Java and Stanford CoreNLP (replaced with the Python-native package 'Stanza').

**2020-08-14:**
- The Docker image now inherits from a CUDA-enabled base image.
- Clarified memory and dataset requirements on the image.
Expand Down Expand Up @@ -72,7 +75,7 @@ By default, [Docker Desktop for Mac](https://hub.docker.com/editions/community/d
The `-m4g` switch overrides it; alternatively, you can increase the default limit in the Docker Desktop settings.

> If you prefer to set up and run the codebase without Docker, follow the steps in `Dockerfile` one by one.
> Note that this repository requires Python 3.7 or higher and a JVM to run [Stanford CoreNLP](https://stanfordnlp.github.io/CoreNLP/).
> Note that this repository requires Python 3.7 or higher.

### Step 3: Run the experiments

Expand Down
4 changes: 2 additions & 2 deletions ratsql/models/spider/spider_enc.py
Original file line number Diff line number Diff line change
Expand Up @@ -600,8 +600,8 @@ def normalize_toks(self):
# lemmatize "abc"
normalized_toks = []
for i, tok in enumerate(new_toks):
ann = corenlp.annotate(tok, annotators=['tokenize', 'ssplit', 'lemma'])
lemmas = [tok.lemma.lower() for sent in ann.sentence for tok in sent.token]
ann = corenlp.annotate(tok, annotators=['tokenize', 'lemma'])
lemmas = [token.to_dict()[0]["lemma"].lower() for sentence in ann.sentences for token in sentence.tokens]
lemma_word = " ".join(lemmas)
normalized_toks.append(lemma_word)

Expand Down
42 changes: 3 additions & 39 deletions ratsql/resources/corenlp.py
Original file line number Diff line number Diff line change
@@ -1,46 +1,10 @@
import os
import sys

import corenlp
import requests


class CoreNLP:
def __init__(self):
if not os.environ.get('CORENLP_HOME'):
os.environ['CORENLP_HOME'] = os.path.abspath(
os.path.join(
os.path.dirname(__file__),
'../../third_party/stanford-corenlp-full-2018-10-05'))
if not os.path.exists(os.environ['CORENLP_HOME']):
raise Exception(
f'''Please install Stanford CoreNLP and put it at {os.environ['CORENLP_HOME']}.

Direct URL: http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip
Landing page: https://stanfordnlp.github.io/CoreNLP/''')
self.client = corenlp.CoreNLPClient()

def __del__(self):
self.client.stop()

def annotate(self, text, annotators=None, output_format=None, properties=None):
try:
result = self.client.annotate(text, annotators, output_format, properties)
except (corenlp.client.PermanentlyFailedException,
requests.exceptions.ConnectionError) as e:
print('\nWARNING: CoreNLP connection timeout. Recreating the server...', file=sys.stderr)
self.client.stop()
self.client.start()
result = self.client.annotate(text, annotators, output_format, properties)

return result

import stanza

_singleton = None


def annotate(text, annotators=None, output_format=None, properties=None):
global _singleton
if not _singleton:
_singleton = CoreNLP()
return _singleton.annotate(text, annotators, output_format, properties)
_singleton = stanza.Pipeline('en', processors=','.join(annotators))
return _singleton(text)
19 changes: 9 additions & 10 deletions ratsql/resources/pretrained_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import time

import bpemb
import corenlp
import torch
import torchtext

Expand Down Expand Up @@ -50,26 +49,26 @@ def __init__(self, kind, lemmatize=False):
self.dim = self.glove.dim
self.vectors = self.glove.vectors
self.lemmatize = lemmatize
self.corenlp_annotators = ['tokenize', 'ssplit']
self.corenlp_annotators = ['tokenize']
if lemmatize:
self.corenlp_annotators.append('lemma')

@functools.lru_cache(maxsize=1024)
def tokenize(self, text):
ann = corenlp.annotate(text, self.corenlp_annotators)
annotation = corenlp.annotate(text, self.corenlp_annotators)
if self.lemmatize:
return [tok.lemma.lower() for sent in ann.sentence for tok in sent.token]
return [token.to_dict()[0]["lemma"].lower() for sentence in annotation.sentences for token in sentence.tokens]
else:
return [tok.word.lower() for sent in ann.sentence for tok in sent.token]
return [token.to_dict()[0]["text"].lower() for sentence in annotation.sentences for token in sentence.tokens]

@functools.lru_cache(maxsize=1024)
def tokenize_for_copying(self, text):
ann = corenlp.annotate(text, self.corenlp_annotators)
text_for_copying = [tok.originalText.lower() for sent in ann.sentence for tok in sent.token]
annotation = corenlp.annotate(text, self.corenlp_annotators)
text_for_copying = [token.to_dict()[0]["text"].lower() for sentence in annotation.sentences for token in sentence.tokens]
if self.lemmatize:
text = [tok.lemma.lower() for sent in ann.sentence for tok in sent.token]
text = [token.to_dict()[0]["lemma"].lower() for sentence in annotation.sentences for token in sentence.tokens]
else:
text = [tok.word.lower() for sent in ann.sentence for tok in sent.token]
text = [token.to_dict()[0]["text"].lower() for sentence in annotation.sentences for token in sentence.tokens]
return text, text_for_copying

def untokenize(self, tokens):
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
'pyrsistent~=0.14.9',
'pytest~=5.3.2',
'records~=0.5.3',
'stanford-corenlp~=3.9.2',
'stanza~=1.3.0',
'tabulate~=0.8.6',
'torch~=1.3.1',
'torchtext~=0.3.1',
Expand Down