
Commit ba9ac92

fix preprocess/evaluate instructions

1 parent: c2e50cd

3 files changed: +9 -8 lines

README.md
Lines changed: 2 additions & 2 deletions

@@ -50,7 +50,7 @@ Set up environment and data for training and evaluation:
 * All data and config files are placed relative to the: `base_dir = /path/to/project` in [local.conf](local.conf) so change it to point to the root of this repo
 * All splits created using the `split_*` Python scripts will need to be processed using `preprocess.py` to be used as training input for the model, for example, to split the DROC dataset run:
 - `python split_droc.py --type-system-xml /path/to/DROC-Release/droc/src/main/resources/CorefTypeSystem.xml /path/to/DROC-Release/droc/DROC-xmi data/german.droc_gold_conll`
-- `python preprocess.py --input_dir data/droc_full --output_dir data/droc_full --seg_len 512 --language german --tokenizer_name german-nlp-group/electra-base-german-uncased --input_suffix droc_gold_conll --input_format conll-2012`
+- `python preprocess.py --input_dir data/droc_full --output_dir data/droc_full --seg_len 512 --language german --tokenizer_name german-nlp-group/electra-base-german-uncased --input_suffix droc_gold_conll --input_format conll-2012 --model_type electra`
 
 
 ## Evaluation
@@ -59,7 +59,7 @@ If you want to use the official evaluator, download and unzip [official conll 20
 Evaluate a model on the dev/test set:
 * Download the corresponding model file (`.mar`) and extract `model*.bin` from it and place it in `data_dir/<experiment_id>/`
 * `python evaluate.py [config] [model_id] [gpu_id] ([output_file])`
-* e.g. News, SemEval-2010, ELECTRA uncased (base): `python evaluate.py se10_electra_uncased Apr30_08-52-00_56879 0`
+* e.g. News, SemEval-2010, ELECTRA uncased (base): `python evaluate.py se10_electra_uncased tuba10_electra_uncased_Apr30_08-52-00_56879 0`
 
 ## Training
 
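A quick way to sanity-check the corrected `preprocess.py` step before training (a minimal sketch: the jsonlines layout and the output file name are assumptions based on common coreference pipelines, not something this commit confirms):

```python
import json

# Hypothetical output name; adjust to whatever preprocess.py actually
# emits for your --seg_len and --language settings.
path = 'data/droc_full/train.german.512.jsonlines'

# One JSON document per line, already segmented by the tokenizer.
with open(path) as f:
    docs = [json.loads(line) for line in f]

print(len(docs), 'documents')
print(sorted(docs[0].keys()))  # inspect the fields the Tensorizer will read
```
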
experiments.conf
Lines changed: 5 additions & 4 deletions

@@ -281,13 +281,14 @@ news = ${base}{
   model_type = electra
   incremental = false
   postprocess_merge_overlapping_spans = false
+  language = german
 }
 
 # SemEval 2010
 
 se10 = ${news}{
-  conll_eval_path = ${base.data_dir}/se10.dev.german.v4_gold_conll
-  conll_test_path = ${base.data_dir}/se10.test.german.v4_gold_conll
+  conll_eval_path = ${base.data_dir}/dev.german.v4_gold_conll
+  conll_test_path = ${base.data_dir}/test.german.v4_gold_conll
   num_epochs = 48
   long_doc_strategy = truncate
 }
@@ -308,8 +309,8 @@ se10_gelectra_large = ${se10}{
 # TuBa-D/Z 10.0
 
 tuba10 = ${news}{
-  conll_eval_path = ${base.data_dir}/tuba10.dev.german.tuebdz_gold_conll
-  conll_test_path = ${base.data_dir}/tuba10.test.german.tuebdz_gold_conll
+  conll_eval_path = ${base.data_dir}/dev.german.tuebdz_gold_conll
+  conll_test_path = ${base.data_dir}/test.german.tuebdz_gold_conll
   max_training_sentences = 3
   num_epochs = 24
 }
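These experiment blocks use HOCON-style inheritance (`se10 = ${news}{...}`), so a quick way to verify that the new `language = german` key and the corrected paths resolve as intended is to parse the file (a sketch, assuming the project loads its configs with pyhocon or a compatible HOCON parser):

```python
from pyhocon import ConfigFactory

config = ConfigFactory.parse_file('experiments.conf')
se10 = config['se10']

# Inherited from news via se10 = ${news}{...}
print(se10['language'])          # expected: german

# The corrected paths, with ${base.data_dir} substituted
print(se10['conll_eval_path'])   # .../dev.german.v4_gold_conll
print(se10['conll_test_path'])   # .../test.german.v4_gold_conll
```
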

tensorize.py
Lines changed: 2 additions & 2 deletions

@@ -1,7 +1,7 @@
 import util
 import numpy as np
 import random
-from transformers import BertTokenizer
+from transformers import AutoTokenizer
 import os
 from os.path import join
 import json
@@ -91,7 +91,7 @@ class Tensorizer:
     def __init__(self, config):
         self.config = config
         self.long_doc_strategy = config['long_doc_strategy']
-        self.tokenizer = BertTokenizer.from_pretrained(config['bert_tokenizer_name'])
+        self.tokenizer = AutoTokenizer.from_pretrained(config['bert_tokenizer_name'])
 
         # Will be used in evaluation
         self.stored_info = {}
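The `BertTokenizer` to `AutoTokenizer` swap matters because `AutoTokenizer` dispatches on the checkpoint's config, so an ELECTRA checkpoint like the one named in the README gets its own tokenizer class instead of being forced through BERT's. A quick way to see what the change loads (a sketch; the exact resolved class depends on the checkpoint and the installed `transformers` version):

```python
from transformers import AutoTokenizer

# AutoTokenizer reads the checkpoint's config and picks the right class.
tok = AutoTokenizer.from_pretrained('german-nlp-group/electra-base-german-uncased')
print(type(tok).__name__)  # e.g. ElectraTokenizerFast, not BertTokenizer
print(tok.tokenize('Die Koreferenz ist aufgelöst.'))
```
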
