
Commit ae4c421

OfirArviv authored and danielhers committed
Update Bert support (danielhers#78)
* print weights only if verbose level is 4 and above
* filter passages longer than 510 tokens (as defined by the BERT tokenizer) when using BERT
* typo fix
* update README with BERT info
* typo fix
* move BERT filtering to Parser.parse() to be compatible with the lazy-loading mechanism
* clarify error messages
* change the BERT filter to a generator function
* code conventions fix
* remove extra empty line in README
* remove unneeded old BERT filtering code
* hopefully the last fixes
* update README and support parsing without lang in XML when not in training
* revert some changes and update README
* update README
* update version
* update README
* CR fixes: wrong model URL and name
* typo fixes in README
* add bert prefix to the BERT config use-default-word-embeddings
* save and load BERT configs in model file
* update README
1 parent 3c56d7e commit ae4c421

File tree (6 files changed: +67 −5 lines)

- README.md
- tupa/__version__.py
- tupa/classifiers/nn/neural_network.py
- tupa/config.py
- tupa/model.py
- tupa/parse.py

README.md

Lines changed: 39 additions & 0 deletions

@@ -85,6 +85,45 @@ To train in the multilingual settings you need to:

3) Pass the `--bert-multilingual=0` argument.
4) Make sure the UCCA passage files have the `lang` property; see the `set_lang` script in the `semstr` package, or the minimal sketch below.
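For illustration, here is a minimal sketch of tagging a single passage with its language, assuming the `ucca` package's `file2passage`/`passage2file` helpers (the `set_lang` script in `semstr` does this in bulk):

```python
# Minimal sketch: set the per-passage "lang" attribute that multilingual
# BERT parsing requires (tupa/parse.py below reads passage.attrib["lang"]).
from ucca.ioutil import file2passage, passage2file

passage = file2passage("example.xml")  # load one UCCA passage from XML
passage.attrib["lang"] = "de"          # language symbol: de, en, fr, ...
passage2file(passage, "example.xml")   # write the tagged passage back
```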

### BERT Performance

Here are the average results over 3 BERT multilingual models trained on the [German _20K Leagues_ corpus](https://github.com/UniversalConceptualCognitiveAnnotation/UCCA_German-20K), the [English Wiki corpus](https://github.com/UniversalConceptualCognitiveAnnotation/UCCA_English-Wiki), and only 15 sentences from the [French _20K Leagues_ corpus](https://github.com/UniversalConceptualCognitiveAnnotation/UCCA_French-20K), with the following settings:
```
bert-model=bert-base-multilingual-cased
bert-layers= -1 -2 -3 -4
bert-layers-pooling=weighted
bert-token-align-by=sum
```
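These settings correspond to the `--bert-*` options defined in `tupa/config.py` (see the diff below). As a rough, hypothetical training invocation using them (the corpus paths, model name, and the `--use-bert` switch are assumptions; check `python -m tupa --help` for the exact flag syntax):

```
python -m tupa -t train_passages/ -d dev_passages/ -m my_bert_model \
    --use-bert --bert-model=bert-base-multilingual-cased \
    --bert-layers -1 -2 -3 -4 --bert-layers-pooling=weighted \
    --bert-token-align-by=sum
```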
The results:

| description         | test primary F1 | test remote F1 | test average |
| ------------------- | --------------- | -------------- | ------------ |
| German 20K Leagues  | 0.828           | 0.6723         | 0.824        |
| English 20K Leagues | 0.763           | 0.359          | 0.755        |
| French 20K Leagues  | 0.739           | 0.46           | 0.732        |
| English Wiki        | 0.789           | 0.581          | 0.784        |

\*The [English _20K Leagues_ corpus](https://github.com/UniversalConceptualCognitiveAnnotation/UCCA_English-20K) is used as an out-of-domain test set.
### BERT Pre-trained models

To download and extract [a multilingual model](https://github.com/huji-nlp/tupa/releases/download/v1.4.0/bert_multilingual_layers_4_layers_pooling_weighted_align_sum.tar.gz), run:

```
curl -LO https://github.com/huji-nlp/tupa/releases/download/v1.4.0/bert_multilingual_layers_4_layers_pooling_weighted_align_sum.tar.gz
tar xvzf bert_multilingual_layers_4_layers_pooling_weighted_align_sum.tar.gz
```

To run the parser with this model, use the following command, replacing `[example lang]` with the language symbol of the sentences in `example.txt` (`fr`, `en`, `de`, etc.):

```
python -m tupa example.txt --lang [example lang] -m bert_multilingual_layers_4_layers_pooling_weighted_align_sum
```

The model was trained on the [German _20K Leagues_ corpus](https://github.com/UniversalConceptualCognitiveAnnotation/UCCA_German-20K), the [English Wiki corpus](https://github.com/UniversalConceptualCognitiveAnnotation/UCCA_English-Wiki), and only 15 sentences from the [French _20K Leagues_ corpus](https://github.com/UniversalConceptualCognitiveAnnotation/UCCA_French-20K).

See the expected performance at [BERT Performance](#bert-performance).

Author
------

tupa/__version__.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,4 +1,4 @@
-VERSION = "1.3.10"
+VERSION = "1.4.0"
 # noinspection PyBroadException
 try:
     from subprocess import check_output, DEVNULL
```

tupa/classifiers/nn/neural_network.py

Lines changed: 3 additions & 1 deletion

```diff
@@ -138,7 +138,9 @@ def init_axis_model(self, axis, init=True):
         for key, param in sorted(self.input_params.items()):
             if not param.enabled:
                 continue
-            if (not self.config.args.use_default_word_embeddings or self.config.args.bert_multilingual is not None) \
+            if self.config.args.use_bert and \
+                    (not self.config.args.bert_use_default_word_embeddings
+                     or self.config.args.bert_multilingual is not None) \
                     and key == 'W':
                 i = self.birnn_indices(param)
                 indexed_num[i] = np.fmax(indexed_num[i], param.num)  # indices to be looked up are collected
```
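The new guard narrows the special handling of the word-embedding parameter `W` to runs where BERT is actually enabled. A standalone restatement of the condition (attribute names follow the diff; the surrounding lookup logic is omitted):

```python
def special_w_handling(args, key):
    """True when the 'W' branch above is taken: BERT is enabled, and either
    default word embeddings are off or multilingual BERT is active."""
    return (args.use_bert
            and (not args.bert_use_default_word_embeddings
                 or args.bert_multilingual is not None)
            and key == "W")
```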

tupa/config.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -253,7 +253,7 @@ def __init__(self, *args):
         ap.add_argument("--bert-layers-pooling", choices=["weighted", "sum", "concat"], default="weighted")
         ap.add_argument("--bert-token-align-by", choices=["first", "sum", "mean"], default="sum")
         ap.add_argument("--bert-multilingual", choices=[0], type=int)
-        add_boolean_option(ap, "use-default-word-embeddings", default=False,
+        add_boolean_option(ap, "bert-use-default-word-embeddings", default=False,
                            description="whether to use default word embeddings")
         ap.add_argument("--bert-dropout", type=float, default=0, choices=np.linspace(0, 0.9, num=10))
```
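The rename groups this boolean switch with the other `--bert-*` flags. As a hypothetical sketch (this is not TUPA's actual `add_boolean_option` helper, only the common paired-flag pattern it presumably follows):

```python
import argparse

def add_boolean_option_sketch(ap, name, default, description):
    # Paired --<name> / --no-<name> switches sharing one destination,
    # the usual pattern for boolean options in argparse-based CLIs.
    dest = name.replace("-", "_")
    ap.add_argument("--" + name, dest=dest, action="store_true", help=description)
    ap.add_argument("--no-" + name, dest=dest, action="store_false",
                    help="negation of --" + name)
    ap.set_defaults(**{dest: default})

ap = argparse.ArgumentParser()
add_boolean_option_sketch(ap, "bert-use-default-word-embeddings", False,
                          "whether to use default word embeddings")
print(ap.parse_args(["--bert-use-default-word-embeddings"]))
# Namespace(bert_use_default_word_embeddings=True)
```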

tupa/model.py

Lines changed: 22 additions & 1 deletion

```diff
@@ -260,9 +260,20 @@ def save(self, save_init=False):
         self.feature_extractor.save(self.filename, save_init=save_init)
         node_labels = self.feature_extractor.params.get(NODE_LABEL_KEY)
         skip_labels = (NODE_LABEL_KEY,) if node_labels and node_labels.size else ()
+        bert_configs = {
+            "use_bert": self.config.args.use_bert,
+            "bert_model": self.config.args.bert_model,
+            "bert_layers": self.config.args.bert_layers,
+            "bert_layers_pooling": self.config.args.bert_layers_pooling,
+            "bert_token_align_by": self.config.args.bert_token_align_by,
+            "bert_multilingual": self.config.args.bert_multilingual,
+            "bert_use_default_word_embeddings": self.config.args.bert_use_default_word_embeddings,
+            "bert_dropout": self.config.args.bert_dropout}\
+            if self.config.args.use_bert else {"use_bert": self.config.args.use_bert}
         self.classifier.save(self.filename, skip_labels=skip_labels,
                              multilingual=self.config.args.multilingual,
-                             omit_features=self.config.args.omit_features)
+                             omit_features=self.config.args.omit_features,
+                             **bert_configs)
         textutil.models["vocab"] = self.config.args.vocab
         save_json(self.filename + ".nlp.json", textutil.models)
         remove_backup(self.filename)
@@ -279,6 +290,16 @@ def load(self, is_finalized=True):
         self.config.args.classifier = Classifier.get_property(self.filename, "type")
         self.config.args.multilingual = Classifier.get_property(self.filename, "multilingual")
         self.config.args.omit_features = Classifier.get_property(self.filename, "omit_features")
+        self.config.args.use_bert = Classifier.get_property(self.filename, "use_bert")
+        if self.config.args.use_bert:
+            self.config.args.bert_model = Classifier.get_property(self.filename, "bert_model")
+            self.config.args.bert_layers = Classifier.get_property(self.filename, "bert_layers")
+            self.config.args.bert_layers_pooling = Classifier.get_property(self.filename, "bert_layers_pooling")
+            self.config.args.bert_token_align_by = Classifier.get_property(self.filename, "bert_token_align_by")
+            self.config.args.bert_multilingual = Classifier.get_property(self.filename, "bert_multilingual")
+            self.config.args.bert_use_default_word_embeddings =\
+                Classifier.get_property(self.filename, "bert_use_default_word_embeddings")
+            self.config.args.bert_dropout = Classifier.get_property(self.filename, "bert_dropout")
         self.init_model(init_params=False)
         self.feature_extractor.load(self.filename, order=[p.name for p in self.param_defs()])
         if not is_finalized:
```
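The two hunks round-trip the BERT settings through the saved model file, so a downloaded model can be used for parsing without repeating the original command-line flags. A standalone sketch of the same pattern, with plain dicts standing in for TUPA's `Classifier` properties (hypothetical helper names):

```python
# BERT settings are flattened into saved properties only when BERT is on,
# and restored behind the same "use_bert" guard on load.
BERT_KEYS = ("bert_model", "bert_layers", "bert_layers_pooling",
             "bert_token_align_by", "bert_multilingual",
             "bert_use_default_word_embeddings", "bert_dropout")

def props_to_save(args):
    props = {"use_bert": args.get("use_bert", False)}
    if props["use_bert"]:
        props.update({key: args[key] for key in BERT_KEYS})
    return props

def args_from_props(props):
    args = {"use_bert": props.get("use_bert", False)}
    if args["use_bert"]:
        args.update({key: props[key] for key in BERT_KEYS})
    return args
```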

tupa/parse.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -73,7 +73,7 @@ def __init__(self, passage, *args, **kwargs):
         assert not errors, errors
         self.in_format = self.format or "ucca"
         self.out_format = "ucca" if self.format in (None, "text") else self.format
-        if self.config.args.bert_multilingual is not None:
+        if self.config.args.use_bert and self.config.args.bert_multilingual is not None:
             self.lang = self.passage.attrib.get("lang")
             assert self.lang, "Attribute 'lang' is required per passage when using multilingual BERT"
         else:
```
