
Commit ae4c421

OfirArviv authored and danielhers committed
Update Bert support (danielhers#78)
* print weights only if verbose level is 4 and above
* filter passages longer than 510 tokens (as defined by the BERT tokenizer) when using BERT
* typo fix
* update README with BERT info
* typo fix
* move BERT filtering to Parser.parse() to be compatible with the lazy-loading mechanism
* clarify error messages
* change the BERT filter to a generator function
* code conventions fix
* remove extra empty line in README
* remove unneeded old BERT filtering code
* hopefully the last fixes
* update README and support parsing without lang in XML when not in training
* revert some changes and update README
* update README
* update version
* update README
* CR fixes: wrong model URL and name
* typo fixes in README
* add bert prefix to the BERT config use-default-word-embeddings
* save and load BERT configs in model file
* update README
1 parent 3c56d7e commit ae4c421

File tree (6 files changed: +67 −5 lines)

- README.md
- tupa/__version__.py
- tupa/classifiers/nn/neural_network.py
- tupa/config.py
- tupa/model.py
- tupa/parse.py

README.md

Lines changed: 39 additions & 0 deletions

@@ -85,6 +85,45 @@ To train in the multilingual settings you need to:

3) Pass the `--bert-multilingual=0` argument.
4) Make sure the UCCA passage files have the `lang` property; see the `set_lang` script in the `semstr` package, or the minimal sketch below.
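For illustration, here is a minimal sketch of tagging a single passage with its language, assuming the `ucca` package's `file2passage`/`passage2file` helpers (the `set_lang` script in `semstr` does this in bulk):

```python
# Minimal sketch: set the per-passage "lang" attribute that multilingual
# BERT parsing requires (tupa/parse.py below reads passage.attrib["lang"]).
from ucca.ioutil import file2passage, passage2file

passage = file2passage("example.xml")  # load one UCCA passage from XML
passage.attrib["lang"] = "de"          # language symbol: de, en, fr, ...
passage2file(passage, "example.xml")   # write the tagged passage back
```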

### BERT Performance

Here are the average results over 3 BERT multilingual models trained on the [German _20K Leagues_ corpus](https://github.com/UniversalConceptualCognitiveAnnotation/UCCA_German-20K), the [English Wiki corpus](https://github.com/UniversalConceptualCognitiveAnnotation/UCCA_English-Wiki), and only 15 sentences from the [French _20K Leagues_ corpus](https://github.com/UniversalConceptualCognitiveAnnotation/UCCA_French-20K), with the following settings:
```
bert-model=bert-base-multilingual-cased
bert-layers= -1 -2 -3 -4
bert-layers-pooling=weighted
bert-token-align-by=sum
```
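These settings correspond to the `--bert-*` options defined in `tupa/config.py` (see the diff below). As a rough, hypothetical training invocation using them (the corpus paths, model name, and the `--use-bert` switch are assumptions; check `python -m tupa --help` for the exact flag syntax):

```
python -m tupa -t train_passages/ -d dev_passages/ -m my_bert_model \
    --use-bert --bert-model=bert-base-multilingual-cased \
    --bert-layers -1 -2 -3 -4 --bert-layers-pooling=weighted \
    --bert-token-align-by=sum
```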
The results:

| description         | test primary F1 | test remote F1 | test average |
| ------------------- | --------------- | -------------- | ------------ |
| German 20K Leagues  | 0.828           | 0.6723         | 0.824        |
| English 20K Leagues | 0.763           | 0.359          | 0.755        |
| French 20K Leagues  | 0.739           | 0.46           | 0.732        |
| English Wiki        | 0.789           | 0.581          | 0.784        |

\*The [English _20K Leagues_ corpus](https://github.com/UniversalConceptualCognitiveAnnotation/UCCA_English-20K) is used as an out-of-domain test set.
### BERT Pre-trained models

To download and extract [a multilingual model](https://github.com/huji-nlp/tupa/releases/download/v1.4.0/bert_multilingual_layers_4_layers_pooling_weighted_align_sum.tar.gz), run:

```
curl -LO https://github.com/huji-nlp/tupa/releases/download/v1.4.0/bert_multilingual_layers_4_layers_pooling_weighted_align_sum.tar.gz
tar xvzf bert_multilingual_layers_4_layers_pooling_weighted_align_sum.tar.gz
```

To run the parser with this model, use the following command, replacing `[example lang]` with the language symbol of the sentences in `example.txt` (`fr`, `en`, `de`, etc.):

```
python -m tupa example.txt --lang [example lang] -m bert_multilingual_layers_4_layers_pooling_weighted_align_sum
```

The model was trained on the [German _20K Leagues_ corpus](https://github.com/UniversalConceptualCognitiveAnnotation/UCCA_German-20K), the [English Wiki corpus](https://github.com/UniversalConceptualCognitiveAnnotation/UCCA_English-Wiki), and only 15 sentences from the [French _20K Leagues_ corpus](https://github.com/UniversalConceptualCognitiveAnnotation/UCCA_French-20K).

See the expected performance at [BERT Performance](#bert-performance).

Author
------

tupa/__version__.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,4 +1,4 @@
-VERSION = "1.3.10"
+VERSION = "1.4.0"
 # noinspection PyBroadException
 try:
     from subprocess import check_output, DEVNULL
```

tupa/classifiers/nn/neural_network.py

Lines changed: 3 additions & 1 deletion

```diff
@@ -138,7 +138,9 @@ def init_axis_model(self, axis, init=True):
         for key, param in sorted(self.input_params.items()):
             if not param.enabled:
                 continue
-            if (not self.config.args.use_default_word_embeddings or self.config.args.bert_multilingual is not None) \
+            if self.config.args.use_bert and \
+                    (not self.config.args.bert_use_default_word_embeddings
+                     or self.config.args.bert_multilingual is not None) \
                     and key == 'W':
                 i = self.birnn_indices(param)
                 indexed_num[i] = np.fmax(indexed_num[i], param.num)  # indices to be looked up are collected
```
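The new guard narrows the special handling of the word-embedding parameter `W` to runs where BERT is actually enabled. A standalone restatement of the condition (attribute names follow the diff; the surrounding lookup logic is omitted):

```python
def special_w_handling(args, key):
    """True when the 'W' branch above is taken: BERT is enabled, and either
    default word embeddings are off or multilingual BERT is active."""
    return (args.use_bert
            and (not args.bert_use_default_word_embeddings
                 or args.bert_multilingual is not None)
            and key == "W")
```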

tupa/config.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -253,7 +253,7 @@ def __init__(self, *args):
         ap.add_argument("--bert-layers-pooling", choices=["weighted", "sum", "concat"], default="weighted")
         ap.add_argument("--bert-token-align-by", choices=["first", "sum", "mean"], default="sum")
         ap.add_argument("--bert-multilingual", choices=[0], type=int)
-        add_boolean_option(ap, "use-default-word-embeddings", default=False,
+        add_boolean_option(ap, "bert-use-default-word-embeddings", default=False,
                            description="whether to use default word embeddings")
         ap.add_argument("--bert-dropout", type=float, default=0, choices=np.linspace(0, 0.9, num=10))
```
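The rename groups this boolean switch with the other `--bert-*` flags. As a hypothetical sketch (this is not TUPA's actual `add_boolean_option` helper, only the common paired-flag pattern it presumably follows):

```python
import argparse

def add_boolean_option_sketch(ap, name, default, description):
    # Paired --<name> / --no-<name> switches sharing one destination,
    # the usual pattern for boolean options in argparse-based CLIs.
    dest = name.replace("-", "_")
    ap.add_argument("--" + name, dest=dest, action="store_true", help=description)
    ap.add_argument("--no-" + name, dest=dest, action="store_false",
                    help="negation of --" + name)
    ap.set_defaults(**{dest: default})

ap = argparse.ArgumentParser()
add_boolean_option_sketch(ap, "bert-use-default-word-embeddings", False,
                          "whether to use default word embeddings")
print(ap.parse_args(["--bert-use-default-word-embeddings"]))
# Namespace(bert_use_default_word_embeddings=True)
```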

tupa/model.py

Lines changed: 22 additions & 1 deletion

```diff
@@ -260,9 +260,20 @@ def save(self, save_init=False):
         self.feature_extractor.save(self.filename, save_init=save_init)
         node_labels = self.feature_extractor.params.get(NODE_LABEL_KEY)
         skip_labels = (NODE_LABEL_KEY,) if node_labels and node_labels.size else ()
+        bert_configs = {
+            "use_bert": self.config.args.use_bert,
+            "bert_model": self.config.args.bert_model,
+            "bert_layers": self.config.args.bert_layers,
+            "bert_layers_pooling": self.config.args.bert_layers_pooling,
+            "bert_token_align_by": self.config.args.bert_token_align_by,
+            "bert_multilingual": self.config.args.bert_multilingual,
+            "bert_use_default_word_embeddings": self.config.args.bert_use_default_word_embeddings,
+            "bert_dropout": self.config.args.bert_dropout}\
+            if self.config.args.use_bert else {"use_bert": self.config.args.use_bert}
         self.classifier.save(self.filename, skip_labels=skip_labels,
                              multilingual=self.config.args.multilingual,
-                             omit_features=self.config.args.omit_features)
+                             omit_features=self.config.args.omit_features,
+                             **bert_configs)
         textutil.models["vocab"] = self.config.args.vocab
         save_json(self.filename + ".nlp.json", textutil.models)
         remove_backup(self.filename)
@@ -279,6 +290,16 @@ def load(self, is_finalized=True):
         self.config.args.classifier = Classifier.get_property(self.filename, "type")
         self.config.args.multilingual = Classifier.get_property(self.filename, "multilingual")
         self.config.args.omit_features = Classifier.get_property(self.filename, "omit_features")
+        self.config.args.use_bert = Classifier.get_property(self.filename, "use_bert")
+        if self.config.args.use_bert:
+            self.config.args.bert_model = Classifier.get_property(self.filename, "bert_model")
+            self.config.args.bert_layers = Classifier.get_property(self.filename, "bert_layers")
+            self.config.args.bert_layers_pooling = Classifier.get_property(self.filename, "bert_layers_pooling")
+            self.config.args.bert_token_align_by = Classifier.get_property(self.filename, "bert_token_align_by")
+            self.config.args.bert_multilingual = Classifier.get_property(self.filename, "bert_multilingual")
+            self.config.args.bert_use_default_word_embeddings =\
+                Classifier.get_property(self.filename, "bert_use_default_word_embeddings")
+            self.config.args.bert_dropout = Classifier.get_property(self.filename, "bert_dropout")
         self.init_model(init_params=False)
         self.feature_extractor.load(self.filename, order=[p.name for p in self.param_defs()])
         if not is_finalized:
```
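The two hunks round-trip the BERT settings through the saved model file, so a downloaded model can be used for parsing without repeating the original command-line flags. A standalone sketch of the same pattern, with plain dicts standing in for TUPA's `Classifier` properties (hypothetical helper names):

```python
# BERT settings are flattened into saved properties only when BERT is on,
# and restored behind the same "use_bert" guard on load.
BERT_KEYS = ("bert_model", "bert_layers", "bert_layers_pooling",
             "bert_token_align_by", "bert_multilingual",
             "bert_use_default_word_embeddings", "bert_dropout")

def props_to_save(args):
    props = {"use_bert": args.get("use_bert", False)}
    if props["use_bert"]:
        props.update({key: args[key] for key in BERT_KEYS})
    return props

def args_from_props(props):
    args = {"use_bert": props.get("use_bert", False)}
    if args["use_bert"]:
        args.update({key: props[key] for key in BERT_KEYS})
    return args
```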

tupa/parse.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -73,7 +73,7 @@ def __init__(self, passage, *args, **kwargs):
         assert not errors, errors
         self.in_format = self.format or "ucca"
         self.out_format = "ucca" if self.format in (None, "text") else self.format
-        if self.config.args.bert_multilingual is not None:
+        if self.config.args.use_bert and self.config.args.bert_multilingual is not None:
             self.lang = self.passage.attrib.get("lang")
             assert self.lang, "Attribute 'lang' is required per passage when using multilingual BERT"
         else:
```
