Skip to content

Commit 77d753d

Browse files
committed
Update prepare_con_dataset.py for the training of an Icelandic constituency parser
Create convert_icepahc.py; update default_packages.py to add a BERT model (IceBERT) for Icelandic.
1 parent 6e442a6 commit 77d753d

File tree

3 files changed

+88
-0
lines changed

3 files changed

+88
-0
lines changed

stanza/resources/default_packages.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -635,6 +635,17 @@ def build_default_pretrains(default_treebanks):
635635
# xlm-roberta-base : 89.31
636636
"hy": "xlm-roberta-base",
637637

638+
# https://huggingface.co/mideind/IceBERT
639+
# IceBERT-large is also available:
640+
# https://huggingface.co/mideind/IceBERT-large
641+
# Constituency F1 scores:
642+
# No bert (in-order): 84.40%
643+
# IceBERT (top-down): 88.66%
644+
# IceBERT (finetuning, top-down): 90.38%
645+
# IceBERT-large (top-down): 88.80%
646+
# IceBERT-large (ft, top-down): 90.29%
647+
"is": "mideind/IceBERT",
648+
638649
# Indonesian POS experiments: dev set of GSD
639650
# python3 stanza/utils/training/run_pos.py id_gsd --no_bert
640651
# python3 stanza/utils/training/run_pos.py id_gsd --bert_model ...
@@ -811,6 +822,10 @@ def build_default_pretrains(default_treebanks):
811822
# hy
812823
"xlm-roberta-base": "xlm-roberta-base",
813824

825+
# is
826+
"mideind/IceBERT": "icebert",
827+
"mideind/IceBERT-large": "icebert-large",
828+
814829
# id
815830
"indolem/indobert-base-uncased": "indobert",
816831
"indobenchmark/indobert-large-p1": "indobenchmark-large-p1",
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
from stanza.utils.datasets.constituency import utils
2+
3+
def read_psd_file(input_file):
    """
    Read trees from an IcePaHC .psd file.

    Each tree starts on a line beginning with "(ROOT" and runs until the
    next such line (or EOF).  All internal whitespace, including newlines,
    is collapsed to single spaces so each tree becomes one line of text.

    Any preamble text before the first "(ROOT" line is ignored rather than
    emitted as a bogus tree.

    Returns a list of one-line bracketed tree strings.
    """
    def flush(chunks):
        # Collapse all whitespace runs (incl. newlines) to single spaces.
        # This also strips leading/trailing whitespace, so no extra
        # .strip() is needed anywhere.
        return ' '.join(''.join(chunks).split())

    output_trees = []
    # Accumulate lines of the current tree in a list to avoid quadratic
    # string concatenation; empty list means "no tree started yet".
    current_tree = []

    # Stream the file instead of loading it all with readlines()
    with open(input_file, encoding='utf-8') as fin:
        for line in fin:
            if line.startswith("(ROOT"):
                # A new tree begins: flush the previous one, if any.
                # The startswith check drops non-tree preamble text.
                if current_tree and current_tree[0].startswith("(ROOT"):
                    output_trees.append(flush(current_tree))
                current_tree = [line]
            else:
                current_tree.append(line)

    # Can't forget the last tree
    if current_tree and current_tree[0].startswith("(ROOT"):
        output_trees.append(flush(current_tree))

    return output_trees
31+
32+
33+
def convert_icepahc_treebank(input_file, train_size=0.8, dev_size=0.1):
    """
    Read an IcePaHC .psd file and split it into train/dev/test sections.

    The remainder after train_size and dev_size is used for the test split.
    Returns a tuple of (train_trees, dev_trees, test_trees).
    """
    trees = read_psd_file(input_file)
    print("Read %d trees" % len(trees))

    train, dev, test = utils.split_treebank(trees, train_size, dev_size)
    print("Split %d trees into %d train %d dev %d test" % (len(trees), len(train), len(dev), len(test)))

    return train, dev, test
42+
43+
44+
def main():
    """Convert a local copy of the simplified IcePaHC treebank."""
    convert_icepahc_treebank("simpleicepahc24.psd")


if __name__ == '__main__':
    main()

stanza/utils/datasets/constituency/prepare_con_dataset.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,14 @@
171171
Currently only German is converted, the German version being a
172172
version of the Tiger Treebank
173173
python3 -m stanza.utils.datasets.constituency.prepare_con_dataset de_spmrl
174+
175+
is_icepahc
176+
The Icelandic Parsed Historical Corpus (IcePaHC), available at:
177+
https://clarin.is/en/resources/icepahc/
178+
A simplified/clean version of the IcePaHC treebank is used for the training
179+
of the constituency parser, where for example empty phrases (traces and zero
180+
subjects) and lemmas have been removed. This version is available at:
181+
https://github.com/ingunnjk/IceConParse/tree/main/data
174182
"""
175183

176184
import argparse
@@ -198,6 +206,7 @@
198206
from stanza.utils.datasets.constituency.utils import SHARDS, write_dataset
199207
import stanza.utils.datasets.constituency.vtb_convert as vtb_convert
200208
import stanza.utils.datasets.constituency.vtb_split as vtb_split
209+
from stanza.utils.datasets.constituency.convert_icepahc import convert_icepahc_treebank
201210

202211
class UnknownDatasetError(ValueError):
203212
def __init__(self, dataset, text):
@@ -470,6 +479,20 @@ def process_spmrl(paths, dataset_name, *args):
470479

471480
convert_spmrl(input_directory, output_directory, dataset_name)
472481

482+
def process_icepahc(paths, dataset_name, *args):
    """
    Processes the Icelandic dataset, IcePaHC

    Expects simpleicepahc24.psd in paths["CONSTITUENCY_BASE"]; writes the
    resulting train/dev/test splits to paths["CONSTITUENCY_DATA_DIR"].
    """
    assert dataset_name == 'is_icepahc'

    input_file = os.path.join(paths["CONSTITUENCY_BASE"], "simpleicepahc24.psd")
    if not os.path.exists(input_file):
        raise FileNotFoundError("Unable to find input file for IcePaHC. Expected in {}".format(input_file))
    output_dir = paths["CONSTITUENCY_DATA_DIR"]

    splits = convert_icepahc_treebank(input_file)
    write_dataset(splits, output_dir, dataset_name)
495+
473496
DATASET_MAPPING = {
474497
'da_arboretum': process_arboretum,
475498

@@ -495,6 +518,8 @@ def process_spmrl(paths, dataset_name, *args):
495518

496519
'zh-hans_ctb-51': process_ctb_51,
497520
'zh-hans_ctb-90': process_ctb_90,
521+
522+
'is_icepahc': process_icepahc,
498523
}
499524

500525
def main(dataset_name, *args):

0 commit comments

Comments
 (0)