Skip to content

Commit 77d753d

Browse files
committed
Update prepare_con_dataset.py for the training of an Icelandic constituency parser
Create convert_icepahc.py; update default_packages.py to add a BERT model (IceBERT) for Icelandic.
1 parent 6e442a6 commit 77d753d

File tree

3 files changed

+88
-0
lines changed

3 files changed

+88
-0
lines changed

stanza/resources/default_packages.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -635,6 +635,17 @@ def build_default_pretrains(default_treebanks):
635635
# xlm-roberta-base : 89.31
636636
"hy": "xlm-roberta-base",
637637

638+
# https://huggingface.co/mideind/IceBERT
639+
# IceBERT-large is also available:
640+
# https://huggingface.co/mideind/IceBERT-large
641+
# Constituency F1 scores:
642+
# No bert (in-order): 84.40%
643+
# IceBERT (top-down): 88.66%
644+
# IceBERT (finetuning, top-down): 90.38%
645+
# IceBERT-large (top-down): 88.80%
646+
# IceBERT-large (ft, top-down): 90.29%
647+
"is": "mideind/IceBERT",
648+
638649
# Indonesian POS experiments: dev set of GSD
639650
# python3 stanza/utils/training/run_pos.py id_gsd --no_bert
640651
# python3 stanza/utils/training/run_pos.py id_gsd --bert_model ...
@@ -811,6 +822,10 @@ def build_default_pretrains(default_treebanks):
811822
# hy
812823
"xlm-roberta-base": "xlm-roberta-base",
813824

825+
# is
826+
"mideind/IceBERT": "icebert",
827+
"mideind/IceBERT-large": "icebert-large",
828+
814829
# id
815830
"indolem/indobert-base-uncased": "indobert",
816831
"indobenchmark/indobert-large-p1": "indobenchmark-large-p1",
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
from stanza.utils.datasets.constituency import utils
2+
3+
def read_psd_file(input_file):
    """
    Read trees from an IcePaHC .psd file.

    Each tree starts on a line beginning with "(ROOT" and runs until the
    next such line (or EOF).  All internal whitespace, including newlines,
    is collapsed to single spaces so each tree becomes one line of text.

    Any preamble text before the first "(ROOT" line is ignored rather than
    emitted as a bogus tree.

    Returns a list of one-line bracketed tree strings.
    """
    def flush(chunks):
        # Collapse all whitespace runs (incl. newlines) to single spaces.
        # This also strips leading/trailing whitespace, so no extra
        # .strip() is needed anywhere.
        return ' '.join(''.join(chunks).split())

    output_trees = []
    # Accumulate lines of the current tree in a list to avoid quadratic
    # string concatenation; empty list means "no tree started yet".
    current_tree = []

    # Stream the file instead of loading it all with readlines()
    with open(input_file, encoding='utf-8') as fin:
        for line in fin:
            if line.startswith("(ROOT"):
                # A new tree begins: flush the previous one, if any.
                # The startswith check drops non-tree preamble text.
                if current_tree and current_tree[0].startswith("(ROOT"):
                    output_trees.append(flush(current_tree))
                current_tree = [line]
            else:
                current_tree.append(line)

    # Can't forget the last tree
    if current_tree and current_tree[0].startswith("(ROOT"):
        output_trees.append(flush(current_tree))

    return output_trees
31+
32+
33+
def convert_icepahc_treebank(input_file, train_size=0.8, dev_size=0.1):
    """
    Read an IcePaHC .psd file and split it into train/dev/test sections.

    The remainder after train_size and dev_size is used for the test split.
    Returns a tuple of (train_trees, dev_trees, test_trees).
    """
    trees = read_psd_file(input_file)
    print("Read %d trees" % len(trees))

    train, dev, test = utils.split_treebank(trees, train_size, dev_size)
    print("Split %d trees into %d train %d dev %d test" % (len(trees), len(train), len(dev), len(test)))

    return train, dev, test
42+
43+
44+
def main():
    """Convert a local copy of the simplified IcePaHC treebank."""
    convert_icepahc_treebank("simpleicepahc24.psd")


if __name__ == '__main__':
    main()

stanza/utils/datasets/constituency/prepare_con_dataset.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,14 @@
171171
Currently only German is converted, the German version being a
172172
version of the Tiger Treebank
173173
python3 -m stanza.utils.datasets.constituency.prepare_con_dataset de_spmrl
174+
175+
is_icepahc
176+
The Icelandic Parsed Historical Corpus (IcePaHC), available at:
177+
https://clarin.is/en/resources/icepahc/
178+
A simplified/clean version of the IcePaHC treebank is used for the training
179+
of the constituency parser, where for example empty phrases (traces and zero
180+
subjects) and lemmas have been removed. This version is available at:
181+
https://github.com/ingunnjk/IceConParse/tree/main/data
174182
"""
175183

176184
import argparse
@@ -198,6 +206,7 @@
198206
from stanza.utils.datasets.constituency.utils import SHARDS, write_dataset
199207
import stanza.utils.datasets.constituency.vtb_convert as vtb_convert
200208
import stanza.utils.datasets.constituency.vtb_split as vtb_split
209+
from stanza.utils.datasets.constituency.convert_icepahc import convert_icepahc_treebank
201210

202211
class UnknownDatasetError(ValueError):
203212
def __init__(self, dataset, text):
@@ -470,6 +479,20 @@ def process_spmrl(paths, dataset_name, *args):
470479

471480
convert_spmrl(input_directory, output_directory, dataset_name)
472481

482+
def process_icepahc(paths, dataset_name, *args):
    """
    Processes the Icelandic dataset, IcePaHC

    Expects simpleicepahc24.psd in paths["CONSTITUENCY_BASE"]; writes the
    resulting train/dev/test splits to paths["CONSTITUENCY_DATA_DIR"].
    """
    assert dataset_name == 'is_icepahc'

    input_file = os.path.join(paths["CONSTITUENCY_BASE"], "simpleicepahc24.psd")
    if not os.path.exists(input_file):
        raise FileNotFoundError("Unable to find input file for IcePaHC. Expected in {}".format(input_file))
    output_dir = paths["CONSTITUENCY_DATA_DIR"]

    splits = convert_icepahc_treebank(input_file)
    write_dataset(splits, output_dir, dataset_name)
495+
473496
DATASET_MAPPING = {
474497
'da_arboretum': process_arboretum,
475498

@@ -495,6 +518,8 @@ def process_spmrl(paths, dataset_name, *args):
495518

496519
'zh-hans_ctb-51': process_ctb_51,
497520
'zh-hans_ctb-90': process_ctb_90,
521+
522+
'is_icepahc': process_icepahc,
498523
}
499524

500525
def main(dataset_name, *args):

0 commit comments

Comments
 (0)