From 95559ea3fcbc8122a4da994dda554df6575f9a63 Mon Sep 17 00:00:00 2001
From: Johnathan Clementi
Date: Mon, 2 Jun 2025 15:49:25 -0400
Subject: [PATCH] Stub out importing skos rdf into Controlled Lists #116

Route .xml sources passed to import_controlled_lists through a new
SKOSReader subclass that builds unsaved List and ListItem instances from
SKOS ConceptSchemes and Concepts; .xlsx sources keep the existing
workbook import. Nothing is written to the database for SKOS input yet.

---
Note: the blob ids on the index lines below are carried over from the
original submission and do not match the revised content.

 .../management/commands/packages.py   |  17 ++-
 arches_controlled_lists/utils/skos.py | 137 ++++++++++++++++++
 2 files changed, 151 insertions(+), 3 deletions(-)
 create mode 100644 arches_controlled_lists/utils/skos.py

diff --git a/arches_controlled_lists/management/commands/packages.py b/arches_controlled_lists/management/commands/packages.py
index dc716c39..feba96f8 100644
--- a/arches_controlled_lists/management/commands/packages.py
+++ b/arches_controlled_lists/management/commands/packages.py
@@ -7,6 +7,7 @@ from arches.management.commands.packages import Command as PackagesCommand
 from arches.app.models import models
 
 from arches_controlled_lists.models import List, ListItem, ListItemValue
+from arches_controlled_lists.utils.skos import SKOSReader
 
 
 class Command(PackagesCommand):
@@ -37,36 +38,46 @@ def handle(self, *args, **options):
         if options["operation"] == "export_controlled_lists":
             self.export_controlled_lists(options["dest_dir"], options["file_name"])
 
+        if options["operation"] == "import_rdf_xml":
+            # NOTE(review): assumes the inherited "source" option holds the file path
+            self.import_controlled_lists(options["source"])
+
     def import_controlled_lists(self, source):
-        created_instances_pks = []
-        if os.path.exists(source):
+        source_exists = os.path.exists(source)
+        if source_exists and source.lower().endswith(".xml"):
+            skos = SKOSReader()
+            rdf = skos.read_file(source)
+            skos.save_controlled_lists_from_skos(rdf)
+
+        elif source_exists and source.lower().endswith(".xlsx"):
+            created_instances_pks = []
             wb = openpyxl.load_workbook(source)
             with transaction.atomic():
                 for sheet in wb.sheetnames:
                     if sheet == "List":
                         created_instances_pks.extend(
                             self.import_sheet_to_model(wb[sheet], List)
                         )
                     elif sheet == "ListItem":
                         created_instances_pks.extend(
                             self.import_sheet_to_model(wb[sheet], ListItem)
                         )
                     elif sheet == "ListItemValue":
                         created_instances_pks.extend(
                             self.import_sheet_to_model(wb[sheet], ListItemValue)
                         )
                 # validate all data
                 for model in [
                     List,
                     ListItem,
                     ListItemValue,
                 ]:
                     for instance in model.objects.filter(pk__in=created_instances_pks):
                         instance.full_clean()
                 self.stdout.write("Data imported successfully from {0}".format(source))
         else:
             self.stdout.write(
-                "The source file does not exist. Please rerun this command with a valid source file."
+                "The source file does not exist or is not the correct format. Please rerun this command with a valid source file."
             )
 
     def import_sheet_to_model(self, sheet, model):
diff --git a/arches_controlled_lists/utils/skos.py b/arches_controlled_lists/utils/skos.py
new file mode 100644
index 00000000..64f98592
--- /dev/null
+++ b/arches_controlled_lists/utils/skos.py
@@ -0,0 +1,137 @@
+import os
+import uuid
+import re
+import logging
+from django.db import transaction, IntegrityError
+from django.db.models import Q
+from django.utils import translation
+from django.utils.http import urlencode
+from rdflib import Literal, Namespace, RDF, URIRef
+from rdflib.namespace import SKOS, DCTERMS
+from rdflib.graph import Graph
+from time import time
+from arches.app.models import models
+from arches.app.models.system_settings import settings
+from arches.app.utils.betterJSONSerializer import JSONSerializer, JSONDeserializer
+from arches.app.utils.i18n import capitalize_region
+from arches.app.utils.skos import SKOSReader as ArchesSKOSReader
+from arches_controlled_lists.models import List, ListItem, ListItemValue
+
+# define the ARCHES namespace
+ARCHES = Namespace(settings.ARCHES_NAMESPACE_FOR_DATA_EXPORT)
+
+
+class SKOSReader(ArchesSKOSReader):
+    """
+    This class extends the SKOSReader to provide additional functionality
+    specific to the Arches controlled lists application.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.lists = []
+        self.list_items = []
+        self.list_item_values = []
+
+    def save_controlled_lists_from_skos(
+        self,
+        graph,
+        # overwrite_options="overwrite",
+    ):
+        """
+        Collect unsaved List and ListItem instances from a SKOS graph.
+
+        ConceptSchemes become Lists and Concepts become ListItems; the
+        instances are appended to self.lists / self.list_items and are not
+        yet written to the database (see the TODOs below).
+        """
+        baseuuid = uuid.uuid4()
+        allowed_languages = models.Language.objects.values_list("code", flat=True)
+        default_lang = settings.LANGUAGE_CODE
+
+        # if the graph is of the type rdflib.graph.Graph
+        if isinstance(graph, Graph):
+
+            # Search for ConceptSchemes first - these will become Lists
+            for scheme, _, _ in graph.triples((None, RDF.type, SKOS.ConceptScheme)):
+                scheme_id = self.generate_uuidv5_from_subject(baseuuid, scheme)
+                new_list = List(id=scheme_id)
+
+                for predicate, obj in graph.predicate_objects(subject=scheme):
+                    # Get List name from a ConceptScheme's title element
+                    if predicate == DCTERMS.title:
+
+                        if not self.language_exists(obj, allowed_languages):
+                            allowed_languages = models.Language.objects.values_list(
+                                "code", flat=True
+                            )
+
+                        val = self.unwrapJsonLiteral(obj)
+                        new_list.name = val["value"]
+
+                # TODO: Bulk create (and blessed overwrite)
+                # list.save()
+                self.lists.append(new_list)
+
+            # Create lookups for valuetypes used during Concept processing
+            value_types = models.DValueType.objects.all()
+            skos_value_types = value_types.filter(
+                Q(namespace="skos") | Q(namespace="arches")
+            )
+            skos_value_types_list = list(
+                skos_value_types.values_list("valuetype", flat=True)
+            )
+            skos_value_types = {
+                valuetype.valuetype: valuetype for valuetype in skos_value_types
+            }
+            dcterms_value_types = value_types.filter(namespace="dcterms")
+            dcterms_identifier_type = dcterms_value_types.get(
+                valuetype=str(DCTERMS.identifier).replace(str(DCTERMS), "")
+            )
+
+            # Concepts become ListItems & ListItemValues
+            for concept, _, _ in graph.triples((None, RDF.type, SKOS.Concept)):
+                list_item_id = self.generate_uuidv5_from_subject(baseuuid, concept)
+                list_item = ListItem(id=list_item_id)
+
+                # rdf:about is fallback URI for a concept, unless it has dcterms:identifier
+                # which overwrites this value below
+                uri = self.unwrapJsonLiteral(str(concept))["value"]
+
+                for predicate, obj in graph.predicate_objects(subject=concept):
+                    if predicate == DCTERMS.identifier:
+                        uri = self.unwrapJsonLiteral(str(obj))["value"]
+
+                    elif str(SKOS) in predicate or str(ARCHES) in predicate:
+                        if not self.language_exists(obj, allowed_languages):
+                            allowed_languages = models.Language.objects.values_list(
+                                "code", flat=True
+                            )
+
+                        # Get skos element type from predicate (e.g. prefLabel, broader, etc.)
+                        relation_or_value_type = predicate.replace(SKOS, "").replace(
+                            ARCHES, ""
+                        )
+
+                list_item.uri = uri
+
+                # TODO: Tie the list_item to a list
+                # list_item.list =
+                # list_item.sortorder = # not sure how to determine this from SKOS
+                # list_item.guide = False # safe to fall back to False?
+
+                self.list_items.append(list_item)
+
+    def generate_uuidv5_from_subject(self, baseuuid, subject):
+        """
+        Return the UUID embedded in the subject if it contains one, otherwise
+        a UUIDv5 derived from baseuuid and the subject string.
+        """
+        uuidregx = re.compile(
+            r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}"
+        )
+        matches = uuidregx.search(str(subject))
+        if matches:
+            return matches.group(0)
+        else:
+            return str(uuid.uuid5(baseuuid, str(subject)))