Skip to content

Import skos rdf into Controlled Lists #119

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 41 additions & 27 deletions arches_controlled_lists/management/commands/packages.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from arches.management.commands.packages import Command as PackagesCommand
from arches.app.models import models
from arches_controlled_lists.models import List, ListItem, ListItemValue
from arches_controlled_lists.utils.skos import SKOSReader


class Command(PackagesCommand):
Expand Down Expand Up @@ -37,36 +38,49 @@ def handle(self, *args, **options):
if options["operation"] == "export_controlled_lists":
self.export_controlled_lists(options["dest_dir"], options["file_name"])

if options["operation"] == "import_rdf_xml":
self.import_

def import_controlled_lists(self, source):
    """Import controlled lists from a SKOS RDF/XML file or an .xlsx workbook.

    The format is chosen from the file extension (case-insensitive):

    * ``.xml``  - the file is read with ``SKOSReader.read_file`` and the
      result is handed to ``SKOSReader.save_controlled_lists_from_skos``.
    * ``.xlsx`` - the workbook is opened with openpyxl; the sheets named
      "List", "ListItem" and "ListItemValue" are imported through
      ``import_sheet_to_model`` and every created instance is validated
      with ``full_clean()``, all inside a single transaction.

    A message is written to stdout instead of importing when the file is
    missing or has any other extension.

    Args:
        source: Path of the file to import.
    """
    lowered = source.lower()

    # Check existence and extension once, up front, so that a missing file
    # is reported for both formats instead of being silently ignored
    # (.xlsx) or failing inside the reader (.xml).
    if not lowered.endswith((".xml", ".xlsx")) or not os.path.exists(source):
        self.stdout.write(
            "The source file does not exist or is not the correct format. Please rerun this command with a valid source file."
        )
        return

    if lowered.endswith(".xml"):
        skos = SKOSReader()
        rdf = skos.read_file(source)
        skos.save_controlled_lists_from_skos(rdf)
        return

    # Sheet name -> model. Insertion order is also the validation order.
    sheet_models = {
        "List": List,
        "ListItem": ListItem,
        "ListItemValue": ListItemValue,
    }
    created_instances_pks = []
    wb = openpyxl.load_workbook(source)
    with transaction.atomic():
        for sheet in wb.sheetnames:
            model = sheet_models.get(sheet)
            if model is None:
                # Sheets with any other name are not imported.
                continue
            # presumably import_sheet_to_model returns the pks it created;
            # verify against its definition.
            created_instances_pks.extend(
                self.import_sheet_to_model(wb[sheet], model)
            )

        # validate all data (still inside the transaction, so a validation
        # error rolls the whole import back)
        for model in sheet_models.values():
            for instance in model.objects.filter(pk__in=created_instances_pks):
                instance.full_clean()

        self.stdout.write("Data imported successfully from {0}".format(source))

def import_sheet_to_model(self, sheet, model):
Expand Down
126 changes: 126 additions & 0 deletions arches_controlled_lists/utils/skos.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
import os
import uuid
import re
import logging
from django.db import transaction, IntegrityError
from django.db.models import Q
from django.utils import translation
from django.utils.http import urlencode
from rdflib import Literal, Namespace, RDF, URIRef
from rdflib.namespace import SKOS, DCTERMS
from rdflib.graph import Graph
from time import time
from arches.app.models import models
from arches.app.models.system_settings import settings
from arches.app.utils.betterJSONSerializer import JSONSerializer, JSONDeserializer
from arches.app.utils.i18n import capitalize_region
from arches.app.utils.skos import SKOSReader
from arches_controlled_lists.models import List, ListItem, ListItemValue

# define the ARCHES namespace
ARCHES = Namespace(settings.ARCHES_NAMESPACE_FOR_DATA_EXPORT)


# NOTE: this class deliberately reuses the name of the imported base class;
# the base is resolved before the new name is bound, so the import above is
# only reachable as the parent from here on.
class SKOSReader(SKOSReader):
    """
    This class extends the SKOSReader to provide additional functionality
    specific to the Arches controlled lists application.

    Parsed objects are accumulated, unsaved, on the instance:

    * ``lists`` - one ``List`` per ``skos:ConceptScheme`` found
    * ``list_items`` - one ``ListItem`` per ``skos:Concept`` found
    * ``list_item_values`` - initialised empty; nothing in this class
      populates it yet
    """

    # Canonical 8-4-4-4-12 hexadecimal UUID, matched anywhere in a string.
    # Compiled once for the class rather than on every id lookup.
    _UUID_PATTERN = re.compile(
        r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}"
    )

    def __init__(self):
        super().__init__()
        self.lists = []
        self.list_items = []
        self.list_item_values = []

    def save_controlled_lists_from_skos(self, graph):
        """Build unsaved List / ListItem objects from a SKOS graph.

        Each ``skos:ConceptScheme`` becomes a ``List`` appended to
        ``self.lists`` and each ``skos:Concept`` becomes a ``ListItem``
        appended to ``self.list_items``. Nothing is written to the database
        by this method yet (see the TODOs below).

        Args:
            graph: The parsed SKOS data. Anything that is not an
                ``rdflib.graph.Graph`` is ignored.

        Returns:
            None.
        """
        # TODO: accept an overwrite_options argument (e.g. "overwrite") once
        # saving is implemented.
        # NOTE(review): the namespace is random per call, so ids derived for
        # subjects that do not embed a UUID differ between runs.
        baseuuid = uuid.uuid4()
        allowed_languages = models.Language.objects.values_list("code", flat=True)
        default_lang = settings.LANGUAGE_CODE  # not used yet

        # Only rdflib graphs can be processed.
        if not isinstance(graph, Graph):
            return

        # Search for ConceptSchemes first - these will become Lists
        for scheme, _, _ in graph.triples((None, RDF.type, SKOS.ConceptScheme)):
            scheme_id = self.generate_uuidv5_from_subject(baseuuid, scheme)
            # Keyword form, matching how ListItem is constructed below.
            new_list = List(id=scheme_id)

            for predicate, obj in graph.predicate_objects(subject=scheme):
                # Get List name from a ConceptScheme's title element
                if predicate == DCTERMS.title:
                    # presumably language_exists (inherited) registers an
                    # unknown language, hence the refresh - verify in the
                    # base class.
                    if not self.language_exists(obj, allowed_languages):
                        allowed_languages = models.Language.objects.values_list(
                            "code", flat=True
                        )

                    val = self.unwrapJsonLiteral(obj)
                    new_list.name = val["value"]

            # TODO: Bulk create (and blessed overwrite)
            self.lists.append(new_list)

        # Create lookups for valuetypes used during Concept processing.
        # None of these are consumed by the code below yet.
        value_types = models.DValueType.objects.all()
        skos_value_types = value_types.filter(
            Q(namespace="skos") | Q(namespace="arches")
        )
        skos_value_types_list = list(
            skos_value_types.values_list("valuetype", flat=True)
        )
        skos_value_types = {
            valuetype.valuetype: valuetype for valuetype in skos_value_types
        }
        dcterms_value_types = value_types.filter(namespace="dcterms")
        dcterms_identifier_type = dcterms_value_types.get(
            valuetype=str(DCTERMS.identifier).replace(str(DCTERMS), "")
        )

        # Concepts become ListItems & ListItemValues
        for concept, _, _ in graph.triples((None, RDF.type, SKOS.Concept)):
            list_item_id = self.generate_uuidv5_from_subject(baseuuid, concept)
            list_item = ListItem(id=list_item_id)

            # rdf:about is fallback URI for a concept, unless it has
            # dcterms:identifier which overwrites this value below
            uri = self.unwrapJsonLiteral(str(concept))

            for predicate, obj in graph.predicate_objects(subject=concept):
                if predicate == DCTERMS.identifier:
                    uri = self.unwrapJsonLiteral(str(obj))

                elif str(SKOS) in predicate or str(ARCHES) in predicate:
                    if not self.language_exists(obj, allowed_languages):
                        allowed_languages = models.Language.objects.values_list(
                            "code", flat=True
                        )

                    # Get skos element type from predicate (e.g. prefLabel,
                    # broader, etc.). str() is required because str.replace
                    # only accepts str arguments and the rdflib namespace
                    # objects are not guaranteed to be str subclasses.
                    relation_or_value_type = predicate.replace(
                        str(SKOS), ""
                    ).replace(str(ARCHES), "")

            list_item.uri = uri

            # TODO: Tie the list_item to a list
            # list_item.list =
            # list_item.sortorder = # not sure how to determine this from SKOS
            # list_item.guide = False # safe to fall back to False?

            self.list_items.append(list_item)

    def generate_uuidv5_from_subject(self, baseuuid, subject):
        """Return an id string for an RDF subject.

        Args:
            baseuuid: ``uuid.UUID`` used as the uuid5 namespace.
            subject: The RDF subject; it is converted with ``str()``.

        Returns:
            The first UUID found inside ``str(subject)``, exactly as
            written there, or otherwise the uuid5 of ``str(subject)``
            under ``baseuuid`` as a string.
        """
        matches = self._UUID_PATTERN.search(str(subject))
        if matches:
            return matches.group(0)
        return str(uuid.uuid5(baseuuid, str(subject)))
Loading