diff --git a/catalog/dags/common/loader/provider_details.py b/catalog/dags/common/loader/provider_details.py index c3d8530ad32..4b79c5a8dae 100644 --- a/catalog/dags/common/loader/provider_details.py +++ b/catalog/dags/common/loader/provider_details.py @@ -12,6 +12,7 @@ # Default provider names +AUCKLAND_MUSEUM_IMAGE_PROVIDER = "aucklandmuseum_tamakipaengahira" BROOKLYN_DEFAULT_PROVIDER = "brooklynmuseum" CLEVELAND_DEFAULT_PROVIDER = "clevelandmuseum" EUROPEANA_DEFAULT_PROVIDER = "europeana" diff --git a/catalog/dags/common/requester.py b/catalog/dags/common/requester.py index b0ef1cc1af5..c417de28738 100644 --- a/catalog/dags/common/requester.py +++ b/catalog/dags/common/requester.py @@ -122,6 +122,18 @@ def head(self, url, **kwargs): """ return self._make_request(self.session.head, url, **kwargs) + def post(self, url, params=None, **kwargs): + """ + Make a POST request, and return the response object if it exists. + + Required Arguments: + + url: URL to make the request as a string. + params: Dictionary of query string params. + **kwargs: Optional arguments that will be passed to `requests.get`. + """ + return self._make_request(self.session.post, url, params=params, **kwargs) + def _delay_processing(self): wait = self._DELAY - (time.time() - self._last_request) if wait >= 0: @@ -134,14 +146,20 @@ def _get_json(self, response) -> dict | list | None: except JSONDecodeError as e: logger.warning(f"Could not get response_json.\n{e}") - def get_response_json(self, endpoint, retries=0, query_params=None, **kwargs): + def get_response_json( + self, endpoint, retries=0, query_params=None, requestMethod="get", **kwargs + ): response_json = None - + response = None if retries < 0: logger.error("No retries remaining. 
class AucklandMuseumDataIngester(ProviderDataIngester):
    """
    Ingest images from the Auckland Museum collections search API.

    Each batch is fetched by POSTing a JSON query body (``self.data``) to
    ``endpoint`` while paging with ``size``/``from`` query parameters. Every
    record is recorded with the CC BY 4.0 license in ``DEFAULT_LICENSE_INFO``.
    """

    providers = {
        "image": prov.AUCKLAND_MUSEUM_IMAGE_PROVIDER,
    }
    endpoint = "https://api.aucklandmuseum.com/search/collectionsonline/_search"
    license_url = "https://creativecommons.org/licenses/by/4.0/"
    # Paging stops once `batch_start` reaches this many records.
    total_amount_of_data = 10000
    DEFAULT_LICENSE_INFO = get_license_info(license_url=license_url)

    # Declared on the class (not assigned after `super().__init__()`) so that
    # they are already in place while the base initializer runs.
    # NOTE(review): presumably the base class builds `delayed_requester` from
    # `delay`/`headers` during `__init__` -- confirm against ProviderDataIngester.
    delay = 4
    batch_limit = 2000
    headers = {"Content-Type": "application/json"}

    def __init__(self, *args, **kwargs):
        """
        Set up paging state and the JSON query body sent with every request.

        When an ingestion date (``YYYY-MM-DD``) is available on ``self.date``,
        the query is restricted to records whose ``lastModifiedOn`` falls within
        that day. Without a date (e.g. when run through ``main()``), the date
        filter is omitted instead of failing while parsing ``None``.
        """
        super().__init__(*args, **kwargs)
        # Offset of the next page; advanced by `get_should_continue`.
        self.batch_start = 0

        must_clauses = [
            {"wildcard": {"copyright": {"value": "Auckland"}}},
            {"exists": {"field": "primaryRepresentation"}},
        ]

        ingestion_date = getattr(self, "date", None)
        if ingestion_date:
            date_from = datetime.strptime(ingestion_date, "%Y-%m-%d")
            self.date_from = date_from.isoformat()
            self.date_to = (date_from + timedelta(days=1)).isoformat()
            logger.info(
                f"Start timestamp: {self.date_from}, end timestamp: {self.date_to}"
            )
            must_clauses.append(
                {
                    "range": {
                        "lastModifiedOn": {
                            "from": self.date_from,
                            "to": self.date_to,
                        }
                    }
                }
            )
        else:
            self.date_from = None
            self.date_to = None
            logger.info("No ingestion date given; not filtering on lastModifiedOn.")

        self.data = {"query": {"bool": {"must": must_clauses}}}

    @staticmethod
    def _first(values, default=None):
        """Return the first element of `values`, or `default` if it is empty/None."""
        return values[0] if values else default

    def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dict:
        """
        Return the paging parameters for the next request.

        `prev_query_params` is not used: the offset is tracked on the instance
        in `batch_start`, and the page size always mirrors `batch_limit` so it
        stays in step with the increment applied in `get_should_continue`.
        """
        return {
            "size": str(self.batch_limit),
            "from": self.batch_start,
        }

    def get_batch_data(self, response_json):
        """Return the list under `hits.hits` of the response, or None if absent."""
        if response_json:
            return response_json.get("hits", {}).get("hits")
        return None

    def get_should_continue(self, response_json):
        """
        Advance the paging offset and report whether another page is wanted.

        Side effect: `batch_start` is increased by `batch_limit` on every call.
        Returns False once the offset reaches `total_amount_of_data`.
        """
        self.batch_start += self.batch_limit
        if self.batch_start >= self.total_amount_of_data:
            logger.info(
                "The final amount of data has been processed. Halting ingestion."
            )
            return False

        return True

    def get_media_type(self, record: dict):
        """Every record handled by this ingester is treated as an image."""
        return IMAGE

    def get_record_data(self, data: dict) -> dict | list[dict] | None:
        """
        Map one search hit to a record dictionary.

        Returns None when the hit has no `_id` (no landing URL or identifier can
        be derived) or no `primaryRepresentation` (no image URL).
        """
        if not (identifier := data.get("_id")):
            return None

        # `or {}` also covers a `_source` that is present but null.
        information = data.get("_source") or {}
        if not (url := information.get("primaryRepresentation")):
            return None

        # ".../id/naturalsciences/object/691102" -> "naturalsciences-object-691102"
        url_parameter = identifier.split("id/")[-1].replace("/", "-")
        appellation = information.get("appellation") or {}

        return {
            "foreign_landing_url": f"{LANDING_URL}{url_parameter}",
            "foreign_identifier": identifier.split("/")[-1],
            "url": url,
            "license_info": self.DEFAULT_LICENSE_INFO,
            "creator": self._first(information.get("dc_contributor")),
            "title": self._first(appellation.get("Primary Title")),
            "meta_data": self._get_meta_data(information),
        }

    @staticmethod
    def _get_meta_data(object_json: dict) -> dict | None:
        """
        Collect the `type`, `geopos` and `department` metadata of a record.

        Entries whose value is None are dropped. `geopos` falls back to an empty
        string rather than None, so that key is always present in the result.
        A dictionary is returned in every case.
        """
        first = AucklandMuseumDataIngester._first
        metadata = {
            "type": object_json.get("type"),
            "geopos": first(object_json.get("geopos"), default=""),
            "department": first(object_json.get("department")),
        }
        return {k: v for k, v in metadata.items() if v is not None}

    def _get_file_info(self, url) -> int | None:
        """
        Get the image size in bytes.

        Issues a HEAD request through the inherited `delayed_requester` and reads
        the `Content-Length` header. Returns None when there is no usable
        response or the reported length is missing or zero.
        """
        # NOTE(review): nothing in this class calls this helper -- confirm
        # whether the filesize was meant to be included in the record.
        resp = self.delayed_requester.head(url)
        if not resp:
            return None
        filesize = int(resp.headers.get("Content-Length", 0))
        return filesize or None

    def get_response_json(
        self, query_params: dict, endpoint: str | None = None, **kwargs
    ):
        """
        Make the actual API request needed to ingest a batch.

        The request is sent as a POST: `query_params` carry the paging values,
        while the search query itself travels as the JSON body (`self.data`).
        Falls back to the class `endpoint` when none is given.
        """
        return self.delayed_requester.get_response_json(
            endpoint or self.endpoint,
            self.retries,
            query_params,
            headers=self.headers,
            requestMethod="post",
            json=self.data,
            **kwargs,
        )
"_doc", + "_id": "http://api.aucklandmuseum.com/id/naturalsciences/object/691102", + "_score": 2.0630994, + "_source": { + "copyright": ["© Auckland Museum CC BY"], + "notes": [], + "references": [ + { + "person": { + "secondary_maker": [], + "primary_maker": [], + "classified": ["http://api.aucklandmuseum.com/id/person/28441"], + "collected": ["http://api.aucklandmuseum.com/id/person/25299"], + "_all": [ + "http://api.aucklandmuseum.com/id/person/28441", + "http://api.aucklandmuseum.com/id/person/25299" + ], + "referred": [] + } + }, + { "object": { "childOf": [], "_all": [], "referred": [] } } + ], + "documentType": [], + "geoSubject": [], + "language": [], + "type": "ecrm:E20_Biological_Object", + "content": [], + "localityDescription": ["[Western Samoa, Savai'i] Hinter [behind] Safai"], + "acquisitionStatement": [], + "recordScore": 40, + "responsibility": [], + "dc_contributor": ["R. O. Gardner"], + "isTaonga": false, + "place": { + "found": { "_all": [] }, + "made": { "_all": [] }, + "associated": { "_all": [] }, + "captured": { "_all": [] }, + "published": { "_all": [] }, + "acquired": { "_all": ["Samoa"] }, + "_all": ["Samoa"] + }, + "appellation": { + "Common Name": [], + "Classification Display Value": ["Cypholophus macrocephalus mollis"], + "Primary Title": [ + "Cypholophus macrocephalus mollis (Blume) Wedd. var. mollis (Wedd.) Wedd." + ], + "Other Title": [], + "_all_suggest": { + "input": [ + "mollis", + "Cypholophus macrocephalus mollis", + "Cypholophus macrocephalus mollis (Blume) Wedd. var. mollis (Wedd.) Wedd." + ], + "contexts": { "type_context": "ecrm:E20_Biological_Object" } + }, + "Maori Name": [], + "Classification Value": ["mollis"], + "_all": [ + "mollis", + "Cypholophus macrocephalus mollis", + "Cypholophus macrocephalus mollis (Blume) Wedd. var. mollis (Wedd.) Wedd." 
+ ] + }, + "keyword": [], + "department": ["botany"], + "dc_identifier": ["AK28252"], + "process": [], + "period": { + "made": [{ "exact": "2010-07-08T00:00:00.000Z" }], + "associated": [], + "published": [], + "accession": [ + { + "end": "2010-07-08T00:00:00.000Z", + "text": "08 Jul 2010", + "begin": "2010-07-08T00:00:00.000Z" + } + ], + "acquired": [ + { + "end": "1905-06-22T00:00:00.000Z", + "text": "22 Jun 1905", + "begin": "1905-06-22T00:00:00.000Z" + } + ], + "time_period": [], + "_all": [ + { + "end": "2010-07-08T00:00:00.000Z", + "text": "08 Jul 2010", + "begin": "2010-07-08T00:00:00.000Z" + }, + { + "end": "1905-06-22T00:00:00.000Z", + "text": "22 Jun 1905", + "begin": "1905-06-22T00:00:00.000Z" + }, + { "exact": "2010-07-08T00:00:00.000Z" } + ] + }, + "subjectStatus": [], + "geopos": [], + "primaryRepresentation": "http://api.aucklandmuseum.com/id/media/v/214749", + "dc_date": ["Jul 1989"], + "typeStatus": [], + "collection": [], + "classification": [ + { + "object": [ + { + "Kingdom": "Linnaean", + "Genus": "Linnaean", + "Linnaean System": "Linnaean", + "Family": "Linnaean", + "Species": "Linnaean", + "_all": [ + "Mollis", + "Macrocephalus", + "Cypholophus", + "Urticaceae", + "Plantae", + "Linnaean", + "Mollis", + "Macrocephalus", + "Cypholophus", + "Urticaceae", + "Plantae", + "Linnaean", + "Mollis", + "Macrocephalus", + "Cypholophus", + "Urticaceae", + "Plantae", + "Linnaean", + "Mollis", + "Macrocephalus", + "Cypholophus", + "Urticaceae", + "Plantae", + "Linnaean", + "Mollis", + "Macrocephalus", + "Cypholophus", + "Urticaceae", + "Plantae", + "Linnaean", + "Mollis", + "Macrocephalus", + "Cypholophus", + "Urticaceae", + "Plantae", + "Linnaean" + ], + "Var.": "Linnaean" + } + ] + }, + { "material": [] }, + { "place": [] } + ], + "subjectCategory": [], + "lastModifiedOn": ["2022-06-23T09:44:41.824Z"], + "tags": { "official": [], "user": [], "_all": [] }, + "dc_place": ["Samoa"], + "kindOfSpecimen": ["1F- Foreign dry"], + "unit": [], + "culturalOrigin": 
def test_get_next_query_params_default_response():
    """The first request asks for 2000 records from the ingester's current offset."""
    actual_param = ingester.get_next_query_params(None)
    assert actual_param == {"size": "2000", "from": ingester.batch_start}


def test_get_record_data():
    """The `single_item.json` fixture is mapped to the expected image record."""
    record = _get_resource_json("single_item.json")
    actual_data = ingester.get_record_data(record)

    expected_data = {
        "foreign_landing_url": "https://www.aucklandmuseum.com/collections-research/collections/record/am_naturalsciences-object-691102",
        "foreign_identifier": "691102",
        "url": "http://api.aucklandmuseum.com/id/media/v/214749",
        "license_info": CC_BY_4_0,
        "creator": "R. O. Gardner",
        "title": "Cypholophus macrocephalus mollis (Blume) Wedd. var. mollis (Wedd.) Wedd.",
        "meta_data": {
            "type": "ecrm:E20_Biological_Object",
            "geopos": "",
            "department": "botany",
        },
    }

    assert actual_data == expected_data