From b3f9e7ce8e106c7c32052d94e7680debad433d0b Mon Sep 17 00:00:00 2001 From: Sol Lee Date: Thu, 7 Mar 2024 02:32:29 +0000 Subject: [PATCH 1/7] Add CKAN provider --- binderhub/app.py | 2 + binderhub/event-schemas/launch.json | 3 +- binderhub/main.py | 1 + binderhub/repoproviders.py | 67 +++++++++++++++++++++++++ binderhub/static/js/src/form.js | 3 +- binderhub/tests/test_repoproviders.py | 29 +++++++++++ docs/source/developer/repoproviders.rst | 2 + docs/source/reference/repoproviders.rst | 5 ++ 8 files changed, 110 insertions(+), 2 deletions(-) diff --git a/binderhub/app.py b/binderhub/app.py index 4c9e8f4c2..ae2632c9a 100644 --- a/binderhub/app.py +++ b/binderhub/app.py @@ -56,6 +56,7 @@ from .ratelimit import RateLimiter from .registry import DockerRegistry from .repoproviders import ( + CKANProvider, DataverseProvider, FigshareProvider, GistRepoProvider, @@ -586,6 +587,7 @@ def _default_build_namespace(self): "figshare": FigshareProvider, "hydroshare": HydroshareProvider, "dataverse": DataverseProvider, + "ckan": CKANProvider, }, config=True, help=""" diff --git a/binderhub/event-schemas/launch.json b/binderhub/event-schemas/launch.json index 16e277cf4..446182926 100644 --- a/binderhub/event-schemas/launch.json +++ b/binderhub/event-schemas/launch.json @@ -14,7 +14,8 @@ "Zenodo", "Figshare", "Hydroshare", - "Dataverse" + "Dataverse", + "CKAN" ], "description": "Provider for the repository being launched" }, diff --git a/binderhub/main.py b/binderhub/main.py index 2a2027598..f89d23d79 100644 --- a/binderhub/main.py +++ b/binderhub/main.py @@ -22,6 +22,7 @@ "figshare": "Figshare", "hydroshare": "Hydroshare", "dataverse": "Dataverse", + "ckan": "CKAN", } diff --git a/binderhub/repoproviders.py b/binderhub/repoproviders.py index be9dd75f4..fc3ca9cb5 100644 --- a/binderhub/repoproviders.py +++ b/binderhub/repoproviders.py @@ -448,6 +448,73 @@ def get_build_slug(self): return f"hydroshare-{self.record_id}" +class CKANProvider(RepoProvider): + """Provide contents of a 
CKAN dataset + Users must provide a spec consisting of the CKAN dataset URL. + """ + + name = Unicode("CKAN") + + display_name = "CKAN dataset" + + url_regex = r"/dataset/[a-z0-9_\\-]*$" + + labels = { + "text": "CKAN dataset URL (https://demo.ckan.org/dataset/sample-dataset-1)", + "tag_text": "Git ref (branch, tag, or commit)", + "ref_prop_disabled": True, + "label_prop_disabled": True, + } + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.repo = urllib.parse.unquote(self.spec) + + async def get_resolved_ref(self): + parsed_repo = urlparse(self.repo) + self.dataset_id = parsed_repo.path.rsplit("/", maxsplit=1)[1] + + client = AsyncHTTPClient() + + api = parsed_repo._replace( + path=re.sub(self.url_regex, "/api/3/action/", parsed_repo.path) + ).geturl() + + package_show_url = f"{api}package_show?id={self.dataset_id}" + + try: + r = await client.fetch(package_show_url, user_agent="BinderHub") + except HTTPError: + return None + + def parse_date(json_body): + json_response = json.loads(json_body) + date = json_response["result"]["metadata_modified"] + parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%f") + epoch = parsed_date.replace(tzinfo=timezone(timedelta(0))).timestamp() + # truncate the timestamp + return str(int(epoch)) + + self.record_id = f"{self.dataset_id}.v{parse_date(r.body)}" + + return self.record_id + + async def get_resolved_spec(self): + if not hasattr(self, "record_id"): + await self.get_resolved_ref() + return self.repo + + def get_repo_url(self): + return self.repo + + async def get_resolved_ref_url(self): + resolved_spec = await self.get_resolved_spec() + return resolved_spec + + def get_build_slug(self): + return f"ckan-{self.dataset_id}" + + class GitRepoProvider(RepoProvider): """Bare bones git repo provider. 
diff --git a/binderhub/static/js/src/form.js b/binderhub/static/js/src/form.js index cc00d7b45..1bf70e6f1 100644 --- a/binderhub/static/js/src/form.js +++ b/binderhub/static/js/src/form.js @@ -31,7 +31,8 @@ export function getBuildFormValues() { providerPrefix === "zenodo" || providerPrefix === "figshare" || providerPrefix === "dataverse" || - providerPrefix === "hydroshare" + providerPrefix === "hydroshare" || + providerPrefix === "ckan" ) { ref = ""; } diff --git a/binderhub/tests/test_repoproviders.py b/binderhub/tests/test_repoproviders.py index df5f63e0e..591bc0848 100644 --- a/binderhub/tests/test_repoproviders.py +++ b/binderhub/tests/test_repoproviders.py @@ -6,6 +6,7 @@ from tornado.ioloop import IOLoop from binderhub.repoproviders import ( + CKANProvider, DataverseProvider, FigshareProvider, GistRepoProvider, @@ -209,6 +210,34 @@ async def test_dataverse( assert spec == resolved_spec +@pytest.mark.parametrize( + "spec,resolved_spec,resolved_ref,resolved_ref_url,build_slug", + [ + [ + "https://demo.ckan.org/dataset/sample-dataset-1", + "https://demo.ckan.org/dataset/sample-dataset-1", + "sample-dataset-1.v", + "https://demo.ckan.org/dataset/sample-dataset-1", + "ckan-sample-dataset-1", + ], + ], +) +async def test_ckan(spec, resolved_spec, resolved_ref, resolved_ref_url, build_slug): + provider = CKANProvider(spec=spec) + + ref = await provider.get_resolved_ref() + assert resolved_ref in ref + + slug = provider.get_build_slug() + assert slug == build_slug + repo_url = provider.get_repo_url() + assert repo_url == spec + ref_url = await provider.get_resolved_ref_url() + assert ref_url == resolved_ref_url + spec = await provider.get_resolved_spec() + assert spec == resolved_spec + + @pytest.mark.github_api @pytest.mark.parametrize( "repo,unresolved_ref,resolved_ref", diff --git a/docs/source/developer/repoproviders.rst b/docs/source/developer/repoproviders.rst index 083db3e27..ab648f1c4 100644 --- a/docs/source/developer/repoproviders.rst +++ 
b/docs/source/developer/repoproviders.rst @@ -36,6 +36,8 @@ Currently supported providers, their prefixes and specs are: +------------+--------------------+-------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+ | Dataverse | ``dataverse`` | ```` | `Dataverse `_ is open source research data repository software installed all over the world. | +------------+--------------------+-------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+ + | CKAN | ``ckan`` | ``/`` | `CKAN `_ is an open source data management system. | + +------------+--------------------+-------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+ | Git | ``git`` | ``/`` | A generic repository provider for URLs that point directly to a git repository. | +------------+--------------------+-------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+ diff --git a/docs/source/reference/repoproviders.rst b/docs/source/reference/repoproviders.rst index d0f5ca37c..40b230d70 100644 --- a/docs/source/reference/repoproviders.rst +++ b/docs/source/reference/repoproviders.rst @@ -65,6 +65,11 @@ Module: :mod:`binderhub.repoproviders` .. autoconfigurable:: DataverseProvider :members: +:class:`CKANProvider` +--------------------------- + +.. 
autoconfigurable:: CKANProvider + :members: :class:`GitRepoProvider` --------------------------- From 83768a1cdbef03a7412c0c82be424e270a012ab8 Mon Sep 17 00:00:00 2001 From: Sol Lee Date: Mon, 25 Mar 2024 12:17:45 +0000 Subject: [PATCH 2/7] Inline the parse_date function --- binderhub/repoproviders.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/binderhub/repoproviders.py b/binderhub/repoproviders.py index fc3ca9cb5..e2d950014 100644 --- a/binderhub/repoproviders.py +++ b/binderhub/repoproviders.py @@ -487,15 +487,14 @@ async def get_resolved_ref(self): except HTTPError: return None - def parse_date(json_body): - json_response = json.loads(json_body) - date = json_response["result"]["metadata_modified"] - parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%f") - epoch = parsed_date.replace(tzinfo=timezone(timedelta(0))).timestamp() - # truncate the timestamp - return str(int(epoch)) - - self.record_id = f"{self.dataset_id}.v{parse_date(r.body)}" + json_response = json.loads(r.body) + date = json_response["result"]["metadata_modified"] + parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%f") + epoch = parsed_date.replace(tzinfo=timezone(timedelta(0))).timestamp() + # truncate the timestamp + dataset_version = str(int(epoch)) + + self.record_id = f"{self.dataset_id}.v{dataset_version}" return self.record_id From 1ba7d017584cb2412ca6c5637fbf85b09986c5cd Mon Sep 17 00:00:00 2001 From: Sol Lee Date: Mon, 25 Mar 2024 12:39:56 +0000 Subject: [PATCH 3/7] Make the splitting logic more straightforward --- binderhub/repoproviders.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/binderhub/repoproviders.py b/binderhub/repoproviders.py index e2d950014..5843e794d 100644 --- a/binderhub/repoproviders.py +++ b/binderhub/repoproviders.py @@ -457,8 +457,6 @@ class CKANProvider(RepoProvider): display_name = "CKAN dataset" - url_regex = r"/dataset/[a-z0-9_\\-]*$" - labels = { "text": "CKAN dataset URL 
(https://demo.ckan.org/dataset/sample-dataset-1)", "tag_text": "Git ref (branch, tag, or commit)", @@ -476,8 +474,10 @@ async def get_resolved_ref(self): client = AsyncHTTPClient() + url_parts = parsed_repo.path.split("/") + api_url_path = "/api/3/action/" api = parsed_repo._replace( - path=re.sub(self.url_regex, "/api/3/action/", parsed_repo.path) + path="/".join(url_parts[:-2]) + api_url_path ).geturl() package_show_url = f"{api}package_show?id={self.dataset_id}" From d0838308d48e8bb4e7cb7bf32b291d1e59f043fd Mon Sep 17 00:00:00 2001 From: Sol Lee Date: Wed, 15 May 2024 03:58:33 +0000 Subject: [PATCH 4/7] Handle the activities --- binderhub/repoproviders.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/binderhub/repoproviders.py b/binderhub/repoproviders.py index 5843e794d..955455671 100644 --- a/binderhub/repoproviders.py +++ b/binderhub/repoproviders.py @@ -15,7 +15,7 @@ import time import urllib.parse from datetime import datetime, timedelta, timezone -from urllib.parse import urlparse +from urllib.parse import parse_qs, urlparse import escapism from prometheus_client import Gauge @@ -470,20 +470,35 @@ def __init__(self, *args, **kwargs): async def get_resolved_ref(self): parsed_repo = urlparse(self.repo) - self.dataset_id = parsed_repo.path.rsplit("/", maxsplit=1)[1] - client = AsyncHTTPClient() + url_parts_1 = parsed_repo.path.split("/history/") + url_parts_2 = url_parts_1[0].split("/") + if url_parts_2[-2] == "dataset": + self.dataset_id = url_parts_2[-1] + else: + return None - url_parts = parsed_repo.path.split("/") api_url_path = "/api/3/action/" api = parsed_repo._replace( - path="/".join(url_parts[:-2]) + api_url_path + path="/".join(url_parts_2[:-2]) + api_url_path, query="" ).geturl() - package_show_url = f"{api}package_show?id={self.dataset_id}" + # handle the activites + activity_id = None + if parse_qs(parsed_repo.query).get("activity_id") is not None: + activity_id = 
parse_qs(parsed_repo.query).get("activity_id")[0] + if len(url_parts_1) == 2: + activity_id = url_parts_1[-1] + if activity_id: + fetch_url = ( + f"{api}activity_data_show?" f"id={activity_id}&object_type=package" + ) + else: + fetch_url = f"{api}package_show?id={self.dataset_id}" + client = AsyncHTTPClient() try: - r = await client.fetch(package_show_url, user_agent="BinderHub") + r = await client.fetch(fetch_url, user_agent="BinderHub") except HTTPError: return None From 32a534100773b0a8c3fd45d2e39d75fb9e0f72c5 Mon Sep 17 00:00:00 2001 From: Sol Lee Date: Thu, 27 Jun 2024 03:54:59 +0000 Subject: [PATCH 5/7] Use urlencode to construct query strings --- binderhub/repoproviders.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/binderhub/repoproviders.py b/binderhub/repoproviders.py index 955455671..2fad9f15b 100644 --- a/binderhub/repoproviders.py +++ b/binderhub/repoproviders.py @@ -15,7 +15,7 @@ import time import urllib.parse from datetime import datetime, timedelta, timezone -from urllib.parse import parse_qs, urlparse +from urllib.parse import parse_qs, urlencode, urlparse import escapism from prometheus_client import Gauge @@ -490,11 +490,11 @@ async def get_resolved_ref(self): if len(url_parts_1) == 2: activity_id = url_parts_1[-1] if activity_id: - fetch_url = ( - f"{api}activity_data_show?" f"id={activity_id}&object_type=package" + fetch_url = f"{api}activity_data_show?" + urlencode( + {"id": activity_id, "object_type": "package"} ) else: - fetch_url = f"{api}package_show?id={self.dataset_id}" + fetch_url = f"{api}package_show?" 
+ urlencode({"id": self.dataset_id}) client = AsyncHTTPClient() try: From 4ad04eaa5a6ef333b01e4b0f5aa448f3367f0250 Mon Sep 17 00:00:00 2001 From: Sol Lee Date: Thu, 27 Jun 2024 07:04:48 +0000 Subject: [PATCH 6/7] Cleanup URL parsing mechanisms --- binderhub/repoproviders.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/binderhub/repoproviders.py b/binderhub/repoproviders.py index 2fad9f15b..dbab18d5a 100644 --- a/binderhub/repoproviders.py +++ b/binderhub/repoproviders.py @@ -471,24 +471,29 @@ def __init__(self, *args, **kwargs): async def get_resolved_ref(self): parsed_repo = urlparse(self.repo) - url_parts_1 = parsed_repo.path.split("/history/") - url_parts_2 = url_parts_1[0].split("/") - if url_parts_2[-2] == "dataset": - self.dataset_id = url_parts_2[-1] - else: + if "/dataset/" not in parsed_repo.path: + # Not actually a dataset return None - api_url_path = "/api/3/action/" + # CKAN may be under a URL prefix, and we should accommodate that + url_prefix, dataset_url = parsed_repo.path.split("/dataset/") + + dataset_url_parts = dataset_url.split("/") + self.dataset_id = dataset_url_parts[0] + api = parsed_repo._replace( - path="/".join(url_parts_2[:-2]) + api_url_path, query="" + path=f"{url_prefix}/api/3/action/", query="" ).geturl() - # handle the activites + # Activity ID may be present either as a query parameter, activity_id + # or as part of the URL, under `/history/<activity-id>`. If `/history/` + # is present, that takes precedence over `activity_id` activity_id = None - if parse_qs(parsed_repo.query).get("activity_id") is not None: + if "history" in dataset_url_parts: + activity_id = dataset_url_parts[dataset_url_parts.index("history") + 1] + elif parse_qs(parsed_repo.query).get("activity_id") is not None: activity_id = parse_qs(parsed_repo.query).get("activity_id")[0] - if len(url_parts_1) == 2: - activity_id = url_parts_1[-1] + if activity_id: fetch_url = f"{api}activity_data_show?"
+ urlencode( {"id": activity_id, "object_type": "package"} From 2945d83ca686b7a68c28170c9eb4fcfd432702d2 Mon Sep 17 00:00:00 2001 From: Sol Lee Date: Thu, 27 Jun 2024 15:53:28 +0000 Subject: [PATCH 7/7] Add tests for dataset activities and non-CKAN datasets --- binderhub/tests/test_repoproviders.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/binderhub/tests/test_repoproviders.py b/binderhub/tests/test_repoproviders.py index 591bc0848..ea3b6e4c0 100644 --- a/binderhub/tests/test_repoproviders.py +++ b/binderhub/tests/test_repoproviders.py @@ -220,12 +220,31 @@ async def test_dataverse( "https://demo.ckan.org/dataset/sample-dataset-1", "ckan-sample-dataset-1", ], + [ + "https://demo.datashades.com/dataset/chart-test?activity_id=061888e9-e3c2-4769-b097-9c195a841e2f", + "https://demo.datashades.com/dataset/chart-test?activity_id=061888e9-e3c2-4769-b097-9c195a841e2f", + "chart-test.v1717501747", + "https://demo.datashades.com/dataset/chart-test?activity_id=061888e9-e3c2-4769-b097-9c195a841e2f", + "ckan-chart-test", + ], + [ + "https://demo.datashades.com/dataset/chart-test/history/061888e9-e3c2-4769-b097-9c195a841e2f", + "https://demo.datashades.com/dataset/chart-test/history/061888e9-e3c2-4769-b097-9c195a841e2f", + "chart-test.v1717501747", + "https://demo.datashades.com/dataset/chart-test/history/061888e9-e3c2-4769-b097-9c195a841e2f", + "ckan-chart-test", + ], + ["https://demo.ckan.org/group/roger", None, None, None, None], + ["https://demo.ckan.org/dataset/nosuchdataset", None, None, None, None], ], ) async def test_ckan(spec, resolved_spec, resolved_ref, resolved_ref_url, build_slug): provider = CKANProvider(spec=spec) ref = await provider.get_resolved_ref() + if not resolved_ref: + # We are done here if we don't expect to resolve + return assert resolved_ref in ref slug = provider.get_build_slug()