Commit ffa9ff7
feat(code): add function to download notebook outputs (#184)
BUG=b/371574828 CHILD=#185 BLOCKED_BY=go/kaggle-pr/32581,go/kaggle-pr/32632 CC=@rosbo,@dster2,@jplotts

Extends the same functionality from [`models.py`](https://github.com/Kaggle/kagglehub/blob/8b1fae8632f9d381cebb14ec50c56d7ff5fbeb1b/src/kagglehub/models.py), [`datasets.py`](https://github.com/Kaggle/kagglehub/blob/8b1fae8632f9d381cebb14ec50c56d7ff5fbeb1b/src/kagglehub/datasets.py), and [`competition.py`](https://github.com/Kaggle/kagglehub/blob/8b1fae8632f9d381cebb14ec50c56d7ff5fbeb1b/src/kagglehub/competition.py) to the notebooks at https://kaggle.com/code.

### Changes

[handle.py](https://github.com/Kaggle/kagglehub/pull/184/files#diff-4d86981fdd4a6e41ce621dd3dcafa482e4ca96d5cca7da9b6cbaff147cc3bffb)
- added a new `*Handle` data type, using `Code` to align with the route on the main site https://kaggle.com/code

[cache.py](https://github.com/Kaggle/kagglehub/pull/184/files#diff-3468a9a96dc65a3a8770b887cdc452e4975b9c934f0376c56a7e39dff7fd778a)
- added new functions that dictate the cached path for notebook outputs based on the properties in `CodeHandle`; this mostly mirrors the structure of the `model`, `dataset`, and `competition` paths
- the cache structure is split into the _output\_path_, the _archive\_path_, a _completion\_file\_marker\_path_ for individual files in the download payload, and a _completion\_file\_marker\_path_ for the entire download payload
- the structure is as follows:
```
<cache_root>/
└── notebooks/
    └── username/
        └── notebook_slug/
            ├── output.complete        <-- tracker for the entire output
            ├── .complete/             <-- per-file trackers within the output
            │   └── output/
            │       ├── file1.txt.complete
            │       └── file2.txt.complete
            ├── output.archive         <-- the compressed output (.tar.gz or .zip)
            └── output/                <-- the uncompressed output
                ├── file1.txt
                └── file2.txt
```

[http_resolver.py](https://github.com/Kaggle/kagglehub/pull/184/files#diff-bac8b2fc0706f6a2f83562279c3095d907c84f377a4f49829bb06935d3e6773a)
- implemented the `NotebookOutputHttpResolver`
- note: we don't currently have an API endpoint to download notebook output in a kagglehub-compatible compression format (left a TODO with our internal tracker for this)
- it leverages our existing `KaggleApiV1Client` plus the new cache location mentioned above

[registry.py](https://github.com/Kaggle/kagglehub/pull/184/files#diff-3a93c8bc0f26d8267eb445e7e90f03b2c85c54e8c59e37f997c82104cb5d1541) + [\_\_init\_\_.py](https://github.com/Kaggle/kagglehub/pull/184/files#diff-214a9613a5b623f57fb158c6c784e29e051f95fe72694b446cae592f76b825d8)
- bootstraps the `NotebookOutputHttpResolver` so that it can be called by `kagglehub.notebook_output_download` in `code.py`

[code.py](https://github.com/Kaggle/kagglehub/pull/184/files#diff-e0c454ef6e8643b0efb25013db58209adc53499f2a5a485a8b5bd63cd280899a)
- the entry point to the notebook-output downloading functionality
- the file is named `code.py` to align with our navigation paths at https://kaggle.com (similar to `models`, `datasets`, and `competitions`). Open to changing if needed.
- the function is named `notebook_output_download` to be more specific about what's being downloaded

[test_notebook_output_download.py](https://github.com/Kaggle/kagglehub/pull/184/files#diff-04eda2a8f4c98b098d47a0dc61b70c4bfce3a2fba4205abcb9830c69a05001e3)
- integration tests for the new `kagglehub.notebook_output_download` function
- TODO(#185): adding tests in a followup since that requires propping up a [stubbed API server](https://github.com/Kaggle/kagglehub/tree/8b1fae8632f9d381cebb14ec50c56d7ff5fbeb1b/tests/server_stubs). Trying to keep this diff from getting any bigger.

[gcs_upload.py](https://github.com/Kaggle/kagglehub/pull/184/files#diff-2dcd4fa7b008e12dc0d76354035bc2cdaf48dd2422e997476443a4fbe550d8ea)
- a miscellaneous lint error that slipped through; fixing here as a drive-by change as per [this comment](#184 (comment))
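The cache layout described above reduces to straightforward path construction. A minimal sketch (the cache root and the `owner`/`slug` values are placeholders, and `notebook_output_cache_paths` is a hypothetical helper, not the library's API):

```python
import os


def notebook_output_cache_paths(cache_root: str, owner: str, slug: str) -> dict[str, str]:
    # Mirrors the <cache_root>/notebooks/<username>/<notebook_slug>/ tree above.
    base = os.path.join(cache_root, "notebooks", owner, slug)
    return {
        "output": os.path.join(base, "output"),  # the uncompressed output
        "archive": os.path.join(base, "output.archive"),  # the compressed output
        "payload_marker": os.path.join(base, "output.complete"),  # tracker for the entire output
        "file_marker_dir": os.path.join(base, ".complete", "output"),  # per-file trackers
    }


paths = notebook_output_cache_paths("/tmp/kagglehub", "username", "notebook_slug")
# paths["archive"] → "/tmp/kagglehub/notebooks/username/notebook_slug/output.archive"
```

Keeping the per-file markers under a separate `.complete/` subtree means the marker files never collide with real output filenames.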
1 parent 8b1fae8 commit ffa9ff7

File tree: 9 files changed (+222, −6 lines)
test_notebook_output_download.py

Lines changed: 55 additions & 0 deletions

```python
import unittest

from requests import HTTPError

from kagglehub import notebook_output_download

from .utils import assert_files, create_test_cache, unauthenticated


class TestNotebookOutputDownload(unittest.TestCase):
    def test_download_notebook_output_succeeds(self) -> None:
        with create_test_cache():
            actual_path = notebook_output_download("alexisbcook/titanic-tutorial")

            expected_files = ["submission.csv"]
            assert_files(self, actual_path, expected_files)

    def test_download_public_notebook_output_as_unauthenticated_succeeds(self) -> None:
        with create_test_cache():
            with unauthenticated():
                actual_path = notebook_output_download("alexisbcook/titanic-tutorial")

                expected_files = ["submission.csv"]
                assert_files(self, actual_path, expected_files)

    def test_download_private_notebook_output_succeeds(self) -> None:
        with create_test_cache():
            actual_path = notebook_output_download("integrationtester/private-titanic-tutorial")

            expected_files = ["submission-01.csv", "submission-02.csv"]

            assert_files(self, actual_path, expected_files)

    def test_download_private_notebook_output_single_file_succeeds(self) -> None:
        with create_test_cache():
            actual_path = notebook_output_download(
                "integrationtester/private-titanic-tutorial", path="submission-02.csv"
            )

            expected_files = ["submission-02.csv"]

            assert_files(self, actual_path, expected_files)

    def test_download_large_notebook_output_warns(self) -> None:
        handle = "integrationtester/titanic-tutorial-many-output-files"
        with create_test_cache():
            # If the notebook output has > 25 files, we warn the user that it's not supported yet
            # TODO(b/379761520): add support for .tar.gz archived downloads
            notebook_output_download(handle)
            msg = f"Too many files in {handle} (capped at 25). Unable to download notebook output."
            self.assertLogs(msg, "WARNING")

    def test_download_private_notebook_output_with_incorrect_file_path_fails(self) -> None:
        with create_test_cache(), self.assertRaises(HTTPError):
            notebook_output_download("integrationtester/titanic-tutorial", path="submission-03.csv")
```

src/kagglehub/__init__.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -6,6 +6,7 @@
 from kagglehub.competition import competition_download
 from kagglehub.datasets import dataset_download, dataset_upload
 from kagglehub.models import model_download, model_upload
+from kagglehub.notebooks import notebook_output_download

 registry.model_resolver.add_implementation(http_resolver.ModelHttpResolver())
 registry.model_resolver.add_implementation(kaggle_cache_resolver.ModelKaggleCacheResolver())
@@ -17,3 +18,6 @@

 registry.competition_resolver.add_implementation(http_resolver.CompetitionHttpResolver())
 registry.competition_resolver.add_implementation(kaggle_cache_resolver.CompetitionKaggleCacheResolver())
+
+# TODO(b/380340624): implement a kaggle_cache_resolver for notebook outputs
+registry.notebook_output_resolver.add_implementation(http_resolver.NotebookOutputHttpResolver())
```

src/kagglehub/cache.py

Lines changed: 31 additions & 1 deletion

```diff
@@ -4,9 +4,10 @@
 from typing import Optional

 from kagglehub.config import get_cache_folder
-from kagglehub.handle import CompetitionHandle, DatasetHandle, ModelHandle, ResourceHandle
+from kagglehub.handle import CompetitionHandle, DatasetHandle, ModelHandle, NotebookHandle, ResourceHandle

 DATASETS_CACHE_SUBFOLDER = "datasets"
+NOTEBOOKS_CACHE_SUBFOLDER = "notebooks"  # for resources under kaggle.com/code
 COMPETITIONS_CACHE_SUBFOLDER = "competitions"
 MODELS_CACHE_SUBFOLDER = "models"
 FILE_COMPLETION_MARKER_FOLDER = ".complete"
@@ -35,6 +36,8 @@ def get_cached_path(handle: ResourceHandle, path: Optional[str] = None) -> str:
         return _get_dataset_path(handle, path)
     elif isinstance(handle, CompetitionHandle):
         return _get_competition_path(handle, path)
+    elif isinstance(handle, NotebookHandle):
+        return _get_notebook_output_path(handle, path)
     else:
         msg = "Invalid handle"
         raise ValueError(msg)
@@ -47,6 +50,8 @@ def get_cached_archive_path(handle: ResourceHandle) -> str:
         return _get_dataset_archive_path(handle)
     elif isinstance(handle, CompetitionHandle):
         return _get_competition_archive_path(handle)
+    elif isinstance(handle, NotebookHandle):
+        return _get_notebook_output_archive_path(handle)
     else:
         msg = "Invalid handle"
         raise ValueError(msg)
@@ -105,6 +110,8 @@ def _get_completion_marker_filepath(handle: ResourceHandle, path: Optional[str]
         return _get_datasets_completion_marker_filepath(handle, path)
     elif isinstance(handle, CompetitionHandle):
         return _get_competitions_completion_marker_filepath(handle, path)
+    elif isinstance(handle, NotebookHandle):
+        return _get_notebook_output_completion_marker_filepath(handle, path)
     else:
         msg = "Invalid handle"
         raise ValueError(msg)
@@ -118,6 +125,11 @@ def _get_dataset_path(handle: DatasetHandle, path: Optional[str] = None) -> str:
     return os.path.join(base_path, path) if path else base_path


+def _get_notebook_output_path(handle: NotebookHandle, path: Optional[str] = None) -> str:
+    base_path = os.path.join(get_cache_folder(), NOTEBOOKS_CACHE_SUBFOLDER, handle.owner, handle.notebook, "output")
+    return os.path.join(base_path, path) if path else base_path
+
+
 def _get_competition_path(handle: CompetitionHandle, path: Optional[str] = None) -> str:
     base_path = os.path.join(get_cache_folder(), COMPETITIONS_CACHE_SUBFOLDER, handle.competition)
     return os.path.join(base_path, path) if path else base_path
@@ -167,6 +179,10 @@ def _get_competition_archive_path(handle: CompetitionHandle) -> str:
     )


+def _get_notebook_output_archive_path(handle: NotebookHandle) -> str:
+    return os.path.join(get_cache_folder(), NOTEBOOKS_CACHE_SUBFOLDER, handle.owner, handle.notebook, "output.archive")
+
+
 def _get_models_completion_marker_filepath(handle: ModelHandle, path: Optional[str] = None) -> str:
     if path:
         return os.path.join(
@@ -213,6 +229,20 @@ def _get_datasets_completion_marker_filepath(handle: DatasetHandle, path: Option
     )


+def _get_notebook_output_completion_marker_filepath(handle: NotebookHandle, path: Optional[str] = None) -> str:
+    if path:
+        return os.path.join(
+            get_cache_folder(),
+            NOTEBOOKS_CACHE_SUBFOLDER,
+            handle.owner,
+            handle.notebook,
+            FILE_COMPLETION_MARKER_FOLDER,
+            "output",
+            f"{path}.complete",
+        )
+    return os.path.join(get_cache_folder(), NOTEBOOKS_CACHE_SUBFOLDER, handle.owner, handle.notebook, "output.complete")
+
+
 def _get_competitions_completion_marker_filepath(handle: CompetitionHandle, path: Optional[str] = None) -> str:
     if path:
         return os.path.join(
```

src/kagglehub/gcs_upload.py

Lines changed: 3 additions & 3 deletions

```diff
@@ -10,7 +10,7 @@
 from typing import Optional, Union

 import requests
-from requests.exceptions import ConnectionError, Timeout
+from requests.exceptions import Timeout
 from tqdm import tqdm
 from tqdm.utils import CallbackIOWrapper

@@ -66,7 +66,7 @@ def get_size(size: float, precision: int = 0) -> str:
     while size >= 1024 and suffix_index < 4:  # noqa: PLR2004
         suffix_index += 1
         size /= 1024.0
-    return "%.*f%s" % (precision, size, suffixes[suffix_index])
+    return f"{size:.{precision}f}{suffixes[suffix_index]}"


 def filtered_walk(*, base_dir: str, ignore_patterns: Sequence[str]) -> Iterable[tuple[str, list[str], list[str]]]:
@@ -109,7 +109,7 @@ def _check_uploaded_size(session_uri: str, file_size: int, backoff_factor: int =
             return 0  # If no Range header, assume no bytes were uploaded
         else:
            return file_size
-    except (ConnectionError, Timeout):
+    except (requests.ConnectionError, Timeout):
         logger.info(f"Network issue while checking uploaded size, retrying in {backoff_factor} seconds...")
         time.sleep(backoff_factor)
         backoff_factor = min(backoff_factor * 2, 60)
```
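The `"%.*f"`-to-f-string change in `get_size` is behavior-preserving. A standalone sketch, with a hypothetical `suffixes` list since its definition sits outside this diff:

```python
def get_size(size: float, precision: int = 0) -> str:
    # Hypothetical suffix list; the real `suffixes` definition is not shown in this diff.
    suffixes = ["B", "KB", "MB", "GB", "TB"]
    suffix_index = 0
    while size >= 1024 and suffix_index < 4:
        suffix_index += 1
        size /= 1024.0
    # f"{size:.{precision}f}" formats with a runtime precision, exactly like
    # the old "%.*f%s" % (precision, size, suffixes[suffix_index]).
    return f"{size:.{precision}f}{suffixes[suffix_index]}"


print(get_size(2048))               # → 2KB
print(get_size(1536, precision=1))  # → 1.5KB
```

The f-string form also avoids shadowing concerns: the old import of `requests.exceptions.ConnectionError` shadowed Python's builtin `ConnectionError`, which is what the `requests.ConnectionError` qualification in the same diff cleans up.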

src/kagglehub/handle.py

Lines changed: 25 additions & 0 deletions

```diff
@@ -12,6 +12,8 @@
 NUM_VERSIONED_MODEL_PARTS = 5  # e.g.: <owner>/<model>/<framework>/<variation>/<version>
 NUM_UNVERSIONED_MODEL_PARTS = 4  # e.g.: <owner>/<model>/<framework>/<variation>

+NUM_UNVERSIONED_NOTEBOOK_PARTS = 2  # e.g.: <owner>/<notebook>
+

 @dataclass
 class ResourceHandle:
@@ -83,6 +85,21 @@ def to_url(self) -> str:
         return base_url


+@dataclass
+class NotebookHandle(ResourceHandle):
+    owner: str
+    notebook: str
+
+    def __str__(self) -> str:
+        handle_str = f"{self.owner}/{self.notebook}"
+        return handle_str
+
+    def to_url(self) -> str:
+        endpoint = get_kaggle_api_endpoint()
+        base_url = f"{endpoint}/code/{self.owner}/{self.notebook}"
+        return base_url
+
+
 def parse_dataset_handle(handle: str) -> DatasetHandle:
     parts = handle.split("/")

@@ -152,3 +169,11 @@ def parse_competition_handle(handle: str) -> CompetitionHandle:
         raise ValueError(msg)

     return CompetitionHandle(competition=handle)
+
+
+def parse_notebook_handle(handle: str) -> NotebookHandle:
+    parts = handle.split("/")
+    if len(parts) != NUM_UNVERSIONED_NOTEBOOK_PARTS:
+        msg = f"Invalid notebook handle: {handle}"
+        raise ValueError(msg)
+    return NotebookHandle(owner=parts[0], notebook=parts[1])
```
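The new handle type round-trips between the `owner/notebook` string form and the `/code/` URL form. A self-contained sketch of that behavior, with the endpoint hardcoded here since the real class resolves it via `get_kaggle_api_endpoint()`:

```python
from dataclasses import dataclass


@dataclass
class NotebookHandle:
    owner: str
    notebook: str

    def __str__(self) -> str:
        return f"{self.owner}/{self.notebook}"

    def to_url(self) -> str:
        # Hardcoded endpoint for illustration only.
        return f"https://www.kaggle.com/code/{self.owner}/{self.notebook}"


def parse_notebook_handle(handle: str) -> NotebookHandle:
    parts = handle.split("/")
    if len(parts) != 2:  # NUM_UNVERSIONED_NOTEBOOK_PARTS: <owner>/<notebook>
        msg = f"Invalid notebook handle: {handle}"
        raise ValueError(msg)
    return NotebookHandle(owner=parts[0], notebook=parts[1])


h = parse_notebook_handle("alexisbcook/titanic-tutorial")
print(str(h))       # → alexisbcook/titanic-tutorial
print(h.to_url())   # → https://www.kaggle.com/code/alexisbcook/titanic-tutorial
```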

src/kagglehub/http_resolver.py

Lines changed: 64 additions & 1 deletion

```diff
@@ -2,6 +2,7 @@
 import os
 import tarfile
 import zipfile
+from pathlib import Path
 from typing import Optional

 import requests
@@ -16,7 +17,7 @@
 )
 from kagglehub.clients import KaggleApiV1Client
 from kagglehub.exceptions import UnauthenticatedError
-from kagglehub.handle import CompetitionHandle, DatasetHandle, ModelHandle, ResourceHandle
+from kagglehub.handle import CompetitionHandle, DatasetHandle, ModelHandle, NotebookHandle, ResourceHandle
 from kagglehub.resolver import Resolver

 DATASET_CURRENT_VERSION_FIELD = "currentVersionNumber"
@@ -199,6 +200,68 @@ def _inner_download_file(file: str) -> None:
     return out_path


+class NotebookOutputHttpResolver(Resolver[NotebookHandle]):
+    def is_supported(self, *_, **__) -> bool:  # noqa: ANN002, ANN003
+        # Downloading files over HTTP is supported in all environments for all handles / paths.
+        return True
+
+    def __call__(self, h: NotebookHandle, path: Optional[str] = None, *, force_download: Optional[bool] = False) -> str:
+        api_client = KaggleApiV1Client()
+
+        cached_response = load_from_cache(h, path)
+        if cached_response and not force_download:
+            return cached_response  # Already cached
+        elif cached_response and force_download:
+            delete_from_cache(h, path)
+
+        download_url_root = f"kernels/output/download/{h.owner}/{h.notebook}"
+        output_root = Path(get_cached_path(h, path))
+
+        # List the files and decide how to download them:
+        #   - <= 25 files: download the files in parallel
+        #   - >  25 files: download the archive and uncompress
+        (files, has_more) = self._list_files(api_client, h) if not path else ([path], False)
+        if has_more:
+            # TODO(b/379761520): add support for .tar.gz archived downloads
+            logger.warning(
+                f"Too many files in {h} (capped at {MAX_NUM_FILES_DIRECT_DOWNLOAD}). "
+                "Unable to download notebook output."
+            )
+            return ""
+
+        # Download files individually in parallel
+        def _inner_download_file(filepath: str) -> None:
+            download_url_path = f"{download_url_root}/{filepath}"
+            full_output_filepath = output_root / filepath
+
+            os.makedirs(os.path.dirname(full_output_filepath), exist_ok=True)
+            api_client.download_file(download_url_path, str(full_output_filepath), h)
+
+        thread_map(
+            _inner_download_file,
+            files,
+            desc=f"Downloading {len(files)} files",
+            max_workers=8,  # Never use more than 8 threads in parallel to download files.
+        )
+
+        mark_as_complete(h, path)
+
+        # TODO(b/377510971): when notebook is a Kaggle utility script, update sys.path
+        return str(output_root)
+
+    def _list_files(self, api_client: KaggleApiV1Client, h: NotebookHandle) -> tuple[list[str], bool]:
+        query = f"kernels/output/list/{h.owner}/{h.notebook}?page_size={MAX_NUM_FILES_DIRECT_DOWNLOAD}"
+        json_response = api_client.get(query, h)
+        if "files" not in json_response:
+            msg = "Invalid ApiListKernelSessionOutput API response. Expected to include a 'files' field"
+            raise ValueError(msg)
+
+        files = [f["fileName"].lstrip("/") for f in json_response["files"]]
+        has_more = "nextPageToken" in json_response and json_response["nextPageToken"] != ""
+
+        return (files, has_more)
+
+
 def _extract_archive(archive_path: str, out_path: str) -> None:
     logger.info("Extracting files...")
     if tarfile.is_tarfile(archive_path):
```
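The pagination check in `_list_files` can be exercised against a canned response. A minimal sketch of just the parsing step (the dict below is a hypothetical `ApiListKernelSessionOutput` payload, not a real API capture, and `parse_list_response` is a stand-in name):

```python
def parse_list_response(json_response: dict) -> tuple[list[str], bool]:
    # Mirrors the parsing inside NotebookOutputHttpResolver._list_files above.
    if "files" not in json_response:
        msg = "Invalid ApiListKernelSessionOutput API response. Expected to include a 'files' field"
        raise ValueError(msg)
    # File names may arrive with a leading slash; strip it so they join cleanly
    # onto the download URL root and the local output root.
    files = [f["fileName"].lstrip("/") for f in json_response["files"]]
    # A non-empty nextPageToken means the output exceeded the requested page
    # size, which the resolver treats as "too many files to download directly".
    has_more = "nextPageToken" in json_response and json_response["nextPageToken"] != ""
    return (files, has_more)


response = {
    "files": [{"fileName": "/submission.csv"}, {"fileName": "logs/train.log"}],
    "nextPageToken": "abc123",
}
print(parse_list_response(response))  # → (['submission.csv', 'logs/train.log'], True)
```

Because the page size is set to `MAX_NUM_FILES_DIRECT_DOWNLOAD` (25 here), a single list call both fetches the file names and detects the over-limit case in one round trip.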

src/kagglehub/notebooks.py

Lines changed: 27 additions & 0 deletions

```python
import logging
from typing import Optional

from kagglehub import registry
from kagglehub.handle import parse_notebook_handle
from kagglehub.logger import EXTRA_CONSOLE_BLOCK

logger = logging.getLogger(__name__)


def notebook_output_download(handle: str, path: Optional[str] = None, *, force_download: Optional[bool] = False) -> str:
    """[WORK IN PROGRESS]

    Download notebook output files.

    Args:
        handle: (string) the notebook handle under https://kaggle.com/code.
        path: (string) Optional path to a file within the notebook output.
        force_download: (bool) Optional flag to force download the notebook output, even if it's cached.

    Returns:
        A string representing the path to the requested notebook output files.
    """
    h = parse_notebook_handle(handle)
    logger.info(f"Downloading Notebook Output: {h.to_url()} ...", extra={**EXTRA_CONSOLE_BLOCK})
    return registry.notebook_output_resolver(h, path, force_download=force_download)
```

src/kagglehub/registry.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -31,3 +31,4 @@ def __call__(self, *args, **kwargs):  # noqa: ANN002, ANN003
 model_resolver = MultiImplRegistry("ModelResolver")
 dataset_resolver = MultiImplRegistry("DatasetResolver")
 competition_resolver = MultiImplRegistry("CompetitionResolver")
+notebook_output_resolver = MultiImplRegistry("NotebookOutputResolver")
```
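For context, the resolver-registry pattern being extended here can be sketched as follows. This is a simplified stand-in, not the actual `MultiImplRegistry` from `registry.py` (the dispatch order and error type are assumptions):

```python
class MultiImplRegistry:
    """Sketch: dispatches a call to the first registered implementation
    whose is_supported() returns True."""

    def __init__(self, name: str) -> None:
        self.name = name
        self._impls = []

    def add_implementation(self, impl) -> None:
        self._impls.append(impl)

    def __call__(self, *args, **kwargs):
        for impl in self._impls:
            if impl.is_supported(*args, **kwargs):
                return impl(*args, **kwargs)
        raise RuntimeError(f"No implementation in {self.name} supports this call")


class FakeNotebookOutputHttpResolver:
    # Stand-in for NotebookOutputHttpResolver: HTTP download works everywhere.
    def is_supported(self, *_, **__) -> bool:
        return True

    def __call__(self, handle, path=None, *, force_download=False) -> str:
        return f"/cache/notebooks/{handle}/output"


notebook_output_resolver = MultiImplRegistry("NotebookOutputResolver")
notebook_output_resolver.add_implementation(FakeNotebookOutputHttpResolver())
print(notebook_output_resolver("owner/notebook"))  # → /cache/notebooks/owner/notebook/output
```

This is why the one-line registration above is all `__init__.py` needs: once the TODO'd `kaggle_cache_resolver` lands, it becomes a second `add_implementation` call on the same registry.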

tests/test_handle.py

Lines changed: 12 additions & 1 deletion

```diff
@@ -1,4 +1,4 @@
-from kagglehub.handle import parse_competition_handle, parse_dataset_handle, parse_model_handle
+from kagglehub.handle import parse_competition_handle, parse_dataset_handle, parse_model_handle, parse_notebook_handle
 from tests.fixtures import BaseTestCase


@@ -68,3 +68,14 @@ def test_competition_handle(self) -> None:
         h = parse_competition_handle(handle)

         self.assertEqual("titanic", h.competition)
+
+    def test_code_handle(self) -> None:
+        handle = "owner/notebook"
+        h = parse_notebook_handle(handle)
+
+        self.assertEqual("owner", h.owner)
+        self.assertEqual("notebook", h.notebook)
+
+    def test_invalid_code_handle(self) -> None:
+        with self.assertRaises(ValueError):
+            parse_notebook_handle("notebook")
```
