Skip to content

Commit 2044e72

Browse files
authored
Create dataset from dir (#412)
1 parent 72ce737 commit 2044e72

File tree

5 files changed

+165
-4
lines changed

5 files changed

+165
-4
lines changed

CHANGELOG.md

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,49 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

88

9+
## [0.16.10](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.16.10) - 2023-11-22
10+
11+
Allow creating a dataset by crawling all images in a directory, recursively. Also supports privacy mode datasets.
12+
13+
#### Example structure:
14+
```
15+
~/Documents/
16+
data/
17+
2022/
18+
- img01.png
19+
- img02.png
20+
2023/
21+
- img01.png
22+
- img02.png
23+
```
24+
25+
#### Default Example:
26+
27+
```python
28+
data_dir = "~/Documents/data"
29+
client.create_dataset_from_dir(data_dir)
30+
# this will create a dataset named "data" containing 4 images, with the ref IDs:
31+
# ["2022/img01.png", "2022/img02.png", "2023/img01.png", "2023/img02.png"]
32+
```
33+
34+
#### Example Privacy Mode:
35+
36+
This requires that a proxy (or file server) is set up and can serve files _relative_ to the data_dir
37+
38+
```python
39+
data_dir = "~/Documents/data"
40+
client.create_dataset_from_dir(
41+
data_dir,
42+
dataset_name='my-dataset',
43+
use_privacy_mode=True,
44+
privacy_mode_proxy="http://localhost:5000/assets/"
45+
)
46+
```
47+
48+
This would create a dataset `my-dataset`, and when opened in Nucleus, the images would be requested from the path:
49+
`<privacy_mode_proxy>/<img ref id>`, for example: `http://localhost:5000/assets/2022/img01.png`
50+
51+
952
## [0.16.9](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.16.9) - 2023-11-17
1053

1154
### Fixes

nucleus/__init__.py

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545
import datetime
4646
import os
4747
import warnings
48-
from typing import Any, Dict, List, Optional, Sequence, Union
48+
from typing import Any, Dict, List, Optional, Tuple, Union
4949

5050
import pydantic
5151
import requests
@@ -86,6 +86,7 @@
8686
ERROR_ITEMS,
8787
ERROR_PAYLOAD,
8888
ERRORS_KEY,
89+
GLOB_SIZE_THRESHOLD_CHECK,
8990
I_KEY,
9091
IMAGE_KEY,
9192
IMAGE_URL_KEY,
@@ -150,6 +151,7 @@
150151
from .scene import Frame, LidarScene, VideoScene
151152
from .slice import Slice
152153
from .upload_response import UploadResponse
154+
from .utils import create_items_from_folder_crawl
153155
from .validate import Validate
154156

155157
# pylint: disable=E1101
@@ -1177,3 +1179,62 @@ def _set_api_key(self, api_key):
11771179
raise NoAPIKey()
11781180

11791181
return api_key
1182+
1183+
def create_dataset_from_dir(
    self,
    dirname: str,
    dataset_name: Optional[str] = None,
    use_privacy_mode: bool = False,
    privacy_mode_proxy: str = "",
    allowed_file_types: Tuple[str, ...] = ("png", "jpg", "jpeg"),
    skip_size_warning: bool = False,
) -> Optional["Dataset"]:
    """
    Create a dataset by recursively crawling through a directory.
    A DatasetItem will be created for each unique image found.

    Parameters:
        dirname: Where to look for image files, recursively
        dataset_name: If none is given, the parent folder name is used
        use_privacy_mode: Whether the dataset should be treated as privacy
        privacy_mode_proxy: Endpoint that serves image files for privacy mode,
            ignore if not using privacy mode. The proxy should work based on
            the relative path of the images in the directory.
        allowed_file_types: Which file type extensions to search for,
            ie: ('jpg', 'png')
        skip_size_warning: If False, it will throw an error if the script
            globs more than 500 images. This is a safety check in case the
            dirname has a typo, and grabs too much data.

    Returns:
        The newly created Dataset, or None if no matching files were found.

    Raises:
        ValueError: If privacy mode is requested without a proxy, if the
            directory does not exist, or if the crawl found more items than
            the safety threshold and ``skip_size_warning`` is False.
    """
    if use_privacy_mode and not privacy_mode_proxy:
        # Previously an `assert`; asserts are stripped under `python -O`,
        # so raise explicitly to keep this validation in optimized runs.
        raise ValueError(
            "When using privacy mode, must specify a proxy to serve the files"
        )

    # ensures path ends with a slash
    _dirname = os.path.join(os.path.expanduser(dirname), "")
    if not os.path.exists(_dirname):
        raise ValueError(
            f"Given directory name: {dirname} does not exist. Searched in {_dirname}"
        )

    # Default the dataset name to the crawled folder's own name.
    folder_name = os.path.basename(_dirname.rstrip("/"))
    dataset_name = dataset_name or folder_name
    items = create_items_from_folder_crawl(
        _dirname,
        allowed_file_types,
        use_privacy_mode,
        privacy_mode_proxy,
    )

    if not items:
        print(f"Did not find any items in {dirname}")
        return None

    if len(items) > GLOB_SIZE_THRESHOLD_CHECK and not skip_size_warning:
        # Guard against accidentally globbing a huge directory (e.g. a typo
        # in `dirname`); callers must opt in explicitly for big crawls.
        # ValueError (not bare Exception) so callers can catch it precisely;
        # still backward-compatible with `except Exception` handlers.
        raise ValueError(
            f"Found over {GLOB_SIZE_THRESHOLD_CHECK} items in {dirname}. If this is intended, set skip_size_warning=True when calling this function."
        )

    dataset = self.create_dataset(
        name=dataset_name, use_privacy_mode=use_privacy_mode
    )
    dataset.append(items, asynchronous=False)
    return dataset

nucleus/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,3 +166,4 @@
166166
X_KEY = "x"
167167
Y_KEY = "y"
168168
Z_KEY = "z"
169+
# Safety cap used by create_dataset_from_dir: crawls yielding more items than
# this raise unless the caller passes skip_size_warning=True.
GLOB_SIZE_THRESHOLD_CHECK = 500

nucleus/utils.py

Lines changed: 58 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
"""Shared stateless utility function library"""
2-
2+
import glob
33
import io
44
import json
5+
import os
56
import uuid
67
from collections import defaultdict
7-
from typing import IO, TYPE_CHECKING, Dict, List, Sequence, Type, Union
8+
from typing import IO, TYPE_CHECKING, Dict, List, Sequence, Tuple, Type, Union
89

910
import requests
11+
from PIL import Image
1012
from requests.models import HTTPError
1113

1214
from nucleus.annotation import (
@@ -422,3 +424,57 @@ def paginate_generator(
422424
yield json_value
423425
if not next_token:
424426
break
427+
428+
429+
def get_image_dimension(image_fpath: str) -> Tuple[int, int]:
    """Return the (width, height) of the image at ``image_fpath``.

    Uses a context manager so the underlying file handle is closed
    promptly: ``Image.open`` is lazy and would otherwise keep the file
    open until garbage collection, which can exhaust file descriptors
    when crawling large directories.
    """
    with Image.open(image_fpath) as im:
        return im.size
432+
433+
434+
def find_matching_filepaths(
    dirname: str, allowed_file_types: Tuple[str, ...]
) -> List[str]:
    """
    Returns a list of filepaths *relative* to dirname that matched the file globs

    Parameters:
        dirname: Root directory to search under (expected to end with a slash)
        allowed_file_types: File extensions to glob for, ie: ('jpg', 'png')
    """
    relative_fpaths = []
    for file_type in allowed_file_types:
        pathname = os.path.join(dirname, f"**/*.{file_type}")
        print(f"Searching for filepaths that match {pathname}")
        fpaths = glob.glob(pathname=pathname, recursive=True)
        # keep paths relative to dirname for easier management.
        # Strip only the *leading* dirname prefix. The previous
        # `fpath.replace(dirname, "")` removed every occurrence of the
        # prefix string, corrupting relative paths (and thus ref IDs)
        # whenever the prefix repeated inside a matched path.
        # TODO: this can be skipped in py version >= 3.10, where `root_dir` can be specified in the glob.
        relative_fpaths.extend(
            fpath[len(dirname):] if fpath.startswith(dirname) else fpath
            for fpath in fpaths
        )
    return relative_fpaths
451+
452+
453+
def create_items_from_folder_crawl(
    dirname: str,
    allowed_file_types: Tuple[str, ...],
    use_privacy_mode: bool,
    privacy_mode_proxy: str,
) -> List[DatasetItem]:
    """Build a DatasetItem for every matching file found under ``dirname``.

    Each item's reference ID is the file path relative to ``dirname``. In
    privacy mode the image location points at the proxy endpoint instead of
    the local file, and the local file is opened once to record its width
    and height; otherwise dimensions are left as None.
    """
    items: List[DatasetItem] = []
    for rel_path in find_matching_filepaths(dirname, allowed_file_types):
        location = os.path.join(dirname, rel_path)
        dims = (None, None)

        if use_privacy_mode:
            # Dimensions must be read locally since Nucleus cannot
            # fetch the image itself in privacy mode.
            dims = get_image_dimension(location)
            location = os.path.join(privacy_mode_proxy, rel_path)

        items.append(
            DatasetItem(
                image_location=location,
                reference_id=rel_path,
                width=dims[0],
                height=dims[1],
            )
        )

    return items

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ ignore = ["E501", "E741", "E731", "F401"] # Easy ignore for getting it running
2525

2626
[tool.poetry]
2727
name = "scale-nucleus"
28-
version = "0.16.9"
28+
version = "0.16.10"
2929
description = "The official Python client library for Nucleus, the Data Platform for AI"
3030
license = "MIT"
3131
authors = ["Scale AI Nucleus Team <nucleusapi@scaleapi.com>"]

0 commit comments

Comments
 (0)