Skip to content

Commit 2044e72

Browse files
authored
Create dataset from dir (#412)
1 parent 72ce737 commit 2044e72

File tree

5 files changed

+165
-4
lines changed

5 files changed

+165
-4
lines changed

CHANGELOG.md

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,49 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

88

9+
## [0.16.10](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.16.10) - 2023-11-22
10+
11+
Allow creating a dataset by crawling all images in a directory, recursively. Also supports privacy mode datasets.
12+
13+
#### Example structure:
14+
```
15+
~/Documents/
16+
data/
17+
2022/
18+
- img01.png
19+
- img02.png
20+
2023/
21+
- img01.png
22+
- img02.png
23+
```
24+
25+
#### Default Example:
26+
27+
```python
28+
data_dir = "~/Documents/data"
29+
client.create_dataset_from_dir(data_dir)
30+
# this will create a dataset named "data" containing 4 images, with the ref IDs:
31+
# ["2022/img01.png", "2022/img02.png", "2023/img01.png", "2023/img02.png"]
32+
```
33+
34+
#### Example Privacy Mode:
35+
36+
This requires that a proxy (or file server) is set up and can serve files _relative_ to the data_dir
37+
38+
```python
39+
data_dir = "~/Documents/data"
40+
client.create_dataset_from_dir(
41+
data_dir,
42+
dataset_name='my-dataset',
43+
use_privacy_mode=True,
44+
privacy_mode_proxy="http://localhost:5000/assets/"
45+
)
46+
```
47+
48+
This would create a dataset `my-dataset`, and when opened in Nucleus, the images would be requested from the path:
49+
`<privacy_mode_proxy>/<img ref id>`, for example: `http://localhost:5000/assets/2022/img01.png`
50+
51+
952
## [0.16.9](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.16.9) - 2023-11-17
1053

1154
### Fixes

nucleus/__init__.py

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545
import datetime
4646
import os
4747
import warnings
48-
from typing import Any, Dict, List, Optional, Sequence, Union
48+
from typing import Any, Dict, List, Optional, Tuple, Union
4949

5050
import pydantic
5151
import requests
@@ -86,6 +86,7 @@
8686
ERROR_ITEMS,
8787
ERROR_PAYLOAD,
8888
ERRORS_KEY,
89+
GLOB_SIZE_THRESHOLD_CHECK,
8990
I_KEY,
9091
IMAGE_KEY,
9192
IMAGE_URL_KEY,
@@ -150,6 +151,7 @@
150151
from .scene import Frame, LidarScene, VideoScene
151152
from .slice import Slice
152153
from .upload_response import UploadResponse
154+
from .utils import create_items_from_folder_crawl
153155
from .validate import Validate
154156

155157
# pylint: disable=E1101
@@ -1177,3 +1179,62 @@ def _set_api_key(self, api_key):
11771179
raise NoAPIKey()
11781180

11791181
return api_key
1182+
1183+
def create_dataset_from_dir(
    self,
    dirname: str,
    dataset_name: Optional[str] = None,
    use_privacy_mode: bool = False,
    privacy_mode_proxy: str = "",
    allowed_file_types: Tuple[str, ...] = ("png", "jpg", "jpeg"),
    skip_size_warning: bool = False,
) -> Optional["Dataset"]:
    """
    Create a dataset by recursively crawling through a directory.
    A DatasetItem will be created for each unique image found.

    Parameters:
        dirname: Where to look for image files, recursively
        dataset_name: If none is given, the parent folder name is used
        use_privacy_mode: Whether the dataset should be treated as privacy
        privacy_mode_proxy: Endpoint that serves image files for privacy mode,
            ignore if not using privacy mode. The proxy should work based on
            the relative path of the images in the directory.
        allowed_file_types: Which file type extensions to search for,
            ie: ('jpg', 'png')
        skip_size_warning: If False, it will throw an error if the script
            globs more than 500 images. This is a safety check in case the
            dirname has a typo, and grabs too much data.

    Returns:
        The newly created Dataset, or None if no matching files were found.

    Raises:
        ValueError: If privacy mode is requested without a proxy, if the
            directory does not exist, or if the crawl found more items than
            the safety threshold and ``skip_size_warning`` is False.
    """
    if use_privacy_mode and not privacy_mode_proxy:
        # Previously an `assert`; asserts are stripped under `python -O`,
        # so raise explicitly to keep this validation in optimized runs.
        raise ValueError(
            "When using privacy mode, must specify a proxy to serve the files"
        )

    # ensures path ends with a slash
    _dirname = os.path.join(os.path.expanduser(dirname), "")
    if not os.path.exists(_dirname):
        raise ValueError(
            f"Given directory name: {dirname} does not exist. Searched in {_dirname}"
        )

    # Default the dataset name to the crawled folder's own name.
    folder_name = os.path.basename(_dirname.rstrip("/"))
    dataset_name = dataset_name or folder_name
    items = create_items_from_folder_crawl(
        _dirname,
        allowed_file_types,
        use_privacy_mode,
        privacy_mode_proxy,
    )

    if not items:
        print(f"Did not find any items in {dirname}")
        return None

    if len(items) > GLOB_SIZE_THRESHOLD_CHECK and not skip_size_warning:
        # Guard against accidentally globbing a huge directory (e.g. a typo
        # in `dirname`); callers must opt in explicitly for big crawls.
        # ValueError (not bare Exception) so callers can catch it precisely;
        # still backward-compatible with `except Exception` handlers.
        raise ValueError(
            f"Found over {GLOB_SIZE_THRESHOLD_CHECK} items in {dirname}. If this is intended, set skip_size_warning=True when calling this function."
        )

    dataset = self.create_dataset(
        name=dataset_name, use_privacy_mode=use_privacy_mode
    )
    dataset.append(items, asynchronous=False)
    return dataset

nucleus/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,3 +166,4 @@
166166
X_KEY = "x"
167167
Y_KEY = "y"
168168
Z_KEY = "z"
169+
# Safety cap used by create_dataset_from_dir: crawls yielding more items than
# this raise unless the caller passes skip_size_warning=True.
GLOB_SIZE_THRESHOLD_CHECK = 500

nucleus/utils.py

Lines changed: 58 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
"""Shared stateless utility function library"""
2-
2+
import glob
33
import io
44
import json
5+
import os
56
import uuid
67
from collections import defaultdict
7-
from typing import IO, TYPE_CHECKING, Dict, List, Sequence, Type, Union
8+
from typing import IO, TYPE_CHECKING, Dict, List, Sequence, Tuple, Type, Union
89

910
import requests
11+
from PIL import Image
1012
from requests.models import HTTPError
1113

1214
from nucleus.annotation import (
@@ -422,3 +424,57 @@ def paginate_generator(
422424
yield json_value
423425
if not next_token:
424426
break
427+
428+
429+
def get_image_dimension(image_fpath: str) -> Tuple[int, int]:
    """Return the (width, height) of the image at ``image_fpath``.

    Uses a context manager so the underlying file handle is closed
    promptly: ``Image.open`` is lazy and would otherwise keep the file
    open until garbage collection, which can exhaust file descriptors
    when crawling large directories.
    """
    with Image.open(image_fpath) as im:
        return im.size
432+
433+
434+
def find_matching_filepaths(
    dirname: str, allowed_file_types: Tuple[str, ...]
) -> List[str]:
    """
    Returns a list of filepaths *relative* to dirname that matched the file globs

    Parameters:
        dirname: Root directory to search under (expected to end with a slash)
        allowed_file_types: File extensions to glob for, ie: ('jpg', 'png')
    """
    relative_fpaths = []
    for file_type in allowed_file_types:
        pathname = os.path.join(dirname, f"**/*.{file_type}")
        print(f"Searching for filepaths that match {pathname}")
        fpaths = glob.glob(pathname=pathname, recursive=True)
        # keep paths relative to dirname for easier management.
        # Strip only the *leading* dirname prefix. The previous
        # `fpath.replace(dirname, "")` removed every occurrence of the
        # prefix string, corrupting relative paths (and thus ref IDs)
        # whenever the prefix repeated inside a matched path.
        # TODO: this can be skipped in py version >= 3.10, where `root_dir` can be specified in the glob.
        relative_fpaths.extend(
            fpath[len(dirname):] if fpath.startswith(dirname) else fpath
            for fpath in fpaths
        )
    return relative_fpaths
451+
452+
453+
def create_items_from_folder_crawl(
    dirname: str,
    allowed_file_types: Tuple[str, ...],
    use_privacy_mode: bool,
    privacy_mode_proxy: str,
) -> List[DatasetItem]:
    """Build a DatasetItem for every matching file found under ``dirname``.

    Each item's reference ID is the file path relative to ``dirname``. In
    privacy mode the image location points at the proxy endpoint instead of
    the local file, and the local file is opened once to record its width
    and height; otherwise dimensions are left as None.
    """
    items: List[DatasetItem] = []
    for rel_path in find_matching_filepaths(dirname, allowed_file_types):
        location = os.path.join(dirname, rel_path)
        dims = (None, None)

        if use_privacy_mode:
            # Dimensions must be read locally since Nucleus cannot
            # fetch the image itself in privacy mode.
            dims = get_image_dimension(location)
            location = os.path.join(privacy_mode_proxy, rel_path)

        items.append(
            DatasetItem(
                image_location=location,
                reference_id=rel_path,
                width=dims[0],
                height=dims[1],
            )
        )

    return items

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ ignore = ["E501", "E741", "E731", "F401"] # Easy ignore for getting it running
2525

2626
[tool.poetry]
2727
name = "scale-nucleus"
28-
version = "0.16.9"
28+
version = "0.16.10"
2929
description = "The official Python client library for Nucleus, the Data Platform for AI"
3030
license = "MIT"
3131
authors = ["Scale AI Nucleus Team <nucleusapi@scaleapi.com>"]

0 commit comments

Comments
 (0)