Skip to content

Commit a949e6d

Browse files
author
Matt Sokoloff
committed
support bulk exporting datasets
1 parent 6983bfb commit a949e6d

File tree

3 files changed

+59
-5
lines changed

3 files changed

+59
-5
lines changed

labelbox/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
name = "labelbox"
2-
__version__ = "2.6.0"
2+
__version__ = "2.7.0"
33

44
from labelbox.client import Client
55
from labelbox.schema.bulk_import_request import BulkImportRequest

labelbox/schema/dataset.py

Lines changed: 48 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,12 @@
33
import logging
44
from itertools import islice
55
from multiprocessing.dummy import Pool as ThreadPool
6+
import time
7+
import ndjson
8+
from io import StringIO
9+
import requests
610

7-
from labelbox.exceptions import InvalidQueryError, ResourceNotFoundError, InvalidAttributeError
11+
from labelbox.exceptions import InvalidQueryError, LabelboxError, ResourceNotFoundError, InvalidAttributeError
812
from labelbox.orm.db_object import DbObject, Updateable, Deletable
913
from labelbox.orm.model import Entity, Field, Relationship
1014

@@ -75,15 +79,15 @@ def create_data_rows(self, items):
7579
is uploaded to Labelbox and a DataRow referencing it is created.
7680
7781
If an item is a `dict`, then it could support one of the two following structures
78-
1. For static imagery, video, and text it should map `DataRow` fields (or their names) to values.
82+
1. For static imagery, video, and text it should map `DataRow` fields (or their names) to values.
7983
At the minimum an `item` passed as a `dict` must contain a `DataRow.row_data` key and value.
8084
2. For tiled imagery the dict must match the import structure specified in the link below
8185
https://docs.labelbox.com/data-model/en/index-en#tiled-imagery-import
82-
86+
8387
>>> dataset.create_data_rows([
8488
>>> {DataRow.row_data:"http://my_site.com/photos/img_01.jpg"},
8589
>>> "path/to/file2.jpg",
86-
>>> {"tileLayerUrl" : "http://", ...}
90+
>>> {"tileLayerUrl" : "http://", ...}
8791
>>> ])
8892
8993
For an example showing how to upload tiled data_rows see the following notebook:
@@ -227,3 +231,43 @@ def data_row_for_external_id(self, external_id):
227231
f"More than one data_row has the provided external_id : `%s`. Use function data_rows_for_external_id to fetch all",
228232
external_id)
229233
return data_rows[0]
234+
235+
def export_data_rows(self, timeout_seconds=120):
    """ Returns a generator that produces all data rows that are currently
    attached to this dataset.

    Polls the export mutation until the server reports COMPLETE (or FAILED),
    then downloads and parses the ndjson payload.

    Args:
        timeout_seconds (float): Max waiting time, in seconds, before
            giving up on the export.
    Returns:
        Generator of DataRow objects that belong to this dataset.
    Raises:
        LabelboxError: if the export fails or is unable to download within
            the specified time.
    """
    id_param = "datasetId"
    query_str = """mutation GetQueuedDataRowsExportUrlPyApi($%s: ID!)
        {exportDatasetDataRows(data:{datasetId: $%s }) {downloadUrl createdAt status}}
    """ % (id_param, id_param)
    sleep_time = 2
    # Remember the configured timeout: `timeout_seconds` is decremented
    # while polling, so it would read as <= 0 in the error message below.
    original_timeout = timeout_seconds
    while True:
        res = self.client.execute(query_str, {id_param: self.uid})
        res = res["exportDatasetDataRows"]
        if res["status"] == "COMPLETE":
            download_url = res["downloadUrl"]
            response = requests.get(download_url)
            response.raise_for_status()
            # The export payload is newline-delimited JSON; parse it and
            # lazily wrap each record in a DataRow entity.
            reader = ndjson.reader(StringIO(response.text))
            return (
                Entity.DataRow(self.client, result) for result in reader)
        elif res["status"] == "FAILED":
            raise LabelboxError("Data row export failed.")

        timeout_seconds -= sleep_time
        if timeout_seconds <= 0:
            raise LabelboxError(
                f"Unable to export data rows within {original_timeout} seconds."
            )

        logger.debug("Dataset '%s' data row export, waiting for server...",
                     self.uid)
        time.sleep(sleep_time)

tests/integration/test_dataset.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,3 +97,13 @@ def test_upload_video_file(dataset, sample_video: str) -> None:
9797
response = requests.head(url, allow_redirects=True)
9898
assert int(response.headers['Content-Length']) == content_length
9999
assert response.headers['Content-Type'] == 'video/mp4'
100+
101+
102+
def test_data_row_export(dataset):
    """Data rows created on a dataset should round-trip through export."""
    n_data_rows = 5
    # Must be set(), not {} — `{}` is an empty dict and has no .add().
    ids = set()
    for _ in range(n_data_rows):
        ids.add(dataset.create_data_row(row_data=IMG_URL))
    # The method added to Dataset is `export_data_rows`, not
    # `export_queued_data_rows`.
    result = list(dataset.export_data_rows())
    assert len(result) == n_data_rows
    assert set(result) == ids

0 commit comments

Comments
 (0)