 from io import StringIO
 import requests
 
-from labelbox.exceptions import InvalidQueryError, LabelboxError, ResourceNotFoundError, InvalidAttributeError
+from labelbox.exceptions import InvalidQueryError, LabelboxError, ResourceNotFoundError, ResourceCreationError
 from labelbox.orm.comparison import Comparison
 from labelbox.orm.db_object import DbObject, Updateable, Deletable, experimental
 from labelbox.orm.model import Entity, Field, Relationship
@@ -124,7 +124,6 @@ def data_rows(
 
     def create_data_row(self, items=None, **kwargs) -> "DataRow":
         """ Creates a single DataRow belonging to this dataset.
-
         >>> dataset.create_data_row(row_data="http://my_site.com/photos/img_01.jpg")
 
         Args:
@@ -139,82 +138,31 @@ def create_data_row(self, items=None, **kwargs) -> "DataRow":
                 in `kwargs`.
             InvalidAttributeError: in case the DB object type does not contain
                 any of the field names given in `kwargs`.
-
+            ResourceCreationError: If data row creation failed on the server side.
         """
         invalid_argument_error = "Argument to create_data_row() must be either a dictionary, or kwargs containing `row_data` at minimum"
 
-        def convert_field_keys(items):
-            if not isinstance(items, dict):
-                raise InvalidQueryError(invalid_argument_error)
-            return {
-                key.name if isinstance(key, Field) else key: value
-                for key, value in items.items()
-            }
-
         if items is not None and len(kwargs) > 0:
             raise InvalidQueryError(invalid_argument_error)
 
-        DataRow = Entity.DataRow
-        args = convert_field_keys(items) if items is not None else kwargs
-
-        if DataRow.row_data.name not in args:
-            raise InvalidQueryError(
-                "DataRow.row_data missing when creating DataRow.")
-
-        row_data = args[DataRow.row_data.name]
-
-        if isinstance(row_data, str) and row_data.startswith("s3:/"):
-            raise InvalidQueryError(
-                "row_data: s3 assets must start with 'https'.")
-
-        if not isinstance(row_data, str):
-            # If the row data is an object, upload as a string
-            args[DataRow.row_data.name] = json.dumps(row_data)
-        elif os.path.exists(row_data):
-            # If row data is a local file path, upload it to server.
-            args[DataRow.row_data.name] = self.client.upload_file(row_data)
-
-        # Parse metadata fields, if they are provided
-        if DataRow.metadata_fields.name in args:
-            mdo = self.client.get_data_row_metadata_ontology()
-            args[DataRow.metadata_fields.name] = mdo.parse_upsert_metadata(
-                args[DataRow.metadata_fields.name])
-
-        if "embeddings" in args:
-            args["embeddings"] = [
-                EmbeddingVector(**e).to_gql() for e in args["embeddings"]
-            ]
+        args = items if items is not None else kwargs
 
-        query_str = """mutation CreateDataRowPyApi(
-            $row_data: String!,
-            $metadata_fields: [DataRowCustomMetadataUpsertInput!],
-            $attachments: [DataRowAttachmentInput!],
-            $media_type: MediaType,
-            $external_id: String,
-            $global_key: String,
-            $dataset: ID!,
-            $embeddings: [DataRowEmbeddingVectorInput!]
-        ){
-            createDataRow(
-                data:
-                  {
-                    rowData: $row_data
-                    mediaType: $media_type
-                    metadataFields: $metadata_fields
-                    externalId: $external_id
-                    globalKey: $global_key
-                    attachments: $attachments
-                    dataset: {connect: {id: $dataset}}
-                    embeddings: $embeddings
-                  }
-            )
-            {%s}
-        }
-        """ % query.results_query_part(Entity.DataRow)
-        res = self.client.execute(query_str, {**args, 'dataset': self.uid})
-        return DataRow(self.client, res['createDataRow'])
+        file_upload_thread_count = 1
+        completed_task = self._create_data_rows_sync(
+            [args], file_upload_thread_count=file_upload_thread_count)
+
+        res = completed_task.result
+        if res is None or len(res) == 0:
+            raise ResourceCreationError(
+                f"Data row upload did not complete, task status {completed_task.status} task id {completed_task.uid}"
+            )
+
+        return self.client.get_data_row(res[0]['id'])
 
-    def create_data_rows_sync(self, items) -> None:
+    def create_data_rows_sync(
+            self,
+            items,
+            file_upload_thread_count=FILE_UPLOAD_THREAD_COUNT) -> None:
         """ Synchronously bulk upload data rows.
 
         Use this instead of `Dataset.create_data_rows` for smaller batches of data rows that need to be uploaded quickly.
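For reference, a minimal usage sketch of the reworked `create_data_row`: the single item is wrapped in a list, routed through `_create_data_rows_sync`, and the created row is fetched back by id. This is illustrative only and assumes a configured `labelbox.Client`; the API key and dataset id below are placeholders.

# Illustrative sketch, not part of the diff; credentials and ids are placeholders.
from labelbox import Client

client = Client(api_key="<YOUR_API_KEY>")
dataset = client.get_dataset("<DATASET_ID>")

# A single row now goes through the same upsert machinery as bulk uploads
# and comes back as a fully materialized DataRow.
data_row = dataset.create_data_row(
    row_data="http://my_site.com/photos/img_01.jpg",
    global_key="photos/img_01.jpg",
)
print(data_row.uid)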
@@ -228,32 +176,49 @@ def create_data_rows_sync(self, items) -> None:
             None. If the function doesn't raise an exception then the import was successful.
 
         Raises:
-            InvalidQueryError: If the `items` parameter does not conform to
+            ResourceCreationError: If the `items` parameter does not conform to
                 the specification in Dataset._create_descriptor_file or if the server did not accept the
                 DataRow creation request (unknown reason).
             InvalidAttributeError: If there are fields in `items` not valid for
                 a DataRow.
             ValueError: When the upload parameters are invalid
         """
+        warnings.warn(
+            "This method is deprecated and will be "
+            "removed in a future release. Please use create_data_rows instead.")
+
+        self._create_data_rows_sync(
+            items, file_upload_thread_count=file_upload_thread_count)
+
+        return None  # Return None if no exception is raised
+
+    def _create_data_rows_sync(self,
+                               items,
+                               file_upload_thread_count=FILE_UPLOAD_THREAD_COUNT
+                              ) -> "DataUpsertTask":
         max_data_rows_supported = 1000
-        max_attachments_per_data_row = 5
         if len(items) > max_data_rows_supported:
             raise ValueError(
                 f"Dataset.create_data_rows_sync() supports a max of {max_data_rows_supported} data rows."
                 " For larger imports use the async function Dataset.create_data_rows()"
             )
-        descriptor_url = DescriptorFileCreator(self.client).create_one(
-            items, max_attachments_per_data_row=max_attachments_per_data_row)
-        dataset_param = "datasetId"
-        url_param = "jsonUrl"
-        query_str = """mutation AppendRowsToDatasetSyncPyApi($%s: ID!, $%s: String!){
-            appendRowsToDatasetSync(data:{datasetId: $%s, jsonFileUrl: $%s}
-            ){dataset{id}}} """ % (dataset_param, url_param, dataset_param,
-                                   url_param)
-        self.client.execute(query_str, {
-            dataset_param: self.uid,
-            url_param: descriptor_url
-        })
+        if file_upload_thread_count < 1:
+            raise ValueError(
+                "file_upload_thread_count must be a positive integer")
+
+        task: DataUpsertTask = self.create_data_rows(items,
+                                                     file_upload_thread_count)
+        task.wait_till_done()
+
+        if task.has_errors():
+            raise ResourceCreationError(
+                f"Data row upload errors: {task.errors}", cause=task.uid)
+        if task.status != "COMPLETE":
+            raise ResourceCreationError(
+                f"Data row upload did not complete, task status {task.status} task id {task.uid}"
+            )
+
+        return task
 
     def create_data_rows(self,
                          items,
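A short migration sketch for callers: `create_data_rows_sync` keeps working but now warns and delegates to the task-based path, so the preferred route is `create_data_rows` plus an explicit wait. This assumes the same placeholder `dataset` as in the sketch above.

# Illustrative sketch, not part of the diff.
items = [{
    "row_data": "http://my_site.com/photos/img_02.jpg",
    "global_key": "photos/img_02.jpg",
}]

# Deprecated path: still synchronous, now emits a deprecation warning and raises
# ResourceCreationError if the underlying upload task does not complete cleanly.
dataset.create_data_rows_sync(items)

# Preferred path: create the upload task and wait on it explicitly.
task = dataset.create_data_rows(items)
task.wait_till_done()
if task.has_errors():
    raise RuntimeError(f"Data row upload errors: {task.errors}")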
@@ -287,14 +252,18 @@ def create_data_rows(self,
             raise ValueError(
                 "file_upload_thread_count must be a positive integer")
 
+        # Split dict specs from local file paths, then build and upsert the specs
+        upload_items = self._separate_and_process_items(items)
+        specs = DataRowCreateItem.build(self.uid, upload_items)
+        return self._exec_upsert_data_rows(specs, file_upload_thread_count)
+
+    def _separate_and_process_items(self, items):
         string_items = [item for item in items if isinstance(item, str)]
         dict_items = [item for item in items if isinstance(item, dict)]
         dict_string_items = []
         if len(string_items) > 0:
             dict_string_items = self._build_from_local_paths(string_items)
-        specs = DataRowCreateItem.build(self.uid,
-                                        dict_items + dict_string_items)
-        return self._exec_upsert_data_rows(specs, file_upload_thread_count)
+        return dict_items + dict_string_items
 
     def _build_from_local_paths(
             self,