Skip to content

Commit 126d9c8

Browse files
author
Matt Sokoloff
committed
metadata bug fixes, remove need for embeddings
1 parent 4b061f8 commit 126d9c8

File tree

2 files changed

+27
-30
lines changed

2 files changed

+27
-30
lines changed

labelbox/schema/data_row_metadata.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# type: ignore
2-
import datetime
2+
from datetime import datetime
33
import warnings
44
from copy import deepcopy
55
from enum import Enum
@@ -42,7 +42,7 @@ def id(self):
4242

4343
# Constraints for metadata values
4444
Embedding: Type[List[float]] = conlist(float, min_items=128, max_items=128)
45-
DateTime: Type[datetime.datetime] = datetime.datetime # must be in UTC
45+
DateTime: Type[datetime] = datetime # must be in UTC
4646
String: Type[str] = constr(max_length=500)
4747
OptionId: Type[SchemaId] = SchemaId # enum option
4848
Number: Type[float] = float
@@ -62,7 +62,7 @@ class Config:
6262
# Metadata base class
6363
class DataRowMetadataField(_CamelCaseMixin):
6464
schema_id: SchemaId
65-
value: Any
65+
value: Union[DataRowMetadataValue, _DataRowMetadataValuePrimitives]
6666

6767

6868
class DataRowMetadata(_CamelCaseMixin):
@@ -489,7 +489,6 @@ def _validate_parse_number(
489489

490490
def _validate_parse_datetime(
491491
field: DataRowMetadataField) -> List[Dict[str, Union[SchemaId, str]]]:
492-
# TODO: better validate tzinfo
493492
return [{
494493
"schemaId": field.schema_id,
495494
"value": field.value.isoformat() + "Z", # needs to be UTC

tests/integration/test_data_row_metadata.py

Lines changed: 24 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from labelbox.schema.data_row_metadata import DataRowMetadataField, DataRowMetadata, DeleteDataRowMetadata, \
88
DataRowMetadataOntology
99

10+
INVALID_SCHEMA_ID = "1" * 25
1011
FAKE_SCHEMA_ID = "0" * 25
1112
FAKE_DATAROW_ID = "D" * 25
1213
SPLIT_SCHEMA_ID = "cko8sbczn0002h2dkdaxb5kal"
@@ -15,6 +16,7 @@
1516
EMBEDDING_SCHEMA_ID = "ckpyije740000yxdk81pbgjdc"
1617
TEXT_SCHEMA_ID = "cko8s9r5v0001h2dk9elqdidh"
1718
CAPTURE_DT_SCHEMA_ID = "cko8sdzv70006h2dk8jg64zvb"
19+
PRE_COMPUTED_EMBEDDINGS_ID = 'ckrzang79000008l6hb5s6za1'
1820

1921
FAKE_NUMBER_FIELD = {
2022
"id": FAKE_SCHEMA_ID,
@@ -47,17 +49,6 @@ def big_dataset(dataset: Dataset, image_url):
4749
dataset.delete()
4850

4951

50-
def wait_for_embeddings_svc(data_row_ids, mdo):
51-
for idx in range(5):
52-
if all([
53-
len(metadata.fields)
54-
for metadata in mdo.bulk_export(data_row_ids)
55-
]):
56-
return
57-
time.sleep((idx + 1)**2)
58-
raise Exception("Embedding svc failed to update metadata.")
59-
60-
6152
def make_metadata(dr_id) -> DataRowMetadata:
6253
embeddings = [0.0] * 128
6354
msg = "A message"
@@ -97,18 +88,20 @@ def test_get_datarow_metadata_ontology(mdo):
9788

9889

9990
def test_bulk_upsert_datarow_metadata(datarow, mdo: DataRowMetadataOntology):
100-
wait_for_embeddings_svc([datarow.uid], mdo)
10191
metadata = make_metadata(datarow.uid)
10292
mdo.bulk_upsert([metadata])
103-
assert len(mdo.bulk_export([datarow.uid]))
104-
assert len(mdo.bulk_export([datarow.uid])[0].fields) == 5
93+
exported = mdo.bulk_export([datarow.uid])
94+
assert len(exported)
95+
assert len([
96+
field for field in exported[0].fields
97+
if field.schema_id != PRE_COMPUTED_EMBEDDINGS_ID
98+
]) == 4
10599

106100

107101
@pytest.mark.slow
108102
def test_large_bulk_upsert_datarow_metadata(big_dataset, mdo):
109103
metadata = []
110104
data_row_ids = [dr.uid for dr in big_dataset.data_rows()]
111-
wait_for_embeddings_svc(data_row_ids, mdo)
112105
for data_row_id in data_row_ids:
113106
metadata.append(make_metadata(data_row_id))
114107
errors = mdo.bulk_upsert(metadata)
@@ -119,14 +112,16 @@ def test_large_bulk_upsert_datarow_metadata(big_dataset, mdo):
119112
for metadata in mdo.bulk_export(data_row_ids)
120113
}
121114
for data_row_id in data_row_ids:
122-
assert len(metadata_lookup.get(data_row_id).fields)
115+
assert len([
116+
f for f in metadata_lookup.get(data_row_id).fields
117+
if f.schema_id != PRE_COMPUTED_EMBEDDINGS_ID
118+
]), metadata_lookup.get(data_row_id).fields
123119

124120

125121
def test_bulk_delete_datarow_metadata(datarow, mdo):
126122
"""test bulk deletes for all fields"""
127123
metadata = make_metadata(datarow.uid)
128124
mdo.bulk_upsert([metadata])
129-
130125
assert len(mdo.bulk_export([datarow.uid])[0].fields)
131126
upload_ids = [m.schema_id for m in metadata.fields[:-2]]
132127
mdo.bulk_delete(
@@ -155,7 +150,6 @@ def test_bulk_partial_delete_datarow_metadata(datarow, mdo):
155150
def test_large_bulk_delete_datarow_metadata(big_dataset, mdo):
156151
metadata = []
157152
data_row_ids = [dr.uid for dr in big_dataset.data_rows()]
158-
wait_for_embeddings_svc(data_row_ids, mdo)
159153
for data_row_id in data_row_ids:
160154
metadata.append(
161155
DataRowMetadata(data_row_id=data_row_id,
@@ -181,29 +175,33 @@ def test_large_bulk_delete_datarow_metadata(big_dataset, mdo):
181175
errors = mdo.bulk_delete(deletes)
182176
assert len(errors) == 0
183177
for data_row_id in data_row_ids:
184-
# 2 remaining because we delete the user provided embedding but text and labelbox generated embeddings still exist
185-
fields = mdo.bulk_export([data_row_id])[0].fields
186-
assert len(fields) == 2
178+
fields = [
179+
f for f in mdo.bulk_export([data_row_id])[0].fields
180+
if f.schema_id != PRE_COMPUTED_EMBEDDINGS_ID
181+
]
182+
assert len(fields) == 1, fields
187183
assert EMBEDDING_SCHEMA_ID not in [field.schema_id for field in fields]
188184

189185

190186
def test_bulk_delete_datarow_enum_metadata(datarow: DataRow, mdo):
191187
"""test bulk deletes for non non fields"""
192-
wait_for_embeddings_svc([datarow.uid], mdo)
193188
metadata = make_metadata(datarow.uid)
194189
metadata.fields = [
195190
m for m in metadata.fields if m.schema_id == SPLIT_SCHEMA_ID
196191
]
197192
mdo.bulk_upsert([metadata])
198193

199-
assert len(mdo.bulk_export([datarow.uid])[0].fields) == len(
194+
exported = mdo.bulk_export([datarow.uid])[0].fields
195+
assert len(exported) == len(
200196
set([x.schema_id for x in metadata.fields] +
201-
[x.schema_id for x in mdo.bulk_export([datarow.uid])[0].fields]))
197+
[x.schema_id for x in exported]))
202198

203199
mdo.bulk_delete([
204200
DeleteDataRowMetadata(data_row_id=datarow.uid, fields=[SPLIT_SCHEMA_ID])
205201
])
206-
assert len(mdo.bulk_export([datarow.uid])[0].fields) == 1
202+
exported = mdo.bulk_export([datarow.uid])[0].fields
203+
assert len(
204+
[f for f in exported if f.schema_id != PRE_COMPUTED_EMBEDDINGS_ID]) == 0
207205

208206

209207
def test_raise_enum_upsert_schema_error(datarow, mdo):
@@ -223,7 +221,7 @@ def test_upsert_non_existent_schema_id(datarow, mdo):
223221
metadata = DataRowMetadata(data_row_id=datarow.uid,
224222
fields=[
225223
DataRowMetadataField(
226-
schema_id=FAKE_SCHEMA_ID,
224+
schema_id=INVALID_SCHEMA_ID,
227225
value="message"),
228226
])
229227
with pytest.raises(ValueError):

0 commit comments

Comments (0)