
Commit 5096fc1

Author: Matt Sokoloff
Message: clean up
Parent: 4416109

File tree: 5 files changed (+121 additions, -65 deletions)

labelbox/schema/asset_attachment.py

Lines changed: 18 additions & 0 deletions

@@ -1,4 +1,5 @@
 from enum import Enum
+from typing import Dict

 from labelbox.orm.db_object import DbObject
 from labelbox.orm.model import Field

@@ -24,3 +25,20 @@ class AttachmentType(Enum):

     attachment_type = Field.String("attachment_type", "type")
     attachment_value = Field.String("attachment_value", "value")
+
+    @classmethod
+    def validate_attachment_json(cls, attachment_json: Dict[str, str]) -> None:
+        for required_key in ['type', 'value']:
+            if required_key not in attachment_json:
+                raise ValueError(
+                    f"Must provide a `{required_key}` key for each attachment. Found {attachment_json}."
+                )
+        cls.validate_attachment_type(attachment_json['type'])
+
+    @classmethod
+    def validate_attachment_type(cls, attachment_type: str) -> None:
+        valid_types = set(cls.AttachmentType.__members__)
+        if attachment_type not in valid_types:
+            raise ValueError(
+                f"meta_type must be one of {valid_types}. Found {attachment_type}"
+            )
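
The two new classmethods give callers a single place to validate attachment payloads before any request is made. A minimal usage sketch, assuming a standard labelbox install; the payload dicts below are illustrative, not from the commit:

from labelbox.schema.asset_attachment import AssetAttachment

# A well-formed attachment passes silently.
AssetAttachment.validate_attachment_json({"type": "TEXT", "value": "a note"})

# Missing the `value` key raises:
#   ValueError: Must provide a `value` key for each attachment. Found {'type': 'TEXT'}.
AssetAttachment.validate_attachment_json({"type": "TEXT"})

# An unknown type name raises:
#   ValueError: meta_type must be one of {...}. Found NOT_A_TYPE
AssetAttachment.validate_attachment_type("NOT_A_TYPE")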

labelbox/schema/data_row.py

Lines changed: 3 additions & 9 deletions

@@ -42,10 +42,8 @@ class DataRow(DbObject, Updateable, BulkDeletable):
     labels = Relationship.ToMany("Label", True)
     attachments = Relationship.ToMany("AssetAttachment", False, "attachments")

-    supported_meta_types = supported_attachment_types = {
-        attachment_type.value
-        for attachment_type in AssetAttachment.AttachmentType
-    }
+    supported_meta_types = supported_attachment_types = set(
+        AssetAttachment.AttachmentType.__members__)

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

@@ -103,11 +101,7 @@ def create_attachment(self, attachment_type, attachment_value):
         Raises:
             ValueError: asset_type must be one of the supported types.
         """
-
-        if attachment_type not in self.supported_attachment_types:
-            raise ValueError(
-                f"meta_type must be one of {self.supported_attachment_types}. Found {attachment_type}"
-            )
+        AssetAttachment.validate_attachment_type(attachment_type)

         attachment_type_param = "type"
         attachment_value_param = "value"
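
With validation delegated to the shared classmethod, create_attachment rejects bad types with the same error message as the bulk path. A minimal sketch, assuming a connected client; the API key and dataset uid are placeholders:

from labelbox import Client

client = Client(api_key="<API_KEY>")           # placeholder key
dataset = client.get_dataset("<DATASET_UID>")  # placeholder uid
data_row = dataset.create_data_row(
    row_data="http://my_site.com/photos/img_01.jpg")

# "TEXT" is a member of AssetAttachment.AttachmentType, so validation passes.
data_row.create_attachment("TEXT", "reviewer note")

# "GIF" is not a member: ValueError is raised before any query is sent.
data_row.create_attachment("GIF", "http://example.com/a.gif")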

labelbox/schema/dataset.py

Lines changed: 51 additions & 53 deletions

@@ -75,27 +75,24 @@ def create_data_row(self, **kwargs):
         return self.client._create(DataRow, kwargs)

     def create_data_rows(self, items):
-
-        ## NOTE TODOS
-        """
-        Add attachments (works with all types)
-        Add external ids to bulk imports
-        improved error handling (why job was accepted or not)
-        """
         """ Creates multiple DataRow objects based on the given `items`.

         Each element in `items` can be either a `str` or a `dict`. If
         it is a `str`, then it is interpreted as a local file path. The file
         is uploaded to Labelbox and a DataRow referencing it is created.

         If an item is a `dict`, then it could support one of the two following structures
-            1. For static imagery, video, and text it should map `DataRow` fields (or their names) to values.
-               At the minimum an `item` passed as a `dict` must contain a `DataRow.row_data` key and value.
+            1. For static imagery, video, and text it should map `DataRow` field names to values.
+               At the minimum an `item` passed as a `dict` must contain a `row_data` key and value.
+               If the value for row_data is a local file path and the path exists,
+               then the local file will be uploaded to labelbox.
+
             2. For tiled imagery the dict must match the import structure specified in the link below
                https://docs.labelbox.com/data-model/en/index-en#tiled-imagery-import

         >>> dataset.create_data_rows([
         >>>     {DataRow.row_data:"http://my_site.com/photos/img_01.jpg"},
+        >>>     {DataRow.row_data:"/path/to/file1.jpg"},
         >>>     "path/to/file2.jpg",
         >>>     {"tileLayerUrl" : "http://", ...}
         >>>     ])

@@ -123,72 +120,72 @@ def create_data_rows(self, items):
         DataRow = Entity.DataRow

         def upload_if_necessary(item):
-            if isinstance(item, str):
-                item_url = self.client.upload_file(item)
-                item = {DataRow.row_data: item_url, DataRow.external_id: item}
-            elif isinstance(item, dict):
-                if os.path.exists(item['row_data']):
-                    item_url = self.client.upload_file(item['row_data'])
-                    parts = {
-                        DataRow.row_data:
-                            item_url,
-                        DataRow.external_id:
-                            item.get('external_id', item['row_data'])
-                    }
-                    attachments = item.get('attachments')
-                    if attachments:
-                        item = {**parts, **{'attachments': attachments}}
-                    else:
-                        item = parts
+            row_data = item['row_data']
+            if os.path.exists(row_data):
+                item_url = self.client.upload_file(item['row_data'])
+                item = {
+                    "row_data": item_url,
+                    "external_id": item.get('external_id', item['row_data']),
+                    "attachments": item.get('attachments', [])
+                }
             return item

         def validate_attachments(item):
             attachments = item.get('attachments')
             if attachments:
                 if isinstance(attachments, list):
                     for attachment in attachments:
-                        for required_key in ['type', 'value']:
-                            if required_key not in attachment:
-                                raise ValueError(
-                                    f"Must provide a `{required_key}` key for each attachment. Found {attachment}."
-                                )
-                        attachment_type = attachment.get('type')
-                        if attachment_type not in DataRow.supported_attachment_types:
-                            raise ValueError(
-                                f"meta_type must be one of {DataRow.supported_attachment_types}. Found {attachment_type}"
-                            )
+                        Entity.AssetAttachment.validate_attachment_json(
+                            attachment)
                 else:
                     raise ValueError(
                         f"Attachments must be a list. Found {type(attachments)}"
                     )
             return attachments

-        def convert_item(item):
-            # Don't make any changes to tms data
-            validate_attachments(item)
-            if "tileLayerUrl" in item:
-                return item
-
-            item = upload_if_necessary(item)
-            # Convert fields to string names.
-            item = {
-                key.name if isinstance(key, Field) else key: value
-                for key, value in item.items()
-            }
+        def format_row(item):
+            # Formats user input into a consistent dict structure
+            if isinstance(item, dict):
+                # Convert fields to strings
+                item = {
+                    key.name if isinstance(key, Field) else key: value
+                    for key, value in item.items()
+                }
+            elif isinstance(item, str):
+                # The main advantage of using a string over a dict is that the user is specifying
+                # that the file should exist locally.
+                # That info is lost after this section so we should check for it here.
+                if not os.path.exists(item):
+                    raise ValueError(f"Filepath {item} does not exist.")
+                item = {"row_data": item, "external_id": item}
+            return item

+        def validate_keys(item):
             if 'row_data' not in item:
                 raise InvalidQueryError(
                     "`row_data` missing when creating DataRow.")

-            # TODO: This is technically breaking. but also idt anyone is using the other fields.
             invalid_keys = set(item) - {
-                'row_data', 'external_id', 'attachments'
+                *{f.name for f in DataRow.fields()}, 'attachments'
             }
             if invalid_keys:
                 raise InvalidAttributeError(DataRow, invalid_keys)
+            return item
+
+        def convert_item(item):
+            # Don't make any changes to tms data
+            if "tileLayerUrl" in item:
+                validate_attachments(item)
+                return item
+            # Convert all payload variations into the same dict format
+            item = format_row(item)
+            # Make sure required keys exist (and there are no extra keys)
+            validate_keys(item)
+            # Make sure attachments are valid
+            validate_attachments(item)
+            # Upload any local file paths
+            item = upload_if_necessary(item)

-            # Item is valid, convert it to a dict {graphql_field_name: value}
-            # Need to change the name of DataRow.row_data to "data"
             return {
                 "data" if key == "row_data" else utils.camel_case(key): value
                 for key, value in item.items()

@@ -207,7 +204,8 @@ def convert_item(item):
         query_str = """mutation AppendRowsToDatasetPyApi($%s: ID!, $%s: String!){
             appendRowsToDataset(data:{datasetId: $%s, jsonFileUrl: $%s}
             ){ taskId accepted errorMessage } } """ % (dataset_param, url_param,
-                                                       dataset_param, url_param)
+                                                      dataset_param, url_param)
+
         res = self.client.execute(query_str, {
             dataset_param: self.uid,
             url_param: descriptor_url
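
After this change every payload shape funnels through the same pipeline: format_row normalizes strings and Field keys into plain dicts, validate_keys and validate_attachments reject malformed rows, and upload_if_necessary uploads local files. A usage sketch, assuming a connected client and dataset; the key, uid, paths, and URLs are placeholders:

from labelbox import Client

client = Client(api_key="<API_KEY>")           # placeholder key
dataset = client.get_dataset("<DATASET_UID>")  # placeholder uid

task = dataset.create_data_rows([
    # Remote asset with an attachment, checked via AssetAttachment.validate_attachment_json.
    {
        "row_data": "http://my_site.com/photos/img_01.jpg",
        "external_id": "img_01",
        "attachments": [{"type": "TEXT", "value": "a caption"}],
    },
    # Local file given as a dict: uploaded because the path exists on disk.
    {"row_data": "/path/to/file1.jpg"},
    # Local file given as a str: raises ValueError up front if the path does not exist.
    "path/to/file2.jpg",
])
task.wait_till_done()
assert task.status == "COMPLETE"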

tests/integration/test_data_rows.py

Lines changed: 44 additions & 2 deletions

@@ -44,8 +44,19 @@ def test_data_row_bulk_creation(dataset, rand_gen, image_url):
     task.wait_till_done()
     assert task.status == "COMPLETE"

+    task = dataset.create_data_rows([{
+        "row_data": fp.name,
+        'external_id': 'some_name'
+    }])
+    task.wait_till_done()
+    assert task.status == "COMPLETE"
+
+    task = dataset.create_data_rows([{"row_data": fp.name}])
+    task.wait_till_done()
+    assert task.status == "COMPLETE"
+
     data_rows = list(dataset.data_rows())
-    assert len(data_rows) == 3
+    assert len(data_rows) == 5
     url = ({data_row.row_data for data_row in data_rows} - {image_url}).pop()
     assert requests.get(url).content == data

@@ -64,7 +75,7 @@ def test_data_row_large_bulk_creation(dataset, image_url):
     assert task.status == "IN_PROGRESS"
     task.wait_till_done(timeout_seconds=120)
     assert task.status == "COMPLETE"
-    data_rows = len(list(dataset.data_rows())) == 5003
+    assert len(list(dataset.data_rows())) == 1000


 @pytest.mark.xfail(reason="DataRow.dataset() relationship not set")

@@ -168,3 +179,34 @@ def test_data_row_iteration(dataset, image_url) -> None:
     ])
     task.wait_till_done()
     assert next(dataset.data_rows())
+
+
+def test_data_row_attachments(dataset, image_url):
+    attachments = [("IMAGE", image_url), ("TEXT", "test-text"),
+                   ("IMAGE_OVERLAY", image_url), ("HTML", image_url)]
+    task = dataset.create_data_rows([{
+        "row_data": image_url,
+        "external_id": "test-id",
+        "attachments": [{
+            "type": attachment_type,
+            "value": attachment_value
+        }]
+    } for attachment_type, attachment_value in attachments])
+
+    task.wait_till_done()
+    assert task.status == "COMPLETE"
+    data_rows = list(dataset.data_rows())
+    assert len(data_rows) == len(attachments)
+    for data_row in data_rows:
+        assert len(list(data_row.attachments())) == 1
+        assert data_row.external_id == "test-id"
+
+    with pytest.raises(ValueError) as exc:
+        task = dataset.create_data_rows([{
+            "row_data": image_url,
+            "external_id": "test-id",
+            "attachments": [{
+                "type": "INVALID",
+                "value": "123"
+            }]
+        }])

tests/integration/test_dataset.py

Lines changed: 5 additions & 1 deletion

@@ -75,6 +75,11 @@ def test_get_data_row_for_external_id(dataset, rand_gen, image_url):
     dataset.create_data_row(row_data=image_url, external_id=external_id)
     assert len(dataset.data_rows_for_external_id(external_id)) == 2

+    task = dataset.create_data_rows(
+        [dict(row_data=image_url, external_id=external_id)])
+    task.wait_till_done()
+    assert len(dataset.data_rows_for_external_id(external_id)) == 3
+

 def test_upload_video_file(dataset, sample_video: str) -> None:
     """

@@ -104,4 +109,3 @@ def test_data_row_export(dataset, image_url):
     result = list(dataset.export_data_rows())
     assert len(result) == n_data_rows
     assert set(result) == ids
-
