Skip to content

Commit 9b1f15f

Browse files
author
Matt Sokoloff
committed
recommended
1 parent 7432b66 commit 9b1f15f

File tree

7 files changed

+111
-102
lines changed

7 files changed

+111
-102
lines changed

labelbox/data/serialization/coco/annotation.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
1-
from pydantic import BaseModel
2-
from collections import defaultdict
31
from typing import Tuple, List, Union
2+
from pathlib import Path
3+
from collections import defaultdict
4+
5+
from pydantic import BaseModel
46
import numpy as np
57

68

7-
def rle_decoding(rle_arr, w, h):
9+
def rle_decoding(rle_arr : List[int], w : int, h: int) -> np.ndarray:
810
indices = []
911
for idx, cnt in zip(rle_arr[0::2], rle_arr[1::2]):
1012
indices.extend(list(range(idx - 1,
@@ -51,5 +53,5 @@ class COCOObjectAnnotation(BaseModel):
5153
class PanopticAnnotation(BaseModel):
5254
# One to one relationship between image and panoptic annotation
5355
image_id: int
54-
file_name: str
56+
file_name: Path
5557
segments_info: List[SegmentInfo]
Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1+
import sys
2+
13
from pydantic import BaseModel
2-
from typing import Optional
3-
import hashlib
44

55

66
class Categories(BaseModel):
@@ -11,4 +11,4 @@ class Categories(BaseModel):
1111

1212

1313
def hash_category_name(name: str) -> int:
    """Map a category name to a stable, non-negative integer id.

    The builtin ``hash`` is salted per interpreter process for ``str``
    values, so ``hash(name) + sys.maxsize`` yields a different id on every
    run (and is not guaranteed to be non-negative). Ids written into a
    dataset must be reproducible, so the id is derived from a SHA-256 digest
    of the UTF-8 encoded name instead.

    Args:
        name: Category name to hash.

    Returns:
        An integer in ``[0, 2**64)`` that is identical across processes and
        runs for the same ``name``.
    """
    # Imported locally so this function does not depend on a module-level
    # ``hashlib`` import being present.
    import hashlib

    digest = hashlib.sha256(name.encode("utf-8")).digest()
    # The first 8 bytes keep the id within an unsigned 64-bit range.
    return int.from_bytes(digest[:8], "big")

labelbox/data/serialization/coco/converter.py

Lines changed: 23 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,23 @@
11
from typing import Dict, Any
2+
from pathlib import Path
23
import os
34

45
from labelbox.data.annotation_types.collection import LabelCollection, LabelGenerator
56
from labelbox.data.serialization.coco.instance_dataset import CocoInstanceDataset
67
from labelbox.data.serialization.coco.panoptic_dataset import CocoPanopticDataset
78

89

9-
def create_path_if_not_exists(path: Path, ignore_existing_data=False):
    """Ensure that the directory ``path`` exists and, by default, is empty.

    Args:
        path: Directory to create. Plain strings are accepted as well as
            ``Path`` objects so that callers written against the previous
            ``str`` signature keep working.
        ignore_existing_data: When False, an existing directory that already
            contains entries is rejected.

    Raises:
        ValueError: If the directory exists, is not empty, and
            ``ignore_existing_data`` is False.
    """
    # Coerce once so both ``str`` and ``Path`` callers are supported.
    path = Path(path)
    if not path.exists():
        path.mkdir(parents=True, exist_ok=True)
    elif not ignore_existing_data and any(path.iterdir()):
        raise ValueError(
            f"Directory `{path}`` must be empty. Or set `ignore_existing_data=True`"
        )
1617

1718

18-
def validate_path(path: Path, name: str):
    """Raise if ``path`` does not exist on disk.

    Args:
        path: Location to check. Plain strings are accepted as well as
            ``Path`` objects so that callers written against the previous
            untyped signature keep working.
        name: Human-readable label for the path, used in the error message.

    Raises:
        ValueError: If ``path`` does not exist.
    """
    # Coerce so a ``str`` argument does not fail with AttributeError.
    if not Path(path).exists():
        raise ValueError(f"{name} `{path}` must exist")
2122

2223

@@ -28,10 +29,10 @@ class COCOConverter:
2829
Subclasses are currently ignored.
2930
To use subclasses, manually flatten them before using the converter.
3031
"""
31-
32+
@staticmethod
3233
def serialize_instances(labels: LabelCollection,
33-
image_root: str,
34-
ignore_existing_data=False) -> Dict[str, Any]:
34+
image_root: Path,
35+
ignore_existing_data=False, max_workers = 8) -> Dict[str, Any]:
3536
"""
3637
Convert a Labelbox LabelCollection into an mscoco dataset.
3738
This function will only convert masks, polygons, and rectangles.
@@ -43,18 +44,20 @@ def serialize_instances(labels: LabelCollection,
4344
image_root: Where to save images to
4445
ignore_existing_data: Whether or not to raise an exception if images already exist.
4546
This exists only to support detectons panoptic fpn model which requires two mscoco payloads for the same images.
47+
max_workers : Number of workers to process dataset with
4648
Returns:
4749
A dictionary containing labels in the coco object format.
4850
"""
4951
create_path_if_not_exists(image_root, ignore_existing_data)
5052
return CocoInstanceDataset.from_common(labels=labels,
51-
image_root=image_root).dict()
53+
image_root=image_root, max_workers = max_workers).dict()
5254

55+
@staticmethod
5356
def serialize_panoptic(labels: LabelCollection,
54-
image_root: str,
55-
mask_root: str,
57+
image_root: Path,
58+
mask_root: Path,
5659
all_stuff: bool = False,
57-
ignore_existing_data=False) -> Dict[str, Any]:
60+
ignore_existing_data=False, max_workers = 8) -> Dict[str, Any]:
5861
"""
5962
Convert a Labelbox LabelCollection into an mscoco dataset.
6063
This function will only convert masks, polygons, and rectangles.
@@ -69,6 +72,7 @@ def serialize_panoptic(labels: LabelCollection,
6972
To convert them to stuff class set `all_stuff=True`.
7073
ignore_existing_data: Whether or not to raise an exception if images already exist.
7174
This exists only to support detectons panoptic fpn model which requires two mscoco payloads for the same images.
75+
max_workers : Number of workers to process dataset with
7276
Returns:
7377
A dictionary containing labels in the coco panoptic format.
7478
"""
@@ -77,11 +81,11 @@ def serialize_panoptic(labels: LabelCollection,
7781
return CocoPanopticDataset.from_common(labels=labels,
7882
image_root=image_root,
7983
mask_root=mask_root,
80-
all_stuff=all_stuff).dict()
84+
all_stuff=all_stuff , max_workers = max_workers).dict()
8185

82-
@classmethod
83-
def deserialize_panoptic(cls, json_data: Dict[str, Any], image_root: str,
84-
mask_root: str) -> LabelGenerator:
86+
@staticmethod
87+
def deserialize_panoptic(json_data: Dict[str, Any], image_root: Path,
88+
mask_root: Path) -> LabelGenerator:
8589
"""
8690
Convert coco panoptic data into the labelbox format (as a LabelGenerator).
8791
@@ -98,9 +102,9 @@ def deserialize_panoptic(cls, json_data: Dict[str, Any], image_root: str,
98102
gen = objs.to_common(image_root, mask_root)
99103
return LabelGenerator(data=gen)
100104

101-
@classmethod
102-
def deserialize_instances(cls, json_data: Dict[str, Any],
103-
image_root: str) -> LabelGenerator:
105+
@staticmethod
106+
def deserialize_instances(json_data: Dict[str, Any],
107+
image_root: Path) -> LabelGenerator:
104108
"""
105109
Convert coco object data into the labelbox format (as a LabelGenerator).
106110

labelbox/data/serialization/coco/image.py

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,51 +1,51 @@
1-
from labelbox.data.annotation_types import Label
2-
from pydantic import BaseModel
1+
from pathlib import Path
2+
33
from typing import Optional, Tuple, Union
4-
import os
4+
from pydantic import BaseModel
55
from PIL import Image
6-
import hashlib
76
import imagesize
87

8+
from labelbox.data.annotation_types import Label
9+
910

1011
class CocoImage(BaseModel):
    """Image entry of a COCO dataset (see https://cocodataset.org/#format-data)."""

    # Identifier of the image within the dataset.
    id: int
    # Image dimensions in pixels.
    width: int
    height: int
    # File name of the image.
    file_name: Path
    # Optional metadata; all default to None when not supplied.
    license: Optional[int] = None
    flickr_url: Optional[str] = None
    coco_url: Optional[str] = None
18-
#date_captured: datetime
1919

2020

21-
def get_image_id(label: Label, idx: int) -> Union[int, str]:
    """Pick the image id for a label.

    When the label's ``file_path`` is a purely numeric name (optionally
    followed by a ``.jpg`` extension), that numeric name is used as the id so
    the file name on disk is preserved. Otherwise the positional ``idx`` is
    used.

    Args:
        label: Label whose ``data.file_path`` is inspected.
        idx: Fallback id, returned when no numeric file name is available.

    Returns:
        The numeric file name as a string (leading zeros are kept), or
        ``idx``.
    """
    file_path = label.data.file_path
    if file_path is not None:
        extension = ".jpg"
        # Strip the extension only when it is a real suffix; removing every
        # ".jpg" occurrence would turn e.g. "1.jpg2" into the bogus id "12".
        if file_path.endswith(extension):
            file_name = file_path[:-len(extension)]
        else:
            file_name = file_path
        if file_name.isdecimal():
            return file_name
    return idx
2727

2828

29-
def get_image(label: Label, image_root: Path, image_id: str) -> CocoImage:
    """Build the ``CocoImage`` record for a label, writing the image if needed.

    The image is expected at ``<image_root>/<image_id>.jpg``. If that file
    does not exist yet, it is created from ``label.data.value``; otherwise
    the dimensions are read from the existing file.

    Args:
        label: Label providing the pixel data via ``label.data.value``.
        image_root: Directory holding the exported images.
        image_id: Id of the image; also used as the file stem.

    Returns:
        A ``CocoImage`` whose ``file_name`` is the bare file name (no
        directory component).
    """
    path = Path(image_root) / f"{image_id}.jpg"
    if path.exists():
        # File is already on disk: query its dimensions instead of rewriting it.
        width, height = imagesize.get(str(path))
    else:
        im = Image.fromarray(label.data.value)
        im.save(path)
        width, height = im.size
    return CocoImage(
        id=image_id,
        width=width,
        height=height,
        file_name=Path(path.name),
    )
4141

4242

4343
def id_to_rgb(id: int) -> Tuple[int, int, int]:
    """Split an integer id into its three base-256 digits.

    The least significant digit is the red channel, followed by green and
    blue. Only the lowest three base-256 digits are kept.

    Args:
        id: Integer id to encode.

    Returns:
        A ``(red, green, blue)`` tuple with each component in ``[0, 255]``.
        A tuple (not a list) is returned so the result matches the declared
        return type and stays hashable.
    """
    red = id % 256
    green = (id // 256) % 256
    blue = (id // (256 * 256)) % 256
    return red, green, blue
4949

5050

5151
def rgb_to_id(red: int, green: int, blue: int) -> int:

labelbox/data/serialization/coco/instance_dataset.py

Lines changed: 24 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,22 @@
11
# https://cocodataset.org/#format-data
22

3-
from labelbox.data.serialization.coco.categories import Categories, hash_category_name
4-
from labelbox.data.serialization.coco.annotation import COCOObjectAnnotation, RLE, get_annotation_lookup, rle_decoding
5-
from labelbox.data.serialization.coco.image import CocoImage, get_image, get_image_id
6-
from typing import Any, Dict, List
7-
from pydantic import BaseModel
8-
from ...annotation_types import ImageData, MaskData, Mask, ObjectAnnotation, Label, Polygon, Point, Rectangle
3+
from concurrent.futures import ProcessPoolExecutor, as_completed
4+
from typing import Any, Dict, List, Tuple
5+
from pathlib import Path
6+
97
import numpy as np
10-
from PIL import Image
118
from tqdm import tqdm
12-
import os
13-
from concurrent.futures import ProcessPoolExecutor, as_completed
9+
from pydantic import BaseModel
10+
11+
from ...annotation_types import ImageData, MaskData, Mask, ObjectAnnotation, Label, Polygon, Point, Rectangle
1412
from ...annotation_types.collection import LabelCollection
13+
from .categories import Categories, hash_category_name
14+
from .annotation import COCOObjectAnnotation, RLE, get_annotation_lookup, rle_decoding
15+
from .image import CocoImage, get_image, get_image_id
1516

1617

17-
def mask_to_coco_object_annotation(annotation: ObjectAnnotation, annot_idx,
18-
image_id, category_id):
18+
def mask_to_coco_object_annotation(annotation: ObjectAnnotation, annot_idx : int,
19+
image_id : int, category_id : int) -> COCOObjectAnnotation:
1920
# This is going to fill any holes into the multipolygon
2021
# If you need to support holes use the panoptic data format
2122
shapely = annotation.value.shapely.simplify(1).buffer(0)
@@ -37,8 +38,8 @@ def mask_to_coco_object_annotation(annotation: ObjectAnnotation, annot_idx,
3738
iscrowd=0)
3839

3940

40-
def vector_to_coco_object_annotation(annotation: ObjectAnnotation, annot_idx,
41-
image_id: int, category_id):
41+
def vector_to_coco_object_annotation(annotation: ObjectAnnotation, annot_idx : int,
42+
image_id: int, category_id: int) -> COCOObjectAnnotation:
4243
shapely = annotation.value.shapely
4344
xmin, ymin, xmax, ymax = shapely.bounds
4445
segmentation = []
@@ -61,15 +62,15 @@ def vector_to_coco_object_annotation(annotation: ObjectAnnotation, annot_idx,
6162
iscrowd=0)
6263

6364

64-
def rle_to_common(class_annotations: COCOObjectAnnotation,
                  class_name: str) -> ObjectAnnotation:
    """Convert an RLE-encoded COCO annotation into a mask ``ObjectAnnotation``.

    Args:
        class_annotations: COCO annotation whose ``segmentation`` carries
            the RLE ``counts`` and ``size``.
        class_name: Name assigned to the resulting annotation.

    Returns:
        An ``ObjectAnnotation`` wrapping the decoded mask, with mask color
        ``[1, 1, 1]``.
    """
    rle = class_annotations.segmentation
    # ``size`` is reversed before being unpacked into rle_decoding's
    # trailing positional arguments.
    decoded = rle_decoding(rle.counts, *rle.size[::-1])
    mask_value = Mask(mask=MaskData.from_2D_arr(decoded), color=[1, 1, 1])
    return ObjectAnnotation(name=class_name, value=mask_value)
7071

7172

72-
def segmentations_to_common(class_annotations, class_name):
73+
def segmentations_to_common(class_annotations : COCOObjectAnnotation, class_name: str) -> List[ObjectAnnotation]:
7374
# Technically it is polygons. But the key in coco is called segmentations..
7475
annotations = []
7576
for points in class_annotations.segmentation:
@@ -83,9 +84,9 @@ def segmentations_to_common(class_annotations, class_name):
8384

8485

8586
def process_label(label: Label,
86-
idx,
87-
image_root,
88-
max_annotations_per_image=10000):
87+
idx : int,
88+
image_root :str,
89+
max_annotations_per_image=10000) -> Tuple[np.ndarray, List[COCOObjectAnnotation], Dict[str, str]]:
8990
annot_idx = idx * max_annotations_per_image
9091
image_id = get_image_id(label, idx)
9192
image = get_image(label, image_root, image_id)
@@ -118,14 +119,14 @@ class CocoInstanceDataset(BaseModel):
118119
categories: List[Categories]
119120

120121
@classmethod
121-
def from_common(cls, labels: LabelCollection, image_root):
122+
def from_common(cls, labels: LabelCollection, image_root : Path, max_workers = 8):
122123
all_coco_annotations = []
123124
categories = {}
124125
images = []
125126
futures = []
126127
coco_categories = {}
127128

128-
with ProcessPoolExecutor(max_workers=8) as exc:
129+
with ProcessPoolExecutor(max_workers=max_workers) as exc:
129130
futures = [
130131
exc.submit(process_label, label, idx, image_root)
131132
for idx, label in enumerate(labels)
@@ -161,13 +162,13 @@ def to_common(self, image_root):
161162
annotation_lookup = get_annotation_lookup(self.annotations)
162163

163164
for image in self.images:
164-
im_path = os.path.join(image_root, image.file_name)
165-
if not os.path.exists(im_path):
165+
im_path = Path(image_root, image.file_name)
166+
if not im_path.exists():
166167
raise ValueError(
167168
f"Cannot find file {im_path}. Make sure `image_root` is set properly"
168169
)
169170

170-
data = ImageData(file_path=im_path)
171+
data = ImageData(file_path=str(im_path))
171172
annotations = []
172173
for class_annotations in annotation_lookup[image.id]:
173174
if isinstance(class_annotations.segmentation, RLE):

0 commit comments

Comments
 (0)