Skip to content

Commit e2713d7

Browse files
authored
Image Chip Generator (#396)
1 parent 8a316e2 commit e2713d7

File tree

6 files changed

+372
-195
lines changed

6 files changed

+372
-195
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,13 @@ All notable changes to the [Nucleus Python Client](https://github.com/scaleapi/n
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [0.16.8](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.16.8) - 2023-11-13
9+
10+
### Added
11+
12+
- Added `dataset.items_and_annotation_chip_generator()` functionality to generate chips of images in s3 or locally.
13+
- Added `query` parameter for `dataset.items_and_annotation_generator()` to filter dataset items.
14+
815
## [0.16.7](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.16.7) - 2023-11-03
916

1017
### Added

nucleus/chip_utils.py

Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,202 @@
1+
"""Shared stateless utility function library for chipping images"""
2+
3+
import io
4+
import json
5+
import os
6+
from itertools import product
7+
from typing import Dict, List
8+
9+
import boto3
10+
import numpy as np
11+
from botocore.exceptions import ClientError
12+
from PIL import Image
13+
14+
from .constants import (
15+
ANNOTATION_LOCATION_KEY,
16+
BOX_TYPE,
17+
GEOMETRY_KEY,
18+
HEIGHT_KEY,
19+
IMAGE_LOCATION_KEY,
20+
LABEL_KEY,
21+
TYPE_KEY,
22+
WIDTH_KEY,
23+
X_KEY,
24+
Y_KEY,
25+
)
26+
27+
28+
def split_s3_bucket_key(s3_path: str):
    """Split an S3 path of the form ``s3://bucket/key`` into (bucket, key).

    Anything before a ``//`` (the scheme) is discarded; the remainder is
    split on its first ``/`` into the bucket name and the object key.
    """
    without_scheme = s3_path.split("//", 1)[-1]
    bucket, key = without_scheme.split("/", 1)
    return bucket, key
31+
32+
33+
def fetch_image(s3_url: str):
    """Download an object from S3 and open it as a PIL image.

    Args:
        s3_url: Location of the image, in the form ``s3://bucket/key``.

    Returns:
        The opened ``PIL.Image.Image``.
    """
    bucket, key = split_s3_bucket_key(s3_url)
    body = boto3.resource("s3").Bucket(bucket).Object(key).get()["Body"]
    return Image.open(body)
39+
40+
41+
def fetch_chip(ref_id: str):
    """Locate a previously cached chip image and its annotations.

    When *ref_id* starts with ``s3`` the objects are looked up in S3,
    otherwise on the local filesystem. A missing file is reported as
    ``None`` in the corresponding slot; if the image itself is missing,
    the annotations are not looked up at all.

    Args:
        ref_id: Path prefix (without extension) of the chip files.

    Returns:
        Tuple ``(image_location, annotation_location)``; either element
        is ``None`` when the corresponding file does not exist.
    """
    image_path = ref_id + ".jpeg"
    annotation_path = ref_id + ".json"
    if not ref_id.startswith("s3"):
        return (
            image_path if os.path.exists(image_path) else None,
            annotation_path if os.path.exists(annotation_path) else None,
        )
    bucket_name, key_prefix = split_s3_bucket_key(ref_id)
    bucket = boto3.resource("s3").Bucket(bucket_name)
    # load() issues a HEAD request; ClientError means the object is absent.
    try:
        bucket.Object(key_prefix + ".jpeg").load()
    except ClientError:
        return None, None
    try:
        bucket.Object(key_prefix + ".json").load()
    except ClientError:
        return image_path, None
    return image_path, annotation_path
80+
81+
82+
def write_chip(
    ref_id: str, image: "Image.Image", annotations: List[Dict[str, str]]
):
    """Persist a chip image (jpeg) and its annotations (json).

    When *ref_id* starts with ``s3`` the files are uploaded to S3,
    otherwise they are written to the local filesystem (creating parent
    directories as needed). The annotation file is only written when
    *annotations* is non-empty.

    Args:
        ref_id: Path prefix (without extension) for the chip files.
        image: Chip image to save.
        annotations: Box annotations belonging to the chip.

    Returns:
        Tuple ``(image_location, annotation_location)``; the annotation
        location is ``None`` when there were no annotations to write.
    """
    if ref_id.startswith("s3"):
        s3_bucket, s3_key = split_s3_bucket_key(ref_id)
        bucket = boto3.resource("s3").Bucket(s3_bucket)
        byteio = io.BytesIO()
        image.save(byteio, format="jpeg")
        byteio.seek(0)
        bucket.Object(s3_key + ".jpeg").upload_fileobj(byteio)
        annotation_loc = None
        if annotations:
            bucket.Object(s3_key + ".json").put(
                Body=json.dumps(annotations, ensure_ascii=False).encode(
                    "UTF-8"
                ),
                ContentType="application/json",
            )
            annotation_loc = ref_id + ".json"
        return ref_id + ".jpeg", annotation_loc
    # Local filesystem: create only the true parent directory. The previous
    # rsplit-based logic made a spurious directory named after ref_id itself
    # when ref_id contained no "/".
    parent_dir = os.path.dirname(ref_id)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    image_loc = ref_id + ".jpeg"
    annotation_loc = None
    image.save(image_loc)
    if annotations:
        annotation_loc = ref_id + ".json"
        with open(annotation_loc, "w", encoding="utf-8") as f:
            json.dump(annotations, f, ensure_ascii=False)
    return image_loc, annotation_loc
116+
117+
118+
def generate_offsets(w: int, h: int, chip_size: int, stride_size: int):
    """Yield ``(y, x)`` top-left offsets tiling a ``w``-by-``h`` image.

    Consecutive offsets along each axis are ``chip_size - stride_size``
    apart (so ``stride_size`` behaves as the overlap between adjacent
    chips, in pixels — NOTE(review): the public docstring describes
    stride as the distance moved; confirm which wording is intended).
    Whenever more than one offset is needed on an axis, the offsets are
    rescaled so the final chip ends flush with the image border.

    Args:
        w: Image width in pixels.
        h: Image height in pixels.
        chip_size: Side length of each square chip.
        stride_size: Overlap (per the step computation) between chips.

    Yields:
        ``(y, x)`` integer offset pairs, row-major.
    """
    step = chip_size - stride_size
    x_offsets = np.arange(0, w - stride_size, step)
    y_offsets = np.arange(0, h - stride_size, step)
    if len(x_offsets) > 1:
        x_offsets = np.round(
            x_offsets * (w - chip_size) / x_offsets[-1]
        ).astype(int)
    if len(y_offsets) > 1:
        y_offsets = np.round(
            y_offsets * (h - chip_size) / y_offsets[-1]
        ).astype(int)
    yield from product(y_offsets, x_offsets)
126+
127+
128+
def chip_annotations(data, x0: int, y0: int, x1: int, y1: int):
    """Clamp and translate box annotations into chip-local coordinates.

    Each box is intersected with the chip rectangle ``(x0, y0, x1, y1)``
    and shifted so the chip's top-left corner becomes the origin. Boxes
    whose intersection with the chip has zero area are dropped.

    Args:
        data: Iterable of annotation dicts containing a box geometry.
        x0: Left edge of the chip.
        y0: Top edge of the chip.
        x1: Right edge of the chip.
        y1: Bottom edge of the chip.

    Returns:
        A list of adjusted annotation dicts that overlap the chip.
    """

    def _clamp(value, low, high):
        # Restrict value to [low, high].
        return max(min(value, high), low)

    chipped = []
    for annotation in data:
        box = annotation[GEOMETRY_KEY].copy()
        right = box[X_KEY] + box[WIDTH_KEY]
        bottom = box[Y_KEY] + box[HEIGHT_KEY]
        new_x = _clamp(box[X_KEY], x0, x1) - x0
        new_right = _clamp(right, x0, x1) - x0
        new_y = _clamp(box[Y_KEY], y0, y1) - y0
        new_bottom = _clamp(bottom, y0, y1) - y0
        new_width = new_right - new_x
        new_height = new_bottom - new_y
        # Zero-area boxes lie entirely outside the chip; drop them.
        if new_width * new_height > 0:
            chipped.append(
                {
                    LABEL_KEY: annotation[LABEL_KEY],
                    TYPE_KEY: BOX_TYPE,
                    GEOMETRY_KEY: {
                        X_KEY: new_x,
                        Y_KEY: new_y,
                        WIDTH_KEY: new_width,
                        HEIGHT_KEY: new_height,
                    },
                }
            )
    return chipped
171+
172+
173+
def process_chip(chip_arg):
    """Produce (or reuse from cache) one chip at a given offset.

    Args:
        chip_arg: Tuple of ``(offset, chip_size, w, h, item_ref_id,
            cache_directory, image, annotations)`` as assembled by the
            chip generator. ``offset`` is a ``(y, x)`` pair as yielded
            by ``generate_offsets``.

    Returns:
        Dict with the chip's image location and annotation location
        (the latter is ``None`` when no annotations overlap the chip).
    """
    (
        offset,
        chip_size,
        w,
        h,
        item_ref_id,
        cache_directory,
        image,
        annotations,
    ) = chip_arg
    # FIX: generate_offsets yields product(ys, xs), i.e. (y, x) pairs; the
    # previous unpacking "x0, y0 = ..." swapped the axes, producing wrong
    # crops (and missing coverage) on non-square images.
    y0, x0 = map(int, offset)
    x1 = min(x0 + chip_size, w)
    y1 = min(y0 + chip_size, h)
    ref_id = f"{cache_directory}/{item_ref_id}_{x0}_{y0}_{x1}_{y1}"
    # Reuse a previously written chip when one is already cached.
    chipped_image_loc, chipped_annotation_loc = fetch_chip(ref_id)
    if chipped_image_loc:
        return {
            IMAGE_LOCATION_KEY: chipped_image_loc,
            ANNOTATION_LOCATION_KEY: chipped_annotation_loc,
        }
    chipped_image = image.crop((x0, y0, x1, y1))
    chipped_annotations = chip_annotations(annotations, x0, y0, x1, y1)
    chipped_image_loc, chipped_annotation_loc = write_chip(
        ref_id, chipped_image, chipped_annotations
    )
    return {
        IMAGE_LOCATION_KEY: chipped_image_loc,
        ANNOTATION_LOCATION_KEY: chipped_annotation_loc,
    }

nucleus/constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,8 @@
7171
IMAGE_KEY = "image"
7272
IMAGE_LOCATION_KEY = "image_location"
7373
IMAGE_URL_KEY = "image_url"
74+
PROCESSED_URL_KEY = "processed_url"
75+
ANNOTATION_LOCATION_KEY = "annotation_location"
7476
INDEX_KEY = "index"
7577
INDEX_ID_KEY = "index_id"
7678
INDEX_CONTINUOUS_ENABLE_KEY = "enable"

nucleus/dataset.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import datetime
22
import os
33
from enum import Enum
4+
from multiprocessing import Pool
45
from typing import (
56
TYPE_CHECKING,
67
Any,
@@ -17,6 +18,7 @@
1718

1819
from nucleus.annotation_uploader import AnnotationUploader, PredictionUploader
1920
from nucleus.async_job import AsyncJob, EmbeddingsExportJob
21+
from nucleus.chip_utils import fetch_image, generate_offsets, process_chip
2022
from nucleus.embedding_index import EmbeddingIndex
2123
from nucleus.evaluation_match import EvaluationMatch
2224
from nucleus.prediction import from_json as prediction_from_json
@@ -36,6 +38,7 @@
3638
ANNOTATIONS_KEY,
3739
AUTOTAG_SCORE_THRESHOLD,
3840
BACKFILL_JOB_KEY,
41+
BOX_TYPE,
3942
DATASET_ID_KEY,
4043
DATASET_IS_SCENE_KEY,
4144
DATASET_ITEM_IDS_KEY,
@@ -46,13 +49,16 @@
4649
EXPORT_FOR_TRAINING_KEY,
4750
EXPORTED_ROWS,
4851
FRAME_RATE_KEY,
52+
ITEM_KEY,
4953
ITEMS_KEY,
5054
JOB_REQ_LIMIT,
5155
KEEP_HISTORY_KEY,
5256
MAX_ES_PAGE_SIZE,
5357
MESSAGE_KEY,
5458
NAME_KEY,
5559
OBJECT_IDS_KEY,
60+
PROCESSED_URL_KEY,
61+
REFERENCE_ID_KEY,
5662
REFERENCE_IDS_KEY,
5763
REQUEST_ID_KEY,
5864
SCENE_IDS_KEY,
@@ -1413,9 +1419,13 @@ def items_and_annotations(
14131419

14141420
def items_and_annotation_generator(
14151421
self,
1422+
query: Optional[str] = None,
14161423
) -> Iterable[Dict[str, Union[DatasetItem, Dict[str, List[Annotation]]]]]:
14171424
"""Provides a generator of all DatasetItems and Annotations in the dataset.
14181425
1426+
Args:
1427+
query: Structured query compatible with the `Nucleus query language <https://nucleus.scale.com/docs/query-language-reference>`_.
1428+
14191429
Returns:
14201430
Generator where each element is a dict containing the DatasetItem
14211431
and all of its associated Annotations, grouped by type.
@@ -1439,11 +1449,72 @@ def items_and_annotation_generator(
14391449
endpoint=f"dataset/{self.id}/exportForTrainingPage",
14401450
result_key=EXPORT_FOR_TRAINING_KEY,
14411451
page_size=10000, # max ES page size
1452+
query=query,
14421453
)
14431454
for data in json_generator:
14441455
for ia in convert_export_payload([data], has_predictions=False):
14451456
yield ia
14461457

1458+
def items_and_annotation_chip_generator(
    self,
    chip_size: int,
    stride_size: int,
    cache_directory: str,
    query: Optional[str] = None,
) -> Iterable[Dict[str, str]]:
    """Provides a generator of chips for all DatasetItems and BoxAnnotations in the dataset.

    A chip is an image created by tiling a source image.

    Args:
        chip_size: The size of the image chip
        stride_size: The distance to move when creating the next image chip.
            When stride is equal to chip size, there will be no overlap.
            When stride is equal to half the chip size, there will be 50 percent overlap.
            NOTE(review): ``generate_offsets`` steps by ``chip_size - stride_size``,
            which reads as ``stride_size`` being the overlap in pixels —
            confirm the wording above against the implementation.
        cache_directory: The s3 or local directory to store the image and annotations of a chip.
            s3 directories must be in the format s3://s3-bucket/s3-key
        query: Structured query compatible with the `Nucleus query language <https://nucleus.scale.com/docs/query-language-reference>`_.

    Returns:
        Generator where each element is a dict containing the location of the image chip (jpeg) and its annotations (json).
        ::

            Iterable[{
                "image_location": str,
                "annotation_location": str
            }]
    """
    exported_rows = paginate_generator(
        client=self._client,
        endpoint=f"dataset/{self.id}/exportForTrainingPage",
        result_key=EXPORT_FOR_TRAINING_KEY,
        page_size=10000,  # max ES page size
        query=query,
        chip=True,
    )
    for row in exported_rows:
        source_image = fetch_image(row[ITEM_KEY][PROCESSED_URL_KEY])
        width, height = source_image.size
        box_annotations = row[BOX_TYPE]
        item_ref_id = row[ITEM_KEY][REFERENCE_ID_KEY]
        # One argument tuple per chip offset; workers cache/crop/write chips.
        chip_args = [
            (
                offset,
                chip_size,
                width,
                height,
                item_ref_id,
                cache_directory,
                source_image,
                box_annotations,
            )
            for offset in generate_offsets(
                width, height, chip_size, stride_size
            )
        ]
        with Pool() as pool:
            yield from pool.imap(process_chip, chip_args)
1517+
14471518
def export_embeddings(
14481519
self,
14491520
asynchronous: bool = True,

0 commit comments

Comments
 (0)