Adding the pipeline for the task explanation and Llm #2190
Open
Bepitic wants to merge 50 commits into open-edge-platform:main from Bepitic:llm-pipeline
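In short, this PR adds a GPTVad model together with a GPTWrapper around the OpenAI chat-completions API, enabling zero-/few-shot visual anomaly detection by prompting a GPT-4-class vision model with reference and query images (see the diffs below).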
Commits (50, all by Bepitic):
adbca17 Add Task EXPLANATION and the visualization of images with description.
5611ec1 upd dataset task with explanation
8ed23a3 fix tasktype on metrics, depth, cataset, inferencer.
a463b5b Merge branch 'main' into llm-pipeline
d5baf6b fix lint on visualization/image
b7c8eaa Merge branch 'openvinotoolkit:main' into llm-pipeline
5b563d9 Merge branch 'llm-pipeline' of github.com:Bepitic/anomalib into llm-p…
bfd936e Fix formatting dataset
f541316 fix format data/base/depth
4e392a9 Fix formatting openvino_inferencer
5fc70ba fix formatting
75099af Add Explanation to error-msg.
e5040d3 OpenAI - VLM init
86ad803 Add wrapper to run OpenAI
3678f72 add in ppyproject
7413842 Add Test and fix description/title
dc42cbd Add Readme and fix bug.
5788d22 Update src/anomalib/models/image/openai_vlm/lightning_model.py
e4f6bec Update src/anomalib/models/image/openai_vlm/__init__.py
5437467 Add fix pipeline bug.
982c9ca Add test.
642fd26 Merge branch 'OpenAI-VLM' of github.com:Bepitic/anomalib into OpenAI-VLM
b8cacf0 add changes
0929dc9 Add integration test and unit test + skip export.
39cf996 change to LANGUAGE
671693d Update images in Readme.
224118b Update src/anomalib/models/image/chatgpt_vision/__init__.py
b703a41 Update src/anomalib/models/image/chatgpt_vision/chatgpt.py
24c5486 Update src/anomalib/models/image/chatgpt_vision/lightning_model.py
68e757e Update tests/integration/model/test_models.py
86714a1 Update src/anomalib/models/image/chatgpt_vision/lightning_model.py
196d2a3 Update src/anomalib/models/image/chatgpt_vision/lightning_model.py
b7f345a fix comments
b285d10 remove last file of chatgpt_vision.
a688530 fix tests
0fb5f79 Merge pull request #1 from Bepitic/OpenAI-VLM (GPTVad)
6503543 Merge branch 'main' into llm-pipeline
8e92e5e Update src/anomalib/models/image/gptvad/chatgpt.py
5ab044d upd: language -> VISUAL_PROMPTING
3f9ca93 fix visual prompting and model_name
391b4c4 fix GPT for Gpt and the folder of the tests.
ca1a0bb fix: change import error outside.
022dcb7 fix readme pointing to the right model.
af7b9e9 fix import cycle, and separate usecase by explicit if.
faf334f upd: add comments to the few shot / zero shot.
3ed8d3f fix: dataset expected colums
7f454c4 upd: add the same logic of the label on visualize_full.
45bd520 Merge branch 'main' into llm-pipeline
44586d6 Fix in the logic of the code.
7adb835 Merge branch 'llm-pipeline' of github.com:Bepitic/anomalib into llm-p…
src/anomalib/models/image/gptvad/__init__.py (new file, +8 lines):

"""Generative Pre-Trained Transformer (GPT) based Large Language Model (LLM)."""

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

from .lightning_model import GPTVad

__all__ = ["GPTVad"]
src/anomalib/models/image/gptvad/chatgpt.py (new file, +127 lines):
"""Wrapper for the OpenAI calls to the VLM model.""" | ||
|
||
import logging | ||
import os | ||
from typing import Any | ||
|
||
import openai | ||
|
||
|
||
class GPTWrapper: | ||
"""A wrapper class for making API calls to OpenAI's GPT-4 model to detect anomalies in images. | ||
|
||
Environment variable OPENAI_API_KEY (str): API key for OpenAI. | ||
https://platform.openai.com/docs/quickstart/step-2-set-up-your-api-key | ||
Other possible models: https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4 | ||
All models with vision capabilities: 'gpt-4-turbo-2024-04-09', 'gpt-4-turbo', | ||
all versions of 'gpt-4o-mini', and 'gpt-4o' | ||
|
||
Args: | ||
model_name (str): Model name for OpenAI API VLM. Default "gpt-4o" | ||
detail (bool): If the images will be sended with high detail or low detail. | ||
|
||
""" | ||
|
||
def __init__(self, model_name: str = "gpt-4o", detail: bool = True) -> None: | ||
openai_key = os.getenv("OPENAI_API_KEY") | ||
self.model_name = model_name | ||
self.detail = detail | ||
if not openai_key: | ||
from anomalib.engine.engine import UnassignedError | ||
Bepitic marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
msg = "OpenAI environment key not found.(OPENAI_API_KEY)" | ||
raise UnassignedError(msg) | ||
|
||
def api_call( | ||
self, | ||
images: list[str], | ||
extension: str = "png", | ||
) -> str: | ||
"""Makes an API call to OpenAI's GPT-4 model to detect anomalies in an image. | ||
|
||
Args: | ||
images (list[str]): List of base64 images that serve as examples and last one to check for anomalies. | ||
extension (str): Extension of the group of images that needs to be checked for anomalies. Default = 'png' | ||
|
||
Returns: | ||
str: The response from the GPT-4 model indicating whether the image has anomalies or not. | ||
It returns 'NO' if there are no anomalies and 'YES: description' if there are anomalies, | ||
where 'description' provides details of the anomaly and its position. | ||
|
||
Raises: | ||
openai.error.OpenAIError: If there is an error during the API call. | ||
""" | ||
prompt: str = "" | ||
if len(images) > 0: | ||
prompt = """ | ||
You will receive an image that is going to be an example of the typical image without any anomaly, | ||
and the last image that you need to decide if it has an anomaly or not. | ||
Answer with a 'NO' if it does not have any anomalies and 'YES: description' | ||
where description is a description of the anomaly provided, position. | ||
""" | ||
else: | ||
prompt = """ | ||
Examine the provided image carefully to determine if there is an obvious anomaly present. | ||
Anomalies may include mechanical malfunctions, unexpected objects, safety hazards, structural damages, | ||
or unusual patterns or defects in the objects. | ||
|
||
Instructions: | ||
|
||
1. Thoroughly inspect the image for any irregularities or deviations from normal operating conditions. | ||
|
||
2. Clearly state if an obvious anomaly is detected. | ||
- If an anomaly is detected, begin with 'YES,' followed by a detailed description of the anomaly. | ||
- If no anomaly is detected, simply state 'NO' and end the analysis. | ||
|
||
Example Output Structure: | ||
|
||
'YES: | ||
- Description: Conveyor belt misalignment causing potential blockages. | ||
This may result in production delays and equipment damage. | ||
Immediate realignment and inspection are recommended.' | ||
|
||
'NO' | ||
|
||
Considerations: | ||
|
||
- Ensure accuracy in identifying anomalies to prevent overlooking critical issues. | ||
- Provide clear and concise descriptions for any detected anomalies. | ||
- Focus on obvious anomalies that could impact final use of the object operation or safety. | ||
""" | ||
|
||
detail_img = "high" if self.detail else "low" | ||
messages: list[dict[str, Any]] = [ | ||
{ | ||
"role": "system", | ||
"content": prompt, | ||
}, | ||
] | ||
for image in images: | ||
Bepitic marked this conversation as resolved.
Show resolved
Hide resolved
|
||
image_message = [ | ||
{ | ||
"role": "user", | ||
"content": [ | ||
{ | ||
"type": "image_url", | ||
"image_url": { | ||
"url": f"data:image/{extension};base64,{image}", | ||
"detail": detail_img, | ||
}, | ||
}, | ||
], | ||
}, | ||
] | ||
messages.extend(image_message) | ||
|
||
try: | ||
# Make the API call using the openai library | ||
response = openai.chat.completions.create( | ||
model=self.model_name, | ||
messages=messages, | ||
max_tokens=300, | ||
) | ||
return response.choices[-1].message.content or "" | ||
except Exception: | ||
msg = "The openai API trow an exception." | ||
Bepitic marked this conversation as resolved.
Show resolved
Hide resolved
|
||
logging.exception(msg) | ||
raise |
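To make the wrapper's contract concrete, here is a minimal smoke test of GPTWrapper used outside the Lightning pipeline. It is a sketch, not part of the diff: sample.png is a hypothetical file, the module path is inferred from the commit list, and the run assumes OPENAI_API_KEY is exported.

import base64
from pathlib import Path

from anomalib.models.image.gptvad.chatgpt import GPTWrapper

# Hypothetical input image; any PNG on disk works for this smoke test.
image_b64 = base64.b64encode(Path("sample.png").read_bytes()).decode("utf-8")

wrapper = GPTWrapper(model_name="gpt-4o", detail=False)
# A single image takes the zero-shot branch; prepend reference images for few-shot.
answer = wrapper.api_call([image_b64], extension="png")
print(answer)  # expected to start with 'NO' or 'YES: ...'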
src/anomalib/models/image/gptvad/lightning_model.py (new file, +155 lines):
"""OpenAI Visual Large Model: Zero-/Few-Shot Anomaly Classification. | ||
|
||
Paper (No paper) | ||
""" | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
import base64 | ||
import logging | ||
from pathlib import Path | ||
|
||
import torch | ||
from lightning.pytorch.utilities.types import STEP_OUTPUT | ||
from torch.utils.data import DataLoader | ||
|
||
from anomalib import LearningType | ||
from anomalib.metrics.threshold import ManualThreshold | ||
from anomalib.models.components import AnomalyModule | ||
|
||
from .chatgpt import GPTWrapper | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
__all__ = ["GPTVad"] | ||
|
||
|
||
class GPTVad(AnomalyModule): | ||
Bepitic marked this conversation as resolved.
Show resolved
Hide resolved
|
||
"""OpenAI VLM Lightning model using OpenAI's GPT-4 for image anomaly detection. | ||
|
||
Args: | ||
k_shot(int): The number of images that will compare to detect if it is an anomaly. | ||
model_name (str): The OpenAI VLM for visual anomaly detection. | ||
detail (bool): The detail of the input in the vlm for the image detection 'high'(true) 'low'(false). | ||
""" | ||
|
||
def __init__( | ||
self, | ||
k_shot: int = 0, | ||
model_name: str = "gpt-4o", | ||
detail: bool = True, | ||
) -> None: | ||
super().__init__() | ||
|
||
self.k_shot = k_shot | ||
|
||
self.model_name = model_name | ||
self.detail = detail | ||
self.image_threshold = ManualThreshold() | ||
self.vlm = GPTWrapper(model_name=self.model_name, detail=self.detail) | ||
|
||
def _setup(self) -> None: | ||
dataloader = self.trainer.datamodule.train_dataloader() | ||
pre_images = self.collect_reference_images(dataloader) | ||
self.pre_images = pre_images | ||
|
||
def _encode_image(self, image_path: str) -> str: | ||
"""Function to encode the image into base64 to send it with the prompt.""" | ||
path = Path(image_path) | ||
with path.open("rb") as image_file: | ||
return base64.b64encode(image_file.read()).decode("utf-8") | ||
|
||
def training_step(self, batch: dict[str, str | torch.Tensor], *args, **kwargs) -> dict[str, str | torch.Tensor]: | ||
"""Train Step of LLM.""" | ||
del args, kwargs # These variables are not used. | ||
# no train on llm | ||
return batch | ||
|
||
@staticmethod | ||
def configure_optimizers() -> None: | ||
"""OpenaiVlm doesn't require optimization, therefore returns no optimizers.""" | ||
return | ||
|
||
def validation_step( | ||
self, | ||
batch: dict[str, str | list[str] | torch.Tensor], | ||
*args, | ||
**kwargs, | ||
) -> STEP_OUTPUT: | ||
"""Get batch of anomaly maps from input image batch. | ||
|
||
Args: | ||
batch (dict[str, str | list[str] | torch.Tensor]): Batch containing image filename, image, label and mask | ||
args: Additional arguments. | ||
kwargs: Additional keyword arguments. | ||
|
||
Returns: | ||
dict[str, Any]: str_otput and pred_scores, the output of the Llm and pred_scores 1.0 if is an anomaly image. | ||
""" | ||
del args, kwargs # These variables are not used. | ||
batch_size = len(batch["image_path"]) | ||
outputs: list[str] = [] | ||
predictions: list[float] = [] | ||
for i in range(batch_size): | ||
# Getting the base64 string | ||
base64_images = [self._encode_image(img) for img in self.pre_images] | ||
base64_images.append(self._encode_image(batch["image_path"][i])) | ||
|
||
try: | ||
output = self.vlm.api_call(base64_images) | ||
except Exception: | ||
logging.exception( | ||
f"Error calling openAI API for image {batch['image_path'][i]}", | ||
) | ||
output = "Error" | ||
|
||
# set an error and get to normal if not followed | ||
prediction = 0.0 | ||
if output.startswith("N"): | ||
prediction = 0.0 | ||
elif output.startswith("Y"): | ||
prediction = 1.0 | ||
else: | ||
logging.warning( | ||
f"(Set predition to '0' Normal)Could not identify if there is anomaly by the output:\n{output}", | ||
) | ||
|
||
outputs.append(output) | ||
predictions.append(prediction) | ||
logging.debug(f"Output: {output}, Prediction: {prediction}") | ||
|
||
batch["str_output"] = outputs | ||
batch["pred_scores"] = torch.tensor(predictions).to(self.device) | ||
batch["pred_labels"] = torch.tensor(predictions).to(self.device) | ||
return batch | ||
|
||
@property | ||
def trainer_arguments(self) -> dict[str, int | float]: | ||
"""Set model-specific trainer arguments.""" | ||
return {} | ||
|
||
@property | ||
def learning_type(self) -> LearningType: | ||
"""The learning type of the model. | ||
|
||
Llm is a zero-/few-shot model, depending on the user configuration. Therefore, the learning type is | ||
set to ``LearningType.FEW_SHOT`` when ``k_shot`` is greater than zero and ``LearningType.ZERO_SHOT`` otherwise. | ||
""" | ||
return LearningType.ZERO_SHOT if self.k_shot == 0 else LearningType.FEW_SHOT | ||
|
||
def collect_reference_images(self, dataloader: DataLoader) -> list[str]: | ||
"""Collect reference images for few-shot inference. | ||
|
||
The reference images are collected by iterating the training dataset until the required number of images are | ||
collected. | ||
|
||
Returns: | ||
ref_images list[str]: A list containing the reference images path. | ||
""" | ||
reference_images_paths: list[str] = [] | ||
for batch in dataloader: | ||
image_paths = batch["image_path"][: self.k_shot - len(reference_images_paths)] | ||
reference_images_paths.extend(image_paths) | ||
if self.k_shot == len(reference_images_paths): | ||
break | ||
return reference_images_paths |
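For context, an end-to-end run of GPTVad would follow the usual anomalib pattern sketched below. The Engine and MVTec names are the standard anomalib v1.x API, not code introduced by this PR, so treat the exact calls as assumptions.

# Sketch of the standard anomalib workflow; not part of this PR's diff.
from anomalib.data import MVTec
from anomalib.engine import Engine
from anomalib.models.image.gptvad import GPTVad

datamodule = MVTec(category="bottle")  # assumed dataset; any anomalib datamodule should do
model = GPTVad(k_shot=2, detail=False)  # few-shot with two reference images

engine = Engine()
# Validation is where the GPT-4 API calls happen; requires OPENAI_API_KEY.
engine.validate(model=model, datamodule=datamodule)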