
🚀 feat(model): Add Dinomaly Model #2835


Open · wants to merge 46 commits into base: main
Commits (46)
25b1b42
Rebuilt again
rajeshgangireddy Jul 1, 2025
5c7355b
feat(ViTill): enhance model initialization and validation, improve fe…
rajeshgangireddy Jul 1, 2025
049c7c5
feat(Dinomaly): enhance model documentation and improve training/vali…
rajeshgangireddy Jul 1, 2025
d9ddea3
fix block's mem attention giving only one output in return
rajeshgangireddy Jul 7, 2025
cb85343
feat(Dinomaly): Working model. update model initialization and optimi…
rajeshgangireddy Jul 8, 2025
f77de9f
Refactor DINOv2 training code: remove deprecated training scripts and…
rajeshgangireddy Jul 9, 2025
b7760bf
feat(DINOmaly): Start cleaning up and adding doc strings
rajeshgangireddy Jul 9, 2025
4d2c62e
feat(Dinomaly): start adding doc strings
rajeshgangireddy Jul 9, 2025
a7990d9
feat(ModelLoader): simplify class design, improve API, and enhance er…
rajeshgangireddy Jul 10, 2025
fbfe346
refactor: remove model loader test script and improvement summary
rajeshgangireddy Jul 10, 2025
6684f85
feat(Dinomaly): add StableAdamW optimizer and WarmCosineScheduler cla…
rajeshgangireddy Jul 10, 2025
b5891ab
feat(Dinomaly): implement WarmCosineScheduler and refactor model load…
rajeshgangireddy Jul 10, 2025
1c4bfa8
Merge remote-tracking branch 'upstream/main' into dinomaly_workspace
rajeshgangireddy Jul 10, 2025
510802c
Refactor and optimize code across multiple modules
rajeshgangireddy Jul 10, 2025
a0003f6
docs: update README and module docstrings for Dinomaly model; improve…
rajeshgangireddy Jul 10, 2025
b9ac935
Remove files not used by dinov2
rajeshgangireddy Jul 10, 2025
e442e1b
fix: update import paths for model components and adjust README table…
rajeshgangireddy Jul 10, 2025
1938628
refactor: remove xFormers dependency checks from attention and block …
rajeshgangireddy Jul 10, 2025
cc07edd
refactor: remove SwiGLUFFN and related xFormers logic from swiglu_ffn.py
rajeshgangireddy Jul 10, 2025
5c9c9b9
refactor: remove unused NestedTensorBlock and SwiGLUFFN imports from …
rajeshgangireddy Jul 10, 2025
d8212ec
refactor: clean up imports and remove unused code in dinov2 components
rajeshgangireddy Jul 10, 2025
600e8aa
feat: add utility functions for Dinomaly model and benchmark configur…
rajeshgangireddy Jul 11, 2025
69113ab
feat: implement DinomalyMLP class and update model loader for DINOv2 …
rajeshgangireddy Jul 11, 2025
f0482da
refactor: replace Mlp with DinomalyMLP in model layers and update ref…
rajeshgangireddy Jul 11, 2025
6aa9c24
feat: implement global cosine hard mining loss function and refactor …
rajeshgangireddy Jul 14, 2025
9ee0123
refactor: replace custom DropPath and LayerScale implementations with…
rajeshgangireddy Jul 14, 2025
1fbc37a
refactor: reorganize Dinomaly model components and update imports for…
rajeshgangireddy Jul 14, 2025
f95baf5
feat: add layer implementations and training utilities for Dinomaly m…
rajeshgangireddy Jul 14, 2025
af8511c
refactor: reorganize Dinomaly model components and update imports for…
rajeshgangireddy Jul 14, 2025
a3391b5
refactor: clean up code formatting and improve import organization ac…
rajeshgangireddy Jul 15, 2025
f45bfbe
refactor: improve readability by formatting parameters in patch embed…
rajeshgangireddy Jul 15, 2025
1af6c76
Remove workspace from Git tracking
rajeshgangireddy Jul 15, 2025
279699b
Refactor Dinomaly model components for improved type safety and error…
rajeshgangireddy Jul 15, 2025
8c24fc2
fix: update error message for sparse gradients in StableAdamW optimiz…
rajeshgangireddy Jul 15, 2025
254c2a5
feat: add training utilities and update Dinomaly model for enhanced l…
rajeshgangireddy Jul 16, 2025
5280841
refactor: standardize weight downloading process and improve cache di…
rajeshgangireddy Jul 16, 2025
b81b065
refactor: update image transformation methods and enhance training st…
rajeshgangireddy Jul 16, 2025
cdf8640
refactor: remove example usage from ViTill class docstrings for clarity
rajeshgangireddy Jul 17, 2025
87927d5
docs: enhance README.md with detailed architecture and key components…
rajeshgangireddy Jul 17, 2025
06882d2
Small refactor and minor improvements as per PR comments
rajeshgangireddy Jul 18, 2025
1e5246f
refactor: replace einsum operations with matrix operations for OpenVI…
rajeshgangireddy Jul 22, 2025
51cd329
ruff complains about commented code
rajeshgangireddy Jul 22, 2025
d1162e2
refactor: add comments for security mitigations in weight loading and…
rajeshgangireddy Jul 22, 2025
462ea07
add dinomaly entry in reference guide
rajeshgangireddy Jul 22, 2025
3eb4711
fix: make ruff/linters happy
rajeshgangireddy Jul 22, 2025
5786251
refactor: remove outdated note about DDPStrategy in Dinomaly class
rajeshgangireddy Jul 22, 2025
15 changes: 15 additions & 0 deletions examples/configs/model/dinomaly.yaml
@@ -0,0 +1,15 @@
model:
  class_path: anomalib.models.Dinomaly
  init_args:
    encoder_name: dinov2reg_vit_base_14
    bottleneck_dropout: 0.2
    decoder_depth: 8

trainer:
  max_steps: 5000
  callbacks:
    - class_path: lightning.pytorch.callbacks.EarlyStopping
      init_args:
        patience: 20
        monitor: image_AUROC
        mode: max
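If this file is saved as `examples/configs/model/dinomaly.yaml` (the path in the diff header above), it can presumably be passed to the Anomalib CLI directly, e.g. `anomalib train --config examples/configs/model/dinomaly.yaml --data MVTecAD --data.category bottle`.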
2 changes: 2 additions & 0 deletions src/anomalib/models/__init__.py
@@ -60,6 +60,7 @@
    Csflow,
    Dfkde,
    Dfm,
    Dinomaly,
    Draem,
    Dsr,
    EfficientAd,
@@ -97,6 +98,7 @@ class UnknownModelError(ModuleNotFoundError):
"Dfkde",
"Dfm",
"Draem",
"Dinomaly",
"Dsr",
"EfficientAd",
"Fastflow",
2 changes: 2 additions & 0 deletions src/anomalib/models/image/__init__.py
@@ -49,6 +49,7 @@
from .csflow import Csflow
from .dfkde import Dfkde
from .dfm import Dfm
from .dinomaly import Dinomaly
from .draem import Draem
from .dsr import Dsr
from .efficient_ad import EfficientAd
@@ -84,4 +85,5 @@
"Uflow",
"VlmAd",
"WinClip",
"Dinomaly",
]
53 changes: 53 additions & 0 deletions src/anomalib/models/image/dinomaly/README.md
@@ -0,0 +1,53 @@
# Dinomaly: Vision Transformer-based Anomaly Detection with Feature Reconstruction

This is the Anomalib implementation of the Dinomaly model, based on the [original implementation](https://github.com/guojiajeremy/Dinomaly).

Model Type: Segmentation

## Description

Dinomaly is a Vision Transformer-based anomaly detection model that uses an encoder-decoder architecture for feature reconstruction. The model leverages pre-trained DINOv2 Vision Transformer features and employs a reconstruction-based approach to detect anomalies by comparing encoder and decoder features.

### Feature Extraction

Features are extracted from multiple intermediate layers of a pre-trained DINOv2 Vision Transformer encoder. The model typically uses features from layers 2-9 for base models, providing multi-scale feature representations that capture both low-level and high-level semantic information.
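For intuition, here is a sketch of what multi-layer extraction can look like with the upstream DINOv2 backbone. The hub entrypoint, input size, and layer indices below are illustrative assumptions, not necessarily what this PR uses internally:

```python
import torch

# Illustrative sketch: load a DINOv2 ViT-B/14 (with registers) from torch.hub
# and pull patch-token features from several intermediate blocks, following
# the get_intermediate_layers() API of the upstream DINOv2 repository.
encoder = torch.hub.load("facebookresearch/dinov2", "dinov2_vitb14_reg")
encoder.eval()

layer_indices = list(range(2, 10))  # "layers 2-9" for base models
x = torch.randn(1, 3, 392, 392)  # H and W must be multiples of the 14-px patch

with torch.no_grad():
    # One tensor per requested layer, each (batch, num_patches, embed_dim).
    features = encoder.get_intermediate_layers(x, n=layer_indices)
```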

### Architecture

The Dinomaly model consists of three main components:

1. **DINOv2 Encoder**: Pre-trained Vision Transformer that extracts multi-layer features
2. **Bottleneck MLP**: Compresses the multi-layer features before reconstruction
3. **Vision Transformer Decoder**: Reconstructs the compressed features back to the original feature space

### Anomaly Detection

Anomaly detection is performed by computing cosine similarity between encoder and decoder features at multiple scales. The model generates anomaly maps by analyzing the reconstruction quality of features, where poor reconstruction indicates anomalous regions. Both anomaly detection (image-level) and localization (pixel-level) are supported.
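The scoring idea can be sketched as follows; this is a toy illustration of cosine-similarity scoring over paired feature maps, not the PR's exact implementation:

```python
import torch
import torch.nn.functional as F

def anomaly_map_from_features(
    enc_feats: list[torch.Tensor],
    dec_feats: list[torch.Tensor],
    out_size: tuple[int, int] = (256, 256),
) -> torch.Tensor:
    """Toy sketch: 1 - cosine similarity between encoder/decoder features.

    Each tensor is assumed to be (batch, channels, height, width); maps from
    different scales are upsampled to a common size and averaged.
    """
    maps = []
    for enc, dec in zip(enc_feats, dec_feats):
        sim = F.cosine_similarity(enc, dec, dim=1, eps=1e-8)  # (B, H, W)
        amap = 1.0 - sim  # poor reconstruction -> low similarity -> high score
        amap = F.interpolate(amap.unsqueeze(1), size=out_size, mode="bilinear")
        maps.append(amap)
    return torch.stack(maps).mean(dim=0)  # (B, 1, H, W)

# Example with random features at two scales
enc = [torch.randn(1, 768, 28, 28), torch.randn(1, 768, 14, 14)]
dec = [torch.randn(1, 768, 28, 28), torch.randn(1, 768, 14, 14)]
amap = anomaly_map_from_features(enc, dec)          # pixel-level localization
image_score = amap.amax(dim=(-2, -1))               # image-level score
```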

## Usage

`anomalib train --model Dinomaly --data MVTecAD --data.category <category>`
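Training via the Python API should follow the usual Anomalib pattern; a minimal sketch, assuming an MVTec AD datamodule with default arguments:

```python
from anomalib.data import MVTecAD
from anomalib.engine import Engine
from anomalib.models import Dinomaly

# Standard Anomalib train/test loop with the Dinomaly model.
datamodule = MVTecAD(category="bottle")
model = Dinomaly()
engine = Engine()
engine.fit(model=model, datamodule=datamodule)
engine.test(model=model, datamodule=datamodule)
```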

## Benchmark

All results gathered with seed `42`.

## [MVTec AD Dataset](https://www.mvtec.com/company/research/datasets/mvtec-ad)

### Image-Level AUC

| | Avg | Carpet | Grid | Leather | Tile | Wood | Bottle | Cable | Capsule | Hazelnut | Metal Nut | Pill | Screw | Toothbrush | Transistor | Zipper |
| -------- | :-: | :----: | :--: | :-----: | :--: | :--: | :----: | :---: | :-----: | :------: | :-------: | :--: | :---: | :--------: | :--------: | :----: |
| Dinomaly | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - |

### Pixel-Level AUC

| | Avg | Carpet | Grid | Leather | Tile | Wood | Bottle | Cable | Capsule | Hazelnut | Metal Nut | Pill | Screw | Toothbrush | Transistor | Zipper |
| -------- | :-: | :----: | :--: | :-----: | :--: | :--: | :----: | :---: | :-----: | :------: | :-------: | :--: | :---: | :--------: | :--------: | :----: |
| Dinomaly | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - |

### Image F1 Score

| | Avg | Carpet | Grid | Leather | Tile | Wood | Bottle | Cable | Capsule | Hazelnut | Metal Nut | Pill | Screw | Toothbrush | Transistor | Zipper |
| -------- | :-: | :----: | :--: | :-----: | :--: | :--: | :----: | :---: | :-----: | :------: | :-------: | :--: | :---: | :--------: | :--------: | :----: |
| Dinomaly | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
38 changes: 38 additions & 0 deletions src/anomalib/models/image/dinomaly/__init__.py
@@ -0,0 +1,38 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

"""Dinomaly: Vision Transformer-based Anomaly Detection with Feature Reconstruction.

The Dinomaly model implements a Vision Transformer encoder-decoder architecture for
anomaly detection using pre-trained DINOv2 features. The model extracts features from
multiple intermediate layers of a DINOv2 encoder, compresses them through a bottleneck
MLP, and reconstructs them using a Vision Transformer decoder.

Anomaly detection is performed by computing cosine similarity between encoder and decoder
features at multiple scales. The model is particularly effective for visual anomaly
detection tasks where the goal is to identify regions or images that deviate from
normal patterns learned during training.

Example:
    >>> from anomalib.models.image import Dinomaly
    >>> model = Dinomaly()

The model can be used with any of the supported datasets and task modes in
anomalib. It leverages the powerful feature representations from DINOv2 Vision
Transformers combined with a reconstruction-based approach for robust anomaly detection.

Notes:
    - Uses DINOv2 Vision Transformer as the backbone encoder
    - Features are extracted from intermediate layers for multi-scale analysis
    - Employs feature reconstruction loss for unsupervised learning
    - Supports both anomaly detection and localization tasks
    - Requires significant GPU memory due to the Vision Transformer architecture

See Also:
    :class:`anomalib.models.image.dinomaly.lightning_model.Dinomaly`:
        Lightning implementation of the Dinomaly model.
"""

from anomalib.models.image.dinomaly.lightning_model import Dinomaly

__all__ = ["Dinomaly"]
50 changes: 50 additions & 0 deletions src/anomalib/models/image/dinomaly/components/__init__.py
@@ -0,0 +1,50 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

"""Components module for Dinomaly model.

This module provides all the necessary components for the Dinomaly Vision Transformer
architecture including layers, model loader, utilities, and vision transformer implementations.
"""

# Layer components
from .layers import (
    Attention,
    Block,
    DinomalyMLP,
    LinearAttention,
    MemEffAttention,
)

# Model loader
from .model_loader import DinoV2Loader, load

# Utility functions and classes
from .training_utils import (
    CosineHardMiningLoss,
    StableAdamW,
    WarmCosineScheduler,
)

# Vision transformer components
from .vision_transformer import (
    DinoVisionTransformer,
)

__all__ = [
    # Layers
    "Attention",
    "Block",
    "DinomalyMLP",
    "LinearAttention",
    "MemEffAttention",
    # Model loader
    "DinoV2Loader",
    "load",
    # Utils
    "StableAdamW",
    "WarmCosineScheduler",
    "CosineHardMiningLoss",
    # Vision transformer
    "DinoVisionTransformer",
]
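As a hedged sketch of how the exported training utilities might be wired together. The constructor arguments shown are assumptions inferred from the class names (AdamW-like optimizer, warmup-then-cosine LR schedule), not the exact signatures in this PR:

```python
import torch

from anomalib.models.image.dinomaly.components import (
    StableAdamW,
    WarmCosineScheduler,
)

# Hypothetical wiring: StableAdamW is assumed to accept AdamW-like arguments,
# and WarmCosineScheduler to ramp the LR for `warmup_iters` steps before
# cosine-decaying it over `total_iters`. Verify against the actual signatures.
model = torch.nn.Linear(768, 768)  # stand-in for the trainable decoder/bottleneck
optimizer = StableAdamW(model.parameters(), lr=2e-3, weight_decay=1e-4)
scheduler = WarmCosineScheduler(
    optimizer,
    base_value=2e-3,
    final_value=2e-4,
    total_iters=5000,
    warmup_iters=100,
)
```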