open-edge-platform · jpcbertoldo · Sep 26, 2023 · Sep 26, 2023 · Sep 27, 2023 · Sep 29, 2023
diff --git a/src/anomalib/data/base/datamodule.py b/src/anomalib/data/base/datamodule.py
@@ -157,6 +157,11 @@ def _create_val_split(self) -> None:
             # converted from random training sample
             self.train_data, normal_val_data = random_split(self.train_data, self.val_split_ratio, seed=self.seed)
             self.val_data = SyntheticAnomalyDataset.from_dataset(normal_val_data)
+        elif self.val_split_mode == ValSplitMode.FROM_TRAIN:
+            # randomly sampled from training set
+            self.train_data, self.val_data = random_split(
+                self.train_data, self.val_split_ratio, label_aware=True, seed=self.seed
+            )
         elif self.val_split_mode != ValSplitMode.NONE:
             raise ValueError(f"Unknown validation split mode: {self.val_split_mode}")
 

diff --git a/src/anomalib/data/utils/split.py b/src/anomalib/data/utils/split.py
@@ -46,6 +46,7 @@ class ValSplitMode(str, Enum):
     NONE = "none"
     SAME_AS_TEST = "same_as_test"
     FROM_TEST = "from_test"
+    FROM_TRAIN = "from_train"
     SYNTHETIC = "synthetic"
 
 

@@ -15,7 +15,7 @@ dataset:
     eval: null
   test_split_mode: from_dir # options: [from_dir, synthetic]
   test_split_ratio: 0.2 # fraction of train images held out testing (usage depends on test_split_mode)
-  val_split_mode: same_as_test # options: [same_as_test, from_test, synthetic]
+  val_split_mode: from_train # options: [same_as_test, from_test, from_train, synthetic]
   val_split_ratio: 0.5 # fraction of train/test images held out for validation (usage depends on val_split_mode)
 
 model:

@@ -63,6 +63,9 @@ class EfficientAd(AnomalyModule):
         pad_maps (bool): relevant if padding is set to False. In this case, pad_maps = True pads the
             output anomaly maps so that their size matches the size in the padding = True case.
         batch_size (int): batch size for imagenet dataloader
+        pretraining_images_dir (str): path to the folder with the images used to pretrain the teacher model.
+            The paper calls this the "pretraining dataset" and this code calls it "imagenette",
+            but it could be any dataset.
     """
 
     def __init__(
@@ -75,6 +78,7 @@ def __init__(
         padding: bool = False,
         pad_maps: bool = True,
         batch_size: int = 1,
+        pretraining_images_dir: str = "./datasets/imagenette",
     ) -> None:
         super().__init__()
 
@@ -90,6 +94,7 @@ def __init__(
         self.image_size = image_size
         self.lr = lr
         self.weight_decay = weight_decay
+        self.pretraining_images_dir = pretraining_images_dir
 
         self.prepare_pretrained_model()
         self.prepare_imagenette_data()
@@ -115,8 +120,9 @@ def prepare_imagenette_data(self) -> None:
             ]
         )
 
-        imagenet_dir = Path("./datasets/imagenette")
+        imagenet_dir = Path(self.pretraining_images_dir)
         if not imagenet_dir.is_dir():
+            # The configured folder does not exist yet: fetch Imagenette into it instead of failing.
             download_and_extract(imagenet_dir, IMAGENETTE_DOWNLOAD_INFO)
         imagenet_dataset = ImageFolder(imagenet_dir, transform=TransformsWrapper(t=self.data_transforms_imagenet))
         self.imagenet_loader = DataLoader(imagenet_dataset, batch_size=self.batch_size, shuffle=True, pin_memory=True)