
Commit 4195921

committed
edit introduction
1 parent 7a6cd73 commit 4195921

17 files changed: +31318 -30942 lines changed

Udeneev2025Surrogate.pdf

181 KB
Binary file not shown.

code/data_generator.ipynb

Lines changed: 11 additions & 10 deletions
Large diffs are not rendered by default.

code/dataset/arch_dicts.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

code/results/model_10_results.json

Lines changed: 3418 additions & 3418 deletions
Large diffs are not rendered by default.

code/results/model_1_results.json

Lines changed: 2845 additions & 2845 deletions
Large diffs are not rendered by default.

code/results/model_2_results.json

Lines changed: 2838 additions & 2838 deletions
Large diffs are not rendered by default.

code/results/model_3_results.json

Lines changed: 3029 additions & 3029 deletions
Large diffs are not rendered by default.

code/results/model_4_results.json

Lines changed: 3349 additions & 3349 deletions
Large diffs are not rendered by default.

code/results/model_5_results.json

Lines changed: 3463 additions & 3463 deletions
Large diffs are not rendered by default.

code/results/model_6_results.json

Lines changed: 2903 additions & 2903 deletions
Large diffs are not rendered by default.

code/results/model_7_results.json

Lines changed: 2964 additions & 2964 deletions
Large diffs are not rendered by default.

code/results/model_8_results.json

Lines changed: 2753 additions & 2753 deletions
Large diffs are not rendered by default.

code/results/model_9_results.json

Lines changed: 3158 additions & 3158 deletions
Large diffs are not rendered by default.

code/train_models.ipynb

Lines changed: 234 additions & 204 deletions
Large diffs are not rendered by default.

code/train_models.py

Lines changed: 214 additions & 0 deletions
@@ -0,0 +1,214 @@
import os
import json
import numpy as np
import torch
import nni
from torch.utils.data import SubsetRandomSampler
from torchvision import transforms
from torchvision.datasets import CIFAR10
from nni.nas.evaluator.pytorch import DataLoader, Classification
from nni.nas.hub.pytorch import DARTS as DartsSpace
from nni.nas.space import model_context
from tqdm import tqdm
from IPython.display import clear_output

ARCHITECTURES_PATH = "dataset/arch_dicts.json"
MAX_EPOCHS = 50
LEARNING_RATE = 1e-3
BATCH_SIZE = 256
CIFAR_MEAN = [0.49139968, 0.48215827, 0.44653124]
CIFAR_STD = [0.24703233, 0.24348505, 0.26158768]


def load_arch_dicts(json_path):
    """
    Load architecture dictionaries from a JSON file.

    Args:
        json_path (str): Path to the JSON file containing the architecture dictionaries.

    Returns:
        dict: A dictionary with the architecture configurations.
    """
    with open(json_path, "r") as f:
        arch_dicts = json.load(f)
    return arch_dicts


def get_data_loaders(batch_size=512):
    """
    Build the training and validation data loaders.

    Args:
        batch_size (int): Batch size for both data loaders. Defaults to 512.

    Returns:
        tuple: A pair of DataLoader objects:
            - search_train_loader: loader for the training split.
            - search_valid_loader: loader for the validation split.
    """
    transform = transforms.Compose(
        [
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
        ]
    )

    train_data = nni.trace(CIFAR10)(
        root="./data", train=True, download=True, transform=transform
    )
    # Split the CIFAR-10 training set in half: one half for training, one for validation.
    num_samples = len(train_data)
    indices = np.random.permutation(num_samples)
    split = num_samples // 2

    search_train_loader = DataLoader(
        train_data,
        batch_size=batch_size,
        num_workers=6,
        sampler=SubsetRandomSampler(indices[:split]),
    )

    search_valid_loader = DataLoader(
        train_data,
        batch_size=batch_size,
        num_workers=6,
        sampler=SubsetRandomSampler(indices[split:]),
    )

    return search_train_loader, search_valid_loader


def train_model(
    architecture, train_loader, valid_loader, max_epochs=10, learning_rate=1e-3
):
    """
    Train a model defined by the given architecture.

    Args:
        architecture (dict): Architecture configuration used to instantiate the model.
        train_loader (DataLoader): DataLoader with the training data.
        valid_loader (DataLoader): DataLoader with the validation data.
        max_epochs (int, optional): Maximum number of training epochs. Defaults to 10.
        learning_rate (float, optional): Learning rate. Defaults to 1e-3.

    Returns:
        model: The trained model.
    """
    # Instantiate a fixed architecture from the DARTS search space.
    with model_context(architecture):
        model = DartsSpace(width=16, num_cells=3, dataset="cifar")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)  # Enable multi-GPU training

    model.to(device)

    evaluator = Classification(
        learning_rate=learning_rate,
        weight_decay=1e-4,
        train_dataloaders=train_loader,
        val_dataloaders=valid_loader,
        max_epochs=max_epochs,
        num_classes=10,
        export_onnx=False,  # Disable ONNX export for this experiment
        fast_dev_run=False,  # Should be False for full training
    )

    evaluator.fit(model)
    return model


def evaluate_and_save_results(
    models, architectures, batch_size=512, num_workers=6, folder_name="results"
):
    """
    Evaluate the models on the CIFAR-10 test set and save the results to JSON files.

    Args:
        models (list): List of trained models.
        architectures (list): List of model architectures.
        batch_size (int, optional): Batch size for the test data loader. Defaults to 512.
        num_workers (int, optional): Number of worker processes for the data loader. Defaults to 6.
        folder_name (str, optional): Folder in which the results are saved. Defaults to "results".

    Raises:
        ValueError: If the number of models does not match the number of architectures.

    Results:
        For each model a JSON file is created containing:
        - architecture: the model architecture.
        - test_predictions: the model predictions on the test set.
        - test_accuracy: the model accuracy on the test set.
    """
    if len(models) != len(architectures):
        raise ValueError("The number of models and architectures must match")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    os.makedirs(folder_name, exist_ok=True)

    transform = transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261)),
        ]
    )
    test_dataset = CIFAR10(
        root="./data", train=False, download=True, transform=transform
    )
    test_loader = DataLoader(
        test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers
    )

    for i, (model, architecture) in enumerate(zip(models, architectures)):
        model.to(device)
        model.eval()

        test_correct = 0
        test_total = 0
        test_preds = []

        with torch.no_grad():
            for images, labels in test_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                _, predicted = torch.max(outputs, 1)
                test_preds.extend(predicted.cpu().tolist())
                test_correct += (predicted == labels).sum().item()
                test_total += labels.size(0)

        test_accuracy = test_correct / test_total

        result = {
            "architecture": architecture,
            "test_predictions": test_preds,
            "test_accuracy": test_accuracy,
        }

        file_name = f"model_{i+1}_results.json"
        file_path = os.path.join(folder_name, file_name)

        with open(file_path, "w") as f:
            json.dump(result, f, indent=4)

        print(f"Results for model_{i + 1} saved to {file_path}")


if __name__ == "__main__":
    arch_dicts = load_arch_dicts(ARCHITECTURES_PATH)  # Load the architecture dictionaries
    search_train_loader, search_valid_loader = get_data_loaders(
        batch_size=BATCH_SIZE
    )  # Build the CIFAR-10 data loaders

    models = []
    architectures = []
    for architecture in tqdm(arch_dicts):
        model = train_model(  # Train a model for this architecture
            architecture,
            search_train_loader,
            search_valid_loader,
            max_epochs=MAX_EPOCHS,
            learning_rate=LEARNING_RATE,
        )
        models.append(model)
        architectures.append(architecture)
        clear_output(wait=True)

    evaluate_and_save_results(
        models, architectures, batch_size=BATCH_SIZE
    )  # Save each architecture, its CIFAR-10 test predictions, and its accuracy
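
The per-model JSON files written above can later be combined into an ensemble. What follows is a minimal, illustrative sketch rather than part of the committed script: it loads the saved model_*_results.json files and reports majority-vote ensemble accuracy, assuming the predictions were stored with shuffle=False so that they align with the CIFAR-10 test set order.

# Illustrative sketch: combine the saved per-model predictions into a majority-vote ensemble.
import json
import glob
import numpy as np
from torchvision.datasets import CIFAR10

result_files = sorted(glob.glob("results/model_*_results.json"))
results = [json.load(open(path)) for path in result_files]

# Stack predictions: one row per model, one column per CIFAR-10 test image.
preds = np.array([r["test_predictions"] for r in results])
labels = np.array(CIFAR10(root="./data", train=False, download=True).targets)

# Majority vote over the models for every test image.
vote = lambda column: np.bincount(column, minlength=10).argmax()
ensemble_pred = np.apply_along_axis(vote, axis=0, arr=preds)

print("Individual accuracies:", [round(r["test_accuracy"], 4) for r in results])
print("Majority-vote ensemble accuracy:", (ensemble_pred == labels).mean())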

paper/main.tex

Lines changed: 26 additions & 7 deletions
@@ -94,8 +94,9 @@
 %\dedication{...}
 
 \abstract{
-The automated search for optimal neural network architectures is a challenging computational problem, and Neural Ensemble Search (NES) is even more complex. In this work, we propose a surrogate-based approach to estimate ensemble diversity. Neural architectures are represented as graphs, and their predictions on a dataset serve as training data for the surrogate function. Using this method, we develop an efficient NES framework that enables the selection of diverse and high-performing architectures. The resulting ensemble achieves superior predictive accuracy on CIFAR-10 compared to other one-shot NES methods, demonstrating the effectiveness of our approach.
+The automated search for optimal neural network architectures (NAS) is a challenging computational problem, and Neural Ensemble Search (NES) is even more complex. In this work, we propose a surrogate-based approach for ensemble creation. Neural architectures are represented as graphs, and their predictions on a dataset serve as training data for the surrogate function. Using this function, we develop an efficient NES framework that enables the selection of diverse and high-performing architectures. The resulting ensemble achieves superior predictive accuracy on CIFAR-10 compared to other one-shot NES methods, demonstrating the effectiveness of our approach.
 }
+
 \keywords{NES, GCN, triplet loss, surrogate function.}
 
 
@@ -104,19 +105,37 @@
 
 \section{Introduction}
 
-Neural network ensembles often demonstrate better generalization ability compared to single models, especially in classification and regression tasks \cite{E_Ren_2016, Hansen1990}. However, the key factor for a successful ensemble is not only the number of models but also their architectural diversity and ability to complement each other. Selecting an optimal architecture for even a single model is a challenging task, particularly when considering data-specific constraints and computational limitations \cite{B_Swarup_2023}.
+Neural network ensembles often demonstrate better accuracy than single models, especially in classification and regression tasks \cite{E_Ren_2016, Hansen1990}. This fact gives rise to the problem of constructing an efficient ensemble of models, known as Neural Ensemble Search (NES) \cite{Zaidi2021}. NES, in turn, relies on Neural Architecture Search (NAS) methods, which are extensively studied and applied to the search for individual neural network architectures and include evolutionary algorithms \cite{real2017large, real2019regularized}, reinforcement learning \cite{Zoph2017, xie2018snas, Liu2023}, and Bayesian optimization \cite{jin2019auto, kandasamy2018neural}. Selecting an optimal architecture for even a single model is a challenging task, particularly when considering data-specific constraints and computational limitations \cite{B_Swarup_2023}.
 
-One approach to automating ensemble construction is Neural Ensemble Search (NES) \cite{Zaidi2021}, which aims to find the optimal combination of neural networks. NES, in turn, relies on Neural Architecture Search (NAS) methods, which are extensively studied and applied to search for individual neural network architectures \cite{Zoph2017, Baeck2018, Liu2023}. Unlike traditional NAS, which focuses on finding a single model, NES is designed to efficiently combine multiple networks into an ensemble.
+The simplest approach to ensemble construction is DeepEns \cite{lakshminarayanan2017simple}, implemented through DARTS \cite{Liu2018}: several architectures are found by random search and then combined into an ensemble. Despite its simplicity in implementation and hyperparameter tuning, this method is computationally expensive. More sophisticated techniques, designed to combine multiple networks into an ensemble efficiently, are presented in several recent works \cite{pmlr-v180-shu22a, Zaidi2021, O_Chen_2021}.
 
-Modern NAS methods widely use surrogate functions to estimate architecture quality without requiring full model training \cite{Lu2022, Lu2020}. These functions significantly reduce computational costs, which is particularly important when searching for an optimal ensemble. For example, in \cite{Lu2022}, evolutionary algorithms were proposed in combination with surrogate models.
+Our research also adapts ideas from NAS to NES, specifically the use of a surrogate function. Many modern NAS methods use surrogate functions to estimate architecture quality without requiring full model training \cite{Lu2022, Lu2020, Calisto2021}. These functions significantly reduce computational costs, expanding the applicability of such methods. For example, in \cite{Lu2022}, evolutionary algorithms were combined with surrogate models for real-time semantic segmentation. In \cite{Calisto2021}, a Surrogate-assisted Multiobjective Evolutionary-based Algorithm (SaMEA) is used for 3D medical image segmentation.
 
-In this work, we propose a method for constructing neural network ensembles using a surrogate function that accounts for both model classification accuracy and architectural diversity. Diversity is crucial because ensembles consisting of similar models often fail to provide a significant performance gain. To achieve this, we encode architectures and their predictions on the CIFAR-10 dataset into a latent space \cite{S_Xue_2024}. Based on the encoded dataset, we train a Graph Convolutional Network (GCN) \cite{Kipf2017}. We claim that ensembles constructed in this manner achieve higher accuracy compared to one-shot models, such as DARTS \cite{Liu2018}, or single models.
+In this work, we propose a method for constructing neural network ensembles using a surrogate function that accounts for both model classification accuracy and architectural diversity. Diversity is crucial because ensembles consisting of similar models often fail to provide a significant performance gain. The surrogate function encodes an architecture into a latent space \cite{S_Xue_2024} that reflects both the diversity and the predictive ability of the architectures. Since a neural network architecture is represented as a graph, using a Graph Neural Network (GNN) \cite{Kipf2017} as the surrogate function \cite{wen2020neural} is natural. To train it to predict model diversity, we use triplet loss \cite{schroff2015facenet}, similar to \cite{S_Xue_2024}. We validate this approach on CIFAR-10, demonstrating the effectiveness of the surrogate function for predicting diversity and constructing ensembles. We claim that ensembles constructed in this manner achieve state-of-the-art accuracy among one-shot NES algorithms, such as DeepEns \cite{lakshminarayanan2017simple}.
 
 Main Contributions:
 
-1) We adapt surrogate functions for ensemble construction, taking into account both predictive performance and architectural diversity.
+1) We propose a method for encoding the DARTS \cite{Liu2018} search space into a representation suitable for training a Graph Neural Network (GNN), where graph nodes correspond to operations within the network.
+
+2) We propose a method for training the surrogate function to predict the diversity of architectures.
+
+3) We adapt surrogate functions for ensemble construction, taking into account both predictive performance and architectural diversity.
+
+
+\section{Problem statement}
+
+\subsection{Neural Architecture Search}
+
+Let $\mathcal{V} = \{1, \dots, N\}$ be the set of vertices, where $N$ is the number of vertices, and let $\mathcal{E} = \{(i, j) \in \mathcal{V} \times \mathcal{V} \mid i < j \}$ be the set of edges connecting them. Furthermore, let $\mathcal{O}$ denote the set of possible operations between vertices (e.g., pooling, convolutions, etc.). For
+each edge there is an operation $o \in \mathcal{O}$ that transmits information from one node
+to another. The neural architecture search (NAS) problem can then be formulated as finding an operation $o^{(i, j)} \in \mathcal{O}$ for each edge $(i, j)$.
+
+Consider $\alpha \in \mathcal{A}$ as a parameter vector representing the operations assigned to the edges. Then the NAS problem can be formulated as:
+
+\begin{equation} \begin{aligned} & \min_{\alpha \in \mathcal{A}} \mathcal{L}_{val}(\omega^*_{\alpha}, \alpha) \\ & \text{s.t.} \quad \omega^*_{\alpha} = \arg \min_{\omega \in \mathcal{W}} \mathcal{L}_{train}(\omega, \alpha) \end{aligned} \label{eq:nas_problem} \end{equation}
+
+where $\mathcal{W}$ is the set of all possible weights associated with the operations on all potential edges of the architecture. The main challenge is the vast architecture search space (e.g., in DARTS \cite{Liu2018}, it contains approximately $10^{25}$ architectures).
 
-2) We propose a method for encoding the DARTS search space into a representation suitable for training a Graph Convolutional Network (GCN), where graph nodes correspond to operations within the network.
 
 \bibliographystyle{unsrtnat}
 
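The introduction above describes training a GNN surrogate with a triplet loss so that architecturally and behaviorally similar models sit close together in a latent space. The following is a minimal sketch of that idea, not the committed implementation: the graph encoding (a normalized adjacency matrix A_hat and one-hot operation features X), the layer sizes, and the triplet selection strategy are all illustrative assumptions.

# Illustrative sketch only: a toy GCN surrogate trained with triplet loss.
# A_hat (normalized adjacency) and X (one-hot operation features) are assumed to come
# from an encoding of the architecture graph; sizes and mining strategy are placeholders.
import torch
import torch.nn as nn
import torch.nn.functional as F


class GCNEncoder(nn.Module):
    def __init__(self, in_dim, hidden_dim=64, embed_dim=32):
        super().__init__()
        self.fc1 = nn.Linear(in_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, embed_dim)

    def forward(self, a_hat, x):
        # Two graph-convolution steps: propagate node features along A_hat.
        h = F.relu(self.fc1(a_hat @ x))
        h = self.fc2(a_hat @ h)
        # Mean-pool node embeddings into a single architecture embedding.
        return h.mean(dim=0)


encoder = GCNEncoder(in_dim=8)  # e.g., 8 candidate operations, one-hot per node (assumed)
criterion = nn.TripletMarginLoss(margin=1.0)
optimizer = torch.optim.Adam(encoder.parameters(), lr=1e-3)


def training_step(anchor, positive, negative):
    # Each argument is an (a_hat, x) pair for one architecture graph.
    # Positives are architectures with similar test predictions, negatives dissimilar ones.
    optimizer.zero_grad()
    z_a, z_p, z_n = (encoder(a, x) for a, x in (anchor, positive, negative))
    loss = criterion(z_a.unsqueeze(0), z_p.unsqueeze(0), z_n.unsqueeze(0))
    loss.backward()
    optimizer.step()
    return loss.item()

Under this setup, architectures whose stored test_predictions disagree end up far apart in the embedding space, which is the diversity signal an ensemble selection step can exploit.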