Lab3 - KNN Classifier #3

Open · wants to merge 2 commits into main
29 changes: 29 additions & 0 deletions KNNClassifier.py
@@ -0,0 +1,29 @@
import numpy as np


class KNNClassifier:
    def __init__(self, k: int = 3) -> None:
        # Number of neighbors used for majority voting
        self.k: int = k
        self.x_train: np.ndarray = np.array([])
        self.y_train: np.ndarray = np.array([])

    def fit(self, x: np.ndarray, y: np.ndarray) -> None:
        # kNN is a lazy learner: fitting just stores the training data
        self.x_train = x
        self.y_train = y

    def predict(self, x: np.ndarray, y: np.ndarray | None = None) -> np.ndarray | tuple[np.ndarray, float]:
        predictions = []
        for i in range(x.shape[0]):
            # Euclidean distance from the query point to every training sample
            distances = np.linalg.norm(self.x_train - x[i], axis=1)
            # Indices of the k nearest training samples
            k_indices = np.argsort(distances)[:self.k]
            k_labels = self.y_train[k_indices]
            # Majority vote among the k nearest labels
            prediction = np.bincount(k_labels).argmax()
            predictions.append(prediction)
        predictions = np.array(predictions)
        if y is None:
            return predictions
        # Error rate: fraction of predictions that disagree with the ground truth
        return predictions, float(np.mean(predictions != y))
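
A minimal usage sketch for the class above (the toy arrays and expected outputs are illustrative only):

```python
import numpy as np
from KNNClassifier import KNNClassifier

# Toy data: four 2-D points in two classes
x_train = np.array([[0.0, 0.0], [0.1, 0.2], [1.0, 1.0], [0.9, 1.1]])
y_train = np.array([0, 0, 1, 1])
x_test = np.array([[0.05, 0.1], [1.05, 0.95]])

knn = KNNClassifier(k=3)
knn.fit(x_train, y_train)
preds = knn.predict(x_test)                          # array([0, 1])
preds, err = knn.predict(x_test, np.array([0, 1]))   # also returns the error rate
```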
85 changes: 84 additions & 1 deletion README.md
@@ -1 +1,84 @@
# MachineLearning
# Lab 3 – k-Nearest Neighbors (kNN) Classifier

This branch contains the implementation and report for Lab 3: kNN Classifier, assigned as part of the Machine Learning course in the Robotics Engineering program.

*🧭 Assignment Objectives*

- Load and preprocess a suitable classification dataset

- Implement a flexible k-Nearest Neighbors (kNN) classifier

- Evaluate classifier performance across various settings and configurations

- Generate meaningful statistics and visualize results

*📁 Task 1 - Dataset Options*

🔹Large Option – MNIST Handwritten Digits

- 70,000 grayscale images (28×28 pixels) of handwritten digits (0–9)

- Pre-split into:

- 60,000 training samples

- 10,000 test samples


🔸Smaller Option – Wine Dataset

- 178 observations, 13 chemical descriptors

- 3 wine classes
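
Both options can be loaded through scikit-learn; a minimal sketch (`fetch_openml` downloads MNIST on first use, and `as_frame=False` assumes a recent scikit-learn):

```python
from sklearn.datasets import load_wine, fetch_openml

# Smaller option: Wine (178 samples x 13 features, 3 classes)
wine = load_wine()
X_wine, y_wine = wine.data, wine.target

# Large option: MNIST (70,000 samples x 784 pixels); labels arrive as strings
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X, y = mnist.data, mnist.target.astype(int)

# Conventional pre-split: first 60,000 train, last 10,000 test
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]
```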


*⚙️ Task 2 – Implementing the kNN Classifier*

- Create a function with the following inputs:

- train_X: (n × d) training data

- train_y: (n × 1) training labels

- test_X: (m × d) test data

- k: number of neighbors

- (Optional) test_y: (m × 1) ground truth labels

- The function should (a sketch follows this list):

- Validate that all mandatory arguments are provided (the assignment's `nargin` check)

- Ensure training/test dimensions match

- Check that k > 0 and k ≤ n

- Classify test data according to the kNN rule

- If ground truth is provided, compute and return error rate
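
A sketch of the requested interface with these checks in place (Python has no `nargin`, so the optional ground truth defaults to `None`; `knn_classify` is an illustrative name):

```python
import numpy as np

def knn_classify(train_X, train_y, test_X, k, test_y=None):
    # Dimension checks: train and test must share the feature dimension d,
    # and there must be one label per training row
    if train_X.shape[1] != test_X.shape[1]:
        raise ValueError("train_X and test_X must have the same number of columns")
    if len(train_y) != train_X.shape[0]:
        raise ValueError("train_y must have one label per training sample")
    # k must be a positive integer no larger than the number of training samples
    if not (0 < k <= train_X.shape[0]):
        raise ValueError("k must satisfy 0 < k <= n")

    preds = np.empty(test_X.shape[0], dtype=train_y.dtype)
    for i, x in enumerate(test_X):
        nearest = np.argsort(np.linalg.norm(train_X - x, axis=1))[:k]
        preds[i] = np.bincount(train_y[nearest]).argmax()

    if test_y is None:
        return preds
    return preds, float(np.mean(preds != test_y))
```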

*📊 Task 3 – Evaluation and Testing*

- MNIST Evaluation:
Perform binary classification:

- For each digit (0–9), classify it against the remaining nine (e.g., "is it a 1?" vs. "not a 1")

- Use multiple k values: k = 1, 2, 3, 4, 5, 10, 15, 20, 30, 40, 50

- Suggestion: avoid k values divisible by the number of classes to reduce ties

✅ For each configuration:

- Compute confusion matrix

- Derive metrics: accuracy, precision, recall, F1-score (see the sketch after this list)

- Aggregate results (e.g., average and standard deviation or interquartile range)
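
In the binary (one-vs-all) setting these metrics reduce to ratios of the four confusion-matrix cells; a small sketch (`binary_metrics` is an illustrative helper, assuming rows index the true class and class 1 is the positive class):

```python
import numpy as np

def binary_metrics(cm: np.ndarray) -> dict:
    # cm is 2x2: rows = true class, columns = predicted class
    tn, fp, fn, tp = cm.ravel()
    accuracy = (tp + tn) / cm.sum()
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}
```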

📈 Present results in:

- Plots (e.g., accuracy vs k)

- Tables summarizing classification performance for each digit and k value
179 changes: 179 additions & 0 deletions main.py
@@ -0,0 +1,179 @@
import sys
import numpy as np
import KNNClassifier
from tabulate import tabulate
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split


def confusion_matrix(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
    # Rows index the true class, columns the predicted class
    num_classes = max(y_true.max(), y_pred.max()) + 1
    cm = np.zeros((num_classes, num_classes), dtype=int)
    for i in range(len(y_true)):
        cm[y_true[i], y_pred[i]] += 1
    return cm


def precision(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
    # Per-class precision: diagonal over column sums, i.e. TP / (TP + FP)
    cm = confusion_matrix(y_true, y_pred)
    return np.diag(cm) / np.sum(cm, axis=0)


def recall(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
    # Per-class recall: diagonal over row sums, i.e. TP / (TP + FN)
    cm = confusion_matrix(y_true, y_pred)
    return np.diag(cm) / np.sum(cm, axis=1)


def f1_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    # Macro F1: harmonic mean of the mean precision and mean recall
    precision_val = np.nanmean(precision(y_true, y_pred))
    recall_val = np.nanmean(recall(y_true, y_pred))
    denominator = precision_val + recall_val
    if denominator == 0:
        return 0.0
    return 2 * (precision_val * recall_val) / denominator


if __name__ == "__main__":

    wine = load_wine()

    x_train, x_test, y_train, y_test = train_test_split(
        wine.data, wine.target, test_size=0.2, random_state=42
    )

    # Min-max normalization; use the training minima/maxima for both sets
    # so that no information leaks from the test set
    train_min = x_train.min(axis=0)
    train_max = x_train.max(axis=0)
    x_train = (x_train - train_min) / (train_max - train_min)
    x_test = (x_test - train_min) / (train_max - train_min)

    # Values of k to test
    k: list[int] = [1, 2, 3, 4, 5, 10, 15, 20, 30, 40, 50]

    # Emulate the assignment's nargin check: all mandatory arguments must be set
    if any(arg is None for arg in (x_train, y_train, x_test, y_test, k)):
        raise ValueError("All arguments must be provided")

    classes = np.unique(y_train)
    for k_val in k:
        assert isinstance(k_val, int), "k must be an integer"
        assert 0 < k_val <= x_train.shape[0], "k must be between 1 and the number of training samples"

    # Accuracy results storage: one list per (class, k) pair
    accuracy_results: dict = {cls: {k_val: [] for k_val in k} for cls in classes}

    # Per-class summaries across all k values (aad = average absolute deviation)
    mean_val_prec: list = []
    median_val_prec: list = []
    aad_val_prec: list = []

    mean_val_rec: list = []
    median_val_rec: list = []
    aad_val_rec: list = []

    mean_val_f1: list = []
    median_val_f1: list = []
    aad_val_f1: list = []

    first_quantile_prec: list = []
    third_quantile_prec: list = []

    first_quantile_rec: list = []
    third_quantile_rec: list = []

    first_quantile_f1: list = []
    third_quantile_f1: list = []

    for cls in classes:
        # Create one-vs-all labels for training and testing
        binary_y_train = np.where(y_train == cls, 1, 0)
        binary_y_test = np.where(y_test == cls, 1, 0)

        precision_val: list = []
        recall_val: list = []
        f1_score_val: list = []

        for neighbors in k:
            # Create a k-NN classifier
            kNN = KNNClassifier.KNNClassifier(k=neighbors)
            # Train on binary labels
            kNN.fit(x_train, binary_y_train)
            # Predict on the test set; pass the binary labels so the returned
            # error rate is measured against the same one-vs-all encoding
            y_pred, error_rate = kNN.predict(x_test, binary_y_test)
            print(f"Class {cls}, k={neighbors}, Error rate: {error_rate}")

            # Compute accuracy
            accuracy = np.mean(y_pred == binary_y_test)
            accuracy_results[cls][neighbors].append(accuracy)

            # Compute the confusion matrix, then derive precision, recall, and F1
            conf_matrix: np.ndarray = confusion_matrix(binary_y_test, y_pred)
            precision_val.append(precision(binary_y_test, y_pred))
            recall_val.append(recall(binary_y_test, y_pred))
            f1_score_val.append(f1_score(binary_y_test, y_pred))

        # Compute summary statistics of precision, recall, and F1 over all k values
        mean_val_prec.append(np.mean(precision_val))
        median_val_prec.append(np.median(precision_val))
        aad_val_prec.append(np.mean(np.abs(np.array(precision_val) - mean_val_prec[-1])))
        first_quantile_prec.append(np.percentile(precision_val, 25))
        third_quantile_prec.append(np.percentile(precision_val, 75))

        mean_val_rec.append(np.mean(recall_val))
        median_val_rec.append(np.median(recall_val))
        aad_val_rec.append(np.mean(np.abs(np.array(recall_val) - mean_val_rec[-1])))
        first_quantile_rec.append(np.percentile(recall_val, 25))
        third_quantile_rec.append(np.percentile(recall_val, 75))

        mean_val_f1.append(np.mean(f1_score_val))
        median_val_f1.append(np.median(f1_score_val))
        aad_val_f1.append(np.mean(np.abs(np.array(f1_score_val) - mean_val_f1[-1])))
        first_quantile_f1.append(np.percentile(f1_score_val, 25))
        third_quantile_f1.append(np.percentile(f1_score_val, 75))

        # Print a summary table for this class
        headers = ["Metric", "Mean", "Median", "AAD", "1st Quartile", "3rd Quartile"]
        table_data = [
            ["Precision", mean_val_prec[-1], median_val_prec[-1], aad_val_prec[-1],
             first_quantile_prec[-1], third_quantile_prec[-1]],
            ["Recall", mean_val_rec[-1], median_val_rec[-1], aad_val_rec[-1],
             first_quantile_rec[-1], third_quantile_rec[-1]],
            ["F1 Score", mean_val_f1[-1], median_val_f1[-1], aad_val_f1[-1],
             first_quantile_f1[-1], third_quantile_f1[-1]],
        ]
        print(f"Class {cls} statistics over all k values:")
        print(tabulate(table_data, headers=headers, tablefmt="grid"))

    # Print the aggregate table over all classes
    headers = ["Metric", "Mean", "Median", "AAD", "1st Quartile", "3rd Quartile"]
    total_data = [
        ["Precision", np.mean(mean_val_prec), np.median(mean_val_prec), np.mean(aad_val_prec),
         np.percentile(mean_val_prec, 25), np.percentile(mean_val_prec, 75)],
        ["Recall", np.mean(mean_val_rec), np.median(mean_val_rec), np.mean(aad_val_rec),
         np.percentile(mean_val_rec, 25), np.percentile(mean_val_rec, 75)],
        ["F1 Score", np.mean(mean_val_f1), np.median(mean_val_f1), np.mean(aad_val_f1),
         np.percentile(mean_val_f1, 25), np.percentile(mean_val_f1, 75)],
    ]
    print("Total statistics for all classes:")
    print(tabulate(total_data, headers=headers, tablefmt="grid"))

    # Plot results: mean accuracy vs. k for each one-vs-all classifier
    plt.figure(figsize=(12, 8))
    for cls in classes:
        plt.plot(k, [100 * np.mean(acc) for acc in accuracy_results[cls].values()],
                 label=f'Class {cls}', alpha=0.4, marker='o', markersize=5, linewidth=2)
    plt.xlabel('Number of Neighbors (k)')
    plt.ylabel('Accuracy (%)')
    plt.title('k-NN Accuracy for Each Class vs. All Others')
    plt.legend(loc='best')
    plt.grid(True, linestyle='--', alpha=0.15)
    plt.xticks(k)
    plt.show()