eXascaleInfolab · qnater · Sep 19, 2024 · Sep 19, 2024 · Sep 19, 2024 · Sep 19, 2024
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
diff --git a/imputegap/algorithms/__pycache__/zero_impute.cpython-38.pyc b/imputegap/algorithms/__pycache__/zero_impute.cpython-38.pyc
diff --git a/imputegap/algorithms/zero_impute.py b/imputegap/algorithms/zero_impute.py
@@ -0,0 +1,12 @@
+import numpy as np
+
+
+def zero_impute(ground_truth, contamination, params=None):
+    """
+    Template zero impute for adding your own algorithms
+
+    Replaces every NaN of the contaminated data with 0.
+
+    :param ground_truth: not used by this implementation
+    :param contamination: data containing NaN values to replace
+    :param params: not used by this implementation
+    :return: imputation matrix
+    """
+    # NOTE: with its default arguments np.nan_to_num also maps +/-inf to very
+    # large finite numbers, not only NaN to 0.
+    imputation = np.nan_to_num(contamination, nan=0)
+
+    return imputation
diff --git a/imputegap/assets/test_contamination.png b/imputegap/assets/test_contamination.png
diff --git a/imputegap/assets/test_imputation.png b/imputegap/assets/test_imputation.png
diff --git a/imputegap/contamination/README.md b/imputegap/contamination/README.md
@@ -0,0 +1,45 @@
+![My Logo](../../assets/imputegab_logo.png)
+
+# Scenarios
+<table>
+    <tr>
+        <td>N</td><td>Length of time series</td>
+    </tr>
+    <tr>
+        <td>M</td><td>Number of time series</td>
+    </tr>
+    <tr>
+        <td>R</td><td>Missing rate of the scenario</td>
+    </tr>
+    <tr>
+        <td>W</td><td>Total number of values to remove</td>
+    </tr>
+</table>
+
+### MCAR
+MCAR removes values from a random series at a random position until a total of W points across all time series are missing.
+This scenario uses a random number generator with a fixed seed and will produce the same blocks on every run.
+
+<table>
+    <tbody>Definition</tbody>
+    <tr>
+        <td>N</td><td>Selection by the user</td>
+    </tr>
+    <tr>
+        <td>M</td><td>Max</td>
+    </tr>
+    <tr>
+        <td>R</td><td>1 to 80%</td>
+    </tr>
+    <tr>
+        <td>W</td><td>N * M * R</td>
+    </tr>
+    <tbody>Details</tbody>
+    <tr>
+        <td>Starting position</td><td>1 to 15% in the beginning of the series</td>
+    </tr>
+    <tr>
+        <td>Missing blocks</td><td>1 to N-1</td>
+    </tr>
+
+ </table>
diff --git a/imputegap/contamination/__pycache__/_contamination.cpython-312.pyc b/imputegap/contamination/__pycache__/_contamination.cpython-312.pyc
diff --git a/imputegap/contamination/__pycache__/_contamination.cpython-38.pyc b/imputegap/contamination/__pycache__/_contamination.cpython-38.pyc
diff --git a/imputegap/contamination/__pycache__/contamination.cpython-38.pyc b/imputegap/contamination/__pycache__/contamination.cpython-38.pyc
diff --git a/imputegap/contamination/_contamination.py b/imputegap/contamination/_contamination.py
diff --git a/imputegap/contamination/contamination.py b/imputegap/contamination/contamination.py
@@ -0,0 +1,94 @@
+import math
+import numpy as np
+
+
+class Contamination:
+    """
+    Helpers that inject missing values (NaN) into a matrix of time series.
+
+    The methods take no ``self`` and carry no decorator; inside this file they
+    are called through the class, e.g. ``Contamination.format_selection(...)``.
+    """
+
+    def format_selection(ts, selection):
+        """
+        Format the selection of series based on keywords
+        @author Quentin Nater
+
+        Supported forms of ``selection``:
+          - empty or ``["*"]``: every series index of ``ts``
+          - first element contains "-" (e.g. ``["-2"]``): all series except the last 2
+          - first element contains "+" (e.g. ``["+2"]``): all series from index 2 onwards
+          - anything else: returned unchanged
+
+        :param selection: current selection of series
+        :param ts: dataset to contaminate (only ``ts.shape[0]`` is read here)
+        :return series_selected : list of series indices as strings, or
+            ``selection`` itself when no keyword matches
+        """
+        if not selection:
+            selection = ["*"]
+
+        if selection == ["*"]:
+            series_selected = []
+            for i in range(0, ts.shape[0]):
+                series_selected.append(str(i))
+            return series_selected
+
+        elif "-" in selection[0]:
+            series_selected = []
+            value = selection[0]
+            # the first character is dropped before parsing, so the sign is
+            # assumed to be the first character of the keyword
+            ending = int(value[1:])
+            for i in range(0, ts.shape[0] - ending):
+                series_selected.append(str(i))
+            return series_selected
+
+        elif "+" in selection[0]:
+            series_selected = []
+            value = selection[0]
+            # same convention as above: skip the leading sign character
+            starting = int(value[1:])
+            for i in range(starting, ts.shape[0]):
+                series_selected.append(str(i))
+            return series_selected
+
+        else:
+            return selection
+
+    def scenario_mcar(ts, series_impacted=0.1, missing_rate=0.1, block_size=10, protection=0.1, use_seed=True, seed=42):
+        """
+        Contaminate a copy of ``ts`` by replacing blocks of values with NaN in
+        randomly chosen series; the input itself is not modified.
+
+        :param ts: 2-dimensional dataset; the first axis indexes the series
+        :param series_impacted: fraction of the series to contaminate
+            (``ceil(M * series_impacted)`` series are drawn without replacement)
+        :param missing_rate: fraction of the unprotected part of each selected
+            series to remove
+        :param block_size: number of consecutive values removed per block
+        :param protection: fraction of values at the beginning of each series
+            that is never removed
+        :param use_seed: when True, seed NumPy's global random generator with ``seed``
+        :param seed: seed value used when ``use_seed`` is True
+        :return: the contaminated copy of ``ts``
+        :raises ValueError: if fewer than one full block would be removed from
+            a selected series
+        """
+
+        if use_seed:
+            np.random.seed(seed)
+
+        ts_contaminated = ts.copy()
+        M, _ = ts_contaminated.shape
+
+        nbr_series_impacted = int(np.ceil(M * series_impacted))
+        series_indices = [str(idx) for idx in np.random.choice(M, nbr_series_impacted, replace=False)]
+        # NOTE(review): if nbr_series_impacted is 0 the list is empty, which
+        # format_selection treats like "*", i.e. every series gets selected.
+        series_selected = Contamination.format_selection(ts_contaminated, series_indices)
+
+
+        print("\n\nMCAR contamination has been called with :"
+              "\n\ta number of series impacted ", series_impacted * 100, "%",
+              "\n\ta missing rate of ", missing_rate * 100, "%",
+              "\n\ta starting position at ", protection,
+              "\n\ta block size of ", block_size,
+              "\n\twith a seed option set to ", use_seed,
+              "\n\tshape of the set ", ts_contaminated.shape,
+              "\n\tthis selection of series", *series_selected, "\n\n")
+
+        for series in series_selected:
+            S = int(series)
+            N = len(ts_contaminated[S])  # number of values in the series
+            P = int(N * protection)  # values to protect in the beginning of the series
+            W = int((N - P) * missing_rate)  # number of data to remove
+            # integer truncation: B * block_size may be smaller than W
+            B = int(W / block_size)  # number of block to remove
+
+            if B <= 0:
+                raise ValueError("The number of block to remove must be greater than 0. "
+                                 "The dataset or the number of blocks may not be appropriate.")
+
+            # distinct starting positions of the blocks, all after the protected prefix
+            data_to_remove = np.random.choice(range(P, N), B, replace=False)
+
+            for start_point in data_to_remove:
+                for jump in range(block_size):  # remove the block size for each random position
+                    position = start_point + jump
+
+                    if position >= N:  # If block exceeds the series length
+                        position = P + (position - N)  # Wrap around to the start after protection
+
+                    # Skip values that are already missing so that each step removes a new value.
+                    # NOTE(review): this loop does not terminate if every value in [P, N) is already NaN.
+                    while np.isnan(ts_contaminated[S, position]):
+                        position = position+1
+
+                        if position >= N:  # If block exceeds the series length
+                            position = P + (position - N)  # Wrap around to the start after protection
+
+                    ts_contaminated[S, position] = np.nan
+
+        return ts_contaminated
diff --git a/imputegap/evaluation/__pycache__/_evaluation.cpython-38.pyc b/imputegap/evaluation/__pycache__/_evaluation.cpython-38.pyc
diff --git a/imputegap/evaluation/__pycache__/evaluation.cpython-38.pyc b/imputegap/evaluation/__pycache__/evaluation.cpython-38.pyc
diff --git a/imputegap/evaluation/_evaluation.py → imputegap/evaluation/evaluation.py b/imputegap/evaluation/_evaluation.py → imputegap/evaluation/evaluation.py
@@ -3,7 +3,7 @@
 from scipy.stats import pearsonr
 
 
-class EvaluationGAP:
+class Evaluation:
 
     def __init__(self, ground_truth, imputation, contamination):
         """
@@ -16,6 +16,24 @@ def __init__(self, ground_truth, imputation, contamination):
         self.imputation = imputation
         self.contamination = contamination
 
+    def metrics_computation(self):
+        """
+        Compute the metrics to express the results of the imputation based on the ground truth and the contamination set
+
+        Takes no arguments besides the instance; the values are obtained from
+        compute_rmse, compute_mae, compute_mi and compute_correlation.
+        NOTE(review): presumably these helpers read the ground truth, imputation and
+        contamination given to the constructor - confirm against their bodies.
+
+        :return: metrics, dictionary containing each metric of the imputation
+            under the keys "RMSE", "MAE", "MI" and "CORRELATION"
+        """
+        rmse = self.compute_rmse()
+        mae = self.compute_mae()
+        mi_d = self.compute_mi()
+        correlation = self.compute_correlation()
+
+        metrics = {"RMSE": rmse, "MAE": mae, "MI": mi_d, "CORRELATION": correlation}
+
+        return metrics
+
     def compute_rmse(self):
         """
         Compute the RMSE score based on the ground_truth, the imputation values and the contamination set

diff --git a/imputegap/imputation/__pycache__/_imputation.cpython-312.pyc b/imputegap/imputation/__pycache__/_imputation.cpython-312.pyc
diff --git a/imputegap/imputation/__pycache__/_imputation.cpython-38.pyc b/imputegap/imputation/__pycache__/_imputation.cpython-38.pyc
diff --git a/imputegap/imputation/__pycache__/imputation.cpython-38.pyc b/imputegap/imputation/__pycache__/imputation.cpython-38.pyc