Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 16 additions & 15 deletions .idea/workspace.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Binary file not shown.
12 changes: 12 additions & 0 deletions imputegap/algorithms/zero_impute.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import numpy as np


def zero_impute(ground_truth, contamination, params=None):
    """
    Template zero imputation, to be used as a starting point for adding your own algorithms.

    Every missing value (NaN) of the contaminated series is replaced by 0; all other values,
    including infinities, are returned unchanged. The input is not modified.

    :param ground_truth: original time series without contamination (not used by this algorithm)
    :param contamination: time series with contamination, missing values are encoded as NaN
    :param params: optional parameters of the algorithm (not used by this algorithm)
    :return: imputation matrix, copy of the contamination with NaN replaced by 0
    """
    # posinf/neginf are pinned explicitly: by default nan_to_num would also rewrite
    # infinities to the largest finite floats, which is not part of a zero imputation.
    imputation = np.nan_to_num(contamination, nan=0.0, posinf=np.inf, neginf=-np.inf)

    return imputation
Binary file modified imputegap/assets/test_contamination.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified imputegap/assets/test_imputation.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
45 changes: 45 additions & 0 deletions imputegap/contamination/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
![My Logo](../../assets/imputegab_logo.png)

# Scenarios
<table>
<tr>
<td>N</td><td>Length of time series</td>
</tr>
<tr>
<td>M</td><td>Number of time series</td>
</tr>
<tr>
<td>R</td><td>Missing rate of the scenario</td>
</tr>
<tr>
<td>W</td><td>Total number of values to remove</td>
</tr>
</table>

### MCAR
MCAR removes blocks of values from randomly selected series at random positions until a total of W points of the time series are missing.
This scenario uses a random number generator with a fixed seed, so it produces the same blocks on every run.

<table>
<tr><th colspan="2">Definition</th></tr>
<tr>
<td>N</td><td>Selection by the user</td>
</tr>
<tr>
<td>M</td><td>Max</td>
</tr>
<tr>
<td>R</td><td>1 to 80%</td>
</tr>
<tr>
<td>W</td><td>N * M * R</td>
</tr>
<tr><th colspan="2">Details</th></tr>
<tr>
<td>Starting position</td><td>1 to 15% in the beginning of the series</td>
</tr>
<tr>
<td>Missing blocks</td><td>1 to N-1</td>
</tr>

</table>
Binary file not shown.
Binary file not shown.
Binary file not shown.
110 changes: 0 additions & 110 deletions imputegap/contamination/_contamination.py

This file was deleted.

94 changes: 94 additions & 0 deletions imputegap/contamination/contamination.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import math
import numpy as np


class Contamination:
    """
    Contamination scenarios that inject missing values (NaN) into a set of time series.

    The methods do not use any instance state; they are static so they can be called
    either on the class or on an instance.
    """

    @staticmethod
    def format_selection(ts, selection):
        """
        Format the selection of series based on keywords
        @author Quentin Nater

        Keywords are read from the first element of the selection:
            empty selection or ["*"] : every series
            ["-k"]                   : every series except the last k
            ["+k"]                   : every series from index k to the end
        Any other selection is returned unchanged.

        :param ts: dataset to contaminate, the number of series is read from ts.shape[0]
        :param selection: current selection of series
        :return series_selected : correct format of selection series (indices as strings)
        """
        if not selection:
            selection = ["*"]

        nbr_series = ts.shape[0]

        if selection == ["*"]:
            return [str(i) for i in range(nbr_series)]

        keyword = selection[0]

        if "-" in keyword:
            ending = int(keyword[1:])
            return [str(i) for i in range(nbr_series - ending)]

        if "+" in keyword:
            starting = int(keyword[1:])
            return [str(i) for i in range(starting, nbr_series)]

        return selection

    @staticmethod
    def scenario_mcar(ts, series_impacted=0.1, missing_rate=0.1, block_size=10, protection=0.1, use_seed=True, seed=42):
        """
        Contaminate a copy of the dataset with blocks of missing values (NaN) placed at random positions.

        For each randomly selected series, the first part of the series is protected and
        blocks of block_size values are removed from the remaining part, wrapping around to
        the end of the protected zone when a block runs past the end of the series.

        :param ts: 2D dataset to contaminate (series on the first axis), it is not modified
        :param series_impacted: fraction of the series to contaminate (rounded up to a whole number of series)
        :param missing_rate: fraction of the unprotected values to remove in each selected series
        :param block_size: number of values removed for each random starting position
        :param protection: fraction of values protected at the beginning of each series
        :param use_seed: if True, the global NumPy random generator is seeded with seed
        :param seed: seed value used when use_seed is True
        :return: ts_contaminated, copy of the dataset with the removed values set to NaN
        :raises ValueError: if block_size is not positive, if no block can be removed, or if a
            series does not hold enough non-missing values after the protected zone
        """
        if block_size <= 0:
            raise ValueError("The block size must be greater than 0.")

        if use_seed:
            np.random.seed(seed)

        ts_contaminated = ts.copy()
        M, _ = ts_contaminated.shape

        nbr_series_impacted = int(np.ceil(M * series_impacted))
        series_indices = [str(idx) for idx in np.random.choice(M, nbr_series_impacted, replace=False)]

        # format_selection reads an empty selection as "every series"; when no series has
        # been drawn, nothing must be contaminated, so the formatting is bypassed.
        if series_indices:
            series_selected = Contamination.format_selection(ts_contaminated, series_indices)
        else:
            series_selected = []

        print("\n\nMCAR contamination has been called with :"
              "\n\ta number of series impacted ", series_impacted * 100, "%",
              "\n\ta missing rate of ", missing_rate * 100, "%",
              "\n\ta starting position at ", protection,
              "\n\ta block size of ", block_size,
              "\n\twith a seed option set to ", use_seed,
              "\n\tshape of the set ", ts_contaminated.shape,
              "\n\tthis selection of series", *series_selected, "\n\n")

        for series in series_selected:
            S = int(series)
            N = len(ts_contaminated[S])  # number of values in the series
            P = int(N * protection)  # values to protect in the beginning of the series
            W = int((N - P) * missing_rate)  # number of data to remove
            B = W // block_size  # number of block to remove

            if B <= 0:
                raise ValueError("The number of block to remove must be greater than 0. "
                                 "The dataset or the number of blocks may not be appropriate.")

            # Each removal needs a value that is not already NaN; without enough of them the
            # search loop below would never terminate, so fail early instead.
            available = int(np.count_nonzero(~np.isnan(ts_contaminated[S, P:N])))
            if B * block_size > available:
                raise ValueError("Not enough non-missing values after the protected zone to remove "
                                 "the requested number of values.")

            data_to_remove = np.random.choice(range(P, N), B, replace=False)

            for start_point in data_to_remove:
                for jump in range(block_size):  # remove the block size for each random position
                    position = start_point + jump

                    if position >= N:  # If block exceeds the series length
                        position = P + (position - N)  # Wrap around to the start after protection

                    # Skip values already removed by a previous block so that exactly
                    # B * block_size values end up missing.
                    while np.isnan(ts_contaminated[S, position]):
                        position = position + 1

                        if position >= N:  # If block exceeds the series length
                            position = P + (position - N)  # Wrap around to the start after protection

                    ts_contaminated[S, position] = np.nan

        return ts_contaminated
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from scipy.stats import pearsonr


class EvaluationGAP:
class Evaluation:

def __init__(self, ground_truth, imputation, contamination):
"""
Expand All @@ -16,6 +16,24 @@ def __init__(self, ground_truth, imputation, contamination):
self.imputation = imputation
self.contamination = contamination

def metrics_computation(self):
    """
    Gather every score of the imputation into a single dictionary.

    Each score is produced by the matching compute_* method of this instance, called in
    the order RMSE, MAE, MI, correlation.

    :return: metrics, dictionary with the keys "RMSE", "MAE", "MI" and "CORRELATION"
    """
    return {
        "RMSE": self.compute_rmse(),
        "MAE": self.compute_mae(),
        "MI": self.compute_mi(),
        "CORRELATION": self.compute_correlation(),
    }

def compute_rmse(self):
"""
Compute the RMSE score based on the ground_truth, the imputation values and the contamination set
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading
Loading