"""
Shared and general data handling functionality.
"""

import os
import pickle
import json

import numpy as np
from sklearn.utils import shuffle


def index_make_random_shuffle(x):
    """
    Shuffle an index array.

    Args:
        x (np.array): Index array to shuffle.

    Returns:
        np.array: Shuffled index array.

    """
    return shuffle(x)


def make_random_shuffle(datalist, shuffle_ind=None):
    """
    Shuffle a list of data.

    Args:
        datalist (list): List of numpy arrays of same length (axis=0).
        shuffle_ind (np.array, optional): Predefined shuffle order; a new
            random order is generated if None. Defaults to None.

    Returns:
        allind (np.array): Index order used for the shuffle.
        outlist (list): List of the shuffled data.

    """
    datalen = len(datalist[0])  # this should be x data
    for x in datalist:
        if len(x) != datalen:
            print("Error: Data has inconsistent length")

    if shuffle_ind is None:
        allind = shuffle(np.arange(datalen))
    else:
        allind = shuffle_ind
        if len(allind) != datalen:
            print("Warning: Data length and shuffle index do not match")

    outlist = []
    for x in datalist:
        outlist.append(x[allind])
    return allind, outlist


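# A minimal usage sketch (illustrative, toy arrays): shuffle coordinates and
# energies together so that the pairs stay aligned.
#   coords = np.arange(12).reshape(4, 3)
#   energies = np.arange(4)
#   ind, (coords_s, energies_s) = make_random_shuffle([coords, energies])
#   assert np.all(coords_s == coords[ind])
#   assert np.all(energies_s == energies[ind])

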
def save_data_to_folder(x, y, target_model, mod_dir, random_shuffle):
    """
    Save all training data for a model to its folder.

    Args:
        x (np.array): Coordinates as x-data.
        y (list): A possible list of np.arrays for y-values. Energy, Gradients, NAC etc.
        target_model (str): Name of the model to save data for. Currently not
            used when writing the files.
        mod_dir (str): Path of model directory.
        random_shuffle (bool): Whether to shuffle data before saving.

    Returns:
        None.

    """
    # Save data:
    if not random_shuffle:
        with open(os.path.join(mod_dir, 'data_x'), 'wb') as f:
            pickle.dump(x, f)
        with open(os.path.join(mod_dir, 'data_y'), 'wb') as f:
            pickle.dump(y, f)
    else:
        if isinstance(y, list):
            shuffle_list = [x] + y
        else:
            shuffle_list = [x] + [y]
        # Make random shuffle
        ind_shuffle, datalist = make_random_shuffle(shuffle_list)
        x_out = datalist[0]
        if len(datalist) > 2:
            y_out = datalist[1:]
        else:
            y_out = datalist[1]
        np.save(os.path.join(mod_dir, 'shuffle_index.npy'), ind_shuffle)
        with open(os.path.join(mod_dir, 'data_x'), 'wb') as f:
            pickle.dump(x_out, f)
        with open(os.path.join(mod_dir, 'data_y'), 'wb') as f:
            pickle.dump(y_out, f)


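# A minimal usage sketch (illustrative; the directory path is a placeholder
# and must exist before calling):
#   x = np.random.rand(10, 6, 3)
#   y = [np.random.rand(10, 2), np.random.rand(10, 2, 6, 3)]
#   save_data_to_folder(x, y, "mlp_eg", "models/mlp_eg", random_shuffle=True)
# This writes the pickles 'data_x' and 'data_y', plus 'shuffle_index.npy' when
# shuffling, into the model directory.

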
def split_validation_training_index(allind, splitsize, do_offset, offset_steps):
    """
    Make a train-validation split for an index array. The validation set is
    taken from the beginning, with a possible offset.

    Args:
        allind (np.array): Index array for the full dataset.
        splitsize (int): Total number of validation samples to take.
        do_offset (bool): Whether to take the validation set not from the
            beginning but with an offset.
        offset_steps (int): Number of validation-sized blocks to skip from the
            beginning before taking the validation set.

    Returns:
        i_train (np.array): Training indices.
        i_val (np.array): Validation indices.

    """
    i = offset_steps
    lval = splitsize
    if not do_offset:
        i_val = allind[:lval]
        i_train = allind[lval:]
    else:
        i_val = allind[i * lval:(i + 1) * lval]
        i_train = np.concatenate([allind[0:i * lval], allind[(i + 1) * lval:]], axis=0)
    if len(i_val) <= 0:
        print("Warning: Validation set is empty, taking one training sample instead")
        i_val = i_train[:1]

    return i_train, i_val


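# A minimal usage sketch (illustrative): with do_offset=True the validation
# block is shifted by offset_steps, which allows k-fold-like splits.
#   allind = np.arange(10)
#   i_train, i_val = split_validation_training_index(allind, 2, True, 1)
#   # i_val is [2, 3]; i_train is [0, 1, 4, 5, 6, 7, 8, 9]

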
def merge_np_arrays_in_chunks(data1, data2, split_size):
    """
    Merge data in chunks of relative size split_size. The goal is to keep the
    validation k-splits intact for the fit.

    Idea: [a+a+a] + [b+b+b] = [(a+b)+(a+b)+(a+b)] and NOT [a+a+a+b+b+b].

    Args:
        data1 (np.array): Data to merge.
        data2 (np.array): Data to merge.
        split_size (float): Relative size of chunks, 0 < split_size < 1.

    Returns:
        np.array: Merged data.

    """
    pacs1 = int(len(data1) * split_size)
    pacs2 = int(len(data2) * split_size)

    data1frac = [data1[i * pacs1:(i + 1) * pacs1] for i in range(int(np.ceil(1 / split_size)))]
    data2frac = [data2[i * pacs2:(i + 1) * pacs2] for i in range(int(np.ceil(1 / split_size)))]

    # Interleave: append the matching chunk of data2 to each chunk of data1.
    for i in range(len(data1frac)):
        data1frac[i] = np.concatenate([data1frac[i], data2frac[i]], axis=0)

    return np.concatenate(data1frac, axis=0)


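# A small worked example (illustrative): with split_size=0.5 each array is cut
# into two halves and the halves are interleaved.
#   a = np.array([1, 1, 2, 2])
#   b = np.array([3, 3, 4, 4])
#   merge_np_arrays_in_chunks(a, b, 0.5)
#   # -> array([1, 1, 3, 3, 2, 2, 4, 4])

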
def model_make_random_shuffle(x, y, shuffle_ind):
    """
    Shuffle x- and y-data according to a given index order.

    Args:
        x (np.array): Coordinates as x-data.
        y (list): A possible list of np.arrays for y-values. Energy, Gradients, NAC etc.
        shuffle_ind (np.array): Index order of datapoints in the dataset to shuffle after.

    Returns:
        x_shuffled (np.array): Shuffled x-data.
        y_shuffled (list, np.array): Shuffled y-data, matching the type of y.

    """
    if isinstance(y, list):
        _, temp = make_random_shuffle([x] + y, shuffle_ind)
        return temp[0], temp[1:]
    else:
        return make_random_shuffle([x, y], shuffle_ind)[1]


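# A minimal usage sketch (illustrative): re-apply a stored shuffle order, e.g.
# the 'shuffle_index.npy' written by save_data_to_folder, to fresh data.
#   ind = np.array([2, 0, 1])
#   x = np.arange(9).reshape(3, 3)
#   y = [np.arange(3), np.arange(3) * 2.0]
#   x_s, y_s = model_make_random_shuffle(x, y, ind)
#   # x_s == x[ind]; each array in y_s is reordered the same way

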
def model_merge_data_in_chunks(mx1, my1, mx2, my2, val_split=0.1):
    """
    Merge two datasets in chunks, keeping validation splits intact.

    Args:
        mx1 (list, np.array): Coordinates as x-data.
        my1 (list, np.array): A possible list of np.arrays for y-values. Energy, Gradients, NAC etc.
        mx2 (list, np.array): Coordinates as x-data.
        my2 (list, np.array): A possible list of np.arrays for y-values. Energy, Gradients, NAC etc.
        val_split (float, optional): Validation split. Defaults to 0.1.

    Returns:
        x: Merged x-data.
        y: Merged y-data, matching the type of my1/my2.

    """
    x_merge = merge_np_arrays_in_chunks(mx1, mx2, val_split)
    if isinstance(my1, list) and isinstance(my2, list):
        y_merge = [merge_np_arrays_in_chunks(my1[i], my2[i], val_split) for i in range(len(my1))]
    else:
        y_merge = merge_np_arrays_in_chunks(my1, my2, val_split)
    return x_merge, y_merge


def model_save_data_to_folder(x, y,
                              target_model,
                              mod_dir,
                              random_shuffle=False):
    """
    Save data to the model folder. Always dumps data_x and data_y as pickle.

    Args:
        x (np.array): Coordinates as x-data.
        y (list): A possible list of np.arrays for y-values. Energy, Gradients, NAC etc.
        target_model (str): Name of the model to save data for.
        mod_dir (str): Path of model directory.
        random_shuffle (bool, optional): Whether to shuffle data before saving. The default is False.

    Returns:
        None.

    """
    return save_data_to_folder(x, y, target_model, mod_dir, random_shuffle)


def save_hyp(hyperparameter, filepath):
    """
    Save hyper-parameters as a json dict.

    Args:
        hyperparameter (dict): Dictionary of hyper-parameters to save.
        filepath (str): Path of the json file to write.

    Returns:
        None.

    """
    with open(filepath, 'w') as f:
        json.dump(hyperparameter, f)


def load_hyp(filepath):
    """
    Load hyper-parameters from filepath.

    Args:
        filepath (str): Path of the json file to read.

    Returns:
        dict: Hyper-parameters.

    """
    with open(filepath, 'r') as f:
        return json.load(f)
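

# A minimal usage sketch (illustrative) of the json round trip:
#   save_hyp({'learning_rate': 1e-3, 'epochs': 100}, 'hyper.json')
#   hyp = load_hyp('hyper.json')
#   # hyp == {'learning_rate': 0.001, 'epochs': 100}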