Commit cbaeacc

refactored
1 parent 01b42ec commit cbaeacc


49 files changed: +3882 −4233 lines changed

pyNNsMD/datasets/general.py

Lines changed: 248 additions & 0 deletions
@@ -0,0 +1,248 @@
"""
Shared and general data handling functionality.
"""

import os
import pickle
import json

import numpy as np
from sklearn.utils import shuffle


def index_make_random_shuffle(x):
    """
    Shuffle an index array.

    Args:
        x (np.array): Index to shuffle.

    Returns:
        np.array: Shuffled index.

    """
    return shuffle(x)


def make_random_shuffle(datalist, shuffle_ind=None):
    """
    Shuffle a list of data.

    Args:
        datalist (list): List of numpy arrays of same length (axis=0).
        shuffle_ind (np.array): Array of shuffled indices.

    Returns:
        allind (np.array): Index order used for the shuffle.
        outlist (list): List of the shuffled data.

    """
    datalen = len(datalist[0])  # this should be x data
    for x in datalist:
        if len(x) != datalen:
            print("Error: Data has inconsistent length")

    if shuffle_ind is None:
        allind = shuffle(np.arange(datalen))
    else:
        allind = shuffle_ind
        if len(allind) != datalen:
            print("Warning: Data length and shuffle index do not match")

    outlist = []
    for x in datalist:
        outlist.append(x[allind])
    return allind, outlist


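# Usage sketch (shapes are illustrative, not fixed by this module): shuffle
# coordinates and energies together so rows stay aligned, then re-apply the
# same order to another array via shuffle_ind.
#
#     coords = np.zeros((100, 12, 3))    # e.g. 100 geometries, 12 atoms
#     energies = np.zeros((100, 2))      # e.g. two electronic states
#     ind, (coords_s, energies_s) = make_random_shuffle([coords, energies])
#     _, (coords_again,) = make_random_shuffle([coords], shuffle_ind=ind)
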
def save_data_to_folder(x, y, target_model, mod_dir, random_shuffle):
    """
    Save all training data for a model to its folder.

    Args:
        x (np.array): Coordinates as x-data.
        y (list): A possible list of np.arrays for y-values. Energy, Gradients, NAC etc.
        target_model (str): Name of the model to save data for.
        mod_dir (str): Path of model directory.
        random_shuffle (bool): Whether to shuffle data before saving.

    Returns:
        None.

    """
    # Note: target_model is currently unused; files are always written as 'data_x'/'data_y'.
    # Save data:
    if not random_shuffle:
        with open(os.path.join(mod_dir, 'data_x'), 'wb') as f:
            pickle.dump(x, f)
        with open(os.path.join(mod_dir, 'data_y'), 'wb') as f:
            pickle.dump(y, f)
    else:
        if isinstance(y, list):
            shuffle_list = [x] + y
        else:
            shuffle_list = [x] + [y]
        # Make random shuffle
        ind_shuffle, datalist = make_random_shuffle(shuffle_list)
        x_out = datalist[0]
        if len(datalist) > 2:
            y_out = datalist[1:]
        else:
            y_out = datalist[1]
        np.save(os.path.join(mod_dir, 'shuffle_index.npy'), ind_shuffle)
        with open(os.path.join(mod_dir, 'data_x'), 'wb') as f:
            pickle.dump(x_out, f)
        with open(os.path.join(mod_dir, 'data_y'), 'wb') as f:
            pickle.dump(y_out, f)


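# Usage sketch (paths and shapes are illustrative):
#
#     x = np.zeros((100, 12, 3))
#     y = [np.zeros((100, 2)), np.zeros((100, 2, 12, 3))]   # e.g. energies, gradients
#     save_data_to_folder(x, y, "mlp_eg", ".", random_shuffle=True)
#     # writes pickled 'data_x' and 'data_y' plus 'shuffle_index.npy' into mod_dir
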
def split_validation_training_index(allind, splitsize, do_offset, offset_steps):
    """
    Make a train-validation split of an index array. The validation set is taken from the beginning, with an optional offset.

    Args:
        allind (np.array): Index array for the full dataset.
        splitsize (int): Total number of validation samples to take.
        do_offset (bool): Whether to take the validation set not from the beginning but with an offset.
        offset_steps (int): Number of validation-set sizes to offset from the beginning before taking the validation set.

    Returns:
        i_train (np.array): Training indices.
        i_val (np.array): Validation indices.

    """
    i = offset_steps
    lval = splitsize
    if not do_offset:
        i_val = allind[:lval]
        i_train = allind[lval:]
    else:
        i_val = allind[i * lval:(i + 1) * lval]
        i_train = np.concatenate([allind[0:i * lval], allind[(i + 1) * lval:]], axis=0)
    if len(i_val) <= 0:
        print("Warning: Validation set is empty, taking 1 training sample instead")
        i_val = i_train[:1]

    return i_train, i_val


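# Usage sketch: a k-fold style loop where offset_steps selects which block of
# indices becomes the validation set (numbers are illustrative).
#
#     allind = index_make_random_shuffle(np.arange(100))
#     for fold in range(5):
#         i_train, i_val = split_validation_training_index(
#             allind, splitsize=20, do_offset=True, offset_steps=fold)
#         # each fold validates on a different block of 20 samples
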
def merge_np_arrays_in_chunks(data1, data2, split_size):
    """
    Merge data in chunks of split-size. The goal is to keep validation k-splits intact for the fit.

    Idea: [a+a+a] + [b+b+b] = [(a+b)+(a+b)+(a+b)] and NOT [a+a+a+b+b+b].

    Args:
        data1 (np.array): Data to merge.
        data2 (np.array): Data to merge.
        split_size (float): Relative size of chunks, 0 < split_size < 1.

    Returns:
        np.array: Merged data.

    """
    pacs1 = int(len(data1) * split_size)
    pacs2 = int(len(data2) * split_size)

    data1frac = [data1[i * pacs1:(i + 1) * pacs1] for i in range(int(np.ceil(1 / split_size)))]
    data2frac = [data2[i * pacs2:(i + 1) * pacs2] for i in range(int(np.ceil(1 / split_size)))]

    for i in range(len(data1frac)):
        data1frac[i] = np.concatenate([data1frac[i], data2frac[i]], axis=0)

    return np.concatenate(data1frac, axis=0)


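# Worked example of the interleaving idea with split_size=0.5, i.e.
# ceil(1/0.5) = 2 chunks per array:
#
#     a = np.array([1, 1, 2, 2])
#     b = np.array([3, 3, 4, 4])
#     merge_np_arrays_in_chunks(a, b, 0.5)
#     # -> array([1, 1, 3, 3, 2, 2, 4, 4])
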
def model_make_random_shuffle(x, y, shuffle_ind):
    """
    Shuffle x and y data according to a given index order.

    Args:
        x (np.array): Coordinates as x-data.
        y (list): A possible list of np.arrays for y-values. Energy, Gradients, NAC etc.
        shuffle_ind (np.array): Index order of datapoints in the dataset to shuffle after.

    Returns:
        Shuffled x and y data.

    """
    if isinstance(y, list):
        _, temp = make_random_shuffle([x] + y, shuffle_ind)
        return temp[0], temp[1:]
    else:
        return make_random_shuffle([x, y], shuffle_ind)[1]


def model_merge_data_in_chunks(mx1, my1, mx2, my2, val_split=0.1):
    """
    Merge data in chunks.

    Args:
        mx1 (list,np.array): Coordinates as x-data.
        my1 (list,np.array): A possible list of np.arrays for y-values. Energy, Gradients, NAC etc.
        mx2 (list,np.array): Coordinates as x-data.
        my2 (list,np.array): A possible list of np.arrays for y-values. Energy, Gradients, NAC etc.
        val_split (float, optional): Validation split. Defaults to 0.1.

    Returns:
        x: Merged x data.
        y: Merged y data.

    """
    if isinstance(my1, list) and isinstance(my2, list):
        x_merge = merge_np_arrays_in_chunks(mx1, mx2, val_split)
        y_merge = [merge_np_arrays_in_chunks(my1[i], my2[i], val_split) for i in range(len(my1))]
        return x_merge, y_merge
    else:
        x_merge = merge_np_arrays_in_chunks(mx1, mx2, val_split)
        y_merge = merge_np_arrays_in_chunks(my1, my2, val_split)
        return x_merge, y_merge


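# Usage sketch with illustrative names: merging two datasets whose y-values
# are lists (e.g. energies and gradients); each list entry is merged chunk-wise.
#
#     x_new, y_new = model_merge_data_in_chunks(x1, [e1, g1], x2, [e2, g2], val_split=0.1)
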
def model_save_data_to_folder(x, y,
                              target_model,
                              mod_dir,
                              random_shuffle=False):
    """
    Save data to the model folder. Always dumps data_x and data_y as pickle.

    Args:
        x (np.array): Coordinates as x-data.
        y (list): A possible list of np.arrays for y-values. Energy, Gradients, NAC etc.
        target_model (str): Name of the model to save data for.
        mod_dir (str): Path of model directory.
        random_shuffle (bool, optional): Whether to shuffle data before saving. The default is False.

    Returns:
        None.
    """
    return save_data_to_folder(x, y, target_model, mod_dir, random_shuffle)


def save_hyp(hyperparameter, filepath):
    """
    Save hyper-parameters as a json dict.

    Args:
        hyperparameter (dict): Dictionary of hyper-parameters.
        filepath (str): Path of the json file.

    Returns:
        None.
    """
    with open(filepath, 'w') as f:
        json.dump(hyperparameter, f)


def load_hyp(filepath):
    """
    Load hyper-parameters from filepath.

    Args:
        filepath (str): Path of the json file.

    Returns:
        dict: Hyper-parameters.
    """
    with open(filepath, 'r') as f:
        return json.load(f)
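
# Round-trip sketch for the json helpers (file name is illustrative):
#
#     hyp = {"learning_rate": 1e-3, "epochs": 500}
#     save_hyp(hyp, "hyper.json")
#     assert load_hyp("hyper.json") == hyp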

pyNNsMD/layers/__init__.py

Whitespace-only changes.
