Bug ad index mordred #389

Closed · wants to merge 3 commits
111 changes: 62 additions & 49 deletions atomsci/ddm/pipeline/compare_models.py
@@ -1804,6 +1804,7 @@ def get_multitask_perf_from_files_new(result_dir, pred_type='regression', datase
elif 'model_metadata.json' in tar.getnames():
with tar.extractfile('model_metadata.json') as meta:
meta=json.loads(meta.read())
meta['model_path']=tar_file
else:
continue
if meta['model_parameters']['prediction_type']==pred_type:
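
For context, a minimal sketch of the extraction pattern this hunk extends: read model_metadata.json out of a .tar.gz model archive and record which archive supplied it, which is what the added meta['model_path']=tar_file line does. The helper name load_meta_from_tar is illustrative, not part of the PR.

    import json
    import tarfile

    def load_meta_from_tar(tar_file):
        # Read model_metadata.json out of a model tarball, if present, and
        # remember which archive it came from (mirrors meta['model_path'] = tar_file).
        with tarfile.open(tar_file, mode='r:gz') as tar:
            if 'model_metadata.json' not in tar.getnames():
                return None
            with tar.extractfile('model_metadata.json') as meta_fp:
                meta = json.loads(meta_fp.read())
        meta['model_path'] = tar_file
        return meta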
@@ -1817,9 +1818,9 @@ def get_multitask_perf_from_files_new(result_dir, pred_type='regression', datase
with open(model_path, 'r') as model:
meta=json.loads(model.read())
tarfiles=[x for x in tar_list if meta['model_uuid'] in x]
if len(tarfiles)==1:
try:
meta['model_path']=tarfiles[0]
else:
except:
meta['model_path']=os.path.dirname(model_path)
if meta['model_parameters']['prediction_type']==pred_type:
if (dataset_key is not None) and (meta['training_dataset']['dataset_key']==dataset_key):
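
The try/except above replaces a length check: when no archive name contains the model_uuid, indexing tarfiles[0] raises and the path falls back to the directory of the metadata file. A small self-contained sketch of the same fallback, with the bare except narrowed to IndexError (my adjustment, not what the PR ships) and toy stand-ins for the surrounding variables:

    import os

    # Toy stand-ins for the variables in the surrounding loop.
    tar_list = ['/results/model_1234.tar.gz']
    model_path = '/results/other_model/model_metadata.json'
    meta = {'model_uuid': 'abcd'}

    tarfiles = [x for x in tar_list if meta['model_uuid'] in x]
    try:
        # at least one archive name contains this model_uuid
        meta['model_path'] = tarfiles[0]
    except IndexError:
        # no matching tarball: fall back to the folder holding model_metadata.json
        meta['model_path'] = os.path.dirname(model_path)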
@@ -1856,57 +1857,69 @@ def get_multitask_perf_from_files_new(result_dir, pred_type='regression', datase
# manipulate dfs
models['features']=np.where(models.featurizer=='computed_descriptors',models.descriptor_type, models.featurizer)
keep_dicts=keep_dicts[keep_dicts.model_uuid.isin(models.model_uuid)]

mt_models=models[models.num_model_tasks.astype(int)>1]
st_models=models[models.num_model_tasks.astype(int)==1]
assert(len(mt_models)+len(st_models)==len(models))

models_dfs=[ mt_models,st_models,]

# deal with metrics
# deal with metrics for st and mt separately
# do metrics by subset
tm=pd.DataFrame(training_metrics.training_metrics.tolist())
preds=[]
for col in tm.columns:
final_preds=[]
for col in tm.columns: # each subset
preds=[]
for models_df in models_dfs:
# check for > 1 dataset
if len(set(models_df.dataset_key.astype(str)))>1:
raise Exception (f"Warning: you cannot export multitask model performances for more than one dataset at a time. Please provide the dataset_key as an additional parameter. Your {pred_type} options are: {list(set(models.dataset_key))}.")

# get metrics and metric label
met=pd.DataFrame(tm[col].tolist())
metlabel=met.label.iloc[0]+'_'+met.subset.iloc[0]

# get metrics and metric label
met=pd.DataFrame(tm[col].tolist())
metlabel=met.label.iloc[0]+'_'+met.subset.iloc[0]

# expand metrics to get scores
pred=pd.DataFrame(met.prediction_results.tolist())
pred=models[['model_uuid','response_cols']].join(pred)

# check for > 1 dataset
if len(set(models.response_cols.astype(str)))>1:
raise Exception (f"Warning: you cannot export multitask model performances for more than one dataset at a time. Please provide the dataset_key as an additional parameter. Your {pred_type} options are: {list(set(models.dataset_key))}.")
# expand metrics to get scores
pred=pd.DataFrame(met.prediction_results.tolist())
pred=models_df[['model_uuid','response_cols']].join(pred)

# get num_model_tasks
num_model_tasks=models_df.num_model_tasks.astype(int).iloc[0]

num_model_tasks=models.num_model_tasks.iloc[0]
# get task scores - long form and rename columns
taskcols=['response_cols']
taskcols.extend([x for x in pred.columns if 'task' in x])
task_preds=pred[['model_uuid']+taskcols].set_index('model_uuid').explode(taskcols).reset_index()

# get task scores - long form and rename columns
taskcols=['response_cols']
taskcols.extend([x for x in pred.columns if 'task' in x])
task_preds=pred[['model_uuid']+taskcols].set_index('model_uuid').explode(taskcols).reset_index()

# get full model scores and rename columns
predcols=[x for x in pred.columns if 'task' not in x]
predcols.remove('response_cols')
pred=pred[predcols].copy()
pred.columns=[metlabel+'_'+col if col!='model_uuid' else col for col in predcols]
pred['response_cols']='full_model'
# get full model scores and rename columns
predcols=[x for x in pred.columns if 'task' not in x]
predcols.remove('response_cols')
pred=pred[predcols].copy()
pred.columns=[metlabel+'_'+col if col!='model_uuid' else col for col in predcols]
pred['response_cols']='full_model'

# rename task_pred columns to match full model names
coldict={}
for col in task_preds.columns:
if col not in ['model_uuid','response_cols']:
coldict[col]=[predcol for predcol in pred.columns if predcol.replace(metlabel+'_','').startswith(col.replace('task_','')[0:3])][0]
task_preds=task_preds.rename(columns=coldict)

# concatenate all scores
if num_model_tasks>1:
pred=pd.concat([pred,task_preds])

# if single task model, rename response columns and filter out empty rows
if num_model_tasks==1:
pred=pred[pred.response_cols=='full_model']
pred['response_cols']=[x[0] for x in models.response_cols]

# append to list
preds.append(pred)
# rename task_pred columns to match full model names
coldict={}
for task_col in task_preds.columns:
if task_col not in ['model_uuid','response_cols']:
coldict[task_col]=[predcol for predcol in pred.columns if predcol.replace(metlabel+'_','').startswith(task_col.replace('task_','')[0:3])][0]
task_preds=task_preds.rename(columns=coldict)

# concatenate all scores
if num_model_tasks>1:
pred=pd.concat([pred,task_preds])
pred['multitask']=1

# if single task model, rename response columns and filter out empty rows
if num_model_tasks==1:
pred['multitask']=0
pred=pred[pred.response_cols=='full_model']
pred['response_cols']=[x[0] for x in models_df.response_cols]
# append to list
preds.append(pred)
preds=pd.concat(preds).reset_index(drop=True)
final_preds.append(preds)

# trim model df columns - add compatibility for new metadata weight_transform_type
models=models.filter(items=['model_uuid', 'time_built', 'ampl_version','dataset_key', 'model_path',
'model_type', 'prediction_type', 'splitter',
@@ ... @@
'smiles_col', 'features','model_choice_score_type',])

# merge model info and pred_df info
for pred in preds:
models=models.merge(pred)
for pred in final_preds:
models=models.merge(pred, how='left')

# deal with info left in dicts
models=models.merge(keep_dicts)
@@ ... @@
models=models.drop(columns=col)
except Exception:
pass

return models


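Condensed sketch of the reshaping strategy the compare_models.py hunk above adopts: split the model table into single-task and multitask frames, build per-subset score frames tagged with a multitask flag, then left-merge so models without a matching score row keep NaNs instead of being dropped. Column names and values below are toy placeholders, not the real compare_models.py schema.

    import pandas as pd

    models = pd.DataFrame({'model_uuid': ['a', 'b'],
                           'num_model_tasks': [1, 3]})
    mt_models = models[models.num_model_tasks.astype(int) > 1]
    st_models = models[models.num_model_tasks.astype(int) == 1]
    assert len(mt_models) + len(st_models) == len(models)

    scores = []
    for models_df in (mt_models, st_models):
        pred = models_df[['model_uuid']].copy()
        pred['r2_score_valid'] = 0.5                      # placeholder metric value
        pred['multitask'] = int(models_df.num_model_tasks.iloc[0] > 1)
        scores.append(pred)
    scores = pd.concat(scores).reset_index(drop=True)

    # how='left' keeps every model row even when a score frame has no match for it
    models = models.merge(scores, how='left')
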
9 changes: 5 additions & 4 deletions atomsci/ddm/pipeline/model_datasets.py
@@ -839,16 +839,17 @@ def get_featurized_data(self, dset_df, is_featurized=False):
self.vals = np.zeros((nrows,ncols))
self.attr = pd.DataFrame({params.smiles_col: dset_df[params.smiles_col].values},
index=dset_df[params.id_col])
if params.model_type != "hybrid":
self.vals, weights = feat.make_weights(self.vals, is_class=params.prediction_type=='classification')
self.log.warning("Done")
else:
self.log.warning("Featurizing data...")
features, ids, self.vals, self.attr, weights, featurized_dset_df = self.featurization.featurize_data(dset_df,
params, self.contains_responses)
features, ids, self.vals, self.attr, weights, featurized_dset_df = self.featurization.featurize_data(dset_df, params, self.contains_responses)
self.log.warning("Done")
self.n_features = self.featurization.get_feature_count()

self.untransformed_dataset= NumpyDataset(features, self.vals, ids=ids)
self.dataset = NumpyDataset(features, self.vals, ids=ids)
self.untransformed_dataset= NumpyDataset(features, self.vals, ids=ids, w=weights)
self.dataset = NumpyDataset(features, self.vals, ids=ids, w=weights)

# ****************************************************************************************
def save_featurized_data(self, featurized_dset_df):
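The model_datasets.py change threads the per-sample weights returned by featurization into the DeepChem dataset instead of dropping them. A minimal sketch of what that buys, with assumed toy data rather than the AMPL featurization path: rows with missing responses get zero weight, and the weight matrix now travels with the dataset.

    import numpy as np
    import deepchem as dc

    features = np.random.rand(4, 8)                   # toy feature matrix
    vals = np.array([[0.1], [np.nan], [0.3], [0.7]])  # one missing response
    weights = (~np.isnan(vals)).astype(float)         # zero weight where the response is missing
    vals = np.nan_to_num(vals)

    dataset = dc.data.NumpyDataset(features, vals, w=weights, ids=np.arange(4))
    print(dataset.w)  # the weights are carried along into training and metric calculations
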
9 changes: 8 additions & 1 deletion atomsci/ddm/pipeline/model_pipeline.py
@@ -902,6 +902,10 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses=
pred_data = self.predict_embedding(dset_df, dset_params=dset_params)
else:
pred_data = copy.deepcopy(self.data.dataset.X)

if self.featurization.descriptor_type=='mordred_filtered':
pred_data = pred_data[:,~np.isnan(pred_data).all(axis=0)]
pred_data = np.where(np.isnan(pred_data), np.nanmean(pred_data, axis=0), pred_data)

try:
if not hasattr(self, 'featurized_train_data'):
@@ -926,14 +930,17 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses=
train_dset = dc.data.NumpyDataset(train_X)
self.featurized_train_data = self.model_wrapper.generate_embeddings(train_dset)
else:
if self.featurization.descriptor_type=='mordred_filtered':
train_X = train_X[:,~np.isnan(train_X).all(axis=0)]
train_X = np.where(np.isnan(train_X), np.nanmean(train_X, axis=0), train_X)
self.featurized_train_data = train_X

if not hasattr(self, "train_pair_dis") or not hasattr(self, "train_pair_dis_metric") or self.train_pair_dis_metric != dist_metric:
self.train_pair_dis = pairwise_distances(X=self.featurized_train_data, metric=dist_metric)
self.train_pair_dis_metric = dist_metric

self.log.debug("Calculating AD index.")

if AD_method == "local_density":
result_df["AD_index"] = calc_AD_kmean_local_density(self.featurized_train_data, pred_data, k, train_dset_pair_distance=self.train_pair_dis, dist_metric=dist_metric)
else:
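
The model_pipeline.py hunks carry the actual AD-index fix for Mordred features: before nearest-neighbor distances are computed, descriptor columns that are NaN for every row are dropped and any remaining NaNs are replaced with the column mean, applied to both the prediction features and the featurized training data. A small self-contained sketch of that cleanup (the helper name is mine, not the PR's):

    import numpy as np

    def clean_mordred_features(X):
        # 1) drop descriptor columns that are NaN for every row
        X = X[:, ~np.isnan(X).all(axis=0)]
        # 2) impute remaining NaNs with the per-column mean so the pairwise
        #    distances used for the AD index stay finite
        return np.where(np.isnan(X), np.nanmean(X, axis=0), X)

    X = np.array([[1.0, 2.0, np.nan],
                  [np.nan, 4.0, np.nan]])
    print(clean_mordred_features(X))   # [[1. 2.] [1. 4.]]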