From 365646a39a3d599f541ea941cd25653f450112e0 Mon Sep 17 00:00:00 2001
From: Amanda Paulson
Date: Tue, 14 Jan 2025 18:49:52 -0800
Subject: [PATCH 1/3] update multitask perf code to handle a mix of ST and MT
 model metrics

---
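Note: a minimal usage sketch of the updated function (not part of the patch; the
result_dir and dataset_key paths below are hypothetical). With this change the
returned table covers single-task and multitask models together, tagging each
row with a multitask flag:

    from atomsci.ddm.pipeline import compare_models as cm

    # Collect metrics for every saved regression model under result_dir.
    # If the directory mixes models trained on different datasets, pass
    # dataset_key to select one; otherwise an Exception is raised.
    perf_df = cm.get_multitask_perf_from_files_new(
        result_dir='/path/to/saved/models',      # hypothetical path
        pred_type='regression',
        dataset_key='/path/to/dataset.csv',      # hypothetical dataset key
    )

    # One row per model and response column; multitask == 1 marks MT models,
    # and rows with response_cols == 'full_model' carry the aggregate scores.
    print(perf_df[['model_uuid', 'response_cols', 'multitask']].head())
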
 atomsci/ddm/pipeline/compare_models.py | 111 ++++++++++++++-----------
 1 file changed, 62 insertions(+), 49 deletions(-)

diff --git a/atomsci/ddm/pipeline/compare_models.py b/atomsci/ddm/pipeline/compare_models.py
index 3e95c88e..312ae3a3 100644
--- a/atomsci/ddm/pipeline/compare_models.py
+++ b/atomsci/ddm/pipeline/compare_models.py
@@ -1804,6 +1804,7 @@ def get_multitask_perf_from_files_new(result_dir, pred_type='regression', datase
         elif 'model_metadata.json' in tar.getnames():
             with tar.extractfile('model_metadata.json') as meta:
                 meta=json.loads(meta.read())
+            meta['model_path']=tar_file
         else:
             continue
         if meta['model_parameters']['prediction_type']==pred_type:
@@ -1817,9 +1818,9 @@ def get_multitask_perf_from_files_new(result_dir, pred_type='regression', datase
         with open(model_path, 'r') as model:
             meta=json.loads(model.read())
         tarfiles=[x for x in tar_list if meta['model_uuid'] in x]
-        if len(tarfiles)==1:
+        try:
             meta['model_path']=tarfiles[0]
-        else:
+        except IndexError:
             meta['model_path']=os.path.dirname(model_path)
         if meta['model_parameters']['prediction_type']==pred_type:
             if (dataset_key is not None) and (meta['training_dataset']['dataset_key']==dataset_key):
@@ -1856,57 +1857,69 @@ def get_multitask_perf_from_files_new(result_dir, pred_type='regression', datase
     # manipulate dfs
     models['features']=np.where(models.featurizer=='computed_descriptors',models.descriptor_type, models.featurizer)
     keep_dicts=keep_dicts[keep_dicts.model_uuid.isin(models.model_uuid)]
+
+    mt_models=models[models.num_model_tasks.astype(int)>1]
+    st_models=models[models.num_model_tasks.astype(int)==1]
+    assert len(mt_models)+len(st_models)==len(models)
+
+    models_dfs=[mt_models, st_models]
 
-    # deal with metrics
+    # deal with metrics for st and mt separately
+    # do metrics by subset
     tm=pd.DataFrame(training_metrics.training_metrics.tolist())
-    preds=[]
-    for col in tm.columns:
+    final_preds=[]
+    for col in tm.columns: # each subset
+        preds=[]
+        for models_df in models_dfs:
+            # check for > 1 dataset
+            if len(set(models_df.dataset_key.astype(str)))>1:
+                raise Exception(f"You cannot export multitask model performances for more than one dataset at a time. Please provide the dataset_key as an additional parameter. Your {pred_type} options are: {list(set(models.dataset_key))}.")
+
+            # get metrics and metric label
+            met=pd.DataFrame(tm[col].tolist())
+            metlabel=met.label.iloc[0]+'_'+met.subset.iloc[0]
 
-        # get metrics and metric label
-        met=pd.DataFrame(tm[col].tolist())
-        metlabel=met.label.iloc[0]+'_'+met.subset.iloc[0]
-
-        # expand metrics to get scores
-        pred=pd.DataFrame(met.prediction_results.tolist())
-        pred=models[['model_uuid','response_cols']].join(pred)
-
-        # check for > 1 dataset
-        if len(set(models.response_cols.astype(str)))>1:
-            raise Exception (f"Warning: you cannot export multitask model performances for more than one dataset at a time. Please provide the dataset_key as an additional parameter. Your {pred_type} options are: {list(set(models.dataset_key))}.")
+            # expand metrics to get scores
+            pred=pd.DataFrame(met.prediction_results.tolist())
+            pred=models_df[['model_uuid','response_cols']].join(pred)
+
+            # get num_model_tasks
+            num_model_tasks=models_df.num_model_tasks.astype(int).iloc[0]
 
-        num_model_tasks=models.num_model_tasks.iloc[0]
+            # get task scores - long form and rename columns
+            taskcols=['response_cols']
+            taskcols.extend([x for x in pred.columns if 'task' in x])
+            task_preds=pred[['model_uuid']+taskcols].set_index('model_uuid').explode(taskcols).reset_index()
 
-        # get task scores - long form and rename columns
-        taskcols=['response_cols']
-        taskcols.extend([x for x in pred.columns if 'task' in x])
-        task_preds=pred[['model_uuid']+taskcols].set_index('model_uuid').explode(taskcols).reset_index()
-
-        # get full model scores and rename columns
-        predcols=[x for x in pred.columns if 'task' not in x]
-        predcols.remove('response_cols')
-        pred=pred[predcols].copy()
-        pred.columns=[metlabel+'_'+col if col!='model_uuid' else col for col in predcols]
-        pred['response_cols']='full_model'
+            # get full model scores and rename columns
+            predcols=[x for x in pred.columns if 'task' not in x]
+            predcols.remove('response_cols')
+            pred=pred[predcols].copy()
+            pred.columns=[metlabel+'_'+col if col!='model_uuid' else col for col in predcols]
+            pred['response_cols']='full_model'
 
-        # rename task_pred columns to match full model names
-        coldict={}
-        for col in task_preds.columns:
-            if col not in ['model_uuid','response_cols']:
-                coldict[col]=[predcol for predcol in pred.columns if predcol.replace(metlabel+'_','').startswith(col.replace('task_','')[0:3])][0]
-        task_preds=task_preds.rename(columns=coldict)
-
-        # concatenate all scores
-        if num_model_tasks>1:
-            pred=pd.concat([pred,task_preds])
-
-        # if single task model, rename response columns and filter out empty rows
-        if num_model_tasks==1:
-            pred=pred[pred.response_cols=='full_model']
-            pred['response_cols']=[x[0] for x in models.response_cols]
-
-        # append to list
-        preds.append(pred)
+            # rename task_pred columns to match full model names
+            coldict={}
+            for task_col in task_preds.columns:
+                if task_col not in ['model_uuid','response_cols']:
+                    coldict[task_col]=[predcol for predcol in pred.columns if predcol.replace(metlabel+'_','').startswith(task_col.replace('task_','')[0:3])][0]
+            task_preds=task_preds.rename(columns=coldict)
 
+            # concatenate all scores
+            if num_model_tasks>1:
+                pred=pd.concat([pred,task_preds])
+                pred['multitask']=1
+
+            # if single task model, rename response columns and filter out empty rows
+            if num_model_tasks==1:
+                pred['multitask']=0
+                pred=pred[pred.response_cols=='full_model']
+                pred['response_cols']=[x[0] for x in models_df.response_cols]
 
+            # append to list
+            preds.append(pred)
+        preds=pd.concat(preds).reset_index(drop=True)
+        final_preds.append(preds)
+
     # trim model df columns - add compatibility for new metadata weight_transform_type
     models=models.filter(items=['model_uuid', 'time_built', 'ampl_version','dataset_key', 'model_path', 'model_type', 'prediction_type', 'splitter',
@@ -1915,8 +1928,8 @@ def get_multitask_perf_from_files_new(result_dir, pred_type='regression', datase
                         'smiles_col', 'features','model_choice_score_type',])
 
     # merge model info and pred_df info
-    for pred in preds:
-        models=models.merge(pred)
+    for pred in final_preds:
+        models=models.merge(pred, how='left')
 
     # deal with info left in dicts
     models=models.merge(keep_dicts)
@@ -1929,7 +1942,7 @@ def get_multitask_perf_from_files_new(result_dir, pred_type='regression', datase
                 models=models.drop(columns=col)
         except Exception:
             pass
-
+
     return models

From 739dc5d26cacd94d292430e11e4538a8bea9b394 Mon Sep 17 00:00:00 2001
From: Amanda Paulson
Date: Tue, 14 Jan 2025 18:52:05 -0800
Subject: [PATCH 2/3] update MinimalDataset to create weights for predicting on
 classification models with balancing transformers

---
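Note: a minimal sketch of why the prediction dataset now carries weights (not
AMPL code; make_weights below is a simplified stand-in for the featurization
helper the patch calls as feat.make_weights). DeepChem's balancing transformers
rescale dataset.w, so a dataset built without weights cannot be transformed
consistently at prediction time:

    import numpy as np
    from deepchem.data import NumpyDataset

    def make_weights(vals, is_class=True):
        # Stand-in: weight 1 for each (row, task) with a defined response,
        # 0 where the response is missing; is_class mirrors the real call.
        w = np.where(np.isnan(vals), 0.0, 1.0)
        return np.nan_to_num(vals), w

    vals = np.array([[0.0], [1.0], [np.nan]])  # one classification task
    X = np.random.rand(3, 8)                   # toy feature matrix
    vals, weights = make_weights(vals, is_class=True)
    dset = NumpyDataset(X, vals, ids=np.arange(3), w=weights)
    print(dset.w.ravel())                      # -> [1. 1. 0.]
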
 atomsci/ddm/pipeline/model_datasets.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/atomsci/ddm/pipeline/model_datasets.py b/atomsci/ddm/pipeline/model_datasets.py
index e89b45bb..4b0454ee 100644
--- a/atomsci/ddm/pipeline/model_datasets.py
+++ b/atomsci/ddm/pipeline/model_datasets.py
@@ -839,16 +839,17 @@ def get_featurized_data(self, dset_df, is_featurized=False):
                 self.vals = np.zeros((nrows,ncols))
             self.attr = pd.DataFrame({params.smiles_col: dset_df[params.smiles_col].values},
                                      index=dset_df[params.id_col])
+            if params.model_type != "hybrid":
+                self.vals, weights = feat.make_weights(self.vals, is_class=params.prediction_type=='classification')
             self.log.warning("Done")
         else:
             self.log.warning("Featurizing data...")
-            features, ids, self.vals, self.attr, weights, featurized_dset_df = self.featurization.featurize_data(dset_df,
-                params, self.contains_responses)
+            features, ids, self.vals, self.attr, weights, featurized_dset_df = self.featurization.featurize_data(dset_df, params, self.contains_responses)
             self.log.warning("Done")
         self.n_features = self.featurization.get_feature_count()
-        self.untransformed_dataset= NumpyDataset(features, self.vals, ids=ids)
-        self.dataset = NumpyDataset(features, self.vals, ids=ids)
+        self.untransformed_dataset = NumpyDataset(features, self.vals, ids=ids, w=weights)
+        self.dataset = NumpyDataset(features, self.vals, ids=ids, w=weights)
 
     # ****************************************************************************************
     def save_featurized_data(self, featurized_dset_df):

From 32e4dec3348127b549b2518e47848f6e3c59a001 Mon Sep 17 00:00:00 2001
From: Amanda Paulson
Date: Thu, 16 Jan 2025 16:42:43 -0800
Subject: [PATCH 3/3] exclude all-NaN columns from mordred features and impute
 the column mean for other NaNs to calculate AD index

---
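Note: mordred can return NaN for descriptors it cannot compute, and NaNs break
the distance calculations behind the AD index. A standalone numpy illustration
of the exact masking and imputation pattern added here (toy data; the patch
applies the same two steps to the training and prediction matrices
independently, which assumes mordred_filtered features yield matching all-NaN
columns in both):

    import numpy as np

    X = np.array([[1.0, np.nan, np.nan],
                  [3.0,    4.0, np.nan],
                  [5.0, np.nan, np.nan]])

    # 1) Drop columns in which every value is NaN (column 2 here).
    X = X[:, ~np.isnan(X).all(axis=0)]

    # 2) Replace remaining NaNs with their column mean (column 1 mean is 4.0).
    X = np.where(np.isnan(X), np.nanmean(X, axis=0), X)

    print(X)  # [[1. 4.] [3. 4.] [5. 4.]]
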
 atomsci/ddm/pipeline/model_pipeline.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/atomsci/ddm/pipeline/model_pipeline.py b/atomsci/ddm/pipeline/model_pipeline.py
index 7d49092a..2ae45c56 100644
--- a/atomsci/ddm/pipeline/model_pipeline.py
+++ b/atomsci/ddm/pipeline/model_pipeline.py
@@ -902,6 +902,10 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses=
                 pred_data = self.predict_embedding(dset_df, dset_params=dset_params)
             else:
                 pred_data = copy.deepcopy(self.data.dataset.X)
+
+            if self.featurization.descriptor_type=='mordred_filtered':
+                pred_data = pred_data[:,~np.isnan(pred_data).all(axis=0)]
+                pred_data = np.where(np.isnan(pred_data), np.nanmean(pred_data, axis=0), pred_data)
 
             try:
                 if not hasattr(self, 'featurized_train_data'):
@@ -926,6 +930,9 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses=
                         train_dset = dc.data.NumpyDataset(train_X)
                         self.featurized_train_data = self.model_wrapper.generate_embeddings(train_dset)
                     else:
+                        if self.featurization.descriptor_type=='mordred_filtered':
+                            train_X = train_X[:,~np.isnan(train_X).all(axis=0)]
+                            train_X = np.where(np.isnan(train_X), np.nanmean(train_X, axis=0), train_X)
                         self.featurized_train_data = train_X
 
                 if not hasattr(self, "train_pair_dis") or not hasattr(self, "train_pair_dis_metric") or self.train_pair_dis_metric != dist_metric:
@@ -933,7 +940,7 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses=
                     self.train_pair_dis_metric = dist_metric
 
             self.log.debug("Calculating AD index.")
-
+
             if AD_method == "local_density":
                 result_df["AD_index"] = calc_AD_kmean_local_density(self.featurized_train_data, pred_data, k, train_dset_pair_distance=self.train_pair_dis, dist_metric=dist_metric)
             else:
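Note: for context on why the imputation matters, the AD_index computed
downstream is a k-nearest-neighbor construction over these feature matrices,
and a single NaN in a compound's row would turn every distance involving that
compound into NaN. A conceptual sketch of that dependence (scipy-based toy,
not the AMPL implementation in calc_AD_kmean_local_density):

    import numpy as np
    from scipy.spatial.distance import cdist

    train = np.random.rand(100, 8)  # imputed training features
    pred = np.random.rand(5, 8)     # imputed prediction features

    k = 5
    d = cdist(pred, train, metric='euclidean')   # all pairwise distances
    knn_mean = np.sort(d, axis=1)[:, :k].mean(axis=1)
    print(knn_mean)  # a NaN feature anywhere in a row would poison its score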