From 365646a39a3d599f541ea941cd25653f450112e0 Mon Sep 17 00:00:00 2001
From: Amanda Paulson
Date: Tue, 14 Jan 2025 18:49:52 -0800
Subject: [PATCH 1/3] update multitask perf code to handle a mix of ST and MT
 model metrics

---
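Note: a minimal usage sketch of the updated function (not part of the patch; the
result_dir and dataset_key paths below are hypothetical). With this change the
returned table covers single-task and multitask models together, tagging each
row with a multitask flag:

    from atomsci.ddm.pipeline import compare_models as cm

    # Collect metrics for every saved regression model under result_dir.
    # If the directory mixes models trained on different datasets, pass
    # dataset_key to select one; otherwise an Exception is raised.
    perf_df = cm.get_multitask_perf_from_files_new(
        result_dir='/path/to/saved/models',      # hypothetical path
        pred_type='regression',
        dataset_key='/path/to/dataset.csv',      # hypothetical dataset key
    )

    # One row per model and response column; multitask == 1 marks MT models,
    # and rows with response_cols == 'full_model' carry the aggregate scores.
    print(perf_df[['model_uuid', 'response_cols', 'multitask']].head())
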
 atomsci/ddm/pipeline/compare_models.py | 111 ++++++++++++++-----------
 1 file changed, 62 insertions(+), 49 deletions(-)

diff --git a/atomsci/ddm/pipeline/compare_models.py b/atomsci/ddm/pipeline/compare_models.py
index 3e95c88e..312ae3a3 100644
--- a/atomsci/ddm/pipeline/compare_models.py
+++ b/atomsci/ddm/pipeline/compare_models.py
@@ -1804,6 +1804,7 @@ def get_multitask_perf_from_files_new(result_dir, pred_type='regression', datase
         elif 'model_metadata.json' in tar.getnames():
             with tar.extractfile('model_metadata.json') as meta:
                 meta=json.loads(meta.read())
+            meta['model_path']=tar_file
         else:
             continue
         if meta['model_parameters']['prediction_type']==pred_type:
@@ -1817,9 +1818,9 @@ def get_multitask_perf_from_files_new(result_dir, pred_type='regression', datase
         with open(model_path, 'r') as model:
             meta=json.loads(model.read())
         tarfiles=[x for x in tar_list if meta['model_uuid'] in x]
-        if len(tarfiles)==1:
+        try:
             meta['model_path']=tarfiles[0]
-        else:
+        except IndexError:
             meta['model_path']=os.path.dirname(model_path)
         if meta['model_parameters']['prediction_type']==pred_type:
             if (dataset_key is not None) and (meta['training_dataset']['dataset_key']==dataset_key):
@@ -1856,57 +1857,69 @@ def get_multitask_perf_from_files_new(result_dir, pred_type='regression', datase
     # manipulate dfs
     models['features']=np.where(models.featurizer=='computed_descriptors',models.descriptor_type, models.featurizer)
     keep_dicts=keep_dicts[keep_dicts.model_uuid.isin(models.model_uuid)]
+
+    mt_models=models[models.num_model_tasks.astype(int)>1]
+    st_models=models[models.num_model_tasks.astype(int)==1]
+    assert len(mt_models)+len(st_models)==len(models)
+
+    models_dfs=[mt_models, st_models]
 
-    # deal with metrics
+    # deal with metrics for st and mt separately
+    # do metrics by subset
     tm=pd.DataFrame(training_metrics.training_metrics.tolist())
-    preds=[]
-    for col in tm.columns:
+    final_preds=[]
+    for col in tm.columns: # each subset
+        preds=[]
+        for models_df in models_dfs:
+            # check for > 1 dataset
+            if len(set(models_df.dataset_key.astype(str)))>1:
+                raise Exception(f"You cannot export multitask model performances for more than one dataset at a time. Please provide the dataset_key as an additional parameter. Your {pred_type} options are: {list(set(models.dataset_key))}.")
+
+            # get metrics and metric label
+            met=pd.DataFrame(tm[col].tolist())
+            metlabel=met.label.iloc[0]+'_'+met.subset.iloc[0]
 
-        # get metrics and metric label
-        met=pd.DataFrame(tm[col].tolist())
-        metlabel=met.label.iloc[0]+'_'+met.subset.iloc[0]
-
-        # expand metrics to get scores
-        pred=pd.DataFrame(met.prediction_results.tolist())
-        pred=models[['model_uuid','response_cols']].join(pred)
-
-        # check for > 1 dataset
-        if len(set(models.response_cols.astype(str)))>1:
-            raise Exception (f"Warning: you cannot export multitask model performances for more than one dataset at a time. Please provide the dataset_key as an additional parameter. Your {pred_type} options are: {list(set(models.dataset_key))}.")
+            # expand metrics to get scores
+            pred=pd.DataFrame(met.prediction_results.tolist())
+            pred=models_df[['model_uuid','response_cols']].join(pred)
+
+            # get num_model_tasks
+            num_model_tasks=models_df.num_model_tasks.astype(int).iloc[0]
 
-        num_model_tasks=models.num_model_tasks.iloc[0]
+            # get task scores - long form and rename columns
+            taskcols=['response_cols']
+            taskcols.extend([x for x in pred.columns if 'task' in x])
+            task_preds=pred[['model_uuid']+taskcols].set_index('model_uuid').explode(taskcols).reset_index()
 
-        # get task scores - long form and rename columns
-        taskcols=['response_cols']
-        taskcols.extend([x for x in pred.columns if 'task' in x])
-        task_preds=pred[['model_uuid']+taskcols].set_index('model_uuid').explode(taskcols).reset_index()
-
-        # get full model scores and rename columns
-        predcols=[x for x in pred.columns if 'task' not in x]
-        predcols.remove('response_cols')
-        pred=pred[predcols].copy()
-        pred.columns=[metlabel+'_'+col if col!='model_uuid' else col for col in predcols]
-        pred['response_cols']='full_model'
+            # get full model scores and rename columns
+            predcols=[x for x in pred.columns if 'task' not in x]
+            predcols.remove('response_cols')
+            pred=pred[predcols].copy()
+            pred.columns=[metlabel+'_'+col if col!='model_uuid' else col for col in predcols]
+            pred['response_cols']='full_model'
 
-        # rename task_pred columns to match full model names
-        coldict={}
-        for col in task_preds.columns:
-            if col not in ['model_uuid','response_cols']:
-                coldict[col]=[predcol for predcol in pred.columns if predcol.replace(metlabel+'_','').startswith(col.replace('task_','')[0:3])][0]
-        task_preds=task_preds.rename(columns=coldict)
-
-        # concatenate all scores
-        if num_model_tasks>1:
-            pred=pd.concat([pred,task_preds])
-
-        # if single task model, rename response columns and filter out empty rows
-        if num_model_tasks==1:
-            pred=pred[pred.response_cols=='full_model']
-            pred['response_cols']=[x[0] for x in models.response_cols]
-
-        # append to list
-        preds.append(pred)
+            # rename task_pred columns to match full model names
+            coldict={}
+            for task_col in task_preds.columns:
+                if task_col not in ['model_uuid','response_cols']:
+                    coldict[task_col]=[predcol for predcol in pred.columns if predcol.replace(metlabel+'_','').startswith(task_col.replace('task_','')[0:3])][0]
+            task_preds=task_preds.rename(columns=coldict)
 
+            # concatenate all scores
+            if num_model_tasks>1:
+                pred=pd.concat([pred,task_preds])
+                pred['multitask']=1
+
+            # if single task model, rename response columns and filter out empty rows
+            if num_model_tasks==1:
+                pred['multitask']=0
+                pred=pred[pred.response_cols=='full_model']
+                pred['response_cols']=[x[0] for x in models_df.response_cols]
 
+            # append to list
+            preds.append(pred)
+        preds=pd.concat(preds).reset_index(drop=True)
+        final_preds.append(preds)
+
     # trim model df columns - add compatibility for new metadata weight_transform_type
     models=models.filter(items=['model_uuid', 'time_built', 'ampl_version','dataset_key', 'model_path', 'model_type', 'prediction_type', 'splitter',
@@ -1915,8 +1928,8 @@ def get_multitask_perf_from_files_new(result_dir, pred_type='regression', datase
                         'smiles_col', 'features','model_choice_score_type',])
 
     # merge model info and pred_df info
-    for pred in preds:
-        models=models.merge(pred)
+    for pred in final_preds:
+        models=models.merge(pred, how='left')
 
     # deal with info left in dicts
     models=models.merge(keep_dicts)
@@ -1929,7 +1942,7 @@ def get_multitask_perf_from_files_new(result_dir, pred_type='regression', datase
                 models=models.drop(columns=col)
         except Exception:
             pass
-
+
     return models

From 739dc5d26cacd94d292430e11e4538a8bea9b394 Mon Sep 17 00:00:00 2001
From: Amanda Paulson
Date: Tue, 14 Jan 2025 18:52:05 -0800
Subject: [PATCH 2/3] update MinimalDataset to create weights for predicting on
 classification models with balancing transformers

---
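Note: a minimal sketch of why the prediction dataset now carries weights (not
AMPL code; make_weights below is a simplified stand-in for the featurization
helper the patch calls as feat.make_weights). DeepChem's balancing transformers
rescale dataset.w, so a dataset built without weights cannot be transformed
consistently at prediction time:

    import numpy as np
    from deepchem.data import NumpyDataset

    def make_weights(vals, is_class=True):
        # Stand-in: weight 1 for each (row, task) with a defined response,
        # 0 where the response is missing; is_class mirrors the real call.
        w = np.where(np.isnan(vals), 0.0, 1.0)
        return np.nan_to_num(vals), w

    vals = np.array([[0.0], [1.0], [np.nan]])  # one classification task
    X = np.random.rand(3, 8)                   # toy feature matrix
    vals, weights = make_weights(vals, is_class=True)
    dset = NumpyDataset(X, vals, ids=np.arange(3), w=weights)
    print(dset.w.ravel())                      # -> [1. 1. 0.]
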
 atomsci/ddm/pipeline/model_datasets.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/atomsci/ddm/pipeline/model_datasets.py b/atomsci/ddm/pipeline/model_datasets.py
index e89b45bb..4b0454ee 100644
--- a/atomsci/ddm/pipeline/model_datasets.py
+++ b/atomsci/ddm/pipeline/model_datasets.py
@@ -839,16 +839,17 @@ def get_featurized_data(self, dset_df, is_featurized=False):
                 self.vals = np.zeros((nrows,ncols))
             self.attr = pd.DataFrame({params.smiles_col: dset_df[params.smiles_col].values},
                                      index=dset_df[params.id_col])
+            if params.model_type != "hybrid":
+                self.vals, weights = feat.make_weights(self.vals, is_class=params.prediction_type=='classification')
             self.log.warning("Done")
         else:
             self.log.warning("Featurizing data...")
-            features, ids, self.vals, self.attr, weights, featurized_dset_df = self.featurization.featurize_data(dset_df,
-                params, self.contains_responses)
+            features, ids, self.vals, self.attr, weights, featurized_dset_df = self.featurization.featurize_data(dset_df, params, self.contains_responses)
             self.log.warning("Done")
         self.n_features = self.featurization.get_feature_count()
-        self.untransformed_dataset= NumpyDataset(features, self.vals, ids=ids)
-        self.dataset = NumpyDataset(features, self.vals, ids=ids)
+        self.untransformed_dataset = NumpyDataset(features, self.vals, ids=ids, w=weights)
+        self.dataset = NumpyDataset(features, self.vals, ids=ids, w=weights)
 
     # ****************************************************************************************
     def save_featurized_data(self, featurized_dset_df):

From 32e4dec3348127b549b2518e47848f6e3c59a001 Mon Sep 17 00:00:00 2001
From: Amanda Paulson
Date: Thu, 16 Jan 2025 16:42:43 -0800
Subject: [PATCH 3/3] exclude all-NaN columns from mordred features and impute
 the column mean for other NaNs to calculate AD index

---
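Note: mordred can return NaN for descriptors it cannot compute, and NaNs break
the distance calculations behind the AD index. A standalone numpy illustration
of the exact masking and imputation pattern added here (toy data; the patch
applies the same two steps to the training and prediction matrices
independently, which assumes mordred_filtered features yield matching all-NaN
columns in both):

    import numpy as np

    X = np.array([[1.0, np.nan, np.nan],
                  [3.0,    4.0, np.nan],
                  [5.0, np.nan, np.nan]])

    # 1) Drop columns in which every value is NaN (column 2 here).
    X = X[:, ~np.isnan(X).all(axis=0)]

    # 2) Replace remaining NaNs with their column mean (column 1 mean is 4.0).
    X = np.where(np.isnan(X), np.nanmean(X, axis=0), X)

    print(X)  # [[1. 4.] [3. 4.] [5. 4.]]
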
 atomsci/ddm/pipeline/model_pipeline.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/atomsci/ddm/pipeline/model_pipeline.py b/atomsci/ddm/pipeline/model_pipeline.py
index 7d49092a..2ae45c56 100644
--- a/atomsci/ddm/pipeline/model_pipeline.py
+++ b/atomsci/ddm/pipeline/model_pipeline.py
@@ -902,6 +902,10 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses=
                 pred_data = self.predict_embedding(dset_df, dset_params=dset_params)
             else:
                 pred_data = copy.deepcopy(self.data.dataset.X)
+
+            if self.featurization.descriptor_type=='mordred_filtered':
+                pred_data = pred_data[:,~np.isnan(pred_data).all(axis=0)]
+                pred_data = np.where(np.isnan(pred_data), np.nanmean(pred_data, axis=0), pred_data)
 
             try:
                 if not hasattr(self, 'featurized_train_data'):
@@ -926,6 +930,9 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses=
                         train_dset = dc.data.NumpyDataset(train_X)
                         self.featurized_train_data = self.model_wrapper.generate_embeddings(train_dset)
                     else:
+                        if self.featurization.descriptor_type=='mordred_filtered':
+                            train_X = train_X[:,~np.isnan(train_X).all(axis=0)]
+                            train_X = np.where(np.isnan(train_X), np.nanmean(train_X, axis=0), train_X)
                         self.featurized_train_data = train_X
 
                 if not hasattr(self, "train_pair_dis") or not hasattr(self, "train_pair_dis_metric") or self.train_pair_dis_metric != dist_metric:
@@ -933,7 +940,7 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses=
                     self.train_pair_dis_metric = dist_metric
 
             self.log.debug("Calculating AD index.")
-
+
             if AD_method == "local_density":
                 result_df["AD_index"] = calc_AD_kmean_local_density(self.featurized_train_data, pred_data, k, train_dset_pair_distance=self.train_pair_dis, dist_metric=dist_metric)
             else:
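Note: for context on why the imputation matters, the AD_index computed
downstream is a k-nearest-neighbor construction over these feature matrices,
and a single NaN in a compound's row would turn every distance involving that
compound into NaN. A conceptual sketch of that dependence (scipy-based toy,
not the AMPL implementation in calc_AD_kmean_local_density):

    import numpy as np
    from scipy.spatial.distance import cdist

    train = np.random.rand(100, 8)  # imputed training features
    pred = np.random.rand(5, 8)     # imputed prediction features

    k = 5
    d = cdist(pred, train, metric='euclidean')   # all pairwise distances
    knn_mean = np.sort(d, axis=1)[:, :k].mean(axis=1)
    print(knn_mean)  # a NaN feature anywhere in a row would poison its score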