
Feature rdkit_scaled and mordred_filtered_scaled feature sets #397


Closed
wants to merge 29 commits into from
Commits
78b3c37
New sklearn transformer wrapper
stewarthe6 Jan 21, 2025
f35e3fb
This should be an else if since it is an else if in the init and in u…
stewarthe6 Jan 21, 2025
458e188
Remove UMAP transformer since it is deprecated
stewarthe6 Jan 21, 2025
2479de0
deleted large chunk of commented code
stewarthe6 Jan 21, 2025
9e8dfce
Inserted Sklearn RobustScaler and PowerTransformer into the pipeline …
stewarthe6 Jan 21, 2025
53d3f74
Fixed argument name. Changed from robustscaler_with_scaling to robust…
stewarthe6 Jan 21, 2025
8feea36
Updated test to use mordred_filtered features. Added arguments to def…
stewarthe6 Jan 21, 2025
1eda346
feature_transformer_type should be feature_transform_type
stewarthe6 Jan 22, 2025
fd22692
unit_variance parameter must be a tuple
stewarthe6 Jan 22, 2025
28cd0e0
Convert quartiles
stewarthe6 Jan 22, 2025
77c8b26
Updated rdkit to use AvgIpc for a numerically better behaved feature
stewarthe6 Jan 22, 2025
2551e38
set default for PowerTransformer method
stewarthe6 Jan 23, 2025
225c00d
Merge branch '1.7.0' into feature_sklearn_transformer
stewarthe6 Mar 20, 2025
fa3ea05
Update descriptor_sets_sources_by_descr_type.csv
stewarthe6 Mar 20, 2025
01b4c28
rdkit_scaled by heavy atom count is working
stewarthe6 Mar 26, 2025
94326bc
rdkit_raw has the Ipc feature which causes RobustTransformer to retur…
stewarthe6 Mar 27, 2025
6e7805a
Added SimpleImputer to help sort out nans in the dataset when used wi…
stewarthe6 Mar 27, 2025
46fe7a1
Set add_indicator to True so inverse transform works. Also cleaned up…
stewarthe6 Mar 27, 2025
2592636
Merged in sklearn transformer branch. Added mordred_filtered_scaled f…
stewarthe6 Mar 27, 2025
cdb6f98
Added json needed for the delaney_panel test.
stewarthe6 Mar 31, 2025
da690e6
default_true and default_false were being overwritten. Now calls set_…
stewarthe6 Mar 31, 2025
216e02b
Merge branch 'feature_sklearn_transformer' into feat_scaled_rdkit_mor…
stewarthe6 Mar 31, 2025
8634988
Refactor so we wrap sklearn Pipelines instead of transformers to make…
stewarthe6 Apr 2, 2025
c748d0a
Merge branch 'feature_sklearn_transformer' into feat_scaled_rdkit_mor…
stewarthe6 Apr 3, 2025
2969378
Updated rdkit_scaled and mordred_filtered_scaled. Added option to hav…
stewarthe6 Apr 3, 2025
e0af6a2
Updated input sanity check for SklearnPipeLineWrapper. Removed 'add_i…
stewarthe6 Apr 17, 2025
8d98165
Updated transformer test to expect NotImplementedError when untransfo…
stewarthe6 Apr 21, 2025
35eb9e9
Merge branch '1.7.0' into feature_sklearn_transformer
stewarthe6 May 13, 2025
c3ae805
Merge branch 'feature_sklearn_transformer' into feat_scaled_rdkit_mor…
stewarthe6 May 14, 2025
2 changes: 2 additions & 0 deletions atomsci/ddm/data/descriptor_sets_sources_by_descr_type.csv

Large diffs are not rendered by default.

75 changes: 46 additions & 29 deletions atomsci/ddm/docs/PARAMETERS.md
@@ -16,7 +16,6 @@ The AMPL pipeline contains many parameters and options to fit models and make pr
- [Hybrid model](#Hybrid-model)
- [Splitting](#Splitting)
- [Transformers](#Transformers)
- [UMAP](#UMAP)
- [XGBoost](#XGBoost)
- [Additional DeepChem Models](#Auto-DCModels)
- [Model Saving](#Model-Saving)
@@ -626,16 +625,17 @@ the model will train for max_epochs regardless of validation error.|

|||
|-|-|
|*Description:*|type of transformation for the features|
|*Default:*|normalization|
|*Type:*|Choice|
|*Description:*|type of transformation for the features. Choices are {"normalization", "RobustScaler", "PowerTransformer"}.|
|*Default:*|"normalizaton"|
|*Type:*|choice|
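
For illustration only, a minimal sketch of how these three choices might map onto scikit-learn transformers. The `make_feature_transformer` helper is hypothetical, not AMPL's actual code, and the "normalization" branch uses `StandardScaler` purely as a stand-in for AMPL's own normalization transformer.

```python
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler

def make_feature_transformer(feature_transform_type="normalization"):
    """Hypothetical helper mapping feature_transform_type to a sklearn transformer."""
    if feature_transform_type == "normalization":
        # Stand-in for AMPL's default normalization transformer.
        return StandardScaler()
    elif feature_transform_type == "RobustScaler":
        # Scales using the median and interquartile range, so outlier
        # descriptor values do not dominate the scale.
        return RobustScaler()
    elif feature_transform_type == "PowerTransformer":
        # Monotonic transform toward a more Gaussian-shaped distribution.
        return PowerTransformer()
    raise ValueError(f"Unknown feature_transform_type: {feature_transform_type}")
```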

- **response\_transform\_type**

|||
|-|-|
|*Description:*|type of transformation for the response column (defaults to "normalization") TODO: Not currently implemented|
|*Default:*|normalization|
|*Description:*|type of transformation for the response column. Choices are {"normalization"}|
|*Default:*|"normalization"|
|*Type:*|choice|

- **weight\_transform\_type**

@@ -673,46 +673,63 @@ the model will train for max_epochs regardless of validation error.|
|*Default:*|TRUE|
|*Type:*|Bool|

---

<a name="UMAP"></a>
## UMAP

- **umap\_dim**
- **robustscaler_with_centering**

|||
|-|-|
|*Description:*|Dimension of projected feature space, if UMAP transformation is requested. Can be input as a comma separated list for hyperparameter search (e.g. '2,6,10').|
|*Default:*|10|

- **umap\_metric**
|*Description:*|If `True`, center the data before scaling. This will cause `transform` to raise an exception when attempted on sparse matrices, because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory.|
|*Default:*|TRUE|
|*Type:*|Bool|

- **robustscaler_with_scaling**

|||
|-|-|
|*Description:*|Distance metric used, if UMAP transformation is requested. Can be input as a comma separated list for hyperparameter search (e.g. 'euclidean','cityblock')|
|*Default:*|euclidean|

- **umap\_min\_dist**
|*Description:*|If `True`, scale the data to interquartile range.|
|*Default:*|TRUE|
|*Type:*|Bool|

- **robustscaler_quartile_range**

|||
|-|-|
|*Description:*|Minimum distance used in UMAP projection, if UMAP transformation is requested. Can be input as a comma separated list for hyperparameter search (e.g. '0.01,0.02,0.05')|
|*Default:*|0.05|

- **umap\_neighbors**
|*Description:*|Quantile range used to calculate `scale_`. By default this is equal to the IQR, i.e., `q_min` is the 25th percentile and `q_max` is the 75th percentile. `(q_min, q_max), 0.0 < q_min < q_max < 100.0`|
|*Default:*|(25.0, 75.0)|
|*Type:*|List|

- **robustscaler_unit_variance**

|||
|-|-|
|*Description:*|Number of nearest neighbors used in UMAP projection, if UMAP transformation is requested. Can be input as a comma separated list for hyperparameter search (e.g. '10,20,30')|
|*Default:*|20|
|*Description:*|If `True`, scale data so that normally distributed features have a variance of 1. In general, if the difference between the x-values of `q_max` and `q_min` for a standard normal distribution is greater than 1, the dataset will be scaled down. If less than 1, the dataset will be scaled up.|
|*Default:*|FALSE|
|*Type:*|Bool|
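
Taken together, a minimal sketch of how the four `robustscaler_*` parameters above plausibly correspond to scikit-learn's `RobustScaler` arguments. The mapping shown in the comments is an assumption; only the sklearn argument names and behavior are standard.

```python
import numpy as np
from sklearn.preprocessing import RobustScaler

X = np.array([[1.0], [2.0], [3.0], [100.0]])  # one feature containing an outlier

# Assumed correspondence between the AMPL parameters and sklearn arguments.
scaler = RobustScaler(
    with_centering=True,          # robustscaler_with_centering
    with_scaling=True,            # robustscaler_with_scaling
    quantile_range=(25.0, 75.0),  # robustscaler_quartile_range
    unit_variance=False,          # robustscaler_unit_variance
)
X_scaled = scaler.fit_transform(X)
# Because the median and IQR are used instead of the mean and standard
# deviation, the outlier does not dominate the resulting scale.
```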

- **powertransformer_method**

- **umap\_targ\_wt**
|||
|-|-|
|*Description:*|The power transform method. "yeo-johnson" works with both positive and negative values; "box-cox" only works with strictly positive values. Choices are {"yeo-johnson", "box-cox"}.|
|*Default:*|"yeo-johnson"|
|*Type:*|choice|
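
A short sketch contrasting the two methods using scikit-learn's `PowerTransformer` directly (the toy data here are made up):

```python
import numpy as np
from sklearn.preprocessing import PowerTransformer

X = np.array([[-1.0], [0.5], [2.0], [10.0]])  # contains a non-positive value

# "yeo-johnson" handles zero and negative values.
pt = PowerTransformer(method="yeo-johnson", standardize=True)
X_yj = pt.fit_transform(X)

# "box-cox" requires strictly positive data; this line would raise a ValueError:
# PowerTransformer(method="box-cox").fit_transform(X)
```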

- **powertransformer_standardize**

|||
|-|-|
|*Description:*|Weight given to training set response values in UMAP projection, if UMAP transformation is requested. Can be input as a comma separated list for hyperparameter search (e.g. '0.0,0.1,0.2')|
|*Default:*|0.0|
|*Description:*|Set to True to apply zero-mean, unit-variance normalization to the transformed output.|
|*Default:*|TRUE|
|*Type:*|Bool|

- **imputer_strategy**

|||
|-|-|
|*Description:*|Sets the imputation strategy for the SimpleImputer used with PowerTransformer or RobustScaler. Choices are {"mean", "median", "most_frequent"}.|
|*Default:*|"mean"|
|*Type:*|choice|
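
A minimal sketch of how `imputer_strategy` might be combined with one of the scalers in a sklearn `Pipeline`, in the spirit of the commits that added a `SimpleImputer` to handle NaNs in the descriptor matrix; the exact pipeline AMPL builds may differ.

```python
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer

X = np.array([[1.0, 2.0],
              [np.nan, 3.0],
              [4.0, np.nan],
              [5.0, 6.0]])

# Assumed composition: impute missing descriptor values, then transform.
pipe = Pipeline([
    ("impute", SimpleImputer(strategy="mean")),                          # imputer_strategy
    ("power", PowerTransformer(method="yeo-johnson", standardize=True)),
])
X_transformed = pipe.fit_transform(X)
```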


---

<a name="XGBoost"></a>
196 changes: 0 additions & 196 deletions atomsci/ddm/pipeline/compare_models.py
@@ -691,202 +691,6 @@ def get_best_models_info(col_names=None, bucket='public', pred_type="regression"
top_models_df.to_csv(os.path.join(output_dir, 'best_models_metadata_%s.csv' % shortened_key), index=False)
return top_models_df


# TODO: This function looks like work in progress, should we delete it?
'''
#---------------------------------------------------------------------------------------------------------
def _get_best_grouped_models_info(collection='pilot_fixed', pred_type='regression', top_n=1, subset='test'):
"""Get results for models in the given collection."""

if not mlmt_supported:
print("Model tracker not supported in your environment; can examine models saved in filesystem only.")
return

res_dir = '/usr/local/data/%s_perf' % collection
plt_dir = '%s/Plots' % res_dir
os.makedirs(plt_dir, exist_ok=True)
res_files = os.listdir(res_dir)
suffix = '_%s_model_perf_metrics.csv' % collection

if pred_type == 'regression':
metric_type = 'r2_score'
else:
metric_type = 'roc_auc_score'
for res_file in res_files:
try:
if not res_file.endswith(suffix):
continue
res_path = os.path.join(res_dir, res_file)

res_df = pd.read_csv(res_path, index_col=False)
res_df['combo'] = ['%s/%s' % (m,f) for m, f in zip(res_df.model_type.values, res_df.featurizer.values)]
dset_name = res_file.replace(suffix, '')
datasets.append(dset_name)
res_df['dataset'] = dset_name
print(dset_name)
res_df = res_df.sort_values('{0}_{1}'.format(metric_type, subset), ascending=False)
res_df['model_type/feat'] = ['%s/%s' % (m,f) for m, f in zip(res_df.model_type.values, res_df.featurizer.values)]
res_df = res_df.sort_values('{0}_{1}'.format(metric_type, subset), ascending=False)
grouped_df = res_df.groupby('model_type/feat').apply(
lambda t: t.head(top_n)
).reset_index(drop=True)
top_grouped_models.append(grouped_df)
top_combo = res_df['model_type/feat'].values[0]
top_combo_dsets.append(top_combo + dset_name.lstrip('ATOM_GSK_dskey'))
top_score = res_df['{0}_{1}'.format(metric_type, subset)].values[0]
top_model_feat.append(top_combo)
top_scores.append(top_score)
num_samples.append(res_df['Dataset Size'][0])

#------------------------------------------------------------------------------------------------------------------
def get_umap_nn_model_perf_table(dataset_key, bucket, collection_name, pred_type='regression'):
"""Load performance metrics from model tracker for all NN models with the given prediction_type saved in
the model tracker DB under a given collection that were trained against a particular dataset. Show
parameter settings for UMAP transformer for models where they are available.

Args:
dataset_key (str): Dataset key for training dataset.

bucket (str): Dataset bucket for training dataset.

collection_name (str): Name of model tracker collection to search for models.

pred_type (str): Prediction type ('classification' or 'regression') of models to query.

Returns:
pd.DataFrame: Table of model performance metrics.

"""
if not mlmt_supported:
print("Model tracker not supported in your environment; can examine models saved in filesystem only.")
return None

query_params = {
"match_metadata": {
"training_dataset.bucket": bucket,
"training_dataset.dataset_key": dataset_key,
"model_parameters.model_type" : "NN",
"model_parameters.prediction_type" : pred_type
},

"match_metrics": {
"metrics_type": "training", # match only training metrics
"label": "best",
},
}
query_params['match_metadata'].update(other_filters)

print("Finding models trained on %s dataset %s" % (bucket, dataset_key))
mlmt_client = dsf.initialize_model_tracker()
metadata_list = mlmt_client.model.query_model_metadata(
collection_name=collection_name,
query_params=query_params,
).result()
if metadata_list == []:
print("No matching models returned")
return
else:
print("Found %d matching models" % len(metadata_list))

model_uuid_list = []
learning_rate_list = []
dropouts_list = []
layer_sizes_list = []
featurizer_list = []
best_epoch_list = []
max_epochs_list = []

feature_transform_type_list = []
umap_dim_list = []
umap_targ_wt_list = []
umap_neighbors_list = []
umap_min_dist_list = []

subsets = ['train', 'valid', 'test']

if pred_type == 'regression':
sort_metric = 'r2_score'
metrics = ['r2_score', 'rms_score', 'mae_score']
else:
sort_metric = 'roc_auc_score'
metrics = ['roc_auc_score', 'prc_auc_score', 'matthews_cc', 'kappa', 'confusion_matrix']
score_dict = {}
for subset in subsets:
score_dict[subset] = {}
for metric in metrics:
score_dict[subset][metric] = []

for metadata_dict in metadata_list:
model_uuid = metadata_dict['model_uuid']
#print("Got metadata for model UUID %s" % model_uuid)

# Get model metrics for this model
metrics_dicts = metadata_dict['training_metrics']
#print("Got %d metrics dicts for model %s" % (len(metrics_dicts), model_uuid))
if len(metrics_dicts) < 3:
print("Got no or incomplete metrics for model %s, skipping..." % model_uuid)
continue
if len(metrics_dicts) > 3:
raise Exception('Got more than one set of best epoch metrics for model %s' % model_uuid)
subset_metrics = {}
for metrics_dict in metrics_dicts:
subset = metrics_dict['subset']
subset_metrics[subset] = metrics_dict['prediction_results']

model_uuid_list.append(model_uuid)
model_params = metadata_dict['model_parameters']
model_type = model_params['model_type']
if model_type != 'NN':
continue
featurizer = model_params['featurizer']
featurizer_list.append(featurizer)
feature_transform_type = metadata_dict['training_dataset']['feature_transform_type']
feature_transform_type_list.append(feature_transform_type)
nn_params = metadata_dict['nn_specific']
max_epochs_list.append(nn_params['max_epochs'])
best_epoch_list.append(nn_params['best_epoch'])
learning_rate_list.append(nn_params['learning_rate'])
layer_sizes_list.append(','.join(['%d' % s for s in nn_params['layer_sizes']]))
dropouts_list.append(','.join(['%.2f' % d for d in nn_params['dropouts']]))
for subset in subsets:
for metric in metrics:
score_dict[subset][metric].append(subset_metrics[subset][metric])
if 'umap_specific' in metadata_dict:
umap_params = metadata_dict['umap_specific']
umap_dim_list.append(umap_params['umap_dim'])
umap_targ_wt_list.append(umap_params['umap_targ_wt'])
umap_neighbors_list.append(umap_params['umap_neighbors'])
umap_min_dist_list.append(umap_params['umap_min_dist'])
else:
umap_dim_list.append(nan)
umap_targ_wt_list.append(nan)
umap_neighbors_list.append(nan)
umap_min_dist_list.append(nan)


perf_df = pd.DataFrame(dict(
model_uuid=model_uuid_list,
learning_rate=learning_rate_list,
dropouts=dropouts_list,
layer_sizes=layer_sizes_list,
featurizer=featurizer_list,
best_epoch=best_epoch_list,
max_epochs=max_epochs_list,
feature_transform_type=feature_transform_type_list,
umap_dim=umap_dim_list,
umap_targ_wt=umap_targ_wt_list,
umap_neighbors=umap_neighbors_list,
umap_min_dist=umap_min_dist_list ))
for subset in subsets:
for metric in metrics:
metric_col = '%s_%s' % (metric, subset)
perf_df[metric_col] = score_dict[subset][metric]
sort_by = '%s_valid' % sort_metric

perf_df = perf_df.sort_values(sort_by, ascending=False)
return perf_df
'''

#------------------------------------------------------------------------------------------------------------------
def get_tarball_perf_table(model_tarball, pred_type='classification'):
"""Retrieve model metadata and performance metrics for a model saved as a tarball (.tar.gz) file.