Skip to content

Check for ECFP collisions #330

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Sep 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 1 addition & 21 deletions atomsci/ddm/pipeline/chem_diversity.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,27 +226,6 @@ def calc_summary(dist_arr, calc_type, num_nearest=1, within_dset=False):
print("calc_type %s is not valid" % calc_type)
sys.exit(1)

def _get_descriptors(smiles_arr):
"""DEPRECATED. This function is guaranteed not to work, since it refers to datasets that no longer exist."""
from atomsci.ddm.utils import datastore_functions as dsf
ds_client = dsf.config_client()

full_feature_matrix_key = '/ds/projdata/gsk_data/GSK_datasets/eXP_Panel_Min_100_Cmpds/scaled_descriptors/' \
'subset_all_GSK_Compound_2D_3D_MOE_Descriptors_Scaled_With_Smiles_And_Inchi_HTR2A_5_' \
'HT2A_Human_Antagonist_HEK_Luminescence_f_PIC50.csv'
full_feature_matrix = dsf.retrieve_dataset_by_datasetkey(full_feature_matrix_key, 'gskdata', ds_client)
#smiles_df = pd.DataFrame(smiles_arr)
#df = full_feature_matrix.merge(
# smiles_df, how='inner', left_on='smiles', right_on=smiles_df.columns[0])
df = full_feature_matrix.head(20)
del full_feature_matrix
descriptor_features = [x for x in df.columns.values.tolist() if x not in
['compound_id', 'inchi_key', 'smiles', 'smiles_out',
'lost_frags', 'inchi_string', 'pxc50', 'rdkit_smiles',
'HTR2A_5_HT2A_Human_Antagonist_HEK_Luminescence_f_PIC50']]
#TODO this probably doesn't work
return df[descriptor_features]

def upload_distmatrix_to_DS(
dist_matrix,feature_type,compound_ids,bucket,title,description,tags,key_values,filepath="./",dataset_key=None):
"""Uploads distance matrix in the data store with the appropriate tags
Expand Down Expand Up @@ -287,3 +266,4 @@ def upload_distmatrix_to_DS(
filename = fnm.replace("nm",feature_type) # fn is not defined anywhere. need to address this
_dist_pkl = dist_df.to_pickle(filepath + filename)
dsf.upload_file_to_DS(bucket, title, description, tags, key_values, filepath, filename, dataset_key, client=None)

58 changes: 58 additions & 0 deletions atomsci/ddm/pipeline/featurization.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,55 @@ def remove_duplicate_smiles(dset_df, smiles_col='rdkit_smiles'):
log.warning("All rows with duplicate smiles strings have been removed")
return dset_df

# ****************************************************************************************
def check_ecfp_collisions(dset_df, smiles_col, size, radius):
"""Check for ECFP uniqueness
Check to see if unique smiles result in unique ECFP features. Sometimes isn't the case
for ECFP features. A set of features should be the same size as a set of SMILES

Args:
feat_array (numpy array): 2d Feature array. Aligned with smiles

smiles (iterable of str): List of SMILES strings.

Returns:
collisions (bool):
True if there are collisions
"""

featurizer_obj = dc.feat.CircularFingerprint(size=size, radius=radius)

features, is_valid = featurize_smiles(dset_df, featurizer_obj, smiles_col)

valid_smiles = dset_df[is_valid][smiles_col]

return check_ecfp_feature_collisions(features, valid_smiles)


# ****************************************************************************************
def check_ecfp_feature_collisions(feat_array, smiles):
"""Check for feature uniqueness
Check to see if unique smiles result in unique features. Sometimes isn't the case
for ECFP features. A set of features should be the same size as a set of SMILES

Args:
feat_array (numpy array): 2d Feature array. Aligned with smiles

smiles (iterable of str): List of SMILES strings.

Returns:
collisions (bool):
True if there are collisions
"""

num_unique_feats = len(np.unique(feat_array, axis=0))
num_unique_smiles = len(set(smiles))

if num_unique_feats == num_unique_smiles and num_unique_smiles == 0:
log.warning("Received empty features and smiles when checking for ecfp feature collisions.")

return num_unique_feats != num_unique_smiles

# ****************************************************************************************
def get_dataset_attributes(dset_df, params):
"""Construct a table mapping compound IDs to SMILES strings and possibly other attributes
Expand Down Expand Up @@ -723,6 +772,15 @@ def featurize_data(self, dset_df, params, contains_responses):
"""
attr = get_dataset_attributes(dset_df, params)
features, is_valid = featurize_smiles(dset_df, featurizer=self.featurizer_obj, smiles_col=params.smiles_col)

if params.featurizer == 'ecfp':
valid_smiles = dset_df[is_valid][params.smiles_col]
has_collisions = check_ecfp_feature_collisions(features, valid_smiles)

if has_collisions:
log.warning("Multiple SMILES mapped have the same ECFP features. "
"Increasing ecfp_radius can reduce collisons.")

if features is None:
raise Exception("Featurization failed for dataset")
# Some SMILES strings may not be featurizable. This filters for only valid IDs.
Expand Down
97 changes: 97 additions & 0 deletions atomsci/ddm/test/unit/test_check_ecfp_collisions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import atomsci.ddm.pipeline.featurization as featurization
import atomsci.ddm.pipeline.parameter_parser as parser
import pandas as pd

def get_collision_df():
collision_smiles = [
# group 1
'CC(=O)N(C)[C@@H](Cc1ccccc1)C(=O)N(C)[C@@H](Cc1ccccc1)C(=O)N(C)[C@@H](Cc1ccccc1)C(N)=O',
'CC(=O)N(C)[C@@H](Cc1ccccc1)C(=O)N(C)[C@@H](Cc1ccccc1)C(=O)N(C)[C@@H](Cc1ccccc1)C(=O)N(C)[C@@H](Cc1ccccc1)C(N)=O',
'CC(=O)N(C)[C@@H](Cc1ccccc1)C(=O)N(C)[C@@H](Cc1ccccc1)C(N)=O',

# group 2
'CC(CN1CCCCCC1)NC(=O)/C=N/O',
'CC(CN1CCCCC1)NC(=O)/C=N/O',

# group 3
'O=C(/C=N/O)NCCN1CCCCCC1',
'O=C(/C=N/O)NCCN1CCCCC1',

# group 4
'COc1ccc(/C=C2\\COc3cc(OCCCCCCCCNc4c5c(nc6ccccc46)CCCC5)ccc3C2=O)cc1',
'COc1ccc(/C=C2\\COc3cc(OCCCCCCNc4c5c(nc6ccccc46)CCCC5)ccc3C2=O)cc1'

# group 5
'O=c1ccc2ccc(OCCCCCN3CCN(CCCNc4c5c(nc6ccccc46)CCCC5)CC3)cc2o1',
'O=c1ccc2ccc(OCCCCCCN3CCN(CCCNc4c5c(nc6ccccc46)CCCC5)CC3)cc2o1',

# group 6
'O/N=C/c1cc[n+](CCCCCC[n+]2ccc(/C=N/O)cc2)cc1',
'O/N=C/c1cc[n+](CCCCC[n+]2ccc(/C=N/O)cc2)cc1',
'O/N=C/c1cc[n+](CCCCCCC[n+]2ccc(/C=N/O)cc2)cc1',
]
collision_df = pd.DataFrame(data={'smiles':collision_smiles, 'id':[f'id_{i}' for i in range(len(collision_smiles))]})

return collision_df

def test_check_ecfp_collisions():
collision_df = get_collision_df()

no_collision_smiles = [
# group 1
'CC(=O)N(C)[C@@H](Cc1ccccc1)C(=O)N(C)[C@@H](Cc1ccccc1)C(=O)N(C)[C@@H](Cc1ccccc1)C(N)=O',

# group 2
'CC(CN1CCCCCC1)NC(=O)/C=N/O',

# group 3
'O=C(/C=N/O)NCCN1CCCCCC1',

# group 4
'COc1ccc(/C=C2\\COc3cc(OCCCCCCCCNc4c5c(nc6ccccc46)CCCC5)ccc3C2=O)cc1',

# group 5
'O=c1ccc2ccc(OCCCCCN3CCN(CCCNc4c5c(nc6ccccc46)CCCC5)CC3)cc2o1',

# group 6
'O/N=C/c1cc[n+](CCCCCC[n+]2ccc(/C=N/O)cc2)cc1',
]
no_collision_df = pd.DataFrame(data={'smiles':no_collision_smiles})

collision = featurization.check_ecfp_collisions(collision_df, 'smiles', size=1024, radius=2)
assert collision, "Collisions should have been found."

no_collision = featurization.check_ecfp_collisions(no_collision_df, 'smiles', size=1024, radius=2)
assert not no_collision, "No collisions should have been found."

def test_dynamic_featurization_with_collisions():
test_csv = "test_collision_df.csv"
result_dir = "test_ecfp_collision_result"
collision_df = get_collision_df()

params = {
# dataset key
"dataset_key": test_csv,

# columns
"id_col": "id",
"smiles_col": "smiles",
"response_cols": "this column doesn't exist",

# featurizer
"featurizer": "ecfp",

# result dir
"result_dir": result_dir

}
pparams = parser.wrapper(params)
dynamic_featurizer = featurization.DynamicFeaturization(pparams)

# this should log a warning
_ = dynamic_featurizer.featurize_data(collision_df, pparams, contains_responses=False)


if __name__ == '__main__':
test_dynamic_featurization_with_collisions()
test_check_ecfp_collisions()
Loading