def _is_missing_mol(mol):
    """Return True when a molecule cell holds no molecule (None or a float NaN)."""
    # pandas commonly represents missing cells as NaN rather than None;
    # NaN is the only float that is not equal to itself.
    return mol is None or (isinstance(mol, float) and mol != mol)


def _drug_likeness_properties(mol):
    """Compute the drug-likeness descriptors and filter flags for one molecule.

    Args:
        mol (rdkit.Chem.Mol): Molecule to characterize. Must not be missing.

    Returns:
        dict: Descriptor values and boolean filter results, keyed by the output
        column names used by compute_drug_likeness.
    """
    mol_wt = Descriptors.MolWt(mol)
    logp = Descriptors.MolLogP(mol)
    h_donors = Descriptors.NumHDonors(mol)
    h_acceptors = Descriptors.NumHAcceptors(mol)
    tpsa = Descriptors.TPSA(mol)
    rot_bonds = Descriptors.NumRotatableBonds(mol)
    molar_refractivity = Descriptors.MolMR(mol)
    total_atoms = Chem.rdMolDescriptors.CalcNumAtoms(mol)

    return {
        'MolWt': mol_wt,
        'LogP': logp,
        'NumHDonors': h_donors,
        'NumHAcceptors': h_acceptors,
        'TPSA': tpsa,
        'NumRotatableBonds': rot_bonds,
        'MolarRefractivity': molar_refractivity,
        'QED': QED.qed(mol),
        'TotalAtoms': total_atoms,
        # Lipinski's rule of five
        'Lipinski': (mol_wt <= 500 and logp <= 5
                     and h_donors <= 5 and h_acceptors <= 10),
        # Ghose filter
        'Ghose': (160 <= mol_wt <= 480 and -0.4 <= logp <= 5.6
                  and 40 <= molar_refractivity <= 130
                  and 20 <= total_atoms <= 70),
        # Veber's rule
        'Veber': (rot_bonds <= 10 and tpsa <= 140),
    }


def compute_drug_likeness(df, molecule_column='mol'):
    """Compute molecular descriptors and drug-likeness criteria for compounds given as RDKit Mol objects.

    The descriptors are limited to those used to compute the Lipinski rule-of-five,
    Ghose and Veber drug-likeness filters. The QED (quantitative estimate of
    drug-likeness) score is also computed, along with columns of booleans indicating
    whether each set of filter criteria is met. The input data frame is not modified.

    Rows whose molecule is missing (None or NaN) get None in every added column.

    Args:
        df (pandas.DataFrame): Input DataFrame containing RDKit Mol objects.
        molecule_column (str): Name of the column in the DataFrame that contains
            the RDKit Mol objects. Default is 'mol'.

    Returns:
        pandas.DataFrame: A copy of the input DataFrame with additional columns:
            - MolWt: Molecular weight
            - LogP: Logarithm of the partition coefficient between n-octanol and water
            - NumHDonors: Number of hydrogen bond donors
            - NumHAcceptors: Number of hydrogen bond acceptors
            - TPSA: Topological polar surface area
            - NumRotatableBonds: Number of rotatable bonds
            - MolarRefractivity: Molar refractivity
            - QED: Quantitative estimate of drug-likeness
            - TotalAtoms: Total number of atoms
            - Lipinski: Boolean indicating if the molecule meets Lipinski's rule of five criteria
            - Ghose: Boolean indicating if the molecule meets Ghose filter criteria
            - Veber: Boolean indicating if the molecule meets Veber's rule criteria

    Raises:
        KeyError: If molecule_column is not a column of df.
    """
    # Output columns, in the order they are appended to the result.
    column_names = (
        'MolWt', 'LogP', 'NumHDonors', 'NumHAcceptors', 'TPSA',
        'NumRotatableBonds', 'MolarRefractivity', 'QED', 'TotalAtoms',
        'Lipinski', 'Ghose', 'Veber',
    )
    columns = {name: [] for name in column_names}

    for mol in df[molecule_column]:
        if _is_missing_mol(mol):
            row = dict.fromkeys(column_names)
        else:
            row = _drug_likeness_properties(mol)
        for name in column_names:
            columns[name].append(row[name])

    # Lists are assigned positionally, so the result does not depend on the
    # input index being unique or sorted.
    df_copy = df.copy()
    for name in column_names:
        df_copy[name] = columns[name]

    return df_copy
import pytest
import pandas as pd
from rdkit import Chem
from atomsci.ddm.utils.rdkit_easy import compute_drug_likeness


def test_compute_drug_likeness():
    """Check that compute_drug_likeness adds the expected columns and ethanol values."""
    # Create a DataFrame with sample SMILES strings
    data = {
        'smiles': [
            'CCO',  # Ethanol
            'CC(=O)OC1=CC=CC=C1C(=O)O',  # Aspirin
            'CC(C)CC1=CC=C(C=C1)C(C)C(=O)O',  # Ibuprofen
            'C1=CC=C(C=C1)C=O',  # Benzaldehyde
            'CC(C)NCC(O)COC1=CC=CC=C1',  # 1-(Isopropylamino)-3-phenoxypropan-2-ol
        ]
    }
    df = pd.DataFrame(data)
    df['mol'] = df['smiles'].apply(Chem.MolFromSmiles)

    # Compute drug likeness
    result_df = compute_drug_likeness(df, molecule_column='mol')

    # Check that the expected columns are present in the result DataFrame
    expected_columns = [
        'MolWt', 'LogP', 'NumHDonors', 'NumHAcceptors', 'TPSA', 'NumRotatableBonds',
        'MolarRefractivity', 'QED', 'TotalAtoms', 'Lipinski', 'Ghose', 'Veber',
    ]
    for col in expected_columns:
        assert col in result_df.columns

    # Check the values computed for a known molecule (ethanol).
    # pytest.approx wraps the expected value so that the relative tolerance is
    # scaled by the reference number rather than by the measured one.
    ethanol_row = result_df[result_df['smiles'] == 'CCO'].iloc[0]
    assert ethanol_row['MolWt'] == pytest.approx(46.07, rel=0.1)
    assert ethanol_row['LogP'] == pytest.approx(-0.0014, rel=0.1)
    assert ethanol_row['NumHDonors'] == 1
    assert ethanol_row['NumHAcceptors'] == 1
    assert ethanol_row['TPSA'] == pytest.approx(20.23, rel=0.1)
    assert ethanol_row['NumRotatableBonds'] == 0
    assert ethanol_row['MolarRefractivity'] == pytest.approx(12.76, rel=0.1)
    assert ethanol_row['QED'] == pytest.approx(0.41, rel=0.1)
    assert ethanol_row['TotalAtoms'] == 9
    # Ethanol passes Lipinski and Veber but is too small for the Ghose filter.
    assert ethanol_row['Lipinski']
    assert not ethanol_row['Ghose']
    assert ethanol_row['Veber']