Applied machine learning for computational chemistry.
chemlearn handles every step of the workflow, from the featurization of molecules to the training and validation of machine learning/deep learning models.
The MolDataset class handles featurization and data splitting. The current version only supports pandas DataFrames,
and the input must be a string representing the file path to a CSV file. The other inputs to MolDataset are
a splitting strategy, using one of the factory splitters, and a featurizer (see cheminftools for a list of available featurizers).
from chemlearn.data.dataset import MolDataset
from cheminftools.tools.featurizer import MolFeaturizer
from chemlearn.utils.splitters import TrainTestSplitter
# Location of the CSV file that holds the input data
csv_path = 'example_data.csv'
# Name of the column containing the SMILES strings
smiles_column = 'smiles'
# Name of the column containing the value to predict
target_column = 'pIC50'

# Splitting strategy: a train/test splitter configured with test_size=0.2
splitter = TrainTestSplitter(test_size=0.2)

# Featurizer: 'morgan' descriptors, configured through the parameters below
morgan_params = {'radius': 3, 'fpSize': 2048}
featurizer = MolFeaturizer(descriptor_type='morgan', params=morgan_params)

# Combine the data source, the column names, the splitter and the featurizer
# into a single MolDataset
dataset = MolDataset(
    data_path=csv_path,
    smiles_column=smiles_column,
    target_variable=target_column,
    splitter=splitter,
    featurizer=featurizer,
)
from chemlearn.models.skmodel import ChemLearner
from sklearn.ensemble import RandomForestRegressor
from chemlearn.models.validation.metrics import R2Score
from chemlearn.data.dataset import MolDataset
from cheminftools.tools.featurizer import MolFeaturizer
from chemlearn.utils.splitters import TrainTestSplitter
# --- Data description ---
# CSV file to read, plus the columns holding the SMILES and the target
csv_path = 'example_data.csv'
smiles_column = 'smiles'
target_column = 'pIC50'

# --- Dataset construction ---
# Train/test splitter configured with test_size=0.2
splitter = TrainTestSplitter(test_size=0.2)
# 'morgan' descriptors with radius 3 and fpSize 2048
featurizer = MolFeaturizer(
    descriptor_type='morgan',
    params={'radius': 3, 'fpSize': 2048},
)
dataset = MolDataset(
    data_path=csv_path,
    smiles_column=smiles_column,
    target_variable=target_column,
    splitter=splitter,
    featurizer=featurizer,
)

# --- Model training ---
# Metric handed to the learner
metric = R2Score()
# Wrap a scikit-learn random forest regressor in a ChemLearner and fit it
chem_learner = ChemLearner(
    model=RandomForestRegressor(),
    dataset=dataset,
    metric=metric,
)
chem_learner.fit(dataset)