Predicting Regression - Multicomponent

Contents

Predicting Regression - Multicomponent#

[1]:

# Install chemprop from GitHub if running in Google Colab.
# The COLAB_RELEASE_TAG environment variable is used as the signal that the
# notebook is running there; when it is unset, nothing is installed.
import os

if os.getenv("COLAB_RELEASE_TAG"):
    try:
        import chemprop
    except ImportError:
        # chemprop is not importable yet: clone the repository and install it from source.
        !git clone https://github.com/chemprop/chemprop.git
        %cd chemprop
        !pip install .
        # Move into the `examples` folder so that `Path.cwd().parent`, used by the
        # cells below, resolves to the cloned repository root.
        %cd examples

Import packages#

[2]:

import numpy as np
import pandas as pd
import torch
from lightning import pytorch as pl
from pathlib import Path

from chemprop import data, featurizers
from chemprop.models import multi

Change model input here#

[3]:

# Parent of the current working directory (presumably the chemprop repository
# root when the notebook is run from its own folder).
chemprop_dir = Path.cwd().parent
# Path to the checkpoint file.
# If the checkpoint file is generated using the training notebook, it will be in
# the `checkpoints` folder with a name similar to `checkpoints/epoch=19-step=180.ckpt`.
checkpoint_path = chemprop_dir.joinpath(
    "tests", "data", "example_model_v2_regression_mol+mol.ckpt"
)

Load model#

[4]:

# Load the multicomponent MPNN from the checkpoint file at `checkpoint_path`.
mcmpnn = multi.MulticomponentMPNN.load_from_checkpoint(checkpoint_path)
# Bare last expression: shows the model's repr as the cell output.
mcmpnn

[4]:

MulticomponentMPNN(
  (message_passing): MulticomponentMessagePassing(
    (blocks): ModuleList(
      (0-1): 2 x BondMessagePassing(
        (W_i): Linear(in_features=86, out_features=300, bias=False)
        (W_h): Linear(in_features=300, out_features=300, bias=False)
        (W_o): Linear(in_features=372, out_features=300, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
        (tau): ReLU()
        (V_d_transform): Identity()
        (graph_transform): GraphTransform(
          (V_transform): Identity()
          (E_transform): Identity()
        )
      )
    )
  )
  (agg): MeanAggregation()
  (bn): BatchNorm1d(600, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (predictor): RegressionFFN(
    (ffn): MLP(
      (0): Sequential(
        (0): Linear(in_features=600, out_features=300, bias=True)
      )
      (1): Sequential(
        (0): ReLU()
        (1): Dropout(p=0.0, inplace=False)
        (2): Linear(in_features=300, out_features=1, bias=True)
      )
    )
    (criterion): MSE(task_weights=[[1.0]])
    (output_transform): UnscaleTransform()
  )
  (X_d_transform): Identity()
  (metrics): ModuleList(
    (0-1): 2 x MSE(task_weights=[[1.0]])
  )
)

Change predict input here#

[5]:

# Parent of the current working directory; the test data path is built from it.
chemprop_dir = Path.cwd().parent
# Path to your .csv file containing SMILES strings to make predictions for.
test_path = chemprop_dir.joinpath(
    "tests", "data", "regression", "mol+mol", "mol+mol.csv"
)
# Names of the columns containing SMILES strings, one column per component.
smiles_columns = ["smiles", "solvent"]

Load test smiles#

[6]:

# Read the CSV at `test_path` into a DataFrame; the columns named in
# `smiles_columns` are extracted from it in a later cell.
df_test = pd.read_csv(test_path)
# Bare last expression: displays the frame as the cell output.
df_test

[6]:

	smiles	solvent	peakwavs_max
0	CCCCN1C(=O)C(=C/C=C/C=C/C=C2N(CCCC)c3ccccc3N2C...	ClCCl	642.0
1	C(=C/c1cnccn1)\c1ccc(N(c2ccccc2)c2ccc(/C=C/c3c...	ClCCl	420.0
2	CN(C)c1ccc2c(-c3ccc(N)cc3C(=O)[O-])c3ccc(=[N+]...	O	544.0
3	c1ccc2[nH]ccc2c1	O	290.0
4	CCN(CC)c1ccc2c(c1)OC1=C(/C=C/C3=[N+](C)c4ccc5c...	ClC(Cl)Cl	736.0
...	...	...	...
95	COc1ccc(C2CC(c3ccc(O)cc3)=NN2c2ccc(S(N)(=O)=O)...	C1CCOC1	359.0
96	COc1ccc2c3c(c4ccc(OC)cc4c2c1)C1(c2ccccc2-c2ccc...	C1CCCCC1	386.0
97	CCCCOc1c(C=C2N(C)c3ccccc3C2(C)C)c(=O)c1=O	CCO	425.0
98	Cc1cc2ccc(-c3cccc4cccc(-c5ccc6cc(C)c(=O)oc6c5)...	c1ccccc1	324.0
99	Cc1ccc(C(=O)c2c(C)c3ccc4cccc5c6cccc7ccc2c(c76)...	ClCCl	391.0

100 rows × 3 columns

Get smiles#

[7]:

# Pull the SMILES columns out as a 2-D array: one row per datapoint,
# one column per entry of `smiles_columns`.
smiss = df_test[smiles_columns].to_numpy()
# Preview the first five rows.
smiss[:5]

[7]:

array([['CCCCN1C(=O)C(=C/C=C/C=C/C=C2N(CCCC)c3ccccc3N2CCCC)C(=O)N(CCCC)C1=S',
        'ClCCl'],
       ['C(=C/c1cnccn1)\\c1ccc(N(c2ccccc2)c2ccc(/C=C/c3cnccn3)cc2)cc1',
        'ClCCl'],
       ['CN(C)c1ccc2c(-c3ccc(N)cc3C(=O)[O-])c3ccc(=[N+](C)C)cc-3oc2c1',
        'O'],
       ['c1ccc2[nH]ccc2c1', 'O'],
       ['CCN(CC)c1ccc2c(c1)OC1=C(/C=C/C3=[N+](C)c4ccc5ccccc5c4C3(C)C)CCCC1=C2c1ccccc1C(=O)O',
        'ClC(Cl)Cl']], dtype=object)

Get molecule datapoints#

[8]:

# Build one list of datapoints per component (i.e. per SMILES column), in the
# same order as `smiles_columns`. Column i of `smiss` holds the SMILES strings
# of component i, so each inner list has one datapoint per row of the input.
n_components = len(smiles_columns)
test_datapointss = [
    [data.MoleculeDatapoint.from_smi(smi) for smi in smiss[:, i]]
    for i in range(n_components)
]

Get molecule datasets#

[9]:

# A single featurizer instance is shared by every component dataset.
featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()
# Wrap each component's list of datapoints in its own MoleculeDataset,
# preserving the component order of `test_datapointss`.
test_dsets = [
    data.MoleculeDataset(component_datapoints, featurizer)
    for component_datapoints in test_datapointss
]

Get multicomponent dataset and data loader#

[10]:

# Combine the per-component datasets into a single multicomponent dataset.
test_mcdset = data.MulticomponentDataset(test_dsets)
# shuffle=False keeps the loader in dataset order; the predictions are later
# written back to `df_test` by position, so they must stay aligned with its rows.
test_loader = data.build_dataloader(test_mcdset, shuffle=False)

Set up trainer and make predictions#

[11]:

# Build a Lightning trainer and run prediction inside torch's inference mode
# (no gradient tracking is needed when only making predictions).
with torch.inference_mode():
    trainer = pl.Trainer(
        # NOTE(review): `logger=None` presumably falls back to Lightning's default
        # logger; `logger=False` is the documented way to disable logging — confirm intent.
        logger=None,
        enable_progress_bar=True,
        accelerator="auto",
        devices=1
    )
    # Run the model over `test_loader`; the returned outputs are concatenated
    # into a single array in the next cell.
    test_preds = trainer.predict(mcmpnn, test_loader)

💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores

/home/knathan/anaconda3/envs/chemprop/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:434: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.

[12]:

# Join the prediction outputs along the first axis into one array.
# NOTE: this rebinds `test_preds`, so re-running this cell without first
# re-running the prediction cell feeds the already-concatenated array back
# into np.concatenate.
test_preds = np.concatenate(test_preds, axis=0)
# Attach the predictions as a new `pred` column; this modifies `df_test` in place.
df_test['pred'] = test_preds
# Bare last expression: displays the frame with the new column.
df_test

[12]:

	smiles	solvent	peakwavs_max	pred
0	CCCCN1C(=O)C(=C/C=C/C=C/C=C2N(CCCC)c3ccccc3N2C...	ClCCl	642.0	454.898621
1	C(=C/c1cnccn1)\c1ccc(N(c2ccccc2)c2ccc(/C=C/c3c...	ClCCl	420.0	453.561584
2	CN(C)c1ccc2c(-c3ccc(N)cc3C(=O)[O-])c3ccc(=[N+]...	O	544.0	448.694977
3	c1ccc2[nH]ccc2c1	O	290.0	448.159760
4	CCN(CC)c1ccc2c(c1)OC1=C(/C=C/C3=[N+](C)c4ccc5c...	ClC(Cl)Cl	736.0	456.897003
...	...	...	...	...
95	COc1ccc(C2CC(c3ccc(O)cc3)=NN2c2ccc(S(N)(=O)=O)...	C1CCOC1	359.0	454.548584
96	COc1ccc2c3c(c4ccc(OC)cc4c2c1)C1(c2ccccc2-c2ccc...	C1CCCCC1	386.0	455.287140
97	CCCCOc1c(C=C2N(C)c3ccccc3C2(C)C)c(=O)c1=O	CCO	425.0	453.560364
98	Cc1cc2ccc(-c3cccc4cccc(-c5ccc6cc(C)c(=O)oc6c5)...	c1ccccc1	324.0	454.656891
99	Cc1ccc(C(=O)c2c(C)c3ccc4cccc5c6cccc7ccc2c(c76)...	ClCCl	391.0	453.118774

100 rows × 4 columns