# Predicting Regression - Reaction
[1]:
# Install chemprop from GitHub if running in Google Colab
import os
# `COLAB_RELEASE_TAG` is used here as the marker for a Colab runtime; outside Colab
# the variable is unset and this whole cell is a no-op.
if os.getenv("COLAB_RELEASE_TAG"):
    try:
        # Skip the install when chemprop is already importable.
        import chemprop
    except ImportError:
        # Clone the repository, install it from source, then move into `examples`.
        # Later cells locate files relative to the parent of the working directory.
        !git clone https://github.com/chemprop/chemprop.git
        %cd chemprop
        !pip install .
        %cd examples
## Import packages
[2]:
import pandas as pd
import numpy as np
import torch
from lightning import pytorch as pl
from pathlib import Path
from chemprop import data, featurizers, models
## Change model input here
[3]:
# Root of the chemprop checkout; assumes the notebook is run from its `examples` directory.
chemprop_dir = Path.cwd().parent

# Path to the checkpoint file.
# If the checkpoint file is generated using the training notebook, it will be in the
# `checkpoints` folder with a name similar to `checkpoints/epoch=19-step=180.ckpt`.
checkpoint_path = chemprop_dir.joinpath("tests", "data", "example_model_v2_regression_rxn.ckpt")
## Load model
[4]:
# Restore the trained model from the checkpoint file selected above.
mpnn = models.MPNN.load_from_checkpoint(checkpoint_path)
# Bare expression: the notebook displays the module structure as the cell output.
mpnn
[4]:
MPNN(
(message_passing): BondMessagePassing(
(W_i): Linear(in_features=134, out_features=300, bias=False)
(W_h): Linear(in_features=300, out_features=300, bias=False)
(W_o): Linear(in_features=406, out_features=300, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(tau): ReLU()
(V_d_transform): Identity()
(graph_transform): GraphTransform(
(V_transform): Identity()
(E_transform): Identity()
)
)
(agg): MeanAggregation()
(bn): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(predictor): RegressionFFN(
(ffn): MLP(
(0): Sequential(
(0): Linear(in_features=300, out_features=300, bias=True)
)
(1): Sequential(
(0): ReLU()
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=300, out_features=1, bias=True)
)
)
(criterion): MSE(task_weights=[[1.0]])
(output_transform): UnscaleTransform()
)
(X_d_transform): Identity()
(metrics): ModuleList(
(0-1): 2 x MSE(task_weights=[[1.0]])
)
)
## Change predict input here
[5]:
# Same repository root as in the model-input cell, recomputed here.
chemprop_dir = Path.cwd().parent

# CSV file with the reactions to predict on.
test_path = chemprop_dir.joinpath("tests", "data", "regression", "rxn", "rxn.csv")

# Name of the CSV column that holds the reaction SMILES strings.
smiles_column = "smiles"
## Load SMILES
[6]:
# Read the whole table; the predictions are attached to it as a new column later on.
df_test = pd.read_csv(test_path)

# Pull the reaction SMILES column out as an array.
smis = df_test[smiles_column].values

# Preview the first five entries.
smis[:5]
[6]:
array(['[O:1]([C:2]([C:3]([C:4](=[O:5])[C:6]([O:7][H:15])([H:13])[H:14])([H:11])[H:12])([H:9])[H:10])[H:8]>>[C:3](=[C:4]=[O:5])([H:11])[H:12].[C:6]([O:7][H:15])([H:8])([H:13])[H:14].[O:1]=[C:2]([H:9])[H:10]',
'[C:1]1([H:8])([H:9])[O:2][C@@:3]2([H:10])[C@@:4]3([H:11])[O:5][C@:6]1([H:12])[C@@:7]23[H:13]>>[C:1]1([H:8])([H:9])[O:2][C:3]([H:10])=[C:7]([H:13])[C@:6]1([O+:5]=[C-:4][H:11])[H:12]',
'[C:1]([C@@:2]1([H:11])[C@@:3]2([H:12])[C:4]([H:13])([H:14])[C:5]([H:15])=[C:6]([H:16])[C@@:7]12[H:17])([H:8])([H:9])[H:10]>>[C:1]([C@@:2]1([H:11])[C:3]([H:12])([H:13])[C:4]([H:14])=[C:5]([H:15])[C:6]([H:16])=[C:7]1[H:17])([H:8])([H:9])[H:10]',
'[C:1]([O:2][C:3]([C@@:4]([C:5]([H:14])([H:15])[H:16])([C:6]([O:7][H:19])([H:17])[H:18])[H:13])([H:11])[H:12])([H:8])([H:9])[H:10]>>[C-:1]([O+:2]=[C:3]([C@@:4]([C:5]([H:14])([H:15])[H:16])([C:6]([O:7][H:19])([H:17])[H:18])[H:13])[H:12])([H:8])[H:10].[H:9][H:11]',
'[C:1]([C:2]#[C:3][C:4]([C:5](=[O:6])[H:12])([H:10])[H:11])([H:7])([H:8])[H:9]>>[C:1]([C:2](=[C:3]=[C:4]([H:10])[H:11])[C:5](=[O:6])[H:12])([H:7])([H:8])[H:9]'],
dtype=object)
## Load datapoints
[7]:
# Build one ReactionDatapoint per reaction SMILES string, in input order.
test_data = list(map(data.ReactionDatapoint.from_smi, smis))
## Define featurizer
[8]:
# Testing parameters should match training parameters: `mode_` has to be the same
# value the model was trained with (presumably "PROD_DIFF" for the example
# checkpoint -- confirm this when using your own model).
featurizer = featurizers.CondensedGraphOfReactionFeaturizer(mode_="PROD_DIFF")
## Get dataset and dataloader
[9]:
# Pair the datapoints with the featurizer defined above.
test_dset = data.ReactionDataset(test_data, featurizer=featurizer)
# shuffle=False: the predictions are later assigned to `df_test` by position,
# so the loader must yield the reactions in their original order.
test_loader = data.build_dataloader(test_dset, shuffle=False)
## Perform predictions
[10]:
# Run the prediction pass on a single CPU device with autograd disabled.
with torch.inference_mode():
    trainer = pl.Trainer(accelerator="cpu", devices=1, logger=None, enable_progress_bar=True)
    # `predict` returns one entry per batch; the next cell joins them together.
    test_preds = trainer.predict(model=mpnn, dataloaders=test_loader)
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
/home/knathan/anaconda3/envs/chemprop/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:434: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
[11]:
# Join the per-batch predictions along the first axis into one array.
# NOTE: this rebinds `test_preds` (the list returned by `trainer.predict`) to a single
# array, so re-run the prediction cell before re-running this one.
test_preds = np.concatenate(test_preds)

# Attach the predictions to the input table; rows line up because the loader was not shuffled.
df_test["preds"] = test_preds

# Show the table with the new `preds` column.
df_test
[11]:
| smiles | ea | preds | |
|---|---|---|---|
| 0 | [O:1]([C:2]([C:3]([C:4](=[O:5])[C:6]([O:7][H:1... | 8.898934 | 8.071494 |
| 1 | [C:1]1([H:8])([H:9])[O:2][C@@:3]2([H:10])[C@@:... | 5.464328 | 8.108090 |
| 2 | [C:1]([C@@:2]1([H:11])[C@@:3]2([H:12])[C:4]([H... | 5.270552 | 8.087680 |
| 3 | [C:1]([O:2][C:3]([C@@:4]([C:5]([H:14])([H:15])... | 8.473006 | 8.070966 |
| 4 | [C:1]([C:2]#[C:3][C:4]([C:5](=[O:6])[H:12])([H... | 5.579037 | 8.065533 |
| ... | ... | ... | ... |
| 95 | [C:1]([C:2]([C:3]([H:12])([H:13])[H:14])([C:4]... | 9.295665 | 8.071316 |
| 96 | [O:1]=[C:2]([C@@:3]1([H:9])[C:4]([H:10])([H:11... | 7.753442 | 8.085133 |
| 97 | [C:1]([C@@:2]1([H:11])[C@@:3]2([H:12])[C:4]([H... | 10.650215 | 8.096391 |
| 98 | [C:1]1([H:8])([H:9])[C@@:2]2([H:10])[N:3]1[C:4... | 10.138945 | 8.202709 |
| 99 | [C:1]([C@@:2]1([C:3]([C:4]([O:5][H:15])([H:13]... | 6.979934 | 8.107012 |
100 rows × 3 columns