# Source code for chemprop.features.features_generators

from typing import Callable, List, Union

import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem


# A molecule is accepted either as a SMILES string or as an RDKit molecule object.
Molecule = Union[str, Chem.Mol]
# A features generator is any callable mapping one molecule to a numpy array.
FeaturesGenerator = Callable[[Molecule], np.ndarray]


# Maps a registered name to its features generator; filled in by register_features_generator.
FEATURES_GENERATOR_REGISTRY = {}


def register_features_generator(features_generator_name: str) -> Callable[[FeaturesGenerator], FeaturesGenerator]:
    """
    Builds a decorator that stores a features generator in the global registry under a given name.

    :param features_generator_name: The name under which the decorated features generator is stored.
    :return: A decorator which records the features generator it is applied to in the registry
             and returns that same features generator unchanged.
    """
    def register(generator: FeaturesGenerator) -> FeaturesGenerator:
        # Plain dictionary assignment: registering a name again replaces the earlier entry.
        FEATURES_GENERATOR_REGISTRY[features_generator_name] = generator
        return generator

    return register
def get_features_generator(features_generator_name: str) -> FeaturesGenerator:
    """
    Looks up a features generator in the registry by the name it was registered under.

    :param features_generator_name: The name of the features generator.
    :return: The features generator registered under that name.
    :raises ValueError: If no features generator is registered under that name.
    """
    if features_generator_name in FEATURES_GENERATOR_REGISTRY:
        return FEATURES_GENERATOR_REGISTRY[features_generator_name]

    raise ValueError(f'Features generator "{features_generator_name}" could not be found. '
                     f'If this generator relies on rdkit features, you may need to install descriptastorus.')
def get_available_features_generators() -> List[str]:
    """Lists the names under which features generators are currently registered."""
    # Iterating a dictionary yields its keys, i.e. the registered names.
    return list(FEATURES_GENERATOR_REGISTRY)
# Default radius and bit count used as parameter defaults by the Morgan fingerprint generators.
MORGAN_RADIUS = 2
MORGAN_NUM_BITS = 2048
@register_features_generator('morgan')
def morgan_binary_features_generator(mol: Molecule,
                                     radius: int = MORGAN_RADIUS,
                                     num_bits: int = MORGAN_NUM_BITS) -> np.ndarray:
    """
    Generates a binary Morgan fingerprint for a molecule.

    :param mol: A molecule (i.e., either a SMILES or an RDKit molecule).
    :param radius: Morgan fingerprint radius.
    :param num_bits: Number of bits in Morgan fingerprint.
    :return: A 1D numpy array containing the binary Morgan fingerprint.
    """
    # isinstance (rather than an exact type comparison) also recognizes str subclasses such as
    # numpy.str_, so a SMILES taken from a numpy array is parsed instead of being forwarded as text.
    if isinstance(mol, str):
        mol = Chem.MolFromSmiles(mol)

    features_vec = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=num_bits)

    # Placeholder array that ConvertToNumpyArray fills in place with the fingerprint values.
    features = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(features_vec, features)

    return features
@register_features_generator('morgan_count')
def morgan_counts_features_generator(mol: Molecule,
                                     radius: int = MORGAN_RADIUS,
                                     num_bits: int = MORGAN_NUM_BITS) -> np.ndarray:
    """
    Generates a counts-based Morgan fingerprint for a molecule.

    :param mol: A molecule (i.e., either a SMILES or an RDKit molecule).
    :param radius: Morgan fingerprint radius.
    :param num_bits: Number of bits in Morgan fingerprint.
    :return: A 1D numpy array containing the counts-based Morgan fingerprint.
    """
    # isinstance (rather than an exact type comparison) also recognizes str subclasses such as
    # numpy.str_, so a SMILES taken from a numpy array is parsed instead of being forwarded as text.
    if isinstance(mol, str):
        mol = Chem.MolFromSmiles(mol)

    features_vec = AllChem.GetHashedMorganFingerprint(mol, radius, nBits=num_bits)

    # Placeholder array that ConvertToNumpyArray fills in place with the fingerprint values.
    features = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(features_vec, features)

    return features
# The RDKit 2D generators depend on the optional descriptastorus package. When it can be imported
# the real generators are registered; otherwise mock generators are registered under the same
# names so that the failure is reported only when one of them is actually called.
try:
    from descriptastorus.descriptors import rdDescriptors, rdNormalizedDescriptors

    @register_features_generator('rdkit_2d')
    def rdkit_2d_features_generator(mol: Molecule) -> np.ndarray:
        """
        Generates RDKit 2D features for a molecule.

        :param mol: A molecule (i.e., either a SMILES or an RDKit molecule).
        :return: A 1D numpy array containing the RDKit 2D features.
        """
        # isinstance also recognizes str subclasses such as numpy.str_, so a SMILES taken from a
        # numpy array is used as-is instead of being handed to MolToSmiles as if it were a molecule.
        smiles = mol if isinstance(mol, str) else Chem.MolToSmiles(mol, isomericSmiles=True)
        generator = rdDescriptors.RDKit2D()
        # The first element of the result is dropped.
        # NOTE(review): presumably a status flag rather than a feature - confirm against descriptastorus.
        features = generator.process(smiles)[1:]

        return features

    @register_features_generator('rdkit_2d_normalized')
    def rdkit_2d_normalized_features_generator(mol: Molecule) -> np.ndarray:
        """
        Generates RDKit 2D normalized features for a molecule.

        :param mol: A molecule (i.e., either a SMILES or an RDKit molecule).
        :return: A 1D numpy array containing the RDKit 2D normalized features.
        """
        # isinstance also recognizes str subclasses such as numpy.str_, so a SMILES taken from a
        # numpy array is used as-is instead of being handed to MolToSmiles as if it were a molecule.
        smiles = mol if isinstance(mol, str) else Chem.MolToSmiles(mol, isomericSmiles=True)
        generator = rdNormalizedDescriptors.RDKit2DNormalized()
        # The first element of the result is dropped.
        # NOTE(review): presumably a status flag rather than a feature - confirm against descriptastorus.
        features = generator.process(smiles)[1:]

        return features
except ImportError:
    @register_features_generator('rdkit_2d')
    def rdkit_2d_features_generator(mol: Molecule) -> np.ndarray:
        """Mock implementation raising an ImportError if descriptastorus cannot be imported."""
        raise ImportError('Failed to import descriptastorus. Please install descriptastorus '
                          '(https://github.com/bp-kelley/descriptastorus) to use RDKit 2D features.')

    @register_features_generator('rdkit_2d_normalized')
    def rdkit_2d_normalized_features_generator(mol: Molecule) -> np.ndarray:
        """Mock implementation raising an ImportError if descriptastorus cannot be imported."""
        raise ImportError('Failed to import descriptastorus. Please install descriptastorus '
                          '(https://github.com/bp-kelley/descriptastorus) to use RDKit 2D normalized features.')
except AttributeError as e:
    # Re-raise with the installed versions included so the incompatibility named in the message
    # can be diagnosed by the user.
    from scipy import __version__ as scipy_version
    from descriptastorus import __version__ as descriptastorus_version
    raise AttributeError('`descriptastorus==2.6.1` and `==2.5.1` are incompatible with `scipy<1.9`. Please try changing '
                         'descriptastorus versions or update scipy to avoid issues with `scipy.stats.gibrat`. '
                         f'Your versions are descriptastorus: {descriptastorus_version} and scipy {scipy_version}') from e


"""
Custom features generator template.

Note: The name you use to register the features generator is the name
you will specify on the command line when using the --features_generator <name> flag.
Ex. python train.py ... --features_generator custom ...

@register_features_generator('custom')
def custom_features_generator(mol: Molecule) -> np.ndarray:
    # If you want to use the SMILES string
    smiles = mol if isinstance(mol, str) else Chem.MolToSmiles(mol, isomericSmiles=True)

    # If you want to use the RDKit molecule
    mol = Chem.MolFromSmiles(mol) if isinstance(mol, str) else mol

    # Replace this with code which generates features from the molecule
    features = np.array([0, 0, 1])

    return features
"""