Source code for chemprop.features.utils

import csv
import os
import pickle
from typing import List

import numpy as np
import pandas as pd
from rdkit.Chem import PandasTools


def save_features(path: str, features: List[np.ndarray]) -> None:
    """
    Saves features to a compressed :code:`.npz` file with array name "features".

    The features are handed to :func:`numpy.savez_compressed` unchanged. Note that
    NumPy appends the :code:`.npz` suffix to :code:`path` if it is not already there.

    :param path: Path to a :code:`.npz` file where the features will be saved.
    :param features: A list of 1D numpy arrays containing the features for molecules.
    """
    np.savez_compressed(path, features=features)
def load_features(path: str) -> np.ndarray:
    """
    Loads features saved in a variety of formats.

    Supported formats:

    * :code:`.npz` compressed (assumes features are saved with name "features")
    * :code:`.npy`
    * :code:`.csv` / :code:`.txt` (assumes comma-separated features with a header and with one line
      per molecule; completely blank lines are ignored)
    * :code:`.pkl` / :code:`.pckl` / :code:`.pickle` containing an iterable of sparse matrices
      (each element must provide :code:`.todense()`)

    .. note::

       All formats assume that the SMILES loaded elsewhere in the code are in the same order
       as the features loaded here.

    :param path: Path to a file containing features.
    :return: A 2D numpy array of size :code:`(num_molecules, features_size)` containing the features.
    :raises ValueError: If the file extension is not supported, or if a :code:`.csv` / :code:`.txt`
                        file is completely empty (not even a header line).
    """
    extension = os.path.splitext(path)[1]

    if extension == '.npz':
        # np.load returns an NpzFile that keeps the archive open until it is closed;
        # the context manager releases the handle once the array has been read.
        with np.load(path) as archive:
            features = archive['features']
    elif extension == '.npy':
        features = np.load(path)
    elif extension in ['.csv', '.txt']:
        # newline='' lets the csv module do its own newline handling.
        with open(path, newline='') as f:
            reader = csv.reader(f)
            # Skip the header; an empty file is reported explicitly instead of
            # leaking a bare StopIteration to the caller.
            if next(reader, None) is None:
                raise ValueError(f'Features file {path} is empty.')
            # csv.reader yields [] for blank lines; dropping them keeps the array rectangular.
            features = np.array([[float(value) for value in row] for row in reader if row])
    elif extension in ['.pkl', '.pckl', '.pickle']:
        # NOTE(security): unpickling can execute arbitrary code; only load trusted files.
        with open(path, 'rb') as f:
            features = np.array([np.squeeze(np.array(feat.todense())) for feat in pickle.load(f)])
    else:
        raise ValueError(f'Features path extension {extension} not supported.')

    return features
def load_valid_atom_or_bond_features(path: str, smiles: List[str]) -> List[np.ndarray]:
    """
    Loads atom or bond features saved in a variety of formats.

    Supported formats:

    * :code:`.npz` descriptors are saved as 2D array for each molecule in the order of that in the data.csv
    * :code:`.pkl` / :code:`.pckl` / :code:`.pickle` containing a pandas dataframe with smiles as index and
      numpy array of descriptors as columns. The dimensionality of the first cell decides the layout:
      1D cells are stacked along axis 1, 2D cells are concatenated along axis 1.
    * :code:`.sdf` containing all mol blocks with descriptors as entries. Only columns whose first
      entry is a comma-separated string are treated as descriptors.

    :param path: Path to file containing atomwise features.
    :param smiles: SMILES strings used to select and order the rows of an :code:`.sdf` file.
                   This argument is not consulted for the other formats.
    :return: A list of 2D array.
    :raises ValueError: If the extension is not supported, if the pickled dataframe holds cells that
                        are neither 1D nor 2D, or if the :code:`.sdf` data contains missing values
                        after being reindexed by :code:`smiles`.
    """
    extension = os.path.splitext(path)[1]

    if extension == '.npz':
        # np.load returns an NpzFile that keeps the archive open until it is closed;
        # read every stored array inside the context manager so the handle is released.
        with np.load(path) as container:
            features = [container[key] for key in container]

    elif extension in ['.pkl', '.pckl', '.pickle']:
        # NOTE(security): unpickling can execute arbitrary code; only load trusted files.
        features_df = pd.read_pickle(path)
        if features_df.iloc[0, 0].ndim == 1:
            features = features_df.apply(lambda x: np.stack(x.tolist(), axis=1), axis=1).tolist()
        elif features_df.iloc[0, 0].ndim == 2:
            features = features_df.apply(lambda x: np.concatenate(x.tolist(), axis=1), axis=1).tolist()
        else:
            raise ValueError(f'Atom/bond descriptors input {path} format not supported')

    elif extension == '.sdf':
        features_df = PandasTools.LoadSDF(path).drop(['ID', 'ROMol'], axis=1).set_index('SMILES')

        # keep only the first entry for each SMILES
        features_df = features_df[~features_df.index.duplicated()]

        # locate atomic descriptors columns
        features_df = features_df.iloc[:, features_df.iloc[0, :].apply(lambda x: isinstance(x, str) and ',' in x).to_list()]
        features_df = features_df.reindex(smiles)
        if features_df.isnull().any().any():
            raise ValueError('Invalid custom atomic descriptors file, Nan found in data')

        def _parse_descriptor(cell: str) -> np.ndarray:
            # Turn one comma-separated cell into a float vector, ignoring embedded line breaks.
            return np.array(cell.replace('\r', '').replace('\n', '').split(',')).astype(float)

        # DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
        # use whichever element-wise method this pandas version provides.
        elementwise = features_df.map if hasattr(pd.DataFrame, 'map') else features_df.applymap
        features_df = elementwise(_parse_descriptor)

        features = features_df.apply(lambda x: np.stack(x.tolist(), axis=1), axis=1).tolist()

    else:
        raise ValueError(f'Extension "{extension}" is not supported.')

    return features