Source code for chemprop.data.scaler

import warnings
from typing import Any, List, Optional

import numpy as np


class StandardScaler:
    """A :class:`StandardScaler` normalizes the features of a dataset.

    When it is fit on a dataset, the :class:`StandardScaler` learns the mean and standard deviation
    across the 0th axis, ignoring NaN entries. When transforming a dataset, the
    :class:`StandardScaler` subtracts the means and divides by the standard deviations.

    The scaler must either be fitted with :meth:`fit` or constructed with precomputed
    :code:`means` and :code:`stds` before :meth:`transform` or :meth:`inverse_transform` is used,
    because both methods use these attributes directly in arithmetic.
    """

    def __init__(self,
                 means: Optional[np.ndarray] = None,
                 stds: Optional[np.ndarray] = None,
                 replace_nan_token: Any = None):
        """
        :param means: An optional 1D numpy array of precomputed means.
        :param stds: An optional 1D numpy array of precomputed standard deviations.
        :param replace_nan_token: A token to use to replace NaN entries in the features.
        """
        self.means = means
        self.stds = stds
        self.replace_nan_token = replace_nan_token

    def _replace_nan(self, values: np.ndarray) -> np.ndarray:
        """Returns a copy of :code:`values` with NaN entries replaced by :code:`self.replace_nan_token`."""
        return np.where(np.isnan(values), self.replace_nan_token, values)

    def fit(self, X: List[List[Optional[float]]]) -> 'StandardScaler':
        """
        Learns means and standard deviations across the 0th axis of the data :code:`X`.

        NaN (or None) entries are ignored. A column that contains no valid value gets mean 0 and
        standard deviation 1, and a column with zero standard deviation gets standard deviation 1,
        so that :meth:`transform` never divides by zero.

        :param X: A list of lists of floats (or None).
        :return: The fitted :class:`StandardScaler` (self).
        """
        X = np.array(X).astype(float)

        # np.nanmean / np.nanstd emit a RuntimeWarning for slices without any valid value.
        # That case is handled explicitly below, so the warning is only noise here.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', category=RuntimeWarning)
            means = np.nanmean(X, axis=0)
            stds = np.nanstd(X, axis=0)

        # All-NaN columns: fall back to the identity transformation (mean 0, std 1).
        means = np.where(np.isnan(means), 0.0, means)
        stds = np.where(np.isnan(stds), 1.0, stds)
        # Constant columns: avoid division by zero in transform.
        stds = np.where(stds == 0, 1.0, stds)

        self.means = means
        self.stds = stds

        return self

    def transform(self, X: List[List[Optional[float]]]) -> np.ndarray:
        """
        Transforms the data by subtracting the means and dividing by the standard deviations.

        :param X: A list of lists of floats (or None).
        :return: The transformed data with NaNs replaced by :code:`self.replace_nan_token`.
        """
        X = np.array(X).astype(float)
        return self._replace_nan((X - self.means) / self.stds)

    def inverse_transform(self, X: List[List[Optional[float]]]) -> np.ndarray:
        """
        Performs the inverse transformation by multiplying by the standard deviations and adding the means.

        :param X: A list of lists of floats.
        :return: The inverse transformed data with NaNs replaced by :code:`self.replace_nan_token`.
        """
        X = np.array(X).astype(float)
        return self._replace_nan(X * self.stds + self.means)
class AtomBondScaler(StandardScaler):
    """A :class:`AtomBondScaler` normalizes the features of a dataset.

    The data is organized as a list with one entry per target: the first :code:`n_atom_targets`
    entries are atom targets and the following :code:`n_bond_targets` entries are bond targets.
    When it is fit on a dataset, the :class:`AtomBondScaler` learns, for every target separately,
    the mean and standard deviation across the 0th axis. When transforming a dataset, the
    :class:`AtomBondScaler` subtracts the means and divides by the standard deviations of the
    corresponding target.
    """

    def __init__(self,
                 means: Optional[np.ndarray] = None,
                 stds: Optional[np.ndarray] = None,
                 replace_nan_token: Any = None,
                 n_atom_targets: Optional[int] = None,
                 n_bond_targets: Optional[int] = None):
        """
        :param means: An optional numpy array of precomputed means, indexed by target.
        :param stds: An optional numpy array of precomputed standard deviations, indexed by target.
        :param replace_nan_token: A token to use to replace NaN entries in the features.
        :param n_atom_targets: The number of atom targets. None is treated as zero.
        :param n_bond_targets: The number of bond targets. None is treated as zero.
        """
        super().__init__(means, stds, replace_nan_token)
        self.n_atom_targets = n_atom_targets
        self.n_bond_targets = n_bond_targets

    def _num_targets(self) -> int:
        """Returns the total number of targets (atom targets followed by bond targets)."""
        # The counts default to None; treat that as "no targets of this kind" instead of
        # failing inside range().
        return (self.n_atom_targets or 0) + (self.n_bond_targets or 0)

    def fit(self, X: List[List[Optional[float]]]) -> 'AtomBondScaler':
        """
        Learns means and standard deviations for every atom and bond target.

        A separate :class:`StandardScaler` is fitted on each :code:`X[i]`; the resulting means and
        standard deviations are stacked so that index :code:`i` belongs to target :code:`i`.

        :param X: A list with one entry per target (atom targets first, then bond targets).
                  Each entry is passed to :meth:`StandardScaler.fit`.
        :return: The fitted :class:`AtomBondScaler` (self).
        """
        # Atom and bond targets occupy contiguous indices, so a single pass covers both.
        scalers = [StandardScaler().fit(X[i]) for i in range(self._num_targets())]

        self.means = np.array([scaler.means for scaler in scalers])
        self.stds = np.array([scaler.stds for scaler in scalers])

        return self

    def transform(self, X: List[List[Optional[float]]]) -> List[List[Any]]:
        """
        Transforms the data by subtracting the means and dividing by the standard deviations.

        :param X: A list with one entry per target (atom targets first, then bond targets),
                  each holding floats (or None).
        :return: A list with one nested Python list per target containing the transformed data
                 with NaNs replaced by :code:`self.replace_nan_token`.
        """
        transformed_results = []
        for i in range(self._num_targets()):
            Xi = np.array(X[i]).astype(float)
            transformed_with_nan = (Xi - self.means[i]) / self.stds[i]
            transformed_with_none = np.where(np.isnan(transformed_with_nan),
                                             self.replace_nan_token,
                                             transformed_with_nan)
            transformed_results.append(transformed_with_none.tolist())

        return transformed_results

    def inverse_transform(self, X: List[List[Optional[float]]]) -> List[List[Any]]:
        """
        Performs the inverse transformation by multiplying by the standard deviations and adding the means.

        :param X: A list with one entry per target (atom targets first, then bond targets),
                  each holding floats.
        :return: A list with one nested Python list per target containing the inverse transformed
                 data with NaNs replaced by :code:`self.replace_nan_token`.
        """
        transformed_results = []
        for i in range(self._num_targets()):
            Xi = np.array(X[i]).astype(float)
            transformed_with_nan = Xi * self.stds[i] + self.means[i]
            transformed_with_none = np.where(np.isnan(transformed_with_nan),
                                             self.replace_nan_token,
                                             transformed_with_nan)
            transformed_results.append(transformed_with_none.tolist())

        return transformed_results