Source code for chemprop.data.scaler
from typing import Any, List, Optional
import numpy as np
class StandardScaler:
    """Column-wise standardizer for tabular feature data.

    Fitting records the NaN-ignoring mean and standard deviation of every
    column (axis 0). Transforming subtracts those means and divides by those
    standard deviations; the inverse transform undoes that. Any NaN left in a
    result is swapped for :code:`replace_nan_token`.
    """

    def __init__(self, means: np.ndarray = None, stds: np.ndarray = None, replace_nan_token: Any = None):
        """
        :param means: Precomputed per-column means, or None until :meth:`fit` is called.
        :param stds: Precomputed per-column standard deviations, or None until :meth:`fit` is called.
        :param replace_nan_token: Value substituted for NaN entries in transformed output.
        """
        self.means = means
        self.stds = stds
        self.replace_nan_token = replace_nan_token

    def _replace_nan(self, values: np.ndarray) -> np.ndarray:
        """Return :code:`values` with every NaN entry swapped for :code:`self.replace_nan_token`."""
        # NOTE: with a non-numeric token (e.g. the default None) np.where
        # yields an object-dtype array.
        return np.where(np.isnan(values), self.replace_nan_token, values)

    def fit(self, X: List[List[Optional[float]]]) -> 'StandardScaler':
        """
        Compute and store the per-column mean and standard deviation of :code:`X`.

        :param X: A list of lists of floats (or None); None entries become NaN and are ignored.
        :return: This :class:`StandardScaler`, now fitted.
        """
        data = np.array(X).astype(float)
        column_means = np.nanmean(data, axis=0)
        column_stds = np.nanstd(data, axis=0)

        # A column with no valid values has a NaN mean; centre it at zero.
        self.means = np.where(np.isnan(column_means), 0.0, column_means)

        # A NaN or zero spread would break the division in transform, so
        # those columns fall back to unit scale.
        unusable = np.isnan(column_stds) | (column_stds == 0)
        self.stds = np.where(unusable, 1.0, column_stds)
        return self

    def transform(self, X: List[List[Optional[float]]]) -> np.ndarray:
        """
        Standardize :code:`X` using the stored means and standard deviations.

        :param X: A list of lists of floats (or None).
        :return: :code:`(X - means) / stds` with NaNs replaced by :code:`self.replace_nan_token`.
        """
        data = np.array(X).astype(float)
        scaled = (data - self.means) / self.stds
        return self._replace_nan(scaled)

    def inverse_transform(self, X: List[List[Optional[float]]]) -> np.ndarray:
        """
        Map standardized values back to the original scale.

        :param X: A list of lists of floats.
        :return: :code:`X * stds + means` with NaNs replaced by :code:`self.replace_nan_token`.
        """
        data = np.array(X).astype(float)
        restored = data * self.stds + self.means
        return self._replace_nan(restored)
class AtomBondScaler(StandardScaler):
    """A :class:`AtomBondScaler` normalizes a collection of per-target datasets.

    The input is indexed per target: entries :code:`0 .. n_atom_targets - 1` are
    treated as atom targets and the following :code:`n_bond_targets` entries as
    bond targets. Each entry gets its own mean and standard deviation, learned
    across that entry's 0th axis by a separate :class:`StandardScaler`.
    """

    def __init__(self, means: np.ndarray = None, stds: np.ndarray = None, replace_nan_token: Any = None,
                 n_atom_targets=None, n_bond_targets=None):
        """
        :param means: Optional precomputed means, indexed by target.
        :param stds: Optional precomputed standard deviations, indexed by target.
        :param replace_nan_token: A token to use to replace NaN entries in the output.
        :param n_atom_targets: Number of atom targets (the leading entries of the data).
        :param n_bond_targets: Number of bond targets (the entries after the atom targets).
        """
        super().__init__(means, stds, replace_nan_token)
        self.n_atom_targets = n_atom_targets
        self.n_bond_targets = n_bond_targets

    def _num_targets(self) -> int:
        """Return the total number of per-target entries (atom targets followed by bond targets)."""
        # Both counts must have been supplied; the addition (like the original
        # range() calls) raises TypeError when either is still None.
        return self.n_atom_targets + self.n_bond_targets

    def fit(self, X: List[List[Optional[float]]]) -> 'AtomBondScaler':
        """
        Learns means and standard deviations separately for every target in :code:`X`.

        :param X: A sequence indexed by target; each :code:`X[i]` is handed to
                  :meth:`StandardScaler.fit` (a list of lists of floats or None).
        :return: The fitted :class:`AtomBondScaler` (self).
        """
        # Atom and bond targets occupy one contiguous index range, so a single
        # pass covers both groups.
        scalers = [StandardScaler().fit(X[i]) for i in range(self._num_targets())]
        self.means = np.array([s.means for s in scalers])
        self.stds = np.array([s.stds for s in scalers])
        return self

    def transform(self, X: List[List[Optional[float]]]) -> List[list]:
        """
        Transforms each target's data by subtracting its means and dividing by its standard deviations.

        :param X: A sequence indexed by target; each :code:`X[i]` holds floats (or None).
        :return: One (nested) Python list per target, with NaNs replaced by :code:`self.replace_nan_token`.
        """
        transformed_results = []
        for i in range(self._num_targets()):
            Xi = np.array(X[i]).astype(float)
            transformed_with_nan = (Xi - self.means[i]) / self.stds[i]
            transformed_with_none = np.where(np.isnan(transformed_with_nan), self.replace_nan_token, transformed_with_nan)
            # Converted with tolist(): the result is plain Python lists, not ndarrays.
            transformed_results.append(transformed_with_none.tolist())
        return transformed_results

    def inverse_transform(self, X: List[List[Optional[float]]]) -> List[list]:
        """
        Performs the inverse transformation per target by multiplying by its standard deviations and adding its means.

        :param X: A sequence indexed by target; each :code:`X[i]` holds floats.
        :return: One (nested) Python list per target, with NaNs replaced by :code:`self.replace_nan_token`.
        """
        transformed_results = []
        for i in range(self._num_targets()):
            Xi = np.array(X[i]).astype(float)
            transformed_with_nan = Xi * self.stds[i] + self.means[i]
            transformed_with_none = np.where(np.isnan(transformed_with_nan), self.replace_nan_token, transformed_with_nan)
            # Converted with tolist(): the result is plain Python lists, not ndarrays.
            transformed_results.append(transformed_with_none.tolist())
        return transformed_results