# Source code for megnet.utils.preprocessing
"""
Preprocessing utilities: scalers for transforming target properties
"""
from typing import List
import numpy as np
from monty.json import MSONable
from .typing import StructureOrMolecule, VectorLike
class Scaler(MSONable):
    """
    Common interface for target scalers.

    A scaler exposes ``transform`` and ``inverse_transform``. In addition to
    the target property, both methods accept the number of atoms as their
    second parameter.
    """

    def transform(self, target: np.ndarray, n: int = 1) -> np.ndarray:
        """
        Convert target values into their scaled counterparts.

        Args:
            target (np.ndarray): target numerical value
            n (int): number of atoms

        Returns:
            scaled target

        Raises:
            NotImplementedError: always, in this base class
        """
        raise NotImplementedError

    def inverse_transform(self, transformed_target: np.ndarray, n: int = 1) -> np.ndarray:
        """
        Convert scaled values back into target values.

        Args:
            transformed_target (np.ndarray): transformed target
            n (int): number of atoms

        Returns:
            target

        Raises:
            NotImplementedError: always, in this base class
        """
        raise NotImplementedError
class StandardScaler(Scaler):
    """
    Standard scaler with consideration of extensive/intensive quantity.

    For an intensive quantity, the mean is just the mean of the training
    data, and std is the std of the training data.
    For an extensive quantity, the mean is the mean of target/atom, and
    std is the std of target/atom.

    Methods:
        transform(self, target, n=1): standard-scale the target
        inverse_transform(self, transformed_target, n=1): undo the scaling
        from_training_data(cls, structures, targets, is_intensive=True):
            build a scaler from training structures and targets
    """

    def __init__(self, mean: float = 0.0, std: float = 1.0, is_intensive: bool = True):
        """
        Args:
            mean (float): mean value of target
            std (float): standard deviation of target; a value whose
                magnitude is below machine epsilon is replaced by 1.0
            is_intensive (bool): whether the target is already an intensive
                property
        """
        self.mean = mean
        # transform() divides by std, so a (numerically) zero std is
        # replaced by unit scale.
        if np.abs(std) < np.finfo(float).eps:
            std = 1.0
        self.std = std
        self.is_intensive = is_intensive

    def transform(self, target: np.ndarray, n: int = 1) -> np.ndarray:
        """
        Transform numeric values according to the mean and std, plus a
        factor n.

        Args:
            target (np.ndarray): target numerical value
            n (int): number of atoms; ignored when the scaler is intensive

        Returns:
            scaled target
        """
        if self.is_intensive:
            # Intensive targets are not normalized per atom.
            n = 1
        return (target / n - self.mean) / self.std

    def inverse_transform(self, transformed_target: np.ndarray, n: int = 1) -> np.ndarray:
        """
        Inverse transform of the target.

        Args:
            transformed_target (np.ndarray): transformed target
            n (int): number of atoms; ignored when the scaler is intensive

        Returns:
            original target
        """
        if self.is_intensive:
            # Intensive targets are not normalized per atom.
            n = 1
        return n * (transformed_target * self.std + self.mean)

    @classmethod
    def from_training_data(
        cls, structures: List[StructureOrMolecule], targets: VectorLike, is_intensive: bool = True
    ) -> "StandardScaler":
        """
        Generate a target scaler from a list of input structures/molecules,
        a target value vector and an indicator for intensiveness of the
        property.

        Args:
            structures (list): list of structures/molecules; only used when
                ``is_intensive`` is False, where ``len(structure)`` is the
                per-entry divisor
            targets (list): vector of target properties
            is_intensive (bool): whether the target is intensive

        Returns: new instance

        Raises:
            ValueError: if ``is_intensive`` is False and ``structures`` and
                ``targets`` have different lengths
        """
        if is_intensive:
            new_targets = targets
        else:
            # zip() stops at the shorter input, which would silently compute
            # the statistics from only part of the training data.
            if len(structures) != len(targets):
                raise ValueError(
                    "structures and targets must have the same length, "
                    f"got {len(structures)} and {len(targets)}"
                )
            new_targets = [i / len(j) for i, j in zip(targets, structures)]
        mean = np.mean(new_targets).item()
        std = np.std(new_targets).item()
        return cls(mean, std, is_intensive)

    def __str__(self):
        return f"StandardScaler(mean={self.mean:.3f}, std={self.std:.3f}, is_intensive={self.is_intensive})"

    def __repr__(self):
        return str(self)
class DummyScaler(MSONable):
    """
    Pass-through scaler: values are returned unchanged in both directions
    """

    @staticmethod
    def transform(target: np.ndarray, n: int = 1) -> np.ndarray:
        """
        Return the target as it is

        Args:
            target (np.ndarray): target numerical value
            n (int): number of atoms (not used)

        Returns:
            target
        """
        return target

    @staticmethod
    def inverse_transform(transformed_target: np.ndarray, n: int = 1) -> np.ndarray:
        """
        Return the transformed target as it is

        Args:
            transformed_target (np.ndarray): transformed target
            n (int): number of atoms (not used)

        Returns:
            transformed_target
        """
        return transformed_target

    @classmethod
    def from_training_data(cls, structures: List[StructureOrMolecule], targets: VectorLike, is_intensive: bool = True):
        """
        Build a DummyScaler; none of the arguments are used

        Args:
            structures (list): list of structures/molecules
            targets (list): vector of target properties
            is_intensive (bool): whether the target is intensive

        Returns: DummyScaler
        """
        return cls()