Source code for pymatgen.analysis.diffusion.aimd.clustering

"""
This module implements clustering algorithms to determine centroids, with
adaption for periodic boundary conditions. This can be used, for example, to
determine likely atomic positions from MD trajectories.
"""

from __future__ import annotations

import random
import warnings

import numpy as np

from pymatgen.util.coord import all_distances, pbc_diff

__author__ = "Shyue Ping Ong"
__copyright__ = "Copyright 2013, The Materials Virtual Lab"
__version__ = "0.1"
__maintainer__ = "Shyue Ping Ong"
__email__ = "ongsp@ucsd.edu"
__date__ = "3/18/15"



[docs]
class Kmeans:
    """Simple kmeans clustering."""

    def __init__(self, max_iterations: int = 1000):
        """
        Args:
            max_iterations (int): Maximum number of iterations to run KMeans algo.
        """
        self.max_iterations = max_iterations


[docs]
    def cluster(self, points, k, initial_centroids=None):
        """
        Args:
            points (ndarray): Data points as a mxn ndarray, where m is the
                number of features and n is the number of data points.
            k (int): Number of means.
            initial_centroids (np.array): Initial guess for the centroids. If
                None, a randomized array of points is used.

        Returns:
            centroids, labels, ss: centroids are the final centroids, labels
            provide the index for each point, and ss in the final sum squared
            distances.
        """
        centroids = np.array(random.sample(list(points), k)) if initial_centroids is None else initial_centroids

        # Initialize book keeping vars.
        iterations = 0
        old_centroids = None

        # Run the main k-means algorithm
        while not self.should_stop(old_centroids, centroids, iterations):
            # Save old centroids for convergence test. Book keeping.
            old_centroids = centroids
            iterations += 1
            # Assign labels to each datapoint based on centroids
            labels, ss = self.get_labels(points, centroids)

            # Assign centroids based on datapoint labels
            centroids = self.get_centroids(points, labels, k, centroids)

        labels, ss = self.get_labels(points, centroids)
        # We can get the labels too by calling getLabels(dataSet, centroids)
        return centroids, labels, ss



[docs]
    @staticmethod
    def get_labels(points, centroids):
        """
        For each element in the dataset, chose the closest centroid.
        Make that centroid the element's label.

        Args:
            points: List of points
            centroids: List of centroids
        """
        dists = all_distances(points, centroids)
        min_dists = np.min(dists, axis=1)
        return np.where(dists == min_dists[:, None])[1], np.sum(min_dists**2)



[docs]
    @staticmethod
    def get_centroids(points, labels, k, centroids):
        """
        Each centroid is the geometric mean of the points that
        have that centroid's label. Important: If a centroid is empty (no
        points have that centroid's label) you should randomly re-initialize it.

        Args:
            points: List of points
            labels: List of labels
            k: Number of means
            centroids: List of centroids
        """
        labels = np.array(labels)
        centroids = []
        for i in range(k):
            ind = np.where(labels == i)[0]
            if len(ind) > 0:
                centroids.append(np.average(points[ind, :], axis=0))
            else:
                centroids.append(get_random_centroid(points))
        return np.array(centroids)



[docs]
    def should_stop(self, old_centroids, centroids, iterations):
        """
        Check for stopping conditions.

        Args:
            old_centroids: List of old centroids
            centroids: List of centroids
            iterations: Number of iterations thus far.
        """
        if iterations > self.max_iterations:
            warnings.warn(f"Max iterations {self.max_iterations} reached!")
            return True
        if old_centroids is None:
            return False
        return np.allclose(old_centroids, centroids)





[docs]
class KmeansPBC(Kmeans):
    """
    A version of KMeans that work with PBC. Distance metrics have to change,
    as well as new centroid determination. The points supplied should be
    fractional coordinates.
    """

    def __init__(self, lattice, max_iterations=1000):
        """
        Args:
            lattice: Lattice
            max_iterations: Maximum number of iterations to run KMeans.
        """
        self.lattice = lattice
        self.max_iterations = max_iterations


[docs]
    def get_labels(self, points, centroids):
        """
        For each element in the dataset, chose the closest centroid.
        Make that centroid the element's label.

        Args:
            points: List of points
            centroids: List of centroids
        """
        dists = self.lattice.get_all_distances(points, centroids)
        min_dists = np.min(dists, axis=1)
        return np.where(dists == min_dists[:, None])[1], np.sum(min_dists**2)



[docs]
    def get_centroids(self, points, labels, k, centroids):
        """
        Each centroid is the geometric mean of the points that
        have that centroid's label. Important: If a centroid is empty (no
        points have that centroid's label) you should randomly re-initialize it.

        Args:
            points: List of points
            labels: List of labels
            k: Number of means
            centroids: List of centroids
        """
        m, n = points.shape
        labels = np.array(labels)
        new_centroids = []
        for i in range(k):
            ind = np.where(labels == i)[0]
            if len(ind) > 0:
                c = np.zeros(n)
                for j in ind:
                    dist, image = self.lattice.get_distance_and_image(centroids[i], points[j])
                    c += points[j] + image
                c /= len(ind)
                c = np.mod(c, 1)
            else:
                c = get_random_centroid(points)
                c = np.mod(c, 1)
            new_centroids.append(c)
        return np.array(new_centroids)



[docs]
    def should_stop(self, old_centroids, centroids, iterations):
        """
        Check for stopping conditions.

        Args:
            old_centroids: List of old centroids
            centroids: List of centroids
            iterations: Number of iterations thus far.
        """
        if iterations > self.max_iterations:
            warnings.warn(f"Max iterations {self.max_iterations} reached!")
            return True
        if old_centroids is None:
            return False
        return all(np.allclose(pbc_diff(c1, c2), [0, 0, 0]) for c1, c2 in zip(old_centroids, centroids))





[docs]
def get_random_centroid(points):
    """
    Generate a random centroid based on points.

    Args:
        points: List of points.
    """
    m, n = points.shape
    maxd = np.max(points, axis=0)
    mind = np.min(points, axis=0)
    return np.array([random.uniform(mind[i], maxd[i]) for i in range(n)])




[docs]
def get_random_centroids(points, k):
    """
    Generate k random centroids based on points.

    Args:
        points: List of points.
        k: Number of means.
    """
    centroids = []
    for _i in range(k):
        centroids.append(get_random_centroid(points))
    return np.array(centroids)