Source code for torch_kmeans.clustering.kmeans

#
from typing import Any, Optional, Tuple, Union
from warnings import warn

import torch
import torch.nn as nn
from torch import LongTensor, Tensor

from ..utils.distances import (
    BaseDistance,
    CosineSimilarity,
    DotProductSimilarity,
    LpDistance,
)
from ..utils.utils import ClusterResult, group_by_label_mean

# import numpy as np
# from sklearn.cluster._kmeans import _kmeans_plusplus, row_norms

__all__ = ["KMeans"]


#
class KMeans(nn.Module):
    """
    Implements k-means clustering in terms of PyTorch tensor operations
    which can be run on GPU. Supports batches of instances for use in
    batched training (e.g. for neural networks).

    Partly based on ideas from:
        - https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
        - https://github.com/overshiki/kmeans_pytorch

    Args:
        init_method: Method to initialize cluster centers:
                        ['rnd', 'k-means++'] (default: 'rnd')
        num_init: Number of different initial starting configurations,
                    i.e. different sets of initial centers (default: 8).
        max_iter: Maximum number of iterations (default: 100).
        distance: batched distance evaluator (default: LpDistance).
        p_norm: norm for lp distance (default: 2).
        tol: Relative tolerance with regard to Frobenius norm of the
                difference in the cluster centers of two consecutive
                iterations to declare convergence. (default: 1e-4)
        normalize: String id of method to use to normalize input.
                    One of ['mean', 'minmax', 'unit'].
                    None to disable normalization. (default: None).
        n_clusters: Default number of clusters to use if not provided in call
                (optional, default: 8).
        verbose: Verbosity flag to print additional info (default: True).
        seed: Seed to fix random state for randomized center inits
                (default: 123).
        **kwargs: additional key word arguments for the distance function.
    """

    INIT_METHODS = ["rnd", "k-means++"]
    NORM_METHODS = ["mean", "minmax", "unit"]

    def __init__(
        self,
        init_method: str = "rnd",
        num_init: int = 8,
        max_iter: int = 100,
        distance: BaseDistance = LpDistance,
        p_norm: int = 2,
        tol: float = 1e-4,
        normalize: Optional[Union[str, bool]] = None,
        n_clusters: Optional[int] = 8,
        verbose: bool = True,
        seed: Optional[int] = 123,
        **kwargs,
    ):
        super(KMeans, self).__init__()
        self.init_method = init_method.lower()
        self.num_init = num_init
        self.max_iter = max_iter
        self.p_norm = p_norm
        self.tol = tol
        self.normalize = normalize
        self.n_clusters = n_clusters
        self.verbose = verbose
        self.seed = seed
        self._check_params()

        self.distance = distance(p=self.p_norm, **kwargs)
        self.eps = None
        self._k_max = None
        self._result = None

    @property
    def is_fitted(self) -> bool:
        """True if model was already fitted."""
        return self._result is not None

    @property
    def num_clusters(self) -> Union[int, Tensor, Any]:
        """
        Number of clusters in fitted model.
        Returns a tensor with possibly different
        numbers of clusters per instance for whole batch.
        """
        if not self.is_fitted:
            return None
        return self._result.k

    def _check_params(self):
        if self.init_method not in self.INIT_METHODS:
            raise ValueError(
                f"unknown <init_method>: {self.init_method}. "
                f"Please choose one of {self.INIT_METHODS}"
            )
        if self.num_init <= 0:
            raise ValueError(f"num_init should be > 0, but got {self.num_init}.")
        if self.max_iter <= 0:
            raise ValueError(f"max_iter should be > 0, but got {self.max_iter}.")
        if self.p_norm <= 0:
            raise ValueError(f"p_norm should be > 0, but got {self.p_norm}.")
        if self.tol < 0 or self.tol > 1:
            raise ValueError(f"tol should be in [0, 1], but got {self.tol}.")
        if isinstance(self.normalize, bool):
            if self.normalize:
                self.normalize = "mean"
            else:
                self.normalize = None
        if self.normalize is not None and self.normalize not in self.NORM_METHODS:
            raise ValueError(
                f"unknown <normalize> method: {self.normalize}. "
                f"Please choose one of {self.NORM_METHODS}"
            )
        if self.n_clusters is not None and self.n_clusters < 2:
            raise ValueError(f"n_clusters should be > 1, but got {self.n_clusters}.")

    def _check_x(self, x) -> Tensor:
        """Check and (re-)format input samples x."""
        if not isinstance(x, Tensor):
            raise TypeError(f"x has to be a torch.Tensor but got {type(x)}.")
        shp = x.shape
        if len(shp) < 3:
            raise ValueError(
                f"input <x> should be at least of shape (BS, N, D) "
                f"with batch size BS, number of points N "
                f"and number of dimensions D but got {shp}."
            )
        elif len(shp) > 3:
            x = x.squeeze()
            x = self._check_x(x)
        self.eps = torch.finfo(x.dtype).eps
        return x

    def _check_k(
        self, k, dims: Tuple, device: torch.device = torch.device("cpu")
    ) -> LongTensor:
        """Check and (re-)format number of clusters k."""
        bs, n, d = dims
        if not isinstance(k, Tensor):
            if k is None:
                # use specified default number of clusters
                if self.n_clusters is None:
                    raise ValueError(
                        "Did not provide number of clusters k on call and "
                        "did not specify default 'n_clusters' at initialization."
                    )
                k = self.n_clusters
            if isinstance(k, int):
                # convert to tensor
                k = torch.empty(bs, dtype=torch.long).fill_(k)
            else:
                raise TypeError(
                    f"k has to be int, torch.Tensor or None "
                    f"but got {type(k)}."
                )
        if len(k.shape) > 1:
            k = k.squeeze()
            assert len(k.shape) == 1
        if k.shape[0] == 1:
            k = k.repeat(bs)
        if (k >= n).any():
            raise ValueError(
                f"Specified 'k' must be smaller than "
                f"number of samples n={n}, but got: {k}."
            )
        if (k <= 1).any():
            raise ValueError("Clustering for k=1 is ambiguous.")
        self._k_max = int(k.max())
        return k.to(dtype=torch.long, device=device)

    def _check_centers(
        self,
        centers,
        dims: Tuple,
        dtype: torch.dtype = torch.float32,
        device: torch.device = torch.device("cpu"),
    ) -> Tensor:
        if not isinstance(centers, Tensor):
            raise TypeError(
                f"centers has to be a torch.Tensor "
                f"but got {type(centers)}."
            )
        bs, n, d = dims
        if len(centers.shape) == 3:
            if (
                centers.size(0) != bs
                or centers.size(1) != self._k_max
                or centers.size(2) != d
            ):
                raise ValueError(
                    f"centers needs to be of shape "
                    f"({bs}, {self._k_max}, {d}), "
                    f"but got {tuple(centers.shape)}."
                )
            if self.num_init > 1:
                warn(
                    f"Specified num_init={self.num_init} > 1 but provided "
                    f"only 1 center configuration per instance. "
                    f"Using same center configuration for all {self.num_init} runs."
                )
                # expand to num_init size
                centers = centers[:, None, :, :].expand(
                    centers.size(0), self.num_init, centers.size(1), centers.size(2)
                )
            else:
                centers = centers.unsqueeze(1)
        elif len(centers.shape) == 4:
            if (
                centers.size(0) != bs
                or centers.size(1) != self.num_init
                or centers.size(2) != self._k_max
                or centers.size(3) != d
            ):
                raise ValueError(
                    f"centers needs to be of shape "
                    f"({bs}, {self.num_init}, {self._k_max}, {d}), "
                    f"but got {tuple(centers.shape)}."
                )
        else:
            raise ValueError(
                f"centers have unsupported shape of "
                f"{tuple(centers.shape)} "
                f"instead of "
                f"({bs}, {self.num_init}, {self._k_max}, {d})."
            )
        return centers.contiguous().to(dtype=dtype, device=device)

    def forward(
        self,
        x: Tensor,
        k: Optional[Union[LongTensor, Tensor, int]] = None,
        centers: Optional[Tensor] = None,
        **kwargs,
    ) -> ClusterResult:
        """torch.nn-like forward pass.

        Args:
            x: input features/coordinates (BS, N, D)
            k: optional batch of (possibly different) numbers of clusters
                per instance (BS, )
            centers: optional batch of initial centers to use (BS, K, D)
            **kwargs: additional kwargs for initialization or cluster procedure

        Returns:
            ClusterResult tuple
        """
        x = self._check_x(x)
        x_ = x
        k = self._check_k(k, dims=x.shape, device=x.device)
        # normalize input
        if self.normalize is not None:
            x = self._normalize(x, self.normalize, self.eps)
        # init centers
        if centers is None:
            centers = self._center_init(x, k, **kwargs)
        centers = self._check_centers(
            centers, dims=x.shape, dtype=x.dtype, device=x.device
        )
        labels, new_centers, inertia, soft_assign = self._cluster(
            x, centers, k, **kwargs
        )
        return ClusterResult(
            labels=labels,  # type: ignore
            centers=new_centers,
            inertia=inertia,
            x_org=x_,
            x_norm=x,
            k=k,
            soft_assignment=soft_assign,
        )
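
    # Note on the pipeline above (added commentary): forward() optionally
    # normalizes x, draws num_init center configurations, runs Lloyd
    # iterations for all configurations in parallel via _cluster, and
    # keeps the restart with the lowest inertia per batch instance.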

    def fit(
        self,
        x: Tensor,
        k: Optional[Union[LongTensor, Tensor, int]] = None,
        centers: Optional[Tensor] = None,
        **kwargs,
    ) -> nn.Module:
        """Compute cluster centers and predict cluster index for each sample.

        Args:
            x: input features/coordinates (BS, N, D)
            k: optional batch of (possibly different) numbers of clusters
                per instance (BS, )
            centers: optional batch of initial centers to use (BS, K, D)
            **kwargs: additional kwargs for initialization or cluster procedure

        Returns:
            KMeans model
        """
        self._result = self(x, k=k, centers=centers, **kwargs)
        return self

    def predict(self, x: Tensor, **kwargs) -> LongTensor:
        """Predict the closest cluster each sample in x belongs to.

        Args:
            x: input features/coordinates (BS, N, D)
            **kwargs: additional kwargs for assignment procedure

        Returns:
            batch tensor of cluster labels for each sample (BS, N)
        """
        assert self.is_fitted
        x = self._check_x(x)
        return self._assign(
            x, centers=self._result.centers[:, None, :, :], **kwargs
        ).squeeze(1)

    def fit_predict(
        self,
        x: Tensor,
        k: Optional[Union[LongTensor, Tensor, int]] = None,
        centers: Optional[Tensor] = None,
        **kwargs,
    ) -> LongTensor:
        """Compute cluster centers and predict cluster index for each sample.

        Args:
            x: input features/coordinates (BS, N, D)
            k: optional batch of (possibly different) numbers of clusters
                per instance (BS, )
            centers: optional batch of initial centers to use (BS, K, D)
            **kwargs: additional kwargs for initialization or cluster procedure

        Returns:
            batch tensor of cluster labels for each sample (BS, N)
        """
        return self(x, k=k, centers=centers, **kwargs).labels
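
    # The three sklearn-style entry points above all route through the
    # same forward pass; a hedged equivalence sketch (assumed variable
    # names, shapes as in the docstrings):
    #
    #   model = KMeans(n_clusters=8)
    #   res = model(x)                     # ClusterResult
    #   model.fit(x)                       # runs the same forward pass
    #   labels = model.fit_predict(x)      # == res.labels, shape (BS, N)
    #   labels = model.fit(x).predict(x)   # assignment from fitted centers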

    @torch.no_grad()
    def _center_init(self, x: Tensor, k: LongTensor, **kwargs) -> Tensor:
        """Wrapper to apply different methods for
        initialization of initial centers (centroids)."""
        if self.init_method == "rnd":
            return self._init_rnd(x, k)
        elif self.init_method == "k-means++":
            return self._init_plus(x, k)
        else:
            raise ValueError(f"unknown initialization method: {self.init_method}.")

    @staticmethod
    def _normalize(x: Tensor, normalize: str, eps: float = 1e-8):
        """Normalize input samples x according to specified method:

        - mean: subtract sample mean
        - minmax: min-max normalization, subtracting sample min
            and dividing by sample max
        - unit: normalize x to lie on D-dimensional unit sphere
        """
        if normalize == "mean":
            x -= x.mean(dim=1)[:, None, :]
        elif normalize == "minmax":
            x -= x.min(-1, keepdims=True).values  # type: ignore
            x /= x.max(-1, keepdims=True).values  # type: ignore
        elif normalize == "unit":
            # normalize x to unit sphere
            z_msk = x == 0
            x = x.clone()
            x[z_msk] = eps
            x = torch.diag_embed(1.0 / (torch.norm(x, p=2, dim=-1))) @ x
        else:
            raise ValueError(f"unknown normalization type {normalize}.")
        return x

    def _init_rnd(self, x: Tensor, k: LongTensor) -> Tensor:
        """Choose k random nodes as initial centers.

        Args:
            x: (BS, N, D)
            k: (BS, )

        Returns:
            centers: (BS, num_init, k, D)
        """
        bs, n, d = x.size()
        k_max = torch.max(k).cpu().item()
        if self.seed is not None:
            # make random init reproducible independent of current iteration,
            # which otherwise would step and change the torch generator state
            gen = torch.Generator(device=x.device)
            gen.manual_seed(self.seed)
        else:
            gen = None
        # sample from uniform in batch and for num_init runs
        rnd_idx = torch.multinomial(
            torch.empty((bs * self.num_init, n), device=x.device, dtype=x.dtype).fill_(
                1 / n
            ),
            num_samples=k_max,
            replacement=False,
            generator=gen,
        )
        return x.gather(
            index=rnd_idx.view(bs, -1)[:, :, None].expand(bs, -1, d), dim=1
        ).view(bs, self.num_init, k_max, d)

    def _init_skl_plus(self, x: Tensor, k: LongTensor) -> Tensor:
        """Choose initial centers via the scikit-learn k-means++ implementation:
        https://github.com/scikit-learn/scikit-learn/blob/2beed55847ee70d363bdbfe14ee4401438fba057/sklearn/cluster/_kmeans.py#L50

        Args:
            x: (BS, N, D)
            k: (BS, )

        Returns:
            centers: (BS, num_init, k, D)
        """
        raise NotImplementedError  # would require sklearn as additional dependency
        # bs, n, d = x.size()
        # k_max = torch.max(k).cpu().item()
        # rs = np.random.RandomState(self.seed if self.seed is not None else 1)
        # device = x.device
        # x = x.cpu().numpy()
        # k = k.cpu().numpy()
        # centers = []
        # for smp, nc in zip(x, k):
        #     center_inits = []
        #     x_squared_norms = row_norms(smp, squared=True)
        #     for i in range(self.num_init):
        #         c = np.zeros((k_max, d))
        #         c_init, _ = _kmeans_plusplus(
        #             smp, nc, random_state=rs, x_squared_norms=x_squared_norms
        #         )
        #         c[:nc] = c_init
        #         center_inits.append(c)
        #     centers.append(torch.from_numpy(np.stack(center_inits)))
        #
        # return torch.stack(centers).to(device)

    def _init_plus(self, x: Tensor, k: LongTensor) -> Tensor:
        """Choose initial centers via the k-means++ method.

        Args:
            x: (BS, N, D)
            k: (BS, )

        Returns:
            centers: (BS, num_init, k, D)
        """
        bs, n, d = x.size()
        k_max = torch.max(k).cpu().item()
        if self.seed is not None:
            # make random init reproducible independent of current iteration,
            # which otherwise would step and change the torch generator state
            gen = torch.Generator(device=x.device)
            gen.manual_seed(self.seed)
        else:
            gen = None
        bsm = bs * self.num_init
        bsm_idx = torch.arange(bsm, device=x.device)
        centers = torch.empty((bsm, k_max, d), dtype=x.dtype, device=x.device)
        # select first center randomly
        assert n > self.num_init, (
            f"Number of samples must be larger than <num_init> "
            f"but got {n} <= {self.num_init}"
        )
        idx = torch.multinomial(
            torch.empty((bs, n), device=x.device, dtype=x.dtype).fill_(1 / n),
            num_samples=self.num_init,
            replacement=False,
            generator=gen,
        )
        centers[:, 0] = x.gather(index=idx[:, :, None].expand(-1, -1, d), dim=1).view(
            -1, d
        )
        msk = torch.zeros((bsm, n, k_max), dtype=torch.bool, device=x.device)
        msk[bsm_idx, idx.view(-1), 0] = True
        # select the remaining k-1 centers
        for nc in range(1, k_max):
            dist = self._pairwise_distance(
                x, centers[:, :nc].view(bs, self.num_init, -1, d)
            ).view(bsm, n, nc)
            pot = dist**2
            pot[msk[:, :, :nc]] = 0
            pot = pot.min(dim=-1).values
            idx = torch.multinomial(pot, 1, generator=gen).view(bs, self.num_init)
            centers[:, nc] = x.gather(
                index=idx[:, :, None].expand(-1, -1, d), dim=1
            ).view(-1, d)
            msk[bsm_idx, idx.view(-1), nc] = True
        return centers.view(bs, self.num_init, k_max, d)

    @torch.no_grad()
    def _cluster(
        self, x: Tensor, centers: Tensor, k: LongTensor, **kwargs
    ) -> Tuple[Tensor, Tensor, Tensor, Union[Tensor, Any]]:
        """Run Lloyd's k-means algorithm.

        Args:
            x: (BS, N, D)
            centers: (BS, num_init, k_max, D)
            k: (BS, )
        """
        if not isinstance(self.distance, LpDistance):
            warn("standard k-means should use a non-inverted distance measure.")
        bs, n, d = x.size()
        # mask centers for which k < k_max with inf to get correct assignment
        k_max = torch.max(k).cpu().item()
        k_max_range = torch.arange(k_max, device=x.device)[None, :].expand(bs, -1)
        k_mask = k_max_range >= k[:, None]
        k_mask = k_mask[:, None, :].expand(bs, self.num_init, -1)

        for i in range(self.max_iter):
            centers[k_mask] = float("inf")
            old_centers = centers.clone()
            # get cluster assignments
            c_assign = self._assign(x, centers)
            # update cluster centers
            centers = group_by_label_mean(x, c_assign, k_max_range)
            if self.tol is not None:
                # calculate center shift
                shift = self._calculate_shift(centers, old_centers, p=self.p_norm)
                if (shift < self.tol).all():
                    if self.verbose:
                        print(
                            f"Full batch converged at iteration "
                            f"{i+1}/{self.max_iter} "
                            f"with center shifts = "
                            f"{shift.view(-1, self.num_init).mean(-1)}."
                        )
                    break

        # select best rnd restart according to inertia
        centers[k_mask] = float("inf")
        c_assign = self._assign(x, centers)
        inertia = self._calculate_inertia(x, centers, c_assign)
        best_init = torch.argmin(inertia, dim=-1)
        b_idx = torch.arange(bs, device=x.device)
        return (
            c_assign[b_idx, best_init],
            centers[b_idx, best_init],
            inertia[b_idx, best_init],
            None,
        )

    def _pairwise_distance(self, x: Tensor, centers: Tensor, **kwargs):
        """Calculate pairwise distances between samples in x and all centers."""
        # expand tensors to calculate pairwise distance over (d) dimensions
        # of each point (n) to each center (k_max)
        # for each random restart (num_init) in each batch instance (bs)
        bs, n, d = x.size()
        bs, num_init, k_max, d = centers.size()
        x = x[:, None, :, None, :].expand(bs, num_init, n, k_max, d).reshape(-1, d)
        centers = (
            centers[:, :, None, :, :].expand(bs, num_init, n, k_max, d).reshape(-1, d)
        )
        return self.distance.pairwise_distance(x, centers, **kwargs).view(
            bs, num_init, n, k_max
        )

    def _assign(self, x: Tensor, centers: Tensor, **kwargs) -> LongTensor:
        """Infer cluster assignment for each sample in x."""
        # dist: (bs, num_init, n, k_max)
        dist = self._pairwise_distance(x, centers)
        if isinstance(self.distance, (CosineSimilarity, DotProductSimilarity)):
            # Similarity is an inverted distance measure,
            # so we need to adapt it in order to calculate priority
            dist = 1 - dist
        # get cluster assignments (center with minimal distance)
        return torch.argmin(dist, dim=-1)  # type: ignore

    @staticmethod
    @torch.jit.script
    def _calculate_shift(centers: Tensor, old_centers: Tensor, p: int = 2) -> Tensor:
        """Calculate center shift w.r.t. centers from last iteration."""
        # calculate euclidean distance while replacing inf with 0 in sum
        d = torch.norm((centers - old_centers), p=p, dim=-1)
        d[d == float("inf")] = 0
        # sum(d, dim=-1)**2 -> use mean to be independent of number of points
        return torch.mean(d, dim=-1)

    @staticmethod
    @torch.jit.script
    def _calculate_inertia(x: Tensor, centers: Tensor, labels: Tensor) -> Tensor:
        """Compute sum of squared distances of samples
        to their closest cluster center."""
        bs, n, d = x.size()
        m = centers.size(1)
        assert m == labels.size(1)
        # select assigned center by label and calculate squared distance
        assigned_centers = centers.gather(
            index=labels[:, :, :, None].expand(
                labels.size(0), labels.size(1), labels.size(2), d
            ),
            dim=2,
        )
        # squared distance to closest center
        d = (
            torch.norm(
                (x[:, None, :, :].expand(bs, m, n, d) - assigned_centers), p=2, dim=-1
            )
            ** 2
        )
        d[d == float("inf")] = 0
        return torch.sum(d, dim=-1)

    def __repr__(self):
        return (
            f"{self.__class__.__name__}("
            f"init: '{self.init_method}', "
            f"num_init: {self.num_init}, "
            f"max_iter: {self.max_iter}, "
            f"distance: {self.distance}, "
            f"tolerance: {self.tol}, "
            f"normalize: {self.normalize}"
            f")"
        )
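

# ---------------------------------------------------------------------------
# Minimal usage sketch (added for illustration; not part of the original
# module). It relies only on the public API defined above, and the shapes
# follow the docstrings: x is (BS, N, D), labels come back as (BS, N).
if __name__ == "__main__":
    torch.manual_seed(0)
    x = torch.randn(2, 100, 2)  # batch of 2 instances, 100 points in 2D
    model = KMeans(n_clusters=4, init_method="k-means++", verbose=False)

    result = model(x)  # forward pass returns a ClusterResult tuple
    print(result.labels.shape)   # torch.Size([2, 100])
    print(result.centers.shape)  # torch.Size([2, 4, 2])

    # sklearn-style alternatives routed through the same forward pass:
    labels = model.fit_predict(x)       # (BS, N)
    labels2 = model.fit(x).predict(x)   # assignment from the fitted centers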