Source code for torch_kmeans.clustering.constr_kmeans

#
from typing import Any, Optional, Tuple, Union
from warnings import warn

# import numpy as np
import torch
from torch import LongTensor, Tensor

from ..utils.distances import BaseDistance, LpDistance
from ..utils.utils import first_nonzero, group_by_label_mean, rm_kwargs
from .kmeans import KMeans

__all__ = ["ConstrainedKMeans"]


class InfeasibilityError(Exception):
    """Raised when no feasible assignment could be found."""


class ConstrainedKMeans(KMeans):
    """
    Implements constrained k-means clustering.
    The priority implementation is based on the method of

    Paper:
        Geetha, S., G. Poonthalir, and P. T. Vanathi.
        "Improved k-means algorithm for capacitated clustering problem."
        INFOCOMP Journal of Computer Science 8.4 (2009)

    Args:
        init_method: Method to initialize cluster centers:
                        ['rnd', 'topk', 'k-means++', 'ckm++']
                        (default: 'rnd')
        num_init: Number of different initial starting configurations,
                    i.e. different sets of initial centers (default: 8).
        max_iter: Maximum number of iterations (default: 100).
        distance: batched distance evaluator (default: LpDistance).
        p_norm: norm for lp distance (default: 2).
        tol: Relative tolerance with regards to Frobenius norm of the
                difference in the cluster centers of two consecutive
                iterations to declare convergence. (default: 1e-4)
        n_clusters: Default number of clusters to use if not provided
                in call (optional, default: 8).
        verbose: Verbosity flag to print additional info (default: True).
        seed: Seed to fix random state for randomized center inits
                (default: 123).
        n_priority_trials_before_fall_back: Number of trials trying to
                assign samples to constrained clusters based on priority
                values before falling back to assigning the node with the
                highest weight to a cluster which can still accommodate it,
                or the dummy cluster otherwise. (default: 5)
        raise_infeasible: if set to False, will only display a warning
                instead of raising an error (default: True)
        **kwargs: additional key word arguments for the distance function.
    """

    INIT_METHODS = ["rnd", "k-means++", "topk", "ckm++"]
    NORM_METHODS = []

    def __init__(
        self,
        init_method: str = "rnd",
        num_init: int = 8,
        max_iter: int = 100,
        distance: BaseDistance = LpDistance,
        p_norm: int = 2,
        tol: float = 1e-4,
        n_clusters: Optional[int] = 8,
        verbose: bool = True,
        seed: Optional[int] = 123,
        n_priority_trials_before_fall_back: int = 5,
        raise_infeasible: bool = True,
        **kwargs,
    ):
        kwargs = rm_kwargs(kwargs, ["normalize"])
        super(ConstrainedKMeans, self).__init__(
            init_method=init_method,
            num_init=num_init,
            max_iter=max_iter,
            distance=distance,
            p_norm=p_norm,
            tol=tol,
            normalize=None,
            n_clusters=n_clusters,
            verbose=verbose,
            seed=seed,
            **kwargs,
        )
        self.n_trials = n_priority_trials_before_fall_back
        self.raise_infeasible = raise_infeasible

        # sanity checks
        if self.n_trials <= 0:
            raise ValueError(f"n_trials should be > 0, but got {self.n_trials}.")
        if self.distance.is_inverted:
            raise ValueError(
                "constrained k-means does not work for inverted distance measures."
            )
        if self.init_method == "topk" and self.num_init > 1:
            raise ValueError(
                "topk init method is deterministic and "
                "does not work with num_init > 1."
            )

    def _check_weights(
        self,
        weights,
        dims: Tuple,
        dtype: torch.dtype = torch.float32,
        device: torch.device = torch.device("cpu"),
    ) -> Tensor:
        if not isinstance(weights, Tensor):
            raise TypeError(
                f"weights has to be a torch.Tensor but got {type(weights)}."
            )
        if not ((0 < weights) & (weights <= 1)).all():
            raise ValueError(
                "weights must be positive and normalized to lie in (0, 1]"
            )
        bs, n, d = dims
        if len(weights.shape) == 2:
            if weights.size(0) != bs or weights.size(1) != n:
                raise ValueError(
                    f"weights needs to be of shape ({bs}, {n}), "
                    f"but got {tuple(weights.shape)}."
                )
        else:
            raise ValueError(
                f"weights have unsupported shape of "
                f"{tuple(weights.shape)} "
                f"instead of ({bs}, {n})."
            )
        return weights.contiguous().to(dtype=dtype, device=device)
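
    # NOTE (illustrative sketch, not part of the original module): one simple
    # way to obtain weights that pass the checks above is to normalize raw
    # positive demands per instance, e.g. for a hypothetical tensor `demands`
    # of shape (BS, N):
    #   weights = demands / demands.sum(dim=-1, keepdim=True)
    # Every entry then lies in (0, 1] and each instance's total demand is 1.0,
    # which never exceeds the total capacity of the clusters (1.0 each).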
    def predict(self, x: Tensor, weights: Tensor, **kwargs) -> LongTensor:
        """Predict the closest cluster each sample in x belongs to.

        Args:
            x: input features/coordinates (BS, N, D)
            weights: normalized weight for each sample (BS, N)
            **kwargs: additional kwargs for assignment procedure

        Returns:
            batch tensor of cluster labels for each sample (BS, N)
        """
        assert self.is_fitted
        x = self._check_x(x)
        weights = self._check_weights(weights, dims=x.shape, dtype=x.dtype)
        k_mask, _ = self._get_kmask(self._result.k, num_init=1)
        return self._assign(
            x,
            centers=self._result.centers[:, None, :, :],
            weights=weights,
            k_mask=k_mask,
            **kwargs,
        )
    @torch.no_grad()
    def _center_init(self, x: Tensor, k: LongTensor, **kwargs):
        if self.init_method == "rnd":
            return self._init_rnd(x, k)
        elif self.init_method == "k-means++":
            return self._init_plus(x, k)
        elif self.init_method == "topk":
            return self._init_topk(x, k, **kwargs)
        elif self.init_method == "ckm++":
            return self._init_ckm_plus(x, k, **kwargs)
        else:
            raise ValueError(f"unknown initialization method: {self.init_method}.")

    def _init_topk(
        self, x: Tensor, k: LongTensor, weights: Tensor, **kwargs
    ) -> Tensor:
        """Choose the k nodes with the largest weights as initial centers.

        Args:
            x: (BS, N, D)
            k: (BS, )
            weights: (BS, N)

        Returns:
            centers: (BS, num_init, k, D)
        """
        if self.num_init > 1:
            raise RuntimeError(
                "topk init method is deterministic and "
                "does not work with num_init > 1."
            )
        bs, n, d = x.size()
        k_max = torch.max(k).cpu().item()
        # select the k_max highest-weight nodes of each instance as centers
        idx = torch.topk(weights, k=k_max, dim=-1).indices
        return x.gather(
            index=idx.view(bs, -1)[:, :, None].expand(bs, -1, d), dim=1
        ).view(bs, self.num_init, k_max, d)

    def _init_ckm_plus(
        self, x: Tensor, k: LongTensor, weights: Tensor, **kwargs
    ) -> Tensor:
        """Choose initial centers via an adapted k-means++ method
        which also considers the weights.

        Args:
            x: (BS, N, D)
            k: (BS, )
            weights: (BS, N)

        Returns:
            centers: (BS, num_init, k, D)
        """
        bs, n, d = x.size()
        k_max = torch.max(k).cpu().item()
        if self.seed is not None:
            # make random init reproducible independent of current iteration,
            # which otherwise would step and change the torch generator state
            gen = torch.Generator(device=x.device)
            gen.manual_seed(self.seed)
        else:
            gen = None
        bsm = bs * self.num_init
        bsm_idx = torch.arange(bsm, device=x.device)
        centers = torch.empty((bsm, k_max, d), dtype=x.dtype, device=x.device)
        weights = weights[:, None, :].expand(bs, self.num_init, n).reshape(bsm, n)
        # TODO: implement selection of n local trials
        #  (select center out of trials which minimizes inertia)
        #  n_local_trials = 2 + int(np.log(k_max))

        # select first center randomly
        assert n > self.num_init, (
            f"Number of samples must be larger than <num_init> "
            f"but got {n} <= {self.num_init}"
        )
        idx = torch.multinomial(
            torch.empty((bs, n), device=x.device, dtype=x.dtype).fill_(1 / n),
            num_samples=self.num_init,
            replacement=False,
            generator=gen,
        )
        centers[:, 0] = x.gather(index=idx[:, :, None].expand(-1, -1, d), dim=1).view(
            -1, d
        )
        msk = torch.zeros((bsm, n, k_max), dtype=torch.bool, device=x.device)
        msk[bsm_idx, idx.view(-1), 0] = True
        # Select the remaining k-1 centers.
        # The intuition behind this approach is that spreading out the
        # k initial cluster centers is a good thing: the first cluster
        # center is chosen uniformly at random from the data points that
        # are being clustered, after which each subsequent cluster center
        # is chosen from the remaining data points with probability
        # proportional to its squared distance from the point's closest
        # existing cluster center, weighted by its weight.
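        # Worked example for the potential computed below (illustrative
        # numbers, not from the paper): a point with weight 0.3 at distance
        # 2.0 from its closest chosen center gets potential
        # 0.3 * 2.0**2 = 1.2, while a point with weight 0.1 at distance 3.0
        # gets 0.1 * 3.0**2 = 0.9; the heavier point is thus more likely to
        # be drawn as the next center by torch.multinomial even though it
        # lies closer, since its weight scales up the potential.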
        for nc in range(1, k_max):
            dist = self._pairwise_distance(
                x, centers[:, :nc].view(bs, self.num_init, -1, d)
            ).view(bsm, n, nc)
            pot = weights[:, :, None].expand(bsm, n, nc) * dist**2
            pot[msk[:, :, :nc]] = 0
            pot = pot.min(dim=-1).values
            idx = torch.multinomial(pot, 1, generator=gen).view(bs, self.num_init)
            centers[:, nc] = x.gather(
                index=idx[:, :, None].expand(-1, -1, d), dim=1
            ).view(-1, d)
            msk[bsm_idx, idx.view(-1), nc] = True

        return centers.view(bs, self.num_init, k_max, d)

    def _get_kmask(self, k: Tensor, num_init: int = 1) -> Tuple[Tensor, Tensor]:
        """Compute mask of number of clusters k for centers of each instance."""
        bs = k.size(0)
        # mask centers for which k < k_max with inf to get correct assignment
        k_max = torch.max(k).cpu().item() + 1  # + 1 for dummy cluster
        k_max_range = torch.arange(k_max, device=k.device)[None, :].expand(bs, -1)
        k_mask = k_max_range >= k[:, None]
        k_mask = k_mask[:, None, :].expand(bs, num_init, -1)
        return k_mask, k_max_range

    @torch.no_grad()
    def _cluster(
        self, x: Tensor, centers: Tensor, k: LongTensor, weights: Tensor, **kwargs
    ) -> Tuple[Tensor, Tensor, Tensor, Union[Tensor, Any]]:
        """
        Execute main algorithm.

        Args:
            x: (BS, N, D)
            centers: (BS, num_init, k_max, D)
            k: (BS, )
            weights: normalized weights w.r.t. constraint of 1.0 (BS, N, )
        """
        weights = self._check_weights(
            weights, dims=x.shape, dtype=x.dtype, device=x.device
        )
        bs, n, d = x.size()
        # add dummy center at origin to assign nodes which cannot be assigned
        # at an intermediate point because they violate all capacities
        centers = torch.cat(
            (centers, torch.zeros((bs, self.num_init, 1, d), device=x.device)), dim=2
        )
        k_mask, k_max_range = self._get_kmask(k, num_init=self.num_init)

        for i in range(self.max_iter):
            centers[k_mask] = float("inf")
            old_centers = centers.clone()
            # get cluster assignments
            c_assign = self._assign(x, centers, weights, k_mask)
            # update cluster centers
            centers = group_by_label_mean(x, c_assign, k_max_range)
            if self.tol is not None:
                # calculate center shift
                shift = self._calculate_shift(centers, old_centers, p=self.p_norm)
                if (shift < self.tol).all():
                    if self.verbose:
                        print(
                            f"Full batch converged at iteration "
                            f"{i + 1}/{self.max_iter} "
                            f"with center shifts: "
                            f"\n{shift.view(-1, self.num_init).mean(-1)}."
                        )
                    break
            if self.verbose and i == self.max_iter - 1:
                print(
                    f"Full batch did not converge after "
                    f"{self.max_iter} maximum iterations."
                    f"\nThere were some center shifts in the last iteration "
                    f"larger than the specified threshold {self.tol}: "
                    f"\n{shift.view(-1, self.num_init).mean(-1)}"
                )

        # select best rnd restart according to inertia
        centers[k_mask] = float("inf")
        c_assign = self._assign(x, centers, weights, k_mask)
        if (c_assign < 0).any():
            # There remain some dummy clusters after convergence.
            # This means the algorithm could not find a
            # feasible assignment for at least one init.
            # Check if there is at least 1 feasible solution for each instance
            feasible = (c_assign >= 0).all(-1).any(-1)
            if not feasible.all():
                inf_idx = (feasible == 0).nonzero().squeeze()
                msg = (
                    f"No feasible assignment found for "
                    f"instance(s) with idx: {inf_idx}.\n"
                    f"(Try to increase the number of clusters "
                    f"or loosen the constraints.)"
                )
                if self.raise_infeasible:
                    raise InfeasibilityError(msg)
                else:
                    warn(msg + "\nInfeasible instances removed from output.")
                    x = x[feasible]
                    centers = centers[feasible]
                    c_assign = c_assign[feasible]
                    bs = feasible.sum()
            # at least one init produced a feasible assignment:
            # replace infeasible inits with feasible dummies to compute inertia
            feasible = (c_assign >= 0).all(-1)
            valid, dummy_row_idx = first_nonzero(feasible)
            assgn = (
                c_assign[torch.arange(bs, device=x.device), dummy_row_idx][:, None, :]
                .expand(c_assign.size())
                .contiguous()
            )
            assgn[feasible] = c_assign[feasible]
            c_assign = assgn

        inertia = self._calculate_inertia(x, centers, c_assign)
        best_init = torch.argmin(inertia, dim=-1)
        b_idx = torch.arange(bs, device=x.device)
        return (
            c_assign[b_idx, best_init],
            centers[b_idx, best_init],
            inertia[b_idx, best_init],
            None,
        )

    def _assign(
        self, x: Tensor, centers: Tensor, weights: Tensor, k_mask: Tensor, **kwargs
    ) -> LongTensor:
        # dist: (bs, num_init, n, k_max)
        dist = self._pairwise_distance(x, centers)
        bs, num_init, n, k_max = dist.size()
        bsm = bs * num_init
        dist = dist.view(bsm, n, k_max)
        # We use a heuristic approach to respect the cluster capacity
        # by defining a priority value w.r.t. the weight
        # (demand, workload, etc.) of each point.
        # The idea is to first assign points with a relatively larger
        # weight to the clusters, and then points with smaller weight,
        # which can be more easily assigned to other clusters.
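        # Worked example for the priority computed below (illustrative
        # numbers only): a point with weight 0.4 at distance 2.0 from a
        # center gets priority 0.4 / 2.0 = 0.2 for that cluster, while a
        # point with weight 0.1 at distance 1.0 only gets 0.1 / 1.0 = 0.1,
        # so the heavier point is assigned first although it is farther away.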
        weights = weights[:, None, :].expand(bs, num_init, n).reshape(bsm, n)
        priority = weights[:, :, None].expand(bsm, n, k_max) / dist
        priority[
            k_mask[:, :, None, :].expand(bs, num_init, n, k_max).reshape(bsm, n, k_max)
        ] = 0
        # loop over all nodes to sequentially assign them to clusters
        # while keeping track of cluster capacity
        assignment = -torch.ones((bsm, n), device=x.device, dtype=torch.long)
        cluster_capacity = torch.ones(k_mask.size(), device=x.device)
        cluster_capacity[k_mask] = 0
        cluster_capacity = cluster_capacity.view(bsm, k_max)
        for i in range(n):
            # n_trials = min(n - i, self.n_trials)  # noqa
            n_trials = self.n_trials
            max_val_k, max_idx_k = priority.max(dim=-1)
            # select n_trials top priority nodes for each instance
            max_idx_n = max_val_k.topk(dim=-1, k=n_trials).indices
            # get corresponding cluster idx and weight
            cl_try = max_idx_k.gather(index=max_idx_n, dim=-1)
            w_try = weights.gather(index=max_idx_n, dim=-1)
            can_be_assigned = cluster_capacity.gather(index=cl_try, dim=-1) >= w_try
            # get first nonzero as idx and a validity mask
            # indicating if any trial could be assigned
            valid_idx, fnz = first_nonzero(can_be_assigned, dim=-1)
            trial_select = fnz[valid_idx]
            cl_select = cl_try[valid_idx, trial_select]
            # do assignment
            n_select = max_idx_n[valid_idx, trial_select]
            assignment[valid_idx, n_select] = cl_select
            # mask priority of assigned nodes
            priority[valid_idx, n_select] = 0
            # adjust cluster capacity
            cur_cap = cluster_capacity[valid_idx, cl_select].clone()
            cluster_capacity[valid_idx, cl_select] = (
                cur_cap - w_try[valid_idx, trial_select]
            )
            # all instances without a valid idx could not assign any trial node
            not_assigned = ~valid_idx
            if not_assigned.any():
                # complete the current assignment where for some instances
                # all trials based on priority were infeasible,
                # by assigning the node with the highest weight
                # to a cluster which can still accommodate it,
                # or the dummy cluster at the origin otherwise (idx = -1)
                n_not_assigned = not_assigned.sum()
                cur_cap = cluster_capacity[not_assigned].clone()
                available = assignment[not_assigned] < 0
                # select node with highest weight from remaining unassigned nodes
                try:
                    w = weights[not_assigned][available].view(n_not_assigned, -1)
                except RuntimeError:
                    # fallback: just select best of first min available clusters
                    sm = available.sum(-1)
                    min_av = sm.min()
                    av_msk = sm > min_av
                    if min_av <= 1:
                        av_valid, av_idx = first_nonzero(available)
                        available[av_msk] = False
                        available[av_msk, av_idx[av_msk]] = True
                    else:
                        avbl = available[av_msk]
                        bi_cp = 0
                        cnter = 0
                        for bi, zi in zip(*avbl.nonzero(as_tuple=True)):
                            if bi == bi_cp:
                                cnter += 1
                            else:
                                bi_cp += 1
                                cnter = 1
                            if cnter > min_av:
                                avbl[bi, zi] = False
                        available[av_msk] = avbl
                    w = weights[not_assigned][available].view(n_not_assigned, -1)
                max_w, max_idx = w.max(dim=-1, keepdim=True)
                max_idx_n = (
                    available.nonzero(as_tuple=True)[1]
                    .view(n_not_assigned, -1)
                    .gather(dim=-1, index=max_idx)
                    .squeeze(-1)
                )
                # check the cluster priorities of this node
                msk = cur_cap >= max_w
                n_prio_idx = (
                    priority[not_assigned, max_idx_n]
                    .sort(dim=-1, descending=True)
                    .indices
                )
                # reorder msk according to priority and select first valid index
                select_msk = msk.gather(index=n_prio_idx, dim=-1)
                # get first nonzero as idx
                valid_idx, fnz = first_nonzero(select_msk, dim=-1)
                # nodes which cannot be assigned to any cluster anymore,
                # since no sufficient capacity is available,
                # are assigned to a dummy cluster with idx -1.
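                # (Added observation: where cl_select stays -1, the advanced
                # indexing into cluster_capacity below wraps around to the
                # last column, i.e. the masked dummy cluster appended in
                # _cluster; its capacity never influences selection since
                # its priority is always 0.)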
                cl_select = -torch.ones(
                    n_not_assigned, device=x.device, dtype=torch.long
                )
                cl_select[valid_idx] = n_prio_idx[valid_idx, fnz[valid_idx]]
                # do assignment
                assignment[not_assigned, max_idx_n] = cl_select
                # adapt priority
                priority[not_assigned, max_idx_n] = 0
                # adjust cluster capacity
                cur_cap = cluster_capacity[not_assigned, cl_select].clone()
                cluster_capacity[not_assigned, cl_select] = cur_cap - max_w.squeeze(-1)

        return assignment.view(bs, num_init, n)
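
# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module): fits the
# model on random data. It assumes the scikit-learn-style module call of
# torch_kmeans, where calling the model runs fit + predict and returns a
# ClusterResult with a `labels` field; `demands` is a hypothetical name
# for raw positive sample weights.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    bs, n, d, n_clusters = 2, 20, 2, 4
    x = torch.randn(bs, n, d)
    demands = torch.rand(bs, n) + 0.01  # strictly positive raw weights
    # normalize per instance so all weights lie in (0, 1] and the total
    # demand of 1.0 fits within the total cluster capacity
    w = demands / demands.sum(dim=-1, keepdim=True)

    model = ConstrainedKMeans(n_clusters=n_clusters, seed=123, verbose=False)
    result = model(x, weights=w)
    print(result.labels.shape)  # expected: torch.Size([2, 20])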