Source code for imml.cluster.simcadc

# License: BSD-3-Clause

import os
from os.path import dirname

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.cluster import KMeans
from scipy.linalg import svd
from scipy.stats import zscore

from ..impute import simple_mod_imputer
from ..preprocessing import select_complete_samples
from ..utils import check_Xs
from ..explore import get_missing_samples_by_mod

matlabmodule_installed = False
oct2py_module_error = "Module 'matlab' needs to be installed. See https://imml.readthedocs.io/stable/main/installation.html#optional-dependencies"
try:
    import oct2py
    matlabmodule_installed = True
except ImportError:
    pass



[docs]
class SIMCADC(BaseEstimator, ClassifierMixin):
    r"""
    Scalable Incomplete Multiview Clustering with Adaptive Data Completion (SIMC-ADC). [#simcadcpaper]_ [#simcadccode]_

    The SIMC-ADC algorithm captures the complementary information from different views by building a view-specific
    anchor graph. The anchor graph construction and a structure alignment are jointly optimized to enhance
    clustering quality.

    Parameters
    ----------
    n_clusters : int, default=8
        The number of clusters to generate.
    lambda_parameter : float, default=1
        Balance the influence between anchor graph generation and alignment term.
    n_anchors : int, default=None
        Number of anchors. If None, use n_clusters.
    beta : float, default=1
        Balance the influence between anchor graph generation and alignment term.
    gamma : float, default=1
        Balance the influence between anchor graph generation and alignment term.
    random_state : int, default=None
        Determines the randomness. Use an int to make the randomness deterministic.
    engine : str, default=python
        Engine to use for computing the model. Current options are 'matlab' or 'python'.
    verbose : bool, default=False
        Verbosity mode.
    clean_space : bool, default=True
        If engine is 'matlab' and clean_space is True, the session will be closed after fitting the model.

    Attributes
    ----------
    labels_ : array-like of shape (n_samples,)
        Labels of each point in training data.
    embedding_ : array-like of shape (n_samples, n_clusters)
        Consensus clustering matrix to be used as input for the KMeans clustering step.
    V_ : array-like of shape (n_clusters, n_clusters)
        Commont latent feature matrix.
    A_ : array-like of shape (n_clusters, n_clusters)
        Learned anchors.
    Z_ : array-like of shape (n_clusters, n_samples)
        modality-specific anchor graph.
    loss_ : array-like of shape (n_iter\_,)
        Values of the loss function.
    n_iter_ : int
        Number of iterations.

    References
    ----------
    .. [#simcadcpaper] He, W.-J., Zhang, Z., & Wei, Y. (2023). Scalable incomplete multi-view clustering with adaptive
                       data completion. Information Sciences, 649, 119562. doi:10.1016/j.ins.2023.119562.
    .. [#simcadccode] https://github.com/DarrenZZhang/INS23-SIMC_ADC

    Example
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from imml.cluster import SIMCADC
    >>> Xs = [pd.DataFrame(np.random.default_rng(42).random((20, 10))) for i in range(3)]
    >>> estimator = SIMCADC(n_clusters = 2)
    >>> labels = estimator.fit_predict(Xs)
    """

    def __init__(self, n_clusters: int = 8, lambda_parameter: float = 1, n_anchors: int = None,
                 beta: float = 1, gamma: float = 1, eps: float = 1e-25, random_state:int = None,
                 engine: str ="python", verbose = False, clean_space: bool = True):
        if not isinstance(n_clusters, int):
            raise ValueError(f"Invalid n_clusters. It must be an int. A {type(n_clusters)} was passed.")
        if n_clusters < 2:
            raise ValueError(f"Invalid n_clusters. It must be an greater than 1. {n_clusters} was passed.")
        engines_options = ["matlab", "python"]
        if engine not in engines_options:
            raise ValueError(f"Invalid engine. Expected one of {engines_options}. {engine} was passed.")
        if (engine == "matlab") and (not matlabmodule_installed):
            raise ImportError(oct2py_module_error)

        self.n_clusters = n_clusters
        self.lambda_parameter = lambda_parameter
        self.beta = beta
        self.gamma = gamma
        self.eps = eps
        self.n_anchors = n_clusters if n_anchors is None else n_anchors
        self.random_state = random_state
        self.engine = engine
        self.verbose = verbose
        self.clean_space = clean_space

        if self.engine == "matlab":
            matlab_folder = dirname(__file__)
            matlab_folder = os.path.join(matlab_folder, "_" + (os.path.basename(__file__).split(".")[0]))
            self._matlab_folder = matlab_folder
            matlab_files = [x for x in os.listdir(matlab_folder) if x.endswith(".m")]
            self._oc = oct2py.Oct2Py(temp_dir= matlab_folder)
            for matlab_file in matlab_files:
                with open(os.path.join(matlab_folder, matlab_file)) as f:
                    self._oc.eval(f.read())



[docs]
    def fit(self, Xs, y=None):
        r"""
        Fit the transformer to the input data.

        Parameters
        ----------
        Xs : list of array-likes objects
            - Xs length: n_mods
            - Xs[i] shape: (n_samples, n_features_i)

            A list of different modalities.
        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self :  Fitted estimator.
        """
        Xs = check_Xs(Xs, ensure_all_finite='allow-nan')

        if self.engine=="matlab":
            if not isinstance(Xs[0], pd.DataFrame):
                Xs = [pd.DataFrame(X) for X in Xs]
            mean_mod_profile = [X.mean(axis=0).to_frame(X_id) for X_id, X in enumerate(select_complete_samples(Xs))]
            incomplete_samples = get_missing_samples_by_mod(Xs=Xs, return_as_list=True)
            mean_mod_profile = [pd.DataFrame(np.tile(means, len(incom))).values for means, incom in
                                  zip(mean_mod_profile, incomplete_samples)]

            transformed_Xs = simple_mod_imputer(Xs, value="zeros")
            transformed_Xs, mean_mod_profile = tuple(transformed_Xs), tuple(mean_mod_profile)

            w = [pd.DataFrame(np.eye(len(X)), index=X.index, columns=X.index) for X in Xs]
            w = [eye.loc[samples,:].values for eye, samples in zip(w, incomplete_samples)]
            w = tuple(w)

            n_incomplete_samples_mod = list(len(incomplete_sample) for incomplete_sample in incomplete_samples)

            # if self.random_state is not None:
            #     self._oc.rand('seed', self.random_state)
            u,v,a,w,z,iter,obj = self._oc.SIMC(transformed_Xs, len(Xs[0]), self.lambda_parameter,
                                                self.n_clusters, self.n_anchors, w, n_incomplete_samples_mod,
                                                mean_mod_profile, self.beta, self.gamma, nout=7)
            obj = obj[0]

            if self.clean_space:
                self._clean_space()

        elif self.engine=="python":
            if not isinstance(Xs[0], pd.DataFrame):
                Xs = [pd.DataFrame(X) for X in Xs]
            mean_mod_profile = [X.mean(axis=0).to_frame(X_id) for X_id, X in enumerate(select_complete_samples(Xs))]
            incomplete_samples = get_missing_samples_by_mod(Xs=Xs, return_as_list=True)
            mean_mod_profile = [pd.DataFrame(np.tile(means, len(incom))).values for means, incom in
                                 zip(mean_mod_profile, incomplete_samples)]

            transformed_Xs = simple_mod_imputer(Xs, value="zeros")
            # transformed_Xs, mean_view_profile = tuple(transformed_Xs), tuple(mean_view_profile)

            w = [pd.DataFrame(np.eye(len(X)), index=X.index, columns=X.index) for X in Xs]
            w = [eye.loc[samples, :].values for eye, samples in zip(w, incomplete_samples)]
            # w = tuple(w)

            n_incomplete_samples_mod = list(len(incomplete_sample) for incomplete_sample in incomplete_samples)

            if self.random_state is not None:
                np.random.seed(self.random_state)
            u, v, a, w, z, iter, obj = self._SIMC(transformed_Xs, len(Xs[0]), self.lambda_parameter,
                                                     self.n_clusters, self.n_anchors, w, n_incomplete_samples_mod,
                                                     mean_mod_profile, self.beta, self.gamma)

        model = KMeans(n_clusters= self.n_clusters, n_init= "auto", random_state= self.random_state)
        self.labels_ = model.fit_predict(X=u)
        self.embedding_ = u
        self.V_ = v
        self.A_ = a
        self.Z_, self.loss_, self.iter_ = z, obj, iter
        self.n_iter_ = len(self.loss_)

        return self



    def _predict(self, Xs):
        r"""
        Return clustering results for samples.

        Parameters
        ----------
        Xs : list of array-likes objects
            - Xs length: n_mods
            - Xs[i] shape: (n_samples, n_features_i)

            A list of different modalities.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Index of the cluster each sample belongs to.
        """
        return self.labels_



[docs]
    def fit_predict(self, Xs, y=None):
        r"""
        Fit the model and return clustering results.
        Convenience method; equivalent to calling fit(X) followed by predict(X).

        Parameters
        ----------
        Xs : list of array-likes objects
            - Xs length: n_mods
            - Xs[i] shape: (n_samples, n_features_i)

            A list of different modalities.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Index of the cluster each sample belongs to.
        """

        labels = self.fit(Xs)._predict(Xs)
        return labels



    def _clean_space(self):
        [os.remove(os.path.join(self._matlab_folder, x)) for x in ["reader.mat", "writer.mat"]]
        self._oc.exit()
        del self._oc
        return None



    def _eproj_simplex_new(self, v, k=1):
        r"""
        Adjust the v variable if needed.

        Parameters
        ----------
        v: list of length (n_clusters)
        k: int, default=1

        Returns
        -------
        v0: list of length (n_clusters)
        """
        ft = 1
        n = len(v)

        v0 = v - np.mean(v) + k / n
        vmin = np.min(v0)

        if vmin < 0:
            lambda_m = 0
            f = 1

            while abs(f) > 1e-10:
                v1 = v0 - lambda_m
                posidx = v1 > 0
                npos = np.sum(posidx)

                g = -npos
                f = np.sum(v1[posidx]) - k

                lambda_m -= f / g
                ft += 1

                if ft > 100:
                    return np.maximum(v1, 0)

            return np.maximum(v1, 0)

        else:
            # If no adjustment is needed, return the shifted vector
            return v0


    def _SIMC(self, Y, num_sample, lambda_parameter, n_clusters, n_anchors, N, Ne, E, beta, gamma):
        r"""
        Runs the SIMCADC algorithm.

        Parameters
        ----------
        Y: list of array-likes objects of length (n_mods)
            - Y[i] shape: (n_samples, n_features_i)
        num_sample: int
            Number of samples
        lambda_parameter : float, default=1
            Balance the influence between anchor graph generation and alignment term.
        n_clusters : int, default=8
            The number of clusters to generate.
        n_anchors : int, default=None
            Number of anchors. If None, use n_clusters.
        N: list of arrays-likes of length (n_mods)
            - N[i] shape: (missing_view_columns, n_features_i)
        Ne: list of arrays-likes of length (n_mods)
            Number of missing modality columns
        E: list of arrays-likes of length (n_mods)
            - E[i] shape: (n_features_i, missing_view_columns)
        beta: float, default=1.0
        gamma: float, default=1.0

        Returns
        -------
        UU: list of array-likes objects of shape (n_samples, n_clusters)
        V: list of array-likes objects of shape (n_clusters, n_clusters)
        A: list of array-likes objects of shape (n_clusters, n_anchors)
        W: list of array-likes objects of length (n_view)
            - W[i] shape: (n_features_i, n_clusters)
        Z_final: list of array-likes objects of shape (n_anchors, n_samples)
        iter: int
            Number of iterations.
        obj: List of floats of length (iter)
        """
        # Initialize parameters
        maxIter = 50
        num_view = len(Y)

        W = [None] * num_view
        A = np.zeros(shape=(n_clusters, n_anchors))
        Z_final = np.zeros(shape=(n_anchors, num_sample))
        NNT = [None] * num_view
        B = [None] * num_view
        Z = [None] * num_view
        R = [None] * num_view

        # Preprocessing Y and initializing
        for i in range(num_view):
            Y[i] = np.nan_to_num(zscore(Y[i], ddof=1, axis=0).T, nan=0)
            di = Y[i].shape[0]
            W[i] = np.zeros(shape=(di, n_clusters))
            NNT[i] = np.matmul(N[i], N[i].T)
            B[i] = np.zeros(shape=(di, num_sample))
            Z[i] = np.zeros(shape=(n_anchors, num_sample))
            R[i] = np.eye(n_anchors)

        Z_final[:, :n_anchors] = np.eye(n_anchors)
        alpha = (np.ones(shape=(1, num_view)) / num_view)[0]

        flag = 1
        iter = 0
        obj = []

        while flag:
            iter += 1

            X = [Y[iv] + np.matmul(E[iv], N[iv]) for iv in range(num_view)]
            X = [np.nan_to_num(i, nan=0) for i in X]

            # Update W_i
            for iv in range(num_view):
                AZ = np.matmul(A, Z[iv])
                C = np.matmul(X[iv], AZ.T)
                U, _, Vt = svd(C, full_matrices=False)
                Vt = Vt.T.conj()
                W[iv] = np.matmul(U, Vt.T)

            # Update A
            part1 = sum(alpha[ia] ** 2 * np.matmul(W[ia].T, np.matmul(X[ia], Z[ia].T)) for ia in range(num_view))
            Unew, _, Vnew = svd(part1, full_matrices=False)
            Vnew = Vnew.T.conj()
            A = np.matmul(Unew, Vnew.T)

            # Update Z_i
            for iv in range(num_view):
                C1 = alpha[iv] ** 2 * np.matmul(X[iv].T, np.matmul(W[iv], A)) + gamma * np.matmul(Z_final.T, R[iv])
                C2 = alpha[iv] ** 2 + gamma
                C1 = C1.T
                for ii in range(num_sample):
                    ut = C1[:, ii] / C2
                    Z[iv][:, ii] = self._eproj_simplex_new(ut)

            # Update Z_final
            C3 = sum(gamma * np.matmul(Z[iv].T, R[iv].T) for iv in range(num_view))
            C3 = C3.T
            C4 = num_view * gamma + 1
            for ii in range(num_sample):
                ut = C3[:, ii] / C4
                Z_final[:, ii] = self._eproj_simplex_new(ut)

            # Update E
            for iv in range(num_view):
                B[iv] = Y[iv] - np.matmul(W[iv], np.matmul(A, Z_final))
                E[iv] = np.matmul(-B[iv], np.matmul(N[iv].T, np.linalg.inv(NNT[iv] + beta * np.ones((Ne[iv], Ne[iv])))))

            # Update R
            for iv in range(num_view):
                Z_iv_final = np.matmul(Z[iv], Z_final.T)
                U_R, _, V_R = svd(Z_iv_final, full_matrices=False)
                V_R = V_R.T.conj()
                R[iv] = np.matmul(U_R, V_R.T)

            # Update alpha
            for iv in range(num_view):
                alpha[iv] = np.sqrt(1 / (np.linalg.norm(B[iv] + np.matmul(E[iv], N[iv]), 'fro') + np.finfo('float').eps))

            # Objective calculation
            term1 = sum(alpha[iv] ** 2 * np.linalg.norm(X[iv] - np.matmul(W[iv], np.matmul(A, Z_final)), 'fro') ** 2 for iv in range(num_view))
            term2 = lambda_parameter * np.linalg.norm(Z_final, 'fro') ** 2
            obj.append(term1 + term2)

            if iter > 9 and (abs((obj[iter - 2] - obj[iter - 1]) / obj[iter - 2]) < 1e-3 or iter > maxIter or obj[
                iter - 1] < 1e-10):
                UU, _, V = svd(Z_final.T, full_matrices=False)
                V = V.T.conj()
                flag = 0

        UU = UU / np.sqrt(np.sum(UU ** 2, axis=1, keepdims=True))
        return UU, V, A, W, Z_final, iter, obj