Source code for imml.cluster.imsr

# License: BSD-3-Clause

import os
from os.path import dirname
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.cluster import KMeans
from scipy.sparse.linalg import eigs

from ..impute import get_observed_mod_indicator
from ..utils import check_Xs_y
from .. import octavemodule_installed, oct2py_module_error

if octavemodule_installed:
    import oct2py



[docs]
class IMSR(BaseEstimator, ClusterMixin):
    r"""
    Self-representation Subspace Clustering for Incomplete Multi-view Data (IMSR). [#imsrpaper]_ [#imscaglcode]_

    IMSR performs feature extraction, imputation and self-representation learning to obtain a low-rank regularized
    consensus coefficient matrix.

    Parameters
    ----------
    n_clusters : int, default=8
        The number of clusters to generate.
    lbd : float, default=1
        Positive trade-off parameter used for the optimization function. It is recommended to set from 0 to 1.
    gamma : float, default=1
        Positive trade-off parameter used for the optimization function. It is recommended to set from 0 to 1.
    random_state : int, default=None
        Determines the randomness. Use an int to make the randomness deterministic.
    engine : str, default=python
        Engine to use for computing the model. Current options are 'octave' or 'python'.
    verbose : bool, default=False
        Verbosity mode.
    clean_space : bool, default=True
        If engine is 'octave' and clean_space is True, the session will be closed after fitting the model.

    Attributes
    ----------
    labels_ : array-like of shape (n_samples,)
        Labels of each point in training data.
    embedding_ : array-like of shape (n_samples, n_clusters)
        Consensus clustering matrix to be used as input for the KMeans clustering step.
    loss_ : array-like of shape (n_iter\_,)
        Values of the loss function.
    n_iter_ : int
        Number of iterations.

    References
    ----------
    .. [#imsrpaper] Jiyuan Liu, Xinwang Liu, Yi Zhang, Pei Zhang, Wenxuan Tu, Siwei Wang, Sihang Zhou, Weixuan Liang,
                     Siqi Wang, and Yuexiang Yang. 2021. Self-Representation Subspace Clustering for Incomplete
                     Multi-view Data. In Proceedings of the 29th ACM International Conference on Multimedia (MM '21).
                     Association for Computing Machinery, New York, NY, USA, 2726–2734.
                     https://doi.org/10.1145/3474085.3475379.
    .. [#imscaglcode] https://github.com/liujiyuan13/IMSR-code_release

    Example
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from imml.cluster import IMSR
    >>> Xs = [pd.DataFrame(np.random.default_rng(42).random((20, 10))) for i in range(3)]
    >>> estimator = IMSR(n_clusters = 2)
    >>> labels = estimator.fit_predict(Xs)
    """

    def __init__(self, n_clusters: int = 8, lbd : float = 1, gamma: float = 1, random_state:int = None,
                 engine: str ="python", verbose = False, clean_space: bool = True):
        if not isinstance(n_clusters, int):
            raise ValueError(f"Invalid n_clusters. It must be an int. A {type(n_clusters)} was passed.")
        if n_clusters < 2:
            raise ValueError(f"Invalid n_clusters. It must be an greater than 1. {n_clusters} was passed.")
        engines_options = ["octave", "python"]
        if engine not in engines_options:
            raise ValueError(f"Invalid engine. Expected one of {engines_options}. {engine} was passed.")
        if (engine == "octave") and (not octavemodule_installed):
            raise ImportError(oct2py_module_error)
        if lbd <= 0:
            raise ValueError(f"Invalid lbd. It must be a positive value. {lbd} was passed.")
        if gamma <= 0:
            raise ValueError(f"Invalid gamma. It must be a positive value. {gamma} was passed.")

        self.n_clusters = n_clusters
        self.lbd = lbd
        self.gamma = gamma
        self.random_state = random_state
        self.engine = engine
        self.verbose = verbose
        self.clean_space = clean_space

        if self.engine == "octave":
            octave_folder = dirname(__file__)
            octave_folder = os.path.join(octave_folder, "_" + (os.path.basename(__file__).split(".")[0]))
            self._octave_folder = octave_folder
            octave_files = [x for x in os.listdir(octave_folder) if x.endswith(".m")]
            self._oc = oct2py.Oct2Py(temp_dir= octave_folder)
            for octave_file in octave_files:
                with open(os.path.join(octave_folder, octave_file)) as f:
                    self._oc.eval(f.read())



[docs]
    def fit(self, Xs, y=None):
        r"""
        Fit the transformer to the input data.

        Parameters
        ----------
        Xs : list of array-likes objects
            - Xs length: n_mods
            - Xs[i] shape: (n_samples, n_features_i)

            A list of different modalities.
        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self :  Fitted estimator.
        """
        Xs = check_Xs_y(Xs, ensure_all_finite='allow-nan')

        if not isinstance(Xs[0], pd.DataFrame):
            Xs = [pd.DataFrame(X) for X in Xs]
        observed_mod_indicator = get_observed_mod_indicator(Xs)
        if isinstance(observed_mod_indicator, pd.DataFrame):
            observed_mod_indicator = observed_mod_indicator.reset_index(drop=True)
        observed_mod_indicator = [(1 + missing_mod[missing_mod == 0].index).to_list() for _, missing_mod in observed_mod_indicator.items()]
        transformed_Xs = [X.T.values for X in Xs]

        if self.engine=="octave":
            if self.random_state is not None:
                self._oc.rand('seed', self.random_state)
            Z, obj = self._oc.IMSC(transformed_Xs, tuple(observed_mod_indicator), self.n_clusters, self.lbd, self.gamma, nout=2)

            if self.clean_space:
                self._clean_space()

        elif self.engine == "python":
            Z, obj = self._imsc(transformed_Xs, tuple(observed_mod_indicator), self.n_clusters, self.lbd, self.gamma)

        model = KMeans(n_clusters= self.n_clusters, n_init="auto", random_state= self.random_state)
        self.labels_ = model.fit_predict(X= Z)
        self.embedding_, self.loss_ = Z, obj
        self.n_iter_ = len(self.loss_)

        return self



    def _predict(self, Xs):
        r"""
        Return clustering results for samples.

        Parameters
        ----------
        Xs : list of array-likes objects
            - Xs length: n_mods
            - Xs[i] shape: (n_samples, n_features_i)

            A list of different modalities.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Index of the cluster each sample belongs to.
        """
        return self.labels_



[docs]
    def fit_predict(self, Xs, y=None):
        r"""
        Fit the model and return clustering results.
        Convenience method; equivalent to calling fit(X) followed by predict(X).

        Parameters
        ----------
        Xs : list of array-likes objects
            - Xs length: n_mods
            - Xs[i] shape: (n_samples, n_features_i)

            A list of different modalities.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Index of the cluster each sample belongs to.
        """

        labels = self.fit(Xs)._predict(Xs)
        return labels



    def _clean_space(self):
        [os.remove(os.path.join(self._octave_folder, x)) for x in ["reader.mat", "writer.mat"]]
        self._oc.exit()
        del self._oc
        return None



    def _imsc(self, X, Im, n_cluters, lbd, gamma):
        r"""
        Runs the IMSR clustering algorithm.

        Parameters
        ----------
        X : list of array-likes objects
            - Xs length: n_mods
            - Xs[i] shape: (n_samples, n_features_i)
        Im : array of shape (n_mods, columns_with_missing_values)
        n_cluters : int
            The number of clusters.
        lbd : float, default=1
            Positive trade-off parameter used for the optimization function. It is recommended to set from 0 to 1.
        gamma : float, default=1
            Positive trade-off parameter used for the optimization function. It is recommended to set from 0 to 1.

        Returns
        -------
        Utmp : list of array-likes objects of shape (n_samples, n_clusters)
        obj : float
        """
        V = len(X)
        max_iter = 100

        # Initialization
        X = [np.nan_to_num(mat, nan=0.0) for mat in X]
        beta = np.ones(shape=(V, )) / V

        Z = self._init_z(X, beta, lbd)

        t = 0
        flag = 1
        obj = []
        while flag:
            F = self._update_f(Z, n_cluters)
            Z = self._update_z(X, F, beta, lbd, gamma)
            X = self._update_x(X, Im, Z)

            append_obj, _, _, _ = self._cal_obj(X, Z, F, beta, lbd, gamma)
            obj.append([append_obj])

            if (t >= 2) and ((np.abs(np.subtract(obj[t - 1], obj[t]) / (obj[t])) < 1e-3) or (t > max_iter)):
                flag = 0
            t += 1

        Ztmp = (np.abs(Z) + np.abs(Z).T) / 2
        Utmp = np.real(self._baseline_spectral_onkernel(Ztmp, n_cluters))

        return Utmp, obj

    @staticmethod
    def _init_z(X, beta, lbd):
        r"""
        Initializes Z variable.

        Parameters
        ----------
        X : list of array-likes objects
            - X length: n_mods
            - X[i] shape: (n_samples, n_features_i)
        beta : list of n_mods values
        lbd : float, default=1
            Positive trade-off parameter used for the optimization function. It is recommended to set from 0 to 1.

        Returns
        -------
        Z : list of array-likes objects of shape (n_samples, n_samples)
        """
        V = len(X)
        n = X[0].shape[1]

        D = np.zeros(shape=(n, n))
        D = D + lbd * np.diag(np.ones(shape=(n,)))
        for v in range(V):
            D = D + beta[v] * (np.matmul(X[v].T, X[v]))

        D = np.linalg.inv(D)
        Z = -D / np.diag(D).T[: np.newaxis]
        Z = Z - np.diag(np.diag(Z))
        Z = (Z + Z.T) / 2

        return Z

    @staticmethod
    def _update_f(Z, n_cluters):
        r"""
        Updates the F variables.

        Parameters
        ----------
        Z : list of array-likes objects of shape (n_samples, n_samples)
        n_cluters : int
            The number of clusters.

        Returns
        -------
        F : list of array-likes objects of shape (n_clusters, n_samples)
        """
        _, Ftmp = eigs(A=(Z + Z.T), k=n_cluters, which='LR')
        F = Ftmp.T

        return np.real(F)

    @staticmethod
    def _update_z(X, F, beta, lbd, gamma):
        r"""
        Updates the Z variables.

        Parameters
        ----------
        X : list of array-likes objects
            - Xs length: n_mods
            - Xs[i] shape: (n_samples, n_features_i)
        F : list of array-likes objects of shape (n_clusters, n_samples)
        beta : list of n_mods values
        lbd : float, default=1
            Positive trade-off parameter used for the optimization function. It is recommended to set from 0 to 1.
        gamma : float, default=1
            Positive trade-off parameter used for the optimization function. It is recommended to set from 0 to 1.

        Returns
        -------
        Z : list of array-likes objects of shape (n_samples, n_samples)
        """
        V = len(X)
        n = X[0].shape[1]

        K = np.zeros(shape=(n, n))
        for v in range(V):
            K += beta[v] * np.matmul(X[v].T, X[v])

        C = np.matmul(F.T, F)
        base = K + (lbd + gamma) * np.eye(n)

        D = np.linalg.inv(base)
        beta = np.diag(D)
        Z1 = -D / np.diag(D).T[: np.newaxis]
        Z1 -= np.diag(np.diag(Z1))

        C = C - np.diag(np.diag(C))
        a = np.matmul(D, C)
        a -= np.diag(np.diag(a))
        b = np.diag(np.matmul(Z1.T, C))
        c = beta * b
        d = np.matmul(Z1, np.diag(c))
        d -= np.diag(np.diag(d))
        Z2 = gamma * (a - d)

        Z = Z1 + Z2
        return Z


    @staticmethod
    def _update_x(X, Im, Z):
        r"""
        Updates the X variables.

        Parameters
        ----------
        X : list of array-likes objects
            - Xs length: n_mods
            - Xs[i] shape: (n_samples, n_features_i)
        Im : array of shape (n_mods, columns_with_missing_values)
        Z : list of array-likes objects of shape (n_samples, n_samples)

        Returns
        -------
        X : list of array-likes objects
            - Xs length: n_mods
            - Xs[i] shape: (n_samples, n_features_i)
        """
        V = len(X)
        n = X[0].shape[1]

        B = np.eye(n) - Z - Z.T + (np.matmul(Z, Z.T))
        Io = [None] * V
        for v in range(V):
            Im_temp = [i-1 for i in Im[v]]
            Io[v] = np.setdiff1d(ar1=np.array([i for i in range(0, n)]), ar2=Im_temp)
            X[v][:, Im_temp] = np.linalg.solve(B[np.ix_(Im_temp, Im_temp)].conj().T,
                                               (np.matmul(-X[v][:, Io[v]], B[np.ix_(Io[v], Im_temp)]).conj().T)).conj().T

        return X

    @staticmethod
    def _cal_obj(X, Z, F, beta, lbd, gamma):
        r"""
        Returns pbj values with the individual terms that contribute to it.

        Parameters
        ----------
        X : list of array-likes objects
            - Xs length: n_mods
            - Xs[i] shape: (n_samples, n_features_i)
        Z : list of array-likes objects of shape (n_samples, n_samples)
        F
        beta : list of n_views values
        lbd : float, default=1
            Positive trade-off parameter used for the optimization function. It is recommended to set from 0 to 1.
        gamma : float, default=1
            Positive trade-off parameter used for the optimization function. It is recommended to set from 0 to 1.

        Returns
        -------
        obj : float
        term1 : float
        term2 : float
        term3 : float
        """
        V = len(X)
        term1 = 0

        for v in range(V):
            tmp = X[v] - np.matmul(X[v], Z)
            term1 = term1 + beta[v] * np.sum(np.sum(tmp ** 2))

        term2 = np.sum(np.sum(Z ** 2))
        tmp = Z - np.matmul(F.T, F)
        term3 = np.sum(np.sum(tmp ** 2))
        obj = term1 + lbd * term2 + gamma * term3

        return obj, term1, term2, term3

    @staticmethod
    def _baseline_spectral_onkernel(K, n_clusters):
        r"""
        This function returns the top eigenvectors of the given matrix.

        Parameters
        ----------
        K : list of array-likes objects of shape (n_samples, n_samples)
        n_clusters: int
            The number of clusters.

        Returns
        -------
        U : list of array-likes objects of shape (n_samples, n_clusters)
        """
        D = np.diag(np.sum(K, axis=1) + np.finfo(float).eps)
        inv_sqrt_D = np.sqrt(np.linalg.inv(np.abs(D)))
        L = np.matmul(np.matmul(inv_sqrt_D, K), inv_sqrt_D)
        L = (L + L.T) / 2
        V, U = eigs(L, k=n_clusters, which='LR', tol=0)

        return U