Source code for imml.cluster.eeimvc

import os
from contextlib import contextmanager
from os.path import dirname
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.cluster import KMeans
from sklearn.gaussian_process import kernels
from scipy.sparse.linalg import eigs
from numpy.linalg import svd

from ..impute import get_observed_mod_indicator
from ..utils import check_Xs

matlabmodule_installed = False
oct2py_module_error = "Module 'matlab' needs to be installed."
try:
    import oct2py
    matlabmodule_installed = True
except ImportError:
    pass


@contextmanager
def fixed_seed(seed):
    state = np.random.get_state()
    np.random.seed(seed)
    try:
        yield
    finally:
        np.random.set_state(state)


[docs] class EEIMVC(BaseEstimator, ClassifierMixin): r""" Efficient and Effective Incomplete Multi-view Clustering (EE-IMVC). [#eeimvcpaper]_ [#eeimvccode]_ EE-IMVC impute missing views with a consensus clustering matrix that is regularized with prior knowledge. Parameters ---------- n_clusters : int, default=8 The number of clusters to generate. kernel : callable, default=None Specifies the kernel type to be used in the algorithm. By default, it applies dot product kernel. lambda_reg : float, default=1. Regularization parameter. The algorithm demonstrated stable performance across a wide range of this hyperparameter. qnorm : float, default=2. Regularization parameter. The algorithm demonstrated stable performance across a wide range of this hyperparameter. random_state : int, default=None Determines the randomness. Use an int to make the randomness deterministic. engine : str, default=python Engine to use for computing the model. Current options are 'matlab' or 'python'. verbose : bool, default=False Verbosity mode. clean_space : bool, default=True If engine is 'matlab' and clean_space is True, the session will be closed after fitting the model. Attributes ---------- labels_ : array-like of shape (n_samples,) Labels of each point in training data. embedding_ : array-like of shape (n_samples, n_clusters) Consensus clustering matrix to be used as input for the KMeans clustering step. WP_ : array-like of shape (n_clusters, n_clusters, n_mods) p-th permutation matrix. HP_ : array-like of shape (n_samples, n_clusters, n_mods) missing part of the p-th base clustering matrix. beta_ : array-like of shape (n_mods,) Adaptive weights of clustering matrices. loss_ : array-like of shape (n_iter\_,) Values of the loss function. n_iter_ : int Number of iterations. References ---------- .. [#eeimvcpaper] X. Liu et al., "Efficient and Effective Regularized Incomplete Multi-View Clustering," in IEEE Transactions on Pattern Analysis and Machine Intelligence, vol. 43, no. 8, pp. 2634-2646, 1 Aug. 2021, doi: 10.1109/TPAMI.2020.2974828. .. [#eeimvccode] https://github.com/xinwangliu/TPAMI_EEIMVC Example -------- >>> import numpy as np >>> import pandas as pd >>> from imml.cluster import EEIMVC >>> Xs = [pd.DataFrame(np.random.default_rng(42).random((20, 10))) for i in range(3)] >>> estimator = EEIMVC(n_clusters = 2) >>> labels = estimator.fit_predict(Xs) """ def __init__(self, n_clusters: int = 8, kernel: callable = None, lambda_reg: float = 1., qnorm: float = 2., random_state: int = None, engine: str ="python", verbose = False, clean_space: bool = True): if not isinstance(n_clusters, int): raise ValueError(f"Invalid n_clusters. It must be an int. A {type(n_clusters)} was passed.") if n_clusters < 2: raise ValueError(f"Invalid n_clusters. It must be an greater than 1. {n_clusters} was passed.") engines_options = ["matlab", "python"] if engine not in engines_options: raise ValueError(f"Invalid engine. Expected one of {engines_options}. {engine} was passed.") if (engine == "matlab") and (not matlabmodule_installed): raise ImportError(oct2py_module_error) if kernel is None: kernel = kernels.Sum(kernels.DotProduct(), kernels.WhiteKernel()) self.n_clusters = n_clusters self.qnorm = qnorm self.kernel = kernel self.lambda_reg = lambda_reg self.random_state = random_state self.engine = engine self.verbose = verbose self.clean_space = clean_space if self.engine == "matlab": matlab_folder = dirname(__file__) matlab_folder = os.path.join(matlab_folder, "_" + (os.path.basename(__file__).split(".")[0])) self._matlab_folder = matlab_folder matlab_files = [x for x in os.listdir(matlab_folder) if x.endswith(".m")] self._oc = oct2py.Oct2Py(temp_dir= matlab_folder) for matlab_file in matlab_files: with open(os.path.join(matlab_folder, matlab_file)) as f: self._oc.eval(f.read())
[docs] def fit(self, Xs, y=None): r""" Fit the transformer to the input data. Parameters ---------- Xs : list of array-likes objects - Xs length: n_mods - Xs[i] shape: (n_samples, n_features_i) A list of different modalities. y : Ignored Not used, present here for API consistency by convention. Returns ------- self : Fitted estimator. """ Xs = check_Xs(Xs, ensure_all_finite='allow-nan') if self.engine=="matlab": if isinstance(Xs[0], pd.DataFrame): transformed_Xs = [X.values for X in Xs] elif isinstance(Xs[0], np.ndarray): transformed_Xs = Xs observed_mod_indicator = get_observed_mod_indicator(transformed_Xs) if isinstance(observed_mod_indicator, np.ndarray): observed_mod_indicator = pd.DataFrame(observed_mod_indicator) s = [modality[modality == 0].index.values for _,modality in observed_mod_indicator.items()] transformed_Xs = [self.kernel(X) for X in transformed_Xs] transformed_Xs = np.array(transformed_Xs).swapaxes(0, -1) s = tuple([{"indx": i +1} for i in s]) if self.random_state is not None: self._oc.rand('seed', self.random_state) H_normalized,WP,HP,beta,obj = self._oc.incompleteLateFusionMKCOrthHp_lambda(transformed_Xs, s, self.n_clusters, self.qnorm, self.lambda_reg, nout=5) beta = beta[:,0] obj = obj[0] if self.clean_space: self._clean_space() elif self.engine=="python": observed_mod_indicator = get_observed_mod_indicator(Xs) if isinstance(observed_mod_indicator, pd.DataFrame): observed_mod_indicator = observed_mod_indicator.reset_index(drop=True) elif isinstance(observed_mod_indicator[0], np.ndarray): observed_mod_indicator = pd.DataFrame(observed_mod_indicator) s = [modality[modality == 0].index.values for _, modality in observed_mod_indicator.items()] transformed_Xs = [self.kernel(X) for X in Xs] transformed_Xs = np.array(transformed_Xs).swapaxes(0, -1) transformed_Xs = np.nan_to_num(transformed_Xs, nan=0) s = tuple([{"indx": i + 1} for i in s]) H_normalized, WP, HP, beta, obj = self._incomplete_late_fusion_MKCOrthHp_lamba(transformed_Xs, s, self.n_clusters, self.qnorm, self.lambda_reg) model = KMeans(n_clusters= self.n_clusters, n_init="auto", random_state= self.random_state) self.labels_ = model.fit_predict(X=H_normalized) self.embedding_, self.WP_, self.HP_, self.beta_, self.loss_ = H_normalized, WP, HP, beta, obj self.n_iter_ = len(self.loss_) return self
def _predict(self, Xs): r""" Return clustering results for samples. Parameters ---------- Xs : list of array-likes objects - Xs length: n_mods - Xs[i] shape: (n_samples, n_features_i) A list of different modalities. Returns ------- labels : ndarray of shape (n_samples,) Index of the cluster each sample belongs to. """ return self.labels_
[docs] def fit_predict(self, Xs, y=None): r""" Fit the model and return clustering results. Convenience method; equivalent to calling fit(X) followed by predict(X). Parameters ---------- Xs : list of array-likes objects - Xs length: n_mods - Xs[i] shape: (n_samples, n_features_i) A list of different modalities. Returns ------- labels : ndarray of shape (n_samples,) Index of the cluster each sample belongs to. """ labels = self.fit(Xs)._predict(Xs) return labels
def _clean_space(self): [os.remove(os.path.join(self._matlab_folder, x)) for x in ["reader.mat", "writer.mat"]] self._oc.exit() del self._oc return None def _my_initialization_Hp(self, KH, S, n_clusters): r""" Initialize HP and WP variable. Parameters ---------- KH: 3-D array of shape(n_samples, n_samples, kernels) S: tuple of shape (n_mods) - S[i]['indx']: array of missing values column n_clusters: int The number of clusters. Returns ------- HP: 3-d array of shape (n_samples, n_clusters, n_mods) WP: 3-d array of shape (n_clusters, n_clusters, n_mods) """ numker = KH.shape[2] num = KH.shape[0] HP = np.zeros(shape=(num, n_clusters, numker)) WP = np.zeros(shape=(n_clusters, n_clusters, numker)) for p in range(numker): KH_tmp = KH[:, :, p] HP_tmp = HP[:, :, p] obs_index = np.setdiff1d(ar1=[i for i in range(num)], ar2=[i-1 for i in S[p]['indx'].T]) KAp = KH_tmp[np.ix_(obs_index, obs_index)] KAp = (KAp + KAp.T) / 2 + 1e-8 * np.eye(len(obs_index)) if self.random_state is not None: v0 = np.random.default_rng(self.random_state).uniform(size=min(KAp.shape)) with fixed_seed(self.random_state): _, Hp = eigs(KAp, n_clusters, which='LR', v0=v0) else: _, Hp = eigs(KAp, n_clusters, which='LR') HP_tmp[np.ix_(obs_index), :] = Hp HP[:, :, p] = HP_tmp WP[:, :, p] = np.eye(n_clusters) return HP, WP def _algorithm2(self, KH, S): r""" Process KH with the missing index. Parameters ---------- KH: 3-D array of shape(n_samples, n_samples, kernels) S: tuple of shape (n_mods) - S[i]['indx']: array of missing values column Returns ------- KH2: 3-D array of shape (n_samples, n_samples, n_mods) """ num = KH.shape[0] numker = KH.shape[2] KH2 = np.zeros(shape=(num, num, numker)) for p in range(numker): KH_tmp = KH[:, :, p] KH2_tmp = KH2[:, :, p] obs_index = np.setdiff1d(ar1=[i for i in range(num)], ar2=[i-1 for i in S[p]['indx'].T]) KAp = KH_tmp[np.ix_(obs_index, obs_index)] KH2_tmp[np.ix_(obs_index, obs_index)] = (KAp + KAp.T)/2 KH2[:, :, p] = KH2_tmp return KH2 def _my_comb_fun(self, Y, beta): r""" Process data with beta values Parameters ---------- Y: 3-D array of shape(n_samples, n_samples, kernels) beta: list of float (len=n_mods) Returns ------- cF: 2-D array of shape (n_samples, n_samples) """ m = Y.shape[2] n = Y.shape[0] cF = np.zeros(shape=(n, n)) for p in range(m): cF += Y[:, :, p] * beta[p] return cF def _my_kernal_kmeans(self, K, n_clusters): r""" Determines eigenvectors. Parameters ---------- K: 2-D array of shape (n_samples, n_samples) n_clusters: int The number of clusters. Returns ------- H_normalized: 2-D array of shape (n_samples, n_clusters) """ K = (K + K.T) / 2 if self.random_state is not None: v0 = np.random.default_rng(self.random_state).uniform(size=min(K.shape)) with fixed_seed(self.random_state): _, H = eigs(K, n_clusters, which='LR', v0=v0) else: _, H = eigs(K, n_clusters, which='LR') obj = np.trace(np.matmul(H.T, np.matmul(K, H))) - np.trace(K) H_normalized = H return H_normalized def _update_WP_absent_clustering_V1(self, HP, Hstar): r""" Update the WP variable. Parameters ---------- HP: 3-D array of shape (n_samples, n_clusters, n_mods) Hstar: 2-D array of shape (n_samples, n_clusters) Returns ------- WP: 3-d array of shape (n_clusters, n_clusters, n_mods) """ k = HP.shape[1] numker = HP.shape[2] WP = np.zeros(shape=(k, k, numker)) for p in range(numker): Tp = np.matmul(HP[:, :, p].T, Hstar) Up, Sp, Vp = np.linalg.svd(Tp, full_matrices=False) V = Vp.T.conj() WP[:, :, p] = np.matmul(Up, V.T) return WP def _update_HP_absent_clustering_OrthHp(self, WP, Hstar, S, HP00): r""" Update the HP variable. Parameters ---------- WP: 3-D array of shape (n_clusters, n_clusters, n_views) Hstar: 2-D array of shape (n_samples, n_clusters) S: tuple of shape (n_views) - S[i]['indx']: array of missing values column HP00: 3-D array of shape (n_samples, n_clusters, n_views) Returns ------- HP: 3-D array of shape (n_samples, n_clusters, n_views) """ num = Hstar.shape[0] k = Hstar.shape[1] numker = WP.shape[2] HP = np.zeros(shape=(num, k, numker)) for p in range(numker): mis_indx = [i-1 for i in S[p]['indx'].T] obs_indx = np.setdiff1d(ar1=[i for i in range(num)], ar2=mis_indx) if len(mis_indx) > 0: Vp = np.matmul(Hstar[np.ix_(mis_indx), :], WP[:, :, p].T) Up, Sp, Vp = svd(Vp, full_matrices=False) V = Vp.T.conj() HP[mis_indx, :, p] = np.matmul(Up, V.T) HP_tmp = HP[:, :, p] HP00_tmp = HP00[:, :, p] HP_tmp[np.ix_(obs_indx), :] = HP00_tmp[np.ix_(obs_indx), :] HP[:, :, p] = HP_tmp return HP def _update_beta_absent_clustering(self, HP, WP, Hstar, qnorm): r""" Update the beta variable Parameters ---------- HP: 3-D array of shape (n_samples, n_clusters, n_views) WP: 3-D array of shape (n_clusters, n_clusters, n_views) Hstar: 2-D array of shape (n_samples, n_clusters) qnorm: float, default=2.0 Returns ------- beta: list of float (len=n_views) """ numker = WP.shape[2] HHPWP = np.zeros(shape=(numker, 1)) for p in range(numker): HHPWP[p] = np.trace(np.matmul(Hstar.T, np.matmul(HP[:, :, p], WP[:, :, p]))) beta = HHPWP**(1/qnorm-1) / np.sum(HHPWP**(qnorm/(qnorm-1)))**(1/qnorm) return beta def _incomplete_late_fusion_MKCOrthHp_lamba(self, KH, S, n_clusters, qnorm, lambda_reg): r""" Runs the EEIMVC clustering algorithm. Parameters ---------- KH: 3-D array of shape(n_samples, n_samples, n_views) S: tuple of shape (n_views) - S[i]['indx']: array of missing values column n_clusters: int The number of clusters. qnorm: float, default=2.0 lambda_: float, default=1.0 Returns ------- H_normalized: list of array-likes objects of shape (n_samples, n_clusters) WP: 3-D array of shape (n_clusters, n_clusters, n_views) HP: 3-D array of shape (n_samples, n_clusters, n_views) beta: list of float (len=n_views) obj: list of float """ num = KH.shape[1] numker = KH.shape[2] maxIter = 100 HP, WP = self._my_initialization_Hp(KH, S, n_clusters) HP00 = HP beta = np.ones(shape=(numker, 1)) * (1/numker)**(1/qnorm) KA = self._algorithm2(KH, S) KC = self._my_comb_fun(KA, beta) H0 = self._my_kernal_kmeans(KC, n_clusters) flag = 1 iter = 0 RpHpwp = np.zeros(shape=(num, n_clusters)) for p in range(numker): RpHpwp += beta[p] * (HP[:, :, p] @ WP[:, :, p]) RpHpwp_lambda = RpHpwp + (lambda_reg * H0) obj = [] obj.append(0) while flag: iter += 1 Uh, Sh, Vh = svd(RpHpwp_lambda, full_matrices=False) V = Vh.T.conj() Hstar = Uh @ V.T WP = self._update_WP_absent_clustering_V1(HP, Hstar) HP = self._update_HP_absent_clustering_OrthHp(WP, Hstar, S, HP00) beta = self._update_beta_absent_clustering(HP, WP, Hstar, qnorm) RpHpwp = np.zeros(shape=(num, n_clusters)) for p in range(numker): RpHpwp += beta[p] * np.matmul(HP[:, :, p], WP[:, :, p]) RpHpwp_lambda = RpHpwp + (lambda_reg * H0) obj.append(np.trace(np.matmul(Hstar.T, RpHpwp_lambda))) if (iter > 2) and (np.abs((obj[iter] - obj[iter-1]) / obj[iter])) < 1e-4 or (iter > maxIter): flag = 0 H_normalized = np.real(Hstar / np.tile(A=np.sqrt(np.sum(Hstar**2, 1)), reps=(n_clusters, 1)).T) return H_normalized, WP, HP, beta, obj