Source code for imml.cluster.lfimvc

# License: BSD-3-Clause

import os
from os.path import dirname
import numpy as np
from sklearn.gaussian_process import kernels
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.cluster import KMeans
from scipy.sparse.linalg import eigs
from sklearn.impute import SimpleImputer

from ..preprocessing import MMTransformer
from ..utils import check_Xs_y
from .. import octavemodule_installed, oct2py_module_error

if octavemodule_installed:
    import oct2py


[docs] class LFIMVC(BaseEstimator, ClusterMixin): r""" Late Fusion Incomplete Multi-View Clustering (LF-IMVC). [#lfimvcpaper]_ [#lfimvccode]_ LF-IMVC jointly learns a consensus clustering matrix, imputes each incomplete base matrix, and optimizes the corresponding permutation matrices to integrate the incomplete clustering matrices generated by incomplete views. Parameters ---------- n_clusters : int, default=8 The number of clusters to generate. kernel : callable, default=kernels.Sum(kernels.DotProduct(), kernels.WhiteKernel()) Specifies the kernel type to be used in the algorithm. lambda_reg : float, default=1. Regularization parameter. The algorithm demonstrated stable performance across a wide range of this hyperparameter. max_iter : int, default=200 Maximum number of iterations. random_state : int, default=None Determines the randomness. Use an int to make the randomness deterministic. engine : str, default=python Engine to use for computing the model. Current options are 'octave' or 'python'. verbose : bool, default=False Verbosity mode. clean_space : bool, default=True If engine is 'octave' and clean_space is True, the session will be closed after fitting the model. Attributes ---------- labels_ : array-like of shape (n_samples,) Labels of each point in training data. embedding_ : np.array Consensus clustering matrix to be used as input for the KMeans clustering step. WP_ : array-like of shape (n_clusters, n_clusters, n_mods) p-th permutation matrix. HP_ : array-like of shape (n_samples, n_clusters, n_mods) missing part of the p-th base clustering matrix. loss_ : array-like of shape (n_iter\_,) Values of the loss function. n_iter_ : int Number of iterations. References ---------- .. [#lfimvcpaper] X. Liu et al., "Late Fusion Incomplete Multi-View Clustering," in IEEE Transactions on Pattern Analysis and Machine Intelligence, vol. 41, no. 10, pp. 2410-2423, 1 Oct. 2019, doi: 10.1109/TPAMI.2018.2879108. .. [#lfimvccode] https://github.com/xinwangliu/Late-Fusion-Incomplete-Multi-view-Clustering Example -------- >>> import numpy as np >>> import pandas as pd >>> from imml.cluster import LFIMVC >>> Xs = [pd.DataFrame(np.random.default_rng(42).random((20, 10))) for i in range(3)] >>> estimator = LFIMVC(n_clusters = 2) >>> labels = estimator.fit_predict(Xs) """ def __init__(self, n_clusters: int = 8, kernel: callable = kernels.Sum(kernels.DotProduct(), kernels.WhiteKernel()), lambda_reg: float = 1., max_iter: int = 200, random_state:int = None, engine: str ="python", verbose = False, clean_space: bool = True): if not isinstance(n_clusters, int): raise ValueError(f"Invalid n_clusters. It must be an int. A {type(n_clusters)} was passed.") if n_clusters < 2: raise ValueError(f"Invalid n_clusters. It must be an greater than 1. {n_clusters} was passed.") engines_options = ["octave", "python"] if engine not in engines_options: raise ValueError(f"Invalid engine. Expected one of {engines_options}. {engine} was passed.") if (engine == "octave") and (not octavemodule_installed): raise ImportError(oct2py_module_error) self.n_clusters = n_clusters self.kernel = kernel self.lambda_reg = lambda_reg self.random_state = random_state self.max_iter = max_iter self.engine = engine self.verbose = verbose self.clean_space = clean_space if self.engine == "octave": octave_folder = dirname(__file__) octave_folder = os.path.join(octave_folder, "_" + (os.path.basename(__file__).split(".")[0])) self._octave_folder = octave_folder octave_files = [x for x in os.listdir(octave_folder) if x.endswith(".m")] self._oc = oct2py.Oct2Py(temp_dir= octave_folder) for octave_file in octave_files: with open(os.path.join(octave_folder, octave_file)) as f: self._oc.eval(f.read())
[docs] def fit(self, Xs, y=None): r""" Fit the transformer to the input data. Parameters ---------- Xs : list of array-likes objects - Xs length: n_mods - Xs[i] shape: (n_samples, n_features_i) A list of different modalities. y : Ignored Not used, present here for API consistency by convention. Returns ------- self : Fitted estimator. """ Xs = check_Xs_y(Xs, ensure_all_finite='allow-nan') imputer = MMTransformer(transformer=SimpleImputer()) transformed_Xs = imputer.fit_transform(Xs) transformed_Xs = [self.kernel(X) for X in transformed_Xs] KH = np.array(transformed_Xs).swapaxes(0, -1) if self.engine=="octave": if self.random_state is not None: self._oc.rand('seed', self.random_state) # Apply kcenter and knorm KH = self._oc.kcenter(KH) KH = self._oc.knorm(KH) num = KH.shape[0] numker = KH.shape[2] H = np.zeros((num, self.n_clusters, numker)) for ker in range(numker): H[:, :, ker] = self._oc.mykernelkmeans(KH[:, :, ker], self.n_clusters, nout=1) U, WP, HP, obj = self._oc.IncompleteMultikernelLatefusionclusteringV1Hv(H, self.n_clusters, self.lambda_reg, self.max_iter, nout=4) if obj.ndim == 2: obj = obj.flatten() if self.clean_space: self._clean_space() elif self.engine=="python": # Apply kcenter and knorm KH = self._k_center(KH) KH = self._k_norm(KH) # Compute base clustering matrices num = KH.shape[0] numker = KH.shape[2] H = np.zeros((num, self.n_clusters, numker)) for ker in range(numker): H[:, :, ker] = self._my_kernel_kmeans(KH[:, :, ker], self.n_clusters) U, WP, HP, obj = self._incomplete_multikernel_late_fusion_clustering(H, self.n_clusters, self.lambda_reg) model = KMeans(n_clusters= self.n_clusters, n_init="auto", random_state= self.random_state) self.labels_ = model.fit_predict(X= U) self.embedding_, self.WP_, self.HP_, self.loss_ = U, WP, HP, obj self.n_iter_ = len(self.loss_) return self
def _predict(self, Xs): r""" Return clustering results for samples. Parameters ---------- Xs : list of array-likes objects - Xs length: n_mods - Xs[i] shape: (n_samples, n_features_i) A list of different modalities. Returns ------- labels : ndarray of shape (n_samples,) Index of the cluster each sample belongs to. """ return self.labels_
[docs] def fit_predict(self, Xs, y=None): r""" Fit the model and return clustering results. Convenience method; equivalent to calling fit(X) followed by predict(X). Parameters ---------- Xs : list of array-likes objects - Xs length: n_mods - Xs[i] shape: (n_samples, n_features_i) A list of different modalities. Returns ------- labels : ndarray of shape (n_samples,) Index of the cluster each sample belongs to. """ labels = self.fit(Xs)._predict(Xs) return labels
def _clean_space(self): [os.remove(os.path.join(self._octave_folder, x)) for x in ["reader.mat", "writer.mat"]] self._oc.exit() del self._oc return None def _k_center(self, K): r""" Center a kernel matrix. Parameters ---------- K: 2-D or 3-D array Kernel matrix or stack of kernel matrices Returns ------- K: Centered kernel matrix """ n = K.shape[1] if K.ndim == 2: D = np.sum(K, axis=0) / n E = np.sum(D) / n J = np.outer(np.ones(n), D) K = K - J - J.T + E * np.ones((n, n)) K = 0.5 * (K + K.T) elif K.ndim == 3: for i in range(K.shape[2]): D = np.sum(K[:, :, i], axis=0) / n E = np.sum(D) / n J = np.outer(np.ones(n), D) K[:, :, i] = K[:, :, i] - J - J.T + E * np.ones((n, n)) K[:, :, i] = 0.5 * (K[:, :, i] + K[:, :, i].T) + 1e-12 * np.eye(n) return K def _k_norm(self, K): r""" Normalize a kernel matrix. Parameters ---------- K: 2-D or 3-D array Kernel matrix or stack of kernel matrices Returns ------- K: Normalized kernel matrix """ if K.ndim == 3: for i in range(K.shape[2]): diag_k = np.diag(K[:, :, i]) K[:, :, i] = K[:, :, i] / np.sqrt(np.outer(diag_k, diag_k)) else: diag_k = np.diag(K) K = K / np.sqrt(np.outer(diag_k, diag_k)) return K def _my_kernel_kmeans(self, K, n_clusters): r""" Determines eigenvectors. Parameters ---------- K: 2-D array of shape (n_samples, n_samples) n_clusters: int The number of clusters. Returns ------- H_normalized: 2-D array of shape (n_samples, n_clusters) """ K = (K + K.T) / 2 # Use eigs with deterministic starting vector if random_state is set if self.random_state is not None: rng = np.random.RandomState(self.random_state) v0 = rng.randn(K.shape[0]) _, H = eigs(A=K, k=n_clusters, which='LR', v0=v0) else: _, H = eigs(A=K, k=n_clusters, which='LR') # Take real part (eigenvectors should be real for symmetric matrices) H = np.real(H) H_normalized = H / np.tile(np.sqrt(np.sum(H ** 2, 1)), reps=(n_clusters, 1)).T return H_normalized def _update_wp_absent_clustering_v1(self, HP, Hstar): r""" Update WP variable. Parameters ---------- HP: 3-D array of shape (n_samples, n_clusters, n_mods) Hstar: 2-D array of shape (n_samples, n_clusters) Returns ------- WP: 3-D array of shape (n_clusters, n_clusters, n_mods) """ k = HP.shape[1] numker = HP.shape[2] WP = np.zeros(shape=(k, k, numker)) for p in range(numker): Tp = np.matmul(HP[:, :, p].T, Hstar) Up, Sp, Vp = np.linalg.svd(Tp, full_matrices=False) V = Vp.T.conj() WP[:, :, p] = np.matmul(Up, V.T) return WP def _update_hp_absent_clustering_v1(self, WP, Hstar, lambda_reg, HP00): r""" Update HP variable. Parameters ---------- WP: 3-D array of shape (n_clusters, n_clusters, n_mods) Hstar: 2-D array of shape (n_samples, n_clusters) lambda_reg : float, default=1. Regularization parameter. The algorithm demonstrated stable performance across a wide range of this hyperparameter. HP00: 3-D array of shape (n_samples, n_clusters, n_mods) Returns ------- HP: 3-D array of shape (n_samples, n_clusters, n_mods) """ num = HP00.shape[0] k = HP00.shape[1] numker = HP00.shape[2] HP = np.zeros(shape=(num, k, numker)) for p in range(numker): Vp = np.matmul(Hstar, WP[:, :, p].T) + lambda_reg * HP00[:, :, p] Up, Sp, Vp = np.linalg.svd(Vp, full_matrices=False) V = Vp.T.conj() HP[:, :, p] = np.matmul(Up, V.T) return HP def _incomplete_multikernel_late_fusion_clustering(self, HP, n_clusters, lambda_reg): r""" Runs the LFIMVC clustering algorithm. Parameters ---------- HP: 3-D array of shape (n_samples, n_clusters, n_mods) Base clustering matrices from each view n_clusters : int, default=8 The number of clusters to generate. lambda_reg : float, default=1. Regularization parameter. The algorithm demonstrated stable performance across a wide range of this hyperparameter. Returns ------- H_normalized: list of array-likes objects of shape (n_samples, n_clusters) WP: 3-D array of shape (n_clusters, n_clusters, n_mods) HP: 3-D array of shape (n_samples, n_clusters, n_views) obj: list of float """ num = HP.shape[0] # Number of samples numker = HP.shape[2] # Number of kernels WP = np.zeros(shape=(n_clusters, n_clusters, numker)) for p in range(numker): WP[:, :, p] = np.eye(n_clusters) HP00 = HP flag = 1 iter = 0 obj = [] while flag: iter += 1 RpHpwp = np.zeros(shape=(num, n_clusters)) for p in range(numker): RpHpwp = RpHpwp + (np.matmul(HP[:, :, p], WP[:, :, p])) Uh, Sh, Vh = np.linalg.svd(RpHpwp, full_matrices=False) V = Vh.T.conj() Hstar = np.matmul(Uh, V.T) WP = self._update_wp_absent_clustering_v1(HP, Hstar) HP = self._update_hp_absent_clustering_v1(WP, Hstar, lambda_reg, HP00) RpHpwp = np.zeros(shape=(num, n_clusters)) obj2 = 0 for p in range(numker): RpHpwp += np.matmul(HP[:, :, p], WP[:, :, p]) obj2 += np.trace(np.matmul(HP[:, :, p].T, HP00[:, :, p])) obj.append(np.trace(np.matmul(Hstar.T, RpHpwp)) + lambda_reg * obj2) if (iter > 2) and ((np.abs((obj[iter-1] - obj[iter-2]) / obj[iter-1]) < 1e-4) or (iter > self.max_iter)): flag = 0 H_normalized = Hstar / np.tile(A=np.sqrt(np.sum(Hstar ** 2, 1)), reps=(n_clusters, 1)).T return H_normalized, WP, HP, obj