# License: BSD-3-Clause
import os
from contextlib import contextmanager
from os.path import dirname
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.cluster import KMeans
from sklearn.gaussian_process import kernels
from scipy.sparse.linalg import eigs
from numpy.linalg import svd
from ..impute import get_observed_mod_indicator
from ..utils import check_Xs
matlabmodule_installed = False
oct2py_module_error = "Module 'matlab' needs to be installed. See https://imml.readthedocs.io/stable/main/installation.html#optional-dependencies"
try:
import oct2py
matlabmodule_installed = True
except ImportError:
pass
@contextmanager
def fixed_seed(seed):
state = np.random.get_state()
np.random.seed(seed)
try:
yield
finally:
np.random.set_state(state)
[docs]
class EEIMVC(BaseEstimator, ClusterMixin):
r"""
Efficient and Effective Incomplete Multi-view Clustering (EE-IMVC). [#eeimvcpaper]_ [#eeimvccode]_
EE-IMVC impute missing views with a consensus clustering matrix that is regularized with prior knowledge.
Parameters
----------
n_clusters : int, default=8
The number of clusters to generate.
kernel : callable, default=None
Specifies the kernel type to be used in the algorithm. By default, it applies dot product kernel.
lambda_reg : float, default=1.
Regularization parameter. The algorithm demonstrated stable performance across a wide range of
this hyperparameter.
qnorm : float, default=2.
Regularization parameter. The algorithm demonstrated stable performance across a wide range of
this hyperparameter.
random_state : int, default=None
Determines the randomness. Use an int to make the randomness deterministic.
engine : str, default=python
Engine to use for computing the model. Current options are 'matlab' or 'python'.
verbose : bool, default=False
Verbosity mode.
clean_space : bool, default=True
If engine is 'matlab' and clean_space is True, the session will be closed after fitting the model.
Attributes
----------
labels_ : array-like of shape (n_samples,)
Labels of each point in training data.
embedding_ : array-like of shape (n_samples, n_clusters)
Consensus clustering matrix to be used as input for the KMeans clustering step.
WP_ : array-like of shape (n_clusters, n_clusters, n_mods)
p-th permutation matrix.
HP_ : array-like of shape (n_samples, n_clusters, n_mods)
missing part of the p-th base clustering matrix.
beta_ : array-like of shape (n_mods,)
Adaptive weights of clustering matrices.
loss_ : array-like of shape (n_iter\_,)
Values of the loss function.
n_iter_ : int
Number of iterations.
References
----------
.. [#eeimvcpaper] X. Liu et al., "Efficient and Effective Regularized Incomplete Multi-View Clustering," in
IEEE Transactions on Pattern Analysis and Machine Intelligence, vol. 43, no. 8, pp. 2634-2646,
1 Aug. 2021, doi: 10.1109/TPAMI.2020.2974828.
.. [#eeimvccode] https://github.com/xinwangliu/TPAMI_EEIMVC
Example
--------
>>> import numpy as np
>>> import pandas as pd
>>> from imml.cluster import EEIMVC
>>> Xs = [pd.DataFrame(np.random.default_rng(42).random((20, 10))) for i in range(3)]
>>> estimator = EEIMVC(n_clusters = 2)
>>> labels = estimator.fit_predict(Xs)
"""
def __init__(self, n_clusters: int = 8, kernel: callable = None,
lambda_reg: float = 1., qnorm: float = 2., random_state: int = None,
engine: str ="python", verbose = False, clean_space: bool = True):
if not isinstance(n_clusters, int):
raise ValueError(f"Invalid n_clusters. It must be an int. A {type(n_clusters)} was passed.")
if n_clusters < 2:
raise ValueError(f"Invalid n_clusters. It must be an greater than 1. {n_clusters} was passed.")
engines_options = ["matlab", "python"]
if engine not in engines_options:
raise ValueError(f"Invalid engine. Expected one of {engines_options}. {engine} was passed.")
if (engine == "matlab") and (not matlabmodule_installed):
raise ImportError(oct2py_module_error)
if kernel is None:
kernel = kernels.Sum(kernels.DotProduct(), kernels.WhiteKernel())
self.n_clusters = n_clusters
self.qnorm = qnorm
self.kernel = kernel
self.lambda_reg = lambda_reg
self.random_state = random_state
self.engine = engine
self.verbose = verbose
self.clean_space = clean_space
if self.engine == "matlab":
matlab_folder = dirname(__file__)
matlab_folder = os.path.join(matlab_folder, "_" + (os.path.basename(__file__).split(".")[0]))
self._matlab_folder = matlab_folder
matlab_files = [x for x in os.listdir(matlab_folder) if x.endswith(".m")]
self._oc = oct2py.Oct2Py(temp_dir= matlab_folder)
for matlab_file in matlab_files:
with open(os.path.join(matlab_folder, matlab_file)) as f:
self._oc.eval(f.read())
[docs]
def fit(self, Xs, y=None):
r"""
Fit the transformer to the input data.
Parameters
----------
Xs : list of array-likes objects
- Xs length: n_mods
- Xs[i] shape: (n_samples, n_features_i)
A list of different modalities.
y : Ignored
Not used, present here for API consistency by convention.
Returns
-------
self : Fitted estimator.
"""
Xs = check_Xs(Xs, ensure_all_finite='allow-nan')
if self.engine=="matlab":
if isinstance(Xs[0], pd.DataFrame):
transformed_Xs = [X.values for X in Xs]
elif isinstance(Xs[0], np.ndarray):
transformed_Xs = Xs
observed_mod_indicator = get_observed_mod_indicator(transformed_Xs)
if isinstance(observed_mod_indicator, np.ndarray):
observed_mod_indicator = pd.DataFrame(observed_mod_indicator)
s = [modality[modality == 0].index.values for _,modality in observed_mod_indicator.items()]
transformed_Xs = [self.kernel(X) for X in transformed_Xs]
transformed_Xs = np.array(transformed_Xs).swapaxes(0, -1)
s = tuple([{"indx": i +1} for i in s])
if self.random_state is not None:
self._oc.rand('seed', self.random_state)
H_normalized,WP,HP,beta,obj = self._oc.incompleteLateFusionMKCOrthHp_lambda(transformed_Xs, s,
self.n_clusters, self.qnorm,
self.lambda_reg, nout=5)
beta = beta[:,0]
obj = obj[0]
if self.clean_space:
self._clean_space()
elif self.engine=="python":
observed_mod_indicator = get_observed_mod_indicator(Xs)
if isinstance(observed_mod_indicator, pd.DataFrame):
observed_mod_indicator = observed_mod_indicator.reset_index(drop=True)
elif isinstance(observed_mod_indicator[0], np.ndarray):
observed_mod_indicator = pd.DataFrame(observed_mod_indicator)
s = [modality[modality == 0].index.values for _, modality in observed_mod_indicator.items()]
transformed_Xs = [self.kernel(X) for X in Xs]
transformed_Xs = np.array(transformed_Xs).swapaxes(0, -1)
transformed_Xs = np.nan_to_num(transformed_Xs, nan=0)
s = tuple([{"indx": i + 1} for i in s])
H_normalized, WP, HP, beta, obj = self._incomplete_late_fusion_MKCOrthHp_lamba(transformed_Xs, s,
self.n_clusters,
self.qnorm, self.lambda_reg)
model = KMeans(n_clusters= self.n_clusters, n_init="auto", random_state= self.random_state)
self.labels_ = model.fit_predict(X=H_normalized)
self.embedding_, self.WP_, self.HP_, self.beta_, self.loss_ = H_normalized, WP, HP, beta, obj
self.n_iter_ = len(self.loss_)
return self
def _predict(self, Xs):
r"""
Return clustering results for samples.
Parameters
----------
Xs : list of array-likes objects
- Xs length: n_mods
- Xs[i] shape: (n_samples, n_features_i)
A list of different modalities.
Returns
-------
labels : ndarray of shape (n_samples,)
Index of the cluster each sample belongs to.
"""
return self.labels_
[docs]
def fit_predict(self, Xs, y=None):
r"""
Fit the model and return clustering results.
Convenience method; equivalent to calling fit(X) followed by predict(X).
Parameters
----------
Xs : list of array-likes objects
- Xs length: n_mods
- Xs[i] shape: (n_samples, n_features_i)
A list of different modalities.
Returns
-------
labels : ndarray of shape (n_samples,)
Index of the cluster each sample belongs to.
"""
labels = self.fit(Xs)._predict(Xs)
return labels
def _clean_space(self):
[os.remove(os.path.join(self._matlab_folder, x)) for x in ["reader.mat", "writer.mat"]]
self._oc.exit()
del self._oc
return None
def _my_initialization_Hp(self, KH, S, n_clusters):
r"""
Initialize HP and WP variable.
Parameters
----------
KH: 3-D array of shape(n_samples, n_samples, kernels)
S: tuple of shape (n_mods)
- S[i]['indx']: array of missing values column
n_clusters: int
The number of clusters.
Returns
-------
HP: 3-d array of shape (n_samples, n_clusters, n_mods)
WP: 3-d array of shape (n_clusters, n_clusters, n_mods)
"""
numker = KH.shape[2]
num = KH.shape[0]
HP = np.zeros(shape=(num, n_clusters, numker))
WP = np.zeros(shape=(n_clusters, n_clusters, numker))
for p in range(numker):
KH_tmp = KH[:, :, p]
HP_tmp = HP[:, :, p]
obs_index = np.setdiff1d(ar1=[i for i in range(num)], ar2=[i-1 for i in S[p]['indx'].T])
KAp = KH_tmp[np.ix_(obs_index, obs_index)]
KAp = (KAp + KAp.T) / 2 + 1e-8 * np.eye(len(obs_index))
if self.random_state is not None:
v0 = np.random.default_rng(self.random_state).uniform(size=min(KAp.shape))
with fixed_seed(self.random_state):
_, Hp = eigs(KAp, n_clusters, which='LR', v0=v0)
else:
_, Hp = eigs(KAp, n_clusters, which='LR')
HP_tmp[np.ix_(obs_index), :] = Hp
HP[:, :, p] = HP_tmp
WP[:, :, p] = np.eye(n_clusters)
return HP, WP
def _algorithm2(self, KH, S):
r"""
Process KH with the missing index.
Parameters
----------
KH: 3-D array of shape(n_samples, n_samples, kernels)
S: tuple of shape (n_mods)
- S[i]['indx']: array of missing values column
Returns
-------
KH2: 3-D array of shape (n_samples, n_samples, n_mods)
"""
num = KH.shape[0]
numker = KH.shape[2]
KH2 = np.zeros(shape=(num, num, numker))
for p in range(numker):
KH_tmp = KH[:, :, p]
KH2_tmp = KH2[:, :, p]
obs_index = np.setdiff1d(ar1=[i for i in range(num)], ar2=[i-1 for i in S[p]['indx'].T])
KAp = KH_tmp[np.ix_(obs_index, obs_index)]
KH2_tmp[np.ix_(obs_index, obs_index)] = (KAp + KAp.T)/2
KH2[:, :, p] = KH2_tmp
return KH2
def _my_comb_fun(self, Y, beta):
r"""
Process data with beta values
Parameters
----------
Y: 3-D array of shape(n_samples, n_samples, kernels)
beta: list of float (len=n_mods)
Returns
-------
cF: 2-D array of shape (n_samples, n_samples)
"""
m = Y.shape[2]
n = Y.shape[0]
cF = np.zeros(shape=(n, n))
for p in range(m):
cF += Y[:, :, p] * beta[p]
return cF
def _my_kernal_kmeans(self, K, n_clusters):
r"""
Determines eigenvectors.
Parameters
----------
K: 2-D array of shape (n_samples, n_samples)
n_clusters: int
The number of clusters.
Returns
-------
H_normalized: 2-D array of shape (n_samples, n_clusters)
"""
K = (K + K.T) / 2
if self.random_state is not None:
v0 = np.random.default_rng(self.random_state).uniform(size=min(K.shape))
with fixed_seed(self.random_state):
_, H = eigs(K, n_clusters, which='LR', v0=v0)
else:
_, H = eigs(K, n_clusters, which='LR')
obj = np.trace(np.matmul(H.T, np.matmul(K, H))) - np.trace(K)
H_normalized = H
return H_normalized
def _update_WP_absent_clustering_V1(self, HP, Hstar):
r"""
Update the WP variable.
Parameters
----------
HP: 3-D array of shape (n_samples, n_clusters, n_mods)
Hstar: 2-D array of shape (n_samples, n_clusters)
Returns
-------
WP: 3-d array of shape (n_clusters, n_clusters, n_mods)
"""
k = HP.shape[1]
numker = HP.shape[2]
WP = np.zeros(shape=(k, k, numker))
for p in range(numker):
Tp = np.matmul(HP[:, :, p].T, Hstar)
Up, Sp, Vp = np.linalg.svd(Tp, full_matrices=False)
V = Vp.T.conj()
WP[:, :, p] = np.matmul(Up, V.T)
return WP
def _update_HP_absent_clustering_OrthHp(self, WP, Hstar, S, HP00):
r"""
Update the HP variable.
Parameters
----------
WP: 3-D array of shape (n_clusters, n_clusters, n_views)
Hstar: 2-D array of shape (n_samples, n_clusters)
S: tuple of shape (n_views)
- S[i]['indx']: array of missing values column
HP00: 3-D array of shape (n_samples, n_clusters, n_views)
Returns
-------
HP: 3-D array of shape (n_samples, n_clusters, n_views)
"""
num = Hstar.shape[0]
k = Hstar.shape[1]
numker = WP.shape[2]
HP = np.zeros(shape=(num, k, numker))
for p in range(numker):
mis_indx = [i-1 for i in S[p]['indx'].T]
obs_indx = np.setdiff1d(ar1=[i for i in range(num)], ar2=mis_indx)
if len(mis_indx) > 0:
Vp = np.matmul(Hstar[np.ix_(mis_indx), :], WP[:, :, p].T)
Up, Sp, Vp = svd(Vp, full_matrices=False)
V = Vp.T.conj()
HP[mis_indx, :, p] = np.matmul(Up, V.T)
HP_tmp = HP[:, :, p]
HP00_tmp = HP00[:, :, p]
HP_tmp[np.ix_(obs_indx), :] = HP00_tmp[np.ix_(obs_indx), :]
HP[:, :, p] = HP_tmp
return HP
def _update_beta_absent_clustering(self, HP, WP, Hstar, qnorm):
r"""
Update the beta variable
Parameters
----------
HP: 3-D array of shape (n_samples, n_clusters, n_views)
WP: 3-D array of shape (n_clusters, n_clusters, n_views)
Hstar: 2-D array of shape (n_samples, n_clusters)
qnorm: float, default=2.0
Returns
-------
beta: list of float (len=n_views)
"""
numker = WP.shape[2]
HHPWP = np.zeros(shape=(numker, 1))
for p in range(numker):
HHPWP[p] = np.trace(np.matmul(Hstar.T, np.matmul(HP[:, :, p], WP[:, :, p])))
beta = HHPWP**(1/qnorm-1) / np.sum(HHPWP**(qnorm/(qnorm-1)))**(1/qnorm)
return beta
def _incomplete_late_fusion_MKCOrthHp_lamba(self, KH, S, n_clusters, qnorm, lambda_reg):
r"""
Runs the EEIMVC clustering algorithm.
Parameters
----------
KH: 3-D array of shape(n_samples, n_samples, n_views)
S: tuple of shape (n_views)
- S[i]['indx']: array of missing values column
n_clusters: int
The number of clusters.
qnorm: float, default=2.0
lambda_: float, default=1.0
Returns
-------
H_normalized: list of array-likes objects of shape (n_samples, n_clusters)
WP: 3-D array of shape (n_clusters, n_clusters, n_views)
HP: 3-D array of shape (n_samples, n_clusters, n_views)
beta: list of float (len=n_views)
obj: list of float
"""
num = KH.shape[1]
numker = KH.shape[2]
maxIter = 100
HP, WP = self._my_initialization_Hp(KH, S, n_clusters)
HP00 = HP
beta = np.ones(shape=(numker, 1)) * (1/numker)**(1/qnorm)
KA = self._algorithm2(KH, S)
KC = self._my_comb_fun(KA, beta)
H0 = self._my_kernal_kmeans(KC, n_clusters)
flag = 1
iter = 0
RpHpwp = np.zeros(shape=(num, n_clusters))
for p in range(numker):
RpHpwp += beta[p] * (HP[:, :, p] @ WP[:, :, p])
RpHpwp_lambda = RpHpwp + (lambda_reg * H0)
obj = []
obj.append(0)
while flag:
iter += 1
Uh, Sh, Vh = svd(RpHpwp_lambda, full_matrices=False)
V = Vh.T.conj()
Hstar = Uh @ V.T
WP = self._update_WP_absent_clustering_V1(HP, Hstar)
HP = self._update_HP_absent_clustering_OrthHp(WP, Hstar, S, HP00)
beta = self._update_beta_absent_clustering(HP, WP, Hstar, qnorm)
RpHpwp = np.zeros(shape=(num, n_clusters))
for p in range(numker):
RpHpwp += beta[p] * np.matmul(HP[:, :, p], WP[:, :, p])
RpHpwp_lambda = RpHpwp + (lambda_reg * H0)
obj.append(np.trace(np.matmul(Hstar.T, RpHpwp_lambda)))
if (iter > 2) and (np.abs((obj[iter] - obj[iter-1]) / obj[iter])) < 1e-4 or (iter > maxIter):
flag = 0
H_normalized = np.real(Hstar / np.tile(A=np.sqrt(np.sum(Hstar**2, 1)), reps=(n_clusters, 1)).T)
return H_normalized, WP, HP, beta, obj