# License: BSD-3-Clause
import os
from os.path import dirname
import numpy as np
from sklearn.gaussian_process import kernels
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.cluster import KMeans
from scipy.sparse.linalg import eigs
from sklearn.impute import SimpleImputer
from ..preprocessing import MMTransformer
from ..utils import check_Xs_y
from .. import octavemodule_installed, oct2py_module_error
if octavemodule_installed:
import oct2py
[docs]
class LFIMVC(BaseEstimator, ClusterMixin):
r"""
Late Fusion Incomplete Multi-View Clustering (LF-IMVC). [#lfimvcpaper]_ [#lfimvccode]_
LF-IMVC jointly learns a consensus clustering matrix, imputes each incomplete base matrix, and optimizes the
corresponding permutation matrices to integrate the incomplete clustering matrices generated by incomplete views.
Parameters
----------
n_clusters : int, default=8
The number of clusters to generate.
kernel : callable, default=kernels.Sum(kernels.DotProduct(), kernels.WhiteKernel())
Specifies the kernel type to be used in the algorithm.
lambda_reg : float, default=1.
Regularization parameter. The algorithm demonstrated stable performance across a wide range of
this hyperparameter.
max_iter : int, default=100
Maximum number of iterations.
random_state : int, default=None
Determines the randomness. Use an int to make the randomness deterministic.
engine : str, default=python
Engine to use for computing the model. Current options are 'octave' or 'python'.
verbose : bool, default=False
Verbosity mode.
clean_space : bool, default=True
If engine is 'octave' and clean_space is True, the session will be closed after fitting the model.
Attributes
----------
labels_ : array-like of shape (n_samples,)
Labels of each point in training data.
embedding_ : np.array
Consensus clustering matrix to be used as input for the KMeans clustering step.
WP_ : array-like of shape (n_clusters, n_clusters, n_mods)
p-th permutation matrix.
HP_ : array-like of shape (n_samples, n_clusters, n_mods)
missing part of the p-th base clustering matrix.
loss_ : array-like of shape (n_iter\_,)
Values of the loss function.
n_iter_ : int
Number of iterations.
References
----------
.. [#lfimvcpaper] X. Liu et al., "Late Fusion Incomplete Multi-View Clustering," in IEEE Transactions on Pattern
Analysis and Machine Intelligence, vol. 41, no. 10, pp. 2410-2423, 1 Oct. 2019,
doi: 10.1109/TPAMI.2018.2879108.
.. [#lfimvccode] https://github.com/xinwangliu/Late-Fusion-Incomplete-Multi-view-Clustering
Example
--------
>>> import numpy as np
>>> import pandas as pd
>>> from imml.cluster import LFIMVC
>>> Xs = [pd.DataFrame(np.random.default_rng(42).random((20, 10))) for i in range(3)]
>>> estimator = LFIMVC(n_clusters = 2)
>>> labels = estimator.fit_predict(Xs)
"""
def __init__(self, n_clusters: int = 8, kernel: callable = kernels.Sum(kernels.DotProduct(), kernels.WhiteKernel()),
lambda_reg: float = 1., max_iter: int = 200, random_state:int = None, engine: str ="python",
verbose = False, clean_space: bool = True):
if not isinstance(n_clusters, int):
raise ValueError(f"Invalid n_clusters. It must be an int. A {type(n_clusters)} was passed.")
if n_clusters < 2:
raise ValueError(f"Invalid n_clusters. It must be an greater than 1. {n_clusters} was passed.")
engines_options = ["octave", "python"]
if engine not in engines_options:
raise ValueError(f"Invalid engine. Expected one of {engines_options}. {engine} was passed.")
if (engine == "octave") and (not octavemodule_installed):
raise ImportError(oct2py_module_error)
self.n_clusters = n_clusters
self.kernel = kernel
self.lambda_reg = lambda_reg
self.random_state = random_state
self.max_iter = max_iter
self.engine = engine
self.verbose = verbose
self.clean_space = clean_space
if self.engine == "octave":
octave_folder = dirname(__file__)
octave_folder = os.path.join(octave_folder, "_" + (os.path.basename(__file__).split(".")[0]))
self._octave_folder = octave_folder
octave_files = [x for x in os.listdir(octave_folder) if x.endswith(".m")]
self._oc = oct2py.Oct2Py(temp_dir= octave_folder)
for octave_file in octave_files:
with open(os.path.join(octave_folder, octave_file)) as f:
self._oc.eval(f.read())
[docs]
def fit(self, Xs, y=None):
r"""
Fit the transformer to the input data.
Parameters
----------
Xs : list of array-likes objects
- Xs length: n_mods
- Xs[i] shape: (n_samples, n_features_i)
A list of different modalities.
y : Ignored
Not used, present here for API consistency by convention.
Returns
-------
self : Fitted estimator.
"""
Xs = check_Xs_y(Xs, ensure_all_finite='allow-nan')
if self.engine=="octave":
imputer = MMTransformer(transformer=SimpleImputer())
transformed_Xs = imputer.fit_transform(Xs)
transformed_Xs = [self.kernel(X) for X in transformed_Xs]
transformed_Xs = np.array(transformed_Xs).swapaxes(0, -1)
if self.random_state is not None:
self._oc.rand('seed', self.random_state)
U, WP,HP, obj = self._oc.IncompleteMultikernelLatefusionclusteringV1Hv(transformed_Xs, self.n_clusters,
self.lambda_reg, self.max_iter, nout=4)
if self.clean_space:
self._clean_space()
elif self.engine=="python":
imputer = MMTransformer(transformer=SimpleImputer())
transformed_Xs = imputer.fit_transform(Xs)
transformed_Xs = [self.kernel(X) for X in transformed_Xs]
transformed_Xs = np.array(transformed_Xs).swapaxes(0, -1)
U, WP, HP, obj = self._incomplete_multikernel_late_fusion_clustering(transformed_Xs, self.n_clusters,
self.lambda_reg)
model = KMeans(n_clusters= self.n_clusters, n_init="auto", random_state= self.random_state)
self.labels_ = model.fit_predict(X= U)
self.embedding_, self.WP_, self.HP_, self.loss_ = U, WP, HP, obj
self.n_iter_ = len(self.loss_)
return self
def _predict(self, Xs):
r"""
Return clustering results for samples.
Parameters
----------
Xs : list of array-likes objects
- Xs length: n_mods
- Xs[i] shape: (n_samples, n_features_i)
A list of different modalities.
Returns
-------
labels : ndarray of shape (n_samples,)
Index of the cluster each sample belongs to.
"""
return self.labels_
[docs]
def fit_predict(self, Xs, y=None):
r"""
Fit the model and return clustering results.
Convenience method; equivalent to calling fit(X) followed by predict(X).
Parameters
----------
Xs : list of array-likes objects
- Xs length: n_mods
- Xs[i] shape: (n_samples, n_features_i)
A list of different modalities.
Returns
-------
labels : ndarray of shape (n_samples,)
Index of the cluster each sample belongs to.
"""
labels = self.fit(Xs)._predict(Xs)
return labels
def _clean_space(self):
[os.remove(os.path.join(self._octave_folder, x)) for x in ["reader.mat", "writer.mat"]]
self._oc.exit()
del self._oc
return None
# def k_center(self, KH):
# r"""
# Center a kernel matrix.
# """
# n = KH.shape[1]
#
# if np.ndim(KH) == 2:
# D = np.sum(KH) / n
# E = np.sum(D) / n
# J = np.matmul(np.ones(shape=(n, 1)), D)
# KH -= J - J.T + np.matmul(E, np.ones(shape=(n, n)))
# K = 0.5 * (KH + KH.T)
#
# elif np.ndim(KH) == 3:
# for i in range(KH.shape[2]):
# D = np.sum(KH[:, :, i], 0) / n
# E = np.sum(D) / n
# J = np.ones(shape=(n,)) * D
# KH[:, :, i] = KH[:, :, i] - J - J.T + E * np.ones(shape=(n, n))
# KH[:, :, i] = 0.5 * (KH[:, :, i] + KH[:, :, i].T) + 1e-12 * np.eye(n)
#
# return KH
#
# def k_norm(self, KH):
# r"""
# Normalize a kernel matrix.
# """
# if KH.shape[2] > 1:
# for i in range(KH.shape[2]):
# KH[:, :, i] = KH[:, :, i] / np.sqrt(np.outer(np.diag(KH[:, :, i]), np.diag(KH[:, :, i]).T))
# else:
# KH = KH / np.sqrt(np.matmul(np.diag(KH),np.diag(KH).T))
# return KH
def _my_kernel_kmeans(self, K, n_clusters):
r"""
Determines eigenvectors.
Parameters
----------
K: 2-D array of shape (n_samples, n_samples)
n_clusters: int
The number of clusters.
Returns
-------
H_normalized: 2-D array of shape (n_samples, n_clusters)
"""
K = (K + K.T) / 2
_, H = eigs(A=K, k=n_clusters, which='LR') # Debuggé
H_normalized = H / np.tile(np.sqrt(np.sum(H ** 2, 1)), reps=(n_clusters, 1)).T
return H_normalized
def _update_wp_absent_clustering_v1(self, HP, Hstar):
r"""
Update WP variable.
Parameters
----------
HP: 3-D array of shape (n_samples, n_clusters, n_mods)
Hstar: 2-D array of shape (n_samples, n_clusters)
Returns
-------
WP: 3-D array of shape (n_clusters, n_clusters, n_mods)
"""
k = HP.shape[1]
numker = HP.shape[2]
WP = np.zeros(shape=(k, k, numker))
for p in range(numker):
Tp = np.matmul(HP[:, :, p].T, Hstar)
Up, Sp, Vp = np.linalg.svd(Tp, full_matrices=False)
V = Vp.T.conj()
WP[:, :, p] = np.matmul(Up, V.T)
return WP
def _update_hp_absent_clustering_v1(self, WP, Hstar, lambda_reg, HP00):
r"""
Update HP variable.
Parameters
----------
WP: 3-D array of shape (n_clusters, n_clusters, n_mods)
Hstar: 2-D array of shape (n_samples, n_clusters)
lambda_reg : float, default=1.
Regularization parameter. The algorithm demonstrated stable performance across a wide range of
this hyperparameter.
HP00: 3-D array of shape (n_samples, n_clusters, n_mods)
Returns
-------
HP: 3-D array of shape (n_samples, n_clusters, n_mods)
"""
num = HP00.shape[0]
k = HP00.shape[1]
numker = HP00.shape[2]
HP = np.zeros(shape=(num, k, numker))
for p in range(numker):
Vp = np.matmul(Hstar, WP[:, :, p]) + lambda_reg * HP00[:, :, p]
Up, Sp, Vp = np.linalg.svd(Vp, full_matrices=False)
V = Vp.T.conj()
HP[:, :, p] = np.matmul(Up, V.T)
return HP
def _incomplete_multikernel_late_fusion_clustering(self, KH, n_clusters, lambda_reg):
r"""
Runs the LFIMVC clustering algorithm.
Parameters
----------
KH: 3-D array of shape (n_samples, ?, n_mods)
n_clusters : int, default=8
The number of clusters to generate.
lambda_reg : float, default=1.
Regularization parameter. The algorithm demonstrated stable performance across a wide range of
this hyperparameter.
normalize: bool, default=True
True if you want to normalize the kernel matrix.
Returns
-------
H_normalized: list of array-likes objects of shape (n_samples, n_clusters)
WP: 3-D array of shape (n_clusters, n_clusters, n_mods)
HP: 3-D array of shape (n_samples, n_clusters, n_views)
obj: list of float
"""
# if normalize:
# KH = self.k_center(KH) # K_center debuggé
# KH = self.k_norm(KH) # K_norm debuggé
num = KH.shape[0] # Number of samples
numker = KH.shape[2] # Number of kernels
HP = np.zeros(shape=(num, n_clusters, numker))
for ker in range(numker):
HP[:, :, ker] = self._my_kernel_kmeans(KH[:, :, ker], n_clusters)
max_iter = 200
WP = np.zeros(shape=(n_clusters, n_clusters, numker))
for p in range(numker):
WP[:, :, p] = np.eye(n_clusters)
HP00 = HP
flag = 1
iter = 0
obj = []
obj.append(0)
while flag:
iter += 1
RpHpwp = np.zeros(shape=(num, n_clusters))
for p in range(numker):
RpHpwp = RpHpwp + (np.matmul(HP[:, :, p], WP[:, :, p]))
Uh, Sh, Vh = np.linalg.svd(RpHpwp, full_matrices=False)
V = Vh.T.conj()
Hstar = np.matmul(Uh, V.T)
WP = self._update_wp_absent_clustering_v1(HP, Hstar)
HP = self._update_hp_absent_clustering_v1(WP, Hstar, lambda_reg, HP00)
RpHpwp = np.zeros(shape=(num, n_clusters))
obj2 = 0
for p in range(numker):
RpHpwp += np.matmul(HP[:, :, p], WP[:, :, p])
obj2 += np.trace(np.matmul(HP[:, :, p].T, HP00[:, :, p]))
obj.append(np.trace(np.matmul(Hstar.T, RpHpwp)) + lambda_reg * obj2)
if (iter > 2) and ((np.abs((obj[iter] - obj[iter - 1]) / obj[iter]) < 1e-4) or (iter > max_iter)):
flag = 0
H_normalized = Hstar / np.tile(A=np.sqrt(np.sum(Hstar ** 2, 1)), reps=(n_clusters, 1)).T
return H_normalized, WP, HP, obj