# License: BSD-3-Clause
import os
from os.path import dirname
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.cluster import KMeans
from scipy.linalg import svd
from scipy.stats import zscore
from sklearn.impute import SimpleImputer
from ..preprocessing import select_complete_samples, MMTransformer
from ..utils import check_Xs_y
from ..explore import get_missing_samples_by_mod
from .. import octavemodule_installed, oct2py_module_error
if octavemodule_installed:
import oct2py
[docs]
class SIMCADC(BaseEstimator, ClusterMixin):
r"""
Scalable Incomplete Multiview Clustering with Adaptive Data Completion (SIMC-ADC). [#simcadcpaper]_ [#simcadccode]_
The SIMC-ADC algorithm captures the complementary information from different views by building a view-specific
anchor graph. The anchor graph construction and a structure alignment are jointly optimized to enhance
clustering quality.
Parameters
----------
n_clusters : int, default=8
The number of clusters to generate.
lambda_parameter : float, default=1
Balance the influence between anchor graph generation and alignment term.
n_anchors : int, default=None
Number of anchors. If None, use n_clusters.
beta : float, default=1
Balance the influence between anchor graph generation and alignment term.
gamma : float, default=1
Balance the influence between anchor graph generation and alignment term.
random_state : int, default=None
Determines the randomness. Use an int to make the randomness deterministic.
engine : str, default=python
Engine to use for computing the model. Current options are 'octave' or 'python'.
verbose : bool, default=False
Verbosity mode.
clean_space : bool, default=True
If engine is 'octave' and clean_space is True, the session will be closed after fitting the model.
Attributes
----------
labels_ : array-like of shape (n_samples,)
Labels of each point in training data.
embedding_ : array-like of shape (n_samples, n_clusters)
Consensus clustering matrix to be used as input for the KMeans clustering step.
V_ : array-like of shape (n_clusters, n_clusters)
Commont latent feature matrix.
A_ : array-like of shape (n_clusters, n_clusters)
Learned anchors.
Z_ : array-like of shape (n_clusters, n_samples)
modality-specific anchor graph.
loss_ : array-like of shape (n_iter\_,)
Values of the loss function.
n_iter_ : int
Number of iterations.
References
----------
.. [#simcadcpaper] He, W.-J., Zhang, Z., & Wei, Y. (2023). Scalable incomplete multi-view clustering with adaptive
data completion. Information Sciences, 649, 119562. doi:10.1016/j.ins.2023.119562.
.. [#simcadccode] https://github.com/DarrenZZhang/INS23-SIMC_ADC
Example
--------
>>> import numpy as np
>>> import pandas as pd
>>> from imml.cluster import SIMCADC
>>> Xs = [pd.DataFrame(np.random.default_rng(42).random((20, 10))) for i in range(3)]
>>> estimator = SIMCADC(n_clusters = 2)
>>> labels = estimator.fit_predict(Xs)
"""
def __init__(self, n_clusters: int = 8, lambda_parameter: float = 1, n_anchors: int = None,
beta: float = 1, gamma: float = 1, eps: float = 1e-25, random_state:int = None,
engine: str ="python", verbose = False, clean_space: bool = True):
if not isinstance(n_clusters, int):
raise ValueError(f"Invalid n_clusters. It must be an int. A {type(n_clusters)} was passed.")
if n_clusters < 2:
raise ValueError(f"Invalid n_clusters. It must be an greater than 1. {n_clusters} was passed.")
engines_options = ["octave", "python"]
if engine not in engines_options:
raise ValueError(f"Invalid engine. Expected one of {engines_options}. {engine} was passed.")
if (engine == "octave") and (not octavemodule_installed):
raise ImportError(oct2py_module_error)
self.n_clusters = n_clusters
self.lambda_parameter = lambda_parameter
self.beta = beta
self.gamma = gamma
self.eps = eps
self.n_anchors = n_clusters if n_anchors is None else n_anchors
self.random_state = random_state
self.engine = engine
self.verbose = verbose
self.clean_space = clean_space
if self.engine == "octave":
octave_folder = dirname(__file__)
octave_folder = os.path.join(octave_folder, "_" + (os.path.basename(__file__).split(".")[0]))
self._octave_folder = octave_folder
octave_files = [x for x in os.listdir(octave_folder) if x.endswith(".m")]
self._oc = oct2py.Oct2Py(temp_dir= octave_folder)
for octave_file in octave_files:
with open(os.path.join(octave_folder, octave_file)) as f:
self._oc.eval(f.read())
[docs]
def fit(self, Xs, y=None):
r"""
Fit the transformer to the input data.
Parameters
----------
Xs : list of array-likes objects
- Xs length: n_mods
- Xs[i] shape: (n_samples, n_features_i)
A list of different modalities.
y : Ignored
Not used, present here for API consistency by convention.
Returns
-------
self : Fitted estimator.
"""
Xs = check_Xs_y(Xs, ensure_all_finite='allow-nan')
if self.engine=="octave":
if not isinstance(Xs[0], pd.DataFrame):
Xs = [pd.DataFrame(X) for X in Xs]
mean_mod_profile = [X.mean(axis=0).to_frame(X_id) for X_id, X in enumerate(select_complete_samples(Xs))]
incomplete_samples = get_missing_samples_by_mod(Xs=Xs, return_as_list=True)
mean_mod_profile = [pd.DataFrame(np.tile(means, len(incom))).values for means, incom in
zip(mean_mod_profile, incomplete_samples)]
imputer = MMTransformer(transformer=SimpleImputer(strategy="constant", fill_value=0))
transformed_Xs = imputer.fit_transform(Xs)
transformed_Xs, mean_mod_profile = tuple(transformed_Xs), tuple(mean_mod_profile)
w = [pd.DataFrame(np.eye(len(X)), index=X.index, columns=X.index) for X in Xs]
w = [eye.loc[samples,:].values for eye, samples in zip(w, incomplete_samples)]
w = tuple(w)
n_incomplete_samples_mod = list(len(incomplete_sample) for incomplete_sample in incomplete_samples)
# if self.random_state is not None:
# self._oc.rand('seed', self.random_state)
u,v,a,w,z,iter,obj = self._oc.SIMC(transformed_Xs, len(Xs[0]), self.lambda_parameter,
self.n_clusters, self.n_anchors, w, n_incomplete_samples_mod,
mean_mod_profile, self.beta, self.gamma, nout=7)
obj = obj[0]
if self.clean_space:
self._clean_space()
elif self.engine=="python":
if not isinstance(Xs[0], pd.DataFrame):
Xs = [pd.DataFrame(X) for X in Xs]
mean_mod_profile = [X.mean(axis=0).to_frame(X_id) for X_id, X in enumerate(select_complete_samples(Xs))]
incomplete_samples = get_missing_samples_by_mod(Xs=Xs, return_as_list=True)
mean_mod_profile = [pd.DataFrame(np.tile(means, len(incom))).values for means, incom in
zip(mean_mod_profile, incomplete_samples)]
imputer = MMTransformer(transformer=SimpleImputer(strategy="constant", fill_value=0))
transformed_Xs = imputer.fit_transform(Xs)
# transformed_Xs, mean_view_profile = tuple(transformed_Xs), tuple(mean_view_profile)
w = [pd.DataFrame(np.eye(len(X)), index=X.index, columns=X.index) for X in Xs]
w = [eye.loc[samples, :].values for eye, samples in zip(w, incomplete_samples)]
# w = tuple(w)
n_incomplete_samples_mod = list(len(incomplete_sample) for incomplete_sample in incomplete_samples)
if self.random_state is not None:
np.random.seed(self.random_state)
u, v, a, w, z, iter, obj = self._SIMC(transformed_Xs, len(Xs[0]), self.lambda_parameter,
self.n_clusters, self.n_anchors, w, n_incomplete_samples_mod,
mean_mod_profile, self.beta, self.gamma)
model = KMeans(n_clusters= self.n_clusters, n_init= "auto", random_state= self.random_state)
self.labels_ = model.fit_predict(X=u)
self.embedding_ = u
self.V_ = v
self.A_ = a
self.Z_, self.loss_, self.iter_ = z, obj, iter
self.n_iter_ = len(self.loss_)
return self
def _predict(self, Xs):
r"""
Return clustering results for samples.
Parameters
----------
Xs : list of array-likes objects
- Xs length: n_mods
- Xs[i] shape: (n_samples, n_features_i)
A list of different modalities.
Returns
-------
labels : ndarray of shape (n_samples,)
Index of the cluster each sample belongs to.
"""
return self.labels_
[docs]
def fit_predict(self, Xs, y=None):
r"""
Fit the model and return clustering results.
Convenience method; equivalent to calling fit(X) followed by predict(X).
Parameters
----------
Xs : list of array-likes objects
- Xs length: n_mods
- Xs[i] shape: (n_samples, n_features_i)
A list of different modalities.
Returns
-------
labels : ndarray of shape (n_samples,)
Index of the cluster each sample belongs to.
"""
labels = self.fit(Xs)._predict(Xs)
return labels
def _clean_space(self):
[os.remove(os.path.join(self._octave_folder, x)) for x in ["reader.mat", "writer.mat"]]
self._oc.exit()
del self._oc
return None
def _eproj_simplex_new(self, v, k=1):
r"""
Adjust the v variable if needed.
Parameters
----------
v: list of length (n_clusters)
k: int, default=1
Returns
-------
v0: list of length (n_clusters)
"""
ft = 1
n = len(v)
v0 = v - np.mean(v) + k / n
vmin = np.min(v0)
if vmin < 0:
lambda_m = 0
f = 1
while abs(f) > 1e-10:
v1 = v0 - lambda_m
posidx = v1 > 0
npos = np.sum(posidx)
g = -npos
f = np.sum(v1[posidx]) - k
lambda_m -= f / g
ft += 1
if ft > 100:
return np.maximum(v1, 0)
return np.maximum(v1, 0)
else:
# If no adjustment is needed, return the shifted vector
return v0
def _SIMC(self, Y, num_sample, lambda_parameter, n_clusters, n_anchors, N, Ne, E, beta, gamma):
r"""
Runs the SIMCADC algorithm.
Parameters
----------
Y: list of array-likes objects of length (n_mods)
- Y[i] shape: (n_samples, n_features_i)
num_sample: int
Number of samples
lambda_parameter : float, default=1
Balance the influence between anchor graph generation and alignment term.
n_clusters : int, default=8
The number of clusters to generate.
n_anchors : int, default=None
Number of anchors. If None, use n_clusters.
N: list of arrays-likes of length (n_mods)
- N[i] shape: (missing_view_columns, n_features_i)
Ne: list of arrays-likes of length (n_mods)
Number of missing modality columns
E: list of arrays-likes of length (n_mods)
- E[i] shape: (n_features_i, missing_view_columns)
beta: float, default=1.0
gamma: float, default=1.0
Returns
-------
UU: list of array-likes objects of shape (n_samples, n_clusters)
V: list of array-likes objects of shape (n_clusters, n_clusters)
A: list of array-likes objects of shape (n_clusters, n_anchors)
W: list of array-likes objects of length (n_view)
- W[i] shape: (n_features_i, n_clusters)
Z_final: list of array-likes objects of shape (n_anchors, n_samples)
iter: int
Number of iterations.
obj: List of floats of length (iter)
"""
# Initialize parameters
maxIter = 50
num_view = len(Y)
W = [None] * num_view
A = np.zeros(shape=(n_clusters, n_anchors))
Z_final = np.zeros(shape=(n_anchors, num_sample))
NNT = [None] * num_view
B = [None] * num_view
Z = [None] * num_view
R = [None] * num_view
# Preprocessing Y and initializing
for i in range(num_view):
Y[i] = np.nan_to_num(zscore(Y[i], ddof=1, axis=0).T, nan=0)
di = Y[i].shape[0]
W[i] = np.zeros(shape=(di, n_clusters))
NNT[i] = np.matmul(N[i], N[i].T)
B[i] = np.zeros(shape=(di, num_sample))
Z[i] = np.zeros(shape=(n_anchors, num_sample))
R[i] = np.eye(n_anchors)
Z_final[:, :n_anchors] = np.eye(n_anchors)
alpha = (np.ones(shape=(1, num_view)) / num_view)[0]
flag = 1
iter = 0
obj = []
while flag:
iter += 1
X = [Y[iv] + np.matmul(E[iv], N[iv]) for iv in range(num_view)]
X = [np.nan_to_num(i, nan=0) for i in X]
# Update W_i
for iv in range(num_view):
AZ = np.matmul(A, Z[iv])
C = np.matmul(X[iv], AZ.T)
U, _, Vt = svd(C, full_matrices=False)
Vt = Vt.T.conj()
W[iv] = np.matmul(U, Vt.T)
# Update A
part1 = sum(alpha[ia] ** 2 * np.matmul(W[ia].T, np.matmul(X[ia], Z[ia].T)) for ia in range(num_view))
Unew, _, Vnew = svd(part1, full_matrices=False)
Vnew = Vnew.T.conj()
A = np.matmul(Unew, Vnew.T)
# Update Z_i
for iv in range(num_view):
C1 = alpha[iv] ** 2 * np.matmul(X[iv].T, np.matmul(W[iv], A)) + gamma * np.matmul(Z_final.T, R[iv])
C2 = alpha[iv] ** 2 + gamma
C1 = C1.T
for ii in range(num_sample):
ut = C1[:, ii] / C2
Z[iv][:, ii] = self._eproj_simplex_new(ut)
# Update Z_final
C3 = sum(gamma * np.matmul(Z[iv].T, R[iv].T) for iv in range(num_view))
C3 = C3.T
C4 = num_view * gamma + 1
for ii in range(num_sample):
ut = C3[:, ii] / C4
Z_final[:, ii] = self._eproj_simplex_new(ut)
# Update E
for iv in range(num_view):
B[iv] = Y[iv] - np.matmul(W[iv], np.matmul(A, Z_final))
E[iv] = np.matmul(-B[iv], np.matmul(N[iv].T, np.linalg.inv(NNT[iv] + beta * np.ones((Ne[iv], Ne[iv])))))
# Update R
for iv in range(num_view):
Z_iv_final = np.matmul(Z[iv], Z_final.T)
U_R, _, V_R = svd(Z_iv_final, full_matrices=False)
V_R = V_R.T.conj()
R[iv] = np.matmul(U_R, V_R.T)
# Update alpha
for iv in range(num_view):
alpha[iv] = np.sqrt(1 / (np.linalg.norm(B[iv] + np.matmul(E[iv], N[iv]), 'fro') + np.finfo('float').eps))
# Objective calculation
term1 = sum(alpha[iv] ** 2 * np.linalg.norm(X[iv] - np.matmul(W[iv], np.matmul(A, Z_final)), 'fro') ** 2 for iv in range(num_view))
term2 = lambda_parameter * np.linalg.norm(Z_final, 'fro') ** 2
obj.append(term1 + term2)
if iter > 9 and (abs((obj[iter - 2] - obj[iter - 1]) / obj[iter - 2]) < 1e-3 or iter > maxIter or obj[
iter - 1] < 1e-10):
UU, _, V = svd(Z_final.T, full_matrices=False)
V = V.T.conj()
flag = 0
UU = UU / np.sqrt(np.sum(UU ** 2, axis=1, keepdims=True))
return UU, V, A, W, Z_final, iter, obj