# License: BSD-3-Clause
from typing import Union
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin
from scipy.cluster.hierarchy import cophenet, linkage
from scipy.spatial.distance import pdist
import numpy as np
from ..utils import check_Xs
from ._sumo.utils import extract_ncut
from ._sumo.network import MultiplexNet
from ._sumo.modes.run.solvers.unsupervised_sumo import UnsupervisedSumoNMF
from ._sumo.modes.prepare.similarity import feature_to_adjacency
[docs]
class SUMO(BaseEstimator, ClassifierMixin):
r"""
Subtyping Tool for Multi-Omic Data (SUMO). [#sumopaper1]_ [#sumocode]_
SUMO, originally designed for molecular subtyping in multi-omics datasets, utilizes a state-of-the-art
nonnegative matrix factorization (NMF) algorithm to identify clusters of samples with similar characteristics.
The authors strongly suggest removing features and samples with a large fraction of missing values (>10%); log
transform or a variant stabilizing transform when using count data as input; and standardize each input feature. For
more information, read the sumo's documentation: https://python-sumo.readthedocs.io/en/latest/index.html.
Parameters
----------
n_clusters : int, default=8
The number of clusters to generate. If it is not provided, it will use the default one from the algorithm.
method : str or list of str, default='euclidean'
either one method of sample-sample similarity calculation, or list of methods for every modality (available
methods: ['euclidean', 'cosine', 'pearson', 'spearman']).
missing : float or list of float, default=[0.1]
acceptable fraction of available values for assessment of distance/similarity between pairs of samples - either
one value or list for every modality.
neighbours : float, default=0.1
fraction of nearest neighbours to use for sample similarity calculation using Euclidean distance
similarity.
alpha : float, default=0.5
hypherparameter of RBF similarity kernel, for Euclidean distance similarity.
sparsity : float or list of float, default=[0.1]
either one value or list of sparsity penalty values for H matrix (sumo will try different values and select
the best results).
repetitions : int, default=60
Number of repetitions.
cluster_method : str, default="max_value"
Method of cluster extraction. Options are 'max_value' or'spectral'.
max_iter : int, default=500
Maximum number of iterations for factorization.
tol : float, default=1e-5
If objective cost function value fluctuation is smaller than this value, stop iterations before
reaching max_iter.
subsample : float, default=0.05
Fraction of samples randomly removed from each run, cannot be greater than 0.5.
calc_cost : int, default=20
Number of steps between every calculation of objective cost function.
h_init : int, default=None
index of adjacency matrix to use for H matrix initialization (by default using average adjacency).
rep : int, default=5
number of times consensus matrix is created for the purpose of assessing clustering quality.
random_state : int, default=None
Determines the randomness. Use an int to make the randomness deterministic.
verbose : bool, default=False
Verbosity mode.
n_jobs : int, default=1
Number of threads to run in parallel.
Attributes
----------
labels_ : array-like of shape (n_samples,)
Labels of each point in training data.
embedding_ : array-like of shape (n_samples, n_clusters)
The final spectral representation of the data to be used as input for the KMeans clustering step.
graph_ : MultiplexNet
Multi-modal graph.
nmf_ : UnsupervisedSumoNMF
The nonnegative matrix factorization (NMF) object.
similarity_ : dict of length n_mods, with mods as keys and an array-like of shape (n_samples,n_samples) as values.
List of adjacency matrix.
cophenet_list_ : ndarray of shape (rep,).
Object created by SUMO
pac_list_ : ndarray of shape (rep,).
Object created by SUMO
References
----------
.. [#sumopaper1] Sienkiewicz, K., Chen, J., Chatrath, A., Lawson, J. T., Sheffield, N. C., Zhang, L., & Ratan, A.
(2022). Detecting molecular subtypes from multi-omics datasets using SUMO. In Cell Reports Methods
(Vol. 2, Issue 1, p. 100152). Elsevier BV. https://doi.org/10.1016/j.crmeth.2021.100152
.. [#sumocode] https://github.com/ratan-lab/sumo/tree/master
Example
--------
>>> import numpy as np
>>> import pandas as pd
>>> from imml.cluster import SUMO
>>> Xs = [pd.DataFrame(np.random.default_rng(42).random((20, 10))) for i in range(3)]
>>> estimator = SUMO(n_clusters = 2)
>>> labels = estimator.fit_predict(Xs)
"""
def __init__(self, n_clusters: int = 8, method: Union[str, list] = None, missing: list = None,
neighbours: float = 0.1, alpha: float = 0.5, sparsity: list = None, repetitions: int = 60,
cluster_method: str = "max_value", max_iter: int = 500, tol: float = 1e-5, subsample: float = 0.05,
calc_cost: int = 20, h_init: int = None, rep: int = 5, random_state: int = None,
verbose: bool = False, n_jobs: int = 1):
if not isinstance(n_clusters, int):
raise ValueError(f"Invalid n_clusters. It must be an int. A {type(n_clusters)} was passed.")
if n_clusters < 2:
raise ValueError(f"Invalid n_clusters. It must be an greater than 1. {n_clusters} was passed.")
if random_state is None:
random_state = int(np.random.default_rng().integers(10000))
if method is None:
method = ['euclidean']
method_option = ['euclidean', 'cosine', 'pearson', 'spearman']
if isinstance(method, str):
if method not in method_option:
msg = f"Invalid method. Expected one of: {method_option}."
raise ValueError(msg)
elif isinstance(method, list):
for method_i in method:
if method_i not in method_option:
msg = f"Invalid method. Expected one of: {method_option}."
raise ValueError(msg)
if missing is None:
missing = [0.1]
if sparsity is None:
sparsity = [0.1]
if repetitions < 1:
msg = "Incorrect repetitions. It must be repetitions > 0."
raise ValueError(msg)
if subsample > 0.5 or subsample < 0:
msg = "Incorrect subsample. It must be 0 < subsample < 0.5."
raise ValueError(msg)
if rep < 1:
msg = "Incorrect rep. It must be rep > 1."
raise ValueError(msg)
if n_jobs < 1:
msg = "Incorrect n_jobs. It must be n_jobs > 0."
raise ValueError(msg)
self.method = method
self.missing = missing
self.neighbours = neighbours
self.alpha = alpha
self.n_clusters = n_clusters
self.sparsity = sparsity
self.repetitions = repetitions
self.cluster_method = cluster_method
self.max_iter = max_iter
self.tol = tol
self.subsample = subsample
self.calc_cost = calc_cost
self.h_init = h_init
self.rep = rep
self.random_state = random_state
self.verbose = verbose
self.n_jobs = n_jobs
self.runs_per_con = max(round(repetitions * 0.8), 1) # number of runs per consensus matrix creation
[docs]
def fit(self, Xs, y=None):
r"""
Fit the transformer to the input data.
Parameters
----------
Xs : list of array-likes objects
- Xs length: n_mods
- Xs[i] shape: (n_samples, n_features_i)
A list of different modalities.
y : Ignored
Not used, present here for API consistency by convention.
Returns
-------
self : returns an instance of self.
"""
Xs = check_Xs(Xs, ensure_all_finite='allow-nan')
if not isinstance(Xs[0], pd.DataFrame):
Xs = [pd.DataFrame(X) for X in Xs]
if len(self.missing) == 1:
if self.verbose:
print(f"#Setting all 'missing' parameters to {self.missing[0]}")
self.missing = [self.missing[0]] * len(Xs)
if len(self.method) == 1:
self.method = [self.method[0]] * len(Xs)
elif len(Xs) != len(self.method):
raise ValueError("len(Xs) and number of similarity methods does not correspond.")
self.graph_ = None
self.nmf_ = None
all_samples = Xs[0].index
if self.verbose:
print(f"Total number of unique samples: {len(all_samples)}")
self.similarity_ = {}
adj_matrices = []
# create adjacency matrices
for X_idx, X in enumerate(Xs):
if self.verbose:
print(f"#Modality: {X_idx}")
print(f"Feature matrix: ({X.shape[0]} samples x {X.shape[1]} features)")
# create adjacency matrix
a = feature_to_adjacency(X.values, missing=self.missing[X_idx], method=self.method[X_idx],
n=self.neighbours, alpha=self.alpha)
if self.verbose:
print(f"Adjacency matrix: ({a.shape} created [similarity method: {self.method[X_idx]}")
# add matrices to output arrays
adj_matrices.append(a)
self.similarity_[str(X_idx)] = a
##################################################################
if self.h_init is not None:
if self.h_init >= len(adj_matrices) or self.h_init < 0:
raise ValueError("Incorrect h_init.")
# create multilayer graph
self.graph_ = MultiplexNet(adj_matrices=adj_matrices, node_labels=all_samples)
n_sub_samples = round(all_samples.size * self.subsample)
if self.verbose:
print(f"#Number of samples randomly removed in each run: {n_sub_samples} out of {all_samples.size}")
# create solver
self.nmf_ = UnsupervisedSumoNMF(graph=self.graph_, nbins=self.repetitions,
bin_size=self.graph_.nodes - n_sub_samples,
random_state=self.random_state)
global _sumo_run
_sumo_run = self # this solves multiprocessing issue with pickling
results = [SUMO._run_factorization(sparsity=sparsity, k=self.n_clusters,
sumo_run=_sumo_run, verbose=self.verbose,
random_state=self.random_state)
for sparsity in self.sparsity]
sparsity_order = self.sparsity
# select best result
best_result = sorted(results, reverse=True)[0]
best_eta = None
quality_output = []
for (result, sparsity) in zip(results, sparsity_order):
if self.verbose:
print(f"#Clustering quality (eta={sparsity}): {result[0]}")
quality_output.append(np.array([sparsity, result[0]]))
if result[1] == best_result[1]:
best_eta = sparsity
# summarize results
assert best_eta is not None
out_arrays = best_result[1]
self.cophenet_list_ = out_arrays["cophenet"]
self.pac_list_ = out_arrays["pac"]
self.embedding_ = out_arrays["embedding"]
self.labels_ = out_arrays["clusters"][:,1].astype(int)
return self
def _predict(self, Xs):
r"""
Return clustering results for samples.
Parameters
----------
Xs : list of array-likes objects
- Xs length: n_mods
- Xs[i] shape: (n_samples, n_features_i)
A list of different modalities.
Returns
-------
labels : list of array-likes objects, shape (n_samples,)
The predicted data.
"""
labels = self.labels_
return labels
[docs]
def fit_predict(self, Xs, y=None):
r"""
Fit the model and return clustering results.
Convenience method; equivalent to calling fit(X) followed by predict(X).
Parameters
----------
Xs : list of array-likes objects
- Xs length: n_mods
- Xs[i] shape: (n_samples, n_features_i)
A list of different modalities.
y : Ignored
Not used, present here for API consistency by convention.
Returns
-------
labels : ndarray, shape (n_samples,)
The predicted data.
"""
labels = self.fit(Xs)._predict(Xs)
return labels
@staticmethod
def _run_factorization(sparsity: float, k: int, sumo_run, verbose: bool, random_state):
""" Run factorization for set sparsity and number of clusters
Args:
sparsity (float): value of sparsity penalty
k (int): number of clusters
sumo_run: SumoRun object
Returns:
quality (float): assessed quality of cluster structure
outfile (str): path to .npz output file with results of factorization
"""
# run factorization N times
results = []
for repeat in range(sumo_run.repetitions):
if verbose:
print(f"#Runing NMF algorithm with sparsity {sparsity} (N={repeat + 1})")
opt_args = {
"sparsity_penalty": sparsity,
"k": k,
"max_iter": sumo_run.max_iter,
"tol": sumo_run.tol,
"calc_cost": sumo_run.calc_cost,
"bin_id": repeat,
"h_init": sumo_run.h_init
}
result = sumo_run.nmf_.factorize(**opt_args)
# extract computed clusters
if verbose:
print(f"#Using {sumo_run.cluster_method} for cluster labels extraction)")
result.extract_clusters(method=sumo_run.cluster_method, random_state=sumo_run.random_state, verbose=verbose)
results.append(result)
# consensus graph
assert len(results) > 0
all_REs = [] # residual errors
for run_idx in range(sumo_run.repetitions):
all_REs.append(results[run_idx].RE)
out_arrays = {'pac': np.array([]), 'cophenet': np.array([])}
minRE, maxRE = min(all_REs), max(all_REs)
for rep in range(sumo_run.rep):
run_indices = list(np.random.default_rng(sumo_run.random_state).choice(range(len(results)),
sumo_run.runs_per_con,
replace=False))
consensus = np.zeros((sumo_run.graph_.nodes, sumo_run.graph_.nodes))
weights = np.empty((sumo_run.graph_.nodes, sumo_run.graph_.nodes))
weights[:] = np.nan
all_equal = np.allclose(minRE, maxRE)
for run_idx in run_indices:
weight = np.empty((sumo_run.graph_.nodes, sumo_run.graph_.nodes))
weight[:] = np.nan
sample_ids = results[run_idx].sample_ids
if all_equal:
weight[sample_ids, sample_ids[:, None]] = 1.
else:
weight[sample_ids, sample_ids[:, None]] = (maxRE - results[run_idx].RE) / (maxRE - minRE)
weights = np.nansum(np.stack((weights, weight)), axis=0)
consensus_run = np.nanprod(np.stack((results[run_idx].connectivity, weight)), axis=0)
consensus = np.nansum(np.stack((consensus, consensus_run)), axis=0)
if verbose:
print(f"#Creating consensus graphs [{rep + 1} out of {sumo_run.rep}]")
assert not np.any(np.isnan(consensus))
consensus = consensus / weights
org_con = consensus.copy()
consensus[consensus < 0.5] = 0
# calculate cophenetic correlation coefficient
dist = pdist(org_con, metric="correlation")
if np.any(np.isnan(dist)):
ccc = np.nan
if verbose:
print(
"Cannot calculate cophenetic correlation coefficient! Please inspect values in your consensus matrix")
else:
ccc = cophenet(linkage(dist, method="complete", metric="correlation"), dist)[0]
# calculate proportion of ambiguous clustering
den = (sumo_run.graph_.nodes ** 2) - sumo_run.graph_.nodes
num = org_con[(org_con > 0.1) & (org_con < 0.9)].size
pac = num * (1. / den)
out_arrays.update({'pac': np.append(out_arrays['pac'], pac),
'cophenet': np.append(out_arrays['cophenet'], ccc)})
if verbose:
print("#Extracting final clustering result, using normalized cut")
consensus_labels, embeddings = extract_ncut(consensus, k=k, random_state=random_state)
cluster_array = np.empty((sumo_run.graph_.sample_names.shape[0], 2), dtype=object)
cluster_array[:, 0] = sumo_run.graph_.sample_names
cluster_array[:, 1] = consensus_labels
clusters_dict = {num: sumo_run.graph_.sample_names[list(np.where(consensus_labels == num)[0])] for num in
np.unique(consensus_labels)}
for cluster_idx in sorted(clusters_dict.keys()):
if verbose:
print(
f"Cluster {cluster_idx} ({len(clusters_dict[cluster_idx])} samples): \n{clusters_dict[cluster_idx]}")
# calculate quality of clustering for given sparsity
quality = sumo_run.graph_.get_clustering_quality(labels=cluster_array[:, 1])
# create output file
conf_array = np.empty((9, 2), dtype=object)
conf_array[:, 0] = ['method', 'n', 'max_iter', 'tol', 'subsample', 'calc_cost', 'h_init', 'seed', 'sparsity']
conf_array[:, 1] = [sumo_run.cluster_method, sumo_run.repetitions, sumo_run.max_iter, sumo_run.tol,
sumo_run.subsample,
sumo_run.calc_cost, np.nan if sumo_run.h_init is None else sumo_run.h_init,
np.nan if sumo_run.random_state is None else sumo_run.random_state, sparsity]
out_arrays.update({
"clusters": cluster_array,
"consensus": consensus,
"unfiltered_consensus": org_con,
"quality": np.array(quality),
"samples": sumo_run.graph_.sample_names,
"embedding": embeddings,
"config": conf_array
})
steps_reached = [results[i].steps for i in range(len(results))]
maxiter_proc = round((sum([step == sumo_run.max_iter for step in steps_reached]) / len(steps_reached)) * 100, 3)
if verbose:
print(f"#Reached maximum number of iterations in {maxiter_proc}% of runs")
if maxiter_proc >= 90:
if verbose:
print(f"Consider increasing -max_iter and decreasing -tol to achieve better accuracy")
out_arrays['steps'] = np.array([steps_reached])
return quality, out_arrays