Source code for imml.feature_selection.jnmf_feature_selection

# License: BSD-3-Clause

import pandas as pd
import numpy as np

from ..decomposition import JNMF


class JNMFFeatureSelector(JNMF):
    r"""
    Feature selection for multi-modal datasets using the Joint Non-negative
    Matrix Factorization (JNMF) method. [#jnmfpaper1]_ [#jnmfpaper2]_
    [#jnmfpaper3]_ [#jnmfpaper4]_ [#jnmfpaper5]_ [#jnmfpaper6]_ [#jnmfcode1]_
    [#jnmfcode2]_

    This class extends the functionality of the `JNMF` method to perform
    feature selection across multiple modalities or blocks of data. The
    selected features are those with the highest contributions to the derived
    components from JNMF. This feature selection can be based on either the
    largest contribution for each component, the maximum overall contribution,
    or the average contribution across all components.

    Parameters
    ----------
    select_by : str, default="component"
        Criterion used to select features. Must be one of
        ["component", "max", "average"]:

        - "component": Selects the feature with the largest contribution for
          each component.
        - "max": Selects the features with the largest overall contribution.
        - "average": Selects the features with the highest average
          contribution across all components.

    f_per_component : int, default=1
        Number of features to select per component.

        - If `select_by="component"`, this controls how many features are
          selected for each component.
        - If `select_by="max"`, the top `n_components` * `f_per_component`
          features across all components are selected.
        - If `select_by="average"`, the `n_components` * `f_per_component`
          features with the highest average contribution across all
          components are selected.

    kwargs : dict
        Arguments passed to the `JNMF` method.

    Attributes
    ----------
    selected_features_ : list of shape (n_components * f_per_component,)
        Selected features, in selection order. For pandas input these are
        column labels. For numpy input these are integer positions in the
        column-wise concatenation of all modalities (modality 0 first).
    weights_ : list of float of shape (n_components * f_per_component,)
        The importance or contribution scores of the selected features in
        absolute values. These scores reflect how strongly each feature
        contributes to the components derived from JNMF.
    component_ : list of int
        Component index that produced each selected feature. Only set when
        `select_by="component"`.

    Raises
    ------
    ValueError
        If `select_by` is not one of the supported options.

    References
    ----------
    .. [#jnmfpaper1] Tsuyuzaki et al., (2023). nnTensor: An R package for
                     non-negative matrix/tensor decomposition. Journal of Open
                     Source Software, 8(84), 5015,
                     https://doi.org/10.21105/joss.05015
    .. [#jnmfpaper2] Liviu Badea, (2008) Extracting Gene Expression Profiles
                     Common to Colon and Pancreatic Adenocarcinoma using
                     Simultaneous nonnegative matrix factorization. Pacific
                     Symposium on Biocomputing 13:279-290.
    .. [#jnmfpaper3] Shihua Zhang, et al. (2012) Discovery of
                     multi-dimensional modules by integrative analysis of
                     cancer genomic data. Nucleic Acids Research 40(19),
                     9379-9391.
    .. [#jnmfpaper4] Zi Yang, et al. (2016) A non-negative matrix
                     factorization method for detecting modules in
                     heterogeneous omics multi-modal data, Bioinformatics
                     32(1), 1-8.
    .. [#jnmfpaper5] Y. Kenan Yilmaz et al., (2010) Probabilistic Latent
                     Tensor Factorization, International Conference on Latent
                     Variable Analysis and Signal Separation 346-353.
    .. [#jnmfpaper6] N. Fujita et al., (2018) Biomarker discovery by
                     integrated joint non-negative matrix factorization and
                     pathway signature analyses, Scientific Reports.
    .. [#jnmfcode1] https://rdrr.io/cran/nnTensor/man/JNMF.html
    .. [#jnmfcode2] https://github.com/rikenbit/nnTensor

    Example
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from imml.feature_selection import JNMFFeatureSelector
    >>> Xs = [pd.DataFrame(np.random.default_rng(42).uniform(size=(20, 10))) for i in range(3)]
    >>> transformer = JNMFFeatureSelector(n_components=5)
    >>> transformed_Xs = transformer.fit_transform(Xs)
    """

    def __init__(self, select_by: str = "component", f_per_component: int = 1, **kwargs):
        select_by_options = ["max", "component", "average"]
        if select_by not in select_by_options:
            raise ValueError(
                f"Invalid select_by. Expected one of {select_by_options}. "
                f"{select_by} was passed."
            )

        super().__init__(**kwargs)
        self.select_by = select_by
        self.f_per_component = f_per_component

    @staticmethod
    def _pick_top(scores, n_features):
        """Greedily pick the `n_features` highest-scoring labels of a Series.

        After each pick every entry carrying the chosen label is dropped, so
        a label is returned at most once even if the index has duplicates.
        Returns a dict mapping label -> score in selection order.
        """
        picked = {}
        for _ in range(n_features):
            feature = scores.idxmax()
            picked[feature] = scores.max()
            scores = scores.drop(labels=feature)
        return picked

    def fit(self, Xs, y=None):
        r"""
        Fit JNMF and select the most contributing features.

        Parameters
        ----------
        Xs : list of array-likes objects
            - Xs length: n_mods
            - Xs[i] shape: (n_samples, n_features_i)

            A list of different modalities. Each modality must be a
            pandas DataFrame or a numpy array.
        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : returns an instance of self.

        Raises
        ------
        TypeError
            If the modalities are neither pandas DataFrames nor numpy arrays.
        """
        super().fit(Xs)

        # Stack the per-modality loadings into one (features x components)
        # table whose row labels identify the features.
        if isinstance(Xs[0], pd.DataFrame):
            loadings = pd.concat(
                [pd.DataFrame(h, index=X.columns) for h, X in zip(self.H_, Xs)],
                axis=0,
            )
        elif isinstance(Xs[0], np.ndarray):
            # ignore_index gives every feature a unique global position;
            # without it each modality restarts at 0 and the labels collide,
            # which breaks both the selection and `transform`.
            loadings = pd.concat(
                [pd.DataFrame(h) for h in self.H_], axis=0, ignore_index=True
            )
        else:
            raise TypeError(
                "Xs must be a list of pandas DataFrames or numpy arrays. "
                f"{type(Xs[0]).__name__} was passed."
            )
        loadings.columns = range(loadings.columns.size)
        loadings = loadings.abs()

        n_selected = self.n_components * self.f_per_component
        selected_features = {}
        if self.select_by == "component":
            components = []
            for _ in range(self.f_per_component):
                # Visit components from the strongest to the weakest peak
                # loading among the features that are still available.
                order = loadings.max().sort_values(ascending=False).index
                loadings = loadings.loc[:, order]
                for col in order:
                    components.append(col)
                    component = loadings[col]
                    feature = component.idxmax()
                    selected_features[feature] = component.max()
                    # A feature can be selected only once.
                    loadings = loadings.drop(labels=feature)
            self.component_ = components
        elif self.select_by == "average":
            selected_features = self._pick_top(loadings.mean(axis=1), n_selected)
        elif self.select_by == "max":
            # A feature's best loading over all components decides its rank;
            # this picks the same features as ranking every single loading
            # and dropping a feature once it has been chosen.
            selected_features = self._pick_top(loadings.max(axis=1), n_selected)

        self.selected_features_ = list(selected_features.keys())
        self.weights_ = list(selected_features.values())
        return self

    def transform(self, Xs):
        r"""
        Reduce each modality to its selected features.

        Parameters
        ----------
        Xs : list of array-likes objects
            - Xs length: n_mods
            - Xs[i] shape: (n_samples, n_features_i)

            A list of different modalities. Each modality must be a
            pandas DataFrame or a numpy array.

        Returns
        -------
        transformed_Xs : list of array-likes objects
            - transformed_Xs length: n_mods
            - transformed_Xs[i] shape: (n_samples, n_selected_features_i)

            Each modality restricted to the selected features that belong to
            it. DataFrame columns keep their original order; numpy columns
            follow the selection order.

        Raises
        ------
        TypeError
            If the modalities are neither pandas DataFrames nor numpy arrays.
        """
        if isinstance(Xs[0], pd.DataFrame):
            return [X.iloc[:, X.columns.isin(self.selected_features_)] for X in Xs]

        if isinstance(Xs[0], np.ndarray):
            # dtype=int keeps an empty selection usable as an index array.
            selected_features = np.asarray(self.selected_features_, dtype=int)
            transformed_Xs = []
            offset = 0
            for X in Xs:
                n_features = X.shape[1]
                in_modality = (selected_features >= offset) & (
                    selected_features < offset + n_features
                )
                # Convert global positions into this modality's own columns.
                transformed_Xs.append(X[:, selected_features[in_modality] - offset])
                offset += n_features
            return transformed_Xs

        raise TypeError(
            "Xs must be a list of pandas DataFrames or numpy arrays. "
            f"{type(Xs[0]).__name__} was passed."
        )

    def fit_transform(self, Xs, y=None, **fit_params):
        r"""
        Fit to data, then transform it.

        Parameters
        ----------
        Xs : list of array-likes objects
            - Xs length: n_mods
            - Xs[i] shape: (n_samples, n_features_i)

            A list of different modalities.
        y : Ignored
            Not used, present here for API consistency by convention.
        fit_params : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        transformed_X : list of array-likes objects
            - transformed_X length: n_mods
            - transformed_X[i] shape: (n_samples, n_selected_features_i)

            Each modality restricted to its selected features.
        """
        return self.fit(Xs).transform(Xs)