Source code for imml.decomposition.dfmf

from typing import Union
import pandas as pd
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.utils.validation import _generate_get_feature_names_out

from ..utils import check_Xs
from ._skfusion import fusion


class DFMF(TransformerMixin, BaseEstimator):
    r"""
    Data Fusion by Matrix Factorization (DFMF). [#dfmfpaper]_ [#dfmfcode]_

    DFMF is a data fusion approach with penalized matrix tri-factorization that simultaneously
    factorizes data matrices to reveal hidden associations. This method can deal with both block- and
    single-wise missing.

    Parameters
    ----------
    n_components : int, default=10
        Number of components to keep. Must be greater than or equal to 1.
    max_iter : int, default=100
        Maximum number of iterations to perform.
    init_type : str or list of str, default='random_c'
        The algorithm to initialize latent matrix factors. Options are 'random', 'random_c' and
        'random_vcol'. It can be a list, each item being for fit and transform, respectively. A single
        string (or a one-item list) is used for both fit and transform.
    n_run : int, default=1
        Number of runs of the factorization (forwarded as ``n_run`` to the underlying fusion models).
    stopping : tuple (target_matrix, eps), default=None
        Terminate iteration if the reconstruction error of target matrix improves by less than eps.
    stopping_system : float, default=None
        Terminate iteration if the reconstruction error of the fused system improves by less than eps.
        compute_err is to True to compute the error of the fused system.
    compute_err : bool, default=False
        Compute the reconstruction error of every relation matrix if True.
    callback : callable, default=None
        An optional user-supplied function to call after each iteration. Called as
        callback(G, S, cur_iter), where S and G are current latent estimates.
    fill_value : float, default=0
        Value to use to initially fill missing values.
    random_state : int, default=None
        Determines the randomness. Use an int to make the randomness deterministic.
    verbose : bool or int, default=0
        Verbosity mode.
    n_jobs : int, default=1
        Number of jobs to run in parallel. None means 1 unless in a joblib.parallel_backend context.
        -1 means using all processors.

    Attributes
    ----------
    fuser_ : Dfmf object
        Model.
    transformer_ : DfmfTransform object
        Object for transforming unseen data.
    t_ : fusion.ObjectType
        Object type ('Type 0') shared by every relation; its rank is ``n_components``.
    ts_ : list of fusion.ObjectType
        One object type per modality ('Type 1', 'Type 2', ...), created during ``fit``.
    transform_ : str or None
        Output container selected during ``fit``: "pandas" if the first modality was a DataFrame,
        "numpy" otherwise. None before fitting.

    References
    ----------
    .. [#dfmfpaper] M. Žitnik and B. Zupan, "Data Fusion by Matrix Factorization," in IEEE Transactions on
                    Pattern Analysis and Machine Intelligence, vol. 37, no. 1, pp. 41-53, 1 Jan. 2015,
                    doi: 10.1109/TPAMI.2014.2343973.
    .. [#dfmfcode] https://github.com/mims-harvard/scikit-fusion/tree/master

    Example
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from imml.decomposition import DFMF
    >>> Xs = [pd.DataFrame(np.random.default_rng(42).random((20, 10))) for i in range(3)]
    >>> transformer = DFMF(n_components = 5)
    >>> transformed_Xs = transformer.fit_transform(Xs)
    """

    def __init__(self, n_components: int = 10, max_iter: int = 100, init_type: Union[str, list] = 'random_c',
                 n_run: int = 1, stopping=None, stopping_system=None, verbose=0, compute_err=False,
                 callback=None, random_state: int = None, n_jobs=1, fill_value=0):
        if isinstance(init_type, str):
            init_type = [init_type, init_type]
        elif len(init_type) == 1:
            # Build a new sequence rather than using ``init_type *= 2``: the in-place form
            # would silently extend the list object owned by the caller.
            init_type = init_type * 2
        if n_components < 1:
            raise ValueError("Invalid n_components. It must be greater than or equal to 1.")

        self.n_components = n_components
        self.callback = callback
        self.max_iter = max_iter
        self.init_type = init_type
        self.n_run = n_run
        self.stopping = stopping
        self.stopping_system = stopping_system
        self.verbose = verbose
        self.compute_err = compute_err
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.fill_value = fill_value

        # Options shared by the model used in ``fit`` and the one used in ``transform``.
        shared_kwargs = dict(max_iter=max_iter, n_run=n_run, stopping=stopping,
                             stopping_system=stopping_system, verbose=verbose, compute_err=compute_err,
                             callback=callback, random_state=random_state, n_jobs=n_jobs)
        # First init_type entry drives fitting, second one drives the projection of new data.
        self.fuser_ = fusion.Dfmf(init_type=init_type[0], **shared_kwargs)
        self.transformer_ = fusion.DfmfTransform(init_type=init_type[1], fill_value=fill_value,
                                                 **shared_kwargs)
        self.t_ = fusion.ObjectType('Type 0', n_components)
        self.transform_ = None

    def fit(self, Xs, y=None):
        r"""
        Fit the transformer to the input data.

        Parameters
        ----------
        Xs : list of array-likes objects
            - Xs length: n_mods
            - Xs[i] shape: (n_samples, n_features_i)

            A list of different modalities.
        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self :  returns an instance of self.
        """
        Xs = check_Xs(Xs, ensure_all_finite='allow-nan')
        # Remember the input container so ``transform`` can return the matching one.
        if not isinstance(Xs[0], pd.DataFrame):
            self.transform_ = "numpy"
            Xs = [pd.DataFrame(X) for X in Xs]
        else:
            self.transform_ = "pandas"

        # One object type per modality; each modality is a relation between ``t_`` and its own type.
        self.ts_ = [fusion.ObjectType(f'Type {i + 1}', self.n_components) for i in range(len(Xs))]
        relations = [fusion.Relation(X.values, self.t_, t) for X, t in zip(Xs, self.ts_)]
        fusion_graph = fusion.FusionGraph(relations)
        self.fuser_.fuse(fusion_graph)
        return self

    def transform(self, Xs):
        r"""
        Project data into the learned space.

        Parameters
        ----------
        Xs : list of array-likes objects
            - Xs length: n_mods
            - Xs[i] shape: (n_samples, n_features_i)

            A list of different modalities.

        Returns
        -------
        transformed_X : array-like of shape (n_samples, n_components)
            The projected data. A pandas DataFrame (indexed like the first modality) when the
            transformer was fitted on DataFrames, otherwise the raw factor returned by the fusion model.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        """
        # Imported locally so this block does not depend on changes to the module-level imports.
        from sklearn.utils.validation import check_is_fitted

        # ``ts_`` only exists after ``fit``; fail with an explicit message instead of a bare
        # AttributeError (NotFittedError is a subclass of both AttributeError and ValueError).
        check_is_fitted(self, attributes="ts_")

        Xs = check_Xs(Xs, ensure_all_finite='allow-nan')
        if not isinstance(Xs[0], pd.DataFrame):
            Xs = [pd.DataFrame(X) for X in Xs]

        relations = [fusion.Relation(X.values, self.t_, t) for X, t in zip(Xs, self.ts_)]
        fusion_graph = fusion.FusionGraph(relations)
        transformed_X = self.transformer_.transform(self.t_, fusion_graph, self.fuser_).factor(self.t_)

        if self.transform_ == "pandas":
            transformed_X = pd.DataFrame(transformed_X, index=Xs[0].index)
        return transformed_X