# Source code for imml.decomposition.mofa

# License: BSD-3-Clause

import os
import contextlib
import tempfile
import pandas as pd
import numpy as np
from sklearn.base import TransformerMixin, BaseEstimator

from ._mofa.run.entry_point import entry_point
from ._mofa._mofax import core as mfx
from ._mofa.core._BayesNet import BayesNet, StochasticBayesNet, _ModifiedBayesNet, _ModifiedStochasticBayesNet
from ..utils import check_Xs



# [docs]
class MOFA(TransformerMixin, BaseEstimator):
    r"""
    Multi-Omics Factor Analysis (MOFA). [#mofapaper1]_ [#mofapaper2]_ [#mofacode]_

    MOFA is a factor analysis model that provides a general framework for the unsupervised integration of
    incomplete multi-modal datasets (originally, multi-omic data sets). Intuitively, MOFA can be viewed as a
    versatile and statistically rigorous generalization of principal component analysis to multi-modal data. Given
    several data matrices with measurements of multiple data types on the same or on overlapping sets of
    samples, MOFA infers an interpretable low-dimensional representation in terms of a few latent factors.

    It can deal with both modality-wise and feature-wise missing data.

    Parameters
    ----------
    n_components : int, default=10
        Number of components to keep.
    impute : bool, default=True
        True if missing values should be imputed.
    data_options : dict, default=None
        Data processing options, such as scale_views and scale_groups.
    data_matrix : dict, default=None
        Keys such as likelihoods, view_names, etc.
    model_options : dict, default=None
        Model options, such as ard_factors or ard_weights.
    train_options : dict, default=None
        Keys such as iter, tolerance.
    stochastic_options : dict, default=None
        Stochastic variational inference options, such as learning rate or batch size.
    covariates : dict, default=None
        Slot to store sample covariate for training in MEFISTO. Keys are sample_cov and covariates_names.
    smooth_options : dict, default=None
        Options for smooth inference, such as scale_cov or model_groups.
    random_state : int, default=None
        Determines the randomness. Use an int to make the randomness deterministic.
    verbose : bool, default=False
        Verbosity mode.

    Attributes
    ----------
    mofa_ : mofa object
        Entry point as the original library. This can be used for data analysis and explainability.
    factors_: array-like of shape (n_samples, n_components)
        Factors computed by the model.
    weights_: list of n_mods array-likes objects of shape (n_features_i, n_components)
        Weights of the MOFA model.
    transform_ : {"pandas", "numpy"} or None
        Output container chosen in ``fit`` from the type of the first modality. None before fitting.

    References
    ----------
    .. [#mofapaper1] Argelaguet R, Velten B, Arnol D, Dietrich S, Zenz T, Marioni JC, Buettner F, Huber W, Stegle O
                    (2018). “Multi‐Omics Factor Analysis—a framework for unsupervised integration of multi‐omics data
                    sets.” Molecular Systems Biology, 14. doi:10.15252/msb.20178124.
    .. [#mofapaper2] Argelaguet R, Arnol D, Bredikhin D, Deloro Y, Velten B, Marioni JC, Stegle O (2020). “MOFA+: a
                     statistical framework for comprehensive integration of multi-modal single-cell data.” Genome
                     Biology, 21. doi:10.1186/s13059-020-02015-1.
    .. [#mofacode] https://biofam.github.io/MOFA2/index.html

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from imml.decomposition import MOFA
    >>> Xs = [pd.DataFrame(np.random.default_rng(42).random((20, 10))) for i in range(3)]
    >>> transformer = MOFA(n_components = 5)
    >>> transformed_Xs = transformer.fit_transform(Xs)
    """

    def __init__(self, n_components: int = 10, impute: bool = True,
                 data_options: dict = None, data_matrix: dict = None, model_options: dict = None,
                 train_options: dict = None, stochastic_options: dict = None, covariates: dict = None,
                 smooth_options: dict = None, random_state: int = None, verbose: bool = False):

        # Replace every missing option dictionary with a fresh empty one so
        # that ``**options`` unpacking in ``_run_mofa`` always works and no
        # dictionary is shared between instances.
        if data_options is None:
            data_options = {}
        if data_matrix is None:
            data_matrix = {}
        if model_options is None:
            model_options = {}
        if train_options is None:
            train_options = {}
        if stochastic_options is None:
            stochastic_options = {}
        if covariates is None:
            covariates = {}
        if smooth_options is None:
            smooth_options = {}

        if not isinstance(n_components, int):
            raise ValueError(f"Invalid n_components. It must be an int. A {type(n_components)} was passed.")
        if n_components < 1:
            raise ValueError(f"Invalid n_components. It must be greater or equal to 1. {n_components} was passed.")

        self.n_components = n_components
        # NOTE(review): ``impute`` is stored but not read by any method of
        # this class -- confirm whether it is consumed elsewhere.
        self.impute = impute
        self.random_state = random_state
        self.verbose = verbose

        # The entry point prints to stdout when it is created; keep that
        # output only in verbose mode.
        if self.verbose:
            self.mofa_ = entry_point()
        else:
            with self._suppress_stdout():
                self.mofa_ = entry_point()

        self.data_options = data_options
        self.data_matrix = data_matrix
        self.model_options = model_options
        self.train_options = train_options
        self.stochastic_options = stochastic_options
        self.covariates = covariates
        self.smooth_options = smooth_options
        self.transform_ = None

    @staticmethod
    @contextlib.contextmanager
    def _suppress_stdout():
        r"""
        Context manager that sends everything written to stdout to ``os.devnull``.
        """
        with open(os.devnull, "w") as sink, contextlib.redirect_stdout(sink):
            yield

    def fit(self, Xs, y = None):
        r"""
        Fit the transformer to the input data.

        Parameters
        ----------
        Xs : list of array-likes objects
            - Xs length: n_mods
            - Xs[i] shape: (n_samples, n_features_i)
            A list of different modalities.
        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self :  returns an instance of self.
        """
        Xs = check_Xs(Xs, ensure_all_finite='allow-nan')
        # Remember the container type of the input so that the transformed
        # output can mirror it; the model itself is always fed DataFrames.
        if isinstance(Xs[0], pd.DataFrame):
            self.transform_ = "pandas"
        else:
            self.transform_ = "numpy"
            Xs = [pd.DataFrame(X) for X in Xs]

        # Each modality is wrapped in a one-element list before being handed
        # to the model.
        data = [[X] for X in Xs]
        training_output = contextlib.nullcontext() if self.verbose else self._suppress_stdout()
        with training_output:
            self._run_mofa(data = data)

        # The fitted model is written to a temporary HDF5 file and read back
        # to extract weights and factors. This step is always silenced.
        with self._suppress_stdout(), tempfile.TemporaryDirectory() as tmp:
            outfile = os.path.join(tmp, 'tmp.hdf5')
            self.mofa_.save(outfile=outfile, save_data=True, save_parameters=False, expectations=None)
            model = mfx.mofa_model(outfile)
            try:
                self.weights_ = model.get_weights(concatenate_views=False)
                self.factors_ = model.get_factors()
            finally:
                # Always release the file handle, even when extraction fails,
                # so that the temporary directory can be removed.
                model.close()
        return self

    def transform(self, Xs):
        r"""
        Project data into the learned space.

        Each modality is multiplied by the transposed pseudo-inverse of its weight matrix.

        Parameters
        ----------
        Xs : list of array-likes objects
            - Xs length: n_mods
            - Xs[i] shape: (n_samples_i, n_features_i)
            A list of different mods.

        Returns
        -------
        transformed_Xs : list of n_mods array-likes objects of shape (n_samples, n_components)
            The projected data.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If the transformer has not been fitted yet.
        """
        # Imported locally so that this method does not depend on a change to
        # the module-level imports.
        from sklearn.utils.validation import check_is_fitted

        # ``weights_`` is named explicitly because ``mofa_`` and ``transform_``
        # already exist on an unfitted instance.
        check_is_fitted(self, "weights_")

        Xs = check_Xs(Xs, ensure_all_finite='allow-nan')
        if not isinstance(Xs[0], pd.DataFrame):
            Xs = [pd.DataFrame(X) for X in Xs]

        # NOTE(review): missing values are accepted by the validation above
        # and are not imputed here, so they presumably propagate as NaN
        # through the projection -- confirm this is intended.
        transformed_Xs = [np.dot(X, np.linalg.pinv(w).T) for X, w in zip(Xs, self.weights_)]

        if self.transform_ == "pandas":
            transformed_Xs = [pd.DataFrame(transformed_X, index=X.index)
                              for X, transformed_X in zip(Xs, transformed_Xs)]
        return transformed_Xs

    def fit_transform(self, Xs, y = None, **fit_params):
        r"""
        Fit to data, then transform it.

        The result is the factor matrix learned during fitting (``factors_``), not the per-modality
        projections returned by ``transform``.

        Parameters
        ----------
        Xs : list of array-likes objects
            - Xs length: n_mods
            - Xs[i] shape: (n_samples_i, n_features_i)
            A list of different mods.
        y : Ignored
            Not used, present here for API consistency by convention.
        fit_params : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        transformed_X : array-likes objects of shape (n_samples, n_components)
            The projected data.
        """
        transformed_X = self.fit(Xs).factors_
        if self.transform_ == "pandas":
            # Reuse the sample index of the first modality for the output.
            transformed_X = pd.DataFrame(transformed_X, index=Xs[0].index)
        return transformed_X

    def _run_mofa(self, data):
        r"""
        Configure, build and train the underlying MOFA model.

        Parameters
        ----------
        data : nested list
            One single-element list per modality, as prepared by ``fit``.
        """
        self.mofa_.set_data_options(**self.data_options)
        self.mofa_.set_data_matrix(data = data, **self.data_matrix)
        self.mofa_.set_model_options(factors = self.n_components, **self.model_options)
        self.mofa_.set_train_options(seed = self.random_state, verbose = self.verbose, **self.train_options)
        self.mofa_.set_stochastic_options(**self.stochastic_options)
        # Covariates and smooth options are only configured when covariates
        # were supplied.
        if self.covariates:
            self.mofa_.set_covariates(**self.covariates)
            self.mofa_.set_smooth_options(**self.smooth_options)
        self.mofa_.build()
        # Replace the freshly built network with this package's modified
        # counterpart, constructed from the same dimensions and nodes.
        if isinstance(self.mofa_.model, BayesNet):
            self.mofa_.model = _ModifiedBayesNet(self.mofa_.model.dim, self.mofa_.model.nodes)
        elif isinstance(self.mofa_.model, StochasticBayesNet):
            self.mofa_.model = _ModifiedStochasticBayesNet(self.mofa_.model.dim, self.mofa_.model.nodes)
        self.mofa_.run()
        return None