Source code for imml.utils.check_xs

# License: BSD-3-Clause

import numpy as np
import pandas as pd
from sklearn.utils import check_array

try:
    import torch
except ImportError:
    torch = object


[docs] def check_Xs(Xs, enforce_modalities=None, copy=False, ensure_all_finite="allow-nan",return_dimensions=False): r""" Checks Xs and ensures it to be a list of 2D matrices. Adapted from `̀mvlearn` [#checkxspaper]_ [#checkxscode]_ . Parameters ---------- Xs : list of array-likes objects - Xs length: n_mods - Xs[i] shape: (n_samples, n_features_i) A list of different modalities. enforce_modalities : int, (default=not checked) If provided, ensures this number of modalities in Xs. Otherwise not checked. copy : boolean, (default=False) If True, the returned Xs is a copy of the input Xs, and operations on the output will not affect the input. If False, the returned Xs is a modality of the input Xs, and operations on the output will change the input. ensure_all_finite : bool or 'allow-nan', default='allow-nan' Whether to raise an error on np.inf, np.nan, pd.NA in array. The possibilities are: - True: Force all values of array to be finite. - False: accepts np.inf, np.nan, pd.NA in array. - 'allow-nan': accepts only np.nan and pd.NA values in array. Values cannot be infinite. return_dimensions : boolean, (default=False) If True, the function also returns the dimensions of the multi-modal dataset. The dimensions are n_mods, n_samples, n_features where n_samples and n_mods are respectively the number of modalities and the number of samples, and n_features is a list of length n_mods containing the number of features of each modality. References ---------- .. [#checkxspaper] Perry, Ronan, et al. "mvlearn: Multiview Machine Learning in Python." Journal of Machine Learning Research 22.109 (2021): 1-7. .. [#checkxscode] https://mvlearn.github.io/references/utils.html Returns ------- Xs_converted : object The converted and validated Xs (list of data arrays). n_mods : int The number of modalities in the dataset. Returned only if ``return_dimensions`` is ``True``. n_samples : int The number of samples in the dataset. Returned only if ``return_dimensions`` is ``True``. n_features : list List of length ``n_mods`` containing the number of features in each modality. Returned only if ``return_dimensions`` is ``True``. """ if not isinstance(Xs, list): raise ValueError(f"Invalid Xs. It must be a list. A {type(Xs)} was passed.") n_mods = len(Xs) if len(Xs) < 2: raise ValueError(f"Invalid Xs. It must have at least two modalities. Got {n_mods} modalities.") if any(len(X) == 0 for X in Xs): raise ValueError(f"Invalid Xs. All elements must have at least one sample. Got {[len(X) for X in Xs]}.") if enforce_modalities is not None and n_mods != enforce_modalities: raise ValueError(f"Invalid Xs. Wrong number of modalities. Expected {enforce_modalities} but found {n_mods}") if len(set([len(X) for X in Xs])) != 1: raise ValueError(f"Invalid Xs. All modalities should have the same number of samples. Got {[len(X) for X in Xs]}.") dtype = type(Xs[0]) if not all(isinstance(X, dtype) for X in Xs): msg = "All modalities should be the same data type" raise ValueError(f"Invalid Xs. All modalities should be the same data type. Got {[type(X) for X in Xs]}.") if isinstance(Xs[0],pd.DataFrame): Xs = [pd.DataFrame(check_array(X, allow_nd=False, copy=copy, ensure_all_finite=ensure_all_finite, dtype=None), index=X.index, columns=X.columns) for X_idx, X in enumerate(Xs)] elif isinstance(Xs[0], np.ndarray) or isinstance(Xs[0], list): Xs = [check_array(X, allow_nd=False, copy=copy, ensure_all_finite=ensure_all_finite, dtype=None) for X in Xs] elif isinstance(Xs[0], torch.Tensor): Xs = [torch.from_numpy(check_array(X, allow_nd=False, copy=copy, ensure_all_finite=ensure_all_finite, dtype=None)) for X in Xs] if return_dimensions: n_samples = Xs[0].shape[0] n_features = [X.shape[1] for X in Xs] return Xs, n_mods, n_samples, n_features else: return Xs