"""Helper functions.""" import numbers import numpy as np import pandas as pd import itertools as it import collections.abc from tabulate import tabulate from .config import options __all__ = [ "_perm_pval", "print_table", "_postprocess_dataframe", "_check_eftype", "remove_na", "_flatten_list", "_check_dataframe", "_is_sklearn_installed", "_is_statsmodels_installed", "_is_mpmath_installed", ] def _perm_pval(bootstat, estimate, alternative="two-sided"): """ Compute p-values from a permutation test. Parameters ---------- bootstat : 1D array Permutation distribution. estimate : float or int Point estimate. alternative : str Tail for p-value. Can be either `'two-sided'` (default), `'greater'` or `'less'`. Returns ------- p : float P-value. """ assert alternative in ["two-sided", "greater", "less"], "Wrong tail argument." assert isinstance(estimate, (int, float)) bootstat = np.asarray(bootstat) assert bootstat.ndim == 1, "bootstat must be a 1D array." n_boot = bootstat.size assert n_boot >= 1, "bootstat must have at least one value." if alternative == "greater": p = np.greater_equal(bootstat, estimate).sum() / n_boot elif alternative == "less": p = np.less_equal(bootstat, estimate).sum() / n_boot else: p = np.greater_equal(np.fabs(bootstat), abs(estimate)).sum() / n_boot return p ############################################################################### # PRINT & EXPORT OUTPUT TABLE ############################################################################### def print_table(df, floatfmt=".3f", tablefmt="simple"): """Pretty display of table. Parameters ---------- df : :py:class:`pandas.DataFrame` Dataframe to print (e.g. ANOVA summary) floatfmt : string Decimal number formatting tablefmt : string Table format (e.g. 'simple', 'plain', 'html', 'latex', 'grid', 'rst'). For a full list of available formats, please refer to https://pypi.org/project/tabulate/ """ if "F" in df.keys(): print("\n=============\nANOVA SUMMARY\n=============\n") if "A" in df.keys(): print("\n==============\nPOST HOC TESTS\n==============\n") print(tabulate(df, headers="keys", showindex=False, floatfmt=floatfmt, tablefmt=tablefmt)) print("") def _postprocess_dataframe(df): """Apply some post-processing to an ouput dataframe (e.g. rounding). Whether and how rounding is applied is governed by options specified in `pingouin.options`. The default rounding (number of decimals) is determined by `pingouin.options['round']`. You can specify rounding for a given column name by the option `'round.column.'`, e.g. `'round.column.CI95%'`. Analogously, `'round.row.'` also works (where `rowname`) refers to the pandas index), as well as `'round.cell.[]x[>> import numpy as np >>> from pingouin import remove_na >>> x = [6.4, 3.2, 4.5, np.nan] >>> remove_na(x) array([6.4, 3.2, 4.5]) With two paired 1D arrays >>> y = [2.3, np.nan, 5.2, 4.6] >>> remove_na(x, y, paired=True) (array([6.4, 4.5]), array([2.3, 5.2])) With two independent 2D arrays >>> x = np.array([[4, 2], [4, np.nan], [7, 6]]) >>> y = np.array([[6, np.nan], [3, 2], [2, 2]]) >>> x_no_nan, y_no_nan = remove_na(x, y, paired=False) """ # Safety checks x = np.asarray(x) assert axis in ["rows", "columns"], "axis must be rows or columns." if y is None: return _remove_na_single(x, axis=axis) elif isinstance(y, (int, float, str)): return _remove_na_single(x, axis=axis), y else: # y is list, np.array, pd.Series y = np.asarray(y) assert y.size != 0, "y cannot be an empty list or array." # Make sure that we just pass-through if y have only 1 element if y.size == 1: return _remove_na_single(x, axis=axis), y if x.ndim != y.ndim or paired is False: # x and y do not have the same dimension x_no_nan = _remove_na_single(x, axis=axis) y_no_nan = _remove_na_single(y, axis=axis) return x_no_nan, y_no_nan # At this point, we assume that x and y are paired and have same dimensions if x.ndim == 1: # 1D arrays x_mask = ~np.isnan(x) y_mask = ~np.isnan(y) else: # 2D arrays ax = 1 if axis == "rows" else 0 x_mask = ~np.any(np.isnan(x), axis=ax) y_mask = ~np.any(np.isnan(y), axis=ax) # Check if missing values are present if ~x_mask.all() or ~y_mask.all(): ax = 0 if axis == "rows" else 1 ax = 0 if x.ndim == 1 else ax both = np.logical_and(x_mask, y_mask) x = x.compress(both, axis=ax) y = y.compress(both, axis=ax) return x, y ############################################################################### # ARGUMENTS CHECK ############################################################################### def _flatten_list(x, include_tuple=False): """Flatten an arbitrarily nested list into a new list. This can be useful to select pandas DataFrame columns. From https://stackoverflow.com/a/16176969/10581531 Examples -------- >>> from pingouin.utils import _flatten_list >>> x = ['X1', ['M1', 'M2'], 'Y1', ['Y2']] >>> _flatten_list(x) ['X1', 'M1', 'M2', 'Y1', 'Y2'] >>> x = ['Xaa', 'Xbb', 'Xcc'] >>> _flatten_list(x) ['Xaa', 'Xbb', 'Xcc'] >>> x = ['Xaa', ('Xbb', 'Xcc'), (1, 2), (1)] >>> _flatten_list(x) ['Xaa', ('Xbb', 'Xcc'), (1, 2), 1] >>> _flatten_list(x, include_tuple=True) ['Xaa', 'Xbb', 'Xcc', 1, 2, 1] """ # If x is not iterable, return x if not isinstance(x, collections.abc.Iterable): return x # Initialize empty output variable result = [] # Loop over items in x for el in x: # Check if element is iterable el_is_iter = isinstance(el, collections.abc.Iterable) if el_is_iter: if not isinstance(el, (str, tuple)): result.extend(_flatten_list(el)) else: if isinstance(el, tuple) and include_tuple: result.extend(_flatten_list(el)) else: result.append(el) else: result.append(el) # Remove None from output result = [r for r in result if r is not None] return result def _check_eftype(eftype): """Check validity of eftype""" if eftype.lower() in [ "none", "hedges", "cohen", "r", "pointbiserialr", "eta-square", "odds-ratio", "auc", "cles", ]: return True else: return False def _check_dataframe(data=None, dv=None, between=None, within=None, subject=None, effects=None): """Checks whether data is a dataframe or can be converted to a dataframe. If successful, a dataframe is returned. If not successful, a ValueError is raised. """ # Check that data is a dataframe if not isinstance(data, pd.DataFrame): # DataMatrix objects can be safely convert to DataFrame objects. By # first checking the name of the class, we avoid having to actually # import DataMatrix unless it is necessary. if data.__class__.__name__ == "DataMatrix": # noqa try: from datamatrix import DataMatrix, convert as cnv # noqa except ImportError: raise ValueError( "Failed to convert object to pandas dataframe (DataMatrix not available)" # noqa ) else: if isinstance(data, DataMatrix): data = cnv.to_pandas(data) else: raise ValueError("Data must be a pandas dataframe or compatible object.") else: raise ValueError("Data must be a pandas dataframe or compatible object.") # Check that both dv and data are provided. if any(v is None for v in [dv, data]): raise ValueError("DV and data must be specified") # Check that dv is a numeric variable if data[dv].dtype.kind not in "fi": raise ValueError("DV must be numeric.") # Check that effects is provided if effects not in ["within", "between", "interaction", "all"]: raise ValueError("Effects must be: within, between, interaction, all") # Check that within is a string, int or a list (rm_anova2) if effects == "within" and not isinstance(within, (str, int, list)): raise ValueError("within must be a string, int or a list.") # Check that subject identifier is provided in rm_anova and friedman. if effects == "within" and subject is None: raise ValueError("subject must be specified when effects=within") # Check that between is a string or a list (anova2) if effects == "between" and not isinstance(between, (str, int, list)): raise ValueError("between must be a string, int or a list.") # Check that both between and within are present for interaction if effects == "interaction": for input in [within, between]: if not isinstance(input, (str, int, list)): raise ValueError("within and between must be specified when effects=interaction") return data ############################################################################### # DEPENDENCIES ############################################################################### def _is_statsmodels_installed(raise_error=False): """Check if statsmodels is installed.""" try: import statsmodels # noqa is_installed = True except OSError: # pragma: no cover is_installed = False # Raise error (if needed) : if raise_error and not is_installed: # pragma: no cover raise OSError("statsmodels needs to be installed. Please use `pip " "install statsmodels`.") return is_installed def _is_sklearn_installed(raise_error=False): """Check if sklearn is installed.""" try: import sklearn # noqa is_installed = True except OSError: # pragma: no cover is_installed = False # Raise error (if needed) : if raise_error and not is_installed: # pragma: no cover raise OSError("sklearn needs to be installed. Please use `pip " "install scikit-learn`.") return is_installed def _is_mpmath_installed(raise_error=False): """Check if mpmath is installed.""" try: import mpmath # noqa is_installed = True except OSError: # pragma: no cover is_installed = False # Raise error (if needed) : if raise_error and not is_installed: # pragma: no cover raise OSError("mpmath needs to be installed. Please use `pip " "install mpmath`.") return is_installed