U
    Qvf                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlm	Z	m
Z
 d dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ dd	d
gZdddZdd Zddd	ZdddZdddZdd Zejddd
ZdS )     N)tnorm)pinvhlstsq)options)	remove_na)_flatten_list)_postprocess_dataframelinear_regressionlogistic_regressionmediation_analysisTF皙?c	           3      C   s|  t | tjr|   }	nt | tjr0| jg}	ng }	t| } t|}|j	dksZt
dd|  k rndk stn t
| j	dkr| dtjf } |rt| |dtjf ddd\} }t|}t| }
t|  }|
st
d|st
d	|jd | jd ks
t
d
|	s(dd t| jd D }	|rTtt| jd | f} |	dd tj| dd}t|dk}t|rt| |d} t|	|}	ttj| | dddf kddd }t|dkrt| |dd d} t|	|dd }	t|dkrdnd}| jd dkrg }tt| jd dD ]B}t| dd|d f | dd|d f r4||d  q4t|rt| |d} t|	|}	| jd | jd  }}|dkst
d|dkst
d|dk	rz|rtdt|}|j	dkst
d|j|ks t
dt |! r8t
d|dk ! rNt
dt|}t"t#|}||  }|| }nt|}| }|}t$||dd\}}}}|jdkr|d n|}|r|S d}||jd k rt%&d| d|jd  d d}|| }|| }|| }|| } |r,| d ' }|| }!t'||tj(||d d  }"|rhd||"  }#nd||!  }#dd|# ||  |  }$|| }%|%tj)*|j+| ,  }&t#|&}'||' }(dt-.t/|(| })t-0d|d  |}*|*|' }+||+ },||+ }-d d!| d  }.d d!d|d    }/d"|	d#|d$|'d%|(d&|)d'|#d(|$|.|,|/|-i	}0|r tj1tj|d)gd*tj| |	d*gddd+}1d|	krt2|1j3dgd*j4dd,}2dg|2d"  |2d"< t|2d- dtj5|2d-< t|2d. dtj5|2d.< nt2|1j4dd,}2|06|2 |r.t7t|0}0||0_8||0_9d|0_:| |0_:nJ||0d/< ||0d0< | |0d1< | |0d2< ||0d)< ||0d3< |dk	rx||0d4< ||0d5< |0S )6af.  (Multiple) Linear regression.

    Parameters
    ----------
    X : array_like
        Predictor(s), of shape *(n_samples, n_features)* or *(n_samples)*.
    y : array_like
        Dependent variable, of shape *(n_samples)*.
    add_intercept : bool
        If False, assume that the data are already centered. If True, add a
        constant term to the model. In this case, the first value in the
        output dict is the intercept of the model.

        .. note:: It is generally recommended to include a constant term
            (intercept) to the model to limit the bias and force the residual
            mean to equal zero. The intercept coefficient and p-values
            are however rarely meaningful.
    weights : array_like
        An optional vector of sample weights to be used in the fitting
        process, of shape *(n_samples)*. Missing or negative weights are not
        allowed. If not null, a weighted least squares is calculated.

        .. versionadded:: 0.3.5
    coef_only : bool
        If True, return only the regression coefficients.
    alpha : float
        Alpha value used for the confidence intervals.
        :math:`\text{CI} = [\alpha / 2 ; 1 - \alpha / 2]`
    as_dataframe : bool
        If True, returns a pandas DataFrame. If False, returns a dictionnary.
    remove_na : bool
        If True, apply a listwise deletion of missing values (i.e. the entire
        row is removed). Default is False, which will raise an error if missing
        values are present in either the predictor(s) or dependent
        variable.
    relimp : bool
        If True, returns the relative importance (= contribution) of
        predictors. This is irrelevant when the predictors are uncorrelated:
        the total :math:`R^2` of the model is simply the sum of each univariate
        regression :math:`R^2`-values. However, this does not apply when
        predictors are correlated. Instead, the total :math:`R^2` of the model
        is partitioned by averaging over all combinations of predictors,
        as done in the `relaimpo
        <https://cran.r-project.org/web/packages/relaimpo/relaimpo.pdf>`_
        R package (``calc.relimp(type="lmg")``).

        .. warning:: The computation time roughly doubles for each
            additional predictor and therefore this can be extremely slow for
            models with more than 12-15 predictors.

        .. versionadded:: 0.3.0

    Returns
    -------
    stats : :py:class:`pandas.DataFrame` or dict
        Linear regression summary:

        * ``'names'``: name of variable(s) in the model (e.g. x1, x2...)
        * ``'coef'``: regression coefficients
        * ``'se'``: standard errors
        * ``'T'``: T-values
        * ``'pval'``: p-values
        * ``'r2'``: coefficient of determination (:math:`R^2`)
        * ``'adj_r2'``: adjusted :math:`R^2`
        * ``'CI[2.5%]'``: lower confidence intervals
        * ``'CI[97.5%]'``: upper confidence intervals
        * ``'relimp'``: relative contribution of each predictor to the final                        :math:`R^2` (only if ``relimp=True``).
        * ``'relimp_perc'``: percent relative contribution

        In addition, the output dataframe comes with hidden attributes such as
        the residuals, and degrees of freedom of the model and residuals, which
        can be accessed as follow, respectively:

        >>> lm = pg.linear_regression() # doctest: +SKIP
        >>> lm.residuals_, lm.df_model_, lm.df_resid_ # doctest: +SKIP

        Note that to follow scikit-learn convention, these hidden atributes end
        with an "_". When ``as_dataframe=False`` however, these attributes
        are no longer hidden and can be accessed as any other keys in the
        output dictionary.

        >>> lm = pg.linear_regression() # doctest: +SKIP
        >>> lm['residuals'], lm['df_model'], lm['df_resid'] # doctest: +SKIP

        When ``as_dataframe=False`` the dictionary also contains the
        processed ``X`` and ``y`` arrays (i.e, with NaNs removed if
        ``remove_na=True``) and the model's predicted values ``pred``.

        >>> lm['X'], lm['y'], lm['pred'] # doctest: +SKIP

        For a weighted least squares fit, the weighted ``Xw`` and ``yw``
        arrays are included in the dictionary.

        >>> lm['Xw'], lm['yw'] # doctest: +SKIP

    See also
    --------
    logistic_regression, mediation_analysis, corr

    Notes
    -----
    The :math:`\beta` coefficients are estimated using an ordinary least
    squares (OLS) regression, as implemented in the
    :py:func:`scipy.linalg.lstsq` function. The OLS method minimizes
    the sum of squared residuals, and leads to a closed-form expression for
    the estimated :math:`\beta`:

    .. math:: \hat{\beta} = (X^TX)^{-1} X^Ty

    It is generally recommended to include a constant term (intercept) to the
    model to limit the bias and force the residual mean to equal zero.
    Note that intercept coefficient and p-values are however rarely meaningful.

    The standard error of the estimates is a measure of the accuracy of the
    prediction defined as:

    .. math:: \sigma = \sqrt{\text{MSE} \cdot (X^TX)^{-1}}

    where :math:`\text{MSE}` is the mean squared error,

    .. math::

        \text{MSE} = \frac{SS_{\text{resid}}}{n - p - 1}
         = \frac{\sum{(\text{true} - \text{pred})^2}}{n - p - 1}

    :math:`p` is the total number of predictor variables in the model
    (excluding the intercept) and :math:`n` is the sample size.

    Using the :math:`\beta` coefficients and the standard errors,
    the T-values can be obtained:

    .. math:: T = \frac{\beta}{\sigma}

    and the p-values approximated using a T-distribution with
    :math:`n - p - 1` degrees of freedom.

    The coefficient of determination (:math:`R^2`) is defined as:

    .. math:: R^2 = 1 - (\frac{SS_{\text{resid}}}{SS_{\text{total}}})

    The adjusted :math:`R^2` is defined as:

    .. math:: \overline{R}^2 = 1 - (1 - R^2) \frac{n - 1}{n - p - 1}

    The relative importance (``relimp``) column is a partitioning of the
    total :math:`R^2` of the model into individual :math:`R^2` contribution.
    This is calculated by taking the average over average contributions in
    models of different sizes. For more details, please refer to
    `Groemping et al. 2006 <http://dx.doi.org/10.18637/jss.v017.i01>`_
    and the R package `relaimpo
    <https://cran.r-project.org/web/packages/relaimpo/relaimpo.pdf>`_.

    Note that Pingouin will automatically remove any duplicate columns
    from :math:`X`, as well as any column with only one unique value
    (constant), excluding the intercept.

    Results have been compared against sklearn, R, statsmodels and JASP.

    Examples
    --------
    1. Simple linear regression using columns of a pandas dataframe

    In this first example, we'll use the tips dataset to see how well we
    can predict the waiter's tip (in dollars) based on the total bill (also
    in dollars).

    >>> import numpy as np
    >>> import pingouin as pg
    >>> df = pg.read_dataset('tips')
    >>> # Let's predict the tip ($) based on the total bill (also in $)
    >>> lm = pg.linear_regression(df['total_bill'], df['tip'])
    >>> lm.round(2)
            names  coef    se      T  pval    r2  adj_r2  CI[2.5%]  CI[97.5%]
    0   Intercept  0.92  0.16   5.76   0.0  0.46    0.45      0.61       1.23
    1  total_bill  0.11  0.01  14.26   0.0  0.46    0.45      0.09       0.12

    It comes as no surprise that total bill is indeed a significant predictor
    of the waiter's tip (T=14.26, p<0.05). The :math:`R^2` of the model is 0.46
    and the adjusted :math:`R^2` is 0.45, which means that our model roughly
    explains ~45% of the total variance in the tip amount.

    2. Multiple linear regression

    We can also have more than one predictor and run a multiple linear
    regression. Below, we add the party size as a second predictor of tip.

    >>> # We'll add a second predictor: the party size
    >>> lm = pg.linear_regression(df[['total_bill', 'size']], df['tip'])
    >>> lm.round(2)
            names  coef    se      T  pval    r2  adj_r2  CI[2.5%]  CI[97.5%]
    0   Intercept  0.67  0.19   3.46  0.00  0.47    0.46      0.29       1.05
    1  total_bill  0.09  0.01  10.17  0.00  0.47    0.46      0.07       0.11
    2        size  0.19  0.09   2.26  0.02  0.47    0.46      0.02       0.36

    The party size is also a significant predictor of tip (T=2.26, p=0.02).
    Note that adding this new predictor however only improved the :math:`R^2`
    of our model by ~1%.

    This function also works with numpy arrays:

    >>> X = df[['total_bill', 'size']].to_numpy()
    >>> y = df['tip'].to_numpy()
    >>> pg.linear_regression(X, y).round(2)
           names  coef    se      T  pval    r2  adj_r2  CI[2.5%]  CI[97.5%]
    0  Intercept  0.67  0.19   3.46  0.00  0.47    0.46      0.29       1.05
    1         x1  0.09  0.01  10.17  0.00  0.47    0.46      0.07       0.11
    2         x2  0.19  0.09   2.26  0.02  0.47    0.46      0.02       0.36

    3. Get the residuals

    >>> # For clarity, only display the first 9 values
    >>> np.round(lm.residuals_, 2)[:9]
    array([-1.62, -0.55,  0.31,  0.06, -0.11,  0.93,  0.13, -0.81, -0.49])

    Using pandas, we can show a summary of the distribution of the residuals:

    >>> import pandas as pd
    >>> pd.Series(lm.residuals_).describe().round(2)
    count    244.00
    mean      -0.00
    std        1.01
    min       -2.93
    25%       -0.55
    50%       -0.09
    75%        0.51
    max        4.04
    dtype: float64

    5. No intercept and return only the regression coefficients

    Sometimes it may be useful to remove the constant term from the regression,
    or to only return the regression coefficients without calculating the
    standard errors or p-values. This latter can potentially save you a lot of
    time if you need to calculate hundreds of regression and only care about
    the coefficients!

    >>> pg.linear_regression(X, y, add_intercept=False, coef_only=True)
    array([0.1007119 , 0.36209717])

    6. Return a dictionnary instead of a dataframe

    >>> lm_dict = pg.linear_regression(X, y, as_dataframe=False)
    >>> lm_dict.keys()
    dict_keys(['names', 'coef', 'se', 'T', 'pval', 'r2', 'adj_r2', 'CI[2.5%]',
               'CI[97.5%]', 'df_model', 'df_resid', 'residuals', 'X', 'y',
               'pred'])

    7. Remove missing values

    >>> X[4, 1] = np.nan
    >>> y[7] = np.nan
    >>> pg.linear_regression(X, y, remove_na=True, coef_only=True)
    array([0.65749955, 0.09262059, 0.19927529])

    8. Get the relative importance of predictors

    >>> lm = pg.linear_regression(X, y, remove_na=True, relimp=True)
    >>> lm[['names', 'relimp', 'relimp_perc']]
           names    relimp  relimp_perc
    0  Intercept       NaN          NaN
    1         x1  0.342503    73.045583
    2         x2  0.126386    26.954417

    The ``relimp`` column is a partitioning of the total :math:`R^2` of the
    model into individual contribution. Therefore, it sums to the :math:`R^2`
    of the full model. The ``relimp_perc`` is normalized to sum to 100%. See
    `Groemping 2006 <https://www.jstatsoft.org/article/view/v017i01>`_
    for more details.

    >>> lm[['relimp', 'relimp_perc']].sum()
    relimp           0.468889
    relimp_perc    100.000000
    dtype: float64

    9. Weighted linear regression

    >>> X = [1, 2, 3, 4, 5, 6]
    >>> y = [10, 22, 11, 13, 13, 16]
    >>> w = [1, 0.1, 1, 1, 0.5, 1]  # Array of weights. Must be >= 0.
    >>> lm = pg.linear_regression(X, y, weights=w)
    >>> lm.round(2)
           names  coef    se     T  pval    r2  adj_r2  CI[2.5%]  CI[97.5%]
    0  Intercept  9.00  2.03  4.42  0.01  0.51    0.39      3.35      14.64
    1         x1  1.04  0.50  2.06  0.11  0.51    0.39     -0.36       2.44
       y must be one-dimensional.r   .TrowsZpairedaxisRTarget (y) contains NaN or Inf. Please remove them manually or use remove_na=True.UPredictors (X) contain NaN or Inf. Please remove them manually or use remove_na=True.(X and y must have same number of samplesc                 S   s   g | ]}d t |d  qS xr   str.0i r   7/tmp/pip-unpacked-wheel-2te3nxqf/pingouin/regression.py
<listcomp>]  s     z%linear_regression.<locals>.<listcomp>	Interceptr   N      z/At least three valid samples are required in X.z&X must have at least one valid column.z2relimp = True is not supported when using weights.zweights must be a 1D array.z#weights must be of shape n_samples.z!Missing weights are not accepted.z"Negative weights are not accepted.)Zcond)r   FzBDesign matrix supplied with `X` parameter is rank deficient (rank z with zz columns). That means that one or more of the columns in `X` are a linear combination of one of more of the other columns.)weights
CI[%.1f%%]d   namescoefseTpvalr2adj_r2ycolumns)sortr   )Znumeric_onlyrelimprelimp_percdf_modeldf_residZ	residualsXpredywXw);
isinstancepd	DataFramekeystolistSeriesnamenpasarrayndimAssertionErrornewaxisrm_nasqueezeisfiniteallshaperangecolumn_stackonesinsertZcount_nonzeroZflatnonzerolendeletewhere	itertoolscombinationsarray_equalappend
ValueErrorsizeisnananydiagsqrtr   warningswarnsumZaveragelinalgpinvr*   Zdiagonalr   sffabsppfconcat_relimpZdropZcovnanupdater	   Z	df_model_Z	df_resid_Z
residuals_)3r6   r.   Zadd_interceptr$   	coef_onlyalphaas_dataframer   r2   r'   y_gdX_gdZ	n_nonzeroZidx_zero
idx_uniqueZconstantidx_duplicatepairnpwZwtsr9   r8   r(   Zss_resZrank_Zcalc_ss_resr4   r5   r7   Zresidss_totZss_wtotr,   r-   ZmseZbeta_varZbeta_ser*   r+   critZ
marg_errorllulll_nameul_namestatsdataZrelir   r   r   r
      s&     ,





(.





 
           


c                 C   s$  t | tjst| j }|dd }t|}d}td|d }| j	||f }tj
| j||f | j||f  }|| j||f  | }i }	g }
|D ]n}t||}g }td|d D ]}g }t|t|D ]}t|}||g }tt||	 kr|	tt| }n<| j||f }tj
| j||f | | }||	tt|< | j||f }t| j||f | | }||	tt|< || | }|| q|t| q| j||f }tj
| j||f | | }|| }|| }|| |
t| q||
|
t|
 d d}|S )a  Relative importance of predictors in multiple regression.

    This is an internal function. This function should only be used with a low
    number of predictors. Indeed, the computation time roughly doubles for each
    additional predictor.

    Parameters
    ----------
    S : pd.DataFrame
        Covariance matrix. The target variable MUST be the FIRST column,
        followed by the predictors (excluding the intercept).
    r   Nr   r&   )r'   r2   r3   )r:   r;   r<   rD   r0   r>   rO   rA   arangeZiatr_   r`   ZilocZ	setdiff1drR   rS   intlistr   sortedr=   r   rU   meanr^   )ScolsZ
predictorsZnpredZ
target_intZpredictors_intrt   ZbetasZr2_fullZss_reg_precompZ	all_predsr7   ZlooZr2_seq_meankZr2_seqrq   Zp_withZss_reg_withoutZ	S_withoutZS_withZss_reg_withZr2_diffZss_regZ
r2_withoutZstats_relimpr   r   r   re     sT    
"


re   c                  K   s  ddl m} |dd ddlm} t| tjr<|   }	nt| tj	rR| j
g}	ng }	t| } t|}|jdks|tdd|  k rdk sn td| jdkr| d	tjf } |rt| |d	tjf dd
d\} }t|}t| }
t|  }|
std|std|jd | jd ks4tdt|jdkrNtd|	sldd t| jd D }	ttj| | dddf kddd }t|rt| |d} t|	| }	| jd dkrRg }tt| jd dD ]B}t| dd|d f | dd|d f r||d  qt|rRt| |d} t|	| }	d|krdd|d< d|krvd|d< |f |}| | | |! d r|	"dd t#t$| jd | f}t|j%|j&}n
|j&}| }|r|S |j\}}ddt'|(|   }t)||dfj*}|| j*| }tj+,|}t-t.|}|| }dt/0t1| }t/2d|d  }|||  }|||  }dd| d  }ddd|d    }d|	d|d|d |d!|||||i}|rt3t|S |S dS )"uB&  (Multiple) Binary logistic regression.

    Parameters
    ----------
    X : array_like
        Predictor(s), of shape *(n_samples, n_features)* or *(n_samples)*.
    y : array_like
        Dependent variable, of shape *(n_samples)*.
        ``y`` must be binary, i.e. only contains 0 or 1. Multinomial logistic
        regression is not supported.
    coef_only : bool
        If True, return only the regression coefficients.
    alpha : float
        Alpha value used for the confidence intervals.
        :math:`\text{CI} = [\alpha / 2 ; 1 - \alpha / 2]`
    as_dataframe : bool
        If True, returns a pandas DataFrame. If False, returns a dictionnary.
    remove_na : bool
        If True, apply a listwise deletion of missing values (i.e. the entire
        row is removed). Default is False, which will raise an error if missing
        values are present in either the predictor(s) or dependent
        variable.
    **kwargs : optional
        Optional arguments passed to
        :py:class:`sklearn.linear_model.LogisticRegression` (see Notes).

    Returns
    -------
    stats : :py:class:`pandas.DataFrame` or dict
        Logistic regression summary:

        * ``'names'``: name of variable(s) in the model (e.g. x1, x2...)
        * ``'coef'``: regression coefficients (log-odds)
        * ``'se'``: standard error
        * ``'z'``: z-scores
        * ``'pval'``: two-tailed p-values
        * ``'CI[2.5%]'``: lower confidence interval
        * ``'CI[97.5%]'``: upper confidence interval

    See also
    --------
    linear_regression

    Notes
    -----
    .. caution:: This function is a wrapper around the
        :py:class:`sklearn.linear_model.LogisticRegression` class. However,
        Pingouin internally disables the L2 regularization and changes the
        default solver to 'newton-cg' to obtain results that are similar to R and
        statsmodels.

    Logistic regression assumes that the log-odds (the logarithm of the
    odds) for the value labeled "1" in the response variable is a linear
    combination of the predictor variables. The log-odds are given by the
    `logit <https://en.wikipedia.org/wiki/Logit>`_ function,
    which map a probability :math:`p` of the response variable being "1"
    from :math:`[0, 1)` to :math:`(-\infty, +\infty)`.

    .. math:: \text{logit}(p) = \ln \frac{p}{1 - p} = \beta_0 + \beta X

    The odds of the response variable being "1" can be obtained by
    exponentiating the log-odds:

    .. math:: \frac{p}{1 - p} = e^{\beta_0 + \beta X}

    and the probability of the response variable being "1" is given by the
    `logistic function <https://en.wikipedia.org/wiki/Logistic_function>`_:

    .. math:: p = \frac{1}{1 + e^{-(\beta_0 + \beta X})}

    The first coefficient is always the constant term (intercept) of
    the model. Pingouin will automatically add the intercept
    to your predictor(s) matrix, therefore, :math:`X` should not include a
    constant term. Pingouin will remove any constant term (e.g column with only
    one unique value), or duplicate columns from :math:`X`.

    The calculation of the p-values and confidence interval is adapted from a
    `code by Rob Speare
    <https://gist.github.com/rspeare/77061e6e317896be29c6de9a85db301d>`_.
    Results have been compared against statsmodels, R, and JASP.

    Examples
    --------
    1. Simple binary logistic regression.

    In this first example, we'll use the
    `penguins dataset <https://github.com/allisonhorst/palmerpenguins>`_
    to see how well we can predict the sex of penguins based on their
    bodies mass.

    >>> import numpy as np
    >>> import pandas as pd
    >>> import pingouin as pg
    >>> df = pg.read_dataset('penguins')
    >>> # Let's first convert the target variable from string to boolean:
    >>> df['male'] = (df['sex'] == 'male').astype(int)  # male: 1, female: 0
    >>> # Since there are missing values in our outcome variable, we need to
    >>> # set `remove_na=True` otherwise regression will fail.
    >>> lom = pg.logistic_regression(df['body_mass_g'], df['male'],
    ...                              remove_na=True)
    >>> lom.round(2)
             names  coef    se     z  pval  CI[2.5%]  CI[97.5%]
    0    Intercept -5.16  0.71 -7.24   0.0     -6.56      -3.77
    1  body_mass_g  0.00  0.00  7.24   0.0      0.00       0.00

    Body mass is a significant predictor of sex (p<0.001). Here, it
    could be useful to rescale our predictor variable from *g* to *kg*
    (e.g divide by 1000) in order to get more intuitive coefficients and
    confidence intervals:

    >>> df['body_mass_kg'] = df['body_mass_g'] / 1000
    >>> lom = pg.logistic_regression(df['body_mass_kg'], df['male'],
    ...                              remove_na=True)
    >>> lom.round(2)
              names  coef    se     z  pval  CI[2.5%]  CI[97.5%]
    0     Intercept -5.16  0.71 -7.24   0.0     -6.56      -3.77
    1  body_mass_kg  1.23  0.17  7.24   0.0      0.89       1.56

    2. Multiple binary logistic regression

    We'll now add the species as a categorical predictor in our model. To do
    so, we first need to dummy-code our categorical variable, dropping the
    first level of our categorical variable (species = Adelie) which will be
    used as the reference level:

    >>> df = pd.get_dummies(df, columns=['species'], dtype=float, drop_first=True)
    >>> X = df[['body_mass_kg', 'species_Chinstrap', 'species_Gentoo']]
    >>> y = df['male']
    >>> lom = pg.logistic_regression(X, y, remove_na=True)
    >>> lom.round(2)
                   names   coef    se     z  pval  CI[2.5%]  CI[97.5%]
    0          Intercept -26.24  2.84 -9.24  0.00    -31.81     -20.67
    1       body_mass_kg   7.10  0.77  9.23  0.00      5.59       8.61
    2  species_Chinstrap  -0.13  0.42 -0.31  0.75     -0.96       0.69
    3     species_Gentoo  -9.72  1.12 -8.65  0.00    -11.92      -7.52

    3. Using NumPy aray and returning only the coefficients

    >>> pg.logistic_regression(X.to_numpy(), y.to_numpy(), coef_only=True,
    ...                        remove_na=True)
    array([-26.23906892,   7.09826571,  -0.13180626,  -9.71718529])

    4. Passing custom parameters to sklearn

    >>> lom = pg.logistic_regression(X, y, solver='sag', max_iter=10000,
    ...                           random_state=42, remove_na=True)
    >>> print(lom['coef'].to_numpy())
    [-25.98248153   7.02881472  -0.13119779  -9.62247569]

    **How to interpret the log-odds coefficients?**

    We'll use the `Wikipedia example
    <https://en.wikipedia.org/wiki/Logistic_regression#Probability_of_passing_an_exam_versus_hours_of_study>`_
    of the probability of passing an exam
    versus the hours of study:

    *A group of 20 students spends between 0 and 6 hours studying for an
    exam. How does the number of hours spent studying affect the
    probability of the student passing the exam?*

    >>> # First, let's create the dataframe
    >>> Hours = [0.50, 0.75, 1.00, 1.25, 1.50, 1.75, 1.75, 2.00, 2.25, 2.50,
    ...          2.75, 3.00, 3.25, 3.50, 4.00, 4.25, 4.50, 4.75, 5.00, 5.50]
    >>> Pass = [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1]
    >>> df = pd.DataFrame({'HoursStudy': Hours, 'PassExam': Pass})
    >>> # And then run the logistic regression
    >>> lr = pg.logistic_regression(df['HoursStudy'], df['PassExam']).round(3)
    >>> lr
            names   coef     se      z   pval  CI[2.5%]  CI[97.5%]
    0   Intercept -4.078  1.761 -2.316  0.021    -7.529     -0.626
    1  HoursStudy  1.505  0.629  2.393  0.017     0.272      2.737

    The ``Intercept`` coefficient (-4.078) is the log-odds of ``PassExam=1``
    when ``HoursStudy=0``. The odds ratio can be obtained by exponentiating
    the log-odds:

    >>> np.exp(-4.078)
    0.016941314421496552

    i.e. :math:`0.017:1`. Conversely the odds of failing the exam are
    :math:`(1/0.017) \approx 59:1`.

    The probability can then be obtained with the following equation

    .. math:: p = \frac{1}{1 + e^{-(-4.078 + 0 * 1.505)}}

    >>> 1 / (1 + np.exp(-(-4.078)))
    0.016659087580814722

    The ``HoursStudy`` coefficient (1.505) means that for each additional hour
    of study, the log-odds of passing the exam increase by 1.505, and the odds
    are multipled by :math:`e^{1.505} \approx 4.50`.

    For example, a student who studies 2 hours has a probability of passing
    the exam of 25%:

    >>> 1 / (1 + np.exp(-(-4.078 + 2 * 1.505)))
    0.2557836148964987

    The table below shows the probability of passing the exam for several
    values of ``HoursStudy``:

    +----------------+----------+----------------+------------------+
    | Hours of Study | Log-odds | Odds           | Probability      |
    +================+==========+================+==================+
    | 0              | −4.08    | 0.017 ≈ 1:59   | 0.017            |
    +----------------+----------+----------------+------------------+
    | 1              | −2.57    | 0.076 ≈ 1:13   | 0.07             |
    +----------------+----------+----------------+------------------+
    | 2              | −1.07    | 0.34 ≈ 1:3     | 0.26             |
    +----------------+----------+----------------+------------------+
    | 3              | 0.44     | 1.55           | 0.61             |
    +----------------+----------+----------------+------------------+
    | 4              | 1.94     | 6.96           | 0.87             |
    +----------------+----------+----------------+------------------+
    | 5              | 3.45     | 31.4           | 0.97             |
    +----------------+----------+----------------+------------------+
    | 6              | 4.96     | 141.4          | 0.99             |
    +----------------+----------+----------------+------------------+
    r   )_is_sklearn_installedT)Zraise_error)LogisticRegressionr   r   zalpha must be between 0 and 1..r   r   r   r   r   r"   z"Dependent variable must be binary.c                 S   s   g | ]}d t |d  qS r   r   r   r   r   r   r   f  s     z'logistic_regression.<locals>.<listcomp>Nr!   Zsolverz	newton-cgZpenaltynoneZfit_interceptr    r%   r&   r'   r(   r)   zr+   )4pingouin.utilsr   Zsklearn.linear_modelr   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   uniquerW   rV   rK   rQ   rO   rP   rR   rS   rT   rU   Zfit
get_paramsrN   rL   rM   Z
intercept_Zcoef_coshZdecision_functionZtiler*   r_   r`   r[   rZ   r   ra   rb   rc   r	   ) r6   r.   rh   ri   rj   r   kwargsr   r   r'   rk   rl   rm   rn   ro   ZlomZX_designr(   rp   rq   ZdenomZfimZcraor)   Zz_scoresr+   ru   rv   rw   rx   ry   rz   r   r   r   r   Z  s     a





(
.




       	linearc                 K   s   g }t |D ]^}	|dkr>|t| | |||	f ddd  q|t| | |||	f fddi|d  qt|| || dddd|  }
||
 S )z<Point estimate of indirect effect based on bootstrap sample.r   T)rh   r   rh   r"   )rK   rU   r
   r   )X_valXM_valM_valy_validx
n_mediatormtypelogreg_kwargsZbeta_mjZbeta_yr   r   r   _point_estimate  s    &$"r   c                 C   sn   t t| |k }t d| t |d  d }t d| t d|d   d }tj| ||gd}|S )a  Bias-corrected confidence intervals

    Parameters
    ----------
    bootdist : array-like
        Array with bootstrap estimates for each sample.
    sample_point : float
        Point estimate based on full sample.
    alpha : float
        Alpha for confidence interval.

    Returns
    -------
    CI : 1d array-like
        Lower and upper bias-corrected confidence interval estimates.

    Notes
    -----
    This is what's used in the "cper" method implemented in :py:func:`pingouin.compute_bootci`.

    This differs from the bias-corrected and accelerated method (BCa, default in Matlab and
    SciPy) because it does not correct for skewness. Indeed, the acceleration parameter, a,
    is proportional to the skewness of the bootstrap distribution. The bias-correction parameter,
    z0, is related to the proportion of bootstrap estimates that are less than the observed
    statistic.
    r"   r&   r   )q)r   rc   rA   r   ZcdfZ
percentile)ZbootdistZsample_pointri   Zz0Zadjusted_llZadjusted_ulcir   r   r   _bias_corrected_ci  s
     $r   c                 C   s>   |dkrd}n&dt t| dkt| dk  t|  }t |dS )zCompute p-value from bootstrap distribution.
    Similar to the pval function in the R package mediation.
    Note that this is less accurate than a permutation test because the
    bootstrap distribution is not conditioned on a true null hypothesis.
    r   r   r"   )minr^   rO   )ZbootZestimateoutr   r   r   _pval_from_bootci  s    &r     c
           $         sn  t |ttfstdt |ttfs,tdt |tttfsDtdt |tdtttfs^tt |ttfrr|g}t|}
t  tjstd|
tt	|kstdt |ttfr|g}t |trt|tt	|kstdt	|
|stdt||||g} jtfdd	|D s,td
d}t fdd	|D sPt| |    jd }|dksxtdt |  dkrdnd}|	dkri n|	}	dd| d  }ddd|d    }dddd||g} t||g  } t|||g  } |  } |  }t }dtd< i }t|D ]\}}|dkrxt||dd|f |djdg|f ||< n2t||dd|f fd|i|	jdg|f ||< d| || jd< q:tj|dd }t t||g ||djd|
|f }t|||djdg|f }t|||djdg|f }|d d!d" |d< d#|jd< d$|jd< tj||||fdd }t|d |k d%d&|d'< tj|}|jt |d||fd(}tj!||
fd)}t"|D ]6} t#|||||| ddf |
|f|	|| ddf< qt#||||t ||
|f|	}!d|d|!d|j$ddd*dg |g |g d'g i}"t"|
D ]}t%|dd|f |"d | |d}#|"| &t'|# |"| &t(|# |"d &t)|dd|f |"d |  |"d' &|"d | |k rd%nd& qJtj*|"}"|
dkrd+|"d< n|"d d,d" |"d< tj||"gddd-d.}|j+dd/id0}t,| |rbt-|t.|fS t-|S dS )1u  Mediation analysis using a bias-correct non-parametric bootstrap method.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        Dataframe.
    x : str
        Column name in data containing the predictor variable.
        The predictor variable must be continuous.
    m : str or list of str
        Column name(s) in data containing the mediator variable(s).
        The mediator(s) can be continuous or binary (e.g. 0 or 1).
        This function supports multiple parallel mediators.
    y : str
        Column name in data containing the outcome variable.
        The outcome variable must be continuous.
    covar : None, str, or list
        Covariate(s). If not None, the specified covariate(s) will be included
        in all regressions.
    alpha : float
        Significance threshold. Used to determine the confidence interval,
        :math:`\text{CI} = [\alpha / 2 ; 1 - \alpha / 2]`.
    n_boot : int
        Number of bootstrap iterations for confidence intervals and p-values
        estimation. The greater, the slower.
    seed : int or None
        Random state seed.
    logreg_kwargs : dict or None
        Dictionary with optional arguments passed to :py:func:`pingouin.logistic_regression`
    return_dist : bool
        If True, the function also returns the indirect bootstrapped beta
        samples (size = n_boot). Can be plotted for instance using
        :py:func:`seaborn.distplot()` or :py:func:`seaborn.kdeplot()`
        functions.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`
        Mediation summary:

        * ``'path'``: regression model
        * ``'coef'``: regression estimates
        * ``'se'``: standard error
        * ``'CI[2.5%]'``: lower confidence interval
        * ``'CI[97.5%]'``: upper confidence interval
        * ``'pval'``: two-sided p-values
        * ``'sig'``: statistical significance

    See also
    --------
    linear_regression, logistic_regression

    Notes
    -----
    Mediation analysis [1]_ is a *"statistical procedure to test
    whether the effect of an independent variable X on a dependent variable
    Y (i.e., X → Y) is at least partly explained by a chain of effects of the
    independent variable on an intervening mediator variable M and of the
    intervening variable on the dependent variable (i.e., X → M → Y)"* [2]_.

    The **indirect effect** (also referred to as average causal mediation
    effect or ACME) of X on Y through mediator M quantifies the estimated
    difference in Y resulting from a one-unit change in X through a sequence of
    causal steps in which X affects M, which in turn affects Y.
    It is considered significant if the specified confidence interval does not
    include 0. The path 'X --> Y' is the sum of both the indirect and direct
    effect. It is sometimes referred to as total effect.

    A linear regression is used if the mediator variable is continuous and a
    logistic regression if the mediator variable is dichotomous (binary).
    Multiple parallel mediators are also supported.

    This function will only work well if the outcome variable is continuous.
    It does not support binary or ordinal outcome variable. For more
    advanced mediation models, please refer to the
    `lavaan <http://lavaan.ugent.be/tutorial/mediation.html>`_ or  `mediation
    <https://cran.r-project.org/web/packages/mediation/mediation.pdf>`_ R
    packages, or the `PROCESS macro
    <https://www.processmacro.org/index.html>`_ for SPSS.

    The two-sided p-value of the indirect effect is computed using the
    bootstrap distribution, as in the mediation R package. However, the p-value
    should be interpreted with caution since it is not constructed
    conditioned on a true null hypothesis [3]_ and varies depending on the
    number of bootstrap samples and the random seed.

    Note that rows with missing values are automatically removed.

    Results have been tested against the R mediation package and this tutorial
    https://data.library.virginia.edu/introduction-to-mediation-analysis/

    References
    ----------
    .. [1] Baron, R. M. & Kenny, D. A. The moderator–mediator variable
           distinction in social psychological research: Conceptual, strategic,
           and statistical considerations. J. Pers. Soc. Psychol. 51, 1173–1182
           (1986).

    .. [2] Fiedler, K., Schott, M. & Meiser, T. What mediation analysis can
           (not) do. J. Exp. Soc. Psychol. 47, 1231–1236 (2011).


    .. [3] Hayes, A. F. & Rockwood, N. J. Regression-based statistical
           mediation and moderation analysis in clinical research:
           Observations, recommendations, and implementation. Behav. Res.
           Ther. 98, 39–57 (2017).

    Code originally adapted from https://github.com/rmill040/pymediation.

    Examples
    --------
    1. Simple mediation analysis

    >>> from pingouin import mediation_analysis, read_dataset
    >>> df = read_dataset('mediation')
    >>> mediation_analysis(data=df, x='X', m='M', y='Y', alpha=0.05,
    ...                    seed=42)
           path      coef        se          pval  CI[2.5%]  CI[97.5%]  sig
    0     M ~ X  0.561015  0.094480  4.391362e-08  0.373522   0.748509  Yes
    1     Y ~ M  0.654173  0.085831  1.612674e-11  0.483844   0.824501  Yes
    2     Total  0.396126  0.111160  5.671128e-04  0.175533   0.616719  Yes
    3    Direct  0.039604  0.109648  7.187429e-01 -0.178018   0.257226   No
    4  Indirect  0.356522  0.083313  0.000000e+00  0.219818   0.537654  Yes

    2. Return the indirect bootstrapped beta coefficients

    >>> stats, dist = mediation_analysis(data=df, x='X', m='M', y='Y',
    ...                                  return_dist=True)
    >>> print(dist.shape)
    (500,)

    3. Mediation analysis with a binary mediator variable

    >>> mediation_analysis(data=df, x='X', m='Mbin', y='Y', seed=42).round(3)
           path   coef     se   pval  CI[2.5%]  CI[97.5%]  sig
    0  Mbin ~ X -0.021  0.116  0.857    -0.248      0.206   No
    1  Y ~ Mbin -0.135  0.412  0.743    -0.952      0.682   No
    2     Total  0.396  0.111  0.001     0.176      0.617  Yes
    3    Direct  0.396  0.112  0.001     0.174      0.617  Yes
    4  Indirect  0.002  0.050  0.960    -0.072      0.146   No

    4. Mediation analysis with covariates

    >>> mediation_analysis(data=df, x='X', m='M', y='Y',
    ...                    covar=['Mbin', 'Ybin'], seed=42).round(3)
           path   coef     se   pval  CI[2.5%]  CI[97.5%]  sig
    0     M ~ X  0.559  0.097  0.000     0.367      0.752  Yes
    1     Y ~ M  0.666  0.086  0.000     0.495      0.837  Yes
    2     Total  0.420  0.113  0.000     0.196      0.645  Yes
    3    Direct  0.064  0.110  0.561    -0.155      0.284   No
    4  Indirect  0.356  0.086  0.000     0.209      0.553  Yes

    5. Mediation analysis with multiple parallel mediators

    >>> mediation_analysis(data=df, x='X', m=['M', 'Mbin'], y='Y',
    ...                    seed=42).round(3)
                path   coef     se   pval  CI[2.5%]  CI[97.5%]  sig
    0          M ~ X  0.561  0.094  0.000     0.374      0.749  Yes
    1       Mbin ~ X -0.005  0.029  0.859    -0.063      0.052   No
    2          Y ~ M  0.654  0.086  0.000     0.482      0.825  Yes
    3       Y ~ Mbin -0.064  0.328  0.846    -0.715      0.587   No
    4          Total  0.396  0.111  0.001     0.176      0.617  Yes
    5         Direct  0.040  0.110  0.721    -0.179      0.258   No
    6     Indirect M  0.356  0.085  0.000     0.215      0.538  Yes
    7  Indirect Mbin  0.000  0.010  0.952    -0.017      0.025   No
    zy must be a string or int.z*Mediator(s) must be a list, string or int.NzData must be a DataFrame.z!Cannot have duplicates mediators.zCannot have duplicates covar.zMediator cannot be in covar.c                    s   g | ]}| kqS r   r   r   c)r=   r   r   r     s     z&mediation_analysis.<locals>.<listcomp>zColumn(s) are not in DataFrame.z#Columns must be numeric or boolean.c                    s   g | ]} | j jd kqS )Zbfiu)Zdtypekindr   )r{   r   r   r     s     r      z.DataFrame must have at least 5 samples (rows).r"   Zlogisticr   r%   r&   r   r'   r(   r)   r+   round)ri   ri   z%s ~ X)r   r'   T)ignore_indexc                 S   s   d|  S )NzY ~ %sr   r   r   r   r   <lambda>      z$mediation_analysis.<locals>.<lambda>ZDirectZTotalZYesZNosig)replacerW   )rJ   )Zddofr   ZIndirectc                 S   s   d|  S )NzIndirect %sr   r   r   r   r   r   (  r   F)r   r   r1   pathr/   )/r:   r   r}   rD   r~   typerO   r;   r<   set
isdisjoint_flr0   rI   ZdropnarJ   ZnuniqueZto_numpyr   copy	enumerater
   locr   atrd   applyrA   rQ   randomZRandomStatechoicer|   zerosrK   r   Zstdr   rU   r   maxr   	from_dictrenamerg   r	   rG   )$r{   r   mr.   Zcovarri   Zn_bootseedZreturn_distr   r   r0   err_msgrp   r   rx   ry   r   r   r   r   r   Zold_optionsZsxmr   r   ZsmyZsxydirectrz   rngZab_estimatesr   abZindirectZci_jr   )r{   r=   r   r     s     5
  

,"(

                   
"((


)TNFr   TFF)Fr   TF)r   )r   )
NNNNNr   r   NFN)rR   r\   ZnumpyrA   Zpandasr;   Zpandas_flavorpfZscipy.statsr   r   Zscipy.linalgr   r   Zpingouin.configr   r   r   rF   r   r   r	   __all__r
   re   r   r   r   r   Zregister_dataframe_methodr   r   r   r   r   <module>   sZ   
       
   uX       
  `

'          