U
    Qvfz                     @   s   d dl Zd dlZd dlZd dlmZmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZmZmZ d dlmZ d dlZd	d
ddddgZejdd	 Zejd$dd
ZejddddddddfddZejd%ddZd&d dZejd'd#dZdS )(    N)combinationsproduct)options)anova)	multicomp)compute_effsize)_check_dataframe_flatten_list_postprocess_dataframe)studentized_rangepairwise_ttestspairwise_testsptestspairwise_tukeypairwise_gameshowellpairwise_corrc                  O   s   t dt t| |S )zSThis function has been deprecated . Use :py:func:`pingouin.pairwise_tests` instead.z:pairwise_ttests is deprecated, use pairwise_tests instead.)warningswarnUserWarningr   )argskwargs r   5/tmp/pip-unpacked-wheel-2te3nxqf/pingouin/pairwise.pyr      s    T皙?	two-sidednonehedgesautolistwiseFc           B         s
  ddl m} ddlm}m} t||||d d |dks@tdt|tsRtd|d	ks^td
}d
}d}t|t	rt
|dkrd}d}t fdd|D stn|d }t|t	rt
|dkrd}d}t fdd|D stn|d }t||grtdt|ttfr4|dkr4d}|  ks4tt|ttfrd|dkrdd}|  ksdtt|ttfrt|ttfrd}t|  k|  kgstdddddddddd d!d"d#d$d%d&d'd(d)|
g}|dk	r,|dkr|n||g} j|||dd*}|d+kr| }|jd
|d,  |d-kr||dkrDdnd
}|dkrV|n|} j|ddd.| }|j }t
|d/krt	t|d/}t|}|dddf }|dddf }ntd0tjtjtt
||d1} ddddd%d(d)g}!d dg}"| |! t| |!< | |" t| |"< || j dddf< || j dddf< || j dddf< || j ddd%f< || j dddf< t!" }#dt!d2< t| j#d D ]t}$| j$|$df | j$|$df  }%}&|%|%j&tjd3}'|%|&j&tjd3}(|r*d!})||'|(|||d4}*|*j$d5 | j$|$d)f< |*j$d6 | j$|$d$f< n,|rDd#})||'|(|d7}*nd"})||'|(|d7}*t!'|# t(|'|(|
|d8}+|rt)|'| j$|$df< t)|(| j$|$df< tj*|'dd9| j$|$df< tj*|(dd9| j$|$df< |*|) j+d | j$|$|)f< |*d: j+d | j$|$d&f< |+| j$|$|
f< q| d& j,dkr$dn|	}	|	dk	rh|	- d;krxt.| d& & ||	d<\},| d'< |	| d(< nd| d'< d| d(< n|dkr|}-|-}.ddg}/d
}d
d
g}0nx|dkr|}-ddg}.|-}/d}ddg}0nP|r||g}-d|g}.|dg}/d
}d
dg}0n$||g}-|dg}.d|g}/d}dd
g}0t } t/|-D ]\}$}1t|0|$ |grf j||1gd
ddd=j0dd>}2n }2t1||.|$ |/|$ ||2|||||	|
|||d?}3tj2| |3gddd
d@} q,|
r| j#d }4|dk	r؈ 3|4   j|-d dddA| }5 j|-d dddA| }6 j|-dddA| }7|5j }8|6j }9t	t|9d/}:t	t5|8|:};t
|;}<tj6|<dBftdC}t|<D ]}$t7|;|$ ddD||$< qnt8|4|4|< }=| 9| j:;|=} |-d dE |-d  | j |=df< |dddf | j |=df< || j |=df< || j |=d%f< |dddf | j |=df< |ddd/f | j |=df< t!" }#dt!d2< t/|D ]t\}$}>|4|$ }?|>\}@}%}&|7%|@|%fj&tjd3}'|7%|@|&fj&tjd3}(t(|'|(|
|d8}+|rd!})||'|(|||d4}*|*j$d5 | j$|?d)f< |*j$d6 | j$|?d$f< n,|	rd#})||'|(|d7}*nd"})||'|(|d7}*t!'|# |	rt)|'| j$|?df< t)|(| j$|?df< tj*|'dd9| j$|?df< tj*|(dd9| j$|?df< |*|) j+d | j$|?|)f< |*d: j+d | j$|?d&f< |+| j$|?|
f< qL|	dk	
r|	- d;k
rt.| j |=d&f & ||	d<\},}A|A| j |=d'f< |	| j |=d(f< || j ddd f< | t|t<|| j=  } | jdddF} |dGk
r|
r| d j>dHddI | j?d|-d iddJ t@| S )Ka.  Pairwise tests.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        DataFrame. Note that this function can also directly be used as a
        Pandas method, in which case this argument is no longer needed.
    dv : string
        Name of column containing the dependent variable.
    between : string or list with 2 elements
        Name of column(s) containing the between-subject factor(s).
    within : string or list with 2 elements
        Name of column(s) containing the within-subject factor(s), i.e. the
        repeated measurements.
    subject : string
        Name of column containing the subject identifier. This is mandatory
        when ``within`` is specified.
    parametric : boolean
        If True (default), use the parametric :py:func:`ttest` function.
        If False, use :py:func:`pingouin.wilcoxon` or :py:func:`pingouin.mwu`
        for paired or unpaired samples, respectively.
    marginal : boolean
        If True (default), the between-subject pairwise T-test(s) will be calculated
        after averaging across all levels of the within-subject factor in mixed
        design. This is recommended to avoid violating the assumption of
        independence and conflating the degrees of freedom by the
        number of repeated measurements.

        .. versionadded:: 0.3.2
    alpha : float
        Significance level
    alternative : string
        Defines the alternative hypothesis, or tail of the test. Must be one of
        "two-sided" (default), "greater" or "less". Both "greater" and "less" return one-sided
        p-values. "greater" tests against the alternative hypothesis that the mean of ``x``
        is greater than the mean of ``y``.
    padjust : string
        Method used for testing and adjustment of pvalues.

        * ``'none'``: no correction
        * ``'bonf'``: one-step Bonferroni correction
        * ``'sidak'``: one-step Sidak correction
        * ``'holm'``: step-down method using Bonferroni adjustments
        * ``'fdr_bh'``: Benjamini/Hochberg FDR correction
        * ``'fdr_by'``: Benjamini/Yekutieli FDR correction
    effsize : string or None
        Effect size type. Available methods are:

        * ``'none'``: no effect size
        * ``'cohen'``: Unbiased Cohen d
        * ``'hedges'``: Hedges g
        * ``'r'``: Pearson correlation coefficient
        * ``'eta-square'``: Eta-square
        * ``'odds-ratio'``: Odds ratio
        * ``'AUC'``: Area Under the Curve
        * ``'CLES'``: Common Language Effect Size
    correction : string or boolean
        For independent two sample T-tests, specify whether or not to correct for
        unequal variances using Welch separate variances T-test. If `'auto'`,
        it will automatically uses Welch T-test when the sample sizes are
        unequal, as recommended by Zimmerman 2004.

        .. versionadded:: 0.3.2
    nan_policy : string
        Can be `'listwise'` for listwise deletion of missing values in repeated
        measures design (= complete-case analysis) or `'pairwise'` for the
        more liberal pairwise deletion (= available-case analysis). The former (default) is more
        appropriate for post-hoc analysis following an ANOVA, however it can drastically reduce
        the power of the test: any subject with one or more missing value(s) will be
        completely removed from the analysis.

        .. versionadded:: 0.2.9
    return_desc : boolean
        If True, append group means and std to the output dataframe
    interaction : boolean
        If there are multiple factors and ``interaction`` is True (default),
        Pingouin will also calculate T-tests for the interaction term (see Notes).

        .. versionadded:: 0.2.9
    within_first : boolean
        Determines the order of the interaction in mixed design. Pingouin will
        return within * between when this parameter is set to True (default),
        and between * within otherwise.

        .. versionadded:: 0.3.6

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'Contrast'``: Contrast (= independent variable or interaction)
        * ``'A'``: Name of first measurement
        * ``'B'``: Name of second measurement
        * ``'Paired'``: indicates whether the two measurements are paired or
          independent
        * ``'Parametric'``: indicates if (non)-parametric tests were used
        * ``'T'``: T statistic (only if parametric=True)
        * ``'U-val'``: Mann-Whitney U stat (if parametric=False and unpaired
          data)
        * ``'W-val'``: Wilcoxon W stat (if parametric=False and paired data)
        * ``'dof'``: degrees of freedom (only if parametric=True)
        * ``'alternative'``: tail of the test
        * ``'p-unc'``: Uncorrected p-values
        * ``'p-corr'``: Corrected p-values
        * ``'p-adjust'``: p-values correction method
        * ``'BF10'``: Bayes Factor
        * ``'hedges'``: effect size (or any effect size defined in
          ``effsize``)

    See also
    --------
    ttest, mwu, wilcoxon, compute_effsize, multicomp

    Notes
    -----
    Data are expected to be in long-format. If your data is in wide-format,
    you can use the :py:func:`pandas.melt` function to convert from wide to
    long format.

    If ``between`` or ``within`` is a list (e.g. ['col1', 'col2']),
    the function returns 1) the pairwise T-tests between each values of the
    first column, 2) the pairwise T-tests between each values of the second
    column and 3) the interaction between col1 and col2. The interaction is
    dependent of the order of the list, so ['col1', 'col2'] will not yield the
    same results as ['col2', 'col1']. Furthermore, the interaction will only be
    calculated if ``interaction=True``.

    If ``between`` is a list with two elements, the output
    model is between1 + between2 + between1 * between2.

    Similarly, if ``within`` is a list with two elements, the output model is
    within1 + within2 + within1 * within2.

    If both ``between`` and ``within`` are specified, the output model is
    within + between + within * between (= mixed design), unless
    ``within_first=False`` in which case the model becomes between + within +
    between * within.

    Missing values in repeated measurements are automatically removed using a
    listwise (default) or pairwise deletion strategy. The former is more conservative, as any
    subject with one or more missing value(s) will be completely removed from the dataframe prior
    to calculating the T-tests. The ``nan_policy`` parameter can therefore have a huge impact
    on the results.

    Examples
    --------
    For more examples, please refer to the `Jupyter notebooks
    <https://github.com/raphaelvallat/pingouin/blob/master/notebooks/01_ANOVA.ipynb>`_

    1. One between-subject factor

    >>> import pandas as pd
    >>> import pingouin as pg
    >>> pd.set_option('display.expand_frame_repr', False)
    >>> pd.set_option('display.max_columns', 20)
    >>> df = pg.read_dataset('mixed_anova.csv')
    >>> pg.pairwise_tests(dv='Scores', between='Group', data=df).round(3)
      Contrast        A           B  Paired  Parametric     T    dof alternative  p-unc   BF10  hedges
    0    Group  Control  Meditation   False        True -2.29  178.0   two-sided  0.023  1.813   -0.34

    2. One within-subject factor

    >>> post_hocs = pg.pairwise_tests(dv='Scores', within='Time', subject='Subject', data=df)
    >>> post_hocs.round(3)
      Contrast        A        B  Paired  Parametric      T   dof alternative  p-unc   BF10  hedges
    0     Time   August  January    True        True -1.740  59.0   two-sided  0.087  0.582  -0.328
    1     Time   August     June    True        True -2.743  59.0   two-sided  0.008  4.232  -0.483
    2     Time  January     June    True        True -1.024  59.0   two-sided  0.310  0.232  -0.170

    3. Non-parametric pairwise paired test (wilcoxon)

    >>> pg.pairwise_tests(dv='Scores', within='Time', subject='Subject',
    ...                    data=df, parametric=False).round(3)
      Contrast        A        B  Paired  Parametric  W-val alternative  p-unc  hedges
    0     Time   August  January    True       False  716.0   two-sided  0.144  -0.328
    1     Time   August     June    True       False  564.0   two-sided  0.010  -0.483
    2     Time  January     June    True       False  887.0   two-sided  0.840  -0.170

    4. Mixed design (within and between) with bonferroni-corrected p-values

    >>> posthocs = pg.pairwise_tests(dv='Scores', within='Time', subject='Subject',
    ...                               between='Group', padjust='bonf', data=df)
    >>> posthocs.round(3)
           Contrast     Time        A           B Paired  Parametric      T   dof alternative  p-unc  p-corr p-adjust   BF10  hedges
    0          Time        -   August     January   True        True -1.740  59.0   two-sided  0.087   0.261     bonf  0.582  -0.328
    1          Time        -   August        June   True        True -2.743  59.0   two-sided  0.008   0.024     bonf  4.232  -0.483
    2          Time        -  January        June   True        True -1.024  59.0   two-sided  0.310   0.931     bonf  0.232  -0.170
    3         Group        -  Control  Meditation  False        True -2.248  58.0   two-sided  0.028     NaN      NaN  2.096  -0.573
    4  Time * Group   August  Control  Meditation  False        True  0.316  58.0   two-sided  0.753   1.000     bonf  0.274   0.081
    5  Time * Group  January  Control  Meditation  False        True -1.434  58.0   two-sided  0.157   0.471     bonf  0.619  -0.365
    6  Time * Group     June  Control  Meditation  False        True -2.744  58.0   two-sided  0.008   0.024     bonf  5.593  -0.699

    5. Two between-subject factors. The order of the ``between`` factors matters!

    >>> pg.pairwise_tests(dv='Scores', between=['Group', 'Time'], data=df).round(3)
           Contrast       Group        A           B Paired  Parametric      T    dof alternative  p-unc     BF10  hedges
    0         Group           -  Control  Meditation  False        True -2.290  178.0   two-sided  0.023    1.813  -0.340
    1          Time           -   August     January  False        True -1.806  118.0   two-sided  0.074    0.839  -0.328
    2          Time           -   August        June  False        True -2.660  118.0   two-sided  0.009    4.499  -0.483
    3          Time           -  January        June  False        True -0.934  118.0   two-sided  0.352    0.288  -0.170
    4  Group * Time     Control   August     January  False        True -0.383   58.0   two-sided  0.703    0.279  -0.098
    5  Group * Time     Control   August        June  False        True -0.292   58.0   two-sided  0.771    0.272  -0.074
    6  Group * Time     Control  January        June  False        True  0.045   58.0   two-sided  0.964    0.263   0.011
    7  Group * Time  Meditation   August     January  False        True -2.188   58.0   two-sided  0.033    1.884  -0.558
    8  Group * Time  Meditation   August        June  False        True -4.040   58.0   two-sided  0.000  148.302  -1.030
    9  Group * Time  Meditation  January        June  False        True -1.442   58.0   two-sided  0.155    0.625  -0.367

    6. Same but without the interaction, and using a directional test

    >>> df.pairwise_tests(dv='Scores', between=['Group', 'Time'], alternative="less",
    ...                    interaction=False).round(3)
      Contrast        A           B  Paired  Parametric      T    dof alternative  p-unc   BF10  hedges
    0    Group  Control  Meditation   False        True -2.290  178.0        less  0.012  3.626  -0.340
    1     Time   August     January   False        True -1.806  118.0        less  0.037  1.679  -0.328
    2     Time   August        June   False        True -2.660  118.0        less  0.004  8.998  -0.483
    3     Time  January        June   False        True -0.934  118.0        less  0.176  0.577  -0.170
       )ttest)wilcoxonmwuall)dvbetweenwithinsubjecteffectsdatar   ZgreaterZlessFAlternative must be one of 'two-sided' (default), 'greater' or 'less'.zalpha must be float.r   pairwiseFNTmultiple_betweenc                    s   g | ]}|   kqS r   keys).0br)   r   r   
<listcomp>#  s     z"pairwise_tests.<locals>.<listcomp>r   multiple_withinc                    s   g | ]}|   kqS r   r/   )r1   wr3   r   r   r4   *  s     zXMultiple between and within factors are currently not supported. Please select only one.simple_betweensimple_withinwithin_betweenZContrastZTimeABmean(A)zstd(A)mean(B)zstd(B)ZPairedZ
ParametricTzU-valzW-valdofalternativep-uncp-corrp-adjustBF10)indexcolumnsvaluesobservedr   )ignore_indexZ
value_name)r8   r7   )sortrH      z-Columns must have at least two unique values.)dtyperE   rF   round)rL   )pairedr@   
correction)T-testrD   )rP   r?   )r@   )xyeftyperN   )Zddofp-valr   alphamethod)Zas_indexrH   rJ   Znumeric_only)r$   r%   r&   r'   r)   
parametricmarginalrV   r@   padjusteffsizerO   
nan_policyreturn_desc)axisrI   rJ   )rH   rJ      )shaperL   )Zinclude_tuplez * howr_   )r5   r.   r9   -)inplace)rF   re   )ArY   r    Znonparametricr!   r"   r   AssertionError
isinstancefloatlistlenr#   
ValueErrorstrintr0   Zpivot_tabledropnaZmeltreset_indexgroupbygroupsr   nparraypd	DataFramefloat64rangeastypeobjectboollocr   copyra   at	get_groupto_numpyupdater   ZnanmeanZnanstdZiatsizelowerr   	enumeratemeanr   concatZ	set_indexZ
sort_indexr   zerosr	   arangereindexrE   unionisinrF   Zfillnarenamer
   )Br)   r$   r%   r&   r'   rY   rZ   rV   r@   r[   r\   rO   r]   r^   ZinteractionZwithin_firstr    r!   r"   r.   r5   Zcontrast	col_orderZidx_pivZdata_pivrN   colZgrp_collabelscombsr:   r;   statsZcols_strZ	cols_boolold_optionsicol1col2rQ   rR   Z	stat_nameZdf_ttestef_ZfactorsZfbtZfwtZaggftmpptZnrowsZgrp_fac1Zgrp_fac2Zgrp_bothZlabels_fac1Zlabels_fac2Z	comb_fac2Z
combs_listZncombsZidxitercombZicZfac1Zpcorr   r3   r   r       s    m     


 "




    


  










    
  
r`   z***z***)gMbP?g{Gz?r   c                    s  ddl m} ddlm} ddlm ddlm}	m}
 tt	sFt
dt tsXt
d|rb|
}n|	}| j}t||d}tj||tjd	}| }|D ]N\}}|| | | | f|d
di\}}t| |j||f< ||j||f< q|dk	r(| ||dd }t|d|dd | ||dd< |t}t| d fdd}|r`||}n| fdd}| ||dd | ||dd< |S )a  
    Pairwise T-test between columns of a dataframe.

    T-values are reported on the lower triangle of the output pairwise matrix and p-values on the
    upper triangle. This method is a faster, but less exhaustive, matrix-version of the
    :py:func:`pingouin.pairwise_test` function. Missing values are automatically removed from each
    pairwise T-test.

    .. versionadded:: 0.5.3

    Parameters
    ----------
    self : :py:class:`pandas.DataFrame`
        Input dataframe.
    paired : boolean
        Specify whether the two observations are related (i.e. repeated measures) or independent.
    decimals : int
        Number of decimals to display in the output matrix.
    padjust : string or None
        P-values adjustment for multiple comparison

        * ``'none'``: no correction
        * ``'bonf'``: one-step Bonferroni correction
        * ``'sidak'``: one-step Sidak correction
        * ``'holm'``: step-down method using Bonferroni adjustments
        * ``'fdr_bh'``: Benjamini/Hochberg FDR correction
        * ``'fdr_by'``: Benjamini/Yekutieli FDR correction
    stars : boolean
        If True, only significant p-values are displayed as stars using the pre-defined thresholds
        of ``pval_stars``. If False, all the raw p-values are displayed.
    pval_stars : dict
        Significance thresholds. Default is 3 stars for p-values <0.001, 2 stars for
        p-values <0.01 and 1 star for p-values <0.05.
    **kwargs : optional
        Optional argument(s) passed to the lower-level scipy functions, i.e.
        :py:func:`scipy.stats.ttest_ind` for independent T-test and
        :py:func:`scipy.stats.ttest_rel` for paired T-test.

    Returns
    -------
    mat : :py:class:`pandas.DataFrame`
        Pairwise T-test matrix, of dtype str, with T-values on the lower triangle and p-values on
        the upper triangle.

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> import pingouin as pg
    >>> # Load an example dataset of personality dimensions
    >>> df = pg.read_dataset('pairwise_corr').iloc[:30, 1:]
    >>> df.columns = ["N", "E", "O", 'A', "C"]
    >>> # Add some missing values
    >>> df.iloc[[2, 5, 20], 2] = np.nan
    >>> df.iloc[[1, 4, 10], 3] = np.nan
    >>> df.head().round(2)
          N     E     O     A     C
    0  2.48  4.21  3.94  3.96  3.46
    1  2.60  3.19  3.96   NaN  3.23
    2  2.81  2.90   NaN  2.75  3.50
    3  2.90  3.56  3.52  3.17  2.79
    4  3.02  3.33  4.02   NaN  2.85

    Independent pairwise T-tests

    >>> df.ptests()
            N       E      O      A    C
    N       -     ***    ***    ***  ***
    E  -8.397       -                ***
    O  -8.332  -0.596      -         ***
    A  -8.804    0.12   0.72      -  ***
    C  -4.759   3.753  4.074  3.787    -

    Let's compare with SciPy

    >>> from scipy.stats import ttest_ind
    >>> np.round(ttest_ind(df["N"], df["E"]), 3)
    array([-8.397,  0.   ])

    Passing custom parameters to the lower-level :py:func:`scipy.stats.ttest_ind` function

    >>> df.ptests(alternative="greater", equal_var=True)
            N       E      O      A    C
    N       -
    E  -8.397       -                ***
    O  -8.332  -0.596      -         ***
    A  -8.804    0.12   0.72      -  ***
    C  -4.759   3.753  4.074  3.787    -

    Paired T-test, showing the actual p-values instead of stars

    >>> df.ptests(paired=True, stars=False, decimals=4)
            N        E       O       A       C
    N        -   0.0000  0.0000  0.0000  0.0002
    E  -7.0773        -  0.8776  0.7522  0.0012
    O  -8.0568  -0.1555       -  0.8137  0.0008
    A  -8.3994   0.3191  0.2383       -  0.0009
    C  -4.2511   3.5953  3.7849  3.7652       -

    Adjusting for multiple comparisons using the Holm-Bonferroni method

    >>> df.ptests(paired=True, stars=False, padjust="holm")
            N       E      O      A      C
    N       -   0.000  0.000  0.000  0.001
    E  -7.077       -     1.     1.  0.005
    O  -8.057  -0.155      -     1.  0.005
    A  -8.399   0.319  0.238      -  0.005
    C  -4.251   3.595  3.785  3.765      -
    r   )r   )triu_indices_from)format_float_positional)	ttest_ind	ttest_relz pval_stars must be a dictionary.zdecimals must be an int.rK   )rF   rE   rL   r]   ZomitNr   )kr   rU   rd   c                    s&      D ]\}}| |k r|  S qdS )N )items)rQ   keyvalue)
pval_starsr   r   replace_pval  s    
zptests.<locals>.replace_pvalc                    s   |  dS )N)Z	precisionr   )rQ   )decimalsffpr   r   <lambda>      zptests.<locals>.<lambda>)	itertoolsr   numpyr   r   scipy.statsr   r   rg   dictrf   rm   rF   ri   rt   ru   rr   rv   r|   rM   r{   r   r   rx   rl   Zfill_diagonalZapplymap)selfrN   r   r[   Zstarsr   r   r   Ztifr   r   funccolsr   matZ	mat_upperar2   tpZpvalsr   r   )r   r   r   r   r   W  s8    w"
"
$c                 C   s  t  }dt d< t|| |dd}t | |jd }|jd d }| j|dd| }tt|j	
 }	|  }
|jdd	 }|jd
 |
 }tttt|dj\}}|| ||  }t|| ||  }|| }ttdt| ||}t|dd}g }t||D ]4\}}|t||	| ||	| d|d qtd|	| d|	| d|| d|| d|d|d|d|||i	}t|S )aA  Pairwise Tukey-HSD post-hoc test.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        DataFrame. Note that this function can also directly be used as a Pandas method, in which
        case this argument is no longer needed.
    dv : string
        Name of column containing the dependent variable.
    between: string
        Name of column containing the between factor.
    effsize : string or None
        Effect size type. Available methods are:

        * ``'none'``: no effect size
        * ``'cohen'``: Unbiased Cohen d
        * ``'hedges'``: Hedges g
        * ``'r'``: Pearson correlation coefficient
        * ``'eta-square'``: Eta-square
        * ``'odds-ratio'``: Odds ratio
        * ``'AUC'``: Area Under the Curve
        * ``'CLES'``: Common Language Effect Size

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'A'``: Name of first measurement
        * ``'B'``: Name of second measurement
        * ``'mean(A)'``: Mean of first measurement
        * ``'mean(B)'``: Mean of second measurement
        * ``'diff'``: Mean difference (= mean(A) - mean(B))
        * ``'se'``: Standard error
        * ``'T'``: T-values
        * ``'p-tukey'``: Tukey-HSD corrected p-values
        * ``'hedges'``: Hedges effect size (or any effect size defined in
          ``effsize``)

    See also
    --------
    pairwise_tests, pairwise_gameshowell

    Notes
    -----
    Tukey HSD post-hoc [1]_ is best for balanced one-way ANOVA.

    It has been proven to be conservative for one-way ANOVA with unequal sample sizes. However, it
    is not robust if the groups have unequal variances, in which case the Games-Howell test is
    more adequate. Tukey HSD is not valid for repeated measures ANOVA. Only one-way ANOVA design
    are supported.

    The T-values are defined as:

    .. math::

        t = \frac{\overline{x}_i - \overline{x}_j}
        {\sqrt{2 \cdot \text{MS}_w / n}}

    where :math:`\overline{x}_i` and :math:`\overline{x}_j` are the means of the first and
    second group, respectively, :math:`\text{MS}_w` the mean squares of the error (computed using
    ANOVA) and :math:`n` the sample size.

    If the sample sizes are unequal, the Tukey-Kramer procedure is automatically used:

    .. math::

        t = \frac{\overline{x}_i - \overline{x}_j}{\sqrt{\frac{MS_w}{n_i}
        + \frac{\text{MS}_w}{n_j}}}

    where :math:`n_i` and :math:`n_j` are the sample sizes of the first and second group,
    respectively.

    The p-values are then approximated using the Studentized range distribution
    :math:`Q(\sqrt2|t_i|, r, N - r)` where :math:`r` is the total number of groups and
    :math:`N` is the total sample size.

    References
    ----------
    .. [1] Tukey, John W. "Comparing individual means in the analysis of
           variance." Biometrics (1949): 99-114.

    .. [2] Gleason, John R. "An accurate, non-iterative approximation for
           studentized range quantiles." Computational statistics & data
           analysis 31.2 (1999): 147-158.

    Examples
    --------
    Pairwise Tukey post-hocs on the Penguins dataset.

    >>> import pingouin as pg
    >>> df = pg.read_dataset('penguins')
    >>> df.pairwise_tukey(dv='body_mass_g', between='species').round(3)
               A          B   mean(A)   mean(B)      diff      se       T  p-tukey  hedges
    0     Adelie  Chinstrap  3700.662  3733.088   -32.426  67.512  -0.480    0.881  -0.074
    1     Adelie     Gentoo  3700.662  5076.016 -1375.354  56.148 -24.495    0.000  -2.860
    2  Chinstrap     Gentoo  3733.088  5076.016 -1342.928  69.857 -19.224    0.000  -2.875
    NrM   T)r$   r)   r%   Zdetailed)r   DF)r   r   r   rH   rX   )r   ZMSrK   r   FrN   rS   r:   r;   r<   r=   diffser>   zp-tukey)r   r|   r   r   r}   rp   rr   rs   ri   rq   r0   countr   r   r   r   r>   sqrtr   sfabsclipzipappendr   r~   rt   ru   r
   )r)   r$   r%   r\   r   Zaovdfnggrpr   ngmeansZgvarg1g2mnr   tvalpvalr   idx_aidx_br   r   r   r   r     sb    e

 
         c                 C   s  t ||d| d} | jdd} | |  }| j|dd| }tt|j }|	 
 }|jdd
 }|jdd
 }	tttt|dj\}
}||
 ||  }t|	|
 ||
  |	| ||   }|t|	|
 ||
  |	| ||    }|	|
 ||
  |	| ||   d |	|
 ||
  d ||
 d  |	| ||  d || d    }ttdt| ||}t|d	d}g }t|
|D ]4\}}|t||| ||| d
|d qtd||
 d|| d||
 d|| d|d|d|d|d|||i
}t|S )a  Pairwise Games-Howell post-hoc test.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        DataFrame
    dv : string
        Name of column containing the dependent variable.
    between: string
        Name of column containing the between factor.
    effsize : string or None
        Effect size type. Available methods are:

        * ``'none'``: no effect size
        * ``'cohen'``: Unbiased Cohen d
        * ``'hedges'``: Hedges g
        * ``'r'``: Pearson correlation coefficient
        * ``'eta-square'``: Eta-square
        * ``'odds-ratio'``: Odds ratio
        * ``'AUC'``: Area Under the Curve
        * ``'CLES'``: Common Language Effect Size

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`
        Stats summary:

        * ``'A'``: Name of first measurement
        * ``'B'``: Name of second measurement
        * ``'mean(A)'``: Mean of first measurement
        * ``'mean(B)'``: Mean of second measurement
        * ``'diff'``: Mean difference (= mean(A) - mean(B))
        * ``'se'``: Standard error
        * ``'T'``: T-values
        * ``'df'``: adjusted degrees of freedom
        * ``'pval'``: Games-Howell corrected p-values
        * ``'hedges'``: Hedges effect size (or any effect size defined in
          ``effsize``)

    See also
    --------
    pairwise_tests, pairwise_tukey

    Notes
    -----
    Games-Howell [1]_ is very similar to the Tukey HSD post-hoc test but is much more robust to
    heterogeneity of variances. While the Tukey-HSD post-hoc is optimal after a classic one-way
    ANOVA, the Games-Howell is optimal after a Welch ANOVA. Please note that Games-Howell
    is not valid for repeated measures ANOVA. Only one-way ANOVA design are supported.

    Compared to the Tukey-HSD test, the Games-Howell test uses different pooled variances for
    each pair of variables instead of the same pooled variance.

    The T-values are defined as:

    .. math::

        t = \frac{\overline{x}_i - \overline{x}_j}
        {\sqrt{(\frac{s_i^2}{n_i} + \frac{s_j^2}{n_j})}}

    and the corrected degrees of freedom are:

    .. math::

        v = \frac{(\frac{s_i^2}{n_i} + \frac{s_j^2}{n_j})^2}
        {\frac{(\frac{s_i^2}{n_i})^2}{n_i-1} +
        \frac{(\frac{s_j^2}{n_j})^2}{n_j-1}}

    where :math:`\overline{x}_i`, :math:`s_i^2`, and :math:`n_i` are the mean, variance and sample
    size of the first group and :math:`\overline{x}_j`, :math:`s_j^2`, and :math:`n_j` the mean,
    variance and sample size of the second group.

    The p-values are then approximated using the Studentized range distribution
    :math:`Q(\sqrt2|t_i|, r, v_i)`.

    References
    ----------
    .. [1] Games, Paul A., and John F. Howell. "Pairwise multiple comparison
           procedures with unequal n's and/or variances: a Monte Carlo study."
           Journal of Educational Statistics 1.2 (1976): 113-125.

    .. [2] Gleason, John R. "An accurate, non-iterative approximation for
           studentized range quantiles." Computational statistics & data
           analysis 31.2 (1999): 147-158.

    Examples
    --------
    Pairwise Games-Howell post-hocs on the Penguins dataset.

    >>> import pingouin as pg
    >>> df = pg.read_dataset('penguins')
    >>> pg.pairwise_gameshowell(data=df, dv='body_mass_g',
    ...                         between='species').round(3)
               A          B   mean(A)   mean(B)      diff      se       T       df  pval  hedges
    0     Adelie  Chinstrap  3700.662  3733.088   -32.426  59.706  -0.543  152.455  0.85  -0.074
    1     Adelie     Gentoo  3700.662  5076.016 -1375.354  58.811 -23.386  249.643  0.00  -2.860
    2  Chinstrap     Gentoo  3733.088  5076.016 -1342.928  65.103 -20.628  170.404  0.00  -2.875
    r%   )r$   r%   r(   r)   TZdropr   rX   rK   r   r   Fr   r:   r;   r<   r=   r   r   r>   r   r   )r   ro   nuniquerp   rr   rs   ri   rq   r0   r   r   r   varr   r   r>   r   r   r   r   r   r   r   r   r~   rt   ru   r
   )r)   r$   r%   r\   r   r   r   r   r   gvarsr   r   r   r   r   r   r   r   r   r   r   r   r   r   r     sf    d &*">
          pearsonr-   c                    s  ddl m}m} |dks td|dks,t|  } | jdd| jddd	kf } | j t	 t
tfrp g ttfffd
d	t	| jtjrd}	 dk	rt td}
tdd |
D stnd}	 dkrttd	}nDt	 d ttjfrfdd d D t dkr( d t	 d ttjfrdt d rdfdd d D }nfddD }tt|}nt dkr؈ d krd d  }t| fddD }tt |}nPfdd D  t dkr fddD }tt |}ntt d	}t|}t|dkrHtd|	rtt|ddddf |ddddf }tt|ddddf |ddddf }n |dddf }|dddf }tj||||dtt|dddddd d!d"d#d$d%gd&}|dk	rt	|t
ttjfs"td't	|t
r6|g}nt	|tjrL| }tfd(d|D sltd)||ddg |jdd*  }|jdd+}|jd dkrtd,|d-krt |ddg !  }|dk	r|"| | | # } t$% }dt$d.< t|jd D ]}|j&|df |j&|df  }}|dkr`|| | ! | | ! ||d/}n|| |||||d0}|j }|D ]}|j&||f |j&||f< qqt$'| |(t)t*t)t)t)d1}|j+d#d2id3}|d2 j,dkrdn|}|dk	r,|- d4kr<t.|d2 ! |d5\}|d6< ||d7< nd|d6< d|d7< dddddd d!d"d2d6d7d$d%g}|j/|d3j#d8dd9}|dk	r|j0d:d;t
|d< t1|S )=a}(  Pairwise (partial) correlations between columns of a pandas dataframe.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        DataFrame. Note that this function can also directly be used as a
        Pandas method, in which case this argument is no longer needed.
    columns : list or str
        Column names in data:

        * ``["a", "b", "c"]``: combination between columns a, b, and c.
        * ``["a"]``: product between a and all the other numeric columns.
        * ``[["a"], ["b", "c"]]``: product between ["a"] and ["b", "c"].
        * ``[["a", "d"], ["b", "c"]]``: product between ["a", "d"] and
          ["b", "c"].
        * ``[["a", "d"], None]``: product between ["a", "d"] and all other
          numeric columns in dataframe.

        If column is None, the function will return the pairwise correlation
        between the combination of all the numeric columns in data.
        See the examples section for more details on this.
    covar : None, string or list
        Covariate(s) for partial correlation. Must be one or more columns
        in data. Use a list if there are more than one covariate. If
        ``covar`` is not None, a partial correlation will be computed using
        :py:func:`pingouin.partial_corr` function.

        .. important:: Only ``method='pearson'`` and ``method='spearman'``
            are currently supported in partial correlation.
    alternative : string
        Defines the alternative hypothesis, or tail of the correlation. Must be one of
        "two-sided" (default), "greater" or "less". Both "greater" and "less" return a one-sided
        p-value. "greater" tests against the alternative hypothesis that the correlation is
        positive (greater than zero), "less" tests against the hypothesis that the correlation is
        negative.
    method : string
        Correlation type:

        * ``'pearson'``: Pearson :math:`r` product-moment correlation
        * ``'spearman'``: Spearman :math:`\rho` rank-order correlation
        * ``'kendall'``: Kendall's :math:`\tau_B` correlation
          (for ordinal data)
        * ``'bicor'``: Biweight midcorrelation (robust)
        * ``'percbend'``: Percentage bend correlation (robust)
        * ``'shepherd'``: Shepherd's pi correlation (robust)
        * ``'skipped'``: Skipped correlation (robust)
    padjust : string
        Method used for testing and adjustment of pvalues.

        * ``'none'``: no correction
        * ``'bonf'``: one-step Bonferroni correction
        * ``'sidak'``: one-step Sidak correction
        * ``'holm'``: step-down method using Bonferroni adjustments
        * ``'fdr_bh'``: Benjamini/Hochberg FDR correction
        * ``'fdr_by'``: Benjamini/Yekutieli FDR correction
    nan_policy : string
        Can be ``'listwise'`` for listwise deletion of missing values
        (= complete-case analysis) or ``'pairwise'`` (default) for the more
        liberal pairwise deletion (= available-case analysis).

        .. versionadded:: 0.2.9

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'X'``: Name(s) of first columns.
        * ``'Y'``: Name(s) of second columns.
        * ``'method'``: Correlation type.
        * ``'covar'``: List of specified covariate(s), only when covariates are passed.
        * ``'alternative'``: Tail of the test.
        * ``'n'``: Sample size (after removal of missing values).
        * ``'r'``: Correlation coefficients.
        * ``'CI95'``: 95% parametric confidence intervals.
        * ``'p-unc'``: Uncorrected p-values.
        * ``'p-corr'``: Corrected p-values.
        * ``'p-adjust'``: P-values correction method.
        * ``'BF10'``: Bayes Factor of the alternative hypothesis (only for Pearson correlation)
        * ``'power'``: achieved power of the test (= 1 - type II error).

    Notes
    -----
    Please refer to the :py:func:`pingouin.corr()` function for a description
    of the different methods. Missing values are automatically removed from the
    data using a pairwise deletion.

    This function is more flexible and gives a much more detailed
    output than the :py:func:`pandas.DataFrame.corr()` method (i.e. p-values,
    confidence interval, Bayes Factor...). This comes however at
    an increased computational cost. While this should not be discernible for
    a dataframe with less than 10,000 rows and/or less than 20 columns, this
    function can be slow for very large datasets.

    A faster alternative to get the r-values and p-values in a matrix format is
    to use the :py:func:`pingouin.rcorr` function, which works directly as a
    :py:class:`pandas.DataFrame` method (see example below).

    This function also works with two-dimensional multi-index columns. In this
    case, columns must be list(s) of tuple(s). Please refer to this `example
    Jupyter notebook
    <https://github.com/raphaelvallat/pingouin/blob/master/notebooks/04_Correlations.ipynb>`_
    for more details.

    If and only if ``covar`` is specified, this function will compute the
    pairwise partial correlation between the variables. If you are only
    interested in computing the partial correlation matrix (i.e. the raw
    pairwise partial correlation coefficient matrix, without the p-values,
    sample sizes, etc), a better alternative is to use the
    :py:func:`pingouin.pcorr` function (see example 7).

    Examples
    --------
    1. One-sided spearman correlation corrected for multiple comparisons

    >>> import pandas as pd
    >>> import pingouin as pg
    >>> pd.set_option('display.expand_frame_repr', False)
    >>> pd.set_option('display.max_columns', 20)
    >>> data = pg.read_dataset('pairwise_corr').iloc[:, 1:]
    >>> pg.pairwise_corr(data, method='spearman', alternative='greater', padjust='bonf').round(3)
                   X                  Y    method alternative    n      r         CI95%  p-unc  p-corr p-adjust  power
    0    Neuroticism       Extraversion  spearman     greater  500 -0.325  [-0.39, 1.0]  1.000   1.000     bonf  0.000
    1    Neuroticism           Openness  spearman     greater  500 -0.028   [-0.1, 1.0]  0.735   1.000     bonf  0.012
    2    Neuroticism      Agreeableness  spearman     greater  500 -0.151  [-0.22, 1.0]  1.000   1.000     bonf  0.000
    3    Neuroticism  Conscientiousness  spearman     greater  500 -0.356  [-0.42, 1.0]  1.000   1.000     bonf  0.000
    4   Extraversion           Openness  spearman     greater  500  0.243   [0.17, 1.0]  0.000   0.000     bonf  1.000
    5   Extraversion      Agreeableness  spearman     greater  500  0.062  [-0.01, 1.0]  0.083   0.832     bonf  0.398
    6   Extraversion  Conscientiousness  spearman     greater  500  0.056  [-0.02, 1.0]  0.106   1.000     bonf  0.345
    7       Openness      Agreeableness  spearman     greater  500  0.170    [0.1, 1.0]  0.000   0.001     bonf  0.985
    8       Openness  Conscientiousness  spearman     greater  500 -0.007  [-0.08, 1.0]  0.560   1.000     bonf  0.036
    9  Agreeableness  Conscientiousness  spearman     greater  500  0.161   [0.09, 1.0]  0.000   0.002     bonf  0.976

    2. Robust two-sided biweight midcorrelation with uncorrected p-values

    >>> pcor = pg.pairwise_corr(data, columns=['Openness', 'Extraversion',
    ...                                        'Neuroticism'], method='bicor')
    >>> pcor.round(3)
                  X             Y method alternative    n      r           CI95%  p-unc  power
    0      Openness  Extraversion  bicor   two-sided  500  0.247    [0.16, 0.33]  0.000  1.000
    1      Openness   Neuroticism  bicor   two-sided  500 -0.028   [-0.12, 0.06]  0.535  0.095
    2  Extraversion   Neuroticism  bicor   two-sided  500 -0.343  [-0.42, -0.26]  0.000  1.000

    3. One-versus-all pairwise correlations

    >>> pg.pairwise_corr(data, columns=['Neuroticism']).round(3)
                 X                  Y   method alternative    n      r           CI95%  p-unc       BF10  power
    0  Neuroticism       Extraversion  pearson   two-sided  500 -0.350  [-0.42, -0.27]  0.000  6.765e+12  1.000
    1  Neuroticism           Openness  pearson   two-sided  500 -0.010    [-0.1, 0.08]  0.817      0.058  0.056
    2  Neuroticism      Agreeableness  pearson   two-sided  500 -0.134  [-0.22, -0.05]  0.003      5.122  0.854
    3  Neuroticism  Conscientiousness  pearson   two-sided  500 -0.368  [-0.44, -0.29]  0.000  2.644e+14  1.000

    4. Pairwise correlations between two lists of columns (cartesian product)

    >>> columns = [['Neuroticism', 'Extraversion'], ['Openness']]
    >>> pg.pairwise_corr(data, columns).round(3)
                  X         Y   method alternative    n      r         CI95%  p-unc       BF10  power
    0   Neuroticism  Openness  pearson   two-sided  500 -0.010  [-0.1, 0.08]  0.817      0.058  0.056
    1  Extraversion  Openness  pearson   two-sided  500  0.267  [0.18, 0.35]  0.000  5.277e+06  1.000

    5. As a Pandas method

    >>> pcor = data.pairwise_corr(covar='Neuroticism', method='spearman')

    6. Pairwise partial correlation

    >>> pg.pairwise_corr(data, covar=['Neuroticism', 'Openness'])
                   X                  Y   method                        covar alternative    n         r          CI95%     p-unc
    0   Extraversion      Agreeableness  pearson  ['Neuroticism', 'Openness']   two-sided  500 -0.038737  [-0.13, 0.05]  0.388361
    1   Extraversion  Conscientiousness  pearson  ['Neuroticism', 'Openness']   two-sided  500 -0.071427  [-0.16, 0.02]  0.111389
    2  Agreeableness  Conscientiousness  pearson  ['Neuroticism', 'Openness']   two-sided  500  0.123108   [0.04, 0.21]  0.005944

    7. Pairwise partial correlation matrix using :py:func:`pingouin.pcorr`

    >>> data[['Neuroticism', 'Openness', 'Extraversion']].pcorr().round(3)
                  Neuroticism  Openness  Extraversion
    Neuroticism         1.000     0.092        -0.360
    Openness            0.092     1.000         0.281
    Extraversion       -0.360     0.281         1.000

    8. Correlation matrix with p-values using :py:func:`pingouin.rcorr`

    >>> data[['Neuroticism', 'Openness', 'Extraversion']].rcorr()
                 Neuroticism Openness Extraversion
    Neuroticism            -                   ***
    Openness           -0.01        -          ***
    Extraversion       -0.35    0.267            -
    r   )corrpartial_corrr*   r+   r,   NT)rn   rK   c                 3   s0   t | |r&| D ]} ||E dH  qn| V  dS )zbHelper function to flatten nested lists.
        From https://stackoverflow.com/a/6340578
        N)rg   )o
tree_typesr   )traverser   r   r   +  s    
zpairwise_corr.<locals>.traverse)r   c                 s   s    | ]}t |ttd fV  qd S )N)rg   tupletyper1   cr   r   r   	<genexpr>=  s     z pairwise_corr.<locals>.<genexpr>Fc                    s   g | ]}| kr|qS r   r   r1   er/   r   r   r4   H  s      z!pairwise_corr.<locals>.<listcomp>r   c                    s   g | ]}| kr|qS r   r   r   r/   r   r   r4   N  s      c                    s   g | ]}| kr|qS r   r   r   )group1r   r   r4   Q  s      z&"%s" is not in data or is not numeric.c                    s   g | ]}| d  kr|qS r   r   r   rF   r   r   r4   [  s      c                    s   g | ]}| kr|qS r   r   r   r/   r   r   r4   `  s      c                    s   g | ]}| d  kr|qS r   r   r   r   r   r   r4   c  s      zNo column combination found. Please make sure that the specified columns exist in the dataframe, are numeric, and contains at least two unique values.)XYrW   r@   r   r   rW   r@   r   outliersrzCI95%rT   rD   power)rE   rF   zcovar must be list or string.c                    s   g | ]}| kqS r   r   r   r/   r   r   r4     s     z3Covariate(s) are either not in data or not numeric.)r_   r   zNo column combination found. Please make sure that the specified columns and covar exist in the dataframe, are numeric, and contains at least two unique values.r   rM   )r@   rW   )r)   rQ   rR   covarr@   rW   )r   r   rT   r   r   rA   r   r   )rW   rB   rC   r#   rb   r`   r   )r{   columnr   )2Zpingouin.correlationr   r   rf   Z_get_numeric_datar{   r   rF   tolistrg   rl   r   ri   rt   Z
MultiIndexr#   r   rr   Zndarrayrj   r   r   rk   rs   r   ru   rw   ZIndexr   anyro   ra   uniquer   extendrn   r   r|   r}   r   rx   rh   rm   r   r   r   r   r   insertr
   )r)   rF   r   r@   rW   r[   r]   r   r   Zmulti_indexZcol_flattenr   Zgroup2msgZothersr   r   r   Zall_colsr   r   r   r   Zcor_stZcor_st_keysr   rejectr   r   )rF   r   r0   r   r   r   Q  s    F


$
*,

 




 
       
 



)NNNNNTTr   r   r   r   r   r   FTT)NNNr   )NNNr   )NNr   r   r   r-   ) r   rr   Zpandasrt   Zpandas_flavorpfr   r   r   Zpingouin.configr   Zpingouin.parametricr   Zpingouin.multicompr   Zpingouin.effsizer   Zpingouin.utilsr   r	   r
   r   r   r   __all__Zregister_dataframe_methodr   r   r   r   r   r   r   r   r   r   <module>   s|   

                    :
 + &
 *      