U
    Qvf5                     @   sX   d dl Zd dlZd dlmZ d dlmZmZ dddgZ	dddZ
dd
dZdddZdS )    N)
namedtuple)	remove_na_postprocess_dataframemultivariate_normalitymultivariate_ttestbox_m皙?c                 C   s  ddl m} t| } | jdks(td| t| jdd  } | j\}}|dksZtd|dksjtd	tj	| d
dd}tj
j|dd| j}| | d }ttj
|||jg}tj
| || jg}	d|	j tt|	j||d tt|	j|df }
dtd d| d d d|d    |d|d    }tj
||kr|d|d  ttt|d  d |
  dd|d  | d   d|  tt|d dd|d     |   dd|d   | d    }n|d }d|d  dd|d    }dd|d   }d|| d  d||d  |  ||d  |d  d|d      }ddd|d   | d   d||   dd| |d  |d   d| |d  |d  d|d      d|| d   dd| |d  d|   ||d  |d  d|d      }tt|d ||d   }tt||d  }|j||t|d}||krdnd
}tddddg}||||dS )a  Henze-Zirkler multivariate normality test.

    Parameters
    ----------
    X : np.array
        Data matrix of shape (n_samples, n_features).
    alpha : float
        Significance level.

    Returns
    -------
    hz : float
        The Henze-Zirkler test statistic.
    pval : float
        P-value.
    normal : boolean
        True if X comes from a multivariate normal distribution.

    See Also
    --------
    normality : Test the univariate normality of one or more variables.
    homoscedasticity : Test equality of variance.
    sphericity : Mauchly's test for sphericity.

    Notes
    -----
    The Henze-Zirkler test [1]_ has a good overall power against alternatives
    to normality and works for any dimension and sample size.

    Adapted to Python from a Matlab code [2]_ by Antonio Trujillo-Ortiz and
    tested against the
    `MVN <https://cran.r-project.org/web/packages/MVN/MVN.pdf>`_ R package.

    Rows with missing values are automatically removed.

    References
    ----------
    .. [1] Henze, N., & Zirkler, B. (1990). A class of invariant consistent
       tests for multivariate normality. Communications in Statistics-Theory
       and Methods, 19(10), 3595-3617.

    .. [2] Trujillo-Ortiz, A., R. Hernandez-Walls, K. Barba-Rojo and L.
       Cupul-Magana. (2007). HZmvntest: Henze-Zirkler's Multivariate
       Normality Test. A MATLAB file.

    Examples
    --------
    >>> import pingouin as pg
    >>> data = pg.read_dataset('multivariate')
    >>> X = data[['Fever', 'Pressure', 'Aches']]
    >>> pg.multivariate_normality(X, alpha=.05)
    HZResults(hz=0.540086101851555, pval=0.7173686509622386, normal=True)
    r   )lognorm   z+X must be of shape (n_samples, n_features).   axis   zX must have at least 3 rows.z!X must have at least two columns.FT)rowvarZbiasZ	hermitian      )Zscale	HZResultshzpvalnormal)r   r   r   )scipy.statsr	   npasarrayndimAssertionErrorisnananyshapecovlinalgpinvZastypeZdtypemeanZdiagZ	multi_dotTrepeatreshapeZtilesqrtZmatrix_ranksumexploglog1psfr   )Xalphar	   npSZS_invZdifTZDjYZDjkbr   wbamuZsi2Zpmupsir   r   r    r9   9/tmp/pip-unpacked-wheel-2te3nxqf/pingouin/multivariate.pyr   	   sh    6

>:.(	F>: Fc                 C   sz  ddl m} t| }|jdks(td|dkrZt|jd }|t|j	dd  }n|j\}}t|}|jdkstd	|jdkr|j
|kstn8d
}|jd |kst||rd}|jd |kst|t|||dd\}}|j\}}	|jd }
|dkstd|jdks"|dkr|}|jdkrPtj|dd}|d| }n&tj|| dd}|d|d }tjj|dd}|| | | }n||
 d }tj|dd}tj|dd}|d | |
d |  |d  }tjjd| d|
  | dd}|d|d }|| | }|||	  |	|d   }|	}||	 }||||}|||||d}tj|dgd}t|S )a
  Hotelling T-squared test (= multivariate T-test)

    Parameters
    ----------
    X : np.array
        First data matrix of shape (n_samples, n_features).
    Y : np.array or None
        Second data matrix of shape (n_samples, n_features). If ``Y`` is a 1D
        array of shape (n_features), a one-sample test is performed where the
        null hypothesis is defined in ``Y``. If ``Y`` is None, a one-sample
        is performed against np.zeros(n_features).
    paired : boolean
        Specify whether the two observations are related (i.e. repeated
        measures) or independent. If ``paired`` is True, ``X`` and ``Y`` must
        have exactly the same shape.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'T2'``: T-squared value
        * ``'F'``: F-value
        * ``'df1'``: first degree of freedom
        * ``'df2'``: second degree of freedom
        * ``'p-val'``: p-value

    See Also
    --------
    multivariate_normality : Multivariate normality test.
    ttest : Univariate T-test.

    Notes
    -----
    The Hotelling 's T-squared test [1]_ is the multivariate counterpart of
    the T-test.

    Rows with missing values are automatically removed using the
    :py:func:`remove_na` function.

    Tested against the `Hotelling
    <https://cran.r-project.org/web/packages/Hotelling/Hotelling.pdf>`_ R
    package.

    References
    ----------
    .. [1] Hotelling, H. The Generalization of Student's Ratio. Ann. Math.
           Statist. 2 (1931), no. 3, 360--378.

    See also http://www.real-statistics.com/multivariate-statistics/

    Examples
    --------
    Two-sample independent Hotelling T-squared test

    >>> import pingouin as pg
    >>> data = pg.read_dataset('multivariate')
    >>> dvs = ['Fever', 'Pressure', 'Aches']
    >>> X = data[data['Condition'] == 'Drug'][dvs]
    >>> Y = data[data['Condition'] == 'Placebo'][dvs]
    >>> pg.multivariate_ttest(X, Y)
                     T2         F  df1  df2      pval
    hotelling  4.228679  1.326644    3   32  0.282898

    Two-sample paired Hotelling T-squared test

    >>> pg.multivariate_ttest(X, Y, paired=True)
                     T2         F  df1  df2      pval
    hotelling  4.468456  1.314252    3   15  0.306542

    One-sample Hotelling T-squared test with a specified null hypothesis

    >>> null_hypothesis_means = [37.5, 70, 5]
    >>> pg.multivariate_ttest(X, Y=null_hypothesis_means)
                       T2          F  df1  df2          pval
    hotelling  253.230991  74.479703    3   15  3.081281e-09
    r   )fr
   z*x must be of shape (n_samples, n_features)Nr   r   )r   r
   zY must be 1D or 2D.z:X and Y must have the same number of features (= columns).z4X and Y must have the same number of rows if paired.Zrows)pairedr      z#At least five samples are required.TF)r   r   )ZT2Fdf1df2r   Z	hotelling)index)r   r;   r   r   r   r   zerosr    r   r   sizer   r!   r$   r"   r#   r-   pd	DataFramer   )r.   r3   r<   r;   xyZnxZkxerrknyr0   r!   ZdiffZinv_covt2Zx_covZy_covZ
pooled_covZfvalr?   r@   r   statsr9   r9   r:   r   }   sV    M





  MbP?c                 C   s  ddl m} t| tjs td|| jks2tdt|| jsJtd| j	|dd| }|j
dksntd	|jdd
}|jj\}}| jdddf  }	|	 }
|	d }| ||d}||d  jdd|
|  }tj|}tj|| |d   }tt|	dkrX|d d|d  d|  d  d| |d  |
| d   }nHd|d  d|  d d|d  |d   }|d|  d|   9 }dd|  t| }d| |d  |d  }|||}||krdnd}tjdg|g|g|g|gdd}t|S )aJ  Test equality of covariance matrices using the Box's M test.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        Long-format dataframe.
    dvs : list
        Dependent variables.
    group : str
        Grouping variable.
    alpha : float
        Significance level. Default is 0.001 as recommended in [2]_. A
        non-significant p-value (higher than alpha) indicates that the
        covariance matrices are homogenous (= equal).

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'Chi2'``: Test statistic
        * ``'pval'``: p-value
        * ``'df'``: The Chi-Square statistic's degree of freedom
        * ``'equal_cov'``: True if ``data`` has equal covariance

    Notes
    -----
    .. warning:: Box's M test is susceptible to errors if the data does not
        meet the assumption of multivariate normality or if the sample size is
        too large or small [3]_.

    Pingouin uses :py:meth:`pandas.DataFrameGroupBy.cov` to calculate the
    variance-covariance matrix of each group. Missing values are automatically
    excluded from the calculation by Pandas.

    Mathematical expressions can be found in [1]_.

    This function has been tested against the boxM package of the `biotools`
    R package [4]_.

    References
    ----------
    .. [1] Rencher, A. C. (2003). Methods of multivariate analysis (Vol. 492).
           John Wiley & Sons.

    .. [2] Hahs-Vaughn, D. (2016). Applied Multivariate Statistical Concepts.
           Taylor & Francis.

    .. [3] https://en.wikipedia.org/wiki/Box%27s_M_test

    .. [4] https://cran.r-project.org/web/packages/biotools/index.html

    Examples
    --------
    1. Box M test with 3 dependent variables of 4 groups (equal sample size)

    >>> import pandas as pd
    >>> import pingouin as pg
    >>> from scipy.stats import multivariate_normal as mvn
    >>> data = pd.DataFrame(mvn.rvs(size=(100, 3), random_state=42),
    ...                     columns=['A', 'B', 'C'])
    >>> data['group'] = [1] * 25 + [2] * 25 + [3] * 25 + [4] * 25
    >>> data.head()
              A         B         C  group
    0  0.496714 -0.138264  0.647689      1
    1  1.523030 -0.234153 -0.234137      1
    2  1.579213  0.767435 -0.469474      1
    3  0.542560 -0.463418 -0.465730      1
    4  0.241962 -1.913280 -1.724918      1

    >>> pg.box_m(data, dvs=['A', 'B', 'C'], group='group')
              Chi2    df      pval  equal_cov
    box  11.634185  18.0  0.865537       True

    2. Box M test with 3 dependent variables of 2 groups (unequal sample size)

    >>> data = pd.DataFrame(mvn.rvs(size=(30, 2), random_state=42),
    ...                     columns=['A', 'B'])
    >>> data['group'] = [1] * 20 + [2] * 10
    >>> pg.box_m(data, dvs=['A', 'B'], group='group')
             Chi2   df      pval  equal_cov
    box  0.706709  3.0  0.871625       True
    r   )chi2z data must be a pandas dataframe.z%The grouping variable is not in data.zThe DVs are not in data.T)Zobservedr   z$Data must have at least two columns.)Znumeric_onlyNr   ).NNr   r
   r      r   g      ?FZbox)ZChi2dfr   	equal_cov)rA   data)r   rN   
isinstancerD   rE   r   columnssetissubsetgroupbyZngroupsr!   rA   ZlevshapecountZilocZto_numpyr)   r'   r   r"   Zdetprodlenuniquer+   r-   r   )rR   Zdvsgroupr/   rN   grpZcovsZn_covsZn_dvsZn_sampZnobsvr2   ZS_detMcurP   r1   rQ   rL   r9   r9   r:   r   
  s<    T, )r   )NF)rM   )Znumpyr   ZpandasrD   collectionsr   Zpingouin.utilsr   r   __all__r   r   r   r9   r9   r9   r:   <module>   s   

t
 