U
    KvfwT                     @   s4  d dl mZmZmZ d dlmZ d dlmZmZ d dl	Z
d dlZerRd dlmZ ndd Zd dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZmZ d dlmZmZmZmZ dZe
 ed Z!dd Z"d>ddZ#d?ddZ$d@ddZ%dAddZ&dBddZ'dCddZ(e#e
j)e
j*e
j+e
j,e$e
j-e(e%e'e&dZ.dd  Z/d!d" Z0dDd#d$Z1d%Z2d&Z3d'd( e3D Z4e2e5e4 Z6G d)d* d*Z7ee7j8Z9e9:d+edd,d-g e9:d.g  e9:d/d0gd1gfd2gd3gfg ee;e9dEd4d4d5d6ed7d8ee
j<ej=ej>f ee; e?e?e@e?eeeAe@f  e?ej>d9	d:d;ZBG d<d= d=ZCdS )F    )PD_LT_2Appenderis_numeric_dtype)SP_LT_19)SequenceUnionNis_categorical_dtypec                 C   s   t | tjS N)
isinstancepdZCategoricalDtypedtype r   F/tmp/pip-unpacked-wheel-2v6byqio/statsmodels/stats/descriptivestats.pyr	      s    r	   )stats)SimpleTable)jarque_bera)cache_readonly)	Docstring	Parameter)
array_like	bool_like
float_likeint_like)	      
      2   K   Z   _   c   g      Y@c                 C   s   |   |   S r
   )maxmindfr   r   r   pd_ptp!   s    r(   c                 C   s   dt |  j|dS )Nr   axis)npisnansum)xr*   r   r   r   nancount%   s    r/   c                 C   s   t j| |dt j| |d S Nr)   )r+   nanmaxnanminZarrr*   r   r   r   nanptp)   s    r4   c                 C   s   t j| d |dS )N   r)   )r+   Znansumr3   r   r   r   nanuss-   s    r6   c                 C   s   t j| t|dS r0   )r+   nanpercentilePERCENTILESr3   r   r   r   r7   1   s    r7   c                 C   s   t j| |ddS NZomit)r*   Z
nan_policy)r   kurtosisr3   r   r   r   nankurtosis5   s    r;   c                 C   s   t j| |ddS r9   )r   skewr3   r   r   r   nanskewness9   s    r=   )Zobsmeanstdr$   r%   Zptpvarr<   Zussr:   percentilesc                 C   s.   zt | }W n tk
r(   tj}Y nX |S )zi
    wrapper for scipy.stats.kurtosis that returns nan instead of raising Error

    missing options
    )r   r:   
ValueErrorr+   nanaresr   r   r   	_kurtosisL   s
    rG   c                 C   s.   zt | }W n tk
r(   tj}Y nX |S )ze
    wrapper for scipy.stats.skew that returns nan instead of raising Error

    missing options
    )r   r<   rB   r+   rC   rD   r   r   r   _skewY   s
    rH   c                 C   s   t | } t | |k}t | |k }|| d }ztt|||| dj}W n, tk
r|   tt|||| d}Y nX ||fS )a8  
    Signs test

    Parameters
    ----------
    samp : array_like
        1d array. The sample for which you want to perform the sign test.
    mu0 : float
        See Notes for the definition of the sign test. mu0 is 0 by
        default, but it is common to set it to the median.

    Returns
    -------
    M
    p-value

    Notes
    -----
    The signs test returns

    M = (N(+) - N(-))/2

    where N(+) is the number of values above `mu0`, N(-) is the number of
    values below.  Values equal to `mu0` are discarded.

    The p-value for M is calculated using the binomial distribution
    and can be interpreted the same as for a t-test. The test-statistic
    is distributed Binom(min(N(+), N(-)), n_trials, .5) where n_trials
    equals N(+) + N(-).

    See Also
    --------
    scipy.stats.wilcoxon
    g       @      ?)	r+   asarrayr-   r   Z	binomtestr%   ZpvalueAttributeErrorZ
binom_test)ZsampZmu0posnegMpr   r   r   	sign_testf   s    #
rP   )nobsmissingr>   std_errcir?   iqr
iqr_normalmad
mad_normalcoef_varranger$   r%   r<   r:   r   modemedianrA   )rQ   rR   distincttopfreqc                 C   s   g | ]}|t kr|qS r   )NUMERIC_STATISTICS.0statr   r   r   
<listcomp>   s     rd   c                   @   s   e Zd ZdZdddgZeZeZe	Z
ddddded	d
eejejejf ee eeeeeeeef  edddZejejdddZeejdddZeejdddZeejdddZedddZedddZdS )Descriptiona  
    Extended descriptive statistics for data

    Parameters
    ----------
    data : array_like
        Data to describe. Must be convertible to a pandas DataFrame.
    stats : Sequence[str], optional
        Statistics to include. If not provided the full set of statistics is
        computed. This list may evolve across versions to reflect best
        practices. Supported options are:
        "nobs", "missing", "mean", "std_err", "ci", "ci", "std", "iqr",
        "iqr_normal", "mad", "mad_normal", "coef_var", "range", "max",
        "min", "skew", "kurtosis", "jarque_bera", "mode", "freq",
        "median", "percentiles", "distinct", "top", and "freq". See Notes for
        details.
    numeric : bool, default True
        Whether to include numeric columns in the descriptive statistics.
    categorical : bool, default True
        Whether to include categorical columns in the descriptive statistics.
    alpha : float, default 0.05
        A number between 0 and 1 representing the size used to compute the
        confidence interval, which has coverage 1 - alpha.
    use_t : bool, default False
        Use the Student's t distribution to construct confidence intervals.
    percentiles : sequence[float]
        A distinct sequence of floating point values all between 0 and 100.
        The default percentiles are 1, 5, 10, 25, 50, 75, 90, 95, 99.
    ntop : int, default 5
        The number of top categorical labels to report. Default is

    Attributes
    ----------
    numeric_statistics
        The list of supported statistics for numeric data
    categorical_statistics
        The list of supported statistics for categorical data
    default_statistics
        The default list of statistics

    See Also
    --------
    pandas.DataFrame.describe
        Basic descriptive statistics
    describe
        A simplified version that returns a DataFrame

    Notes
    -----
    The selectable statistics include:

    * "nobs" - Number of observations
    * "missing" - Number of missing observations
    * "mean" - Mean
    * "std_err" - Standard Error of the mean assuming no correlation
    * "ci" - Confidence interval with coverage (1 - alpha) using the normal or
      t. This option creates two entries in any tables: lower_ci and upper_ci.
    * "std" - Standard Deviation
    * "iqr" - Interquartile range
    * "iqr_normal" - Interquartile range relative to a Normal
    * "mad" - Mean absolute deviation
    * "mad_normal" - Mean absolute deviation relative to a Normal
    * "coef_var" - Coefficient of variation
    * "range" - Range between the maximum and the minimum
    * "max" - The maximum
    * "min" - The minimum
    * "skew" - The skewness defined as the standardized 3rd central moment
    * "kurtosis" - The kurtosis defined as the standardized 4th central moment
    * "jarque_bera" - The Jarque-Bera test statistic for normality based on
      the skewness and kurtosis. This option creates two entries, jarque_bera
      and jarque_beta_pval.
    * "mode" - The mode of the data. This option creates two entries in all tables,
      mode and mode_freq which is the empirical frequency of the modal value.
    * "median" - The median of the data.
    * "percentiles" - The percentiles. Values included depend on the input value of
      ``percentiles``.
    * "distinct" - The number of distinct categories in a categorical.
    * "top" - The mode common categories. Labeled top_n for n in 1, 2, ..., ``ntop``.
    * "freq" - The frequency of the common categories. Labeled freq_n for n in 1,
      2, ..., ``ntop``.
    rQ   rR   r]   NT皙?Fr   numericcategoricalalphause_trA   ntopdatar   rh   ri   rj   rk   rA   rl   c             	   C   s  |}	t |tjtjfs$t|ddd}	|	jdkr8t|}t|d}t|d}g }
d}|rh|
tj	 d}|r|
d ||dkrd	nd7 }|d7 }|s|st
d
t||
| _| jjd dkrt
d| ddd | jjD | _dd | jjD | _|d k	r.dd |D }|r.t
d| d|d kr@ttnt|| _t|d| _d| jk| _d| jk| _| jr| jd  krt| jk rn nt
dddgddgddgdd td| jd D dd td| jd D d }|D ]H}|| jkr| j|}| jd | ||  | j|d d   | _qt|d!dd"d#| _t| j| _t| jjd | jjd krt
d$t| jd%kst| jdkrt
d&t |d'| _!d|  k rdk sn t
d(t|d)| _"d S )*Nrn   r5   )maxdimr   rh   ri    categoryzand z4At least one of numeric and categorical must be Truer   z
Selecting z results in an empty DataFramec                 S   s   g | ]}t |qS r   )r   rb   dtr   r   r   rd   0  s     z(Description.__init__.<locals>.<listcomp>c                 S   s   g | ]}t |qS r   r   rr   r   r   r   rd   1  s    c                 S   s   g | ]}|t kr|qS r   )DEFAULT_STATISTICSra   r   r   r   rd   6  s      z, z are not known statisticsrl   r^   r_   z"top must be a non-negative integerr[   	mode_frequpper_cilower_cir   jarque_bera_pvalc                 S   s   g | ]}d | qS Ztop_r   rb   ir   r   r   rd   I  s     c                 S   s   g | ]}d | qS Zfreq_r   rz   r   r   r   rd   J  s     )r[   rT   r   r^   r_   rA   d)ro   r   zpercentiles must be distinctd   z.percentiles must be strictly between 0 and 100rj   z&alpha must be strictly between 0 and 1rk   )#r   r   Series	DataFramer   ndimr   appendr+   numberrB   Zselect_dtypes_datashapeZdtypes_is_numeric_is_cat_likejoinlistrt   _statsr   _ntop_compute_top_compute_freqr-   rZ   index_percentilessortuniqueanyr   _alpha_use_t)selfrn   r   rh   ri   rj   rk   rA   rl   Zdata_arrincludeZ	col_typesZundefreplacementskeyidxr   r   r   __init__
  s    






,    $zDescription.__init__)r'   returnc                    s    j  fdd| jD  S )Nc                    s   g | ]}| j kr|qS r   r   )rb   sr&   r   r   rd   d  s     
 z(Description._reorder.<locals>.<listcomp>)locr   )r   r'   r   r&   r   _reorderc  s    zDescription._reorder)r   c                 C   sT   | j }| j}|jd dkr|S |jd dkr0|S tj||gdd}| || jj S )z
        Descriptive statistics for both numeric and categorical data

        Returns
        -------
        DataFrame
            The statistics
        r   r   r)   )rh   ri   r   r   concatr   r   columns)r   rh   ri   r'   r   r   r   framef  s    
zDescription.framec           "         s  j jddjf }|j}|j\}}| }| }| }||   }|	 }	|	j|dk  |j|dk d   < j
rt|d djd  }
ntjdjd  }
dd }||j}|jdkr^t|tjrtj|d td	}tj|d tjd	}nPg }g }|jD ],}|j| }||d  ||d  qt|}t|}ntd }}|dk}t|jd tj}|| |j|  ||< |}zZdd
l m!} |	 }|D ]<}||| j"r|| # $ r|| %tj||< qW n t&k
r   Y nX |jd dkr:|'d|'d }n|}dd  |j fddddj}|	 }tj|j|dk< || }tj(tj)|tjd	|jd  |d|jd | ||	||
|	  ||
|	  ||||t*||+ |, |d |d |t-tjddg |t.dtj/  |d |d tj(||dtj(||d|0 d}fdd|1 D }tjt2|3 |t2|4 d}dj5kr|S |jd dkr|'j6d 7t}ntjj6d td}t8t9d|j d|j krdd |jD |_nd}d}|j} |rF|d9 }t9||j }t8t-|dkr
d }q
t9||  |d  } d!t:t;|d d  d"}!d#|! d$fd%d| D |_j5|j<  _5=tj>||gdd&S )'z
        Descriptive statistics for numeric data

        Returns
        -------
        DataFrame
            The statistics of the numeric columns
        Nr   rI   r   g      ?r5   c                 S   s   t | jtjr| jn| jj}|  j|d}tr4i nddi}tj|f|}t	|d rlt
|d |d fS |d jd dkrdd |D S tjtjfS )Nr   ZkeepdimsTr   r   c                 S   s   g | ]}t |qS r   )floatrb   valr   r   r   rd     s     z6Description.numeric.<locals>._mode.<locals>.<listcomp>)r   r   r+   Znumpy_dtypedropnaZto_numpyr   r   r[   Zisscalarr   r   rC   )Zserr   Zser_no_missingkwargsZmode_resr   r   r   _mode  s    z"Description.numeric.<locals>._moder   )is_extension_array_dtypeg      ?g      ?c                 S   s,   t | }|jd dk r$t jfd S t|S )Nr   r5      )r+   rJ   r   rC   r   )crE   r   r   r   _safe_jarque_bera  s    
z.Description.numeric.<locals>._safe_jarque_berac                    s   t  |  S r
   )r   r   )r.   )r   r   r   <lambda>      z%Description.numeric.<locals>.<lambda>expand)Zresult_typer      )rQ   rR   r>   rS   rv   rw   r?   rU   rW   rY   rZ   r$   r%   r<   r:   rV   rX   r   rx   r[   ru   r\   c                    s    i | ]\}}| j kr||qS r   r   rb   kvr   r   r   
<dictcomp>  s     
  z'Description.numeric.<locals>.<dictcomp>)r   r   rA   r~   )r   r   c                 S   s   g | ]}t d |  dqS )r~   %)intrb   r   r   r   r   rd     s     z'Description.numeric.<locals>.<listcomp>Tr   Fz0.fz{0:z}%c                    s   g | ]}  |qS r   )formatr   )outputr   r   rd     s     r)   )?r   r   r   r   r   r?   countr>   abscopyr   r   tZppfr   ZnormapplyTsizer   r   r   r+   rJ   r   int64r   r   Z
atleast_1demptyfullrC   Zpandas.api.typesr   r   isnullr   fillnaImportErrorZquantiler   onesr(   r$   r%   Zdiffsqrtpir\   itemsr   valueskeysr   r   astypeallfloorlenstrtolistr   r   )"r   r'   cols_r   r?   r   r>   rW   rS   qr   Zmode_valuesr[   Zmode_countsr   r   r   ru   Z_dfr   colrU   ZjbZnan_meanrY   resultsfinal
results_dfpercZdupeZscaler   fmtr   )r   r   r   r   rh   y  s    

$ 



  


  
"zDescription.numericc                    s  j jdddd jD f   jd } j} fdd D tjfddD tjd}i }i }D ]}| }|jd	 j	kr|j
dj	 ||< t|jdd
 ||< qlt|j
}|dgj	t|  7 }|||< t|}	|	tjgj	t|	  7 }	t|	||< qldd tdj	d D }
tj|d|
|d}dd tdj	d D }
tj|d|
|d}tjtj|tjd jd	  |d jd	    |d}fdd| D }tjt| |t| dd}jrtj||gd	d}jrtj||gd	d}|S )z
        Descriptive statistics for categorical data

        Returns
        -------
        DataFrame
            The statistics of the categorical columns
        Nc                 S   s   g | ]}|qS r   r   rb   r   r   r   r   rd     s     z+Description.categorical.<locals>.<listcomp>r   c                    s   i | ]}| | j d dqS )T)	normalize)Zvalue_countsr   r&   r   r   r     s      z+Description.categorical.<locals>.<dictcomp>c                    s   i | ]}| | j d  qS )r   )r   r   )vcr   r   r     s      r   r   r   c                 S   s   g | ]}d | qS ry   r   rz   r   r   r   rd   ,  s     object)r   r   r   c                 S   s   g | ]}d | qS r|   r   rz   r   r   r   rd   .  s     r   )rQ   rR   r]   c                    s    i | ]\}}| j kr||qS r   r   r   r   r   r   r   8  s     
  )r   r   r   r)   )r   r   r   r   r   r   r   r+   r   r   r   rJ   Zilocr   r   rC   rZ   r   r   r   r   r   r   r   r   r   r   )r   r   r   r]   r^   r_   r   Zsingler   Zfreq_valr   Ztop_dfZfreq_dfr   r   r   r   )r'   r   r   r   ri     sX     
 
 

zDescription.categoricalc              	   C   s   | j t}|   r&|d}dd |jD }dd |jD }g }| D ]\}}|	dd |D  qRdd }t
|||dd	d
|didgt| dS )z
        Summary table of the descriptive statistics

        Returns
        -------
        SimpleTable
            A table instance supporting export to text, csv and LaTeX
        rp   c                 S   s   g | ]}t |qS r   r   r   r   r   r   rd   R  s     z'Description.summary.<locals>.<listcomp>c                 S   s   g | ]}t |qS r   r   r   r   r   r   rd   S  s     c                 S   s   g | ]}|qS r   r   )rb   r   r   r   r   rd   V  s     c                 S   s.   t | tr| S | d | kr&tt| S | dS )Nr   z0.4g)r   r   r   )r   r   r   r   
_formatterX  s
    
z'Description.summary.<locals>._formatterzDescriptive StatisticsZ	data_fmtsz%s)r   r   r   )headerstubstitleZtxt_fmtZ	datatypes)r   r   r   r   r   r   r   r   Ziterrowsr   r   r   )r   r'   r   r   rn   r   rowr   r   r   r   summaryF  s"    	
zDescription.summaryc                 C   s   t |   S r
   )r   r   Zas_textr   r   r   r   __str__h  s    zDescription.__str__)N) __name__
__module____qualname____doc__Z_int_fmtr`   Znumeric_statisticsCATEGORICAL_STATISTICSZcategorical_statisticsrt   Zdefault_statisticsr8   r   r+   ndarrayr   r   r   r   r   boolr   r   r   r   r   r   rh   ri   r   r   r   r   r   r   r   re      sB   R
 Y 9"re   ZReturnsr   zDescriptive statisticsZ
AttributeszSee Also)zpandas.DataFrame.describeNzBasic descriptive statistics)re   Nz;Descriptive statistics class with additional output optionsTrf   Fr   rg   )	rn   r   rh   ri   rj   rk   rA   rl   r   c             
   C   s   t | |||||||djS )Nrg   )re   r   rm   r   r   r   describe  s    r   c                   @   s   e Zd ZdZdd ZdS )Describez
    Removed.
    c                 C   s   t dd S )NzDescribe has been removed)NotImplementedError)r   Zdatasetr   r   r   r     s    zDescribe.__init__N)r   r   r   r   r   r   r   r   r   r     s   r   )r   )r   )r   )r   )r   )r   )r   )N)DZstatsmodels.compat.pandasr   r   r   Zstatsmodels.compat.scipyr   typingr   r   Znumpyr+   Zpandasr   Zpandas.core.dtypes.commonr	   Zscipyr   Zstatsmodels.iolib.tabler   Zstatsmodels.stats.stattoolsr   Zstatsmodels.tools.decoratorsr   Zstatsmodels.tools.docstringr   r   Zstatsmodels.tools.validationr   r   r   r   r8   arrayZ	QUANTILESr(   r/   r4   r6   r7   r;   r=   ZnanmeanZnanstdr1   r2   ZnanvarMISSINGrG   rH   rP   r`   r   Z_additionaltuplert   re   r   ZdsZreplace_blockr   r   r   r   r   r   r   r   r   r   r   r   r   <module>   s   






/   =
 
 