U
    KvfqS  ã                   @   sÂ  d Z ddlmZ ddlZddlmZ ddlmZ ddl	m
Z dd„ ZdUdd„Zdd„ ZeZdd„ Zdd„ Zdd„ Zdd„ Zdd„ Zi Zeeeeeeedœed< dd„ Zdd „ d!d „ d"d „ d#œed$< d%eied&< G d'd(„ d(ƒZdVd*d+„ZdWd,d-„ZdXd/d0„ZdYd1d2„ZG d3d4„ d4ƒZed5kr¾dd6lmZ ej j!d7d.d8Z!e"d9ƒ e"ee!d:ƒƒ ee!d:ƒZ#e"e# $¡ ƒ d%d;d<d=d>d?d@gZ%e%D ]Z&e"e&e# $e&d¡ƒ qze"dAƒ ddBl'm(Z( e(e)ƒZ*d.Z+e,d)ƒD ]FZ-ej. /e+¡Z!ee!d:ƒZ#e%D ]$Z&e*e&  0e# $e&d¡d dC ¡ qÚq¼e 1dDdE„ e%D ƒ¡Z2e"dFdG 3e%¡ƒ e"dHe2dIk  4dC¡ƒ e"dJe2dKk  4dC¡ƒ e"dLe2dMk  4dC¡ƒ edNd „ d:d.dO d.Z+d)Z5eeƒ dPe+e5ddQZ6e 7e5e 1dRdSdTg¡ ¡ 8e9¡Z:e"e6e: ƒ dS )Zat  More Goodness of fit tests

contains

GOF : 1 sample gof tests based on Stephens 1970, plus AD A^2
bootstrap : vectorized bootstrap p-values for gof test with fitted parameters


Created : 2011-05-21
Author : Josef Perktold

parts based on ks_2samp and kstest from scipy.stats
(license: Scipy BSD, but were completely rewritten by Josef Perktold)


References
----------

é    )ÚlmapN)Údistributions)Úcache_readonly)Ú
kolmogorovc           
      C   sî   t tj| |fƒ\} }| jd }|jd }t| ƒ}t|ƒ}t | ¡} t |¡}t | |g¡}tj| |ddd|  }tj||ddd|  }t t 	|| ¡¡}t 
|| t|| ƒ ¡}zt|d d|  | ƒ}	W n   d}	Y nX ||	fS )aA  
    Computes the Kolmogorov-Smirnof statistic on 2 samples.

    This is a two-sided test for the null hypothesis that 2 independent samples
    are drawn from the same continuous distribution.

    Parameters
    ----------
    a, b : sequence of 1-D ndarrays
        two arrays of sample observations assumed to be drawn from a continuous
        distribution, sample sizes can be different


    Returns
    -------
    D : float
        KS statistic
    p-value : float
        two-tailed p-value


    Notes
    -----

    This tests whether 2 samples are drawn from the same distribution. Note
    that, like in the case of the one-sample K-S test, the distribution is
    assumed to be continuous.

    This is the two-sided test, one-sided tests are not implemented.
    The test uses the two-sided asymptotic Kolmogorov-Smirnov distribution.

    If the K-S statistic is small or the p-value is high, then we cannot
    reject the hypothesis that the distributions of the two samples
    are the same.

    Examples
    --------

    >>> from scipy import stats
    >>> import numpy as np
    >>> from scipy.stats import ks_2samp

    >>> #fix random seed to get the same result
    >>> np.random.seed(12345678)

    >>> n1 = 200  # size of first sample
    >>> n2 = 300  # size of second sample

    different distribution
    we can reject the null hypothesis since the pvalue is below 1%

    >>> rvs1 = stats.norm.rvs(size=n1,loc=0.,scale=1)
    >>> rvs2 = stats.norm.rvs(size=n2,loc=0.5,scale=1.5)
    >>> ks_2samp(rvs1,rvs2)
    (0.20833333333333337, 4.6674975515806989e-005)

    slightly different distribution
    we cannot reject the null hypothesis at a 10% or lower alpha since
    the pvalue at 0.144 is higher than 10%

    >>> rvs3 = stats.norm.rvs(size=n2,loc=0.01,scale=1.0)
    >>> ks_2samp(rvs1,rvs3)
    (0.10333333333333333, 0.14498781825751686)

    identical distribution
    we cannot reject the null hypothesis since the pvalue is high, 41%

    >>> rvs4 = stats.norm.rvs(size=n2,loc=0.0,scale=1.0)
    >>> ks_2samp(rvs1,rvs4)
    (0.07999999999999996, 0.41126949729859719)
    r   Úright)Zsideç      ð?ç¸…ëQ¸¾?ç)\Âõ(¼?)r   ÚnpZasarrayÚshapeÚlenÚsortZconcatenateZsearchsortedÚmaxÚabsoluteÚsqrtÚfloatÚksprob)
Zdata1Zdata2Zn1Zn2Zdata_allZcdf1Zcdf2ÚdÚenZprob© r   úM/tmp/pip-unpacked-wheel-2v6byqio/statsmodels/sandbox/distributions/gof_new.pyÚks_2samp   s"    H




r   r   é   Ú	two_sidedÚapproxc                 K   s¼  t | tƒr8|r|| kr0tt| ƒj}tt| ƒj} ntdƒ‚t |tƒrNtt|ƒj}t| ƒrpd|i}t 	| ||Ž¡}nt 	| ¡}t
|ƒ}||f|žŽ }|dkrÎt d|d ¡| |  ¡ }	|dkrÎ|	tj |	|¡fS |dkr|t d|¡|   ¡ }
|d	kr|
tj |
|¡fS |d
kr¸t |	|
g¡}|dkrH|tj |t |¡ ¡fS |dkr¸tj |t |¡ ¡}|dksˆ|d|d d  kr¢|tj |t |¡ ¡fS |tj ||¡d fS dS )aÉ  
    Perform the Kolmogorov-Smirnov test for goodness of fit

    This performs a test of the distribution G(x) of an observed
    random variable against a given distribution F(x). Under the null
    hypothesis the two distributions are identical, G(x)=F(x). The
    alternative hypothesis can be either 'two_sided' (default), 'less'
    or 'greater'. The KS test is only valid for continuous distributions.

    Parameters
    ----------
    rvs : str or array or callable
        string: name of a distribution in scipy.stats

        array: 1-D observations of random variables

        callable: function to generate random variables, requires keyword
        argument `size`

    cdf : str or callable
        string: name of a distribution in scipy.stats, if rvs is a string then
        cdf can evaluate to `False` or be the same as rvs
        callable: function to evaluate cdf

    args : tuple, sequence
        distribution parameters, used if rvs or cdf are strings
    N : int
        sample size if rvs is string or callable
    alternative : 'two_sided' (default), 'less' or 'greater'
        defines the alternative hypothesis (see explanation)

    mode : 'approx' (default) or 'asymp'
        defines the distribution used for calculating p-value

        'approx' : use approximation to exact distribution of test statistic

        'asymp' : use asymptotic distribution of test statistic


    Returns
    -------
    D : float
        KS test statistic, either D, D+ or D-
    p-value :  float
        one-tailed or two-tailed p-value

    Notes
    -----

    In the one-sided test, the alternative is that the empirical
    cumulative distribution function of the random variable is "less"
    or "greater" than the cumulative distribution function F(x) of the
    hypothesis, G(x)<=F(x), resp. G(x)>=F(x).

    Examples
    --------

    >>> from scipy import stats
    >>> import numpy as np
    >>> from scipy.stats import kstest

    >>> x = np.linspace(-15,15,9)
    >>> kstest(x,'norm')
    (0.44435602715924361, 0.038850142705171065)

    >>> np.random.seed(987654321) # set random seed to get the same result
    >>> kstest('norm','',N=100)
    (0.058352892479417884, 0.88531190944151261)

    is equivalent to this

    >>> np.random.seed(987654321)
    >>> kstest(stats.norm.rvs(size=100),'norm')
    (0.058352892479417884, 0.88531190944151261)

    Test against one-sided alternative hypothesis:

    >>> np.random.seed(987654321)

    Shift distribution to larger values, so that cdf_dgp(x)< norm.cdf(x):

    >>> x = stats.norm.rvs(loc=0.2, size=100)
    >>> kstest(x,'norm', alternative = 'less')
    (0.12464329735846891, 0.040989164077641749)

    Reject equal distribution against alternative hypothesis: less

    >>> kstest(x,'norm', alternative = 'greater')
    (0.0072115233216311081, 0.98531158590396395)

    Do not reject equal distribution against alternative hypothesis: greater

    >>> kstest(x,'norm', mode='asymp')
    (0.12464329735846891, 0.08944488871182088)


    Testing t distributed random variables against normal distribution:

    With 100 degrees of freedom the t distribution looks close to the normal
    distribution, and the kstest does not reject the hypothesis that the sample
    came from the normal distribution

    >>> np.random.seed(987654321)
    >>> stats.kstest(stats.t.rvs(100,size=100),'norm')
    (0.072018929165471257, 0.67630062862479168)

    With 3 degrees of freedom the t distribution looks sufficiently different
    from the normal distribution, that we can reject the hypothesis that the
    sample came from the normal distribution at a alpha=10% level

    >>> np.random.seed(987654321)
    >>> stats.kstest(stats.t.rvs(3,size=100),'norm')
    (0.131016895759829, 0.058826222555312224)
    ú5if rvs is string, cdf has to be the same distributionÚsize)r   Úgreaterr   é   r   )r   Úlessç        r   r   Úasympr   éj
  çš™™™™™é?ç333333Ó?ç     @@é   N)Ú
isinstanceÚstrÚgetattrr   ÚcdfÚrvsÚAttributeErrorÚcallabler
   r   r   Úaranger   ÚksoneÚsfÚ	kstwobignr   )r+   r*   ÚargsÚNÚalternativeÚmodeÚkwdsÚvalsÚcdfvalsÚDplusÚDminÚDÚpval_twor   r   r   Úkstest}   s<    s







 r=   c                 C   sZ   t  |¡d dt  |¡  }| | }t  d|d  ¡}t  | t  dddg¡k¡}|||fS )Nr   r	   éþÿÿÿr&   g=
×£p=ê?r   ©r
   r   ÚexpÚsumÚarray©ÚstatÚnobsÚ
mod_factorÚstat_modifiedÚpvalÚdigitsr   r   r   Údplus_st70_upp  s
    rJ   c                 C   s^   t  |¡d dt  |¡  }| | }dt  d|d  ¡ }t  | t  dddg¡k¡}|||fS )Nr   r	   r&   r>   g…ëQ¸í?gHáz®Gñ?r?   rC   r   r   r   Ú
d_st70_upp&  s
    rK   c                 C   sj   t  |¡d dt  |¡  }| | }|d }d| d t  d| ¡ }t  | t  dddg¡k¡}|||fS )Ng×£p=
×Ã?g¸…ëQ¸Î?r&   é   r>   gö(\Âõð?g)\Âõ(ô?r?   )rD   rE   rF   rG   ZzsqurH   rI   r   r   r   Ú
v_st70_upp.  s    rM   c                 C   sN   d| }| d|  d|d   d|  }dt  dd|  ¡ }t j}|||fS )	Nr   gš™™™™™Ù?g333333ã?r&   r   çš™™™™™©?gR¸…ëQ@é   )r
   r@   Únan©rD   rE   ZnobsinvrG   rH   rI   r   r   r   Úwsqu_st70_upp7  s
     rR   c                 C   sp   d| }| d|  d|d   }|dd|  9 }dt  d| t jd  ¡ }t  | t  dddg¡k¡}|||fS )	Nr   çš™™™™™¹?r&   r   r#   r>   gÂõ(\Ò?gÃõ(\ÂÕ?©r
   r@   ÚpirA   rB   rQ   r   r   r   Úusqu_st70_upp?  s    rV   c                 C   st   d| }| d|  d|d   }|dd|  9 }dt  d| d	 t jd  ¡ }t  | t  d
d
dg¡k¡}|||fS )Nr   gffffffæ?çÍÌÌÌÌÌì?r&   r   g®Gáz®ó?g‘í|?5^ô?r>   ç       @r	   g!°rh‘íÜ?rT   rQ   r   r   r   Ú
a_st70_uppH  s     rY   )Úd_plusÚd_minusr   ÚvÚwsquÚusquÚaÚstephens70uppc                 C   sn   t j | t |¡ ¡}|dks2|d|d d  krP| t j | t |¡ ¡tjfS | t j | |¡d tjfS d S )Nr"   r#   r$   r%   r&   )r   r1   r0   r
   r   rP   r/   )r;   r3   r<   r   r   r   Úpval_kstest_approx^  s    ra   c                 C   s   | t j | |¡tjfS ©N©r   r/   r0   r
   rP   )r9   r3   r   r   r   Ú<lambda>f  ó    rd   c                 C   s   | t j | |¡tjfS rb   rc   )r:   r3   r   r   r   rd   g  re   c                 C   s   | t j | t |¡ ¡tjfS rb   )r   r1   r0   r
   r   rP   )r;   r3   r   r   r   rd   h  re   )rZ   r[   r   Úscipyr   Zscipy_approxc                   @   s„   e Zd ZdZddd„Zedd„ ƒZedd	„ ƒZed
d„ ƒZedd„ ƒZ	edd„ ƒZ
edd„ ƒZedd„ ƒZedd„ ƒZddd„ZdS )ÚGOFaP  One Sample Goodness of Fit tests

    includes Kolmogorov-Smirnov D, D+, D-, Kuiper V, Cramer-von Mises W^2, U^2 and
    Anderson-Darling A, A^2. The p-values for all tests except for A^2 are based on
    the approximatiom given in Stephens 1970. A^2 has currently no p-values. For
    the Kolmogorov-Smirnov test the tests as given in scipy.stats are also available
    as options.


    design: I might want to retest with different distributions, to calculate
    data summary statistics only once, or add separate class that holds
    summary statistics and data (sounds good).


    r   r   c                 C   s¦   t |tƒr8|r||kr0tt|ƒj}tt|ƒj}ntdƒ‚t |tƒrNtt|ƒj}t|ƒrpd|i}t 	|||Ž¡}nt 	|¡}t
|ƒ}||f|žŽ }|| _|| _|| _d S )Nr   r   )r'   r(   r)   r   r*   r+   r,   r-   r
   r   r   rE   Zvals_sortedr8   )Úselfr+   r*   r2   r3   r6   r7   r8   r   r   r   Ú__init__†  s     


zGOF.__init__c                 C   s(   | j }| j}t d|d ¡| |  ¡ S )Nr   r   ©rE   r8   r
   r.   r   ©rh   rE   r8   r   r   r   rZ      s    z
GOF.d_plusc                 C   s$   | j }| j}|t d|¡|   ¡ S )Nr    rj   rk   r   r   r   r[   ¦  s    zGOF.d_minusc                 C   s   t  | j| jg¡S rb   )r
   r   rZ   r[   ©rh   r   r   r   r   ¬  s    zGOF.dc                 C   s   | j | j S )ZKuiper)rZ   r[   rl   r   r   r   r\   °  s    zGOF.vc                 C   sH   | j }| j}|dt d|d ¡ d | d  d  ¡ d| d  }|S )zCramer von MisesrX   r   r   r&   g      (@)rE   r8   r
   r.   rA   )rh   rE   r8   r]   r   r   r   r]   µ  s    *
ÿzGOF.wsquc                 C   s*   | j }| j}| j|| ¡ d d   }|S )Nç      à?r&   )rE   r8   r]   Úmean)rh   rE   r8   r^   r   r   r   r^   ¿  s    zGOF.usquc                 C   sp   | j }| j}d}td|ƒD ]<}|| |d |…  }|dk}d||  ||< || ¡ 7 }q|d d| |  }|S )Nr   r   rm   g      @rX   )rE   r8   ÚrangerA   )rh   rE   r8   ZmsumÚjZmjÚmaskr_   r   r   r   r_   Ç  s    zGOF.ac              	   C   sX   | j }| j}dt d|d ¡ d t |¡t d|ddd…  ¡   ¡  | | }|S )z4Stephens 1974, does not have p-value formula for A^2rX   r   r   Néÿÿÿÿ)rE   r8   r
   r.   ÚlogrA   )rh   rE   r8   Úasqur   r   r   rt   ×  s     ÿÿÿzGOF.asqur   r`   c                 C   sB   t | |ƒ}|dkr*t| | || jƒ|fS t| | || jƒS dS )z


        r`   N)r)   Ú	gof_pvalsrE   )rh   ZtestidZpvalsrD   r   r   r   Úget_testã  s    
zGOF.get_testN)r   r   )r   r`   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__ri   r   rZ   r[   r   r\   r]   r^   r_   rt   rv   r   r   r   r   rg   n  s&   







rg   éd   c           	         sÄ   ddl m} |tƒ‰ tdƒD ]>}| |ƒ}t||ƒ}tD ]"}ˆ |  | |d¡d d ¡ q6qt 	‡ fdd„tD ƒ¡}t
dd	 t¡ƒ t
d
|dk  d¡ƒ t
d|dk  d¡ƒ t
d|dk  d¡ƒ d S )Nr   ©Údefaultdictiè  r`   r   c                    s   g | ]}ˆ | ‘qS r   r   ©Ú.0Úti©Úresultsr   r   Ú
<listcomp>   s     zgof_mc.<locals>.<listcomp>ú	         ú      úat 0.01:ç{®Gáz„?úat 0.05:rN   úat 0.10:rS   )Úcollectionsr}   Úlistro   rg   Úall_gofsÚappendrv   r
   rB   ÚprintÚjoinrn   )	ZrandfnÚdistrrE   r}   Úir+   Úgoftr€   Úresarrr   r   r   Úgof_mcõ  s    
"r”   c                 C   sœ   t | jƒ}| j| }tdƒg| }dg| }tdƒ||< tdddƒ||< dt d|d ¡t|ƒ  d t | ¡t d| t|ƒ  ¡  |  |¡ | }|S )z.vectorized Anderson Darling A^2, Stephens 1974Nrr   rX   r   r   )r   r   Úslicer
   r.   Útuplers   rA   )r8   ÚaxisÚndimrE   Zslice_reverseÚislicert   r   r   r   Úasquare  s     


ÿÿÿþrš   éÈ   c                 C   s0  |dk	r´|dkrt dƒ‚tt |t|ƒ ¡ƒ}d}t|ƒD ]h}| j|fd||fiŽ}	| j|	dd}
tdd„ |
ƒ}
tj	|  
|	|
¡dd}t|dd}|||k ¡ 7 }q:|t|| ƒ S | j|fd||fiŽ}	| j|	dd}
td	d„ |
ƒ}
tj	|  
|	|
¡dd}t|dd}|dkr t 	|¡}|S ||k ¡ S dS )
a  Monte Carlo (or parametric bootstrap) p-values for gof

    currently hardcoded for A^2 only

    assumes vectorized fit_vec method,
    builds and analyses (nobs, nrep) sample in one step

    rename function to less generic

    this works also with nrep=1

    Nzusing batching requires a valuer   r   r   ©r—   c                 S   s   t  | d¡S ©Nr   ©r
   Zexpand_dims©Úxr   r   r   rd   ;  re   zbootstrap.<locals>.<lambda>c                 S   s   t  | d¡S r   rž   rŸ   r   r   r   rd   D  re   )Ú
ValueErrorÚintr
   Úceilr   ro   r+   Úfit_vecr   r   r*   rš   rA   rn   )r   r2   rE   ÚnrepÚvalueZ
batch_sizeZn_batchÚcountÚirepr+   Úparamsr8   rD   Zstat_sortedr   r   r   Ú	bootstrap  s,    

rª   c                 C   sd   d}t |ƒD ]J}|j|fd|iŽ}| |¡}t | ||¡¡}	t|	dd}
||
| k7 }q|d | S )zþMonte Carlo (or parametric bootstrap) p-values for gof

    currently hardcoded for A^2 only

    non vectorized, loops over all parametric bootstrap replications and calculates
    and returns specific p-value,

    rename function to less generic

    r   r   rœ   r   )ro   r+   r¤   r
   r   r*   rš   )r¦   r   r2   rE   r¥   r§   r¨   r+   r©   r8   rD   r   r   r   Ú
bootstrap2O  s    
r«   c                   @   s*   e Zd ZdZd
dd„Zdd„ Zdd„ Zd	S )ÚNewNormz-just a holder for modified distributions
    r   c                 C   s   |  |¡| |¡fS rb   )rn   Zstd)rh   r    r—   r   r   r   r¤   p  s    zNewNorm.fit_vecc                 C   s   t jj||d |d dS )Nr   r   )ÚlocÚscale)r   Únormr*   )rh   r    r2   r   r   r   r*   s  s    zNewNorm.cdfc                 C   s&   |d }|d }||t jj|d  S )Nr   r   ©r   )r   r¯   r+   )rh   r2   r   r­   r®   r   r   r   r+   v  s    zNewNorm.rvsN)r   )rw   rx   ry   rz   r¤   r*   r+   r   r   r   r   r¬   l  s   
r¬   Ú__main__)Ústatsé   r°   zscipy kstestr¯   rZ   r[   r\   r]   r^   r_   z
Is it correctly sized?r|   r   c                 C   s   g | ]}t | ‘qS r   r   r~   r   r   r   rƒ   —  s     rƒ   r„   r…   r†   r‡   rˆ   rN   r‰   rS   c                 C   s   t jjd| dS )Nr³   r°   )r²   Útr+   ©rE   r   r   r   rd     re   rµ   )r   r   )r2   rE   r¥   r¦   g®Gáz®ï?gffffffî?rW   )r   r   r   r   )r{   )r   )r   r›   r{   NN)r   r›   r{   );rz   Zstatsmodels.compat.pythonr   Znumpyr
   Zscipy.statsr   Zstatsmodels.tools.decoratorsr   Zscipy.specialr   r   r   r=   rJ   Zdminus_st70_upprK   rM   rR   rV   rY   ru   ra   rg   r”   rš   rª   r«   r¬   rw   rf   r²   r´   r+   rŽ   r’   rv   rŒ   r€   rŠ   r}   r‹   r‚   rE   ro   r‘   ÚrandomZrandnr   rB   r“   r   rn   r¥   ZbtÚfloorZastyper¢   Z
quantindexr   r   r   r   Ú<module>   sŒ   _
 		ù
	ý
 ÿ 


2



&