U
    Kvfm'  ã                   @   s,  d Z ddlmZ ddlZdd„ Zdd„ Zd"dd	„ZG d
d„ dƒZe	dkr(ddl
mZ dZej e¡ZdgZdekrüeeƒ eeedƒƒ eeedƒƒ e e ¡ e ¡ ¡ZeeeƒZe ee¡ e eeeƒed ¡ eje e¡e eeƒ¡ddZe dd„ eD ƒ¡Ze ¡  e ee¡ eeƒZe ¡  e ee¡ e ¡  e  edd… e !e¡e !e¡ ¡ e e ¡ e ¡ d¡Z"e ¡  e  e"dd… e !ee"ƒ¡e !e"¡ ¡ e e¡Z#e#dded … Z$e ¡  e  e$dd… e !ee$ƒ¡e !e$¡ ¡ eeƒZ%ee% &¡ ƒ ee% 'e%j(¡ƒ ee% )dddg¡ƒ ee% 'dddddg¡ƒ e e ¡ e ¡ d¡Zeje e¡e eeƒ¡ddZeeƒZe ¡  e ee¡ ejeeddZ*e*eƒZ+e e+e¡ ej,e eeƒ¡e e¡dddZ-e-eƒZ.e e.e¡ edƒ ed e !e+¡ ¡ ƒ ed!e !e¡ ¡ ƒ dS )#a 
  
from David Huard's scipy sandbox, also attached to a ticket and
in the matplotlib-user mailinglist  (links ???)


Notes
=====

out of bounds interpolation raises exception and would not be completely
defined ::

>>> scoreatpercentile(x, [0,25,50,100])
Traceback (most recent call last):
...
    raise ValueError("A value in x_new is below the interpolation "
ValueError: A value in x_new is below the interpolation range.
>>> percentileofscore(x, [-50, 50])
Traceback (most recent call last):
...
    raise ValueError("A value in x_new is below the interpolation "
ValueError: A value in x_new is below the interpolation range.


idea
====

histogram and empirical interpolated distribution
-------------------------------------------------

dual constructor
* empirical cdf : cdf on all observations through linear interpolation
* binned cdf : based on histogram
both should work essentially the same, although pdf of empirical has
many spikes, fluctuates a lot
- alternative: binning based on interpolated cdf : example in script
* ppf: quantileatscore based on interpolated cdf
* rvs : generic from ppf
* stats, expectation ? how does integration wrt cdf work - theory?

Problems
* limits, lower and upper bound of support
  does not work or is undefined with empirical cdf and interpolation
* extending bounds ?
  matlab has pareto tails for empirical distribution, breaks linearity

empirical distribution with higher order interpolation
------------------------------------------------------

* should work easily enough with interpolating splines
* not piecewise linear
* can use pareto (or other) tails
* ppf how do I get the inverse function of a higher order spline?
  Chuck: resample and fit spline to inverse function
  this will have an approximation error in the inverse function
* -> does not work: higher order spline does not preserve monotonicity
  see mailing list for response to my question
* pmf from derivative available in spline

-> forget this and use kernel density estimator instead


bootstrap/empirical distribution:
---------------------------------

discrete distribution on real line given observations
what's defined?
* cdf : step function
* pmf : points with equal weight 1/nobs
* rvs : resampling
* ppf : quantileatscore on sample?
* moments : from data ?
* expectation ? sum_{all observations x} [func(x) * pmf(x)]
* similar for discrete distribution on real line
* References : ?
* what's the point? most of it is trivial, just for the record ?


Created on Monday, May 03, 2010, 11:47:03 AM
Author: josef-pktd, parts based on David Huard
License: BSD

é    Nc                 C   s6   t  |¡}t| ƒ}t t  |¡t  | ¡¡}||d ƒS )zÄReturn the score at the given percentile of the data.

    Example:
        >>> data = randn(100)
            >>> scoreatpercentile(data, 50)

        will return the median of sample `data`.
    ç      Y@)ÚnpÚarrayÚempiricalcdfÚinterpolateÚinterp1dÚsort)ÚdataZ
percentileZperÚcdfÚinterpolator© r   úJ/tmp/pip-unpacked-wheel-2v6byqio/statsmodels/sandbox/stats/stats_dhuard.pyÚscoreatpercentileV   s    	
r   c                 C   s,   t | ƒ}t t | ¡t |¡¡}||ƒd S )aD  Return the percentile-position of score relative to data.

    score: Array of scores at which the percentile is computed.

    Return percentiles (0-100).

    Example
            r = randn(50)
        x = linspace(-2,2,100)
        percentileofscore(r,x)

    Raise an error if the score is outside the range of data.
    r   )r   r   r   r   r   )r	   Úscorer
   r   r   r   r   Úpercentileofscored   s    r   ÚHazenc                 C   sÀ   t  t  | ¡¡d }t| ƒ}| ¡ }|dkr:|d | }n‚|dkrP||d  }nl|dkrf|d | }nV|dkr€|d |d  }n<|d	krš|d |d
  }n"|dkr´|d |d  }ntdƒ‚|S )a  Return the empirical cdf.

    Methods available:
        Hazen:       (i-0.5)/N
            Weibull:     i/(N+1)
        Chegodayev:  (i-.3)/(N+.4)
        Cunnane:     (i-.4)/(N+.2)
        Gringorten:  (i-.44)/(N+.12)
        California:  (i-1)/N

    Where i goes from 1 to N.
    ç      ð?Úhazenç      à?ÚweibullÚ
californiaÚ
chegodayevç333333Ó?çš™™™™™Ù?Úcunnaneçš™™™™™É?Ú
gringortenç)\Âõ(Ü?ç¸…ëQ¸¾?ú[Unknown method. Choose among Weibull, Hazen,Chegodayev, Cunnane, Gringorten and California.)r   ÚargsortÚlenÚlowerÚ
ValueError)r	   ÚmethodÚiÚNr
   r   r   r   r   v   s"    r   c                   @   s<   e Zd ZdZdd„ Zddd„Zdd	„ Zd
d„ Zddd„ZdS )ÚHistDistz»Distribution with piecewise linear cdf, pdf is step function

    can be created from empiricial distribution or from a histogram (not done yet)

    work in progress, not finished


    c                 C   s„   t  |¡| _t  | j ¡ | j ¡ g¡| _t  |¡}|| | _t  |¡| _	|  
¡ }t  |¡| _t | j| j¡| _t | j| j¡| _d S )N)r   Z
atleast_1dr	   r   ÚminÚmaxÚbinlimitr    Z_datasortedÚrankingr   r   Z_empcdfsortedr   r   ÚcdfintpÚppfintp)Úselfr	   Zsortindr
   r   r   r   Ú__init__¤   s    

zHistDist.__init__Nr   c                 C   sÖ   |dkr| j }| j}nt t |¡¡d }t|ƒ}| ¡ }|dkrP|d | }n‚|dkrf||d  }nl|dkr||d | }nV|dkr–|d |d	  }n<|d
kr°|d	 |d  }n"|dkrÊ|d |d  }ntdƒ‚|S )aA  Return the empirical cdf.

        Methods available:
            Hazen:       (i-0.5)/N
                Weibull:     i/(N+1)
            Chegodayev:  (i-.3)/(N+.4)
            Cunnane:     (i-.4)/(N+.2)
            Gringorten:  (i-.44)/(N+.12)
            California:  (i-1)/N

        Where i goes from 1 to N.
        Nr   r   r   r   r   r   r   r   r   r   r   r   r   r   )r	   r+   r   r    r!   r"   r#   )r.   r	   r$   r%   r&   r
   r   r   r   r   °   s(    zHistDist.empiricalcdfc                 C   s
   |   |¡S ©z&
        this is score in dh

        )r,   )r.   r   r   r   r   Úcdf_empÙ   s    zHistDist.cdf_empc                 C   s
   |   |¡S r0   )r-   )r.   Zquantiler   r   r   Úppf_empá   s    zHistDist.ppf_empÚFreedmanc                 C   sp   t | jƒ}|dkr8|  d¡|  d¡ }d| |d  }n |dkrXdt | j¡ |d  }t | j¡| | _| jS )z”Find the optimal number of bins and update the bin countaccordingly.
        Available methods : Freedman
                            Scott
        r3   ç      è?ç      Ð?é   gUUUUUUÕ¿ZScottgìQ¸…ë@)r!   r	   r2   r   ZstdZptpr*   Znbin)r.   r$   ÚnobsZIQRÚwidthr   r   r   Úoptimize_binningë   s    
zHistDist.optimize_binning)Nr   )r3   )	Ú__name__Ú
__module__Ú__qualname__Ú__doc__r/   r   r1   r2   r9   r   r   r   r   r'   š   s   	
)
r'   Ú__main__éd   r6   é   r   é2   )Úkc                 C   s   g | ]}t  |¡d  ‘qS )r@   )ÚempZderivatives)Ú.0Úxir   r   r   Ú
<listcomp>  s     rF   éÿÿÿÿé   r5   r4   g      à¿g      Ð¿iô  é   g¸…ëQ¸ž?)rB   Úsznegative densityz(np.diff(ppfs)).min()z(np.diff(cdf_ongrid)).min())r   )/r=   Zscipy.interpolater   Znumpyr   r   r   r   r'   r:   Zmatplotlib.pyplotZpyplotZpltr7   ÚrandomZrandnÚxZexamplesÚprintZlinspacer(   r)   ZxsuppÚposZplotZInterpolatedUnivariateSpliner   rC   r   ZpdfempÚfigureZ
cdf_ongridÚstepZdiffZxsupp2ZxsoÚxsZhistdr9   r1   r*   r2   r-   ZppfsZUnivariateSplineZppfempZppfer   r   r   r   Ú<module>   sl   R
$d


 $(
( "