U
    Kvfý#  ã                   @   sÖ  d Z ddlmZ ddlZddlmZ d?dd„Zdd„ Z	d	d
„ Z
dZdZdd„ Zdd„ Zdd„ ZedkrÒdZejjdedfd defdefdefdefg¡Zejjedfd defdefg¡ZddlZejjjeefdd Ze ed!fe¡Z ej!j"D ]Z#ee# e e#< qøej!j"D ]Z#ee# e e#< qed"e ƒ\Z$Z"ed#e ƒ\Z$Z"e 'd$d%„ e"D ƒ¡Z(e( )d!¡d&ejjed  Z*e +e*e(¡ ,¡ Z-e&e-j.ƒ e&eee-ƒ ƒ e 'd'd%„ ed(e"ƒD ƒ¡Z(e( )d!¡d&ejjed  Z*e +e*e(¡ ,¡ Z-e&e-j.ƒ e&eee-ƒ ƒ e !d)efd*efd+efd,efd-efd.efd/efd0efg¡Z/ej0d1e/d2dd3Z1e&d4d5d%„ e1j!j"D ƒƒ e1j2 e3¡Z4e4 5d6e6e1j!j"ƒ¡ 7d!¡Z8e1e8 dd…f j9Z:e&e:j;ƒ e&e:j!ƒ ed7e:ƒ\Z<Z=e 'd8d%„ ed9e=ƒD ƒ¡Z>e:d0 Z?e +e?e>¡ ,¡ Z@e&e@j.ƒ e&eee@ƒ ƒ d: Ae1j!j"dd6… ¡ZBed;e:ƒ\ZCZDe 'd<d%„ ed9eDƒD ƒ¡ZEe:d0 ZFe +eFeE¡ ,¡ ZGe&eGj.ƒ e&eeeGƒ ƒ eDD ]TZHe&d=eHfƒ e 'd>d%„ eeHeDƒD ƒ¡ZIe:d0 ZJe +eJeI¡ ,¡ ZKe&eeeKƒ ƒ q|dS )@a   convenience functions for ANOVA type analysis with OLS

Note: statistical results of ANOVA are not checked, OLS is
checked but not whether the reported results are the ones used
in ANOVA

includes form2design for creating dummy variables

TODO:
 * ...
 *

é    )ÚlmapNFc                 C   s^   |   ¡ } t | ¡}|r0| dd…df |k t¡S | dd…df |k t¡dd…dd…f S dS )z|convert array of categories to dummy variables
    by default drops dummy variable for last category
    uses ravel, 1d onlyNéÿÿÿÿ)ZravelÚnpÚuniqueÚastypeÚint)ÚxÚ	returnallÚgroups© r   úP/tmp/pip-unpacked-wheel-2v6byqio/statsmodels/sandbox/regression/try_ols_anova.pyÚ
data2dummy   s
    
r   c                 C   sL   t  tt|  ¡ ƒ¡}| |dd…ddd…f k d¡j t¡dd…dd…f S )zÝcreates product dummy variables from 2 columns of 2d array

    drops last dummy variable, but not from each category
    singular with simple dummy variable but not with constant

    quickly written, no safeguards

    Nr   )	r   r   r   ÚtupleÚtolistÚallÚTr   r   )r   r
   r   r   r   Údata2proddummy   s    r   c                 C   s.   |j dkr|dd…df }t| dd}|| S )zýcreate dummy continuous variable

    Parameters
    ----------
    x1 : 1d array
        label or group array
    x2 : 1d array (float)
        continuous variable

    Notes
    -----
    useful for group specific slope coefficients in regression
    é   NT)r	   )Úndimr   )Úx1Zx2Údummyr   r   r   Údata2groupcont.   s    
r   aW  
ANOVA statistics (model sum of squares excludes constant)
Source    DF  Sum Squares   Mean Square    F Value    Pr > F
Model     %(df_model)i        %(ess)f       %(mse_model)f   %(fvalue)f %(f_pvalue)f
Error     %(df_resid)i     %(ssr)f       %(mse_resid)f
CTotal    %(nobs)i    %(uncentered_tss)f     %(mse_total)f

R squared  %(rsquared)f
a]  
ANOVA statistics (model sum of squares includes constant)
Source    DF  Sum Squares   Mean Square    F Value    Pr > F
Model     %(df_model)i      %(ssmwithmean)f       %(mse_model)f   %(fvalue)f %(f_pvalue)f
Error     %(df_resid)i     %(ssr)f       %(mse_resid)f
CTotal    %(nobs)i    %(uncentered_tss)f     %(mse_total)f

R squared  %(rsquared)f
c                 C   sb   i }|  | j¡ ddddddddd	d
dg}|D ]}t| |ƒ||< q.| jj|d< | j| j |d< |S )zjupdate regression results dictionary with ANOVA specific statistics

    not checked for completeness
    Zdf_modelZdf_residZessÚssrÚuncentered_tssZ	mse_modelZ	mse_residZ	mse_totalZfvalueZf_pvalueZrsquaredÚnobsZssmwithmean)ÚupdateÚ__dict__ÚgetattrÚmodelr   r   r   )ÚresÚadZ
anova_attrÚkeyr   r   r   Ú	anovadict[   s    
    þr"   c                 C   sh  i }g }|   ¡ D ]L}|dkr>t |jd ¡|d< | d¡ qd|kr^|| ||< | |¡ q|dd… dkr˜|  d¡d }t|| ƒ||< | |¡ q|dd… d	krú|  d¡d   d
¡}ttj||d  ||d  f ƒ|d |¡< | d |¡¡ q|dd… dkrV|  d¡d   d
¡}t	||d  ||d  ƒ|d |¡< | d |¡¡ qt
dƒ‚q||fS )a  convert string formula to data dictionary

    ss : str
     * I : add constant
     * varname : for simple varnames data is used as is
     * F:varname : create dummy variables for factor varname
     * P:varname1*varname2 : create product dummy variables for
       varnames
     * G:varname1*varname2 : create product between factor and
       continuous variable
    data : dict or structured array
       data set, access of variables by name as in dictionaries

    Returns
    -------
    vars : dictionary
        dictionary of variables with converted dummy variables
    names : list
        list of names, product (P:) and grouped continuous
        variables (G:) have name by joining individual names
        sorted according to input

    Examples
    --------
    >>> xx, n = form2design('I a F:b P:c*d G:c*f', testdata)
    >>> xx.keys()
    ['a', 'b', 'const', 'cf', 'cd']
    >>> n
    ['const', 'a', 'b', 'cd', 'cf']

    Notes
    -----

    with sorted dict, separate name list would not be necessary
    ÚIr   Úconstú:Né   zF:r   zP:Ú*Ú zG:zunknown expression in formula)Úsplitr   ZonesÚshapeÚappendr   r   Zc_Újoinr   Ú
ValueError)ÚssÚdataÚvarsÚnamesÚitemÚvr   r   r   Úform2designl   s.    $,$
r4   c                 C   s(   |dd… }|   ¡ D ]}| |¡ q|S )zwdrop names from a list of strings,
    names to drop are in space delimited list
    does not change original list
    N)r)   Úremove)r.   ÚliZnewlir2   r   r   r   Údropname©   s    r7   Ú__main__iè  é   é   )ÚsizeÚaÚbÚcÚdr&   ÚeÚfT)Úflattenr   zI a F:b P:c*dzI a F:b P:c*d G:a*e fc                 C   s   g | ]}t | ‘qS r   ©Úxx©Ú.0Únnr   r   r   Ú
<listcomp>Ô   s     rH   g{®Gáz„?c                 C   s   g | ]}t | ‘qS r   rC   rE   r   r   r   rH   Ü   s     zae fZbreedZsexZlitterZpenZpigZageZbageÚyzdftest3.dataÚ.)ÚmissingZusemaskrK   c                 C   s   g | ]}t j|  ¡ ‘qS r   )ÚdtaÚmaskÚsum)rF   Úkr   r   r   rH   ñ   s     r   zI F:sex agec                 C   s   g | ]}t | ‘qS r   )Úxx_b1rE   r   r   r   rH      s     r(   ú z'I F:breed F:sex F:litter F:pen age bagec                 C   s   g | ]}t | ‘qS r   ©Úxx_b1arE   r   r   r   rH     s     z
Results droppingc                 C   s   g | ]}t | ‘qS r   rR   rE   r   r   r   rH     s     )F)LÚ__doc__Zstatsmodels.compat.pythonr   Znumpyr   Zstatsmodels.apiÚapiÚsmr   r   r   Z
anova_str0Z	anova_strr"   r4   r7   Ú__name__r   ÚrandomÚrandintÚviewr   ZtestdataintÚnormalÚfloatZtestdatacontZnumpy.lib.recfunctionsÚlibZrecfunctionsZ	zip_descrZdt2ÚemptyZtestdataZdtyper1   ÚnamerD   ÚnÚprintZcolumn_stackÚXrN   rI   ZOLSZfitZrest1ÚparamsZdt_bZ
genfromtxtrL   rM   ÚboolÚmZreshapeÚlenÚanyZdroprowsr/   Z
dta_use_b1r*   rP   Znames_b1ZX_b1Zy_b1Zrest_b1r,   ZallexogrS   Z	names_b1aZX_b1aZy_b1aZrest_b1aZdropnZX_b1a_Zy_b1a_Z	rest_b1a_r   r   r   r   Ú<module>   s†   

=

2$

   þ



