U
    KvfY                     @   s.  d dl mZ d dlZd dlZd dlmZmZ d dlZd dl	m
Z
 d dlmZmZmZ d dlmZ d dlmZ dd	 Zd
d Zdd Zdd Zdd Zdd Zdd Zdd ZG dd dZG dd dZedkr*d dlZd dlmZ ej ddd d!d"d#gd$Z!ed%e!d&" Z#ed'e!d&" Z$ee#d(d)Z%dS )*    )lrangeN)	DataFrameIndex)stats)_has_intercept_intercept_idx_remove_intercept_patsy)summary2)OLSc                 C   sX   |d kr|   S |dkr| jS |dkr,| jS |dkr:| jS |dkrH| jS td| d S )NZhc0Zhc1Zhc2Zhc3z robust options %s not understood)Z
cov_paramsZcov_HC0Zcov_HC1Zcov_HC2Zcov_HC3
ValueError)modelrobust r   ;/tmp/pip-unpacked-wheel-2v6byqio/statsmodels/stats/anova.py_get_covariance   s    r   c                 K   s2  | dd}| dd}| dd}| dd}|r<| }| jj}| jj}|jd }| jj}	| jjj}
| jj	}t
|
jt|
 d }d	| }d
dd||g}tt|df|d}|dkrt| ||||
|||||
S |dkrt| |
||||S |dkr
t| |
||||S |dkrtdntdt| dS )a9  
    Anova table for one fitted linear model.

    Parameters
    ----------
    model : fitted linear model results instance
        A fitted linear model
    typ : int or str {1,2,3} or {"I","II","III"}
        Type of sum of squares to use.

    **kwargs**

    scale : float
        Estimate of variance, If None, will be estimated from the largest
    model. Default is None.
        test : str {"F", "Chisq", "Cp"} or None
        Test statistics to provide. Default is "F".

    Notes
    -----
    Use of this function is discouraged. Use anova_lm instead.
    testFscaleNtyp   r   r   zPR(>%s)dfsum_sqmean_sq   columnsr   I)   ZII)   ZIII)   ZIVzType IV not yet implementedzType %s not understood)getlowerr   endogexogshapeZendog_namesdatadesign_info
exog_nameslentermsr   r   npzerosanova1_lm_singleanova2_lm_singleanova3_lm_singleNotImplementedErrorr   str)r   kwargsr   r   r   r   r#   r$   nobsZresponse_namer'   r(   n_rowspr_testnamestabler   r   r   anova_single#   sD    

   


r8   c
                    s  t | dd}
|
dkr2tj|\}}t|j|}
tt jt j	f} fdd j
D }t|D ]\}}d|||f< qht||
d }t }||  }t j
}||  }| }t|dg |_tj||  d|f |j|dd	gf< | j| jf|jdd	dgf< |d
krr|d	 |d  | j| j  ||< tj|d
 |d | j||< tjtjf|jd||gf< |d	 |d  |d< |S )a  
    Anova table for one fitted linear model.

    Parameters
    ----------
    model : fitted linear model results instance
        A fitted linear model

    **kwargs**

    scale : float
        Estimate of variance, If None, will be estimated from the largest
    model. Default is None.
        test : str {"F", "Chisq", "Cp"} or None
        Test statistics to provide. Default is "F".

    Notes
    -----
    Use of this function is discouraged. Use anova_lm instead.
    effectsNc                    s   g | ]}  |qS r   )slice).0namer'   r   r   
<listcomp>|   s     z$anova1_lm_single.<locals>.<listcomp>r   r   Residualr   r   r   r   )getattrr+   linalgqrdotTr,   r)   r*   Zcolumn_names
term_names	enumerater   arraytolistr   indexZc_sumlocssrdf_residr   fsfnan)r   r#   r$   r3   r'   r7   r4   r   r5   r   r9   qrZarrslicesiZslice_r   idxrE   rI   r   r=   r   r-   _   s6    

(

r-   c                 C   s  |j dd }t|}dd||g}tt|df|d}t| d}	t| |}
g }g }t|D ]\}}||}t|j	|j
}g }t|j}|D ]R}t|j}||r||ks||}|t|j	|j
 |t|j	|j
 qt| jjjd | }t| jjjd | }|jrtt||
|j}ddlm} ||\}}|jd |jd  }t|dd| df j|}n|}|jd }|d	kr| j||
d
}|j |j|j| |f< }|j|j|j| |f< ||j|j| df< ||j	 ||  q\t |dg |_|j!t"|| jjjd d g  }|| |d  | j# | j$ }||d< | j#| j$tj%tj%f|jddd||gf< |S )a  
    Anova type II table for one fitted linear model.

    Parameters
    ----------
    model : fitted linear model results instance
        A fitted linear model

    **kwargs**

    scale : float
        Estimate of variance, If None, will be estimated from the largest
    model. Default is None.
        test : str {"F", "Chisq", "Cp"} or None
        Test statistics to provide. Default is "F".

    Notes
    -----
    Use of this function is discouraged. Use anova_lm instead.

    Type II
    Sum of Squares compares marginal contribution of terms. Thus, it is
    not particularly useful for models with significant interaction terms.
    Nr   r   r    r   r   r   )rA   r   Zcov_pr?   )&r*   r   r   r+   r,   r   rF   r:   r   startstopsetfactorsissubsetextendeyer   r$   r%   sizerC   rD   scipyrA   rB   f_testfvaluerK   rI   pvalueappendr<   r   ilocZargsortrL   rM   rP   )r   r'   r4   r   r5   r   
terms_infor6   r7   covZ
robust_cov	col_orderrI   rT   termcolsL1ZL2Zterm_settZ	other_setcolZLVLrA   Z
orth_compl_rR   L12rN   
test_valuerL   r   r   r   r.      s\    





"

$ r.   c                 C   sN  |t |7 }|j}dd||g}tt|df|d}t| |}	g }
g }t|D ]\}}||}t| j	j
jd | }|}|jd }|dkr| j||	d}|j |j|j| |f< }|j|j|j| |f< ||j|j| df< ||  qNt|d	g |_|| |d  | j | j }||d< | j| jtjtjf|jd	dd||gf< |S )
Nr   r   r    r   r   r   r   rV   r?   )r   r*   r   r+   r,   r   rF   r:   r]   r   r$   r%   r`   ra   rK   rI   rb   rc   r<   r   rL   rM   rP   )r   r'   r4   r   r5   r   re   r6   r7   rf   rg   rI   rT   rh   ri   rj   rn   rR   rN   ro   rL   r   r   r   r/      s6    


 r/   c            
      O   sZ  | dd}t| dkr,| d }t|f|S |dkrDtdt| | dd}| dd	}t| }d
| }dddd||g}tt|df|d}	|s| d j}dd | D |	d< dd | D |	d< t	|	d j
 |	j|	jdd	 df< |	d 	  |	d< |dkrV|	d |	d  | |	d< tj|	d |	d |	d |	|< tj|	j|	d  |f< |	S )a	  
    Anova table for one or more fitted linear models.

    Parameters
    ----------
    args : fitted linear model results instance
        One or more fitted linear models
    scale : float
        Estimate of variance, If None, will be estimated from the largest
        model. Default is None.
    test : str {"F", "Chisq", "Cp"} or None
        Test statistics to provide. Default is "F".
    typ : str or int {"I","II","III"} or {1,2,3}
        The type of Anova test to perform. See notes.
    robust : {None, "hc0", "hc1", "hc2", "hc3"}
        Use heteroscedasticity-corrected coefficient covariance matrix.
        If robust covariance is desired, it is recommended to use `hc3`.

    Returns
    -------
    anova : DataFrame
        When args is a single model, return is DataFrame with columns:

        sum_sq : float64
            Sum of squares for model terms.
        df : float64
            Degrees of freedom for model terms.
        F : float64
            F statistic value for significance of adding model terms.
        PR(>F) : float64
            P-value for significance of adding model terms.

        When args is multiple models, return is DataFrame with columns:

        df_resid : float64
            Degrees of freedom of residuals in models.
        ssr : float64
            Sum of squares of residuals in models.
        df_diff : float64
            Degrees of freedom difference from previous model in args
        ss_dff : float64
            Difference in ssr from previous model in args
        F : float64
            F statistic comparing to previous model in args
        PR(>F): float64
            P-value for significance comparing to previous model in args

    Notes
    -----
    Model statistics are given in the order of args. Models must have been fit
    using the formula api.

    See Also
    --------
    model_results.compare_f_test, model_results.compare_lm_test

    Examples
    --------
    >>> import statsmodels.api as sm
    >>> from statsmodels.formula.api import ols
    >>> moore = sm.datasets.get_rdataset("Moore", "carData", cache=True) # load
    >>> data = moore.data
    >>> data = data.rename(columns={"partner.status" :
    ...                             "partner_status"}) # make name pythonic
    >>> moore_lm = ols('conformity ~ C(fcategory, Sum)*C(partner_status, Sum)',
    ...                 data=data).fit()
    >>> table = sm.stats.anova_lm(moore_lm, typ=2) # Type 2 Anova DataFrame
    >>> print(table)
    r   r   r   r   z6Multiple models only supported for type I. Got type %sr   r   r   NzPr(>%s)rM   rL   Zdf_diffZss_diff   r   c                 S   s   g | ]
}|j qS r   )rL   r;   Zmdlr   r   r   r>   q  s     zanova_lm.<locals>.<listcomp>c                 S   s   g | ]
}|j qS r   )rM   rr   r   r   r   r>   r  s     )r!   r)   r8   r   r1   r   r+   r,   r   ZdiffvaluesrK   rI   r   rN   rO   rP   Zisnull)
argsr2   r   r   r   r   Zn_modelsr5   r6   r7   r   r   r   anova_lm  s6    F
&
ru   c                 C   s.   t dg| }|D ]}| | }d||< q|S )NTF)r+   rG   )rS   Zslices_to_excludenindrh   sr   r   r   
_not_slice  s
    
ry   c           	      C   s\   t |||jd }|| }t| |dd|f |}|j|}t| t| }||fS )ah  
    Residual sum of squares of OLS model excluding factors in `keys`
    Assumes x matrix is orthogonal

    Parameters
    ----------
    y : array_like
        dependent variable
    x : array_like
        independent variables
    term_slices : a dict of slices
        term_slices[key] is a boolean array specifies the parameters
        associated with the factor `key`
    params : ndarray
        OLS solution of y = x * params
    keys : keys for term_slices
        factors to be excluded

    Returns
    -------
    rss : float
        residual sum of squares
    df : int
        degrees of freedom
    r   N)ry   r%   r+   subtractrC   rD   r)   )	yxterm_slicesparamskeysrw   Zparams1rL   rM   r   r   r   _ssr_reduced_model  s    r   c                   @   s2   e Zd ZdZdddZdd Zdd Zd	d
 ZdS )AnovaRMa  
    Repeated measures Anova using least squares regression

    The full model regression residual sum of squares is
    used to compare with the reduced model for calculating the
    within-subject effect sum of squares [1].

    Currently, only fully balanced within-subject designs are supported.
    Calculation of between-subject effects and corrections for violation of
    sphericity are not yet implemented.

    Parameters
    ----------
    data : DataFrame
    depvar : str
        The dependent variable in `data`
    subject : str
        Specify the subject id
    within : list[str]
        The within-subject factors
    between : list[str]
        The between-subject factors, this is not yet implemented
    aggregate_func : {None, 'mean', callable}
        If the data set contains more than a single observation per subject
        and cell of the specified model, this function will be used to
        aggregate the data before running the Anova. `None` (the default) will
        not perform any aggregation; 'mean' is s shortcut to `numpy.mean`.
        An exception will be raised if aggregation is required, but no
        aggregation function was specified.

    Returns
    -------
    results : AnovaResults instance

    Raises
    ------
    ValueError
        If the data need to be aggregated, but `aggregate_func` was not
        specified.

    Notes
    -----
    This implementation currently only supports fully balanced designs. If the
    data contain more than one observation per subject and cell of the design,
    these observations need to be aggregated into a single observation
    before the Anova is calculated, either manually or by passing an aggregation
    function via the `aggregate_func` keyword argument.
    Note that if the input data set was not balanced before performing the
    aggregation, the implied heteroscedasticity of the data is ignored.

    References
    ----------
    .. [*] Rutherford, Andrew. Anova and ANCOVA: a GLM approach. John Wiley & Sons, 2011.
    Nc                 C   s   || _ || _|| _d|kr"td|| _|d k	r8td|| _|dkrRtjj	| _
n|| _
||j|g| ds| j
d k	r|   nd}t||   d S )NCzSFactor name cannot be 'C'! This is in conflict with patsy's contrast function name.z)Between subject effect not yet supported!mean)ZsubsetzThe data set contains more than one observation per subject and cell. Either aggregate the data manually, or pass the `aggregate_func` parameter.)r&   depvarwithinr   betweenr0   subjectpdZSeriesr   aggregate_funcequalsZdrop_duplicates
_aggregate_check_data_balanced)selfr&   r   r   r   r   r   msgr   r   r   __init__  s$    

zAnovaRM.__init__c                 C   s.   | j j| jg| j dd| j | j| _ d S )NF)Zas_index)r&   groupbyr   r   r   Zaggr   r   r   r   r   r     s    zAnovaRM._aggregatec           	      C   s   d}| j D ]}|t| j|  9 }q
i }t| jjd D ]T}g }| j D ]}|| j| j|  qHt|}||kr|| d ||< q:d||< q:d}t||krt	||| }|D ]}||| krt	|q| jjd || krt	ddS )zraise if data is not balanced

        This raises a ValueError if the data is not balanced, and
        returns None if it is balance

        Return might change
        r   r   zData is unbalanced.z9There are more than 1 element in a cell! Missing factors?N)
r   r)   r&   uniqueranger%   rc   rd   tupler   )	r   Zfactor_levelsZwiZ
cell_countrI   keyrl   error_messagecountr   r   r   r     s*    



zAnovaRM._check_data_balancedc                 C   sf  | j | j j}dd | jD }d| j }||g }tjd|| j d}|jj	}|D ]4}t
dg|jd  }d||| < t
|||< qTd	|g}	t||	|jd }|d
d
|f }t||}
|
 }|
j|jd k rtd|	D ]}|| q|D ]}|| | ||< q|j}|j}|j}ddddg}tjt
d|d}|D ]}| j|krF|dkrFt|||||g\}}|| }|| | }|d	|d
d ks|d	 | |kr|| }|}n2t|||||d	 | g\}}|| }|| | }|| }tj|||}|dddd}||j|df< ||j|df< ||j|df< ||j|df< qFt|S )zvestimate the model and compute the Anova table

        Returns
        -------
        AnovaResults instance
        c                 S   s   g | ]}d | qS )
C(%s, Sum)r   )r;   rT   r   r   r   r>   0  s     zAnovaRM.fit.<locals>.<listcomp>r   *r&   Fr   T:Nz$Independent variables are collinear.zF ValuezNum DFzDen DFzPr > F)r   r    r   Z	Interceptrq   zC( z, Sum)) r&   r   rs   r   r   patsyZdmatrixjoinr'   Zterm_name_slicesr+   rG   r%   ry   r
   fitZrankr   popr~   rM   rL   r   r   r,   r   r   rN   rO   replacerK   AnovaResults)r   r{   r   r   rZ   r|   r}   r   rw   Zterm_excluder   resultsrT   r~   rM   rL   r   anova_tableZssr1Z	df_resid1Zdf1ZmsmZmseZdf2r   prh   r   r   r   r   &  sv    



       zAnovaRM.fit)NNN)__name__
__module____qualname____doc__r   r   r   r   r   r   r   r   r     s   7  
!r   c                   @   s(   e Zd ZdZdd Zdd Zdd ZdS )	r   zX
    Anova results class

    Attributes
    ----------
    anova_table : DataFrame
    c                 C   s
   || _ d S N)r   )r   r   r   r   r   r   q  s    zAnovaResults.__init__c                 C   s   |    S r   )summary__str__r   r   r   r   r   t  s    zAnovaResults.__str__c                 C   s"   t  }|d || j |S )zlcreate summary results

        Returns
        -------
        summary : summary2.Summary instance
        ZAnova)r	   ZSummaryZ	add_titleZadd_dfr   )r   Zsummr   r   r   r   w  s    
zAnovaResults.summaryN)r   r   r   r   r   r   r   r   r   r   r   r   i  s   r   __main__)olsz	moore.csvr   Zpartner_statusZ
conformityZ	fcategoryZfscore)Zskiprowsr6   z5conformity ~ C(fcategory, Sum)*C(partner_status, Sum)r   z#conformity ~ C(partner_status, Sum)r   )r   )&Zstatsmodels.compat.pythonr   Znumpyr+   Zpandasr   r   r   r   r_   r   Z statsmodels.formula.formulatoolsr   r   r   Zstatsmodels.iolibr	   Z#statsmodels.regression.linear_modelr
   r   r8   r-   r.   r/   ru   ry   r   r   r   r   Zstatsmodels.formula.apir   Zread_csvZmoorer   Zmoore_lmZmooreBr7   r   r   r   r   <module>   sB   <7X'j" A
 
	