In [1]: import numpy as np
In [2]: import statsmodels.api as sm
In [3]: import matplotlib.pyplot as plt
In [4]: from statsmodels.sandbox.regression.predstd import wls_prediction_std
Load data
In [5]: data = sm.datasets.stackloss.load()
In [6]: data.exog = sm.add_constant(data.exog)
Huber’s T norm with the (default) median absolute deviation scaling
In [7]: huber_t = sm.RLM(data.endog, data.exog, M=sm.robust.norms.HuberT())
In [8]: hub_results = huber_t.fit()
In [9]: print hub_results.params
[-41.0265 0.8294 0.9261 -0.1278]
In [10]: print hub_results.bse
[ 9.7919 0.111 0.3029 0.1286]
In [11]: varnames = ['var_%d' % i for i in range(len(hub_results.params))]
In [12]: print hub_results.summary(yname='y', xname=varnames)
Robust linear Model Regression Results
==============================================================================
Dep. Variable: y No. Observations: 21
Model: RLM Df Residuals: 17
Method: IRLS Df Model: 3
Norm: HuberT
Scale Est.: mad
Cov Type: H1
Date: Fri, 16 Aug 2013
Time: 06:42:18
No. Iterations: 19
==============================================================================
coef std err z P>|z| [95.0% Conf. Int.]
------------------------------------------------------------------------------
var_0 -41.0265 9.792 -4.190 0.000 -60.218 -21.835
var_1 0.8294 0.111 7.472 0.000 0.612 1.047
var_2 0.9261 0.303 3.057 0.002 0.332 1.520
var_3 -0.1278 0.129 -0.994 0.320 -0.380 0.124
==============================================================================
Note: if the model instance has since been used for another fit with different
fit parameters, the fit options shown above might no longer be the correct ones.
Huber’s T norm with ‘H2’ covariance matrix
In [13]: hub_results2 = huber_t.fit(cov="H2")
In [14]: print hub_results2.params
[-41.0265 0.8294 0.9261 -0.1278]
In [15]: print hub_results2.bse
[ 9.0895 0.1195 0.3224 0.118 ]
Andrew’s Wave norm with Huber’s Proposal 2 scaling and ‘H3’ covariance matrix
In [16]: andrew_mod = sm.RLM(data.endog, data.exog, M=sm.robust.norms.AndrewWave())
In [17]: andrew_results = andrew_mod.fit(scale_est=sm.robust.scale.HuberScale(),
....: cov='H3')
....:
In [18]: print andrew_results.params
[-40.8818 0.7928 1.0486 -0.1336]
See help(sm.RLM.fit) for more fitting options, and the module sm.robust.scale for the available scale estimators.
In [19]: nsample = 50
In [20]: x1 = np.linspace(0, 20, nsample)
In [21]: X = np.c_[x1, (x1 - 5)**2, np.ones(nsample)]
In [22]: sig = 0.3 # smaller error variance makes OLS<->RLM contrast bigger
In [23]: beta = [0.5, -0.0, 5.]
In [24]: y_true2 = np.dot(X, beta)
In [25]: y2 = y_true2 + sig * 1. * np.random.normal(size=nsample)
In [26]: y2[[39, 41, 43, 45, 48]] -= 5 # add some outliers (10% of nsample)
Note that the quadratic term in OLS regression will capture outlier effects.
In [27]: res = sm.OLS(y2, X).fit()
In [28]: print res.params
[ 0.5108 -0.011 5.0591]
In [29]: print res.bse
[ 0.0716 0.0063 0.4638]
In [30]: print res.predict
<bound method OLSResults.predict of <statsmodels.regression.linear_model.OLSResults object at 0x692c0d0>>
Estimate RLM
In [31]: resrlm = sm.RLM(y2, X).fit()
In [32]: print resrlm.params
[ 0.49 0.0005 5.0059]
In [33]: print resrlm.bse
[ 0.0188 0.0017 0.1221]
Draw a plot to compare OLS estimates to the robust estimates
In [34]: plt.figure();
In [35]: plt.plot(x1, y2, 'o', x1, y_true2, 'b-');
In [36]: prstd, iv_l, iv_u = wls_prediction_std(res);
In [37]: plt.plot(x1, res.fittedvalues, 'r-');
In [38]: plt.plot(x1, iv_u, 'r--');
In [39]: plt.plot(x1, iv_l, 'r--');
In [40]: plt.plot(x1, resrlm.fittedvalues, 'g.-');
In [41]: plt.title('blue: true, red: OLS, green: RLM');
Fit a new OLS model using only the linear term and the constant
In [42]: X2 = X[:, [0, 2]]
In [43]: res2 = sm.OLS(y2, X2).fit()
In [44]: print res2.params
[ 0.4009 5.5021]
In [45]: print res2.bse
[ 0.0341 0.3952]
Estimate RLM
In [46]: resrlm2 = sm.RLM(y2, X2).fit()
In [47]: print resrlm2.params
[ 0.4938 4.9922]
In [48]: print resrlm2.bse
[ 0.0089 0.1031]
Draw a plot to compare OLS estimates to the robust estimates
In [49]: prstd, iv_l, iv_u = wls_prediction_std(res2)
In [50]: plt.figure();
In [51]: plt.plot(x1, y2, 'o', x1, y_true2, 'b-');
In [52]: plt.plot(x1, res2.fittedvalues, 'r-');
In [53]: plt.plot(x1, iv_u, 'r--');
In [54]: plt.plot(x1, iv_l, 'r--');
In [55]: plt.plot(x1, resrlm2.fittedvalues, 'g.-');
In [56]: plt.title('blue: true, red: OLS, green: RLM');