Logo

Source code for scikits.statsmodels.sandbox.graphics

import numpy as np
from scipy import stats

[docs]def qqplot(data, dist=stats.distributions.norm, binom_n=None): """ qqplot of the quantiles of x versus the ppf of a distribution. Parameters ---------- data : array-like 1d data array dist : scipy.stats.distribution or string Compare x against dist. Strings aren't implemented yet. The default is scipy.stats.distributions.norm Returns ------- matplotlib figure. Examples -------- >>> import scikits.statsmodels.api as sm >>> from matplotlib import pyplot as plt >>> data = sm.datasets.longley.Load() >>> data.exog = sm.add_constant(data.exog) >>> mod_fit = sm.OLS(data.endog, data.exog).fit() >>> res = mod_fit.resid >>> std_res = (res - res.mean())/res.std() Import qqplots from the sandbox >>> from scikits.statsmodels.sandbox.graphics import qqplot >>> qqplot(std_res) >>> plt.show() Notes ----- Only the default arguments currently work. Depends on matplotlib. """ try: from matplotlib import pyplot as plt except: raise ImportError("matplotlib not installed") if isinstance(dist, str): raise NotImplementedError names_dist = {} names_dist.update({"norm_gen" : "Normal"}) plotname = names_dist[dist.__class__.__name__] x = np.array(data, copy=True) x.sort() nobs = x.shape[0] prob = np.linspace(1./(nobs-1), 1-1./(nobs-1), nobs) # is the above robust for a few data points? quantiles = np.zeros_like(x) for i in range(nobs): quantiles[i] = stats.scoreatpercentile(x, prob[i]*100) # estimate shape and location using distribution.fit # for normal, but will have to be somewhat distribution specific loc,scale = dist.fit(x) y = dist.ppf(prob, loc=loc, scale=scale) # plt.figure() plt.scatter(y, quantiles) y_low = np.min((y.min(),quantiles.min()))-.25 y_high = np.max((y.max(),quantiles.max()))+.25 plt.plot([y.min()-.25, y.max()+.25], [y_low, y_high], 'b-') title = '%s - Quantile Plot' % plotname plt.title(title) xlabel = "Quantiles of %s" % plotname plt.xlabel(xlabel) ylabel = "%s Quantiles" % "Data" plt.ylabel(ylabel) plt.axis([y.min()-.25,y.max()+.25, y_low-.25, y_high+.25]) return plt.gcf()
if __name__ == "__main__": # sample from t-distribution with 3 degrees of freedom import numpy as np from scipy import stats import matplotlib.pyplot as plt np.random.seed(12345) tsample = np.random.standard_t(3, size=1000) tsample2 = stats.t.rvs(3, loc=0, scale=1, size=1000) # graph = qqplot(tsample) # graph2 = qqplot(tsample2) # export data to check vs. R # np.savetxt('./tsample.csv', tsample2, delimiter=",") #TODO: it looks the expected order statistic line is wrong x = np.array(tsample2, copy=True) x.sort() nobs = x.shape[0] # prob = np.linspace(1/(nobs-1.), 1-1/(nobs-1.), nobs) # prob = np.arange(1,nobs+1)/(nobs+1) #from wikipedia: a = .5 prob = np.linspace(1.-a, nobs-a,nobs)/(nobs + 1. -2*a) quantiles = np.zeros_like(x) for i in range(nobs): quantiles[i] = stats.scoreatpercentile(x, prob[i]*100) loc, scale = stats.norm.fit(x) # this should also return args for # distributions with other arguments? y = stats.norm.ppf(prob, loc=loc, scale=scale) plt.scatter(y, quantiles) plt.plot(quantiles, quantiles, 'b-') # plt.plot(stats.norm.ppf(prob)) # plt.show()