1 from __future__ import division
2 from scipy import c_, ones, dot, stats, diff
3 from scipy.linalg import inv, solve, det
4 from numpy import log, pi, sqrt, square, diagonal
5 from numpy.random import randn, seed
6 import time
7
8 class ols:
9 """
10 Author: Vincent Nijs (+ ?)
11
12 Email: v-nijs at kellogg.northwestern.edu
13
14 Last Modified: Mon Jan 15 17:56:17 CST 2007
15
16 Dependencies: See import statement at the top of this file
17
18 Doc: Class for multi-variate regression using OLS
19
20 For usage examples of other class methods see the class tests at the bottom of this file. To see the class in action
21 simply run this file using 'python ols.py'. This will generate some simulated data and run various analyses. If you have rpy installed
22 the same model will also be estimated by R for confirmation.
23
24 Input:
25 y = dependent variable
26 y_varnm = string with the variable label for y
27 x = independent variables, note that a constant is added by default
28 x_varnm = string or list of variable labels for the independent variables
29
30 Output:
31 There are no values returned by the class. Summary provides printed output.
32 All other measures can be accessed as follows:
33
34 Step 1: Create an OLS instance by passing data to the class
35
36 m = ols(y,x,y_varnm = 'y',x_varnm = ['x1','x2','x3','x4'])
37
38 Step 2: Get specific metrics
39
40 To print the coefficients:
41 >>> print m.b
42 To print the coefficients p-values:
43 >>> print m.p
44
45 """
46
47 def __init__(self,y,x,y_varnm = 'y',x_varnm = ''):
48 """
49 Initializing the ols class.
50 """
51 self.y = y
52 self.x = c_[ones(x.shape[0]),x]
53 self.y_varnm = y_varnm
54 if not isinstance(x_varnm,list):
55 self.x_varnm = ['const'] + list(x_varnm)
56 else:
57 self.x_varnm = ['const'] + x_varnm
58
59
60 self.estimate()
61
62 def estimate(self):
63
64
65 self.inv_xx = inv(dot(self.x.T,self.x))
66 xy = dot(self.x.T,self.y)
67 self.b = dot(self.inv_xx,xy)
68
69 self.nobs = self.y.shape[0]
70 self.ncoef = self.x.shape[1]
71 self.df_e = self.nobs - self.ncoef
72 self.df_r = self.ncoef - 1
73
74 self.e = self.y - dot(self.x,self.b)
75 self.sse = dot(self.e,self.e)/self.df_e
76 self.se = sqrt(diagonal(self.sse*self.inv_xx))
77 self.t = self.b / self.se
78 self.p = (1-stats.t.cdf(abs(self.t), self.df_e)) * 2
79
80 self.R2 = 1 - self.e.var()/self.y.var()
81 self.R2adj = 1-(1-self.R2)*((self.nobs-1)/(self.nobs-self.ncoef))
82
83 self.F = (self.R2/self.df_r) / ((1-self.R2)/self.df_e)
84 self.Fpv = 1-stats.f.cdf(self.F, self.df_r, self.df_e)
85
86 def dw(self):
87 """
88 Calculates the Durbin-Waston statistic
89 """
90 de = diff(self.e,1)
91 dw = dot(de,de) / dot(self.e,self.e);
92
93 return dw
94
95 def omni(self):
96 """
97 Omnibus test for normality
98 """
99 return stats.normaltest(self.e)
100
101 def JB(self):
102 """
103 Calculate residual skewness, kurtosis, and do the JB test for normality
104 """
105
106
107 skew = stats.skew(self.e)
108 kurtosis = 3 + stats.kurtosis(self.e)
109
110
111 JB = (self.nobs/6) * (square(skew) + (1/4)*square(kurtosis-3))
112 JBpv = 1-stats.chi2.cdf(JB,2);
113
114 return JB, JBpv, skew, kurtosis
115
116 def ll(self):
117 """
118 Calculate model log-likelihood and two information criteria
119 """
120
121
122 ll = -(self.nobs*1/2)*(1+log(2*pi)) - (self.nobs/2)*log(dot(self.e,self.e)/self.nobs)
123 aic = -2*ll/self.nobs + (2*self.ncoef/self.nobs)
124 bic = -2*ll/self.nobs + (self.ncoef*log(self.nobs))/self.nobs
125
126 return ll, aic, bic
127
128 def summary(self):
129 """
130 Printing model output to screen
131 """
132
133
134 t = time.localtime()
135
136
137 ll, aic, bic = self.ll()
138 JB, JBpv, skew, kurtosis = self.JB()
139 omni, omnipv = self.omni()
140
141
142 print '\n=============================================================================='
143 print "Dependent Variable: " + self.y_varnm
144 print "Method: Least Squares"
145 print "Date: ", time.strftime("%a, %d %b %Y",t)
146 print "Time: ", time.strftime("%H:%M:%S",t)
147 print '# obs: %5.0f' % self.nobs
148 print '# variables: %5.0f' % self.ncoef
149 print '=============================================================================='
150 print 'variable coefficient std. Error t-statistic prob.'
151 print '=============================================================================='
152 for i in range(len(self.x_varnm)):
153 print '''% -5s % -5.6f % -5.6f % -5.6f % -5.6f''' % tuple([self.x_varnm[i],self.b[i],self.se[i],self.t[i],self.p[i]])
154 print '=============================================================================='
155 print 'Models stats Residual stats'
156 print '=============================================================================='
157 print 'R-squared % -5.6f Durbin-Watson stat % -5.6f' % tuple([self.R2, self.dw()])
158 print 'Adjusted R-squared % -5.6f Omnibus stat % -5.6f' % tuple([self.R2adj, omni])
159 print 'F-statistic % -5.6f Prob(Omnibus stat) % -5.6f' % tuple([self.F, omnipv])
160 print 'Prob (F-statistic) % -5.6f JB stat % -5.6f' % tuple([self.Fpv, JB])
161 print 'Log likelihood % -5.6f Prob(JB) % -5.6f' % tuple([ll, JBpv])
162 print 'AIC criterion % -5.6f Skew % -5.6f' % tuple([aic, skew])
163 print 'BIC criterion % -5.6f Kurtosis % -5.6f' % tuple([bic, kurtosis])
164 print '=============================================================================='
165
166 if __name__ == '__main__':
167
168
169
170
171
172
173 seed(1)
174 data = randn(100,5)
175
176
177 m = ols(data[:,0],data[:,1:],y_varnm = 'y',x_varnm = ['x1','x2','x3','x4'])
178 m.summary()
179
180
181 have_rpy = False
182 try:
183 print "\n"
184 print "="*30
185 print "Validating OLS results in R"
186 print "="*30
187
188 import rpy
189 have_rpy = True
190 except ImportError:
191 print "\n"
192 print "="*30
193 print "Validating OLS-class results in R"
194 print "="*30
195 print "rpy is not installed"
196 print "="*30
197
198 if have_rpy:
199 y = data[:,0]
200 x1 = data[:,1]
201 x2 = data[:,2]
202 x3 = data[:,3]
203 x4 = data[:,4]
204 rpy.set_default_mode(rpy.NO_CONVERSION)
205 linear_model = rpy.r.lm(rpy.r("y ~ x1 + x2 + x3 + x4"), data = rpy.r.data_frame(x1=x1,x2=x2,x3=x3,x4=x4,y=y))
206 rpy.set_default_mode(rpy.BASIC_CONVERSION)
207 print linear_model.as_py()['coefficients']
208 summary = rpy.r.summary(linear_model)
209 print summary