1 from scipy import c_, arange, array, unique, kron, ones, eye
2 from numpy.random import randn
3 from __future__ import division
4 import pylab, cPickle, shelve, csv, copy, os
5
6 class dbase:
7 """
8 Author: Vincent Nijs (+ ?)
9 Email: v-nijs at kellogg.northwestern.edu
10 Last Modified: Sun Jan 7 10:05:12 CST 2007
11
12 Todo:
13 - Check if shelve loading/saving works
14 - Only tested on Mac OS X 10.4.8, with full matplotlib (incl. pytz)
15
16 Dependencies:
17 - See import statement at the top of this file
18
19 Doc:
20 A simple data-frame, that reads and writes csv/pickle/shelve files with variable names.
21 Data is stored in a dictionary.
22
23 To use the class:
24
25 >>> from dbase import dbase
26 >>> y = dbase('your_filename.csv')
27
28 or for a previously created dbase object stored in a pickle file
29
30 >>> from dbase import dbase
31 >>> y = dbase('your_filename.pickle')
32
33 or without importing the dbase class
34
35 >>> import cPickle
36 >>> f = open('your_filename.pickle','rb')
37 >>> y = cPickle.load(f)
38 >>> data_key = cPickle.load(f)
39 >>> f.close()
40
41 or for a dictionary stored in a shelf file
42
43 >>> from dbase import dbase
44 >>> y = dbase('your_filename.pickle')
45
46 To return a list of variable names and an array of data
47
48 >>> varnm, data = y.get()
49
50 For usage examples of other class methods see the class tests at the bottom of this file. To see the class in action
51 simply run the file using 'python dbase.py'. This will generate some simulated data (data.csv) and save instance data
52 of the class to a pickle file.
53 """
54
55 def __init__(self,fname,*var,**date):
56 """
57 Initializing the dbase class. Loading file fname.
58
59 If you have have a column in your csv file that is a date-string use:
60
61 >>> x = dbase('myfile.csv',date = 0)
62
63 where 0 is the index of the date column
64
65 If you have have an array in your pickle file that is a date variable use:
66
67 >>> x = dbase('myfile.pickle',date = 'date')
68
69 where 'date' is the key of the date array
70 """
71 self.load(fname,var,date)
72
73 def load(self,fname,var,date):
74 """
75 Loading data from a csv or a pickle file of the dbase class.
76 If this is csv file use pylab's load function. Seems much faster
77 than scipy.io.read_array.
78 """
79
80 self.DBname = os.getcwd() + '/' + fname
81
82
83 self.date_key = date.values()
84
85
86 fext = self.__ext(fname)
87
88
89 if fext == 'csv':
90 f = open(fname,'r')
91 self.load_csv(f)
92 f.close()
93 elif fext == 'pickle':
94 f = open(fname,'rb')
95 self.load_pickle(f)
96 f.close()
97 elif fext == 'she':
98 self.load_shelve(fname,var)
99 else:
100 raise 'This class only works on csv, pickle, and shelve files'
101
102
103 self.nobs = self.data[self.data.keys()[0]].shape[0]
104
105 def load_csv(self,f):
106 """
107 Loading data from a csv file. Uses pylab's load function. Seems much faster
108 than scipy.io.read_array.
109 """
110 varnm = f.readline().split(',')
111
112
113 if self.date_key != []:
114 rawdata = pylab.load(f, delimiter=',',converters={self.date_key[0]:pylab.datestr2num})
115 self.date_key = varnm[self.date_key[0]]
116 else:
117 rawdata = pylab.load(f, delimiter=',')
118
119
120 varnm = [i.strip() for i in varnm]
121
122
123 self.data = dict(zip(varnm,rawdata.T))
124
125 def load_pickle(self,f):
126 """
127 Loading data from a created earlier using the the dbase class.
128 """
129 self.data = cPickle.load(f)
130
131
132 if self.date_key == []:
133 try:
134 self.date_key = cPickle.load(f)
135 except:
136 print "No date series in pickle file"
137 else:
138 self.date_key = self.date_key[0]
139
140 def load_shelve(self,fname,var):
141 """
142 Loading data from a created earlier using the the dbase class.
143 """
144 data = shelve.open(fname)
145
146
147 if var == ():
148 var = data.keys()
149
150
151 if self.date_key != []:
152 if not self.date_key[0] in var: var = var + self.date_key
153 self.date_key = self.date_key[0]
154
155 self.data = dict([(i,data[i]) for i in var])
156 data.close()
157
158 def save(self,fname):
159 """
160 Dumping the class data dictionary into a csv or pickle file
161 """
162 fext = self.__ext(fname)
163 if fext == 'csv':
164 f = open(fname,'w')
165 self.save_csv(f)
166 f.close()
167 elif fext == 'pickle':
168 f = open(fname,'wb')
169 self.save_pickle(f)
170 f.close()
171 elif fext == 'she':
172 self.save_shelve(fname)
173 else:
174 raise 'This class only works on csv, pickle, and shelve files'
175
176 def save_csv(self,f):
177 """
178 Dumping the class data dictionary into a csv file
179 """
180 writer = csv.writer(f)
181 writer.writerow(self.data.keys())
182
183 data = self.data
184 if self.date_key != []:
185 data = dict(data)
186 dates = pylab.num2date(data[self.date_key])
187 dates = array([i.strftime('%d %b %y') for i in dates])
188 data[self.date_key] = dates
189
190 writer.writerows(array(data.values()).T)
191
192 def save_pickle(self,f):
193 """
194 Dumping the class data dictionary and date_key into a binary pickle file
195 """
196 cPickle.dump(self.data,f,2)
197 cPickle.dump(self.date_key,f,2)
198
199 def save_shelve(self,fname):
200 """
201 Dumping the class data dictionary into a shelve file
202 """
203 f = shelve.open('data.she','c')
204 f = self.data
205 f.close()
206
207 def add_trend(self,tname = 'trend'):
208
209 self.data[tname] = arange(self.nobs)
210
211 def add_dummy(self,dum, dname = 'dummy'):
212 if self.data.has_key(dname):
213 print "The variable name '" + str(dname) + "' already exists. Please select another name."
214 else:
215 self.data[dname] = dum
216
217 def add_seasonal_dummies(self,freq=52,ndum=13):
218 """
219 This function will only work if the freq and ndum 'fit. That is,
220 weeks and 4-weekly periods will work. Weeks and months/quarters
221 will not.
222 """
223 if self.date_key == []:
224 print "Cannot create seasonal dummies since no date array is known"
225 else:
226
227 years = array([pylab.num2date(i).year for i in self.data[self.date_key]])
228
229
230 start = freq - sum(years == min(years))
231
232
233 nyear = unique(years).shape[0]
234
235
236 sd = kron(ones(nyear),kron(eye(ndum),ones(freq/ndum))).T;
237 sd = sd[start:start+self.nobs]
238 sd = dict([(("sd"+str(i+1)),sd[:,i]) for i in range(1,ndum)])
239 self.data.update(sd)
240
241 def delvar(self,*var):
242 """
243 Deleting specified variables in the data dictionary, changing dictionary in place
244 """
245 [self.data.pop(i) for i in var]
246
247 def keepvar(self,*var):
248 """
249 Keeping specified variables in the data dictionary, changing dictionary in place
250 """
251 [self.data.pop(i) for i in self.data.keys() if i not in var]
252
253 def delvar_copy(self,*var):
254 """
255 Deleting specified variables in the data dictionary, making a copy
256 """
257 return dict([(i,self.data[i]) for i in self.data.keys() if i not in var])
258
259 def keepvar_copy(self,*var):
260 """
261 Keeping specified variables in the data dictionary, making a copy
262 """
263 return dict([(i,self.data[i]) for i in var])
264
265 def delobs(self,sel):
266 """
267 Deleting specified observations, changing dictionary in place
268 """
269 for i in self.data.keys(): self.data[i] = self.data[i][sel]
270
271
272 self.nobs -= sum(sel)
273
274 def keepobs(self,sel):
275 """
276 Keeping specified observations, changing dictionary in place
277 """
278
279 self.nobs -= sum(sel)
280
281 sel -= 1
282 self.delobs(sel)
283
284 def delobs_copy(self,sel):
285 """
286 Deleting specified observations, making a copy
287 """
288 return dict([(i,self.data[i][sel]) for i in self.data.keys()])
289
290 def keepobs_copy(self,sel):
291 """
292 Keeping specified observations, making a copy
293 """
294 sel -= 1
295 self.delobs_copy(sel)
296
297 def get(self,*var,**sel):
298 """
299 Copying data and keys of selected variables for further analysis
300 """
301
302 var, sel = self.__var_and_sel_clean(var, sel)
303
304
305 d = dict((i,self.data[i][sel]) for i in var)
306
307 return d.keys(), array(d.values()).T
308
309 def info(self,*var, **adict):
310 """
311 Printing descriptive statistics on selected variables
312 """
313
314
315 var, sel = self.__var_and_sel_clean(var, adict)
316 dates, nobs = self.__dates_and_nobs_clean(var, sel)
317
318
319 mindate = pylab.num2date(min(dates)).strftime('%d %b %Y')
320 maxdate = pylab.num2date(max(dates)).strftime('%d %b %Y')
321
322
323 nvar = len(var)
324
325 print '\n=============================================================='
326 print '==================== Database information ===================='
327 print '==============================================================\n'
328
329 print 'file: %s' % self.DBname
330 print '# obs: %s' % nobs
331 print '# variables: %s' % nvar
332 print 'Start date: %s' % mindate
333 print 'End date: %s' % maxdate
334
335 print '\nvar min max mean std.dev'
336 print '=============================================================='
337
338 for i in var:
339 _min = self.data[i][sel].min(); _max = self.data[i][sel].max(); _mean = self.data[i][sel].mean(); _std = self.data[i][sel].std()
340 print '''%-5s %-5.2f %-5.2f %-5.2f %-5.2f''' % tuple([i,_min,_max,_mean,_std])
341
342 def dataplot(self,*var, **adict):
343 """
344 Plotting the data with variable names
345 """
346
347 var, sel = self.__var_and_sel_clean(var, adict)
348 dates, nobs = self.__dates_and_nobs_clean(var, sel)
349
350 for i in var:
351 pylab.plot_date(dates,self.data[i][sel],'o-')
352
353 pylab.xlabel("Time (n = " + str(nobs) + ")")
354 pylab.title("Data plot of " + self.DBname)
355 pylab.legend(var)
356 if adict.has_key('file'):
357 pylab.savefig(adict['file'],dpi=600)
358 pylab.show()
359
360 def __var_and_sel_clean(self, var, sel, dates_needed = True):
361 """
362 Convenience function to avoid code duplication
363 """
364
365 if var == ():
366 var = self.data.keys()
367
368
369 var = [x for x in var if x != self.date_key]
370
371
372 var.sort()
373
374
375
376 if not sel.has_key('sel'):
377 sel = ()
378 else:
379 sel = sel['sel']
380
381 return var, sel
382
383 def __dates_and_nobs_clean(self, var, sel):
384 """
385 Convenience function to avoid code duplication
386 """
387 nobs = self.nobs
388 if len(sel):
389 nobs = nobs - (nobs - sum(sel))
390
391 if self.date_key != None and self.data.has_key(self.date_key):
392
393 dates = self.data[self.date_key][sel]
394 else:
395
396 dates = range(711858,nobs+711858)
397
398 return dates, nobs
399
400 def __ext(self,fname):
401 """
402 Finding the file extension of the filename passed to dbase
403 """
404 return fname.split('.')[-1].strip()
405
406 if __name__ == '__main__':
407
408
409
410
411
412 import sys
413 from scipy import c_
414
415
416 if not os.path.exists('./dbase_test_files'): os.mkdir('./dbase_test_files')
417
418
419 varnm = ['date','a','b','c']
420 nobs = 100
421 data = randn(nobs,3)
422 dates = pylab.num2date(arange(730493,730493+(nobs*7),7))
423 dates = [i.strftime('%d %b %y') for i in dates]
424 data = c_[dates,data]
425
426
427 f = open('./dbase_test_files/data.csv','w')
428 writer = csv.writer(f)
429 writer.writerow(varnm)
430 writer.writerows(data)
431 f.close()
432
433
434 a = dbase("./dbase_test_files/data.csv",date = 0)
435
436 a.save("./dbase_test_files/data.pickle")
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452 print "\nLoading the dbase object from a pickle file\n"
453 b = dbase("./dbase_test_files/data.pickle")
454
455
456 print "\nWorking on file: " + b.DBname
457
458
459 varnm, data = b.get()
460 print "Variable names from dbase class\n", varnm
461 print "\nData from dbase class\n", data
462 print "\nDate series", b.data[b.date_key]
463
464
465 varnm, data = b.get('a','c')
466 print "\nTwo columns selected using variable names\n", varnm, "\n", data
467
468
469 print "\nSaving data and variable names to a different csv file\n", b.save("./dbase_test_files/data_save.csv")
470
471
472 x1 = b.data['a'] * b.data['b']
473 x2 = b.data['a'] * b.data['c']
474 xdict = {'x1':x1,'x2':x2}
475 b.data.update(xdict)
476
477 varnm, data = b.get()
478 print "\nTwo variable names added\n", varnm
479 print "\nTwo columns added\n", data
480
481
482 import copy
483 c = copy.deepcopy(b)
484
485
486 c.delvar('a','x2')
487 varnm, data = c.get()
488 print "\nTwo variable names deleted\n", varnm
489 print "\nTwo columns deleted\n", data
490
491
492 c = copy.deepcopy(b)
493 c.keepvar('a','x2')
494 varnm, data = c.get()
495 print "\nAll but two variable names deleted\n", varnm
496 print "\nAll but Two columns deleted\n", data
497
498
499 sel_rule = b.data['date'] > pylab.datestr2num("8/1/2001")
500
501
502 c = copy.deepcopy(b)
503 c.delobs(sel_rule)
504
505 varnm, data = c.get()
506 print "\nReduced number of observations following the selection rule\n", data
507
508
509 c = copy.deepcopy(b)
510 c.keepobs(sel_rule)
511
512 varnm, data = c.get()
513 print "\nReduced number of observations following the inverse of the selection rule\n", data
514
515
516 x = b.keepvar_copy('a')
517
518
519 x = b.delvar_copy('a')
520
521
522 x = b.keepobs_copy(sel_rule)
523
524
525 x = b.delobs_copy(sel_rule)
526
527
528 b.info()
529
530
531 b.dataplot(file = './dbase_test_files/full_plot.png')
532
533
534 b.add_trend('mytrend')
535
536
537 dummy_rule = b.data['a'] > 0
538 b.add_dummy(dummy_rule,'mydummy')
539
540
541 b.add_seasonal_dummies(52,13)
542
543
544 b.info('b','c', sel = sel_rule)
545
546
547 b.dataplot('b','c', sel = sel_rule, file = './dbase_test_files/partial_plot.png')