1 from scipy import c_, arange, array, unique, kron, ones, eye, nan, isnan, string_
2 from numpy.random import randn
3 from __future__ import division
4 import pylab, cPickle, shelve, csv, copy, os
5
6 class dbase:
7 """
8 Author: Vincent Nijs (+ ?)
9 Email: v-nijs at kellogg.northwestern.edu
10 Last Modified: Sun Jan 7 10:05:12 CST 2007
11
12 Todo:
13 - Check if shelve loading/saving works
14
15 Tested on:
16 - Works on Mac OS X 10.4.8, with full matplotlib (incl. pytz)
17 - Tested on Linux
18
19 Dependencies:
20 - See import statement at the top of this file
21
22 Doc:
23 A simple data-frame, that reads and writes csv/pickle/shelve files with variable names.
24 Data is stored in a dictionary.
25
26 To use the class:
27
28 >>> from dbase import dbase
29 >>> y = dbase('your_filename.csv')
30
31 or for a previously created dbase object stored in a pickle file
32
33 >>> from dbase import dbase
34 >>> y = dbase('your_filename.pickle')
35
36 or without importing the dbase class
37
38 >>> import cPickle
39 >>> f = open('your_filename.pickle','rb')
40 >>> y = cPickle.load(f)
41 >>> data_key = cPickle.load(f)
42 >>> f.close()
43
44 or for a dictionary stored in a shelf file
45
46 >>> from dbase import dbase
47 >>> y = dbase('your_filename.pickle')
48
49 To return a list of variable names and an array of data
50
51 >>> varnm, data = y.get()
52
53 For usage examples of other class methods see the class tests at the bottom of this file. To see the class in action
54 simply run the file using 'python dbase.py'. This will generate some simulated data (data.csv) and save instance data
55 of the class to a pickle file.
56 """
57
58 def __init__(self,fname,var = (),date = ''):
59 """
60 Initializing the dbase class. Loading file fname.
61
62 If you have have a column in your csv file that is a date-string use:
63
64 >>> x = dbase('myfile.csv',date = 0)
65
66 where 0 is the index of the date column
67
68 If you have have an array in your pickle file that is a date variable use:
69
70 >>> x = dbase('myfile.pickle',date = 'date')
71
72 where 'date' is the key of the date array
73 """
74 self.load(fname,var,date)
75
76 def load(self,fname,var,date):
77 """
78 Loading data from a csv or a pickle file of the dbase class.
79 If this is csv file use pylab's load function. Seems much faster
80 than scipy.io.read_array.
81 """
82
83 self.DBname = os.getcwd() + '/' + fname
84
85
86 self.date_key = date
87
88
89 fext = self.__ext(fname)
90
91
92 if fext == 'csv':
93 f = open(fname,'r')
94 self.load_csv(f)
95 f.close()
96 elif fext == 'pickle':
97 f = open(fname,'rb')
98 self.load_pickle(f)
99 f.close()
100 elif fext == 'she':
101 self.load_shelve(fname,var)
102 else:
103 raise 'This class only works on csv, pickle, and shelve files'
104
105
106 self.nobs = self.data[self.data.keys()[0]].shape[0]
107
108 def csvconvert(self,col):
109 """
110 Converting data in a string array to the appropriate type
111 """
112
113 col[col == ''] = 'nan'; col[col == '.'] = 'nan'
114 try:
115
116 return col.astype('i')
117 except ValueError:
118 try:
119 return col.astype('f')
120 except ValueError:
121
122 col[col == 'nan'] = ''
123 return col
124
125 def load_csv_nf(self,f):
126 """
127 Loading data from a csv file using the csv module. Return a list of arrays.
128 Possibly with different types and/or missing values.
129 """
130
131 f.seek(0)
132
133 reader = csv.reader(f)
134
135
136 datalist = array([i for i in reader])
137
138
139 datalist = [self.csvconvert(datalist[1:,i]) for i in range(datalist.shape[1])]
140
141 return datalist
142
143 def load_csv(self,f):
144 """
145 Loading data from a csv file. Uses pylab's load function. Seems much faster
146 than scipy.io.read_array.
147 """
148 varnm = f.readline().split(',')
149
150
151 if self.date_key != '':
152 try:
153 rawdata = pylab.load(f, delimiter=',',converters={self.date_key:pylab.datestr2num})
154 except ValueError:
155 rawdata = self.load_csv_nf(f)
156
157
158 rawdata[self.date_key] = pylab.datestr2num(rawdata[self.date_key])
159
160 self.date_key = varnm[self.date_key]
161 else:
162 try:
163 rawdata = pylab.load(f, delimiter=',')
164 except ValueError:
165 rawdata = self.load_csv_nf(f)
166
167
168 varnm = [i.strip() for i in varnm]
169
170
171 if type(rawdata) == list:
172
173 self.data = dict(zip(varnm,rawdata))
174 else:
175
176 self.data = dict(zip(varnm,rawdata.T))
177
178 def load_pickle(self,f):
179 """
180 Loading data from a created earlier using the the dbase class.
181 """
182 self.data = cPickle.load(f)
183
184
185 if self.date_key == '':
186 try:
187 self.date_key = cPickle.load(f)
188 except:
189 print "No date series in pickle file"
190
191 def load_shelve(self,fname,var):
192 """
193 Loading data from a created earlier using the the dbase class.
194 """
195 data = shelve.open(fname)
196
197
198 if var == ():
199 var = data.keys()
200
201
202 if self.date_key != '':
203 if not self.date_key in var: var = var + list(self.date_key)
204
205 self.data = dict([(i,data[i]) for i in var])
206 data.close()
207
208 def save(self,fname):
209 """
210 Dumping the class data dictionary into a csv or pickle file
211 """
212 fext = self.__ext(fname)
213 if fext == 'csv':
214 f = open(fname,'w')
215 self.save_csv(f)
216 f.close()
217 elif fext == 'pickle':
218 f = open(fname,'wb')
219 self.save_pickle(f)
220 f.close()
221 elif fext == 'she':
222 self.save_shelve(fname)
223 else:
224 raise 'This class only works on csv, pickle, and shelve files'
225
226 def save_csv(self,f):
227 """
228 Dumping the class data dictionary into a csv file
229 """
230 writer = csv.writer(f)
231 writer.writerow(self.data.keys())
232
233 data = self.data
234 if self.date_key != []:
235 data = dict(data)
236 dates = pylab.num2date(data[self.date_key])
237 dates = array([i.strftime('%d %b %y') for i in dates])
238 data[self.date_key] = dates
239
240 writer.writerows(array(data.values()).T)
241
242 def save_pickle(self,f):
243 """
244 Dumping the class data dictionary and date_key into a binary pickle file
245 """
246 cPickle.dump(self.data,f,2)
247 cPickle.dump(self.date_key,f,2)
248
249 def save_shelve(self,fname):
250 """
251 Dumping the class data dictionary into a shelve file
252 """
253 f = shelve.open('data.she','c')
254 f = self.data
255 f.close()
256
257 def add_trend(self,tname = 'trend'):
258
259 self.data[tname] = arange(self.nobs)
260
261 def add_dummy(self,dum, dname = 'dummy'):
262 if self.data.has_key(dname):
263 print "The variable name '" + str(dname) + "' already exists. Please select another name."
264 else:
265 self.data[dname] = dum
266
267 def add_seasonal_dummies(self,freq=52,ndum=13):
268 """
269 This function will only work if the freq and ndum 'fit. That is,
270 weeks and 4-weekly periods will work. Weeks and months/quarters
271 will not.
272 """
273 if self.date_key == []:
274 print "Cannot create seasonal dummies since no date array is known"
275 else:
276
277 years = array([pylab.num2date(i).year for i in self.data[self.date_key]])
278
279
280 start = freq - sum(years == min(years))
281
282
283 nyear = unique(years).shape[0]
284
285
286 sd = kron(ones(nyear),kron(eye(ndum),ones(freq/ndum))).T;
287 sd = sd[start:start+self.nobs]
288 sd = dict([(("sd"+str(i+1)),sd[:,i]) for i in range(1,ndum)])
289 self.data.update(sd)
290
291 def delvar(self,*var):
292 """
293 Deleting specified variables in the data dictionary, changing dictionary in place
294 """
295 [self.data.pop(i) for i in var]
296
297 def keepvar(self,*var):
298 """
299 Keeping specified variables in the data dictionary, changing dictionary in place
300 """
301 [self.data.pop(i) for i in self.data.keys() if i not in var]
302
303 def delvar_copy(self,*var):
304 """
305 Deleting specified variables in the data dictionary, making a copy
306 """
307 return dict([(i,self.data[i]) for i in self.data.keys() if i not in var])
308
309 def keepvar_copy(self,*var):
310 """
311 Keeping specified variables in the data dictionary, making a copy
312 """
313 return dict([(i,self.data[i]) for i in var])
314
315 def delobs(self,sel):
316 """
317 Deleting specified observations, changing dictionary in place
318 """
319 for i in self.data.keys(): self.data[i] = self.data[i][sel]
320
321
322 self.nobs -= sum(sel)
323
324 def keepobs(self,sel):
325 """
326 Keeping specified observations, changing dictionary in place
327 """
328
329 self.nobs -= sum(sel)
330
331 sel -= 1
332 self.delobs(sel)
333
334 def delobs_copy(self,sel):
335 """
336 Deleting specified observations, making a copy
337 """
338 return dict([(i,self.data[i][sel]) for i in self.data.keys()])
339
340 def keepobs_copy(self,sel):
341 """
342 Keeping specified observations, making a copy
343 """
344 sel -= 1
345 self.delobs_copy(sel)
346
347 def get(self,*var,**sel):
348 """
349 Copying data and keys of selected variables for further analysis
350 """
351
352 var, sel = self.__var_and_sel_clean(var, sel)
353
354
355 d = dict((i,self.data[i][sel]) for i in var)
356
357 return d.keys(), array(d.values()).T
358
359 def info(self,*var, **adict):
360 """
361 Printing descriptive statistics on selected variables
362 """
363
364
365 var, sel = self.__var_and_sel_clean(var, adict)
366 dates, nobs = self.__dates_and_nobs_clean(var, sel)
367
368
369 mindate = pylab.num2date(min(dates)).strftime('%d %b %Y')
370 maxdate = pylab.num2date(max(dates)).strftime('%d %b %Y')
371
372
373 nvar = len(var)
374
375 print '\n=============================================================================='
376 print '============================ Database information ============================'
377 print '==============================================================================\n'
378
379 print 'file: %s' % self.DBname
380 print '# obs: %s' % nobs
381 print '# variables: %s' % nvar
382 print 'Start date: %s' % mindate
383 print 'End date: %s' % maxdate
384
385 print '\nvar min max mean std.dev miss levels'
386 print '=============================================================================='
387
388 for i in var:
389 col = self.data[i][sel];
390 if type(col[0]) == string_:
391 _miss = sum(col == '')
392 print '''%-5s %-5s %-5s %-5s %-5s % -5.0f %-5i''' % tuple([i,'-','-','-','-',_miss,len(set(col))])
393 else:
394 _miss = isnan(col); col = col[_miss == False]; _min = col.min(); _max = col.max(); _mean = col.mean(); _std = col.std()
395 print '''% -5s % -5.2f % -5.2f % -5.2f % -5.2f % -5.0f''' % tuple([i,_min,_max,_mean,_std,sum(_miss)])
396
397 def dataplot(self,*var, **adict):
398 """
399 Plotting the data with variable names
400 """
401
402 var, sel = self.__var_and_sel_clean(var, adict)
403 dates, nobs = self.__dates_and_nobs_clean(var, sel)
404
405
406 nvar = []
407 for i in var:
408 col = self.data[i][sel]
409 if type(col[0]) != string_:
410 pylab.plot_date(dates,self.data[i][sel],'o-')
411 nvar = nvar + [i]
412
413 pylab.xlabel("Time (n = " + str(nobs) + ")")
414 pylab.title("Data plot of " + self.DBname)
415 pylab.legend(nvar)
416 if adict.has_key('file'):
417 pylab.savefig(adict['file'],dpi=600)
418 pylab.show()
419
420 def __var_and_sel_clean(self, var, sel, dates_needed = True):
421 """
422 Convenience function to avoid code duplication
423 """
424
425 if var == ():
426 var = self.data.keys()
427
428
429 var = [x for x in var if x != self.date_key]
430
431
432 var.sort()
433
434
435
436 if not sel.has_key('sel'):
437 sel = ()
438 else:
439 sel = sel['sel']
440
441 return var, sel
442
443 def __dates_and_nobs_clean(self, var, sel):
444 """
445 Convenience function to avoid code duplication
446 """
447 nobs = self.nobs
448 if len(sel):
449 nobs = nobs - (nobs - sum(sel))
450
451 if self.date_key != None and self.data.has_key(self.date_key):
452
453 dates = self.data[self.date_key][sel]
454 else:
455
456 dates = range(711858,nobs+711858)
457
458 return dates, nobs
459
460 def __ext(self,fname):
461 """
462 Finding the file extension of the filename passed to dbase
463 """
464 return fname.split('.')[-1].strip()
465
466 if __name__ == '__main__':
467
468
469
470
471
472 import sys
473 from scipy import c_
474
475
476 if not os.path.exists('./dbase_test_files'): os.mkdir('./dbase_test_files')
477
478
479 varnm = ['date','a','b','c']
480 nobs = 100
481 data = randn(nobs,3)
482 dates = pylab.num2date(arange(730493,730493+(nobs*7),7))
483 dates = [i.strftime('%d %b %y') for i in dates]
484 data = c_[dates,data]
485
486
487 data[5,1] = ''
488 data[9,3] = ''
489
490
491 varnm = varnm + ['id']
492 id = [('id'+str(i)) for i in range(nobs)]
493 id[8] = ''
494 data = c_[data,id]
495
496
497 f = open('./dbase_test_files/data.csv','w')
498 writer = csv.writer(f)
499 writer.writerow(varnm)
500 writer.writerows(data)
501 f.close()
502
503
504 a = dbase("./dbase_test_files/data.csv",date = 0)
505
506
507 a.save("./dbase_test_files/data.pickle")
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523 print "\nLoading the dbase object from a pickle file\n"
524 b = dbase("./dbase_test_files/data.pickle")
525
526
527 print "\nWorking on file: " + b.DBname
528
529
530 varnm, data = b.get()
531 print "Variable names from dbase class\n", varnm
532 print "\nData from dbase class\n", data
533 print "\nDate series", b.data[b.date_key]
534
535
536 varnm, data = b.get('a','c')
537 print "\nTwo columns selected using variable names\n", varnm, "\n", data
538
539
540 print "\nSaving data and variable names to a different csv file\n", b.save("./dbase_test_files/data_save.csv")
541
542
543 x1 = b.data['a'] * b.data['b']
544 x2 = b.data['a'] * b.data['c']
545 xdict = {'x1':x1,'x2':x2}
546 b.data.update(xdict)
547
548 varnm, data = b.get()
549 print "\nTwo variable names added\n", varnm
550 print "\nTwo columns added\n", data
551
552
553 import copy
554 c = copy.deepcopy(b)
555
556
557 c.delvar('a','x2')
558 varnm, data = c.get()
559 print "\nTwo variable names deleted\n", varnm
560 print "\nTwo columns deleted\n", data
561
562
563 c = copy.deepcopy(b)
564 c.keepvar('a','x2')
565 varnm, data = c.get()
566 print "\nAll but two variable names deleted\n", varnm
567 print "\nAll but Two columns deleted\n", data
568
569
570 sel_rule = b.data['date'] > pylab.datestr2num("8/1/2001")
571
572
573 c = copy.deepcopy(b)
574 c.delobs(sel_rule)
575
576 varnm, data = c.get()
577 print "\nReduced number of observations following the selection rule\n", data
578
579
580 c = copy.deepcopy(b)
581 c.keepobs(sel_rule)
582
583 varnm, data = c.get()
584 print "\nReduced number of observations following the inverse of the selection rule\n", data
585
586
587 x = b.keepvar_copy('a')
588
589
590 x = b.delvar_copy('a')
591
592
593 x = b.keepobs_copy(sel_rule)
594
595
596 x = b.delobs_copy(sel_rule)
597
598
599 b.info()
600
601
602 b.dataplot(file = './dbase_test_files/full_plot.png')
603
604
605 b.add_trend('mytrend')
606
607
608 dummy_rule = b.data['a'] > 0
609 b.add_dummy(dummy_rule,'mydummy')
610
611
612 b.add_seasonal_dummies(52,13)
613
614
615 b.info('b','c','mydummy', sel = sel_rule)
616
617
618 b.dataplot('b','c','mydummy', sel = sel_rule, file = './dbase_test_files/partial_plot.png')