1 from __future__ import division
2 from scipy import c_, arange, array, unique, kron, ones, eye, nan, isnan, string_
3 from numpy.random import randn
4 from numpy.core.records import recarray
5 import pylab, cPickle, shelve, csv, copy, os
6
7 class dbase:
8 """
9 Author: Vincent Nijs (+ ?)
10 Email: v-nijs at kellogg.northwestern.edu
11 Last Modified: Thu Jan 18 22:32:56 CST 2007
12
13 Todo:
14 - Check if shelve loading/saving works
15
16 Tested on:
17 - Works on Mac OS X 10.4.8, with full matplotlib (incl. pytz)
18 - Tested on Linux
19
20 Dependencies:
21 - See import statement at the top of this file
22
23 Doc:
24 A simple data-frame, that reads and writes csv/pickle/shelve files with variable names.
25 Data is stored in a dictionary.
26
27 To use the class:
28
29 >>> from dbase import dbase
30 >>> y = dbase('your_filename.csv')
31
32 or for a previously created dbase object stored in a pickle file
33
34 >>> from dbase import dbase
35 >>> y = dbase('your_filename.pickle')
36
37 or without importing the dbase class
38
39 >>> import cPickle
40 >>> f = open('your_filename.pickle','rb')
41 >>> y = cPickle.load(f)
42 >>> data_key = cPickle.load(f)
43 >>> f.close()
44
45 or for a dictionary stored in a shelf file
46
47 >>> from dbase import dbase
48 >>> y = dbase('your_filename.pickle')
49
50 To return a list of variable names and an array of data
51
52 >>> varnm, data = y.get()
53
54 For usage examples of other class methods see the class tests at the bottom of this file. To see the class in action
55 simply run the file using 'python dbase.py'. This will generate some simulated data (data.csv) and save instance data
56 of the class to a pickle file.
57 """
58
59 def __init__(self,fname,var = (),date = ''):
60 """
61 Initializing the dbase class. Loading file fname.
62
63 If you have have a column in your csv file that is a date-string use:
64
65 >>> x = dbase('myfile.csv',date = 0)
66
67 where 0 is the index of the date column
68
69 If you have have an array in your pickle file that is a date variable use:
70
71 >>> x = dbase('myfile.pickle',date = 'date')
72
73 where 'date' is the key of the date array
74 """
75 self.load(fname,var,date)
76
77 def load(self,fname,var,date):
78 """
79 Loading data from a csv or a pickle file of the dbase class.
80 If this is csv file use pylab's load function. Seems much faster
81 than scipy.io.read_array.
82 """
83
84 self.DBname = os.getcwd() + '/' + fname
85
86
87 self.date_key = date
88
89
90 fext = self.__ext(fname)
91
92
93 if fext == 'csv':
94 f = open(fname,'r')
95 self.load_csv(f)
96 f.close()
97 elif fext == 'pickle':
98 f = open(fname,'rb')
99 self.load_pickle(f)
100 f.close()
101 elif fext == 'she':
102 self.load_shelve(fname,var)
103 else:
104 raise 'This class only works on csv, pickle, and shelve files'
105
106
107 self.nobs = self.data[self.data.keys()[0]].shape[0]
108
109 def csvconvert(self,col):
110 """
111 Converting data in a string array to the appropriate type
112 """
113
114 col[col == ''] = 'nan'; col[col == '.'] = 'nan'
115 try:
116
117 return col.astype('i')
118 except ValueError:
119 try:
120 return col.astype('f')
121 except ValueError:
122
123 col[col == 'nan'] = ''
124 return col
125
126 def load_csv_nf(self,f):
127 """
128 Loading data from a csv file using the csv module. Return a list of arrays.
129 Possibly with different types and/or missing values.
130 """
131
132 f.seek(0)
133
134 reader = csv.reader(f)
135
136
137 datalist = array([i for i in reader])
138
139
140 datalist = [self.csvconvert(datalist[1:,i]) for i in range(datalist.shape[1])]
141
142 return datalist
143
144 def load_csv(self,f):
145 """
146 Loading data from a csv file. Uses pylab's load function. Seems much faster
147 than scipy.io.read_array.
148 """
149 varnm = f.readline().split(',')
150
151
152 if self.date_key != '':
153 try:
154 rawdata = pylab.load(f, delimiter=',',converters={self.date_key:pylab.datestr2num})
155 except ValueError:
156 rawdata = self.load_csv_nf(f)
157
158
159 rawdata[self.date_key] = pylab.datestr2num(rawdata[self.date_key])
160
161 self.date_key = varnm[self.date_key]
162 else:
163 try:
164 rawdata = pylab.load(f, delimiter=',')
165 except ValueError:
166 rawdata = self.load_csv_nf(f)
167
168
169 varnm = [i.strip() for i in varnm]
170
171
172 if type(rawdata) == list:
173
174 self.data = dict(zip(varnm,rawdata))
175 else:
176
177 self.data = dict(zip(varnm,rawdata.T))
178
179 def load_pickle(self,f):
180 """
181 Loading data from a created earlier using the the dbase class.
182 """
183 self.data = cPickle.load(f)
184
185
186 if self.date_key == '':
187 try:
188 self.date_key = cPickle.load(f)
189 except:
190 print "No date series in pickle file"
191
192 def load_shelve(self,fname,var):
193 """
194 Loading data from a created earlier using the the dbase class.
195 """
196 data = shelve.open(fname)
197
198
199 if var == ():
200 var = data.keys()
201
202
203 if self.date_key != '':
204 if not self.date_key in var: var = var + list(self.date_key)
205
206 self.data = dict([(i,data[i]) for i in var])
207 data.close()
208
209 def save(self,fname):
210 """
211 Dumping the class data dictionary into a csv or pickle file
212 """
213 fext = self.__ext(fname)
214 if fext == 'csv':
215 f = open(fname,'w')
216 self.save_csv(f)
217 f.close()
218 elif fext == 'pickle':
219 f = open(fname,'wb')
220 self.save_pickle(f)
221 f.close()
222 elif fext == 'she':
223 self.save_shelve(fname)
224 else:
225 raise 'This class only works on csv, pickle, and shelve files'
226
227 def save_csv(self,f):
228 """
229 Dumping the class data dictionary into a csv file
230 """
231 writer = csv.writer(f)
232 writer.writerow(self.data.keys())
233
234 data = self.data
235 if self.date_key != []:
236 data = dict(data)
237 dates = pylab.num2date(data[self.date_key])
238 dates = array([i.strftime('%d %b %y') for i in dates])
239 data[self.date_key] = dates
240
241 writer.writerows(array(data.values()).T)
242
243 def save_pickle(self,f):
244 """
245 Dumping the class data dictionary and date_key into a binary pickle file
246 """
247 cPickle.dump(self.data,f,2)
248 cPickle.dump(self.date_key,f,2)
249
250 def save_shelve(self,fname):
251 """
252 Dumping the class data dictionary into a shelve file
253 """
254 f = shelve.open('data.she','c')
255 f = self.data
256 f.close()
257
258 def add_trend(self,tname = 'trend'):
259
260 self.data[tname] = arange(self.nobs)
261
262 def add_dummy(self,dum, dname = 'dummy'):
263 if self.data.has_key(dname):
264 print "The variable name '" + str(dname) + "' already exists. Please select another name."
265 else:
266 self.data[dname] = dum
267
268 def add_seasonal_dummies(self,freq=52,ndum=13):
269 """
270 This function will only work if the freq and ndum 'fit. That is,
271 weeks and 4-weekly periods will work. Weeks and months/quarters
272 will not.
273 """
274 if self.date_key == []:
275 print "Cannot create seasonal dummies since no date array is known"
276 else:
277
278 years = array([pylab.num2date(i).year for i in self.data[self.date_key]])
279
280
281 start = freq - sum(years == min(years))
282
283
284 nyear = unique(years).shape[0]
285
286
287 sd = kron(ones(nyear),kron(eye(ndum),ones(freq/ndum))).T;
288 sd = sd[start:start+self.nobs]
289 sd = dict([(("sd"+str(i+1)),sd[:,i]) for i in range(1,ndum)])
290 self.data.update(sd)
291
292 def delvar(self,*var):
293 """
294 Deleting specified variables in the data dictionary, changing dictionary in place
295 """
296 [self.data.pop(i) for i in var]
297
298 def keepvar(self,*var):
299 """
300 Keeping specified variables in the data dictionary, changing dictionary in place
301 """
302 [self.data.pop(i) for i in self.data.keys() if i not in var]
303
304 def delvar_copy(self,*var):
305 """
306 Deleting specified variables in the data dictionary, making a copy
307 """
308 return dict([(i,self.data[i]) for i in self.data.keys() if i not in var])
309
310 def keepvar_copy(self,*var):
311 """
312 Keeping specified variables in the data dictionary, making a copy
313 """
314 return dict([(i,self.data[i]) for i in var])
315
316 def delobs(self,sel):
317 """
318 Deleting specified observations, changing dictionary in place
319 """
320 for i in self.data.keys(): self.data[i] = self.data[i][sel]
321
322
323 self.nobs -= sum(sel)
324
325 def keepobs(self,sel):
326 """
327 Keeping specified observations, changing dictionary in place
328 """
329
330 self.nobs -= sum(sel)
331
332 sel -= 1
333 self.delobs(sel)
334
335 def delobs_copy(self,sel):
336 """
337 Deleting specified observations, making a copy
338 """
339 return dict([(i,self.data[i][sel]) for i in self.data.keys()])
340
341 def keepobs_copy(self,sel):
342 """
343 Keeping specified observations, making a copy
344 """
345 sel -= 1
346 self.delobs_copy(sel)
347
348 def get(self,*var,**sel):
349 """
350 Copying data and keys of selected variables for further analysis
351 """
352
353 var, sel = self.__var_and_sel_clean(var, sel)
354
355
356 d = dict((i,self.data[i][sel]) for i in var)
357
358 return d.keys(), array(d.values()).T
359
360 def info(self,*var, **adict):
361 """
362 Printing descriptive statistics on selected variables
363 """
364
365
366 var, sel = self.__var_and_sel_clean(var, adict)
367 dates, nobs = self.__dates_and_nobs_clean(var, sel)
368
369
370 mindate = pylab.num2date(min(dates)).strftime('%d %b %Y')
371 maxdate = pylab.num2date(max(dates)).strftime('%d %b %Y')
372
373
374 nvar = len(var)
375
376 print '\n=============================================================================='
377 print '============================ Database information ============================'
378 print '==============================================================================\n'
379
380 print 'file: %s' % self.DBname
381 print '# obs: %s' % nobs
382 print '# variables: %s' % nvar
383 print 'Start date: %s' % mindate
384 print 'End date: %s' % maxdate
385
386 print '\nvar min max mean std.dev miss levels'
387 print '=============================================================================='
388
389 sets = {}
390 for i in var:
391 col = self.data[i][sel];
392 if type(col[0]) == string_:
393 _miss = sum(col == '')
394 col_set = set(col)
395 sets[i] = col_set
396 print '''%-5s %-5s %-5s %-5s %-5s % -5.0f %-5i''' % tuple([i,'-','-','-','-',_miss,len(col_set)])
397 else:
398 _miss = isnan(col); col = col[_miss == False]; _min = col.min(); _max = col.max(); _mean = col.mean(); _std = col.std()
399 print '''% -5s % -5.2f % -5.2f % -5.2f % -5.2f % -5.0f''' % tuple([i,_min,_max,_mean,_std,sum(_miss)])
400
401 if sets:
402 print '\n\nLevels for non-numeric data:'
403 for i in sets.keys():
404 print '=============================================================================='
405 print '''% -5s % -5s''' % tuple([i,sets[i]])
406
407 def dataplot(self,*var, **adict):
408 """
409 Plotting the data with variable names
410 """
411
412 var, sel = self.__var_and_sel_clean(var, adict)
413 dates, nobs = self.__dates_and_nobs_clean(var, sel)
414
415
416 nvar = []
417 for i in var:
418 col = self.data[i][sel]
419 if type(col[0]) != string_:
420 pylab.plot_date(dates,self.data[i][sel],'o-')
421 nvar = nvar + [i]
422
423 pylab.xlabel("Time (n = " + str(nobs) + ")")
424 pylab.title("Data plot of " + self.DBname)
425 pylab.legend(nvar)
426 if adict.has_key('file'):
427 pylab.savefig(adict['file'],dpi=600)
428 pylab.show()
429
430 def to_recarray(self):
431 """
432 Transform the data dictionary to a recarray or a masked array if missing values
433 are present. Based on input from Pierre GM.
434 """
435
436 descr = [(k,self.data[k].dtype) for k in b.data.keys()]
437
438 ra = recarray((self.nobs,), dtype=descr)
439
440 [setattr(ra,n,v) for (n,v) in self.data.iteritems()]
441
442 return ra
443
444 def __var_and_sel_clean(self, var, sel, dates_needed = True):
445 """
446 Convenience function to avoid code duplication
447 """
448
449 if var == ():
450 var = self.data.keys()
451
452
453 var = [x for x in var if x != self.date_key]
454
455
456 var.sort()
457
458
459
460 if not sel.has_key('sel'):
461 sel = ()
462 else:
463 sel = sel['sel']
464
465 return var, sel
466
467 def __dates_and_nobs_clean(self, var, sel):
468 """
469 Convenience function to avoid code duplication
470 """
471 nobs = self.nobs
472 if len(sel):
473 nobs = nobs - (nobs - sum(sel))
474
475 if self.date_key != None and self.data.has_key(self.date_key):
476
477 dates = self.data[self.date_key][sel]
478 else:
479
480 dates = range(711858,nobs+711858)
481
482 return dates, nobs
483
484 def __ext(self,fname):
485 """
486 Finding the file extension of the filename passed to dbase
487 """
488 return fname.split('.')[-1].strip()
489
490 if __name__ == '__main__':
491
492
493
494
495
496 import sys
497 from scipy import c_
498
499
500 if not os.path.exists('./dbase_test_files'): os.mkdir('./dbase_test_files')
501
502
503 varnm = ['date','a','b','c']
504 nobs = 100
505 data = randn(nobs,3)
506 dates = pylab.num2date(arange(730493,730493+(nobs*7),7))
507 dates = [i.strftime('%d %b %y') for i in dates]
508 data = c_[dates,data]
509
510
511 data[5,1] = ''
512 data[9,3] = ''
513
514
515 varnm = varnm + ['id']
516 id = [('id'+str(i)) for i in range(nobs)]
517 id[8] = ''
518 data = c_[data,id]
519
520
521 f = open('./dbase_test_files/data.csv','w')
522 writer = csv.writer(f)
523 writer.writerow(varnm)
524 writer.writerows(data)
525 f.close()
526
527
528 a = dbase("./dbase_test_files/data.csv",date = 0)
529
530
531 a.save("./dbase_test_files/data.pickle")
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547 print "\nLoading the dbase object from a pickle file\n"
548 b = dbase("./dbase_test_files/data.pickle")
549
550
551 ra = b.to_recarray()
552
553 print '\nLables (+ type) in newly created recarray', ra.dtype
554 print '\nData for column "a"', ra.a
555 print '\nData for first 10 rows, for column "a"', ra['a'][:10]
556
557
558 print "\nWorking on file: " + b.DBname
559
560
561 varnm, data = b.get()
562 print "Variable names from dbase class\n", varnm
563 print "\nData from dbase class\n", data
564 print "\nDate series", b.data[b.date_key]
565
566
567 varnm, data = b.get('a','c')
568 print "\nTwo columns selected using variable names\n", varnm, "\n", data
569
570
571 print "\nSaving data and variable names to a different csv file\n", b.save("./dbase_test_files/data_save.csv")
572
573
574 x1 = b.data['a'] * b.data['b']
575 x2 = b.data['a'] * b.data['c']
576 xdict = {'x1':x1,'x2':x2}
577 b.data.update(xdict)
578
579 varnm, data = b.get()
580 print "\nTwo variable names added\n", varnm
581 print "\nTwo columns added\n", data
582
583
584 import copy
585 c = copy.deepcopy(b)
586
587
588 c.delvar('a','x2')
589 varnm, data = c.get()
590 print "\nTwo variable names deleted\n", varnm
591 print "\nTwo columns deleted\n", data
592
593
594 c = copy.deepcopy(b)
595 c.keepvar('a','x2')
596 varnm, data = c.get()
597 print "\nAll but two variable names deleted\n", varnm
598 print "\nAll but Two columns deleted\n", data
599
600
601 sel_rule = b.data['date'] > pylab.datestr2num("8/1/2001")
602
603
604 c = copy.deepcopy(b)
605 c.delobs(sel_rule)
606
607 varnm, data = c.get()
608 print "\nReduced number of observations following the selection rule\n", data
609
610
611 c = copy.deepcopy(b)
612 c.keepobs(sel_rule)
613
614 varnm, data = c.get()
615 print "\nReduced number of observations following the inverse of the selection rule\n", data
616
617
618 x = b.keepvar_copy('a')
619
620
621 x = b.delvar_copy('a')
622
623
624 x = b.keepobs_copy(sel_rule)
625
626
627 x = b.delobs_copy(sel_rule)
628
629
630 b.info()
631
632
633 b.dataplot(file = './dbase_test_files/full_plot.png')
634
635
636 b.add_trend('mytrend')
637
638
639 dummy_rule = b.data['a'] > 0
640 b.add_dummy(dummy_rule,'mydummy')
641
642
643 b.add_seasonal_dummies(52,13)
644
645
646 b.info('b','c','mydummy', sel = sel_rule)
647
648
649 b.dataplot('b','c','mydummy', sel = sel_rule, file = './dbase_test_files/partial_plot.png')