1 from __future__ import division
2 from scipy import c_, arange, array, unique, kron, ones, eye, nan, isnan, string_
3 from numpy.random import randn
4 import pylab, cPickle, shelve, csv, copy, os
5
6 class dbase:
7 """
8 Author: Vincent Nijs (+ ?)
9 Email: v-nijs at kellogg.northwestern.edu
10 Last Modified: Sun Jan 7 10:05:12 CST 2007
11
12 Todo:
13 - Check if shelve loading/saving works
14
15 Tested on:
16 - Works on Mac OS X 10.4.8, with full matplotlib (incl. pytz)
17 - Tested on Linux
18
19 Dependencies:
20 - See import statement at the top of this file
21
22 Doc:
23 A simple data-frame, that reads and writes csv/pickle/shelve files with variable names.
24 Data is stored in a dictionary.
25
26 To use the class:
27
28 >>> from dbase import dbase
29 >>> y = dbase('your_filename.csv')
30
31 or for a previously created dbase object stored in a pickle file
32
33 >>> from dbase import dbase
34 >>> y = dbase('your_filename.pickle')
35
36 or without importing the dbase class
37
38 >>> import cPickle
39 >>> f = open('your_filename.pickle','rb')
40 >>> y = cPickle.load(f)
41 >>> data_key = cPickle.load(f)
42 >>> f.close()
43
44 or for a dictionary stored in a shelf file
45
46 >>> from dbase import dbase
47 >>> y = dbase('your_filename.pickle')
48
49 To return a list of variable names and an array of data
50
51 >>> varnm, data = y.get()
52
53 For usage examples of other class methods see the class tests at the bottom of this file. To see the class in action
54 simply run the file using 'python dbase.py'. This will generate some simulated data (data.csv) and save instance data
55 of the class to a pickle file.
56 """
57
58 def __init__(self,fname,var = (),date = ''):
59 """
60 Initializing the dbase class. Loading file fname.
61
62 If you have have a column in your csv file that is a date-string use:
63
64 >>> x = dbase('myfile.csv',date = 0)
65
66 where 0 is the index of the date column
67
68 If you have have an array in your pickle file that is a date variable use:
69
70 >>> x = dbase('myfile.pickle',date = 'date')
71
72 where 'date' is the key of the date array
73 """
74 self.load(fname,var,date)
75
76 def load(self,fname,var,date):
77 """
78 Loading data from a csv or a pickle file of the dbase class.
79 If this is csv file use pylab's load function. Seems much faster
80 than scipy.io.read_array.
81 """
82
83 self.DBname = os.getcwd() + '/' + fname
84
85
86 self.date_key = date
87
88
89 fext = self.__ext(fname)
90
91
92 if fext == 'csv':
93 f = open(fname,'r')
94 self.load_csv(f)
95 f.close()
96 elif fext == 'pickle':
97 f = open(fname,'rb')
98 self.load_pickle(f)
99 f.close()
100 elif fext == 'she':
101 self.load_shelve(fname,var)
102 else:
103 raise 'This class only works on csv, pickle, and shelve files'
104
105
106 self.nobs = self.data[self.data.keys()[0]].shape[0]
107
108 def csvconvert(self,col):
109 """
110 Converting data in a string array to the appropriate type
111 """
112
113 col[col == ''] = 'nan'; col[col == '.'] = 'nan'
114 try:
115
116 return col.astype('i')
117 except ValueError:
118 try:
119 return col.astype('f')
120 except ValueError:
121
122 col[col == 'nan'] = ''
123 return col
124
125 def load_csv_nf(self,f):
126 """
127 Loading data from a csv file using the csv module. Return a list of arrays.
128 Possibly with different types and/or missing values.
129 """
130
131 f.seek(0)
132
133 reader = csv.reader(f)
134
135
136 datalist = array([i for i in reader])
137
138
139 datalist = [self.csvconvert(datalist[1:,i]) for i in range(datalist.shape[1])]
140
141 return datalist
142
143 def load_csv(self,f):
144 """
145 Loading data from a csv file. Uses pylab's load function. Seems much faster
146 than scipy.io.read_array.
147 """
148 varnm = f.readline().split(',')
149
150
151 if self.date_key != '':
152 try:
153 rawdata = pylab.load(f, delimiter=',',converters={self.date_key:pylab.datestr2num})
154 except ValueError:
155 rawdata = self.load_csv_nf(f)
156
157
158 rawdata[self.date_key] = pylab.datestr2num(rawdata[self.date_key])
159
160 self.date_key = varnm[self.date_key]
161 else:
162 try:
163 rawdata = pylab.load(f, delimiter=',')
164 except ValueError:
165 rawdata = self.load_csv_nf(f)
166
167
168 varnm = [i.strip() for i in varnm]
169
170
171 if type(rawdata) == list:
172
173 self.data = dict(zip(varnm,rawdata))
174 else:
175
176 self.data = dict(zip(varnm,rawdata.T))
177
178 def load_pickle(self,f):
179 """
180 Loading data from a created earlier using the the dbase class.
181 """
182 self.data = cPickle.load(f)
183
184
185 if self.date_key == '':
186 try:
187 self.date_key = cPickle.load(f)
188 except:
189 print "No date series in pickle file"
190
191 def load_shelve(self,fname,var):
192 """
193 Loading data from a created earlier using the the dbase class.
194 """
195 data = shelve.open(fname)
196
197
198 if var == ():
199 var = data.keys()
200
201
202 if self.date_key != '':
203 if not self.date_key in var: var = var + list(self.date_key)
204
205 self.data = dict([(i,data[i]) for i in var])
206 data.close()
207
208 def save(self,fname):
209 """
210 Dumping the class data dictionary into a csv or pickle file
211 """
212 fext = self.__ext(fname)
213 if fext == 'csv':
214 f = open(fname,'w')
215 self.save_csv(f)
216 f.close()
217 elif fext == 'pickle':
218 f = open(fname,'wb')
219 self.save_pickle(f)
220 f.close()
221 elif fext == 'she':
222 self.save_shelve(fname)
223 else:
224 raise 'This class only works on csv, pickle, and shelve files'
225
226 def save_csv(self,f):
227 """
228 Dumping the class data dictionary into a csv file
229 """
230 writer = csv.writer(f)
231 writer.writerow(self.data.keys())
232
233 data = self.data
234 if self.date_key != []:
235 data = dict(data)
236 dates = pylab.num2date(data[self.date_key])
237 dates = array([i.strftime('%d %b %y') for i in dates])
238 data[self.date_key] = dates
239
240 writer.writerows(array(data.values()).T)
241
242 def save_pickle(self,f):
243 """
244 Dumping the class data dictionary and date_key into a binary pickle file
245 """
246 cPickle.dump(self.data,f,2)
247 cPickle.dump(self.date_key,f,2)
248
249 def save_shelve(self,fname):
250 """
251 Dumping the class data dictionary into a shelve file
252 """
253 f = shelve.open('data.she','c')
254 f = self.data
255 f.close()
256
257 def add_trend(self,tname = 'trend'):
258
259 self.data[tname] = arange(self.nobs)
260
261 def add_dummy(self,dum, dname = 'dummy'):
262 if self.data.has_key(dname):
263 print "The variable name '" + str(dname) + "' already exists. Please select another name."
264 else:
265 self.data[dname] = dum
266
267 def add_seasonal_dummies(self,freq=52,ndum=13):
268 """
269 This function will only work if the freq and ndum 'fit. That is,
270 weeks and 4-weekly periods will work. Weeks and months/quarters
271 will not.
272 """
273 if self.date_key == []:
274 print "Cannot create seasonal dummies since no date array is known"
275 else:
276
277 years = array([pylab.num2date(i).year for i in self.data[self.date_key]])
278
279
280 start = freq - sum(years == min(years))
281
282
283 nyear = unique(years).shape[0]
284
285
286 sd = kron(ones(nyear),kron(eye(ndum),ones(freq/ndum))).T;
287 sd = sd[start:start+self.nobs]
288 sd = dict([(("sd"+str(i+1)),sd[:,i]) for i in range(1,ndum)])
289 self.data.update(sd)
290
291 def delvar(self,*var):
292 """
293 Deleting specified variables in the data dictionary, changing dictionary in place
294 """
295 [self.data.pop(i) for i in var]
296
297 def keepvar(self,*var):
298 """
299 Keeping specified variables in the data dictionary, changing dictionary in place
300 """
301 [self.data.pop(i) for i in self.data.keys() if i not in var]
302
303 def delvar_copy(self,*var):
304 """
305 Deleting specified variables in the data dictionary, making a copy
306 """
307 return dict([(i,self.data[i]) for i in self.data.keys() if i not in var])
308
309 def keepvar_copy(self,*var):
310 """
311 Keeping specified variables in the data dictionary, making a copy
312 """
313 return dict([(i,self.data[i]) for i in var])
314
315 def delobs(self,sel):
316 """
317 Deleting specified observations, changing dictionary in place
318 """
319 for i in self.data.keys(): self.data[i] = self.data[i][sel]
320
321
322 self.nobs -= sum(sel)
323
324 def keepobs(self,sel):
325 """
326 Keeping specified observations, changing dictionary in place
327 """
328
329 self.nobs -= sum(sel)
330
331 sel -= 1
332 self.delobs(sel)
333
334 def delobs_copy(self,sel):
335 """
336 Deleting specified observations, making a copy
337 """
338 return dict([(i,self.data[i][sel]) for i in self.data.keys()])
339
340 def keepobs_copy(self,sel):
341 """
342 Keeping specified observations, making a copy
343 """
344 sel -= 1
345 self.delobs_copy(sel)
346
347 def get(self,*var,**sel):
348 """
349 Copying data and keys of selected variables for further analysis
350 """
351
352 var, sel = self.__var_and_sel_clean(var, sel)
353
354
355 d = dict((i,self.data[i][sel]) for i in var)
356
357 return d.keys(), array(d.values()).T
358
359 def info(self,*var, **adict):
360 """
361 Printing descriptive statistics on selected variables
362 """
363
364
365 var, sel = self.__var_and_sel_clean(var, adict)
366 dates, nobs = self.__dates_and_nobs_clean(var, sel)
367
368
369 mindate = pylab.num2date(min(dates)).strftime('%d %b %Y')
370 maxdate = pylab.num2date(max(dates)).strftime('%d %b %Y')
371
372
373 nvar = len(var)
374
375 print '\n=============================================================================='
376 print '============================ Database information ============================'
377 print '==============================================================================\n'
378
379 print 'file: %s' % self.DBname
380 print '# obs: %s' % nobs
381 print '# variables: %s' % nvar
382 print 'Start date: %s' % mindate
383 print 'End date: %s' % maxdate
384
385 print '\nvar min max mean std.dev miss levels'
386 print '=============================================================================='
387
388 sets = {}
389 for i in var:
390 col = self.data[i][sel];
391 if type(col[0]) == string_:
392 _miss = sum(col == '')
393 col_set = set(col)
394 sets[i] = col_set
395 print '''%-5s %-5s %-5s %-5s %-5s % -5.0f %-5i''' % tuple([i,'-','-','-','-',_miss,len(col_set)])
396 else:
397 _miss = isnan(col); col = col[_miss == False]; _min = col.min(); _max = col.max(); _mean = col.mean(); _std = col.std()
398 print '''% -5s % -5.2f % -5.2f % -5.2f % -5.2f % -5.0f''' % tuple([i,_min,_max,_mean,_std,sum(_miss)])
399
400 if sets:
401 print '\n\nLevels for non-numeric data:'
402 for i in sets.keys():
403 print '=============================================================================='
404 print '''% -5s % -5s''' % tuple([i,sets[i]])
405
406 def dataplot(self,*var, **adict):
407 """
408 Plotting the data with variable names
409 """
410
411 var, sel = self.__var_and_sel_clean(var, adict)
412 dates, nobs = self.__dates_and_nobs_clean(var, sel)
413
414
415 nvar = []
416 for i in var:
417 col = self.data[i][sel]
418 if type(col[0]) != string_:
419 pylab.plot_date(dates,self.data[i][sel],'o-')
420 nvar = nvar + [i]
421
422 pylab.xlabel("Time (n = " + str(nobs) + ")")
423 pylab.title("Data plot of " + self.DBname)
424 pylab.legend(nvar)
425 if adict.has_key('file'):
426 pylab.savefig(adict['file'],dpi=600)
427 pylab.show()
428
429 def __var_and_sel_clean(self, var, sel, dates_needed = True):
430 """
431 Convenience function to avoid code duplication
432 """
433
434 if var == ():
435 var = self.data.keys()
436
437
438 var = [x for x in var if x != self.date_key]
439
440
441 var.sort()
442
443
444
445 if not sel.has_key('sel'):
446 sel = ()
447 else:
448 sel = sel['sel']
449
450 return var, sel
451
452 def __dates_and_nobs_clean(self, var, sel):
453 """
454 Convenience function to avoid code duplication
455 """
456 nobs = self.nobs
457 if len(sel):
458 nobs = nobs - (nobs - sum(sel))
459
460 if self.date_key != None and self.data.has_key(self.date_key):
461
462 dates = self.data[self.date_key][sel]
463 else:
464
465 dates = range(711858,nobs+711858)
466
467 return dates, nobs
468
469 def __ext(self,fname):
470 """
471 Finding the file extension of the filename passed to dbase
472 """
473 return fname.split('.')[-1].strip()
474
475 if __name__ == '__main__':
476
477
478
479
480
481 import sys
482 from scipy import c_
483
484
485 if not os.path.exists('./dbase_test_files'): os.mkdir('./dbase_test_files')
486
487
488 varnm = ['date','a','b','c']
489 nobs = 100
490 data = randn(nobs,3)
491 dates = pylab.num2date(arange(730493,730493+(nobs*7),7))
492 dates = [i.strftime('%d %b %y') for i in dates]
493 data = c_[dates,data]
494
495
496 data[5,1] = ''
497 data[9,3] = ''
498
499
500 varnm = varnm + ['id']
501 id = [('id'+str(i)) for i in range(nobs)]
502 id[8] = ''
503 data = c_[data,id]
504
505
506 f = open('./dbase_test_files/data.csv','w')
507 writer = csv.writer(f)
508 writer.writerow(varnm)
509 writer.writerows(data)
510 f.close()
511
512
513 a = dbase("./dbase_test_files/data.csv",date = 0)
514
515
516 a.save("./dbase_test_files/data.pickle")
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532 print "\nLoading the dbase object from a pickle file\n"
533 b = dbase("./dbase_test_files/data.pickle")
534
535
536 print "\nWorking on file: " + b.DBname
537
538
539 varnm, data = b.get()
540 print "Variable names from dbase class\n", varnm
541 print "\nData from dbase class\n", data
542 print "\nDate series", b.data[b.date_key]
543
544
545 varnm, data = b.get('a','c')
546 print "\nTwo columns selected using variable names\n", varnm, "\n", data
547
548
549 print "\nSaving data and variable names to a different csv file\n", b.save("./dbase_test_files/data_save.csv")
550
551
552 x1 = b.data['a'] * b.data['b']
553 x2 = b.data['a'] * b.data['c']
554 xdict = {'x1':x1,'x2':x2}
555 b.data.update(xdict)
556
557 varnm, data = b.get()
558 print "\nTwo variable names added\n", varnm
559 print "\nTwo columns added\n", data
560
561
562 import copy
563 c = copy.deepcopy(b)
564
565
566 c.delvar('a','x2')
567 varnm, data = c.get()
568 print "\nTwo variable names deleted\n", varnm
569 print "\nTwo columns deleted\n", data
570
571
572 c = copy.deepcopy(b)
573 c.keepvar('a','x2')
574 varnm, data = c.get()
575 print "\nAll but two variable names deleted\n", varnm
576 print "\nAll but Two columns deleted\n", data
577
578
579 sel_rule = b.data['date'] > pylab.datestr2num("8/1/2001")
580
581
582 c = copy.deepcopy(b)
583 c.delobs(sel_rule)
584
585 varnm, data = c.get()
586 print "\nReduced number of observations following the selection rule\n", data
587
588
589 c = copy.deepcopy(b)
590 c.keepobs(sel_rule)
591
592 varnm, data = c.get()
593 print "\nReduced number of observations following the inverse of the selection rule\n", data
594
595
596 x = b.keepvar_copy('a')
597
598
599 x = b.delvar_copy('a')
600
601
602 x = b.keepobs_copy(sel_rule)
603
604
605 x = b.delobs_copy(sel_rule)
606
607
608 b.info()
609
610
611 b.dataplot(file = './dbase_test_files/full_plot.png')
612
613
614 b.add_trend('mytrend')
615
616
617 dummy_rule = b.data['a'] > 0
618 b.add_dummy(dummy_rule,'mydummy')
619
620
621 b.add_seasonal_dummies(52,13)
622
623
624 b.info('b','c','mydummy', sel = sel_rule)
625
626
627 b.dataplot('b','c','mydummy', sel = sel_rule, file = './dbase_test_files/partial_plot.png')