1 from scipy import c_, arange, array, unique, kron, ones, eye
2 from numpy.random import randn
3 from __future__ import division
4 import pylab, cPickle, shelve, csv, copy, os
5
6 class dbase:
7 """
8 TODO:
9 - Check if shelve loading/saving works
10 - Only tested on Mac OS X 10.4.8, with pytz
11
12 DOC:
13 A simple data-frame, that reads and writes csv/pickle/shelve files with variable names.
14 Data is stored in a dictionary.
15
16 To use the class:
17
18 >>> from dbase import dbase
19 >>> y = dbase('your_filename.csv')
20
21 or for a previously created dbase object stored in a pickle file
22
23 >>> from dbase import dbase
24 >>> y = dbase('your_filename.pickle')
25
26 or without importing the dbase class
27
28 >>> import cPickle
29 >>> f = open('your_filename.pickle','rb')
30 >>> y = cPickle.load(f)
31 >>> data_key = cPickle.load(f)
32 >>> f.close()
33
34 or for a dictionary stored in a shelf file
35
36 >>> from dbase import dbase
37 >>> y = dbase('your_filename.pickle')
38
39 To return a list of variable names and an array of data
40
41 >>> varnm, data = y.get()
42
43 For usage examples of other class methods see the class tests at the bottom of this file. To see the class in action
44 simply run the file using 'python dbase.py'. This will generate some simulated data (data.csv) and save instance data
45 of the class to a pickle file.
46 """
47
48 def __init__(self,fname,*var,**date):
49 """
50 Initializing the dbase class. Loading file fname.
51
52 If you have have a column in your csv file that is a date-string use:
53
54 >>> x = dbase('myfile.csv',date = 0)
55
56 where 0 is the index of the date column
57
58 If you have have an array in your pickle file that is a date variable use:
59
60 >>> x = dbase('myfile.pickle',date = 'date')
61
62 where 'date' is the key of the date array
63 """
64 self.load(fname,var,date)
65
66 def load(self,fname,var,date):
67 """
68 Loading data from a csv or a pickle file of the dbase class.
69 If this is csv file use pylab's load function. Seems much faster
70 than scipy.io.read_array.
71 """
72
73 self.DBname = os.getcwd() + '/' + fname
74
75
76 self.date_key = date.values()
77
78
79 fext = self.__ext(fname)
80
81
82 if fext == 'csv':
83 f = open(fname,'r')
84 self.load_csv(f)
85 f.close()
86 elif fext == 'pickle':
87 f = open(fname,'rb')
88 self.load_pickle(f)
89 f.close()
90 elif fext == 'she':
91 self.load_shelve(fname,var)
92 else:
93 raise 'This class only works on csv, pickle, and shelve files'
94
95
96 self.nobs = self.data[self.data.keys()[0]].shape[0]
97
98 def load_csv(self,f):
99 """
100 Loading data from a csv file. Uses pylab's load function. Seems much faster
101 than scipy.io.read_array.
102 """
103 varnm = f.readline().split(',')
104
105
106 if self.date_key != []:
107 rawdata = pylab.load(f, delimiter=',',converters={self.date_key[0]:pylab.datestr2num})
108 self.date_key = varnm[self.date_key[0]]
109 else:
110 rawdata = pylab.load(f, delimiter=',')
111
112
113 varnm = [i.strip() for i in varnm]
114
115
116 self.data = dict(zip(varnm,rawdata.T))
117
118 def load_pickle(self,f):
119 """
120 Loading data from a created earlier using the the dbase class.
121 """
122 self.data = cPickle.load(f)
123
124
125 if self.date_key == []:
126 try:
127 self.date_key = cPickle.load(f)
128 except:
129 print "No date series in pickle file"
130 else:
131 self.date_key = self.date_key[0]
132
133 def load_shelve(self,fname,var):
134 """
135 Loading data from a created earlier using the the dbase class.
136 """
137 data = shelve.open(fname)
138
139
140 if var == ():
141 var = data.keys()
142
143
144 if self.date_key != []:
145 if not self.date_key[0] in var: var = var + self.date_key
146 self.date_key = self.date_key[0]
147
148 self.data = dict([(i,data[i]) for i in var])
149 data.close()
150
151 def save(self,fname):
152 """
153 Dumping the class data dictionary into a csv or pickle file
154 """
155 fext = self.__ext(fname)
156 if fext == 'csv':
157 f = open(fname,'w')
158 self.save_csv(f)
159 f.close()
160 elif fext == 'pickle':
161 f = open(fname,'wb')
162 self.save_pickle(f)
163 f.close()
164 elif fext == 'she':
165 self.save_shelve(fname)
166 else:
167 raise 'This class only works on csv, pickle, and shelve files'
168
169 def save_csv(self,f):
170 """
171 Dumping the class data dictionary into a csv file
172 """
173 writer = csv.writer(f)
174 writer.writerow(self.data.keys())
175
176 data = self.data
177 if self.date_key != []:
178 data = dict(data)
179 dates = pylab.num2date(data[self.date_key])
180 dates = array([i.strftime('%d %b %y') for i in dates])
181 data[self.date_key] = dates
182
183 writer.writerows(array(data.values()).T)
184
185 def save_pickle(self,f):
186 """
187 Dumping the class data dictionary and date_key into a binary pickle file
188 """
189 cPickle.dump(self.data,f,2)
190 cPickle.dump(self.date_key,f,2)
191
192 def save_shelve(self,fname):
193 """
194 Dumping the class data dictionary into a shelve file
195 """
196 f = shelve.open('data.she','c')
197 f = self.data
198 f.close()
199
200 def add_trend(self,tname = 'trend'):
201
202 self.data[tname] = arange(self.nobs)
203
204 def add_dummy(self,dum, dname = 'dummy'):
205 if self.data.has_key(dname):
206 print "The variable name '" + str(dname) + "' already exists. Please select another name."
207 else:
208 self.data[dname] = dum
209
210 def add_seasonal_dummies(self,freq=52,ndum=13):
211 """
212 This function will only work if the freq and ndum 'fit. That is,
213 weeks and 4-weekly periods will work. Weeks and months/quarters
214 will not.
215 """
216 if self.date_key == []:
217 print "Cannot create seasonal dummies since no date array is known"
218 else:
219
220 years = array([pylab.num2date(i).year for i in self.data[self.date_key]])
221
222
223 start = freq - sum(years == min(years))
224
225
226 nyear = unique(years).shape[0]
227
228
229 sd = kron(ones(nyear),kron(eye(ndum),ones(freq/ndum))).T;
230 sd = sd[start:start+self.nobs]
231 sd = dict([(("sd"+str(i+1)),sd[:,i]) for i in range(1,ndum)])
232 self.data.update(sd)
233
234 def delvar(self,*var):
235 """
236 Deleting specified variables in the data dictionary, changing dictionary in place
237 """
238 [self.data.pop(i) for i in var]
239
240 def keepvar(self,*var):
241 """
242 Keeping specified variables in the data dictionary, changing dictionary in place
243 """
244 [self.data.pop(i) for i in self.data.keys() if i not in var]
245
246 def delvar_copy(self,*var):
247 """
248 Deleting specified variables in the data dictionary, making a copy
249 """
250 return dict([(i,self.data[i]) for i in self.data.keys() if i not in var])
251
252 def keepvar_copy(self,*var):
253 """
254 Keeping specified variables in the data dictionary, making a copy
255 """
256 return dict([(i,self.data[i]) for i in var])
257
258 def delobs(self,sel):
259 """
260 Deleting specified observations, changing dictionary in place
261 """
262 for i in self.data.keys(): self.data[i] = self.data[i][sel]
263
264
265 self.nobs -= sum(sel)
266
267 def keepobs(self,sel):
268 """
269 Keeping specified observations, changing dictionary in place
270 """
271
272 self.nobs -= sum(sel)
273
274 sel -= 1
275 self.delobs(sel)
276
277 def delobs_copy(self,sel):
278 """
279 Deleting specified observations, making a copy
280 """
281 return dict([(i,self.data[i][sel]) for i in self.data.keys()])
282
283 def keepobs_copy(self,sel):
284 """
285 Keeping specified observations, making a copy
286 """
287 sel -= 1
288 self.delobs_copy(sel)
289
290 def get(self,*var,**sel):
291 """
292 Copying data and keys of selected variables for further analysis
293 """
294
295 var, sel = self.__var_and_sel_clean(var, sel)
296
297
298 d = dict((i,self.data[i][sel]) for i in var)
299
300 return d.keys(), array(d.values()).T
301
302 def info(self,*var, **adict):
303 """
304 Printing descriptive statistics on selected variables
305 """
306
307
308 var, sel = self.__var_and_sel_clean(var, adict)
309 dates, nobs = self.__dates_and_nobs_clean(var, sel)
310
311
312 mindate = pylab.num2date(min(dates)).strftime('%d %b %Y')
313 maxdate = pylab.num2date(max(dates)).strftime('%d %b %Y')
314
315
316 nvar = len(var)
317
318 print '\n=============================================================='
319 print '==================== Database information ===================='
320 print '==============================================================\n'
321
322 print 'file: %s' % b.DBname
323 print '# obs: %s' % nobs
324 print '# variables: %s' % nvar
325 print "Start date: %s" % mindate
326 print "End date: %s" % maxdate
327
328 print '\nvar min max mean std.dev'
329 print '=============================================================='
330
331 for i in var:
332 _min = self.data[i][sel].min(); _max = self.data[i][sel].max(); _mean = self.data[i][sel].mean(); _std = self.data[i][sel].std()
333 print '''%-5s %-5.2f %-5.2f %-5.2f %-5.2f''' % tuple([i,_min,_max,_mean,_std])
334
335 def dataplot(self,*var, **adict):
336 """
337 Plotting the data with variable names
338 """
339
340 var, sel = self.__var_and_sel_clean(var, adict)
341 dates, nobs = self.__dates_and_nobs_clean(var, sel)
342
343 for i in var:
344 pylab.plot_date(dates,self.data[i][sel],'o-')
345
346 pylab.xlabel("Time (n = " + str(nobs) + ")")
347 pylab.title("Data plot of " + self.DBname)
348 pylab.legend(var)
349 if adict.has_key('file'):
350 pylab.savefig(adict['file'],dpi=600)
351 pylab.show()
352
353 def __var_and_sel_clean(self, var, sel, dates_needed = True):
354 """
355 Convenience function to avoid code duplication
356 """
357
358 if var == ():
359 var = self.data.keys()
360
361
362 var = [x for x in var if x != self.date_key]
363
364
365 var.sort()
366
367
368
369 if not sel.has_key('sel'):
370 sel = ()
371 else:
372 sel = sel['sel']
373
374 return var, sel
375
376 def __dates_and_nobs_clean(self, var, sel):
377 """
378 Convenience function to avoid code duplication
379 """
380 nobs = self.nobs
381 if len(sel):
382 nobs = nobs - (nobs - sum(sel))
383
384 if self.date_key != None and self.data.has_key(self.date_key):
385
386 dates = self.data[self.date_key][sel]
387 else:
388
389 dates = range(711858,nobs+711858)
390
391 return dates, nobs
392
393 def __ext(self,fname):
394 """
395 Finding the file extension of the filename passed to dbase
396 """
397 return fname.split('.')[-1].strip()
398
399 if __name__ == '__main__':
400
401
402
403
404
405 import sys
406 from scipy import c_
407
408
409 if not os.path.exists('./dbase_test_files'): os.mkdir('./dbase_test_files')
410
411
412 varnm = ['date','a','b','c']
413 nobs = 100
414 data = randn(nobs,3)
415 dates = pylab.num2date(arange(730493,730493+(nobs*7),7))
416 dates = [i.strftime('%d %b %y') for i in dates]
417 data = c_[dates,data]
418
419
420 f = open('./dbase_test_files/data.csv','w')
421 writer = csv.writer(f)
422 writer.writerow(varnm)
423 writer.writerows(data)
424 f.close()
425
426
427 a = dbase("./dbase_test_files/data.csv",date = 0)
428
429 a.save("./dbase_test_files/data.pickle")
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445 print "\nLoading the dbase object from a pickle file\n"
446 b = dbase("./dbase_test_files/data.pickle")
447
448
449 print "\nWorking on file: " + b.DBname
450
451
452 varnm, data = b.get()
453 print "Variable names from dbase class\n", varnm
454 print "\nData from dbase class\n", data
455 print "\nDate series", b.data[b.date_key]
456
457
458 varnm, data = b.get('a','c')
459 print "\nTwo columns selected using variable names\n", varnm, "\n", data
460
461
462 print "\nSaving data and variable names to a different csv file\n", b.save("./dbase_test_files/data_save.csv")
463
464
465 x1 = b.data['a'] * b.data['b']
466 x2 = b.data['a'] * b.data['c']
467 xdict = {'x1':x1,'x2':x2}
468 b.data.update(xdict)
469
470 varnm, data = b.get()
471 print "\nTwo variable names added\n", varnm
472 print "\nTwo columns added\n", data
473
474
475 import copy
476 c = copy.deepcopy(b)
477
478
479 c.delvar('a','x2')
480 varnm, data = c.get()
481 print "\nTwo variable names deleted\n", varnm
482 print "\nTwo columns deleted\n", data
483
484
485 c = copy.deepcopy(b)
486 c.keepvar('a','x2')
487 varnm, data = c.get()
488 print "\nAll but two variable names deleted\n", varnm
489 print "\nAll but Two columns deleted\n", data
490
491
492 sel_rule = b.data['date'] > pylab.datestr2num("8/1/2001")
493
494
495 c = copy.deepcopy(b)
496 c.delobs(sel_rule)
497
498 varnm, data = c.get()
499 print "\nReduced number of observations following the selection rule\n", data
500
501
502 c = copy.deepcopy(b)
503 c.keepobs(sel_rule)
504
505 varnm, data = c.get()
506 print "\nReduced number of observations following the inverse of the selection rule\n", data
507
508
509 x = b.keepvar_copy('a')
510
511
512 x = b.delvar_copy('a')
513
514
515 x = b.keepobs_copy(sel_rule)
516
517
518 x = b.delobs_copy(sel_rule)
519
520
521 b.info()
522
523
524 b.dataplot(file = './dbase_test_files/full_plot.png')
525
526
527 b.add_trend('mytrend')
528
529
530 dummy_rule = b.data['a'] > 0
531 b.add_dummy(dummy_rule,'mydummy')
532
533
534 b.add_seasonal_dummies(52,13)
535
536
537 b.info('b','c', sel = sel_rule)
538
539
540 b.dataplot('b','c', sel = sel_rule, file = './dbase_test_files/partial_plot.png')