Attachment 'dbase.0.7.py'
Download
Toggle line numbers
1 from __future__ import division
2 from scipy import c_, arange, array, unique, kron, ones, eye, nan, isnan, string_
3 from numpy.random import randn
4 from numpy.core.records import recarray
5 import pylab, cPickle, shelve, csv, copy, os
6
7 class dbase:
8 """
9 Author: Vincent Nijs (+ ?)
10 Email: v-nijs at kellogg.northwestern.edu
11 Last Modified: Thu Jan 18 22:32:56 CST 2007
12
13 Todo:
14 - Check if shelve loading/saving works
15
16 Tested on:
17 - Works on Mac OS X 10.4.8, with full matplotlib (incl. pytz)
18 - Tested on Linux
19
20 Dependencies:
21 - See import statement at the top of this file
22
23 Doc:
24 A simple data-frame, that reads and writes csv/pickle/shelve files with variable names.
25 Data is stored in a dictionary.
26
27 To use the class:
28
29 >>> from dbase import dbase
30 >>> y = dbase('your_filename.csv')
31
32 or for a previously created dbase object stored in a pickle file
33
34 >>> from dbase import dbase
35 >>> y = dbase('your_filename.pickle')
36
37 or without importing the dbase class
38
39 >>> import cPickle
40 >>> f = open('your_filename.pickle','rb')
41 >>> y = cPickle.load(f)
42 >>> data_key = cPickle.load(f)
43 >>> f.close()
44
45 or for a dictionary stored in a shelf file
46
47 >>> from dbase import dbase
48 >>> y = dbase('your_filename.pickle')
49
50 To return a list of variable names and an array of data
51
52 >>> varnm, data = y.get()
53
54 For usage examples of other class methods see the class tests at the bottom of this file. To see the class in action
55 simply run the file using 'python dbase.py'. This will generate some simulated data (data.csv) and save instance data
56 of the class to a pickle file.
57 """
58
59 def __init__(self,fname,var = (),date = ''):
60 """
61 Initializing the dbase class. Loading file fname.
62
63 If you have have a column in your csv file that is a date-string use:
64
65 >>> x = dbase('myfile.csv',date = 0)
66
67 where 0 is the index of the date column
68
69 If you have have an array in your pickle file that is a date variable use:
70
71 >>> x = dbase('myfile.pickle',date = 'date')
72
73 where 'date' is the key of the date array
74 """
75 self.load(fname,var,date)
76
77 def load(self,fname,var,date):
78 """
79 Loading data from a csv or a pickle file of the dbase class.
80 If this is csv file use pylab's load function. Seems much faster
81 than scipy.io.read_array.
82 """
83 # setting the ascii/csv file name used for input
84 self.DBname = os.getcwd() + '/' + fname
85
86 # assuming self.date_key = '' unless otherwise given
87 self.date_key = date
88
89 # getting the file extension
90 fext = self.__ext(fname)
91
92 # opening the file for reading
93 if fext == 'csv':
94 f = open(fname,'r')
95 self.load_csv(f)
96 f.close()
97 elif fext == 'pickle':
98 f = open(fname,'rb')
99 self.load_pickle(f)
100 f.close()
101 elif fext == 'she':
102 self.load_shelve(fname,var)
103 else:
104 raise 'This class only works on csv, pickle, and shelve files'
105
106 # specifying nobs in self.data
107 self.nobs = self.data[self.data.keys()[0]].shape[0]
108
109 def csvconvert(self,col):
110 """
111 Converting data in a string array to the appropriate type
112 """
113 # convert missing values to nan
114 col[col == ''] = 'nan'; col[col == '.'] = 'nan'
115 try:
116 # if a missing value is present int variables will be up-cast to float
117 return col.astype('i')
118 except ValueError:
119 try:
120 return col.astype('f')
121 except ValueError:
122 # if the data is a string, put back the empty string
123 col[col == 'nan'] = ''
124 return col
125
126 def load_csv_nf(self,f):
127 """
128 Loading data from a csv file using the csv module. Return a list of arrays.
129 Possibly with different types and/or missing values.
130 """
131 # resetting to the beginning of the file since pylab.load was already tried
132 f.seek(0)
133
134 reader = csv.reader(f)
135
136 # putting the data in an array of strings
137 datalist = array([i for i in reader])
138
139 # converting the data to an appropriate type
140 datalist = [self.csvconvert(datalist[1:,i]) for i in range(datalist.shape[1])]
141
142 return datalist
143
144 def load_csv(self,f):
145 """
146 Loading data from a csv file. Uses pylab's load function. Seems much faster
147 than scipy.io.read_array.
148 """
149 varnm = f.readline().split(',')
150
151 # what is the date variable's key if any, based on index passed as argument
152 if self.date_key != '':
153 try:
154 rawdata = pylab.load(f, delimiter=',',converters={self.date_key:pylab.datestr2num}) # don't need to 'skiprows' here
155 except ValueError: # if loading via pylab doesn't work use csv
156 rawdata = self.load_csv_nf(f)
157
158 # converting the dates column to a date-number
159 rawdata[self.date_key] = pylab.datestr2num(rawdata[self.date_key])
160
161 self.date_key = varnm[self.date_key]
162 else:
163 try:
164 rawdata = pylab.load(f, delimiter=',') # don't need to 'skiprow' here
165 except ValueError: # if loading via pylab doesn't work use csv
166 rawdata = self.load_csv_nf(f)
167
168 # making sure that the variable names contain no leading or trailing spaces
169 varnm = [i.strip() for i in varnm]
170
171 # transforming the data into a dictionary
172 if type(rawdata) == list:
173 # if the csv module was used
174 self.data = dict(zip(varnm,rawdata))
175 else:
176 # if the pylab.load module was used
177 self.data = dict(zip(varnm,rawdata.T))
178
179 def load_pickle(self,f):
180 """
181 Loading data from a created earlier using the the dbase class.
182 """
183 self.data = cPickle.load(f) # loading the data dictionary
184
185 # what is the date variable's key if any
186 if self.date_key == '':
187 try:
188 self.date_key = cPickle.load(f) # if nothing given assume it is in the pickle file
189 except:
190 print "No date series in pickle file"
191
192 def load_shelve(self,fname,var):
193 """
194 Loading data from a created earlier using the the dbase class.
195 """
196 data = shelve.open(fname) # loading the data dictionary
197
198 # find out if a variable list is provided
199 if var == ():
200 var = data.keys()
201
202 # making sure the date variable is fetched from shelve
203 if self.date_key != '':
204 if not self.date_key in var: var = var + list(self.date_key)
205
206 self.data = dict([(i,data[i]) for i in var])
207 data.close()
208
209 def save(self,fname):
210 """
211 Dumping the class data dictionary into a csv or pickle file
212 """
213 fext = self.__ext(fname)
214 if fext == 'csv':
215 f = open(fname,'w')
216 self.save_csv(f)
217 f.close()
218 elif fext == 'pickle':
219 f = open(fname,'wb')
220 self.save_pickle(f)
221 f.close()
222 elif fext == 'she':
223 self.save_shelve(fname)
224 else:
225 raise 'This class only works on csv, pickle, and shelve files'
226
227 def save_csv(self,f):
228 """
229 Dumping the class data dictionary into a csv file
230 """
231 writer = csv.writer(f)
232 writer.writerow(self.data.keys())
233
234 data = self.data # a reference to the data dict
235 if self.date_key != []:
236 data = dict(data) # making a copy so the dates can be changed to strings
237 dates = pylab.num2date(data[self.date_key])
238 dates = array([i.strftime('%d %b %y') for i in dates])
239 data[self.date_key] = dates
240
241 writer.writerows(array(data.values()).T)
242
243 def save_pickle(self,f):
244 """
245 Dumping the class data dictionary and date_key into a binary pickle file
246 """
247 cPickle.dump(self.data,f,2)
248 cPickle.dump(self.date_key,f,2)
249
250 def save_shelve(self,fname):
251 """
252 Dumping the class data dictionary into a shelve file
253 """
254 f = shelve.open('data.she','c')
255 f = self.data
256 f.close()
257
258 def add_trend(self,tname = 'trend'):
259 # making a trend based on nobs in arbitrary series in dictionary
260 self.data[tname] = arange(self.nobs)
261
262 def add_dummy(self,dum, dname = 'dummy'):
263 if self.data.has_key(dname):
264 print "The variable name '" + str(dname) + "' already exists. Please select another name."
265 else:
266 self.data[dname] = dum
267
268 def add_seasonal_dummies(self,freq=52,ndum=13):
269 """
270 This function will only work if the freq and ndum 'fit. That is,
271 weeks and 4-weekly periods will work. Weeks and months/quarters
272 will not.
273 """
274 if self.date_key == []:
275 print "Cannot create seasonal dummies since no date array is known"
276 else:
277 # list of years
278 years = array([pylab.num2date(i).year for i in self.data[self.date_key]])
279
280 # how many periods in does the data start
281 start = freq - sum(years == min(years))
282
283 # how many unique years
284 nyear = unique(years).shape[0]
285
286 # using kronecker products to make a big dummy matrix
287 sd = kron(ones(nyear),kron(eye(ndum),ones(freq/ndum))).T;
288 sd = sd[start:start+self.nobs] # slicing the dummies to fit the data
289 sd = dict([(("sd"+str(i+1)),sd[:,i]) for i in range(1,ndum)])
290 self.data.update(sd) # adding the dummies to the main dict
291
292 def delvar(self,*var):
293 """
294 Deleting specified variables in the data dictionary, changing dictionary in place
295 """
296 [self.data.pop(i) for i in var]
297
298 def keepvar(self,*var):
299 """
300 Keeping specified variables in the data dictionary, changing dictionary in place
301 """
302 [self.data.pop(i) for i in self.data.keys() if i not in var]
303
304 def delvar_copy(self,*var):
305 """
306 Deleting specified variables in the data dictionary, making a copy
307 """
308 return dict([(i,self.data[i]) for i in self.data.keys() if i not in var])
309
310 def keepvar_copy(self,*var):
311 """
312 Keeping specified variables in the data dictionary, making a copy
313 """
314 return dict([(i,self.data[i]) for i in var])
315
316 def delobs(self,sel):
317 """
318 Deleting specified observations, changing dictionary in place
319 """
320 for i in self.data.keys(): self.data[i] = self.data[i][sel]
321
322 # updating the value of self.nobs
323 self.nobs -= sum(sel)
324
325 def keepobs(self,sel):
326 """
327 Keeping specified observations, changing dictionary in place
328 """
329 # updating the value of self.nobs
330 self.nobs -= sum(sel)
331
332 sel -= 1 # making true, false and vice-versa
333 self.delobs(sel)
334
335 def delobs_copy(self,sel):
336 """
337 Deleting specified observations, making a copy
338 """
339 return dict([(i,self.data[i][sel]) for i in self.data.keys()])
340
341 def keepobs_copy(self,sel):
342 """
343 Keeping specified observations, making a copy
344 """
345 sel -= 1 # making true, false and vice-versa
346 self.delobs_copy(sel)
347
348 def get(self,*var,**sel):
349 """
350 Copying data and keys of selected variables for further analysis
351 """
352 # calling convenience function to clean-up input parameters
353 var, sel = self.__var_and_sel_clean(var, sel)
354
355 # copying the entire dictionary (= default)
356 d = dict((i,self.data[i][sel]) for i in var)
357
358 return d.keys(), array(d.values()).T
359
360 def info(self,*var, **adict):
361 """
362 Printing descriptive statistics on selected variables
363 """
364
365 # calling convenience functions to clean-up input parameters
366 var, sel = self.__var_and_sel_clean(var, adict)
367 dates, nobs = self.__dates_and_nobs_clean(var, sel)
368
369 # setting the minimum and maximum dates to be used
370 mindate = pylab.num2date(min(dates)).strftime('%d %b %Y')
371 maxdate = pylab.num2date(max(dates)).strftime('%d %b %Y')
372
373 # number of variables (excluding date if present)
374 nvar = len(var)
375
376 print '\n=============================================================================='
377 print '============================ Database information ============================'
378 print '==============================================================================\n'
379
380 print 'file: %s' % self.DBname
381 print '# obs: %s' % nobs
382 print '# variables: %s' % nvar
383 print 'Start date: %s' % mindate
384 print 'End date: %s' % maxdate
385
386 print '\nvar min max mean std.dev miss levels'
387 print '=============================================================================='
388
389 sets = {}
390 for i in var:
391 col = self.data[i][sel];
392 if type(col[0]) == string_:
393 _miss = sum(col == '')
394 col_set = set(col)
395 sets[i] = col_set
396 print '''%-5s %-5s %-5s %-5s %-5s % -5.0f %-5i''' % tuple([i,'-','-','-','-',_miss,len(col_set)])
397 else:
398 _miss = isnan(col); col = col[_miss == False]; _min = col.min(); _max = col.max(); _mean = col.mean(); _std = col.std()
399 print '''% -5s % -5.2f % -5.2f % -5.2f % -5.2f % -5.0f''' % tuple([i,_min,_max,_mean,_std,sum(_miss)])
400
401 if sets:
402 print '\n\nLevels for non-numeric data:'
403 for i in sets.keys():
404 print '=============================================================================='
405 print '''% -5s % -5s''' % tuple([i,sets[i]])
406
407 def dataplot(self,*var, **adict):
408 """
409 Plotting the data with variable names
410 """
411 # calling convenience functions to clean-up input parameters
412 var, sel = self.__var_and_sel_clean(var, adict)
413 dates, nobs = self.__dates_and_nobs_clean(var, sel)
414
415 # don't try to plot non-numerical variables
416 nvar = []
417 for i in var:
418 col = self.data[i][sel]
419 if type(col[0]) != string_:
420 pylab.plot_date(dates,self.data[i][sel],'o-')
421 nvar = nvar + [i]
422
423 pylab.xlabel("Time (n = " + str(nobs) + ")")
424 pylab.title("Data plot of " + self.DBname)
425 pylab.legend(nvar)
426 if adict.has_key('file'):
427 pylab.savefig(adict['file'],dpi=600)
428 pylab.show()
429
430 def to_recarray(self):
431 """
432 Transform the data dictionary to a recarray or a masked array if missing values
433 are present. Based on input from Pierre GM.
434 """
435 # First, create the descriptor:
436 descr = [(k,self.data[k].dtype) for k in b.data.keys()]
437 # Then, create an empty record array:
438 ra = recarray((self.nobs,), dtype=descr)
439 # Then fill the recarray:
440 [setattr(ra,n,v) for (n,v) in self.data.iteritems()]
441
442 return ra
443
444 def __var_and_sel_clean(self, var, sel, dates_needed = True):
445 """
446 Convenience function to avoid code duplication
447 """
448 # find out if a variable list is provided
449 if var == ():
450 var = self.data.keys()
451
452 # removing the date variable if it is present
453 var = [x for x in var if x != self.date_key]
454
455 # report variable label in alphabetical order
456 var.sort()
457
458 # find out if a selection rule is being used
459 # if not, set to empty tuple
460 if not sel.has_key('sel'):
461 sel = ()
462 else:
463 sel = sel['sel']
464
465 return var, sel
466
467 def __dates_and_nobs_clean(self, var, sel):
468 """
469 Convenience function to avoid code duplication
470 """
471 nobs = self.nobs
472 if len(sel):
473 nobs = nobs - (nobs - sum(sel))
474
475 if self.date_key != None and self.data.has_key(self.date_key):
476 # selecting dates from data base
477 dates = self.data[self.date_key][sel]
478 else:
479 # setting date series to start on 1/1/1950
480 dates = range(711858,nobs+711858)
481
482 return dates, nobs
483
484 def __ext(self,fname):
485 """
486 Finding the file extension of the filename passed to dbase
487 """
488 return fname.split('.')[-1].strip()
489
490 if __name__ == '__main__':
491
492 ###################################
493 ### usage examples of dbase class
494 ###################################
495
496 import sys
497 from scipy import c_
498
499 # making a directory to store simulate data
500 if not os.path.exists('./dbase_test_files'): os.mkdir('./dbase_test_files')
501
502 # creating simulated data and variable labels
503 varnm = ['date','a','b','c'] # variable labels
504 nobs = 100
505 data = randn(nobs,3) # the data array
506 dates = pylab.num2date(arange(730493,730493+(nobs*7),7))
507 dates = [i.strftime('%d %b %y') for i in dates]
508 data = c_[dates,data]
509
510 # adding a few missing values
511 data[5,1] = ''
512 data[9,3] = ''
513
514 # adding a non-numeric variable
515 varnm = varnm + ['id'] # variable labels
516 id = [('id'+str(i)) for i in range(nobs)]
517 id[8] = '' # setting one id to missing
518 data = c_[data,id]
519
520 # saving simulated data to a csv file
521 f = open('./dbase_test_files/data.csv','w')
522 writer = csv.writer(f)
523 writer.writerow(varnm)
524 writer.writerows(data)
525 f.close()
526
527 # loading the data from the csv file
528 a = dbase("./dbase_test_files/data.csv",date = 0)
529
530 # saving the dbase instance data to a pickle file
531 a.save("./dbase_test_files/data.pickle")
532 # saving the dbase data to a shelve file
533 ### a.save("./dbase_test_files/data.she")
534
535 # loading a sub-section of the data from a shelve file
536 ### print "\nLoading 2 variables from a shelve file\n"
537 ### b = dbase("./dbase_test_files/data.she",'a','b',date = 'date')
538
539 # showing data and variable names, from load_shelve
540 ### varnm, data = b.get()
541 ### print "Variable names from shelve file\n", varnm
542 ### print "\nData selected from shelve file\n", data
543 ### print "\nDate series", b.data[b.date_key]
544 ### del b # cleaning up
545
546 # loading the object from the pickle file
547 print "\nLoading the dbase object from a pickle file\n"
548 b = dbase("./dbase_test_files/data.pickle")
549
550 # transforming the data dictionary to a rec-array
551 ra = b.to_recarray()
552
553 print '\nLables (+ type) in newly created recarray', ra.dtype
554 print '\nData for column "a"', ra.a
555 print '\nData for first 10 rows, for column "a"', ra['a'][:10]
556
557 # getting the name of the file you are working on
558 print "\nWorking on file: " + b.DBname
559
560 # showing data and variable names
561 varnm, data = b.get()
562 print "Variable names from dbase class\n", varnm
563 print "\nData from dbase class\n", data
564 print "\nDate series", b.data[b.date_key]
565
566 # viewing selected data columns
567 varnm, data = b.get('a','c')
568 print "\nTwo columns selected using variable names\n", varnm, "\n", data
569
570 # saving to a csv file
571 print "\nSaving data and variable names to a different csv file\n", b.save("./dbase_test_files/data_save.csv")
572
573 # adding variables/data
574 x1 = b.data['a'] * b.data['b']
575 x2 = b.data['a'] * b.data['c']
576 xdict = {'x1':x1,'x2':x2}
577 b.data.update(xdict) # using a dictionaries own 'add/extend method'
578
579 varnm, data = b.get()
580 print "\nTwo variable names added\n", varnm
581 print "\nTwo columns added\n", data
582
583 # using copy.deepcopy to make a complete copy of the class instance data
584 import copy
585 c = copy.deepcopy(b)
586
587 # making the database smaller, inplace, by deleting selected variables
588 c.delvar('a','x2')
589 varnm, data = c.get()
590 print "\nTwo variable names deleted\n", varnm
591 print "\nTwo columns deleted\n", data
592
593 # making the database smaller, inplace, by keeping only selected variables
594 c = copy.deepcopy(b)
595 c.keepvar('a','x2')
596 varnm, data = c.get()
597 print "\nAll but two variable names deleted\n", varnm
598 print "\nAll but Two columns deleted\n", data
599
600 # specifying a selection rule
601 sel_rule = b.data['date'] > pylab.datestr2num("8/1/2001")
602
603 # making the database smaller, inplace, by delecting selected observation
604 c = copy.deepcopy(b)
605 c.delobs(sel_rule)
606
607 varnm, data = c.get()
608 print "\nReduced number of observations following the selection rule\n", data
609
610 # making the database smaller, inplace, by delecting all but the selected observation
611 c = copy.deepcopy(b)
612 c.keepobs(sel_rule)
613
614 varnm, data = c.get()
615 print "\nReduced number of observations following the inverse of the selection rule\n", data
616
617 # making a copy of of just the dictionary for selected variables
618 x = b.keepvar_copy('a')
619
620 # making a copy of of just the dictionary for everything but the selected variables
621 x = b.delvar_copy('a')
622
623 # making a copy of of just the dictionary for selected observations
624 x = b.keepobs_copy(sel_rule)
625
626 # making a copy of of just the dictionary for everything but the selected observation
627 x = b.delobs_copy(sel_rule)
628
629 # descriptive information on the database
630 b.info()
631
632 # plotting series
633 b.dataplot(file = './dbase_test_files/full_plot.png')
634
635 # adding a trend component
636 b.add_trend('mytrend') # or b.data.update({'mytrend':range(100)})
637
638 # adding a dummy
639 dummy_rule = b.data['a'] > 0
640 b.add_dummy(dummy_rule,'mydummy') # or b.data.update({'mydummy':dummy_rule})
641
642 # add seasonal dummies, specify data frequency and # of dummies
643 b.add_seasonal_dummies(52,13)
644
645 # descriptive information on the database for selected variables and time periods
646 b.info('b','c','mydummy', sel = sel_rule)
647
648 # plotting series for selected variables and selected data periods
649 b.dataplot('b','c','mydummy', sel = sel_rule, file = './dbase_test_files/partial_plot.png')
New Attachment
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.- [get | view] (2007-01-14 23:35:05, 4.5 KB) [[attachment:data.0.3.csv]]
- [get | view] (2007-01-07 18:13:31, 4.0 KB) [[attachment:data.csv]]
- [get | view] (2007-01-07 18:00:31, 16.1 KB) [[attachment:dbase.0.1.py]]
- [get | view] (2007-01-14 22:55:13, 5.6 KB) [[attachment:dbase.0.2.py]]
- [get | view] (2007-01-14 23:31:35, 18.1 KB) [[attachment:dbase.0.3.py]]
- [get | view] (2007-01-14 23:51:16, 18.4 KB) [[attachment:dbase.0.4.py]]
- [get | view] (2007-01-14 23:54:23, 18.4 KB) [[attachment:dbase.0.5.py]]
- [get | view] (2007-01-15 06:33:06, 18.4 KB) [[attachment:dbase.0.6.py]]
- [get | view] (2007-01-19 05:53:14, 19.1 KB) [[attachment:dbase.0.7.py]]
- [get | view] (2007-01-07 07:50:10, 15.8 KB) [[attachment:dbase.py]]
- [get | view] (2007-01-07 07:52:21, 7.6 KB) [[attachment:dbase.pydoc]]
- [get | view] (2007-01-07 18:01:44, 7.9 KB) [[attachment:dbase_pydoc.0.1.txt]]
- [get | view] (2007-01-14 22:56:04, 8.2 KB) [[attachment:dbase_pydoc.0.2.txt]]
- [get | view] (2007-01-07 18:02:50, 27.7 KB) [[attachment:ex_plot.0.1.png]]
- [get | view] (2007-01-07 08:01:04, 87.7 KB) [[attachment:ex_plot.png]]
- [get | view] (2007-01-07 08:03:54, 28.2 KB) [[attachment:ex_plot1.png]]
- [get | view] (2007-01-07 07:57:21, 895.9 KB) [[attachment:example_plot.png]]
- [get | view] (2007-01-07 07:51:51, 34.3 KB) [[attachment:pydoc]]