Attachment 'dbase.0.6.py'
Download
Toggle line numbers
1 from __future__ import division
2 from scipy import c_, arange, array, unique, kron, ones, eye, nan, isnan, string_
3 from numpy.random import randn
4 import pylab, cPickle, shelve, csv, copy, os
5
6 class dbase:
7 """
8 Author: Vincent Nijs (+ ?)
9 Email: v-nijs at kellogg.northwestern.edu
10 Last Modified: Sun Jan 7 10:05:12 CST 2007
11
12 Todo:
13 - Check if shelve loading/saving works
14
15 Tested on:
16 - Works on Mac OS X 10.4.8, with full matplotlib (incl. pytz)
17 - Tested on Linux
18
19 Dependencies:
20 - See import statement at the top of this file
21
22 Doc:
23 A simple data-frame, that reads and writes csv/pickle/shelve files with variable names.
24 Data is stored in a dictionary.
25
26 To use the class:
27
28 >>> from dbase import dbase
29 >>> y = dbase('your_filename.csv')
30
31 or for a previously created dbase object stored in a pickle file
32
33 >>> from dbase import dbase
34 >>> y = dbase('your_filename.pickle')
35
36 or without importing the dbase class
37
38 >>> import cPickle
39 >>> f = open('your_filename.pickle','rb')
40 >>> y = cPickle.load(f)
41 >>> data_key = cPickle.load(f)
42 >>> f.close()
43
44 or for a dictionary stored in a shelf file
45
46 >>> from dbase import dbase
47 >>> y = dbase('your_filename.pickle')
48
49 To return a list of variable names and an array of data
50
51 >>> varnm, data = y.get()
52
53 For usage examples of other class methods see the class tests at the bottom of this file. To see the class in action
54 simply run the file using 'python dbase.py'. This will generate some simulated data (data.csv) and save instance data
55 of the class to a pickle file.
56 """
57
58 def __init__(self,fname,var = (),date = ''):
59 """
60 Initializing the dbase class. Loading file fname.
61
62 If you have have a column in your csv file that is a date-string use:
63
64 >>> x = dbase('myfile.csv',date = 0)
65
66 where 0 is the index of the date column
67
68 If you have have an array in your pickle file that is a date variable use:
69
70 >>> x = dbase('myfile.pickle',date = 'date')
71
72 where 'date' is the key of the date array
73 """
74 self.load(fname,var,date)
75
76 def load(self,fname,var,date):
77 """
78 Loading data from a csv or a pickle file of the dbase class.
79 If this is csv file use pylab's load function. Seems much faster
80 than scipy.io.read_array.
81 """
82 # setting the ascii/csv file name used for input
83 self.DBname = os.getcwd() + '/' + fname
84
85 # assuming self.date_key = '' unless otherwise given
86 self.date_key = date
87
88 # getting the file extension
89 fext = self.__ext(fname)
90
91 # opening the file for reading
92 if fext == 'csv':
93 f = open(fname,'r')
94 self.load_csv(f)
95 f.close()
96 elif fext == 'pickle':
97 f = open(fname,'rb')
98 self.load_pickle(f)
99 f.close()
100 elif fext == 'she':
101 self.load_shelve(fname,var)
102 else:
103 raise 'This class only works on csv, pickle, and shelve files'
104
105 # specifying nobs in self.data
106 self.nobs = self.data[self.data.keys()[0]].shape[0]
107
108 def csvconvert(self,col):
109 """
110 Converting data in a string array to the appropriate type
111 """
112 # convert missing values to nan
113 col[col == ''] = 'nan'; col[col == '.'] = 'nan'
114 try:
115 # if a missing value is present int variables will be up-cast to float
116 return col.astype('i')
117 except ValueError:
118 try:
119 return col.astype('f')
120 except ValueError:
121 # if the data is a string, put back the empty string
122 col[col == 'nan'] = ''
123 return col
124
125 def load_csv_nf(self,f):
126 """
127 Loading data from a csv file using the csv module. Return a list of arrays.
128 Possibly with different types and/or missing values.
129 """
130 # resetting to the beginning of the file since pylab.load was already tried
131 f.seek(0)
132
133 reader = csv.reader(f)
134
135 # putting the data in an array of strings
136 datalist = array([i for i in reader])
137
138 # converting the data to an appropriate type
139 datalist = [self.csvconvert(datalist[1:,i]) for i in range(datalist.shape[1])]
140
141 return datalist
142
143 def load_csv(self,f):
144 """
145 Loading data from a csv file. Uses pylab's load function. Seems much faster
146 than scipy.io.read_array.
147 """
148 varnm = f.readline().split(',')
149
150 # what is the date variable's key if any, based on index passed as argument
151 if self.date_key != '':
152 try:
153 rawdata = pylab.load(f, delimiter=',',converters={self.date_key:pylab.datestr2num}) # don't need to 'skiprow' here
154 except ValueError: # if loading via pylab doesn't work use csv
155 rawdata = self.load_csv_nf(f)
156
157 # converting the dates column to a date-number
158 rawdata[self.date_key] = pylab.datestr2num(rawdata[self.date_key])
159
160 self.date_key = varnm[self.date_key]
161 else:
162 try:
163 rawdata = pylab.load(f, delimiter=',') # don't need to 'skiprow' here
164 except ValueError: # if loading via pylab doesn't work use csv
165 rawdata = self.load_csv_nf(f)
166
167 # making sure that the variable names contain no leading or trailing spaces
168 varnm = [i.strip() for i in varnm]
169
170 # transforming the data into a dictionary
171 if type(rawdata) == list:
172 # if the csv module was used
173 self.data = dict(zip(varnm,rawdata))
174 else:
175 # if the pylab.load module was used
176 self.data = dict(zip(varnm,rawdata.T))
177
178 def load_pickle(self,f):
179 """
180 Loading data from a created earlier using the the dbase class.
181 """
182 self.data = cPickle.load(f) # loading the data dictionary
183
184 # what is the date variable's key if any
185 if self.date_key == '':
186 try:
187 self.date_key = cPickle.load(f) # if nothing given assume it is in the pickle file
188 except:
189 print "No date series in pickle file"
190
191 def load_shelve(self,fname,var):
192 """
193 Loading data from a created earlier using the the dbase class.
194 """
195 data = shelve.open(fname) # loading the data dictionary
196
197 # find out if a variable list is provided
198 if var == ():
199 var = data.keys()
200
201 # making sure the date variable is fetched from shelve
202 if self.date_key != '':
203 if not self.date_key in var: var = var + list(self.date_key)
204
205 self.data = dict([(i,data[i]) for i in var])
206 data.close()
207
208 def save(self,fname):
209 """
210 Dumping the class data dictionary into a csv or pickle file
211 """
212 fext = self.__ext(fname)
213 if fext == 'csv':
214 f = open(fname,'w')
215 self.save_csv(f)
216 f.close()
217 elif fext == 'pickle':
218 f = open(fname,'wb')
219 self.save_pickle(f)
220 f.close()
221 elif fext == 'she':
222 self.save_shelve(fname)
223 else:
224 raise 'This class only works on csv, pickle, and shelve files'
225
226 def save_csv(self,f):
227 """
228 Dumping the class data dictionary into a csv file
229 """
230 writer = csv.writer(f)
231 writer.writerow(self.data.keys())
232
233 data = self.data # a reference to the data dict
234 if self.date_key != []:
235 data = dict(data) # making a copy so the dates can be changed to strings
236 dates = pylab.num2date(data[self.date_key])
237 dates = array([i.strftime('%d %b %y') for i in dates])
238 data[self.date_key] = dates
239
240 writer.writerows(array(data.values()).T)
241
242 def save_pickle(self,f):
243 """
244 Dumping the class data dictionary and date_key into a binary pickle file
245 """
246 cPickle.dump(self.data,f,2)
247 cPickle.dump(self.date_key,f,2)
248
249 def save_shelve(self,fname):
250 """
251 Dumping the class data dictionary into a shelve file
252 """
253 f = shelve.open('data.she','c')
254 f = self.data
255 f.close()
256
257 def add_trend(self,tname = 'trend'):
258 # making a trend based on nobs in arbitrary series in dictionary
259 self.data[tname] = arange(self.nobs)
260
261 def add_dummy(self,dum, dname = 'dummy'):
262 if self.data.has_key(dname):
263 print "The variable name '" + str(dname) + "' already exists. Please select another name."
264 else:
265 self.data[dname] = dum
266
267 def add_seasonal_dummies(self,freq=52,ndum=13):
268 """
269 This function will only work if the freq and ndum 'fit. That is,
270 weeks and 4-weekly periods will work. Weeks and months/quarters
271 will not.
272 """
273 if self.date_key == []:
274 print "Cannot create seasonal dummies since no date array is known"
275 else:
276 # list of years
277 years = array([pylab.num2date(i).year for i in self.data[self.date_key]])
278
279 # how many periods in does the data start
280 start = freq - sum(years == min(years))
281
282 # how many unique years
283 nyear = unique(years).shape[0]
284
285 # using kronecker products to make a big dummy matrix
286 sd = kron(ones(nyear),kron(eye(ndum),ones(freq/ndum))).T;
287 sd = sd[start:start+self.nobs] # slicing the dummies to fit the data
288 sd = dict([(("sd"+str(i+1)),sd[:,i]) for i in range(1,ndum)])
289 self.data.update(sd) # adding the dummies to the main dict
290
291 def delvar(self,*var):
292 """
293 Deleting specified variables in the data dictionary, changing dictionary in place
294 """
295 [self.data.pop(i) for i in var]
296
297 def keepvar(self,*var):
298 """
299 Keeping specified variables in the data dictionary, changing dictionary in place
300 """
301 [self.data.pop(i) for i in self.data.keys() if i not in var]
302
303 def delvar_copy(self,*var):
304 """
305 Deleting specified variables in the data dictionary, making a copy
306 """
307 return dict([(i,self.data[i]) for i in self.data.keys() if i not in var])
308
309 def keepvar_copy(self,*var):
310 """
311 Keeping specified variables in the data dictionary, making a copy
312 """
313 return dict([(i,self.data[i]) for i in var])
314
315 def delobs(self,sel):
316 """
317 Deleting specified observations, changing dictionary in place
318 """
319 for i in self.data.keys(): self.data[i] = self.data[i][sel]
320
321 # updating the value of self.nobs
322 self.nobs -= sum(sel)
323
324 def keepobs(self,sel):
325 """
326 Keeping specified observations, changing dictionary in place
327 """
328 # updating the value of self.nobs
329 self.nobs -= sum(sel)
330
331 sel -= 1 # making true, false and vice-versa
332 self.delobs(sel)
333
334 def delobs_copy(self,sel):
335 """
336 Deleting specified observations, making a copy
337 """
338 return dict([(i,self.data[i][sel]) for i in self.data.keys()])
339
340 def keepobs_copy(self,sel):
341 """
342 Keeping specified observations, making a copy
343 """
344 sel -= 1 # making true, false and vice-versa
345 self.delobs_copy(sel)
346
347 def get(self,*var,**sel):
348 """
349 Copying data and keys of selected variables for further analysis
350 """
351 # calling convenience function to clean-up input parameters
352 var, sel = self.__var_and_sel_clean(var, sel)
353
354 # copying the entire dictionary (= default)
355 d = dict((i,self.data[i][sel]) for i in var)
356
357 return d.keys(), array(d.values()).T
358
359 def info(self,*var, **adict):
360 """
361 Printing descriptive statistics on selected variables
362 """
363
364 # calling convenience functions to clean-up input parameters
365 var, sel = self.__var_and_sel_clean(var, adict)
366 dates, nobs = self.__dates_and_nobs_clean(var, sel)
367
368 # setting the minimum and maximum dates to be used
369 mindate = pylab.num2date(min(dates)).strftime('%d %b %Y')
370 maxdate = pylab.num2date(max(dates)).strftime('%d %b %Y')
371
372 # number of variables (excluding date if present)
373 nvar = len(var)
374
375 print '\n=============================================================================='
376 print '============================ Database information ============================'
377 print '==============================================================================\n'
378
379 print 'file: %s' % self.DBname
380 print '# obs: %s' % nobs
381 print '# variables: %s' % nvar
382 print 'Start date: %s' % mindate
383 print 'End date: %s' % maxdate
384
385 print '\nvar min max mean std.dev miss levels'
386 print '=============================================================================='
387
388 sets = {}
389 for i in var:
390 col = self.data[i][sel];
391 if type(col[0]) == string_:
392 _miss = sum(col == '')
393 col_set = set(col)
394 sets[i] = col_set
395 print '''%-5s %-5s %-5s %-5s %-5s % -5.0f %-5i''' % tuple([i,'-','-','-','-',_miss,len(col_set)])
396 else:
397 _miss = isnan(col); col = col[_miss == False]; _min = col.min(); _max = col.max(); _mean = col.mean(); _std = col.std()
398 print '''% -5s % -5.2f % -5.2f % -5.2f % -5.2f % -5.0f''' % tuple([i,_min,_max,_mean,_std,sum(_miss)])
399
400 if sets:
401 print '\n\nLevels for non-numeric data:'
402 for i in sets.keys():
403 print '=============================================================================='
404 print '''% -5s % -5s''' % tuple([i,sets[i]])
405
406 def dataplot(self,*var, **adict):
407 """
408 Plotting the data with variable names
409 """
410 # calling convenience functions to clean-up input parameters
411 var, sel = self.__var_and_sel_clean(var, adict)
412 dates, nobs = self.__dates_and_nobs_clean(var, sel)
413
414 # don't try to plot non-numerical variables
415 nvar = []
416 for i in var:
417 col = self.data[i][sel]
418 if type(col[0]) != string_:
419 pylab.plot_date(dates,self.data[i][sel],'o-')
420 nvar = nvar + [i]
421
422 pylab.xlabel("Time (n = " + str(nobs) + ")")
423 pylab.title("Data plot of " + self.DBname)
424 pylab.legend(nvar)
425 if adict.has_key('file'):
426 pylab.savefig(adict['file'],dpi=600)
427 pylab.show()
428
429 def __var_and_sel_clean(self, var, sel, dates_needed = True):
430 """
431 Convenience function to avoid code duplication
432 """
433 # find out if a variable list is provided
434 if var == ():
435 var = self.data.keys()
436
437 # removing the date variable if it is present
438 var = [x for x in var if x != self.date_key]
439
440 # report variable label in alphabetical order
441 var.sort()
442
443 # find out if a selection rule is being used
444 # if not, set to empty tuple
445 if not sel.has_key('sel'):
446 sel = ()
447 else:
448 sel = sel['sel']
449
450 return var, sel
451
452 def __dates_and_nobs_clean(self, var, sel):
453 """
454 Convenience function to avoid code duplication
455 """
456 nobs = self.nobs
457 if len(sel):
458 nobs = nobs - (nobs - sum(sel))
459
460 if self.date_key != None and self.data.has_key(self.date_key):
461 # selecting dates from data base
462 dates = self.data[self.date_key][sel]
463 else:
464 # setting date series to start on 1/1/1950
465 dates = range(711858,nobs+711858)
466
467 return dates, nobs
468
469 def __ext(self,fname):
470 """
471 Finding the file extension of the filename passed to dbase
472 """
473 return fname.split('.')[-1].strip()
474
475 if __name__ == '__main__':
476
477 ###################################
478 ### usage examples of dbase class
479 ###################################
480
481 import sys
482 from scipy import c_
483
484 # making a directory to store simulate data
485 if not os.path.exists('./dbase_test_files'): os.mkdir('./dbase_test_files')
486
487 # creating simulated data and variable labels
488 varnm = ['date','a','b','c'] # variable labels
489 nobs = 100
490 data = randn(nobs,3) # the data array
491 dates = pylab.num2date(arange(730493,730493+(nobs*7),7))
492 dates = [i.strftime('%d %b %y') for i in dates]
493 data = c_[dates,data]
494
495 # adding a few missing values
496 data[5,1] = ''
497 data[9,3] = ''
498
499 # adding a non-numeric variable
500 varnm = varnm + ['id'] # variable labels
501 id = [('id'+str(i)) for i in range(nobs)]
502 id[8] = '' # setting one id to missing
503 data = c_[data,id]
504
505 # saving simulated data to a csv file
506 f = open('./dbase_test_files/data.csv','w')
507 writer = csv.writer(f)
508 writer.writerow(varnm)
509 writer.writerows(data)
510 f.close()
511
512 # loading the data from the csv file
513 a = dbase("./dbase_test_files/data.csv",date = 0)
514
515 # saving the dbase instance data to a pickle file
516 a.save("./dbase_test_files/data.pickle")
517 # saving the dbase data to a shelve file
518 ### a.save("./dbase_test_files/data.she")
519
520 # loading a sub-section of the data from a shelve file
521 ### print "\nLoading 2 variables from a shelve file\n"
522 ### b = dbase("./dbase_test_files/data.she",'a','b',date = 'date')
523
524 # showing data and variable names, from load_shelve
525 ### varnm, data = b.get()
526 ### print "Variable names from shelve file\n", varnm
527 ### print "\nData selected from shelve file\n", data
528 ### print "\nDate series", b.data[b.date_key]
529 ### del b # cleaning up
530
531 # loading the object from the pickle file
532 print "\nLoading the dbase object from a pickle file\n"
533 b = dbase("./dbase_test_files/data.pickle")
534
535 # getting the name of the file you are working on
536 print "\nWorking on file: " + b.DBname
537
538 # showing data and variable names
539 varnm, data = b.get()
540 print "Variable names from dbase class\n", varnm
541 print "\nData from dbase class\n", data
542 print "\nDate series", b.data[b.date_key]
543
544 # viewing selected data columns
545 varnm, data = b.get('a','c')
546 print "\nTwo columns selected using variable names\n", varnm, "\n", data
547
548 # saving to a csv file
549 print "\nSaving data and variable names to a different csv file\n", b.save("./dbase_test_files/data_save.csv")
550
551 # adding variables/data
552 x1 = b.data['a'] * b.data['b']
553 x2 = b.data['a'] * b.data['c']
554 xdict = {'x1':x1,'x2':x2}
555 b.data.update(xdict) # using a dictionaries own 'add/extend method'
556
557 varnm, data = b.get()
558 print "\nTwo variable names added\n", varnm
559 print "\nTwo columns added\n", data
560
561 # using copy.deepcopy to make a complete copy of the class instance data
562 import copy
563 c = copy.deepcopy(b)
564
565 # making the database smaller, inplace, by deleting selected variables
566 c.delvar('a','x2')
567 varnm, data = c.get()
568 print "\nTwo variable names deleted\n", varnm
569 print "\nTwo columns deleted\n", data
570
571 # making the database smaller, inplace, by keeping only selected variables
572 c = copy.deepcopy(b)
573 c.keepvar('a','x2')
574 varnm, data = c.get()
575 print "\nAll but two variable names deleted\n", varnm
576 print "\nAll but Two columns deleted\n", data
577
578 # specifying a selection rule
579 sel_rule = b.data['date'] > pylab.datestr2num("8/1/2001")
580
581 # making the database smaller, inplace, by delecting selected observation
582 c = copy.deepcopy(b)
583 c.delobs(sel_rule)
584
585 varnm, data = c.get()
586 print "\nReduced number of observations following the selection rule\n", data
587
588 # making the database smaller, inplace, by delecting all but the selected observation
589 c = copy.deepcopy(b)
590 c.keepobs(sel_rule)
591
592 varnm, data = c.get()
593 print "\nReduced number of observations following the inverse of the selection rule\n", data
594
595 # making a copy of of just the dictionary for selected variables
596 x = b.keepvar_copy('a')
597
598 # making a copy of of just the dictionary for everything but the selected variables
599 x = b.delvar_copy('a')
600
601 # making a copy of of just the dictionary for selected observations
602 x = b.keepobs_copy(sel_rule)
603
604 # making a copy of of just the dictionary for everything but the selected observation
605 x = b.delobs_copy(sel_rule)
606
607 # descriptive information on the database
608 b.info()
609
610 # plotting series
611 b.dataplot(file = './dbase_test_files/full_plot.png')
612
613 # adding a trend component
614 b.add_trend('mytrend') # or b.data.update({'mytrend':range(100)})
615
616 # adding a dummy
617 dummy_rule = b.data['a'] > 0
618 b.add_dummy(dummy_rule,'mydummy') # or b.data.update({'mydummy':dummy_rule})
619
620 # add seasonal dummies, specify data frequency and # of dummies
621 b.add_seasonal_dummies(52,13)
622
623 # descriptive information on the database for selected variables and time periods
624 b.info('b','c','mydummy', sel = sel_rule)
625
626 # plotting series for selected variables and selected data periods
627 b.dataplot('b','c','mydummy', sel = sel_rule, file = './dbase_test_files/partial_plot.png')
New Attachment
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.- [get | view] (2007-01-14 23:35:05, 4.5 KB) [[attachment:data.0.3.csv]]
- [get | view] (2007-01-07 18:13:31, 4.0 KB) [[attachment:data.csv]]
- [get | view] (2007-01-07 18:00:31, 16.1 KB) [[attachment:dbase.0.1.py]]
- [get | view] (2007-01-14 22:55:13, 5.6 KB) [[attachment:dbase.0.2.py]]
- [get | view] (2007-01-14 23:31:35, 18.1 KB) [[attachment:dbase.0.3.py]]
- [get | view] (2007-01-14 23:51:16, 18.4 KB) [[attachment:dbase.0.4.py]]
- [get | view] (2007-01-14 23:54:23, 18.4 KB) [[attachment:dbase.0.5.py]]
- [get | view] (2007-01-15 06:33:06, 18.4 KB) [[attachment:dbase.0.6.py]]
- [get | view] (2007-01-19 05:53:14, 19.1 KB) [[attachment:dbase.0.7.py]]
- [get | view] (2007-01-07 07:50:10, 15.8 KB) [[attachment:dbase.py]]
- [get | view] (2007-01-07 07:52:21, 7.6 KB) [[attachment:dbase.pydoc]]
- [get | view] (2007-01-07 18:01:44, 7.9 KB) [[attachment:dbase_pydoc.0.1.txt]]
- [get | view] (2007-01-14 22:56:04, 8.2 KB) [[attachment:dbase_pydoc.0.2.txt]]
- [get | view] (2007-01-07 18:02:50, 27.7 KB) [[attachment:ex_plot.0.1.png]]
- [get | view] (2007-01-07 08:01:04, 87.7 KB) [[attachment:ex_plot.png]]
- [get | view] (2007-01-07 08:03:54, 28.2 KB) [[attachment:ex_plot1.png]]
- [get | view] (2007-01-07 07:57:21, 895.9 KB) [[attachment:example_plot.png]]
- [get | view] (2007-01-07 07:51:51, 34.3 KB) [[attachment:pydoc]]