Attachment 'dbase.0.3.py'
Download
Toggle line numbers
1 from scipy import c_, arange, array, unique, kron, ones, eye, nan, isnan, string_
2 from numpy.random import randn
3 from __future__ import division
4 import pylab, cPickle, shelve, csv, copy, os
5
6 class dbase:
7 """
8 Author: Vincent Nijs (+ ?)
9 Email: v-nijs at kellogg.northwestern.edu
10 Last Modified: Sun Jan 7 10:05:12 CST 2007
11
12 Todo:
13 - Check if shelve loading/saving works
14
15 Tested on:
16 - Works on Mac OS X 10.4.8, with full matplotlib (incl. pytz)
17 - Tested on Linux
18
19 Dependencies:
20 - See import statement at the top of this file
21
22 Doc:
23 A simple data-frame, that reads and writes csv/pickle/shelve files with variable names.
24 Data is stored in a dictionary.
25
26 To use the class:
27
28 >>> from dbase import dbase
29 >>> y = dbase('your_filename.csv')
30
31 or for a previously created dbase object stored in a pickle file
32
33 >>> from dbase import dbase
34 >>> y = dbase('your_filename.pickle')
35
36 or without importing the dbase class
37
38 >>> import cPickle
39 >>> f = open('your_filename.pickle','rb')
40 >>> y = cPickle.load(f)
41 >>> data_key = cPickle.load(f)
42 >>> f.close()
43
44 or for a dictionary stored in a shelf file
45
46 >>> from dbase import dbase
47 >>> y = dbase('your_filename.pickle')
48
49 To return a list of variable names and an array of data
50
51 >>> varnm, data = y.get()
52
53 For usage examples of other class methods see the class tests at the bottom of this file. To see the class in action
54 simply run the file using 'python dbase.py'. This will generate some simulated data (data.csv) and save instance data
55 of the class to a pickle file.
56 """
57
58 def __init__(self,fname,var = (),date = ''):
59 """
60 Initializing the dbase class. Loading file fname.
61
62 If you have have a column in your csv file that is a date-string use:
63
64 >>> x = dbase('myfile.csv',date = 0)
65
66 where 0 is the index of the date column
67
68 If you have have an array in your pickle file that is a date variable use:
69
70 >>> x = dbase('myfile.pickle',date = 'date')
71
72 where 'date' is the key of the date array
73 """
74 self.load(fname,var,date)
75
76 def load(self,fname,var,date):
77 """
78 Loading data from a csv or a pickle file of the dbase class.
79 If this is csv file use pylab's load function. Seems much faster
80 than scipy.io.read_array.
81 """
82 # setting the ascii/csv file name used for input
83 self.DBname = os.getcwd() + '/' + fname
84
85 # assuming self.date_key = '' unless otherwise given
86 self.date_key = date
87
88 # getting the file extension
89 fext = self.__ext(fname)
90
91 # opening the file for reading
92 if fext == 'csv':
93 f = open(fname,'r')
94 self.load_csv(f)
95 f.close()
96 elif fext == 'pickle':
97 f = open(fname,'rb')
98 self.load_pickle(f)
99 f.close()
100 elif fext == 'she':
101 self.load_shelve(fname,var)
102 else:
103 raise 'This class only works on csv, pickle, and shelve files'
104
105 # specifying nobs in self.data
106 self.nobs = self.data[self.data.keys()[0]].shape[0]
107
108 def csvconvert(self,col):
109 """
110 Converting data in a string array to the appropriate type
111 """
112 # convert missing values to nan
113 col[col == ''] = 'nan'; col[col == '.'] = 'nan'
114 try:
115 # if a missing value is present int variables will be up-cast to float
116 return col.astype('i')
117 except ValueError:
118 try:
119 return col.astype('f')
120 except ValueError:
121 # if the data is a string, put back the empty string
122 col[col == 'nan'] = ''
123 return col
124
125 def load_csv_nf(self,f):
126 """
127 Loading data from a csv file using the csv module. Return a list of arrays.
128 Possibly with different types and/or missing values.
129 """
130 # resetting to the beginning of the file since pylab.load was already tried
131 f.seek(0)
132
133 reader = csv.reader(f)
134
135 # putting the data in an array of strings
136 datalist = array([i for i in reader])
137
138 # converting the data to an appropriate type
139 datalist = [self.csvconvert(datalist[1:,i]) for i in range(datalist.shape[1])]
140
141 return datalist
142
143 def load_csv(self,f):
144 """
145 Loading data from a csv file. Uses pylab's load function. Seems much faster
146 than scipy.io.read_array.
147 """
148 varnm = f.readline().split(',')
149
150 # what is the date variable's key if any, based on index passed as argument
151 if self.date_key != '':
152 try:
153 rawdata = pylab.load(f, delimiter=',',converters={self.date_key:pylab.datestr2num}) # don't need to 'skiprow' here
154 except ValueError: # if loading via pylab doesn't work use csv
155 rawdata = self.load_csv_nf(f)
156
157 # converting the dates column to a date-number
158 rawdata[self.date_key] = pylab.datestr2num(rawdata[self.date_key])
159
160 self.date_key = varnm[self.date_key]
161 else:
162 try:
163 rawdata = pylab.load(f, delimiter=',') # don't need to 'skiprow' here
164 except ValueError: # if loading via pylab doesn't work use csv
165 rawdata = self.load_csv_nf(f)
166
167 # making sure that the variable names contain no leading or trailing spaces
168 varnm = [i.strip() for i in varnm]
169
170 # transforming the data into a dictionary
171 if type(rawdata) == list:
172 # if the csv module was used
173 self.data = dict(zip(varnm,rawdata))
174 else:
175 # if the pylab.load module was used
176 self.data = dict(zip(varnm,rawdata.T))
177
178 def load_pickle(self,f):
179 """
180 Loading data from a created earlier using the the dbase class.
181 """
182 self.data = cPickle.load(f) # loading the data dictionary
183
184 # what is the date variable's key if any
185 if self.date_key == '':
186 try:
187 self.date_key = cPickle.load(f) # if nothing given assume it is in the pickle file
188 except:
189 print "No date series in pickle file"
190
191 def load_shelve(self,fname,var):
192 """
193 Loading data from a created earlier using the the dbase class.
194 """
195 data = shelve.open(fname) # loading the data dictionary
196
197 # find out if a variable list is provided
198 if var == ():
199 var = data.keys()
200
201 # making sure the date variable is fetched from shelve
202 if self.date_key != '':
203 if not self.date_key in var: var = var + list(self.date_key)
204
205 self.data = dict([(i,data[i]) for i in var])
206 data.close()
207
208 def save(self,fname):
209 """
210 Dumping the class data dictionary into a csv or pickle file
211 """
212 fext = self.__ext(fname)
213 if fext == 'csv':
214 f = open(fname,'w')
215 self.save_csv(f)
216 f.close()
217 elif fext == 'pickle':
218 f = open(fname,'wb')
219 self.save_pickle(f)
220 f.close()
221 elif fext == 'she':
222 self.save_shelve(fname)
223 else:
224 raise 'This class only works on csv, pickle, and shelve files'
225
226 def save_csv(self,f):
227 """
228 Dumping the class data dictionary into a csv file
229 """
230 writer = csv.writer(f)
231 writer.writerow(self.data.keys())
232
233 data = self.data # a reference to the data dict
234 if self.date_key != []:
235 data = dict(data) # making a copy so the dates can be changed to strings
236 dates = pylab.num2date(data[self.date_key])
237 dates = array([i.strftime('%d %b %y') for i in dates])
238 data[self.date_key] = dates
239
240 writer.writerows(array(data.values()).T)
241
242 def save_pickle(self,f):
243 """
244 Dumping the class data dictionary and date_key into a binary pickle file
245 """
246 cPickle.dump(self.data,f,2)
247 cPickle.dump(self.date_key,f,2)
248
249 def save_shelve(self,fname):
250 """
251 Dumping the class data dictionary into a shelve file
252 """
253 f = shelve.open('data.she','c')
254 f = self.data
255 f.close()
256
257 def add_trend(self,tname = 'trend'):
258 # making a trend based on nobs in arbitrary series in dictionary
259 self.data[tname] = arange(self.nobs)
260
261 def add_dummy(self,dum, dname = 'dummy'):
262 if self.data.has_key(dname):
263 print "The variable name '" + str(dname) + "' already exists. Please select another name."
264 else:
265 self.data[dname] = dum
266
267 def add_seasonal_dummies(self,freq=52,ndum=13):
268 """
269 This function will only work if the freq and ndum 'fit. That is,
270 weeks and 4-weekly periods will work. Weeks and months/quarters
271 will not.
272 """
273 if self.date_key == []:
274 print "Cannot create seasonal dummies since no date array is known"
275 else:
276 # list of years
277 years = array([pylab.num2date(i).year for i in self.data[self.date_key]])
278
279 # how many periods in does the data start
280 start = freq - sum(years == min(years))
281
282 # how many unique years
283 nyear = unique(years).shape[0]
284
285 # using kronecker products to make a big dummy matrix
286 sd = kron(ones(nyear),kron(eye(ndum),ones(freq/ndum))).T;
287 sd = sd[start:start+self.nobs] # slicing the dummies to fit the data
288 sd = dict([(("sd"+str(i+1)),sd[:,i]) for i in range(1,ndum)])
289 self.data.update(sd) # adding the dummies to the main dict
290
291 def delvar(self,*var):
292 """
293 Deleting specified variables in the data dictionary, changing dictionary in place
294 """
295 [self.data.pop(i) for i in var]
296
297 def keepvar(self,*var):
298 """
299 Keeping specified variables in the data dictionary, changing dictionary in place
300 """
301 [self.data.pop(i) for i in self.data.keys() if i not in var]
302
303 def delvar_copy(self,*var):
304 """
305 Deleting specified variables in the data dictionary, making a copy
306 """
307 return dict([(i,self.data[i]) for i in self.data.keys() if i not in var])
308
309 def keepvar_copy(self,*var):
310 """
311 Keeping specified variables in the data dictionary, making a copy
312 """
313 return dict([(i,self.data[i]) for i in var])
314
315 def delobs(self,sel):
316 """
317 Deleting specified observations, changing dictionary in place
318 """
319 for i in self.data.keys(): self.data[i] = self.data[i][sel]
320
321 # updating the value of self.nobs
322 self.nobs -= sum(sel)
323
324 def keepobs(self,sel):
325 """
326 Keeping specified observations, changing dictionary in place
327 """
328 # updating the value of self.nobs
329 self.nobs -= sum(sel)
330
331 sel -= 1 # making true, false and vice-versa
332 self.delobs(sel)
333
334 def delobs_copy(self,sel):
335 """
336 Deleting specified observations, making a copy
337 """
338 return dict([(i,self.data[i][sel]) for i in self.data.keys()])
339
340 def keepobs_copy(self,sel):
341 """
342 Keeping specified observations, making a copy
343 """
344 sel -= 1 # making true, false and vice-versa
345 self.delobs_copy(sel)
346
347 def get(self,*var,**sel):
348 """
349 Copying data and keys of selected variables for further analysis
350 """
351 # calling convenience function to clean-up input parameters
352 var, sel = self.__var_and_sel_clean(var, sel)
353
354 # copying the entire dictionary (= default)
355 d = dict((i,self.data[i][sel]) for i in var)
356
357 return d.keys(), array(d.values()).T
358
359 def info(self,*var, **adict):
360 """
361 Printing descriptive statistics on selected variables
362 """
363
364 # calling convenience functions to clean-up input parameters
365 var, sel = self.__var_and_sel_clean(var, adict)
366 dates, nobs = self.__dates_and_nobs_clean(var, sel)
367
368 # setting the minimum and maximum dates to be used
369 mindate = pylab.num2date(min(dates)).strftime('%d %b %Y')
370 maxdate = pylab.num2date(max(dates)).strftime('%d %b %Y')
371
372 # number of variables (excluding date if present)
373 nvar = len(var)
374
375 print '\n=============================================================================='
376 print '============================ Database information ============================'
377 print '==============================================================================\n'
378
379 print 'file: %s' % self.DBname
380 print '# obs: %s' % nobs
381 print '# variables: %s' % nvar
382 print 'Start date: %s' % mindate
383 print 'End date: %s' % maxdate
384
385 print '\nvar min max mean std.dev miss levels'
386 print '=============================================================================='
387
388 for i in var:
389 col = self.data[i][sel];
390 if type(col[0]) == string_:
391 _miss = sum(col == '')
392 print '''%-5s %-5s %-5s %-5s %-5s % -5.0f %-5i''' % tuple([i,'-','-','-','-',_miss,len(set(col))])
393 else:
394 _miss = isnan(col); col = col[_miss == False]; _min = col.min(); _max = col.max(); _mean = col.mean(); _std = col.std()
395 print '''% -5s % -5.2f % -5.2f % -5.2f % -5.2f % -5.0f''' % tuple([i,_min,_max,_mean,_std,sum(_miss)])
396
397 def dataplot(self,*var, **adict):
398 """
399 Plotting the data with variable names
400 """
401 # calling convenience functions to clean-up input parameters
402 var, sel = self.__var_and_sel_clean(var, adict)
403 dates, nobs = self.__dates_and_nobs_clean(var, sel)
404
405 # don't try to plot non-numerical variables
406 nvar = []
407 for i in var:
408 col = self.data[i][sel]
409 if type(col[0]) != string_:
410 pylab.plot_date(dates,self.data[i][sel],'o-')
411 nvar = nvar + [i]
412
413 pylab.xlabel("Time (n = " + str(nobs) + ")")
414 pylab.title("Data plot of " + self.DBname)
415 pylab.legend(nvar)
416 if adict.has_key('file'):
417 pylab.savefig(adict['file'],dpi=600)
418 pylab.show()
419
420 def __var_and_sel_clean(self, var, sel, dates_needed = True):
421 """
422 Convenience function to avoid code duplication
423 """
424 # find out if a variable list is provided
425 if var == ():
426 var = self.data.keys()
427
428 # removing the date variable if it is present
429 var = [x for x in var if x != self.date_key]
430
431 # report variable label in alphabetical order
432 var.sort()
433
434 # find out if a selection rule is being used
435 # if not, set to empty tuple
436 if not sel.has_key('sel'):
437 sel = ()
438 else:
439 sel = sel['sel']
440
441 return var, sel
442
443 def __dates_and_nobs_clean(self, var, sel):
444 """
445 Convenience function to avoid code duplication
446 """
447 nobs = self.nobs
448 if len(sel):
449 nobs = nobs - (nobs - sum(sel))
450
451 if self.date_key != None and self.data.has_key(self.date_key):
452 # selecting dates from data base
453 dates = self.data[self.date_key][sel]
454 else:
455 # setting date series to start on 1/1/1950
456 dates = range(711858,nobs+711858)
457
458 return dates, nobs
459
460 def __ext(self,fname):
461 """
462 Finding the file extension of the filename passed to dbase
463 """
464 return fname.split('.')[-1].strip()
465
466 if __name__ == '__main__':
467
468 ###################################
469 ### usage examples of dbase class
470 ###################################
471
472 import sys
473 from scipy import c_
474
475 # making a directory to store simulate data
476 if not os.path.exists('./dbase_test_files'): os.mkdir('./dbase_test_files')
477
478 # creating simulated data and variable labels
479 varnm = ['date','a','b','c'] # variable labels
480 nobs = 100
481 data = randn(nobs,3) # the data array
482 dates = pylab.num2date(arange(730493,730493+(nobs*7),7))
483 dates = [i.strftime('%d %b %y') for i in dates]
484 data = c_[dates,data]
485
486 # adding a few missing values
487 data[5,1] = ''
488 data[9,3] = ''
489
490 # adding a non-numeric variable
491 varnm = varnm + ['id'] # variable labels
492 id = [('id'+str(i)) for i in range(nobs)]
493 id[8] = '' # setting one id to missing
494 data = c_[data,id]
495
496 # saving simulated data to a csv file
497 f = open('./dbase_test_files/data.csv','w')
498 writer = csv.writer(f)
499 writer.writerow(varnm)
500 writer.writerows(data)
501 f.close()
502
503 # loading the data from the csv file
504 a = dbase("./dbase_test_files/data.csv",date = 0)
505
506 # saving the dbase instance data to a pickle file
507 a.save("./dbase_test_files/data.pickle")
508 # saving the dbase data to a shelve file
509 ### a.save("./dbase_test_files/data.she")
510
511 # loading a sub-section of the data from a shelve file
512 ### print "\nLoading 2 variables from a shelve file\n"
513 ### b = dbase("./dbase_test_files/data.she",'a','b',date = 'date')
514
515 # showing data and variable names, from load_shelve
516 ### varnm, data = b.get()
517 ### print "Variable names from shelve file\n", varnm
518 ### print "\nData selected from shelve file\n", data
519 ### print "\nDate series", b.data[b.date_key]
520 ### del b # cleaning up
521
522 # loading the object from the pickle file
523 print "\nLoading the dbase object from a pickle file\n"
524 b = dbase("./dbase_test_files/data.pickle")
525
526 # getting the name of the file you are working on
527 print "\nWorking on file: " + b.DBname
528
529 # showing data and variable names
530 varnm, data = b.get()
531 print "Variable names from dbase class\n", varnm
532 print "\nData from dbase class\n", data
533 print "\nDate series", b.data[b.date_key]
534
535 # viewing selected data columns
536 varnm, data = b.get('a','c')
537 print "\nTwo columns selected using variable names\n", varnm, "\n", data
538
539 # saving to a csv file
540 print "\nSaving data and variable names to a different csv file\n", b.save("./dbase_test_files/data_save.csv")
541
542 # adding variables/data
543 x1 = b.data['a'] * b.data['b']
544 x2 = b.data['a'] * b.data['c']
545 xdict = {'x1':x1,'x2':x2}
546 b.data.update(xdict) # using a dictionaries own 'add/extend method'
547
548 varnm, data = b.get()
549 print "\nTwo variable names added\n", varnm
550 print "\nTwo columns added\n", data
551
552 # using copy.deepcopy to make a complete copy of the class instance data
553 import copy
554 c = copy.deepcopy(b)
555
556 # making the database smaller, inplace, by deleting selected variables
557 c.delvar('a','x2')
558 varnm, data = c.get()
559 print "\nTwo variable names deleted\n", varnm
560 print "\nTwo columns deleted\n", data
561
562 # making the database smaller, inplace, by keeping only selected variables
563 c = copy.deepcopy(b)
564 c.keepvar('a','x2')
565 varnm, data = c.get()
566 print "\nAll but two variable names deleted\n", varnm
567 print "\nAll but Two columns deleted\n", data
568
569 # specifying a selection rule
570 sel_rule = b.data['date'] > pylab.datestr2num("8/1/2001")
571
572 # making the database smaller, inplace, by delecting selected observation
573 c = copy.deepcopy(b)
574 c.delobs(sel_rule)
575
576 varnm, data = c.get()
577 print "\nReduced number of observations following the selection rule\n", data
578
579 # making the database smaller, inplace, by delecting all but the selected observation
580 c = copy.deepcopy(b)
581 c.keepobs(sel_rule)
582
583 varnm, data = c.get()
584 print "\nReduced number of observations following the inverse of the selection rule\n", data
585
586 # making a copy of of just the dictionary for selected variables
587 x = b.keepvar_copy('a')
588
589 # making a copy of of just the dictionary for everything but the selected variables
590 x = b.delvar_copy('a')
591
592 # making a copy of of just the dictionary for selected observations
593 x = b.keepobs_copy(sel_rule)
594
595 # making a copy of of just the dictionary for everything but the selected observation
596 x = b.delobs_copy(sel_rule)
597
598 # descriptive information on the database
599 b.info()
600
601 # plotting series
602 b.dataplot(file = './dbase_test_files/full_plot.png')
603
604 # adding a trend component
605 b.add_trend('mytrend') # or b.data.update({'mytrend':range(100)})
606
607 # adding a dummy
608 dummy_rule = b.data['a'] > 0
609 b.add_dummy(dummy_rule,'mydummy') # or b.data.update({'mydummy':dummy_rule})
610
611 # add seasonal dummies, specify data frequency and # of dummies
612 b.add_seasonal_dummies(52,13)
613
614 # descriptive information on the database for selected variables and time periods
615 b.info('b','c','mydummy', sel = sel_rule)
616
617 # plotting series for selected variables and selected data periods
618 b.dataplot('b','c','mydummy', sel = sel_rule, file = './dbase_test_files/partial_plot.png')
New Attachment
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.- [get | view] (2007-01-14 23:35:05, 4.5 KB) [[attachment:data.0.3.csv]]
- [get | view] (2007-01-07 18:13:31, 4.0 KB) [[attachment:data.csv]]
- [get | view] (2007-01-07 18:00:31, 16.1 KB) [[attachment:dbase.0.1.py]]
- [get | view] (2007-01-14 22:55:13, 5.6 KB) [[attachment:dbase.0.2.py]]
- [get | view] (2007-01-14 23:31:35, 18.1 KB) [[attachment:dbase.0.3.py]]
- [get | view] (2007-01-14 23:51:16, 18.4 KB) [[attachment:dbase.0.4.py]]
- [get | view] (2007-01-14 23:54:23, 18.4 KB) [[attachment:dbase.0.5.py]]
- [get | view] (2007-01-15 06:33:06, 18.4 KB) [[attachment:dbase.0.6.py]]
- [get | view] (2007-01-19 05:53:14, 19.1 KB) [[attachment:dbase.0.7.py]]
- [get | view] (2007-01-07 07:50:10, 15.8 KB) [[attachment:dbase.py]]
- [get | view] (2007-01-07 07:52:21, 7.6 KB) [[attachment:dbase.pydoc]]
- [get | view] (2007-01-07 18:01:44, 7.9 KB) [[attachment:dbase_pydoc.0.1.txt]]
- [get | view] (2007-01-14 22:56:04, 8.2 KB) [[attachment:dbase_pydoc.0.2.txt]]
- [get | view] (2007-01-07 18:02:50, 27.7 KB) [[attachment:ex_plot.0.1.png]]
- [get | view] (2007-01-07 08:01:04, 87.7 KB) [[attachment:ex_plot.png]]
- [get | view] (2007-01-07 08:03:54, 28.2 KB) [[attachment:ex_plot1.png]]
- [get | view] (2007-01-07 07:57:21, 895.9 KB) [[attachment:example_plot.png]]
- [get | view] (2007-01-07 07:51:51, 34.3 KB) [[attachment:pydoc]]