Attachment 'dbase.0.1.py'
Download
Toggle line numbers
from __future__ import division

import pylab, cPickle, shelve, csv, copy, os

from numpy.random import randn
from scipy import c_, arange, array, unique, kron, ones, eye
5
class dbase:
    """
    A simple data-frame that reads and writes csv/pickle/shelve files with
    variable names. Data is stored in a dictionary keyed by variable name.

    Author: Vincent Nijs (+ ?)
    Email: v-nijs at kellogg.northwestern.edu
    Last Modified: Sun Jan 7 10:05:12 CST 2007

    Todo:
        - Check if shelve loading/saving works
        - Only tested on Mac OS X 10.4.8, with full matplotlib (incl. pytz)

    Dependencies:
        - See import statement at the top of this file

    Attributes:
        data     : dictionary mapping variable names to arrays of observations
        date_key : key of the date series in 'data', or None if there is none
        nobs     : number of observations, taken from one series in 'data'
        DBname   : path of the file the instance was loaded from

    To use the class:

    >>> from dbase import dbase
    >>> y = dbase('your_filename.csv')

    or for a previously created dbase object stored in a pickle file

    >>> from dbase import dbase
    >>> y = dbase('your_filename.pickle')

    or without importing the dbase class

    >>> import cPickle
    >>> f = open('your_filename.pickle','rb')
    >>> y = cPickle.load(f)
    >>> data_key = cPickle.load(f)
    >>> f.close()

    or for a dictionary stored in a shelve file

    >>> from dbase import dbase
    >>> y = dbase('your_filename.she')

    To return a list of variable names and an array of data

    >>> varnm, data = y.get()

    For usage examples of other class methods see the class tests at the
    bottom of this file. To see the class in action simply run the file using
    'python dbase.py'. This will generate some simulated data (data.csv) and
    save instance data of the class to a pickle file.
    """

    def __init__(self, fname, *var, **date):
        """
        Initializing the dbase class. Loading file fname.

        Any extra positional arguments are variable names; they are only used
        to select variables when loading from a shelve file.

        If you have a column in your csv file that is a date-string use:

        >>> x = dbase('myfile.csv',date = 0)

        where 0 is the index of the date column

        If you have an array in your pickle file that is a date variable use:

        >>> x = dbase('myfile.pickle',date = 'date')

        where 'date' is the key of the date array
        """
        self.load(fname, var, date)

    def __clean_date_key(self, key):
        """
        Reducing a date key to a single value, or None when there is no key.

        Lists and tuples (the values of the 'date' keyword dictionary, or the
        empty list that earlier versions of this class used for 'no date')
        are unwrapped to their first element.
        """
        if isinstance(key, (list, tuple)):
            if len(key) == 0:
                return None
            return key[0]
        return key

    def load(self, fname, var, date):
        """
        Loading data from a csv, pickle or shelve file.

        fname : name of the file; its extension (csv, pickle or she) selects the loader
        var   : tuple of variable names to fetch (only used for shelve files)
        date  : dictionary whose single value identifies the date series
                (a column index for csv files, a key for pickle/shelve files)

        Raises ValueError if the file extension is not supported.
        """
        # setting the file name used for input; os.path.join leaves an
        # absolute fname untouched instead of gluing the cwd in front of it
        self.DBname = os.path.join(os.getcwd(), fname)

        # self.date_key is None unless a date column/key is given
        self.date_key = self.__clean_date_key(list(date.values()))

        # getting the file extension
        fext = self.__ext(fname)

        # opening the file for reading; 'with' closes it even if loading fails
        if fext == 'csv':
            with open(fname, 'r') as f:
                self.load_csv(f)
        elif fext == 'pickle':
            with open(fname, 'rb') as f:
                self.load_pickle(f)
        elif fext == 'she':
            self.load_shelve(fname, var)
        else:
            raise ValueError('This class only works on csv, pickle, and shelve files')

        # specifying nobs based on an arbitrary series in self.data
        first = next(iter(self.data.values()), None)
        if first is None:
            self.nobs = 0
        else:
            self.nobs = first.shape[0]

    def load_csv(self, f):
        """
        Loading data from an open csv file whose first line holds the
        variable names. Uses pylab's load function. Seems much faster than
        scipy.io.read_array.

        NOTE(review): pylab.load only exists in old matplotlib releases.
        """
        # making sure that the variable names contain no leading or trailing
        # spaces *before* they are used to look up the date variable's name
        varnm = [name.strip() for name in f.readline().split(',')]

        # what is the date variable's key if any, based on index passed as argument
        date_col = self.__clean_date_key(self.date_key)
        if date_col is not None:
            # the header line is already consumed, so no need to 'skiprow' here
            rawdata = pylab.load(f, delimiter=',', converters={date_col: pylab.datestr2num})
            self.date_key = varnm[date_col]
        else:
            rawdata = pylab.load(f, delimiter=',')
            self.date_key = None

        # transforming the data into a dictionary, one entry per column
        self.data = dict(zip(varnm, rawdata.T))

    def load_pickle(self, f):
        """
        Loading data from an open pickle file created earlier using the
        dbase class.

        NOTE: unpickling can execute arbitrary code; only load files you trust.
        """
        self.data = cPickle.load(f)  # loading the data dictionary

        # what is the date variable's key if any
        # (a key given by the caller assumes pylab.datestr2num formatting was already applied)
        self.date_key = self.__clean_date_key(self.date_key)
        if self.date_key is None:
            try:
                # if nothing given assume it is in the pickle file
                self.date_key = self.__clean_date_key(cPickle.load(f))
            except (EOFError, cPickle.UnpicklingError):
                print("No date series in pickle file")

    def load_shelve(self, fname, var):
        """
        Loading data from a shelve file created earlier using the dbase class.

        var is a tuple of variable names to fetch; if it is empty all
        variables in the shelve file are loaded.
        """
        store = shelve.open(fname)  # opening the stored data dictionary
        try:
            # find out if a variable list is provided
            if var:
                names = list(var)
            else:
                names = list(store.keys())

            # making sure the date variable is fetched from shelve
            # (assumes formatting using pylab.datestr2num already applied)
            self.date_key = self.__clean_date_key(self.date_key)
            if self.date_key is not None and self.date_key not in names:
                names.append(self.date_key)

            self.data = dict((name, store[name]) for name in names)
        finally:
            store.close()

    def save(self, fname):
        """
        Dumping the class data dictionary into a csv, pickle or shelve file,
        depending on the extension of fname (csv, pickle or she).

        Raises ValueError if the file extension is not supported.
        """
        fext = self.__ext(fname)
        if fext == 'csv':
            with open(fname, 'w') as f:
                self.save_csv(f)
        elif fext == 'pickle':
            with open(fname, 'wb') as f:
                self.save_pickle(f)
        elif fext == 'she':
            self.save_shelve(fname)
        else:
            raise ValueError('This class only works on csv, pickle, and shelve files')

    def save_csv(self, f):
        """
        Dumping the class data dictionary into an open csv file. The first
        row holds the variable names; dates are written as '%d %b %y' strings.
        """
        writer = csv.writer(f)

        # fixing the column order once so header and data always line up
        names = list(self.data.keys())
        writer.writerow(names)

        columns = [self.data[name] for name in names]

        date_key = self.__clean_date_key(self.date_key)
        if date_key is not None and date_key in self.data:
            # only the local column list is changed, self.data keeps the numeric dates
            dates = pylab.num2date(self.data[date_key])
            columns[names.index(date_key)] = array([d.strftime('%d %b %y') for d in dates])

        writer.writerows(array(columns).T)

    def save_pickle(self, f):
        """
        Dumping the class data dictionary and date_key into a binary pickle file
        """
        cPickle.dump(self.data, f, 2)
        cPickle.dump(self.date_key, f, 2)

    def save_shelve(self, fname):
        """
        Dumping the class data dictionary into the shelve file fname
        """
        store = shelve.open(fname, 'c')
        try:
            store.update(self.data)
        finally:
            store.close()

    def add_trend(self, tname='trend'):
        """
        Adding a trend variable (0, 1, ..., nobs-1) under the name tname
        """
        self.data[tname] = arange(self.nobs)

    def add_dummy(self, dum, dname='dummy'):
        """
        Adding the series dum under the name dname, unless that name is taken
        """
        if dname in self.data:
            print("The variable name '" + str(dname) + "' already exists. Please select another name.")
        else:
            self.data[dname] = dum

    def add_seasonal_dummies(self, freq=52, ndum=13):
        """
        Adding seasonal dummies 'sd2' ... 'sd<ndum>' to the data dictionary
        (no dummy is added for the first season).

        This function will only work if the freq and ndum 'fit'. That is,
        weeks and 4-weekly periods will work. Weeks and months/quarters
        will not. Raises ValueError if freq is not a multiple of ndum.
        """
        date_key = self.__clean_date_key(self.date_key)
        if date_key is None or date_key not in self.data:
            print("Cannot create seasonal dummies since no date array is known")
            return

        if freq % ndum != 0:
            raise ValueError('freq must be a multiple of ndum')

        # list of years
        years = array([pylab.num2date(i).year for i in self.data[date_key]])

        # how many periods into the first year does the data start
        start = freq - int((years == years.min()).sum())

        # how many unique years
        nyear = unique(years).shape[0]

        # using kronecker products to make a big dummy matrix;
        # integer division since ones() needs an integer length
        sd = kron(ones(nyear), kron(eye(ndum), ones(freq // ndum))).T
        sd = sd[start:start + self.nobs]  # slicing the dummies to fit the data
        sd = dict((("sd" + str(i + 1)), sd[:, i]) for i in range(1, ndum))
        self.data.update(sd)  # adding the dummies to the main dict

    def delvar(self, *var):
        """
        Deleting specified variables in the data dictionary, changing dictionary in place
        """
        for name in var:
            self.data.pop(name)

    def keepvar(self, *var):
        """
        Keeping specified variables in the data dictionary, changing dictionary in place
        """
        # iterating over a copy of the keys since the dictionary shrinks in the loop
        for name in list(self.data.keys()):
            if name not in var:
                self.data.pop(name)

    def delvar_copy(self, *var):
        """
        Deleting specified variables in the data dictionary, making a copy
        """
        return dict((name, self.data[name]) for name in self.data if name not in var)

    def keepvar_copy(self, *var):
        """
        Keeping specified variables in the data dictionary, making a copy
        """
        return dict((name, self.data[name]) for name in var)

    def delobs(self, sel):
        """
        Deleting specified observations, changing dictionary in place.
        sel is a boolean selection rule; observations where it is True are
        removed. sel itself is not modified.
        """
        keep = ~array(sel, dtype=bool)
        for name in list(self.data.keys()):
            self.data[name] = self.data[name][keep]

        # updating the value of self.nobs
        self.nobs = int(keep.sum())

    def keepobs(self, sel):
        """
        Keeping specified observations, changing dictionary in place.
        sel is a boolean selection rule; only observations where it is True
        are kept. sel itself is not modified.
        """
        keep = array(sel, dtype=bool)
        for name in list(self.data.keys()):
            self.data[name] = self.data[name][keep]

        # updating the value of self.nobs
        self.nobs = int(keep.sum())

    def delobs_copy(self, sel):
        """
        Deleting specified observations, making a copy.
        Returns a dictionary without the observations where sel is True.
        """
        keep = ~array(sel, dtype=bool)
        return dict((name, self.data[name][keep]) for name in self.data)

    def keepobs_copy(self, sel):
        """
        Keeping specified observations, making a copy.
        Returns a dictionary with only the observations where sel is True.
        """
        keep = array(sel, dtype=bool)
        return dict((name, self.data[name][keep]) for name in self.data)

    def get(self, *var, **sel):
        """
        Copying data and keys of selected variables for further analysis.

        var : variable names to fetch (all but the date variable if omitted)
        sel : optional keyword 'sel' holding a selection rule for observations

        Returns the list of variable names in alphabetical order and an array
        with one column per variable, in the same order.
        """
        # calling convenience function to clean-up input parameters
        var, sel = self.__var_and_sel_clean(var, sel)

        # building the columns from the sorted names so names and columns line up
        columns = [self.data[name][sel] for name in var]

        return var, array(columns).T

    def info(self, *var, **adict):
        """
        Printing descriptive statistics on selected variables
        """
        # calling convenience functions to clean-up input parameters
        var, sel = self.__var_and_sel_clean(var, adict)
        dates, nobs = self.__dates_and_nobs_clean(var, sel)

        # setting the minimum and maximum dates to be used
        mindate = pylab.num2date(min(dates)).strftime('%d %b %Y')
        maxdate = pylab.num2date(max(dates)).strftime('%d %b %Y')

        # number of variables (excluding date if present)
        nvar = len(var)

        print('\n==============================================================')
        print('==================== Database information ====================')
        print('==============================================================\n')

        print('file: %s' % self.DBname)
        print('# obs: %s' % nobs)
        print('# variables: %s' % nvar)
        print('Start date: %s' % mindate)
        print('End date: %s' % maxdate)

        print('\nvar min max mean std.dev')
        print('==============================================================')

        for name in var:
            series = self.data[name][sel]
            stats = (name, series.min(), series.max(), series.mean(), series.std())
            print('''%-5s %-5.2f %-5.2f %-5.2f %-5.2f''' % stats)

    def dataplot(self, *var, **adict):
        """
        Plotting the data with variable names. Use the keyword 'sel' to
        pass a selection rule and the keyword 'file' to also save the plot.
        """
        # calling convenience functions to clean-up input parameters
        var, sel = self.__var_and_sel_clean(var, adict)
        dates, nobs = self.__dates_and_nobs_clean(var, sel)

        for name in var:
            pylab.plot_date(dates, self.data[name][sel], 'o-')

        pylab.xlabel("Time (n = " + str(nobs) + ")")
        pylab.title("Data plot of " + self.DBname)
        pylab.legend(var)
        if 'file' in adict:
            pylab.savefig(adict['file'], dpi=600)
        pylab.show()

    def __var_and_sel_clean(self, var, sel, dates_needed=True):
        """
        Convenience function to avoid code duplication.

        Returns the alphabetically sorted list of variable names (without the
        date variable) and the selection rule found under the key 'sel' of
        the keyword dictionary (an empty tuple if there is none).
        dates_needed is currently unused.
        """
        # find out if a variable list is provided
        if not var:
            var = self.data.keys()

        # removing the date variable if it is present and
        # reporting variable labels in alphabetical order
        date_key = self.__clean_date_key(self.date_key)
        var = sorted(name for name in var if name != date_key)

        # find out if a selection rule is being used; if not, use an empty
        # tuple (indexing an array with () selects everything)
        sel = sel.get('sel', ())

        return var, sel

    def __dates_and_nobs_clean(self, var, sel):
        """
        Convenience function to avoid code duplication.

        Returns the dates and the number of observations that remain after
        applying the selection rule sel.
        """
        nobs = self.nobs
        if len(sel):
            nobs = int(array(sel, dtype=bool).sum())

        date_key = self.__clean_date_key(self.date_key)
        if date_key is not None and date_key in self.data:
            # selecting dates from data base
            dates = self.data[date_key][sel]
        else:
            # setting date series to start on 1/1/1950
            dates = range(711858, nobs + 711858)

        return dates, nobs

    def __ext(self, fname):
        """
        Finding the file extension of the filename passed to dbase
        """
        return fname.split('.')[-1].strip()
405
406 if __name__ == '__main__':
407
408 ###################################
409 ### usage examples of dbase class
410 ###################################
411
412 import sys
413 from scipy import c_
414
415 # making a directory to store simulate data
416 if not os.path.exists('./dbase_test_files'): os.mkdir('./dbase_test_files')
417
418 # creating simulated data and variable labels
419 varnm = ['date','a','b','c'] # variable labels
420 nobs = 100
421 data = randn(nobs,3) # the data array
422 dates = pylab.num2date(arange(730493,730493+(nobs*7),7))
423 dates = [i.strftime('%d %b %y') for i in dates]
424 data = c_[dates,data]
425
426 # saving simulated data to a csv file
427 f = open('./dbase_test_files/data.csv','w')
428 writer = csv.writer(f)
429 writer.writerow(varnm)
430 writer.writerows(data)
431 f.close()
432
433 # loading the data from the csv file
434 a = dbase("./dbase_test_files/data.csv",date = 0)
435 # saving the dbase instance data to a pickle file
436 a.save("./dbase_test_files/data.pickle")
437 # saving the dbase data to a shelve file
438 ### a.save("./dbase_test_files/data.she")
439
440 # loading a sub-section of the data from a shelve file
441 ### print "\nLoading 2 variables from a shelve file\n"
442 ### b = dbase("./dbase_test_files/data.she",'a','b',date = 'date')
443
444 # showing data and variable names, from load_shelve
445 ### varnm, data = b.get()
446 ### print "Variable names from shelve file\n", varnm
447 ### print "\nData selected from shelve file\n", data
448 ### print "\nDate series", b.data[b.date_key]
449 ### del b # cleaning up
450
451 # loading the object from the pickle file
452 print "\nLoading the dbase object from a pickle file\n"
453 b = dbase("./dbase_test_files/data.pickle")
454
455 # getting the name of the file you are working on
456 print "\nWorking on file: " + b.DBname
457
458 # showing data and variable names
459 varnm, data = b.get()
460 print "Variable names from dbase class\n", varnm
461 print "\nData from dbase class\n", data
462 print "\nDate series", b.data[b.date_key]
463
464 # viewing selected data columns
465 varnm, data = b.get('a','c')
466 print "\nTwo columns selected using variable names\n", varnm, "\n", data
467
468 # saving to a csv file
469 print "\nSaving data and variable names to a different csv file\n", b.save("./dbase_test_files/data_save.csv")
470
471 # adding variables/data
472 x1 = b.data['a'] * b.data['b']
473 x2 = b.data['a'] * b.data['c']
474 xdict = {'x1':x1,'x2':x2}
475 b.data.update(xdict) # using a dictionaries own 'add/extend method'
476
477 varnm, data = b.get()
478 print "\nTwo variable names added\n", varnm
479 print "\nTwo columns added\n", data
480
481 # using copy.deepcopy to make a complete copy of the class instance data
482 import copy
483 c = copy.deepcopy(b)
484
485 # making the database smaller, inplace, by deleting selected variables
486 c.delvar('a','x2')
487 varnm, data = c.get()
488 print "\nTwo variable names deleted\n", varnm
489 print "\nTwo columns deleted\n", data
490
491 # making the database smaller, inplace, by keeping only selected variables
492 c = copy.deepcopy(b)
493 c.keepvar('a','x2')
494 varnm, data = c.get()
495 print "\nAll but two variable names deleted\n", varnm
496 print "\nAll but Two columns deleted\n", data
497
498 # specifying a selection rule
499 sel_rule = b.data['date'] > pylab.datestr2num("8/1/2001")
500
501 # making the database smaller, inplace, by delecting selected observation
502 c = copy.deepcopy(b)
503 c.delobs(sel_rule)
504
505 varnm, data = c.get()
506 print "\nReduced number of observations following the selection rule\n", data
507
508 # making the database smaller, inplace, by delecting all but the selected observation
509 c = copy.deepcopy(b)
510 c.keepobs(sel_rule)
511
512 varnm, data = c.get()
513 print "\nReduced number of observations following the inverse of the selection rule\n", data
514
515 # making a copy of of just the dictionary for selected variables
516 x = b.keepvar_copy('a')
517
518 # making a copy of of just the dictionary for everything but the selected variables
519 x = b.delvar_copy('a')
520
521 # making a copy of of just the dictionary for selected observations
522 x = b.keepobs_copy(sel_rule)
523
524 # making a copy of of just the dictionary for everything but the selected observation
525 x = b.delobs_copy(sel_rule)
526
527 # descriptive information on the database
528 b.info()
529
530 # plotting series
531 b.dataplot(file = './dbase_test_files/full_plot.png')
532
533 # adding a trend component
534 b.add_trend('mytrend') # or b.data.update({'mytrend':range(100)})
535
536 # adding a dummy
537 dummy_rule = b.data['a'] > 0
538 b.add_dummy(dummy_rule,'mydummy') # or b.data.update({'mydummy':dummy_rule})
539
540 # add seasonal dummies, specify data frequency and # of dummies
541 b.add_seasonal_dummies(52,13)
542
543 # descriptive information on the database for selected variables and time periods
544 b.info('b','c', sel = sel_rule)
545
546 # plotting series for selected variables and selected data periods
547 b.dataplot('b','c', sel = sel_rule, file = './dbase_test_files/partial_plot.png')
New Attachment
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.
- [get | view] (2007-01-14 23:35:05, 4.5 KB) [[attachment:data.0.3.csv]]
- [get | view] (2007-01-07 18:13:31, 4.0 KB) [[attachment:data.csv]]
- [get | view] (2007-01-07 18:00:31, 16.1 KB) [[attachment:dbase.0.1.py]]
- [get | view] (2007-01-14 22:55:13, 5.6 KB) [[attachment:dbase.0.2.py]]
- [get | view] (2007-01-14 23:31:35, 18.1 KB) [[attachment:dbase.0.3.py]]
- [get | view] (2007-01-14 23:51:16, 18.4 KB) [[attachment:dbase.0.4.py]]
- [get | view] (2007-01-14 23:54:23, 18.4 KB) [[attachment:dbase.0.5.py]]
- [get | view] (2007-01-15 06:33:06, 18.4 KB) [[attachment:dbase.0.6.py]]
- [get | view] (2007-01-19 05:53:14, 19.1 KB) [[attachment:dbase.0.7.py]]
- [get | view] (2007-01-07 07:50:10, 15.8 KB) [[attachment:dbase.py]]
- [get | view] (2007-01-07 07:52:21, 7.6 KB) [[attachment:dbase.pydoc]]
- [get | view] (2007-01-07 18:01:44, 7.9 KB) [[attachment:dbase_pydoc.0.1.txt]]
- [get | view] (2007-01-14 22:56:04, 8.2 KB) [[attachment:dbase_pydoc.0.2.txt]]
- [get | view] (2007-01-07 18:02:50, 27.7 KB) [[attachment:ex_plot.0.1.png]]
- [get | view] (2007-01-07 08:01:04, 87.7 KB) [[attachment:ex_plot.png]]
- [get | view] (2007-01-07 08:03:54, 28.2 KB) [[attachment:ex_plot1.png]]
- [get | view] (2007-01-07 07:57:21, 895.9 KB) [[attachment:example_plot.png]]
- [get | view] (2007-01-07 07:51:51, 34.3 KB) [[attachment:pydoc]]