Attachment 'dbase.py'
Download
Toggle line numbers
1 from scipy import c_, arange, array, unique, kron, ones, eye
2 from numpy.random import randn
3 from __future__ import division
4 import pylab, cPickle, shelve, csv, copy, os
5
6 class dbase:
7 """
8 TODO:
9 - Check if shelve loading/saving works
10 - Only tested on Mac OS X 10.4.8, with pytz
11
12 DOC:
13 A simple data-frame, that reads and writes csv/pickle/shelve files with variable names.
14 Data is stored in a dictionary.
15
16 To use the class:
17
18 >>> from dbase import dbase
19 >>> y = dbase('your_filename.csv')
20
21 or for a previously created dbase object stored in a pickle file
22
23 >>> from dbase import dbase
24 >>> y = dbase('your_filename.pickle')
25
26 or without importing the dbase class
27
28 >>> import cPickle
29 >>> f = open('your_filename.pickle','rb')
30 >>> y = cPickle.load(f)
31 >>> data_key = cPickle.load(f)
32 >>> f.close()
33
34 or for a dictionary stored in a shelf file
35
36 >>> from dbase import dbase
37 >>> y = dbase('your_filename.pickle')
38
39 To return a list of variable names and an array of data
40
41 >>> varnm, data = y.get()
42
43 For usage examples of other class methods see the class tests at the bottom of this file. To see the class in action
44 simply run the file using 'python dbase.py'. This will generate some simulated data (data.csv) and save instance data
45 of the class to a pickle file.
46 """
47
48 def __init__(self,fname,*var,**date):
49 """
50 Initializing the dbase class. Loading file fname.
51
52 If you have have a column in your csv file that is a date-string use:
53
54 >>> x = dbase('myfile.csv',date = 0)
55
56 where 0 is the index of the date column
57
58 If you have have an array in your pickle file that is a date variable use:
59
60 >>> x = dbase('myfile.pickle',date = 'date')
61
62 where 'date' is the key of the date array
63 """
64 self.load(fname,var,date)
65
66 def load(self,fname,var,date):
67 """
68 Loading data from a csv or a pickle file of the dbase class.
69 If this is csv file use pylab's load function. Seems much faster
70 than scipy.io.read_array.
71 """
72 # setting the ascii/csv file name used for input
73 self.DBname = os.getcwd() + '/' + fname
74
75 # assuming self.date_key = None unless otherwise given
76 self.date_key = date.values()
77
78 # getting the file extension
79 fext = self.__ext(fname)
80
81 # opening the file for reading
82 if fext == 'csv':
83 f = open(fname,'r')
84 self.load_csv(f)
85 f.close()
86 elif fext == 'pickle':
87 f = open(fname,'rb')
88 self.load_pickle(f)
89 f.close()
90 elif fext == 'she':
91 self.load_shelve(fname,var)
92 else:
93 raise 'This class only works on csv, pickle, and shelve files'
94
95 # specifying nobs in self.data
96 self.nobs = self.data[self.data.keys()[0]].shape[0]
97
98 def load_csv(self,f):
99 """
100 Loading data from a csv file. Uses pylab's load function. Seems much faster
101 than scipy.io.read_array.
102 """
103 varnm = f.readline().split(',')
104
105 # what is the date variable's key if any, based on index passed as argument
106 if self.date_key != []:
107 rawdata = pylab.load(f, delimiter=',',converters={self.date_key[0]:pylab.datestr2num}) # don't need to 'skiprow' here
108 self.date_key = varnm[self.date_key[0]]
109 else:
110 rawdata = pylab.load(f, delimiter=',') # don't need to 'skiprow' here
111
112 # making sure that the variable names contain no leading or trailing spaces
113 varnm = [i.strip() for i in varnm]
114
115 # transforming the data into a dictionary
116 self.data = dict(zip(varnm,rawdata.T))
117
118 def load_pickle(self,f):
119 """
120 Loading data from a created earlier using the the dbase class.
121 """
122 self.data = cPickle.load(f) # loading the data dictionary
123
124 # what is the date variable's key if any
125 if self.date_key == []:
126 try:
127 self.date_key = cPickle.load(f) # if nothing given assume it is in the pickle file
128 except:
129 print "No date series in pickle file"
130 else:
131 self.date_key = self.date_key[0] # assumes formatting using pylab.datestr2num already applied
132
133 def load_shelve(self,fname,var):
134 """
135 Loading data from a created earlier using the the dbase class.
136 """
137 data = shelve.open(fname) # loading the data dictionary
138
139 # find out if a variable list is provided
140 if var == ():
141 var = data.keys()
142
143 # making sure the date variable is fetched from shelve
144 if self.date_key != []:
145 if not self.date_key[0] in var: var = var + self.date_key
146 self.date_key = self.date_key[0] # assumes formatting using pylab.datestr2num already applied
147
148 self.data = dict([(i,data[i]) for i in var])
149 data.close()
150
151 def save(self,fname):
152 """
153 Dumping the class data dictionary into a csv or pickle file
154 """
155 fext = self.__ext(fname)
156 if fext == 'csv':
157 f = open(fname,'w')
158 self.save_csv(f)
159 f.close()
160 elif fext == 'pickle':
161 f = open(fname,'wb')
162 self.save_pickle(f)
163 f.close()
164 elif fext == 'she':
165 self.save_shelve(fname)
166 else:
167 raise 'This class only works on csv, pickle, and shelve files'
168
169 def save_csv(self,f):
170 """
171 Dumping the class data dictionary into a csv file
172 """
173 writer = csv.writer(f)
174 writer.writerow(self.data.keys())
175
176 data = self.data # a reference to the data dict
177 if self.date_key != []:
178 data = dict(data) # making a copy so the dates can be changed to strings
179 dates = pylab.num2date(data[self.date_key])
180 dates = array([i.strftime('%d %b %y') for i in dates])
181 data[self.date_key] = dates
182
183 writer.writerows(array(data.values()).T)
184
185 def save_pickle(self,f):
186 """
187 Dumping the class data dictionary and date_key into a binary pickle file
188 """
189 cPickle.dump(self.data,f,2)
190 cPickle.dump(self.date_key,f,2)
191
192 def save_shelve(self,fname):
193 """
194 Dumping the class data dictionary into a shelve file
195 """
196 f = shelve.open('data.she','c')
197 f = self.data
198 f.close()
199
200 def add_trend(self,tname = 'trend'):
201 # making a trend based on nobs in arbitrary series in dictionary
202 self.data[tname] = arange(self.nobs)
203
204 def add_dummy(self,dum, dname = 'dummy'):
205 if self.data.has_key(dname):
206 print "The variable name '" + str(dname) + "' already exists. Please select another name."
207 else:
208 self.data[dname] = dum
209
210 def add_seasonal_dummies(self,freq=52,ndum=13):
211 """
212 This function will only work if the freq and ndum 'fit. That is,
213 weeks and 4-weekly periods will work. Weeks and months/quarters
214 will not.
215 """
216 if self.date_key == []:
217 print "Cannot create seasonal dummies since no date array is known"
218 else:
219 # list of years
220 years = array([pylab.num2date(i).year for i in self.data[self.date_key]])
221
222 # how many periods in does the data start
223 start = freq - sum(years == min(years))
224
225 # how many unique years
226 nyear = unique(years).shape[0]
227
228 # using kronecker products to make a big dummy matrix
229 sd = kron(ones(nyear),kron(eye(ndum),ones(freq/ndum))).T;
230 sd = sd[start:start+self.nobs] # slicing the dummies to fit the data
231 sd = dict([(("sd"+str(i+1)),sd[:,i]) for i in range(1,ndum)])
232 self.data.update(sd) # adding the dummies to the main dict
233
234 def delvar(self,*var):
235 """
236 Deleting specified variables in the data dictionary, changing dictionary in place
237 """
238 [self.data.pop(i) for i in var]
239
240 def keepvar(self,*var):
241 """
242 Keeping specified variables in the data dictionary, changing dictionary in place
243 """
244 [self.data.pop(i) for i in self.data.keys() if i not in var]
245
246 def delvar_copy(self,*var):
247 """
248 Deleting specified variables in the data dictionary, making a copy
249 """
250 return dict([(i,self.data[i]) for i in self.data.keys() if i not in var])
251
252 def keepvar_copy(self,*var):
253 """
254 Keeping specified variables in the data dictionary, making a copy
255 """
256 return dict([(i,self.data[i]) for i in var])
257
258 def delobs(self,sel):
259 """
260 Deleting specified observations, changing dictionary in place
261 """
262 for i in self.data.keys(): self.data[i] = self.data[i][sel]
263
264 # updating the value of self.nobs
265 self.nobs -= sum(sel)
266
267 def keepobs(self,sel):
268 """
269 Keeping specified observations, changing dictionary in place
270 """
271 # updating the value of self.nobs
272 self.nobs -= sum(sel)
273
274 sel -= 1 # making true, false and vice-versa
275 self.delobs(sel)
276
277 def delobs_copy(self,sel):
278 """
279 Deleting specified observations, making a copy
280 """
281 return dict([(i,self.data[i][sel]) for i in self.data.keys()])
282
283 def keepobs_copy(self,sel):
284 """
285 Keeping specified observations, making a copy
286 """
287 sel -= 1 # making true, false and vice-versa
288 self.delobs_copy(sel)
289
290 def get(self,*var,**sel):
291 """
292 Copying data and keys of selected variables for further analysis
293 """
294 # calling convenience function to clean-up input parameters
295 var, sel = self.__var_and_sel_clean(var, sel)
296
297 # copying the entire dictionary (= default)
298 d = dict((i,self.data[i][sel]) for i in var)
299
300 return d.keys(), array(d.values()).T
301
302 def info(self,*var, **adict):
303 """
304 Printing descriptive statistics on selected variables
305 """
306
307 # calling convenience functions to clean-up input parameters
308 var, sel = self.__var_and_sel_clean(var, adict)
309 dates, nobs = self.__dates_and_nobs_clean(var, sel)
310
311 # setting the minimum and maximum dates to be used
312 mindate = pylab.num2date(min(dates)).strftime('%d %b %Y')
313 maxdate = pylab.num2date(max(dates)).strftime('%d %b %Y')
314
315 # number of variables (excluding date if present)
316 nvar = len(var)
317
318 print '\n=============================================================='
319 print '==================== Database information ===================='
320 print '==============================================================\n'
321
322 print 'file: %s' % b.DBname
323 print '# obs: %s' % nobs
324 print '# variables: %s' % nvar
325 print "Start date: %s" % mindate
326 print "End date: %s" % maxdate
327
328 print '\nvar min max mean std.dev'
329 print '=============================================================='
330
331 for i in var:
332 _min = self.data[i][sel].min(); _max = self.data[i][sel].max(); _mean = self.data[i][sel].mean(); _std = self.data[i][sel].std()
333 print '''%-5s %-5.2f %-5.2f %-5.2f %-5.2f''' % tuple([i,_min,_max,_mean,_std])
334
335 def dataplot(self,*var, **adict):
336 """
337 Plotting the data with variable names
338 """
339 # calling convenience functions to clean-up input parameters
340 var, sel = self.__var_and_sel_clean(var, adict)
341 dates, nobs = self.__dates_and_nobs_clean(var, sel)
342
343 for i in var:
344 pylab.plot_date(dates,self.data[i][sel],'o-')
345
346 pylab.xlabel("Time (n = " + str(nobs) + ")")
347 pylab.title("Data plot of " + self.DBname)
348 pylab.legend(var)
349 if adict.has_key('file'):
350 pylab.savefig(adict['file'],dpi=600)
351 pylab.show()
352
353 def __var_and_sel_clean(self, var, sel, dates_needed = True):
354 """
355 Convenience function to avoid code duplication
356 """
357 # find out if a variable list is provided
358 if var == ():
359 var = self.data.keys()
360
361 # removing the date variable if it is present
362 var = [x for x in var if x != self.date_key]
363
364 # report variable label in alphabetical order
365 var.sort()
366
367 # find out if a selection rule is being used
368 # if not, set to empty tuple
369 if not sel.has_key('sel'):
370 sel = ()
371 else:
372 sel = sel['sel']
373
374 return var, sel
375
376 def __dates_and_nobs_clean(self, var, sel):
377 """
378 Convenience function to avoid code duplication
379 """
380 nobs = self.nobs
381 if len(sel):
382 nobs = nobs - (nobs - sum(sel))
383
384 if self.date_key != None and self.data.has_key(self.date_key):
385 # selecting dates from data base
386 dates = self.data[self.date_key][sel]
387 else:
388 # setting date series to start on 1/1/1950
389 dates = range(711858,nobs+711858)
390
391 return dates, nobs
392
393 def __ext(self,fname):
394 """
395 Finding the file extension of the filename passed to dbase
396 """
397 return fname.split('.')[-1].strip()
398
399 if __name__ == '__main__':
400
401 ########################
402 ### Testing dbase class
403 ########################
404
405 import sys
406 from scipy import c_
407
408 # making a directory to store simulate data
409 if not os.path.exists('./dbase_test_files'): os.mkdir('./dbase_test_files')
410
411 # creating simulated data and variable labels
412 varnm = ['date','a','b','c'] # variable labels
413 nobs = 100
414 data = randn(nobs,3) # the data array
415 dates = pylab.num2date(arange(730493,730493+(nobs*7),7))
416 dates = [i.strftime('%d %b %y') for i in dates]
417 data = c_[dates,data]
418
419 # saving simulated data to a csv file
420 f = open('./dbase_test_files/data.csv','w')
421 writer = csv.writer(f)
422 writer.writerow(varnm)
423 writer.writerows(data)
424 f.close()
425
426 # loading the data from the csv file
427 a = dbase("./dbase_test_files/data.csv",date = 0)
428 # saving the dbase instance data to a pickle file
429 a.save("./dbase_test_files/data.pickle")
430 # saving the dbase data to a shelve file
431 ### a.save("./dbase_test_files/data.she")
432
433 # loading a sub-section of the data from a shelve file
434 ### print "\nLoading 2 variables from a shelve file\n"
435 ### b = dbase("./dbase_test_files/data.she",'a','b',date = 'date')
436
437 # showing data and variable names, from load_shelve
438 ### varnm, data = b.get()
439 ### print "Variable names from shelve file\n", varnm
440 ### print "\nData selected from shelve file\n", data
441 ### print "\nDate series", b.data[b.date_key]
442 ### del b # cleaning up
443
444 # loading the object from the pickle file
445 print "\nLoading the dbase object from a pickle file\n"
446 b = dbase("./dbase_test_files/data.pickle")
447
448 # getting the name of the file you are working on
449 print "\nWorking on file: " + b.DBname
450
451 # showing data and variable names
452 varnm, data = b.get()
453 print "Variable names from dbase class\n", varnm
454 print "\nData from dbase class\n", data
455 print "\nDate series", b.data[b.date_key]
456
457 # viewing selected data columns
458 varnm, data = b.get('a','c')
459 print "\nTwo columns selected using variable names\n", varnm, "\n", data
460
461 # saving to a csv file
462 print "\nSaving data and variable names to a different csv file\n", b.save("./dbase_test_files/data_save.csv")
463
464 # adding variables/data
465 x1 = b.data['a'] * b.data['b']
466 x2 = b.data['a'] * b.data['c']
467 xdict = {'x1':x1,'x2':x2}
468 b.data.update(xdict) # using a dictionaries own 'add/extend method'
469
470 varnm, data = b.get()
471 print "\nTwo variable names added\n", varnm
472 print "\nTwo columns added\n", data
473
474 # using copy.deepcopy to make a complete copy of the class instance data
475 import copy
476 c = copy.deepcopy(b)
477
478 # making the database smaller, inplace, by deleting selected variables
479 c.delvar('a','x2')
480 varnm, data = c.get()
481 print "\nTwo variable names deleted\n", varnm
482 print "\nTwo columns deleted\n", data
483
484 # making the database smaller, inplace, by keeping only selected variables
485 c = copy.deepcopy(b)
486 c.keepvar('a','x2')
487 varnm, data = c.get()
488 print "\nAll but two variable names deleted\n", varnm
489 print "\nAll but Two columns deleted\n", data
490
491 # specifying a selection rule
492 sel_rule = b.data['date'] > pylab.datestr2num("8/1/2001")
493
494 # making the database smaller, inplace, by delecting selected observation
495 c = copy.deepcopy(b)
496 c.delobs(sel_rule)
497
498 varnm, data = c.get()
499 print "\nReduced number of observations following the selection rule\n", data
500
501 # making the database smaller, inplace, by delecting all but the selected observation
502 c = copy.deepcopy(b)
503 c.keepobs(sel_rule)
504
505 varnm, data = c.get()
506 print "\nReduced number of observations following the inverse of the selection rule\n", data
507
508 # making a copy of of just the dictionary for selected variables
509 x = b.keepvar_copy('a')
510
511 # making a copy of of just the dictionary for everything but the selected variables
512 x = b.delvar_copy('a')
513
514 # making a copy of of just the dictionary for selected observations
515 x = b.keepobs_copy(sel_rule)
516
517 # making a copy of of just the dictionary for everything but the selected observation
518 x = b.delobs_copy(sel_rule)
519
520 # descriptive information on the database
521 b.info()
522
523 # plotting series
524 b.dataplot(file = './dbase_test_files/full_plot.png')
525
526 # adding a trend component
527 b.add_trend('mytrend') # or b.data.update({'mytrend':range(100)})
528
529 # adding a dummy
530 dummy_rule = b.data['a'] > 0
531 b.add_dummy(dummy_rule,'mydummy') # or b.data.update({'mydummy':dummy_rule})
532
533 # add seasonal dummies
534 b.add_seasonal_dummies(52,13)
535
536 # descriptive information on the database for selected time period
537 b.info('b','c', sel = sel_rule)
538
539 # plotting series
540 b.dataplot('b','c', sel = sel_rule, file = './dbase_test_files/partial_plot.png')
New Attachment
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.- [get | view] (2007-01-14 23:35:05, 4.5 KB) [[attachment:data.0.3.csv]]
- [get | view] (2007-01-07 18:13:31, 4.0 KB) [[attachment:data.csv]]
- [get | view] (2007-01-07 18:00:31, 16.1 KB) [[attachment:dbase.0.1.py]]
- [get | view] (2007-01-14 22:55:13, 5.6 KB) [[attachment:dbase.0.2.py]]
- [get | view] (2007-01-14 23:31:35, 18.1 KB) [[attachment:dbase.0.3.py]]
- [get | view] (2007-01-14 23:51:16, 18.4 KB) [[attachment:dbase.0.4.py]]
- [get | view] (2007-01-14 23:54:23, 18.4 KB) [[attachment:dbase.0.5.py]]
- [get | view] (2007-01-15 06:33:06, 18.4 KB) [[attachment:dbase.0.6.py]]
- [get | view] (2007-01-19 05:53:14, 19.1 KB) [[attachment:dbase.0.7.py]]
- [get | view] (2007-01-07 07:50:10, 15.8 KB) [[attachment:dbase.py]]
- [get | view] (2007-01-07 07:52:21, 7.6 KB) [[attachment:dbase.pydoc]]
- [get | view] (2007-01-07 18:01:44, 7.9 KB) [[attachment:dbase_pydoc.0.1.txt]]
- [get | view] (2007-01-14 22:56:04, 8.2 KB) [[attachment:dbase_pydoc.0.2.txt]]
- [get | view] (2007-01-07 18:02:50, 27.7 KB) [[attachment:ex_plot.0.1.png]]
- [get | view] (2007-01-07 08:01:04, 87.7 KB) [[attachment:ex_plot.png]]
- [get | view] (2007-01-07 08:03:54, 28.2 KB) [[attachment:ex_plot1.png]]
- [get | view] (2007-01-07 07:57:21, 895.9 KB) [[attachment:example_plot.png]]
- [get | view] (2007-01-07 07:51:51, 34.3 KB) [[attachment:pydoc]]