attachment:dbase.0.1.py of Cookbook/dbase

Attachment 'dbase.0.1.py'

from __future__ import division

import copy
import cPickle
import csv
import os
import shelve

import pylab
from numpy.random import randn
from scipy import c_, arange, array, unique, kron, ones, eye
   5 
   6 class dbase:
   7 	"""
   8 	Author: Vincent Nijs (+ ?)
   9 	Email: v-nijs at kellogg.northwestern.edu
  10 	Last Modified: Sun Jan 7 10:05:12 CST 2007
  11 		
  12 	Todo:
  13 		- Check if shelve loading/saving works
  14 		- Only tested on Mac OS X 10.4.8, with full matplotlib (incl. pytz)
  15 	
  16 	Dependencies:
  17 		- See import statement at the top of this file
  18 
  19 	Doc:
  20 	A simple data-frame, that reads and writes csv/pickle/shelve files with variable names.
  21 	Data is stored in a dictionary. 
  22 
  23 	To use the class:
  24 
  25 		>>> from dbase import dbase
  26 		>>> y = dbase('your_filename.csv')
  27 
  28 	or for a previously created dbase object stored in a pickle file
  29 
  30 		>>> from dbase import dbase
  31 		>>> y = dbase('your_filename.pickle')
  32 
  33 		or without importing the dbase class
  34 
  35 		>>> import cPickle 
  36 		>>> f = open('your_filename.pickle','rb')
  37 		>>> y = cPickle.load(f)
  38 		>>> data_key = cPickle.load(f)
  39 		>>> f.close()
  40 
  41 	or for a dictionary stored in a shelf file
  42 
  43 		>>> from dbase import dbase
  44 		>>> y = dbase('your_filename.pickle')
  45 
  46 	To return a list of variable names and an array of data
  47 
  48 		>>> varnm, data = y.get()
  49 	
  50 	For usage examples of other class methods see the class tests at the bottom of this file. To see the class in action
  51 	simply run the file using 'python dbase.py'. This will generate some simulated data (data.csv) and save instance data
  52 	of the class to a pickle file.
  53 	"""
  54 
  55 	def __init__(self,fname,*var,**date):
  56 		"""
  57 		Initializing the dbase class. Loading file fname.
  58 
  59 		If you have have a column in your csv file that is a date-string use:
  60 
  61 			>>> x = dbase('myfile.csv',date = 0)
  62 
  63 		where 0 is the index of the date column
  64 
  65 		If you have have an array in your pickle file that is a date variable use:
  66 
  67 			>>> x = dbase('myfile.pickle',date = 'date')
  68 
  69 		where 'date' is the key of the date array
  70 		"""
  71 		self.load(fname,var,date)
  72 
  73 	def load(self,fname,var,date):
  74 		"""
  75 		Loading data from a csv or a pickle file of the dbase class.
  76 		If this is csv file use pylab's load function. Seems much faster
  77 		than scipy.io.read_array.
  78 		"""
  79 		# setting the ascii/csv file name used for input
  80 		self.DBname = os.getcwd() + '/' + fname
  81 
  82 		# assuming self.date_key = None unless otherwise given
  83 		self.date_key = date.values()
  84 
  85 		# getting the file extension
  86 		fext = self.__ext(fname)
  87 
  88 		# opening the file for reading
  89 		if fext == 'csv':
  90 			f = open(fname,'r')
  91 			self.load_csv(f)
  92 			f.close()
  93 		elif fext == 'pickle':
  94 			f = open(fname,'rb')
  95 			self.load_pickle(f)
  96 			f.close()
  97 		elif fext == 'she':
  98 			self.load_shelve(fname,var)
  99 		else:
 100 			raise 'This class only works on csv, pickle, and shelve files'
 101 
 102 		# specifying nobs in self.data
 103 		self.nobs = self.data[self.data.keys()[0]].shape[0]
 104 
 105 	def load_csv(self,f):
 106 		"""
 107 		Loading data from a csv file. Uses pylab's load function. Seems much faster
 108 		than scipy.io.read_array.
 109 		"""
 110 		varnm = f.readline().split(',')
 111 
 112 		# what is the date variable's key if any, based on index passed as argument
 113 		if self.date_key != []:
 114 			rawdata = pylab.load(f, delimiter=',',converters={self.date_key[0]:pylab.datestr2num})			# don't need to 'skiprow' here
 115 			self.date_key = varnm[self.date_key[0]]
 116 		else:
 117 			rawdata = pylab.load(f, delimiter=',')															# don't need to 'skiprow' here
 118 
 119 		# making sure that the variable names contain no leading or trailing spaces
 120 		varnm = [i.strip() for i in varnm]
 121 
 122 		# transforming the data into a dictionary
 123 		self.data = dict(zip(varnm,rawdata.T))
 124 
 125 	def load_pickle(self,f):
 126 		"""
 127 		Loading data from a created earlier using the the dbase class.
 128 		"""
 129 		self.data = cPickle.load(f)					# loading the data dictionary
 130 
 131 		# what is the date variable's key if any
 132 		if self.date_key == []:
 133 			try:
 134 				self.date_key = cPickle.load(f)		# if nothing given assume it is in the pickle file
 135 			except:
 136 				print "No date series in pickle file"
 137 		else:
 138 			self.date_key = self.date_key[0]		# assumes formatting using pylab.datestr2num already applied
 139 
 140 	def load_shelve(self,fname,var):
 141 		"""
 142 		Loading data from a created earlier using the the dbase class.
 143 		"""
 144 		data = shelve.open(fname)				# loading the data dictionary
 145 
 146 		# find out if a variable list is provided
 147 		if var == ():
 148 			var = data.keys()
 149 
 150 		# making sure the date variable is fetched from shelve
 151 		if self.date_key != []:
 152 			if not self.date_key[0] in var: var = var + self.date_key
 153 			self.date_key = self.date_key[0]		# assumes formatting using pylab.datestr2num already applied
 154 
 155 		self.data = dict([(i,data[i]) for i in var])
 156 		data.close()
 157 
 158 	def save(self,fname):
 159 		"""
 160 		Dumping the class data dictionary into a csv or pickle file
 161 		"""
 162 		fext = self.__ext(fname)
 163 		if fext == 'csv':
 164 			f = open(fname,'w')
 165 			self.save_csv(f)
 166 			f.close()
 167 		elif fext == 'pickle':
 168 			f = open(fname,'wb')
 169 			self.save_pickle(f)
 170 			f.close()
 171 		elif fext == 'she':
 172 			self.save_shelve(fname)
 173 		else:
 174 			raise 'This class only works on csv, pickle, and shelve files'
 175 
 176 	def save_csv(self,f):
 177 		"""
 178 		Dumping the class data dictionary into a csv file
 179 		"""
 180 		writer = csv.writer(f)
 181 		writer.writerow(self.data.keys())
 182 
 183 		data = self.data						# a reference to the data dict
 184 		if self.date_key != []:
 185 			data = dict(data)				# making a copy so the dates can be changed to strings
 186 			dates = pylab.num2date(data[self.date_key])
 187 			dates = array([i.strftime('%d %b %y') for i in dates])
 188 			data[self.date_key] = dates
 189 
 190 		writer.writerows(array(data.values()).T)
 191 
 192 	def save_pickle(self,f):
 193 		"""
 194 		Dumping the class data dictionary and date_key into a binary pickle file
 195 		"""
 196 		cPickle.dump(self.data,f,2)
 197 		cPickle.dump(self.date_key,f,2)
 198 
 199 	def save_shelve(self,fname):
 200 		"""
 201 		Dumping the class data dictionary into a shelve file
 202 		"""
 203 		f = shelve.open('data.she','c') 
 204 		f = self.data
 205 		f.close()
 206 
 207 	def add_trend(self,tname = 'trend'):
 208 		# making a trend based on nobs in arbitrary series in dictionary
 209 		self.data[tname] = arange(self.nobs)
 210 
 211 	def add_dummy(self,dum, dname = 'dummy'):
 212 		if self.data.has_key(dname):
 213 			print "The variable name '" + str(dname) + "' already exists. Please select another name."
 214 		else:
 215 			self.data[dname] = dum
 216 
 217 	def add_seasonal_dummies(self,freq=52,ndum=13):
 218 		"""
 219 		This function will only work if the freq and ndum 'fit. That is,
 220 		weeks and 4-weekly periods will work. Weeks and months/quarters
 221 		will not.
 222 		"""
 223 		if self.date_key == []:
 224 			print "Cannot create seasonal dummies since no date array is known"
 225 		else:
 226 			# list of years
 227 			years = array([pylab.num2date(i).year for i in self.data[self.date_key]])
 228 
 229 			# how many periods in does the data start
 230 			start = freq - sum(years ==	min(years))
 231 
 232 			# how many unique years
 233 			nyear = unique(years).shape[0]
 234 
 235 			# using kronecker products to make a big dummy matrix
 236 			sd = kron(ones(nyear),kron(eye(ndum),ones(freq/ndum))).T;
 237 			sd = sd[start:start+self.nobs]		# slicing the dummies to fit the data	
 238 			sd = dict([(("sd"+str(i+1)),sd[:,i]) for i in range(1,ndum)])
 239 			self.data.update(sd)				# adding the dummies to the main dict
 240 
 241 	def delvar(self,*var):
 242 		"""
 243 		Deleting specified variables in the data dictionary, changing dictionary in place
 244 		"""
 245 		[self.data.pop(i) for i in var]
 246 
 247 	def keepvar(self,*var):
 248 		"""
 249 		Keeping specified variables in the data dictionary, changing dictionary in place
 250 		"""
 251 		[self.data.pop(i) for i in self.data.keys() if i not in var]
 252 
 253 	def delvar_copy(self,*var):
 254 		"""
 255 		Deleting specified variables in the data dictionary, making a copy
 256 		"""
 257 		return dict([(i,self.data[i]) for i in self.data.keys() if i not in var])
 258 
 259 	def keepvar_copy(self,*var):
 260 		"""
 261 		Keeping specified variables in the data dictionary, making a copy
 262 		"""
 263 		return dict([(i,self.data[i]) for i in var])
 264 
 265 	def delobs(self,sel):
 266 		"""
 267 		Deleting specified observations, changing dictionary in place
 268 		"""
 269 		for i in self.data.keys(): self.data[i] = self.data[i][sel]
 270 
 271 		# updating the value of self.nobs
 272 		self.nobs -= sum(sel)
 273 
 274 	def keepobs(self,sel):
 275 		"""
 276 		Keeping specified observations, changing dictionary in place
 277 		"""
 278 		# updating the value of self.nobs
 279 		self.nobs -= sum(sel)
 280 
 281 		sel -= 1				# making true, false and vice-versa
 282 		self.delobs(sel)
 283 
 284 	def delobs_copy(self,sel):
 285 		"""
 286 		Deleting specified observations, making a copy
 287 		"""
 288 		return dict([(i,self.data[i][sel]) for i in self.data.keys()])
 289 
 290 	def keepobs_copy(self,sel):
 291 		"""
 292 		Keeping specified observations, making a copy
 293 		"""
 294 		sel -= 1				# making true, false and vice-versa
 295 		self.delobs_copy(sel)
 296 
 297 	def get(self,*var,**sel):
 298 		"""
 299 		Copying data and keys of selected variables for further analysis
 300 		"""
 301 		# calling convenience function to clean-up input parameters
 302 		var, sel = self.__var_and_sel_clean(var, sel)
 303 
 304 		# copying the entire dictionary (= default)
 305 		d = dict((i,self.data[i][sel]) for i in var)
 306 
 307 		return d.keys(), array(d.values()).T
 308 
 309 	def info(self,*var, **adict):
 310 		"""
 311 		Printing descriptive statistics on selected variables
 312 		"""
 313 			
 314 		# calling convenience functions to clean-up input parameters
 315 		var, sel = self.__var_and_sel_clean(var, adict)
 316 		dates, nobs = self.__dates_and_nobs_clean(var, sel)
 317 			
 318 		# setting the minimum and maximum dates to be used
 319 		mindate = pylab.num2date(min(dates)).strftime('%d %b %Y')
 320 		maxdate = pylab.num2date(max(dates)).strftime('%d %b %Y')
 321 
 322 		# number of variables (excluding date if present)
 323 		nvar = len(var)
 324 
 325 		print '\n=============================================================='
 326 		print '==================== Database information ===================='
 327 		print '==============================================================\n'
 328 
 329 		print 'file:				%s' % self.DBname
 330 		print '# obs:				%s' % nobs
 331 		print '# variables:		%s' % nvar 
 332 		print 'Start date:			%s' % mindate
 333 		print 'End date:			%s' % maxdate
 334 
 335 		print '\nvar				min			max			mean		std.dev'
 336 		print '=============================================================='
 337 		
 338 		for i in var:
 339 			_min = self.data[i][sel].min(); _max = self.data[i][sel].max(); _mean = self.data[i][sel].mean(); _std = self.data[i][sel].std()
 340 			print '''%-5s			%-5.2f		%-5.2f		%-5.2f		%-5.2f''' % tuple([i,_min,_max,_mean,_std]) 
 341 	
 342 	def dataplot(self,*var, **adict):
 343 		"""
 344 		Plotting the data with variable names
 345 		"""
 346 		# calling convenience functions to clean-up input parameters
 347 		var, sel = self.__var_and_sel_clean(var, adict)
 348 		dates, nobs = self.__dates_and_nobs_clean(var, sel)
 349 
 350 		for i in var:
 351 			pylab.plot_date(dates,self.data[i][sel],'o-') 
 352 
 353 		pylab.xlabel("Time (n = " + str(nobs) + ")") 
 354 		pylab.title("Data plot of " + self.DBname)
 355 		pylab.legend(var)
 356 		if adict.has_key('file'):
 357 			pylab.savefig(adict['file'],dpi=600)
 358 		pylab.show()
 359 
 360 	def __var_and_sel_clean(self, var, sel, dates_needed = True):
 361 		"""
 362 		Convenience function to avoid code duplication
 363 		"""
 364 		# find out if a variable list is provided
 365 		if var == ():
 366 			var = self.data.keys()
 367 			
 368 		# removing the date variable if it is present
 369 		var = [x for x in var if x != self.date_key]
 370 
 371 		# report variable label in alphabetical order
 372 		var.sort()
 373 
 374 		# find out if a selection rule is being used
 375 		# if not, set to empty tuple
 376 		if not sel.has_key('sel'):
 377 			sel = ()
 378 		else:
 379 			sel = sel['sel']
 380 
 381 		return var, sel
 382 
 383 	def __dates_and_nobs_clean(self, var, sel):
 384 		"""
 385 		Convenience function to avoid code duplication
 386 		"""
 387 		nobs = self.nobs
 388 		if len(sel):
 389 			nobs = nobs - (nobs - sum(sel))
 390 
 391 		if self.date_key != None and self.data.has_key(self.date_key):
 392 			# selecting dates from data base
 393 			dates = self.data[self.date_key][sel]
 394 		else:
 395 			# setting date series to start on 1/1/1950
 396 			dates = range(711858,nobs+711858)
 397 
 398 		return dates, nobs
 399 
 400 	def __ext(self,fname):
 401 		"""
 402 		Finding the file extension of the filename passed to dbase
 403 		"""
 404 		return fname.split('.')[-1].strip()
 405 
 406 if __name__ == '__main__':
 407 
 408 	###################################
 409 	### usage examples of dbase class
 410 	###################################
 411 
 412 	import sys
 413 	from scipy import c_
 414 
 415 	# making a directory to store simulate data
 416 	if not os.path.exists('./dbase_test_files'): os.mkdir('./dbase_test_files')
 417 
 418 	# creating simulated data and variable labels
 419 	varnm = ['date','a','b','c']			# variable labels
 420 	nobs = 100
 421 	data =	randn(nobs,3)					# the data array
 422 	dates = pylab.num2date(arange(730493,730493+(nobs*7),7))
 423 	dates = [i.strftime('%d %b %y') for i in dates]
 424 	data = c_[dates,data]
 425 
 426 	# saving simulated data to a csv file
 427 	f = open('./dbase_test_files/data.csv','w')
 428 	writer = csv.writer(f)
 429 	writer.writerow(varnm)
 430 	writer.writerows(data)
 431 	f.close()
 432 
 433 	# loading the data from the csv file
 434 	a = dbase("./dbase_test_files/data.csv",date = 0)
 435 	# saving the dbase instance data to a pickle file
 436 	a.save("./dbase_test_files/data.pickle")
 437 	# saving the dbase data to a shelve file
 438 	### a.save("./dbase_test_files/data.she")
 439 
 440 	# loading a sub-section of the data from a shelve file
 441 	### print "\nLoading 2 variables from a shelve file\n"
 442 	### b = dbase("./dbase_test_files/data.she",'a','b',date = 'date')
 443 
 444 	# showing data and variable names, from load_shelve
 445 	### varnm, data = b.get()
 446 	### print "Variable names from shelve file\n", varnm
 447 	### print "\nData selected from shelve file\n", data
 448 	### print "\nDate series", b.data[b.date_key]
 449 	### del b		# cleaning up
 450 
 451 	# loading the object from the pickle file
 452 	print "\nLoading the dbase object from a pickle file\n"
 453 	b = dbase("./dbase_test_files/data.pickle")
 454 
 455 	# getting the name of the file you are working on
 456 	print "\nWorking on file: " + b.DBname
 457 
 458 	# showing data and variable names
 459 	varnm, data = b.get()
 460 	print "Variable names from dbase class\n", varnm
 461 	print "\nData from dbase class\n", data
 462 	print "\nDate series", b.data[b.date_key]
 463 
 464 	# viewing selected data columns
 465 	varnm, data = b.get('a','c')
 466 	print "\nTwo columns selected using variable names\n", varnm, "\n", data 
 467 
 468 	# saving to a csv file
 469 	print "\nSaving data and variable names to a different csv file\n", b.save("./dbase_test_files/data_save.csv")
 470 
 471 	# adding variables/data
 472 	x1 = b.data['a'] * b.data['b']
 473 	x2 = b.data['a'] * b.data['c']
 474 	xdict = {'x1':x1,'x2':x2}
 475 	b.data.update(xdict)				# using a dictionaries own 'add/extend method'
 476 
 477 	varnm, data = b.get()
 478 	print "\nTwo variable names added\n", varnm
 479 	print "\nTwo columns added\n", data
 480 
 481 	# using copy.deepcopy to make a complete copy of the class instance data
 482 	import copy
 483 	c = copy.deepcopy(b)
 484 
 485 	# making the database smaller, inplace, by deleting selected variables
 486 	c.delvar('a','x2')
 487 	varnm, data = c.get()
 488 	print "\nTwo variable names deleted\n", varnm
 489 	print "\nTwo columns deleted\n", data
 490 
 491 	# making the database smaller, inplace, by keeping only selected variables
 492 	c = copy.deepcopy(b)
 493 	c.keepvar('a','x2')
 494 	varnm, data = c.get()
 495 	print "\nAll but two variable names deleted\n", varnm
 496 	print "\nAll but Two columns deleted\n", data
 497 
 498 	# specifying a selection rule
 499 	sel_rule = b.data['date'] > pylab.datestr2num("8/1/2001")
 500 
 501 	# making the database smaller, inplace, by delecting selected observation
 502 	c = copy.deepcopy(b)
 503 	c.delobs(sel_rule)
 504 
 505 	varnm, data = c.get()
 506 	print "\nReduced number of observations following the selection rule\n", data
 507 
 508 	# making the database smaller, inplace, by delecting all but the selected observation
 509 	c = copy.deepcopy(b)
 510 	c.keepobs(sel_rule)
 511 
 512 	varnm, data = c.get()
 513 	print "\nReduced number of observations following the inverse of the selection rule\n", data
 514 
 515 	# making a copy of of just the dictionary for selected variables
 516 	x = b.keepvar_copy('a')
 517 
 518 	# making a copy of of just the dictionary for everything but the selected variables
 519 	x = b.delvar_copy('a')
 520 
 521 	# making a copy of of just the dictionary for selected observations
 522 	x = b.keepobs_copy(sel_rule)
 523 
 524 	# making a copy of of just the dictionary for everything but the selected observation
 525 	x = b.delobs_copy(sel_rule)
 526 
 527 	# descriptive information on the database
 528 	b.info()
 529 
 530 	# plotting series
 531 	b.dataplot(file = './dbase_test_files/full_plot.png')
 532 
 533 	# adding a trend component
 534 	b.add_trend('mytrend')				# or b.data.update({'mytrend':range(100)})
 535 
 536 	# adding a dummy
 537 	dummy_rule = b.data['a'] > 0
 538 	b.add_dummy(dummy_rule,'mydummy')	# or b.data.update({'mydummy':dummy_rule})
 539 
 540 	# add seasonal dummies, specify data frequency and # of dummies
 541 	b.add_seasonal_dummies(52,13)
 542 
 543 	# descriptive information on the database for selected variables and time periods
 544 	b.info('b','c', sel = sel_rule)
 545 
 546 	# plotting series for selected variables and selected data periods
 547 	b.dataplot('b','c', sel = sel_rule, file = './dbase_test_files/partial_plot.png')
New Attachment

Attached Files

To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.
[get | view] (2007-01-14 23:35:05, 4.5 KB) [[attachment:data.0.3.csv]]
[get | view] (2007-01-07 18:13:31, 4.0 KB) [[attachment:data.csv]]
[get | view] (2007-01-07 18:00:31, 16.1 KB) [[attachment:dbase.0.1.py]]
[get | view] (2007-01-14 22:55:13, 5.6 KB) [[attachment:dbase.0.2.py]]
[get | view] (2007-01-14 23:31:35, 18.1 KB) [[attachment:dbase.0.3.py]]
[get | view] (2007-01-14 23:51:16, 18.4 KB) [[attachment:dbase.0.4.py]]
[get | view] (2007-01-14 23:54:23, 18.4 KB) [[attachment:dbase.0.5.py]]
[get | view] (2007-01-15 06:33:06, 18.4 KB) [[attachment:dbase.0.6.py]]
[get | view] (2007-01-19 05:53:14, 19.1 KB) [[attachment:dbase.0.7.py]]
[get | view] (2007-01-07 07:50:10, 15.8 KB) [[attachment:dbase.py]]
[get | view] (2007-01-07 07:52:21, 7.6 KB) [[attachment:dbase.pydoc]]
[get | view] (2007-01-07 18:01:44, 7.9 KB) [[attachment:dbase_pydoc.0.1.txt]]
[get | view] (2007-01-14 22:56:04, 8.2 KB) [[attachment:dbase_pydoc.0.2.txt]]
[get | view] (2007-01-07 18:02:50, 27.7 KB) [[attachment:ex_plot.0.1.png]]
[get | view] (2007-01-07 08:01:04, 87.7 KB) [[attachment:ex_plot.png]]
[get | view] (2007-01-07 08:03:54, 28.2 KB) [[attachment:ex_plot1.png]]
[get | view] (2007-01-07 07:57:21, 895.9 KB) [[attachment:example_plot.png]]
[get | view] (2007-01-07 07:51:51, 34.3 KB) [[attachment:pydoc]]