attachment:dbase.py of Cookbook/dbase

Attachment 'dbase.py'

   1 from scipy import c_, arange, array, unique, kron, ones, eye
   2 from numpy.random import randn
   3 from __future__ import division
   4 import pylab, cPickle, shelve, csv, copy, os
   5 
   6 class dbase:
   7 	"""
   8 	TODO:
   9 		- Check if shelve loading/saving works
  10 		- Only tested on Mac OS X 10.4.8, with pytz
  11 
  12 	DOC:
  13 	A simple data-frame, that reads and writes csv/pickle/shelve files with variable names.
  14 	Data is stored in a dictionary. 
  15 
  16 	To use the class:
  17 
  18 		>>> from dbase import dbase
  19 		>>> y = dbase('your_filename.csv')
  20 
  21 	or for a previously created dbase object stored in a pickle file
  22 
  23 		>>> from dbase import dbase
  24 		>>> y = dbase('your_filename.pickle')
  25 
  26 		or without importing the dbase class
  27 
  28 		>>> import cPickle 
  29 		>>> f = open('your_filename.pickle','rb')
  30 		>>> y = cPickle.load(f)
  31 		>>> data_key = cPickle.load(f)
  32 		>>> f.close()
  33 
  34 	or for a dictionary stored in a shelf file
  35 
  36 		>>> from dbase import dbase
  37 		>>> y = dbase('your_filename.pickle')
  38 
  39 	To return a list of variable names and an array of data
  40 
  41 		>>> varnm, data = y.get()
  42 	
  43 	For usage examples of other class methods see the class tests at the bottom of this file. To see the class in action
  44 	simply run the file using 'python dbase.py'. This will generate some simulated data (data.csv) and save instance data
  45 	of the class to a pickle file.
  46 	"""
  47 
  48 	def __init__(self,fname,*var,**date):
  49 		"""
  50 		Initializing the dbase class. Loading file fname.
  51 
  52 		If you have have a column in your csv file that is a date-string use:
  53 
  54 			>>> x = dbase('myfile.csv',date = 0)
  55 
  56 		where 0 is the index of the date column
  57 
  58 		If you have have an array in your pickle file that is a date variable use:
  59 
  60 			>>> x = dbase('myfile.pickle',date = 'date')
  61 
  62 		where 'date' is the key of the date array
  63 		"""
  64 		self.load(fname,var,date)
  65 
  66 	def load(self,fname,var,date):
  67 		"""
  68 		Loading data from a csv or a pickle file of the dbase class.
  69 		If this is csv file use pylab's load function. Seems much faster
  70 		than scipy.io.read_array.
  71 		"""
  72 		# setting the ascii/csv file name used for input
  73 		self.DBname = os.getcwd() + '/' + fname
  74 
  75 		# assuming self.date_key = None unless otherwise given
  76 		self.date_key = date.values()
  77 
  78 		# getting the file extension
  79 		fext = self.__ext(fname)
  80 
  81 		# opening the file for reading
  82 		if fext == 'csv':
  83 			f = open(fname,'r')
  84 			self.load_csv(f)
  85 			f.close()
  86 		elif fext == 'pickle':
  87 			f = open(fname,'rb')
  88 			self.load_pickle(f)
  89 			f.close()
  90 		elif fext == 'she':
  91 			self.load_shelve(fname,var)
  92 		else:
  93 			raise 'This class only works on csv, pickle, and shelve files'
  94 
  95 		# specifying nobs in self.data
  96 		self.nobs = self.data[self.data.keys()[0]].shape[0]
  97 
  98 	def load_csv(self,f):
  99 		"""
 100 		Loading data from a csv file. Uses pylab's load function. Seems much faster
 101 		than scipy.io.read_array.
 102 		"""
 103 		varnm = f.readline().split(',')
 104 
 105 		# what is the date variable's key if any, based on index passed as argument
 106 		if self.date_key != []:
 107 			rawdata = pylab.load(f, delimiter=',',converters={self.date_key[0]:pylab.datestr2num})			# don't need to 'skiprow' here
 108 			self.date_key = varnm[self.date_key[0]]
 109 		else:
 110 			rawdata = pylab.load(f, delimiter=',')															# don't need to 'skiprow' here
 111 
 112 		# making sure that the variable names contain no leading or trailing spaces
 113 		varnm = [i.strip() for i in varnm]
 114 
 115 		# transforming the data into a dictionary
 116 		self.data = dict(zip(varnm,rawdata.T))
 117 
 118 	def load_pickle(self,f):
 119 		"""
 120 		Loading data from a created earlier using the the dbase class.
 121 		"""
 122 		self.data = cPickle.load(f)					# loading the data dictionary
 123 
 124 		# what is the date variable's key if any
 125 		if self.date_key == []:
 126 			try:
 127 				self.date_key = cPickle.load(f)		# if nothing given assume it is in the pickle file
 128 			except:
 129 				print "No date series in pickle file"
 130 		else:
 131 			self.date_key = self.date_key[0]		# assumes formatting using pylab.datestr2num already applied
 132 
 133 	def load_shelve(self,fname,var):
 134 		"""
 135 		Loading data from a created earlier using the the dbase class.
 136 		"""
 137 		data = shelve.open(fname)				# loading the data dictionary
 138 
 139 		# find out if a variable list is provided
 140 		if var == ():
 141 			var = data.keys()
 142 
 143 		# making sure the date variable is fetched from shelve
 144 		if self.date_key != []:
 145 			if not self.date_key[0] in var: var = var + self.date_key
 146 			self.date_key = self.date_key[0]		# assumes formatting using pylab.datestr2num already applied
 147 
 148 		self.data = dict([(i,data[i]) for i in var])
 149 		data.close()
 150 
 151 	def save(self,fname):
 152 		"""
 153 		Dumping the class data dictionary into a csv or pickle file
 154 		"""
 155 		fext = self.__ext(fname)
 156 		if fext == 'csv':
 157 			f = open(fname,'w')
 158 			self.save_csv(f)
 159 			f.close()
 160 		elif fext == 'pickle':
 161 			f = open(fname,'wb')
 162 			self.save_pickle(f)
 163 			f.close()
 164 		elif fext == 'she':
 165 			self.save_shelve(fname)
 166 		else:
 167 			raise 'This class only works on csv, pickle, and shelve files'
 168 
 169 	def save_csv(self,f):
 170 		"""
 171 		Dumping the class data dictionary into a csv file
 172 		"""
 173 		writer = csv.writer(f)
 174 		writer.writerow(self.data.keys())
 175 
 176 		data = self.data						# a reference to the data dict
 177 		if self.date_key != []:
 178 			data = dict(data)				# making a copy so the dates can be changed to strings
 179 			dates = pylab.num2date(data[self.date_key])
 180 			dates = array([i.strftime('%d %b %y') for i in dates])
 181 			data[self.date_key] = dates
 182 
 183 		writer.writerows(array(data.values()).T)
 184 
 185 	def save_pickle(self,f):
 186 		"""
 187 		Dumping the class data dictionary and date_key into a binary pickle file
 188 		"""
 189 		cPickle.dump(self.data,f,2)
 190 		cPickle.dump(self.date_key,f,2)
 191 
 192 	def save_shelve(self,fname):
 193 		"""
 194 		Dumping the class data dictionary into a shelve file
 195 		"""
 196 		f = shelve.open('data.she','c') 
 197 		f = self.data
 198 		f.close()
 199 
 200 	def add_trend(self,tname = 'trend'):
 201 		# making a trend based on nobs in arbitrary series in dictionary
 202 		self.data[tname] = arange(self.nobs)
 203 
 204 	def add_dummy(self,dum, dname = 'dummy'):
 205 		if self.data.has_key(dname):
 206 			print "The variable name '" + str(dname) + "' already exists. Please select another name."
 207 		else:
 208 			self.data[dname] = dum
 209 
 210 	def add_seasonal_dummies(self,freq=52,ndum=13):
 211 		"""
 212 		This function will only work if the freq and ndum 'fit. That is,
 213 		weeks and 4-weekly periods will work. Weeks and months/quarters
 214 		will not.
 215 		"""
 216 		if self.date_key == []:
 217 			print "Cannot create seasonal dummies since no date array is known"
 218 		else:
 219 			# list of years
 220 			years = array([pylab.num2date(i).year for i in self.data[self.date_key]])
 221 
 222 			# how many periods in does the data start
 223 			start = freq - sum(years ==	min(years))
 224 
 225 			# how many unique years
 226 			nyear = unique(years).shape[0]
 227 
 228 			# using kronecker products to make a big dummy matrix
 229 			sd = kron(ones(nyear),kron(eye(ndum),ones(freq/ndum))).T;
 230 			sd = sd[start:start+self.nobs]		# slicing the dummies to fit the data	
 231 			sd = dict([(("sd"+str(i+1)),sd[:,i]) for i in range(1,ndum)])
 232 			self.data.update(sd)				# adding the dummies to the main dict
 233 
 234 	def delvar(self,*var):
 235 		"""
 236 		Deleting specified variables in the data dictionary, changing dictionary in place
 237 		"""
 238 		[self.data.pop(i) for i in var]
 239 
 240 	def keepvar(self,*var):
 241 		"""
 242 		Keeping specified variables in the data dictionary, changing dictionary in place
 243 		"""
 244 		[self.data.pop(i) for i in self.data.keys() if i not in var]
 245 
 246 	def delvar_copy(self,*var):
 247 		"""
 248 		Deleting specified variables in the data dictionary, making a copy
 249 		"""
 250 		return dict([(i,self.data[i]) for i in self.data.keys() if i not in var])
 251 
 252 	def keepvar_copy(self,*var):
 253 		"""
 254 		Keeping specified variables in the data dictionary, making a copy
 255 		"""
 256 		return dict([(i,self.data[i]) for i in var])
 257 
 258 	def delobs(self,sel):
 259 		"""
 260 		Deleting specified observations, changing dictionary in place
 261 		"""
 262 		for i in self.data.keys(): self.data[i] = self.data[i][sel]
 263 
 264 		# updating the value of self.nobs
 265 		self.nobs -= sum(sel)
 266 
 267 	def keepobs(self,sel):
 268 		"""
 269 		Keeping specified observations, changing dictionary in place
 270 		"""
 271 		# updating the value of self.nobs
 272 		self.nobs -= sum(sel)
 273 
 274 		sel -= 1				# making true, false and vice-versa
 275 		self.delobs(sel)
 276 
 277 	def delobs_copy(self,sel):
 278 		"""
 279 		Deleting specified observations, making a copy
 280 		"""
 281 		return dict([(i,self.data[i][sel]) for i in self.data.keys()])
 282 
 283 	def keepobs_copy(self,sel):
 284 		"""
 285 		Keeping specified observations, making a copy
 286 		"""
 287 		sel -= 1				# making true, false and vice-versa
 288 		self.delobs_copy(sel)
 289 
 290 	def get(self,*var,**sel):
 291 		"""
 292 		Copying data and keys of selected variables for further analysis
 293 		"""
 294 		# calling convenience function to clean-up input parameters
 295 		var, sel = self.__var_and_sel_clean(var, sel)
 296 
 297 		# copying the entire dictionary (= default)
 298 		d = dict((i,self.data[i][sel]) for i in var)
 299 
 300 		return d.keys(), array(d.values()).T
 301 
 302 	def info(self,*var, **adict):
 303 		"""
 304 		Printing descriptive statistics on selected variables
 305 		"""
 306 			
 307 		# calling convenience functions to clean-up input parameters
 308 		var, sel = self.__var_and_sel_clean(var, adict)
 309 		dates, nobs = self.__dates_and_nobs_clean(var, sel)
 310 			
 311 		# setting the minimum and maximum dates to be used
 312 		mindate = pylab.num2date(min(dates)).strftime('%d %b %Y')
 313 		maxdate = pylab.num2date(max(dates)).strftime('%d %b %Y')
 314 
 315 		# number of variables (excluding date if present)
 316 		nvar = len(var)
 317 
 318 		print '\n=============================================================='
 319 		print '==================== Database information ===================='
 320 		print '==============================================================\n'
 321 
 322 		print 'file:				%s' % b.DBname
 323 		print '# obs:				%s' % nobs
 324 		print '# variables:		%s' % nvar 
 325 		print "Start date:			%s" % mindate
 326 		print "End date:			%s" % maxdate
 327 
 328 		print '\nvar				min			max			mean		std.dev'
 329 		print '=============================================================='
 330 		
 331 		for i in var:
 332 			_min = self.data[i][sel].min(); _max = self.data[i][sel].max(); _mean = self.data[i][sel].mean(); _std = self.data[i][sel].std()
 333 			print '''%-5s			%-5.2f		%-5.2f		%-5.2f		%-5.2f''' % tuple([i,_min,_max,_mean,_std]) 
 334 	
 335 	def dataplot(self,*var, **adict):
 336 		"""
 337 		Plotting the data with variable names
 338 		"""
 339 		# calling convenience functions to clean-up input parameters
 340 		var, sel = self.__var_and_sel_clean(var, adict)
 341 		dates, nobs = self.__dates_and_nobs_clean(var, sel)
 342 
 343 		for i in var:
 344 			pylab.plot_date(dates,self.data[i][sel],'o-') 
 345 
 346 		pylab.xlabel("Time (n = " + str(nobs) + ")") 
 347 		pylab.title("Data plot of " + self.DBname)
 348 		pylab.legend(var)
 349 		if adict.has_key('file'):
 350 			pylab.savefig(adict['file'],dpi=600)
 351 		pylab.show()
 352 
 353 	def __var_and_sel_clean(self, var, sel, dates_needed = True):
 354 		"""
 355 		Convenience function to avoid code duplication
 356 		"""
 357 		# find out if a variable list is provided
 358 		if var == ():
 359 			var = self.data.keys()
 360 			
 361 		# removing the date variable if it is present
 362 		var = [x for x in var if x != self.date_key]
 363 
 364 		# report variable label in alphabetical order
 365 		var.sort()
 366 
 367 		# find out if a selection rule is being used
 368 		# if not, set to empty tuple
 369 		if not sel.has_key('sel'):
 370 			sel = ()
 371 		else:
 372 			sel = sel['sel']
 373 
 374 		return var, sel
 375 
 376 	def __dates_and_nobs_clean(self, var, sel):
 377 		"""
 378 		Convenience function to avoid code duplication
 379 		"""
 380 		nobs = self.nobs
 381 		if len(sel):
 382 			nobs = nobs - (nobs - sum(sel))
 383 
 384 		if self.date_key != None and self.data.has_key(self.date_key):
 385 			# selecting dates from data base
 386 			dates = self.data[self.date_key][sel]
 387 		else:
 388 			# setting date series to start on 1/1/1950
 389 			dates = range(711858,nobs+711858)
 390 
 391 		return dates, nobs
 392 
 393 	def __ext(self,fname):
 394 		"""
 395 		Finding the file extension of the filename passed to dbase
 396 		"""
 397 		return fname.split('.')[-1].strip()
 398 
 399 if __name__ == '__main__':
 400 
 401 	########################
 402 	### Testing dbase class
 403 	########################
 404 
 405 	import sys
 406 	from scipy import c_
 407 
 408 	# making a directory to store simulate data
 409 	if not os.path.exists('./dbase_test_files'): os.mkdir('./dbase_test_files')
 410 
 411 	# creating simulated data and variable labels
 412 	varnm = ['date','a','b','c']			# variable labels
 413 	nobs = 100
 414 	data =	randn(nobs,3)					# the data array
 415 	dates = pylab.num2date(arange(730493,730493+(nobs*7),7))
 416 	dates = [i.strftime('%d %b %y') for i in dates]
 417 	data = c_[dates,data]
 418 
 419 	# saving simulated data to a csv file
 420 	f = open('./dbase_test_files/data.csv','w')
 421 	writer = csv.writer(f)
 422 	writer.writerow(varnm)
 423 	writer.writerows(data)
 424 	f.close()
 425 
 426 	# loading the data from the csv file
 427 	a = dbase("./dbase_test_files/data.csv",date = 0)
 428 	# saving the dbase instance data to a pickle file
 429 	a.save("./dbase_test_files/data.pickle")
 430 	# saving the dbase data to a shelve file
 431 	### a.save("./dbase_test_files/data.she")
 432 
 433 	# loading a sub-section of the data from a shelve file
 434 	### print "\nLoading 2 variables from a shelve file\n"
 435 	### b = dbase("./dbase_test_files/data.she",'a','b',date = 'date')
 436 
 437 	# showing data and variable names, from load_shelve
 438 	### varnm, data = b.get()
 439 	### print "Variable names from shelve file\n", varnm
 440 	### print "\nData selected from shelve file\n", data
 441 	### print "\nDate series", b.data[b.date_key]
 442 	### del b		# cleaning up
 443 
 444 	# loading the object from the pickle file
 445 	print "\nLoading the dbase object from a pickle file\n"
 446 	b = dbase("./dbase_test_files/data.pickle")
 447 
 448 	# getting the name of the file you are working on
 449 	print "\nWorking on file: " + b.DBname
 450 
 451 	# showing data and variable names
 452 	varnm, data = b.get()
 453 	print "Variable names from dbase class\n", varnm
 454 	print "\nData from dbase class\n", data
 455 	print "\nDate series", b.data[b.date_key]
 456 
 457 	# viewing selected data columns
 458 	varnm, data = b.get('a','c')
 459 	print "\nTwo columns selected using variable names\n", varnm, "\n", data 
 460 
 461 	# saving to a csv file
 462 	print "\nSaving data and variable names to a different csv file\n", b.save("./dbase_test_files/data_save.csv")
 463 
 464 	# adding variables/data
 465 	x1 = b.data['a'] * b.data['b']
 466 	x2 = b.data['a'] * b.data['c']
 467 	xdict = {'x1':x1,'x2':x2}
 468 	b.data.update(xdict)				# using a dictionaries own 'add/extend method'
 469 
 470 	varnm, data = b.get()
 471 	print "\nTwo variable names added\n", varnm
 472 	print "\nTwo columns added\n", data
 473 
 474 	# using copy.deepcopy to make a complete copy of the class instance data
 475 	import copy
 476 	c = copy.deepcopy(b)
 477 
 478 	# making the database smaller, inplace, by deleting selected variables
 479 	c.delvar('a','x2')
 480 	varnm, data = c.get()
 481 	print "\nTwo variable names deleted\n", varnm
 482 	print "\nTwo columns deleted\n", data
 483 
 484 	# making the database smaller, inplace, by keeping only selected variables
 485 	c = copy.deepcopy(b)
 486 	c.keepvar('a','x2')
 487 	varnm, data = c.get()
 488 	print "\nAll but two variable names deleted\n", varnm
 489 	print "\nAll but Two columns deleted\n", data
 490 
 491 	# specifying a selection rule
 492 	sel_rule = b.data['date'] > pylab.datestr2num("8/1/2001")
 493 
 494 	# making the database smaller, inplace, by delecting selected observation
 495 	c = copy.deepcopy(b)
 496 	c.delobs(sel_rule)
 497 
 498 	varnm, data = c.get()
 499 	print "\nReduced number of observations following the selection rule\n", data
 500 
 501 	# making the database smaller, inplace, by delecting all but the selected observation
 502 	c = copy.deepcopy(b)
 503 	c.keepobs(sel_rule)
 504 
 505 	varnm, data = c.get()
 506 	print "\nReduced number of observations following the inverse of the selection rule\n", data
 507 
 508 	# making a copy of of just the dictionary for selected variables
 509 	x = b.keepvar_copy('a')
 510 
 511 	# making a copy of of just the dictionary for everything but the selected variables
 512 	x = b.delvar_copy('a')
 513 
 514 	# making a copy of of just the dictionary for selected observations
 515 	x = b.keepobs_copy(sel_rule)
 516 
 517 	# making a copy of of just the dictionary for everything but the selected observation
 518 	x = b.delobs_copy(sel_rule)
 519 
 520 	# descriptive information on the database
 521 	b.info()
 522 
 523 	# plotting series
 524 	b.dataplot(file = './dbase_test_files/full_plot.png')
 525 
 526 	# adding a trend component
 527 	b.add_trend('mytrend')			# or b.data.update({'mytrend':range(100)})
 528 
 529 	# adding a dummy
 530 	dummy_rule = b.data['a'] > 0
 531 	b.add_dummy(dummy_rule,'mydummy')			# or b.data.update({'mydummy':dummy_rule})
 532 
 533 	# add seasonal dummies
 534 	b.add_seasonal_dummies(52,13)
 535 
 536 	# descriptive information on the database for selected time period
 537 	b.info('b','c', sel = sel_rule)
 538 
 539 	# plotting series
 540 	b.dataplot('b','c', sel = sel_rule, file = './dbase_test_files/partial_plot.png')
New Attachment

Attached Files

To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.
[get | view] (2007-01-14 23:35:05, 4.5 KB) [[attachment:data.0.3.csv]]
[get | view] (2007-01-07 18:13:31, 4.0 KB) [[attachment:data.csv]]
[get | view] (2007-01-07 18:00:31, 16.1 KB) [[attachment:dbase.0.1.py]]
[get | view] (2007-01-14 22:55:13, 5.6 KB) [[attachment:dbase.0.2.py]]
[get | view] (2007-01-14 23:31:35, 18.1 KB) [[attachment:dbase.0.3.py]]
[get | view] (2007-01-14 23:51:16, 18.4 KB) [[attachment:dbase.0.4.py]]
[get | view] (2007-01-14 23:54:23, 18.4 KB) [[attachment:dbase.0.5.py]]
[get | view] (2007-01-15 06:33:06, 18.4 KB) [[attachment:dbase.0.6.py]]
[get | view] (2007-01-19 05:53:14, 19.1 KB) [[attachment:dbase.0.7.py]]
[get | view] (2007-01-07 07:50:10, 15.8 KB) [[attachment:dbase.py]]
[get | view] (2007-01-07 07:52:21, 7.6 KB) [[attachment:dbase.pydoc]]
[get | view] (2007-01-07 18:01:44, 7.9 KB) [[attachment:dbase_pydoc.0.1.txt]]
[get | view] (2007-01-14 22:56:04, 8.2 KB) [[attachment:dbase_pydoc.0.2.txt]]
[get | view] (2007-01-07 18:02:50, 27.7 KB) [[attachment:ex_plot.0.1.png]]
[get | view] (2007-01-07 08:01:04, 87.7 KB) [[attachment:ex_plot.png]]
[get | view] (2007-01-07 08:03:54, 28.2 KB) [[attachment:ex_plot1.png]]
[get | view] (2007-01-07 07:57:21, 895.9 KB) [[attachment:example_plot.png]]
[get | view] (2007-01-07 07:51:51, 34.3 KB) [[attachment:pydoc]]