This is an archival dump of old wiki content --- see scipy.org for current material.
Please see http://scipy-cookbook.readthedocs.org/

Attachment 'dbase.0.4.py'


   1 from __future__ import division
   2 from scipy import c_, arange, array, unique, kron, ones, eye, nan, isnan, string_
   3 from numpy.random import randn
   4 import pylab, cPickle, shelve, csv, copy, os
   5 
   6 class dbase:
   7 	"""
   8 	Author: Vincent Nijs (+ ?)
   9 	Email: v-nijs at kellogg.northwestern.edu
  10 	Last Modified: Sun Jan 7 10:05:12 CST 2007
  11 		
  12 	Todo:
  13 		- Check if shelve loading/saving works
  14 	
  15 	Tested on:
  16 		- Works on Mac OS X 10.4.8, with full matplotlib (incl. pytz)
  17 		- Tested on Linux
  18 	
  19 	Dependencies:
  20 		- See import statement at the top of this file
  21 
  22 	Doc:
  23 	A simple data frame that reads and writes csv/pickle/shelve files with variable names.
  24 	Data is stored in a dictionary. 
  25 
  26 	To use the class:
  27 
  28 		>>> from dbase import dbase
  29 		>>> y = dbase('your_filename.csv')
  30 
  31 	or for a previously created dbase object stored in a pickle file
  32 
  33 		>>> from dbase import dbase
  34 		>>> y = dbase('your_filename.pickle')
  35 
  36 	or without importing the dbase class
  37 
  38 		>>> import cPickle 
  39 		>>> f = open('your_filename.pickle','rb')
  40 		>>> y = cPickle.load(f)
  41 		>>> data_key = cPickle.load(f)
  42 		>>> f.close()
  43 
  44 	or for a dictionary stored in a shelf file
  45 
  46 		>>> from dbase import dbase
  47 		>>> y = dbase('your_filename.she')
  48 
  49 	To return a list of variable names and an array of data
  50 
  51 		>>> varnm, data = y.get()
  52 	
  53 	For usage examples of other class methods see the class tests at the bottom of this file. To see the class in action
  54 	simply run the file using 'python dbase.py'. This will generate some simulated data (data.csv) and save instance data
  55 	of the class to a pickle file.
  56 	"""
  57 
  58 	def __init__(self,fname,var = (),date = ''):
  59 		"""
  60 		Initializing the dbase class. Loading file fname.
  61 
  62 		If you have a column in your csv file that is a date-string use:
  63 
  64 			>>> x = dbase('myfile.csv',date = 0)
  65 
  66 		where 0 is the index of the date column
  67 
  68 		If you have an array in your pickle file that is a date variable use:
  69 
  70 			>>> x = dbase('myfile.pickle',date = 'date')
  71 
  72 		where 'date' is the key of the date array
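
     		The optional var argument only matters when loading from a shelve file:
     		it is a tuple of variable names to read (all variables are read when it is empty).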
  73 		"""
  74 		self.load(fname,var,date)
  75 
  76 	def load(self,fname,var,date):
  77 		"""
  78 		Loading data from a csv file, or from a pickle or shelve file created by the dbase class.
  79 		If it is a csv file, pylab's load function is used; this seems much faster
  80 		than scipy.io.read_array.
  81 		"""
  82 		# setting the ascii/csv file name used for input
  83 		self.DBname = os.getcwd() + '/' + fname
  84 
  85 		# assuming self.date_key = '' unless otherwise given
  86 		self.date_key = date
  87 
  88 		# getting the file extension
  89 		fext = self.__ext(fname)
  90 
  91 		# opening the file for reading
  92 		if fext == 'csv':
  93 			f = open(fname,'r')
  94 			self.load_csv(f)
  95 			f.close()
  96 		elif fext == 'pickle':
  97 			f = open(fname,'rb')
  98 			self.load_pickle(f)
  99 			f.close()
 100 		elif fext == 'she':
 101 			self.load_shelve(fname,var)
 102 		else:
 103 			raise ValueError('This class only works on csv, pickle, and shelve files')
 104 
 105 		# specifying nobs in self.data
 106 		self.nobs = self.data[self.data.keys()[0]].shape[0]
 107 
 108 	def csvconvert(self,col):
 109 		"""
 110 		Converting data in a string array to the appropriate type
 111 		"""
 112 		# convert missing values to nan
 113 		col[col == ''] = 'nan'; col[col == '.'] = 'nan'
 114 		try: 
 115 			# if a missing value is present int variables will be up-cast to float
 116 			return col.astype('i')
 117 		except ValueError: 
 118 			try: 
 119 				return col.astype('f')
 120 			except ValueError: 
 121 				# if the data is a string, put back the empty string
 122 				col[col == 'nan'] = ''
 123 				return col
 124 
 125 	def load_csv_nf(self,f):
 126 		"""
 127 		Loading data from a csv file using the csv module. Returns a list of arrays,
 128 		possibly with different types and/or missing values.
 129 		"""
 130 		# resetting to the beginning of the file since pylab.load was already tried
 131 		f.seek(0)
 132 
 133 		reader = csv.reader(f)
 134 
 135 		# putting the data in an array of strings
 136 		datalist = array([i for i in reader])
 137 
 138 		# converting the data to an appropriate type
 139 		datalist = [self.csvconvert(datalist[1:,i]) for i in range(datalist.shape[1])]
 140 
 141 		return datalist
 142 
 143 	def load_csv(self,f):
 144 		"""
 145 		Loading data from a csv file. Uses pylab's load function. Seems much faster
 146 		than scipy.io.read_array.
 147 		"""
 148 		varnm = f.readline().split(',')
 149 
 150 		# what is the date variable's key if any, based on index passed as argument
 151 		if self.date_key != '':
 152 			try:
 153 				rawdata = pylab.load(f, delimiter=',',converters={self.date_key:pylab.datestr2num})			# don't need to 'skiprow' here
 154 			except ValueError:																				# if loading via pylab doesn't work use csv
 155 				rawdata = self.load_csv_nf(f)	
 156 
 157 				# converting the dates column to a date-number
 158 				rawdata[self.date_key] = pylab.datestr2num(rawdata[self.date_key])
 159 
 160 			self.date_key = varnm[self.date_key]
 161 		else:
 162 			try:
 163 				rawdata = pylab.load(f, delimiter=',')														# don't need to 'skiprow' here
 164 			except ValueError:																				# if loading via pylab doesn't work use csv
 165 				rawdata = self.load_csv_nf(f)	
 166 
 167 		# making sure that the variable names contain no leading or trailing spaces
 168 		varnm = [i.strip() for i in varnm]
 169 
 170 		# transforming the data into a dictionary
 171 		if type(rawdata) == list:
 172 			# if the csv module was used
 173 			self.data = dict(zip(varnm,rawdata))
 174 		else:
 175 			# if the pylab.load module was used
 176 			self.data = dict(zip(varnm,rawdata.T))
 177 
 178 	def load_pickle(self,f):
 179 		"""
 180 		Loading data from a pickle file created earlier by the dbase class.
 181 		"""
 182 		self.data = cPickle.load(f)					# loading the data dictionary
 183 
 184 		# what is the date variable's key if any
 185 		if self.date_key == '':
 186 			try:
 187 				self.date_key = cPickle.load(f)		# if nothing given assume it is in the pickle file
 188 			except:
 189 				print "No date series in pickle file"
 190 
 191 	def load_shelve(self,fname,var):
 192 		"""
 193 		Loading data from a shelve file created earlier by the dbase class.
 194 		"""
 195 		data = shelve.open(fname)				# loading the data dictionary
 196 
 197 		# find out if a variable list is provided
 198 		if var == ():
 199 			var = data.keys()
 200 
 201 		# making sure the date variable is fetched from shelve
 202 		if self.date_key != '':
 203 			if not self.date_key in var: var = list(var) + [self.date_key]
 204 
 205 		self.data = dict([(i,data[i]) for i in var])
 206 		data.close()
 207 
 208 	def save(self,fname):
 209 		"""
 210 		Dumping the class data dictionary into a csv or pickle file
 211 		"""
 212 		fext = self.__ext(fname)
 213 		if fext == 'csv':
 214 			f = open(fname,'w')
 215 			self.save_csv(f)
 216 			f.close()
 217 		elif fext == 'pickle':
 218 			f = open(fname,'wb')
 219 			self.save_pickle(f)
 220 			f.close()
 221 		elif fext == 'she':
 222 			self.save_shelve(fname)
 223 		else:
 224 			raise ValueError('This class only works on csv, pickle, and shelve files')
 225 
 226 	def save_csv(self,f):
 227 		"""
 228 		Dumping the class data dictionary into a csv file
 229 		"""
 230 		writer = csv.writer(f)
 231 		writer.writerow(self.data.keys())
 232 
 233 		data = self.data						# a reference to the data dict
 234 		if self.date_key != '':
 235 			data = dict(data)				# making a copy so the dates can be changed to strings
 236 			dates = pylab.num2date(data[self.date_key])
 237 			dates = array([i.strftime('%d %b %y') for i in dates])
 238 			data[self.date_key] = dates
 239 
 240 		writer.writerows(array(data.values()).T)
 241 
 242 	def save_pickle(self,f):
 243 		"""
 244 		Dumping the class data dictionary and date_key into a binary pickle file
 245 		"""
 246 		cPickle.dump(self.data,f,2)
 247 		cPickle.dump(self.date_key,f,2)
 248 
 249 	def save_shelve(self,fname):
 250 		"""
 251 		Dumping the class data dictionary into a shelve file
 252 		"""
 253 		f = shelve.open(fname,'c')
 254 		f.update(self.data)					# copying the data dictionary into the shelve file
 255 		f.close()
 256 
 257 	def add_trend(self,tname = 'trend'):
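     		"""
     		Adding a linear trend variable (0, 1, ..., nobs-1) to the data dictionary under the name tname
     		"""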
 258 		# making a trend based on nobs in arbitrary series in dictionary
 259 		self.data[tname] = arange(self.nobs)
 260 
 261 	def add_dummy(self,dum, dname = 'dummy'):
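     		"""
     		Adding the array dum to the data dictionary under the name dname,
     		unless a variable of that name already exists
     		"""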
 262 		if self.data.has_key(dname):
 263 			print "The variable name '" + str(dname) + "' already exists. Please select another name."
 264 		else:
 265 			self.data[dname] = dum
 266 
 267 	def add_seasonal_dummies(self,freq=52,ndum=13):
 268 		"""
 269 		This function will only work if freq and ndum 'fit', i.e., if freq is an
 270 		integer multiple of ndum. Weeks and 4-weekly periods (freq = 52, ndum = 13)
 271 		will work; weeks and months/quarters will not.
 272 		"""
 273 		if self.date_key == '':
 274 			print "Cannot create seasonal dummies since no date array is known"
 275 		else:
 276 			# list of years
 277 			years = array([pylab.num2date(i).year for i in self.data[self.date_key]])
 278 
 279 			# how many periods in does the data start
 280 			start = freq - sum(years ==	min(years))
 281 
 282 			# how many unique years
 283 			nyear = unique(years).shape[0]
 284 
 285 			# using kronecker products to make a big dummy matrix
 286 			sd = kron(ones(nyear),kron(eye(ndum),ones(int(freq/ndum)))).T
 287 			sd = sd[start:start+self.nobs]		# slicing the dummies to fit the data	
 288 			sd = dict([(("sd"+str(i+1)),sd[:,i]) for i in range(1,ndum)])
 289 			self.data.update(sd)				# adding the dummies to the main dict
 290 
 291 	def delvar(self,*var):
 292 		"""
 293 		Deleting specified variables in the data dictionary, changing dictionary in place
 294 		"""
 295 		[self.data.pop(i) for i in var]
 296 
 297 	def keepvar(self,*var):
 298 		"""
 299 		Keeping specified variables in the data dictionary, changing dictionary in place
 300 		"""
 301 		[self.data.pop(i) for i in self.data.keys() if i not in var]
 302 
 303 	def delvar_copy(self,*var):
 304 		"""
 305 		Deleting specified variables in the data dictionary, making a copy
 306 		"""
 307 		return dict([(i,self.data[i]) for i in self.data.keys() if i not in var])
 308 
 309 	def keepvar_copy(self,*var):
 310 		"""
 311 		Keeping specified variables in the data dictionary, making a copy
 312 		"""
 313 		return dict([(i,self.data[i]) for i in var])
 314 
 315 	def delobs(self,sel):
 316 		"""
 317 		Deleting specified observations, changing dictionary in place
 318 		"""
 319 		for i in self.data.keys(): self.data[i] = self.data[i][sel]
 320 
 321 		# updating the value of self.nobs
 322 		self.nobs -= sum(sel)
 323 
 324 	def keepobs(self,sel):
 325 		"""
 326 		Keeping specified observations, changing dictionary in place
 327 		"""
 328 		# updating the value of self.nobs
 329 		self.nobs -= sum(sel)
 330 
 331 		sel = (sel == False)	# inverting the boolean selection: true becomes false and vice-versa
 332 		self.delobs(sel)
 333 
 334 	def delobs_copy(self,sel):
 335 		"""
 336 		Deleting specified observations, making a copy
 337 		"""
 338 		return dict([(i,self.data[i][sel]) for i in self.data.keys()])
 339 
 340 	def keepobs_copy(self,sel):
 341 		"""
 342 		Keeping specified observations, making a copy
 343 		"""
 344 		sel = (sel == False)	# inverting the boolean selection: true becomes false and vice-versa
 345 		return self.delobs_copy(sel)
 346 
 347 	def get(self,*var,**sel):
 348 		"""
 349 		Copying data and keys of selected variables for further analysis
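
     		An optional boolean array passed as the keyword argument sel restricts the
     		observations that are returned, e.g. y.get('a','b', sel = some_rule)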
 350 		"""
 351 		# calling convenience function to clean-up input parameters
 352 		var, sel = self.__var_and_sel_clean(var, sel)
 353 
 354 		# copying the entire dictionary (= default)
 355 		d = dict((i,self.data[i][sel]) for i in var)
 356 
 357 		return d.keys(), array(d.values()).T
 358 
 359 	def info(self,*var, **adict):
 360 		"""
 361 		Printing descriptive statistics on selected variables
 362 		"""
 363 			
 364 		# calling convenience functions to clean-up input parameters
 365 		var, sel = self.__var_and_sel_clean(var, adict)
 366 		dates, nobs = self.__dates_and_nobs_clean(var, sel)
 367 			
 368 		# setting the minimum and maximum dates to be used
 369 		mindate = pylab.num2date(min(dates)).strftime('%d %b %Y')
 370 		maxdate = pylab.num2date(max(dates)).strftime('%d %b %Y')
 371 
 372 		# number of variables (excluding date if present)
 373 		nvar = len(var)
 374 
 375 		print '\n=============================================================================='
 376 		print '============================ Database information ============================'
 377 		print '==============================================================================\n'
 378 
 379 		print 'file:				%s' % self.DBname
 380 		print '# obs:				%s' % nobs
 381 		print '# variables:		%s' % nvar 
 382 		print 'Start date:			%s' % mindate
 383 		print 'End date:			%s' % maxdate
 384 
 385 		print '\nvar				min			max			mean		std.dev		miss	levels'
 386 		print '=============================================================================='
 387 		
 388 		sets = {}
 389 		for i in var:
 390 			col = self.data[i][sel];
 391 			if type(col[0]) == string_:
 392 				_miss = sum(col == '')
 393 				col_set = set(col)
 394 				sets[i] = col_set
 395 				print '''%-5s			%-5s		%-5s		%-5s		%-5s		% -5.0f	%-5i''' % tuple([i,'-','-','-','-',_miss,len(col_set)]) 
 396 			else:
 397 				_miss = isnan(col); col = col[_miss == False]; _min = col.min(); _max = col.max(); _mean = col.mean(); _std = col.std()
 398 				print '''% -5s			% -5.2f		% -5.2f		% -5.2f		% -5.2f		% -5.0f''' % tuple([i,_min,_max,_mean,_std,sum(_miss)]) 
 399 
 400 		if sets:
 401 			print '\n\nLevels for non-numeric data:'
 402 			for i in sets.keys():
 403 				print '=============================================================================='
 404 				print '''% -5s	% -5s''' % tuple([i,sets[i]])
 405 	
 406 	def dataplot(self,*var, **adict):
 407 		"""
 408 		Plotting the data with variable names
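     		Non-numeric variables are skipped. Passing file = 'name.png' as a keyword
     		argument also saves the figure to disk.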
 409 		"""
 410 		# calling convenience functions to clean-up input parameters
 411 		var, sel = self.__var_and_sel_clean(var, adict)
 412 		dates, nobs = self.__dates_and_nobs_clean(var, sel)
 413 
 414 		# don't try to plot non-numerical variables
 415 		nvar = []
 416 		for i in var:
 417 			col = self.data[i][sel]
 418 			if type(col[0]) != string_:
 419 				pylab.plot_date(dates,self.data[i][sel],'o-') 
 420 				nvar = nvar + [i]
 421 
 422 		pylab.xlabel("Time (n = " + str(nobs) + ")") 
 423 		pylab.title("Data plot of " + self.DBname)
 424 		pylab.legend(nvar)
 425 		if adict.has_key('file'):
 426 			pylab.savefig(adict['file'],dpi=600)
 427 		pylab.show()
 428 
 429 	def __var_and_sel_clean(self, var, sel, dates_needed = True):
 430 		"""
 431 		Convenience function to avoid code duplication
 432 		"""
 433 		# find out if a variable list is provided
 434 		if var == ():
 435 			var = self.data.keys()
 436 			
 437 		# removing the date variable if it is present
 438 		var = [x for x in var if x != self.date_key]
 439 
 440 		# report variable label in alphabetical order
 441 		var.sort()
 442 
 443 		# find out if a selection rule is being used
 444 		# if not, set to empty tuple
 445 		if not sel.has_key('sel'):
 446 			sel = ()
 447 		else:
 448 			sel = sel['sel']
 449 
 450 		return var, sel
 451 
 452 	def __dates_and_nobs_clean(self, var, sel):
 453 		"""
 454 		Convenience function to avoid code duplication
 455 		"""
 456 		nobs = self.nobs
 457 		if len(sel):
 458 			nobs = nobs - (nobs - sum(sel))
 459 
 460 		if self.date_key != None and self.data.has_key(self.date_key):
 461 			# selecting dates from data base
 462 			dates = self.data[self.date_key][sel]
 463 		else:
 464 			# setting date series to start on 1/1/1950
 465 			dates = range(711858,nobs+711858)
 466 
 467 		return dates, nobs
 468 
 469 	def __ext(self,fname):
 470 		"""
 471 		Finding the file extension of the filename passed to dbase
 472 		"""
 473 		return fname.split('.')[-1].strip()
 474 
 475 if __name__ == '__main__':
 476 
 477 	###################################
 478 	### usage examples of dbase class
 479 	###################################
 480 
 481 	import sys
 482 	from scipy import c_
 483 
 484 	# making a directory to store simulated data
 485 	if not os.path.exists('./dbase_test_files'): os.mkdir('./dbase_test_files')
 486 
 487 	# creating simulated data and variable labels
 488 	varnm = ['date','a','b','c']			# variable labels
 489 	nobs = 100
 490 	data =	randn(nobs,3)					# the data array
 491 	dates = pylab.num2date(arange(730493,730493+(nobs*7),7))
 492 	dates = [i.strftime('%d %b %y') for i in dates]
 493 	data = c_[dates,data]
 494 
 495 	# adding a few missing values
 496 	data[5,1] = ''
 497 	data[9,3] = ''
 498 
 499 	# adding a non-numeric variable
 500 	varnm = varnm + ['id']		# variable labels
 501 	id = [('id'+str(i)) for i in range(nobs)]
 502 	id[8] = ''		# setting one id to missing
 503 	data = c_[data,id]
 504 
 505 	# saving simulated data to a csv file
 506 	f = open('./dbase_test_files/data.csv','w')
 507 	writer = csv.writer(f)
 508 	writer.writerow(varnm)
 509 	writer.writerows(data)
 510 	f.close()
 511 
 512 	# loading the data from the csv file
 513 	a = dbase("./dbase_test_files/data.csv",date = 0)
 514 
 515 	# saving the dbase instance data to a pickle file
 516 	a.save("./dbase_test_files/data.pickle")
 517 	# saving the dbase data to a shelve file
 518 	### a.save("./dbase_test_files/data.she")
 519 
 520 	# loading a sub-section of the data from a shelve file
 521 	### print "\nLoading 2 variables from a shelve file\n"
 522 	### b = dbase("./dbase_test_files/data.she",'a','b',date = 'date')
 523 
 524 	# showing data and variable names, from load_shelve
 525 	### varnm, data = b.get()
 526 	### print "Variable names from shelve file\n", varnm
 527 	### print "\nData selected from shelve file\n", data
 528 	### print "\nDate series", b.data[b.date_key]
 529 	### del b		# cleaning up
 530 
 531 	# loading the object from the pickle file
 532 	print "\nLoading the dbase object from a pickle file\n"
 533 	b = dbase("./dbase_test_files/data.pickle")
 534 
 535 	# getting the name of the file you are working on
 536 	print "\nWorking on file: " + b.DBname
 537 
 538 	# showing data and variable names
 539 	varnm, data = b.get()
 540 	print "Variable names from dbase class\n", varnm
 541 	print "\nData from dbase class\n", data
 542 	print "\nDate series", b.data[b.date_key]
 543 
 544 	# viewing selected data columns
 545 	varnm, data = b.get('a','c')
 546 	print "\nTwo columns selected using variable names\n", varnm, "\n", data 
 547 
 548 	# saving to a csv file
 549 	print "\nSaving data and variable names to a different csv file\n"; b.save("./dbase_test_files/data_save.csv")
 550 
 551 	# adding variables/data
 552 	x1 = b.data['a'] * b.data['b']
 553 	x2 = b.data['a'] * b.data['c']
 554 	xdict = {'x1':x1,'x2':x2}
 555 	b.data.update(xdict)				# using the dictionary's own update method to add the new columns
 556 
 557 	varnm, data = b.get()
 558 	print "\nTwo variable names added\n", varnm
 559 	print "\nTwo columns added\n", data
 560 
 561 	# using copy.deepcopy to make a complete copy of the class instance data
 562 	import copy
 563 	c = copy.deepcopy(b)
 564 
 565 	# making the database smaller, inplace, by deleting selected variables
 566 	c.delvar('a','x2')
 567 	varnm, data = c.get()
 568 	print "\nTwo variable names deleted\n", varnm
 569 	print "\nTwo columns deleted\n", data
 570 
 571 	# making the database smaller, inplace, by keeping only selected variables
 572 	c = copy.deepcopy(b)
 573 	c.keepvar('a','x2')
 574 	varnm, data = c.get()
 575 	print "\nAll but two variable names deleted\n", varnm
 576 	print "\nAll but two columns deleted\n", data
 577 
 578 	# specifying a selection rule
 579 	sel_rule = b.data['date'] > pylab.datestr2num("8/1/2001")
 580 
 581 	# making the database smaller, inplace, by deleting the selected observations
 582 	c = copy.deepcopy(b)
 583 	c.delobs(sel_rule)
 584 
 585 	varnm, data = c.get()
 586 	print "\nReduced number of observations following the selection rule\n", data
 587 
 588 	# making the database smaller, inplace, by deleting all but the selected observations
 589 	c = copy.deepcopy(b)
 590 	c.keepobs(sel_rule)
 591 
 592 	varnm, data = c.get()
 593 	print "\nReduced number of observations following the inverse of the selection rule\n", data
 594 
 595 	# making a copy of just the dictionary for the selected variables
 596 	x = b.keepvar_copy('a')
 597 
 598 	# making a copy of just the dictionary for everything but the selected variables
 599 	x = b.delvar_copy('a')
 600 
 601 	# making a copy of just the dictionary for the selected observations
 602 	x = b.keepobs_copy(sel_rule)
 603 
 604 	# making a copy of just the dictionary for everything but the selected observations
 605 	x = b.delobs_copy(sel_rule)
 606 
 607 	# descriptive information on the database
 608 	b.info()
 609 
 610 	# plotting series
 611 	### b.dataplot(file = './dbase_test_files/full_plot.png')
 612 
 613 	# adding a trend component
 614 	b.add_trend('mytrend')				# or b.data.update({'mytrend':range(100)})
 615 
 616 	# adding a dummy
 617 	dummy_rule = b.data['a'] > 0
 618 	b.add_dummy(dummy_rule,'mydummy')	# or b.data.update({'mydummy':dummy_rule})
 619 
 620 	# add seasonal dummies, specify data frequency and # of dummies
 621 	b.add_seasonal_dummies(52,13)
 622 
 623 	# descriptive information on the database for selected variables and time periods
 624 	b.info('b','c','mydummy', sel = sel_rule)
 625 
 626 	# plotting series for selected variables and selected data periods
 627 	### b.dataplot('b','c','mydummy', sel = sel_rule, file = './dbase_test_files/partial_plot.png')
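
For quick reference, here is a minimal sketch of how the attached module might be driven from a separate script; it only condenses the usage already shown in the class docstring and the __main__ block above. The file names dbase.py and mydata.csv are hypothetical, and mydata.csv is assumed to hold a date string in its first column.

	# minimal usage sketch (hypothetical file names), assuming this attachment is saved as dbase.py
	from dbase import dbase

	y = dbase('mydata.csv', date = 0)		# column 0 holds the date strings
	varnm, data = y.get()					# variable names and data array
	y.info()								# descriptive statistics
	y.save('mydata.pickle')					# round-trip through a pickle file
	z = dbase('mydata.pickle')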
