This is an archival dump of old wiki content --- see scipy.org for current material.
Please see http://scipy-cookbook.readthedocs.org/

Attachment 'dbase.0.3.py'

Download

   1 from scipy import c_, arange, array, unique, kron, ones, eye, nan, isnan, string_
   2 from numpy.random import randn
   3 from __future__ import division
   4 import pylab, cPickle, shelve, csv, copy, os
   5 
   6 class dbase:
   7 	"""
   8 	Author: Vincent Nijs (+ ?)
   9 	Email: v-nijs at kellogg.northwestern.edu
  10 	Last Modified: Sun Jan 7 10:05:12 CST 2007
  11 		
  12 	Todo:
  13 		- Check if shelve loading/saving works
  14 	
  15 	Tested on:
  16 		- Works on Mac OS X 10.4.8, with full matplotlib (incl. pytz)
  17 		- Tested on Linux
  18 	
  19 	Dependencies:
  20 		- See import statement at the top of this file
  21 
  22 	Doc:
  23 	A simple data-frame, that reads and writes csv/pickle/shelve files with variable names.
  24 	Data is stored in a dictionary. 
  25 
  26 	To use the class:
  27 
  28 		>>> from dbase import dbase
  29 		>>> y = dbase('your_filename.csv')
  30 
  31 	or for a previously created dbase object stored in a pickle file
  32 
  33 		>>> from dbase import dbase
  34 		>>> y = dbase('your_filename.pickle')
  35 
  36 		or without importing the dbase class
  37 
  38 		>>> import cPickle 
  39 		>>> f = open('your_filename.pickle','rb')
  40 		>>> y = cPickle.load(f)
  41 		>>> data_key = cPickle.load(f)
  42 		>>> f.close()
  43 
  44 	or for a dictionary stored in a shelf file
  45 
  46 		>>> from dbase import dbase
  47 		>>> y = dbase('your_filename.pickle')
  48 
  49 	To return a list of variable names and an array of data
  50 
  51 		>>> varnm, data = y.get()
  52 	
  53 	For usage examples of other class methods see the class tests at the bottom of this file. To see the class in action
  54 	simply run the file using 'python dbase.py'. This will generate some simulated data (data.csv) and save instance data
  55 	of the class to a pickle file.
  56 	"""
  57 
  58 	def __init__(self,fname,var = (),date = ''):
  59 		"""
  60 		Initializing the dbase class. Loading file fname.
  61 
  62 		If you have have a column in your csv file that is a date-string use:
  63 
  64 			>>> x = dbase('myfile.csv',date = 0)
  65 
  66 		where 0 is the index of the date column
  67 
  68 		If you have have an array in your pickle file that is a date variable use:
  69 
  70 			>>> x = dbase('myfile.pickle',date = 'date')
  71 
  72 		where 'date' is the key of the date array
  73 		"""
  74 		self.load(fname,var,date)
  75 
  76 	def load(self,fname,var,date):
  77 		"""
  78 		Loading data from a csv or a pickle file of the dbase class.
  79 		If this is csv file use pylab's load function. Seems much faster
  80 		than scipy.io.read_array.
  81 		"""
  82 		# setting the ascii/csv file name used for input
  83 		self.DBname = os.getcwd() + '/' + fname
  84 
  85 		# assuming self.date_key = '' unless otherwise given
  86 		self.date_key = date
  87 
  88 		# getting the file extension
  89 		fext = self.__ext(fname)
  90 
  91 		# opening the file for reading
  92 		if fext == 'csv':
  93 			f = open(fname,'r')
  94 			self.load_csv(f)
  95 			f.close()
  96 		elif fext == 'pickle':
  97 			f = open(fname,'rb')
  98 			self.load_pickle(f)
  99 			f.close()
 100 		elif fext == 'she':
 101 			self.load_shelve(fname,var)
 102 		else:
 103 			raise 'This class only works on csv, pickle, and shelve files'
 104 
 105 		# specifying nobs in self.data
 106 		self.nobs = self.data[self.data.keys()[0]].shape[0]
 107 
 108 	def csvconvert(self,col):
 109 		"""
 110 		Converting data in a string array to the appropriate type
 111 		"""
 112 		# convert missing values to nan
 113 		col[col == ''] = 'nan'; col[col == '.'] = 'nan'
 114 		try: 
 115 			# if a missing value is present int variables will be up-cast to float
 116 			return col.astype('i')
 117 		except ValueError: 
 118 			try: 
 119 				return col.astype('f')
 120 			except ValueError: 
 121 				# if the data is a string, put back the empty string
 122 				col[col == 'nan'] = ''
 123 				return col
 124 
 125 	def load_csv_nf(self,f):
 126 		"""
 127 		Loading data from a csv file using the csv module. Return a list of arrays.
 128 		Possibly with different types and/or missing values.
 129 		"""
 130 		# resetting to the beginning of the file since pylab.load was already tried
 131 		f.seek(0)
 132 
 133 		reader = csv.reader(f)
 134 
 135 		# putting the data in an array of strings
 136 		datalist = array([i for i in reader])
 137 
 138 		# converting the data to an appropriate type
 139 		datalist = [self.csvconvert(datalist[1:,i]) for i in range(datalist.shape[1])]
 140 
 141 		return datalist
 142 
 143 	def load_csv(self,f):
 144 		"""
 145 		Loading data from a csv file. Uses pylab's load function. Seems much faster
 146 		than scipy.io.read_array.
 147 		"""
 148 		varnm = f.readline().split(',')
 149 
 150 		# what is the date variable's key if any, based on index passed as argument
 151 		if self.date_key != '':
 152 			try:
 153 				rawdata = pylab.load(f, delimiter=',',converters={self.date_key:pylab.datestr2num})			# don't need to 'skiprow' here
 154 			except ValueError:																				# if loading via pylab doesn't work use csv
 155 				rawdata = self.load_csv_nf(f)	
 156 
 157 				# converting the dates column to a date-number
 158 				rawdata[self.date_key] = pylab.datestr2num(rawdata[self.date_key])
 159 
 160 			self.date_key = varnm[self.date_key]
 161 		else:
 162 			try:
 163 				rawdata = pylab.load(f, delimiter=',')														# don't need to 'skiprow' here
 164 			except ValueError:																				# if loading via pylab doesn't work use csv
 165 				rawdata = self.load_csv_nf(f)	
 166 
 167 		# making sure that the variable names contain no leading or trailing spaces
 168 		varnm = [i.strip() for i in varnm]
 169 
 170 		# transforming the data into a dictionary
 171 		if type(rawdata) == list:
 172 			# if the csv module was used
 173 			self.data = dict(zip(varnm,rawdata))
 174 		else:
 175 			# if the pylab.load module was used
 176 			self.data = dict(zip(varnm,rawdata.T))
 177 
 178 	def load_pickle(self,f):
 179 		"""
 180 		Loading data from a created earlier using the the dbase class.
 181 		"""
 182 		self.data = cPickle.load(f)					# loading the data dictionary
 183 
 184 		# what is the date variable's key if any
 185 		if self.date_key == '':
 186 			try:
 187 				self.date_key = cPickle.load(f)		# if nothing given assume it is in the pickle file
 188 			except:
 189 				print "No date series in pickle file"
 190 
 191 	def load_shelve(self,fname,var):
 192 		"""
 193 		Loading data from a created earlier using the the dbase class.
 194 		"""
 195 		data = shelve.open(fname)				# loading the data dictionary
 196 
 197 		# find out if a variable list is provided
 198 		if var == ():
 199 			var = data.keys()
 200 
 201 		# making sure the date variable is fetched from shelve
 202 		if self.date_key != '':
 203 			if not self.date_key in var: var = var + list(self.date_key)
 204 
 205 		self.data = dict([(i,data[i]) for i in var])
 206 		data.close()
 207 
 208 	def save(self,fname):
 209 		"""
 210 		Dumping the class data dictionary into a csv or pickle file
 211 		"""
 212 		fext = self.__ext(fname)
 213 		if fext == 'csv':
 214 			f = open(fname,'w')
 215 			self.save_csv(f)
 216 			f.close()
 217 		elif fext == 'pickle':
 218 			f = open(fname,'wb')
 219 			self.save_pickle(f)
 220 			f.close()
 221 		elif fext == 'she':
 222 			self.save_shelve(fname)
 223 		else:
 224 			raise 'This class only works on csv, pickle, and shelve files'
 225 
 226 	def save_csv(self,f):
 227 		"""
 228 		Dumping the class data dictionary into a csv file
 229 		"""
 230 		writer = csv.writer(f)
 231 		writer.writerow(self.data.keys())
 232 
 233 		data = self.data						# a reference to the data dict
 234 		if self.date_key != []:
 235 			data = dict(data)				# making a copy so the dates can be changed to strings
 236 			dates = pylab.num2date(data[self.date_key])
 237 			dates = array([i.strftime('%d %b %y') for i in dates])
 238 			data[self.date_key] = dates
 239 
 240 		writer.writerows(array(data.values()).T)
 241 
 242 	def save_pickle(self,f):
 243 		"""
 244 		Dumping the class data dictionary and date_key into a binary pickle file
 245 		"""
 246 		cPickle.dump(self.data,f,2)
 247 		cPickle.dump(self.date_key,f,2)
 248 
 249 	def save_shelve(self,fname):
 250 		"""
 251 		Dumping the class data dictionary into a shelve file
 252 		"""
 253 		f = shelve.open('data.she','c') 
 254 		f = self.data
 255 		f.close()
 256 
 257 	def add_trend(self,tname = 'trend'):
 258 		# making a trend based on nobs in arbitrary series in dictionary
 259 		self.data[tname] = arange(self.nobs)
 260 
 261 	def add_dummy(self,dum, dname = 'dummy'):
 262 		if self.data.has_key(dname):
 263 			print "The variable name '" + str(dname) + "' already exists. Please select another name."
 264 		else:
 265 			self.data[dname] = dum
 266 
 267 	def add_seasonal_dummies(self,freq=52,ndum=13):
 268 		"""
 269 		This function will only work if the freq and ndum 'fit. That is,
 270 		weeks and 4-weekly periods will work. Weeks and months/quarters
 271 		will not.
 272 		"""
 273 		if self.date_key == []:
 274 			print "Cannot create seasonal dummies since no date array is known"
 275 		else:
 276 			# list of years
 277 			years = array([pylab.num2date(i).year for i in self.data[self.date_key]])
 278 
 279 			# how many periods in does the data start
 280 			start = freq - sum(years ==	min(years))
 281 
 282 			# how many unique years
 283 			nyear = unique(years).shape[0]
 284 
 285 			# using kronecker products to make a big dummy matrix
 286 			sd = kron(ones(nyear),kron(eye(ndum),ones(freq/ndum))).T;
 287 			sd = sd[start:start+self.nobs]		# slicing the dummies to fit the data	
 288 			sd = dict([(("sd"+str(i+1)),sd[:,i]) for i in range(1,ndum)])
 289 			self.data.update(sd)				# adding the dummies to the main dict
 290 
 291 	def delvar(self,*var):
 292 		"""
 293 		Deleting specified variables in the data dictionary, changing dictionary in place
 294 		"""
 295 		[self.data.pop(i) for i in var]
 296 
 297 	def keepvar(self,*var):
 298 		"""
 299 		Keeping specified variables in the data dictionary, changing dictionary in place
 300 		"""
 301 		[self.data.pop(i) for i in self.data.keys() if i not in var]
 302 
 303 	def delvar_copy(self,*var):
 304 		"""
 305 		Deleting specified variables in the data dictionary, making a copy
 306 		"""
 307 		return dict([(i,self.data[i]) for i in self.data.keys() if i not in var])
 308 
 309 	def keepvar_copy(self,*var):
 310 		"""
 311 		Keeping specified variables in the data dictionary, making a copy
 312 		"""
 313 		return dict([(i,self.data[i]) for i in var])
 314 
 315 	def delobs(self,sel):
 316 		"""
 317 		Deleting specified observations, changing dictionary in place
 318 		"""
 319 		for i in self.data.keys(): self.data[i] = self.data[i][sel]
 320 
 321 		# updating the value of self.nobs
 322 		self.nobs -= sum(sel)
 323 
 324 	def keepobs(self,sel):
 325 		"""
 326 		Keeping specified observations, changing dictionary in place
 327 		"""
 328 		# updating the value of self.nobs
 329 		self.nobs -= sum(sel)
 330 
 331 		sel -= 1				# making true, false and vice-versa
 332 		self.delobs(sel)
 333 
 334 	def delobs_copy(self,sel):
 335 		"""
 336 		Deleting specified observations, making a copy
 337 		"""
 338 		return dict([(i,self.data[i][sel]) for i in self.data.keys()])
 339 
 340 	def keepobs_copy(self,sel):
 341 		"""
 342 		Keeping specified observations, making a copy
 343 		"""
 344 		sel -= 1				# making true, false and vice-versa
 345 		self.delobs_copy(sel)
 346 
 347 	def get(self,*var,**sel):
 348 		"""
 349 		Copying data and keys of selected variables for further analysis
 350 		"""
 351 		# calling convenience function to clean-up input parameters
 352 		var, sel = self.__var_and_sel_clean(var, sel)
 353 
 354 		# copying the entire dictionary (= default)
 355 		d = dict((i,self.data[i][sel]) for i in var)
 356 
 357 		return d.keys(), array(d.values()).T
 358 
 359 	def info(self,*var, **adict):
 360 		"""
 361 		Printing descriptive statistics on selected variables
 362 		"""
 363 			
 364 		# calling convenience functions to clean-up input parameters
 365 		var, sel = self.__var_and_sel_clean(var, adict)
 366 		dates, nobs = self.__dates_and_nobs_clean(var, sel)
 367 			
 368 		# setting the minimum and maximum dates to be used
 369 		mindate = pylab.num2date(min(dates)).strftime('%d %b %Y')
 370 		maxdate = pylab.num2date(max(dates)).strftime('%d %b %Y')
 371 
 372 		# number of variables (excluding date if present)
 373 		nvar = len(var)
 374 
 375 		print '\n=============================================================================='
 376 		print '============================ Database information ============================'
 377 		print '==============================================================================\n'
 378 
 379 		print 'file:				%s' % self.DBname
 380 		print '# obs:				%s' % nobs
 381 		print '# variables:		%s' % nvar 
 382 		print 'Start date:			%s' % mindate
 383 		print 'End date:			%s' % maxdate
 384 
 385 		print '\nvar				min			max			mean		std.dev		miss	levels'
 386 		print '=============================================================================='
 387 		
 388 		for i in var:
 389 			col = self.data[i][sel];
 390 			if type(col[0]) == string_:
 391 				_miss = sum(col == '')
 392 				print '''%-5s			%-5s		%-5s		%-5s		%-5s		% -5.0f	%-5i''' % tuple([i,'-','-','-','-',_miss,len(set(col))]) 
 393 			else:
 394 				_miss = isnan(col); col = col[_miss == False]; _min = col.min(); _max = col.max(); _mean = col.mean(); _std = col.std()
 395 				print '''% -5s			% -5.2f		% -5.2f		% -5.2f		% -5.2f		% -5.0f''' % tuple([i,_min,_max,_mean,_std,sum(_miss)]) 
 396 	
 397 	def dataplot(self,*var, **adict):
 398 		"""
 399 		Plotting the data with variable names
 400 		"""
 401 		# calling convenience functions to clean-up input parameters
 402 		var, sel = self.__var_and_sel_clean(var, adict)
 403 		dates, nobs = self.__dates_and_nobs_clean(var, sel)
 404 
 405 		# don't try to plot non-numerical variables
 406 		nvar = []
 407 		for i in var:
 408 			col = self.data[i][sel]
 409 			if type(col[0]) != string_:
 410 				pylab.plot_date(dates,self.data[i][sel],'o-') 
 411 				nvar = nvar + [i]
 412 
 413 		pylab.xlabel("Time (n = " + str(nobs) + ")") 
 414 		pylab.title("Data plot of " + self.DBname)
 415 		pylab.legend(nvar)
 416 		if adict.has_key('file'):
 417 			pylab.savefig(adict['file'],dpi=600)
 418 		pylab.show()
 419 
 420 	def __var_and_sel_clean(self, var, sel, dates_needed = True):
 421 		"""
 422 		Convenience function to avoid code duplication
 423 		"""
 424 		# find out if a variable list is provided
 425 		if var == ():
 426 			var = self.data.keys()
 427 			
 428 		# removing the date variable if it is present
 429 		var = [x for x in var if x != self.date_key]
 430 
 431 		# report variable label in alphabetical order
 432 		var.sort()
 433 
 434 		# find out if a selection rule is being used
 435 		# if not, set to empty tuple
 436 		if not sel.has_key('sel'):
 437 			sel = ()
 438 		else:
 439 			sel = sel['sel']
 440 
 441 		return var, sel
 442 
 443 	def __dates_and_nobs_clean(self, var, sel):
 444 		"""
 445 		Convenience function to avoid code duplication
 446 		"""
 447 		nobs = self.nobs
 448 		if len(sel):
 449 			nobs = nobs - (nobs - sum(sel))
 450 
 451 		if self.date_key != None and self.data.has_key(self.date_key):
 452 			# selecting dates from data base
 453 			dates = self.data[self.date_key][sel]
 454 		else:
 455 			# setting date series to start on 1/1/1950
 456 			dates = range(711858,nobs+711858)
 457 
 458 		return dates, nobs
 459 
 460 	def __ext(self,fname):
 461 		"""
 462 		Finding the file extension of the filename passed to dbase
 463 		"""
 464 		return fname.split('.')[-1].strip()
 465 
 466 if __name__ == '__main__':
 467 
 468 	###################################
 469 	### usage examples of dbase class
 470 	###################################
 471 
 472 	import sys
 473 	from scipy import c_
 474 
 475 	# making a directory to store simulate data
 476 	if not os.path.exists('./dbase_test_files'): os.mkdir('./dbase_test_files')
 477 
 478 	# creating simulated data and variable labels
 479 	varnm = ['date','a','b','c']			# variable labels
 480 	nobs = 100
 481 	data =	randn(nobs,3)					# the data array
 482 	dates = pylab.num2date(arange(730493,730493+(nobs*7),7))
 483 	dates = [i.strftime('%d %b %y') for i in dates]
 484 	data = c_[dates,data]
 485 
 486 	# adding a few missing values
 487 	data[5,1] = ''
 488 	data[9,3] = ''
 489 
 490 	# adding a non-numeric variable
 491 	varnm = varnm + ['id']		# variable labels
 492 	id = [('id'+str(i)) for i in range(nobs)]
 493 	id[8] = ''		# setting one id to missing
 494 	data = c_[data,id]
 495 
 496 	# saving simulated data to a csv file
 497 	f = open('./dbase_test_files/data.csv','w')
 498 	writer = csv.writer(f)
 499 	writer.writerow(varnm)
 500 	writer.writerows(data)
 501 	f.close()
 502 
 503 	# loading the data from the csv file
 504 	a = dbase("./dbase_test_files/data.csv",date = 0)
 505 
 506 	# saving the dbase instance data to a pickle file
 507 	a.save("./dbase_test_files/data.pickle")
 508 	# saving the dbase data to a shelve file
 509 	### a.save("./dbase_test_files/data.she")
 510 
 511 	# loading a sub-section of the data from a shelve file
 512 	### print "\nLoading 2 variables from a shelve file\n"
 513 	### b = dbase("./dbase_test_files/data.she",'a','b',date = 'date')
 514 
 515 	# showing data and variable names, from load_shelve
 516 	### varnm, data = b.get()
 517 	### print "Variable names from shelve file\n", varnm
 518 	### print "\nData selected from shelve file\n", data
 519 	### print "\nDate series", b.data[b.date_key]
 520 	### del b		# cleaning up
 521 
 522 	# loading the object from the pickle file
 523 	print "\nLoading the dbase object from a pickle file\n"
 524 	b = dbase("./dbase_test_files/data.pickle")
 525 
 526 	# getting the name of the file you are working on
 527 	print "\nWorking on file: " + b.DBname
 528 
 529 	# showing data and variable names
 530 	varnm, data = b.get()
 531 	print "Variable names from dbase class\n", varnm
 532 	print "\nData from dbase class\n", data
 533 	print "\nDate series", b.data[b.date_key]
 534 
 535 	# viewing selected data columns
 536 	varnm, data = b.get('a','c')
 537 	print "\nTwo columns selected using variable names\n", varnm, "\n", data 
 538 
 539 	# saving to a csv file
 540 	print "\nSaving data and variable names to a different csv file\n", b.save("./dbase_test_files/data_save.csv")
 541 
 542 	# adding variables/data
 543 	x1 = b.data['a'] * b.data['b']
 544 	x2 = b.data['a'] * b.data['c']
 545 	xdict = {'x1':x1,'x2':x2}
 546 	b.data.update(xdict)				# using a dictionaries own 'add/extend method'
 547 
 548 	varnm, data = b.get()
 549 	print "\nTwo variable names added\n", varnm
 550 	print "\nTwo columns added\n", data
 551 
 552 	# using copy.deepcopy to make a complete copy of the class instance data
 553 	import copy
 554 	c = copy.deepcopy(b)
 555 
 556 	# making the database smaller, inplace, by deleting selected variables
 557 	c.delvar('a','x2')
 558 	varnm, data = c.get()
 559 	print "\nTwo variable names deleted\n", varnm
 560 	print "\nTwo columns deleted\n", data
 561 
 562 	# making the database smaller, inplace, by keeping only selected variables
 563 	c = copy.deepcopy(b)
 564 	c.keepvar('a','x2')
 565 	varnm, data = c.get()
 566 	print "\nAll but two variable names deleted\n", varnm
 567 	print "\nAll but Two columns deleted\n", data
 568 
 569 	# specifying a selection rule
 570 	sel_rule = b.data['date'] > pylab.datestr2num("8/1/2001")
 571 
 572 	# making the database smaller, inplace, by delecting selected observation
 573 	c = copy.deepcopy(b)
 574 	c.delobs(sel_rule)
 575 
 576 	varnm, data = c.get()
 577 	print "\nReduced number of observations following the selection rule\n", data
 578 
 579 	# making the database smaller, inplace, by delecting all but the selected observation
 580 	c = copy.deepcopy(b)
 581 	c.keepobs(sel_rule)
 582 
 583 	varnm, data = c.get()
 584 	print "\nReduced number of observations following the inverse of the selection rule\n", data
 585 
 586 	# making a copy of of just the dictionary for selected variables
 587 	x = b.keepvar_copy('a')
 588 
 589 	# making a copy of of just the dictionary for everything but the selected variables
 590 	x = b.delvar_copy('a')
 591 
 592 	# making a copy of of just the dictionary for selected observations
 593 	x = b.keepobs_copy(sel_rule)
 594 
 595 	# making a copy of of just the dictionary for everything but the selected observation
 596 	x = b.delobs_copy(sel_rule)
 597 
 598 	# descriptive information on the database
 599 	b.info()
 600 
 601 	# plotting series
 602 	b.dataplot(file = './dbase_test_files/full_plot.png')
 603 
 604 	# adding a trend component
 605 	b.add_trend('mytrend')				# or b.data.update({'mytrend':range(100)})
 606 
 607 	# adding a dummy
 608 	dummy_rule = b.data['a'] > 0
 609 	b.add_dummy(dummy_rule,'mydummy')	# or b.data.update({'mydummy':dummy_rule})
 610 
 611 	# add seasonal dummies, specify data frequency and # of dummies
 612 	b.add_seasonal_dummies(52,13)
 613 
 614 	# descriptive information on the database for selected variables and time periods
 615 	b.info('b','c','mydummy', sel = sel_rule)
 616 
 617 	# plotting series for selected variables and selected data periods
 618 	b.dataplot('b','c','mydummy', sel = sel_rule, file = './dbase_test_files/partial_plot.png')

New Attachment

File to upload
Rename to
Overwrite existing attachment of same name

Attached Files

To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.