1 from scipy import c_, arange
2 from scipy.io import read_array
3 from numpy.random import randn
4 from pylab import plot, show, figure
5 import pickle, csv, os
6
class dbase:
    """
    A simple data-frame that reads and writes csv/pickle files with variable names.

    The numeric data are held in ``self.data`` (one variable per column) and
    ``self.varnm`` maps each variable name to its column index. Columns can be
    accessed by name using ``x.get('a', 'c')`` where 'a' and 'c' are variable
    names. ``self.DBname`` holds the absolute path of the file the instance
    was created from.
    """

    def __init__(self, f):
        """
        Initialize the dbase instance by loading file ``f`` (csv or pickle).
        """
        self.load(f)
        # abspath also copes with ``f`` already being an absolute path
        self.DBname = os.path.abspath(f)

    def load(self, fname):
        """
        Load data from a csv file or from a pickle file of the dbase class.

        For csv files the first line must hold the variable names and the
        remaining lines the numeric data. Raises ValueError for any other
        file extension.
        """
        fext = self.__ext(fname)
        if fext == 'csv':
            with open(fname, 'r') as f:
                # Parse the header with the csv module so that names quoted
                # by dump() are read back correctly.
                header = next(csv.reader([f.readline()]), [])
                # NOTE(review): read_array comes from the file-level
                # ``scipy.io`` import; it appears to have been removed from
                # recent SciPy releases -- confirm, and consider
                # numpy.loadtxt(f, delimiter=',') as its replacement.
                data = read_array(f, separator=',', lines=(0, -1))
            varnm = self.__vardic(header)
        elif fext == 'pickle':
            # Pickles are binary data. SECURITY: unpickling can execute
            # arbitrary code, so only load files from a trusted source.
            with open(fname, 'rb') as f:
                a = pickle.load(f)
            varnm = a.varnm
            data = a.data
        else:
            raise ValueError('This class only works on csv and pickle files')

        # Assign only once everything has been read, so a failed load never
        # leaves the instance half updated.
        self.varnm = varnm
        self.data = data

    def dump(self, fname):
        """
        Dump the instance into a csv or pickle file, chosen by the extension.

        Raises ValueError for any other extension; in that case no file is
        created or truncated.
        """
        fext = self.__ext(fname)
        if fext == 'csv':
            with open(fname, 'w') as f:
                writer = csv.writer(f)
                writer.writerow(self.__sort_keys())
                writer.writerows(self.data)
        elif fext == 'pickle':
            with open(fname, 'wb') as f:
                pickle.dump(self, f)
        else:
            raise ValueError('This class only outputs csv or pickle files')

    def get(self, *var):
        """
        Select columns based on variable labels. Assumes data are in columns.

        A single name returns that column as a 1-d array; several names
        return a 2-d array with the columns in the order requested.
        Raises IndexError when no name is given and KeyError for an
        unknown name.
        """
        if not var:
            raise IndexError('get() requires at least one variable name')

        cols = [self.varnm[name] for name in var]
        if len(cols) == 1:
            return self.data[:, cols[0]]
        return self.data[:, cols]

    def addvar(self, a, v):
        """
        Add columns of data.

        ``a`` holds the new column(s) and ``v`` is a single name or a
        sequence with one name per added column. Raises ValueError when the
        number of names does not match the number of columns, or when a name
        is already in use (either would leave names and columns out of step).
        """
        names = [v] if isinstance(v, str) else list(v)

        first_new = self.data.shape[1]
        combined = c_[self.data, a]
        added = combined.shape[1] - first_new

        if len(names) != added:
            raise ValueError('%d variable names given for %d new columns'
                             % (len(names), added))
        if len(set(names)) != len(names):
            raise ValueError('duplicate variable names: %s' % names)
        clashes = [name for name in names if name in self.varnm]
        if clashes:
            raise ValueError('variable names already in use: %s' % clashes)

        self.data = combined
        for offset, name in enumerate(names):
            self.varnm[name] = first_new + offset

    def delvar(self, *v):
        """
        Delete columns of data by variable name.

        Raises KeyError, before anything is modified, if a name is unknown.
        """
        unknown = [name for name in v if name not in self.varnm]
        if unknown:
            raise KeyError('unknown variable names: %s' % unknown)

        doomed = set(v)
        # Surviving names in their current column order, so that the
        # renumbered names stay attached to the right columns.
        kept = [name for name in self.__sort_keys() if name not in doomed]
        index = [self.varnm[name] for name in kept]

        self.data = self.data[:, index]
        self.varnm = self.__vardic(kept)

    def info(self, axis=0):
        """
        Print descriptive statistics on the variables.

        ``axis`` is passed to the numpy reductions; the per-variable table
        lines up with the variable names when the data are in columns
        (axis=0, the default).
        """
        nobs = self.data.shape[axis]
        names = self.__sort_keys()
        nvar = len(names)
        lows = self.data.min(axis)
        highs = self.data.max(axis)
        means = self.data.mean(axis)
        stds = self.data.std(axis)

        print('\n==========================================================')
        print('================== Database information ==================')
        print('==========================================================\n')

        print('''file: %s''' % self.DBname)
        print('''# obs: %s''' % nobs)
        print('''# variables: %s\n''' % nvar)

        print('var min max mean std.dev')
        print('==========================================================')

        for row in zip(names, lows, highs, means, stds):
            print('''%s %-5.2f %-5.2f %-5.2f %-5.2f''' % row)

    def dataplot(self, var):
        """
        Plot one data series, using its variable name in the title.
        """
        a = self.get(var)

        title = "Plot of series " + var
        ax = figure().add_axes([.1, .1, .8, .8])
        ax.plot(a)
        ax.set_title(title)
        show()

    def __vardic(self, L):
        """
        Make a dictionary mapping each (stripped) variable name in ``L`` to
        its position in ``L``.
        """
        return dict((name.strip(), j) for j, name in enumerate(L))

    def __ext(self, fname):
        """
        Return the lower-cased file extension of ``fname`` without the dot
        ('' when there is none).
        """
        return os.path.splitext(fname)[1][1:].strip().lower()

    def __sort_keys(self, v=None):
        """
        Return the variable names ordered by their column index.

        If ``v`` is given (and not empty) it is a sequence of positions into
        that ordered list and only those names are returned.
        """
        ordered = sorted(self.varnm, key=self.varnm.get)
        if v is None or len(v) == 0:
            return ordered
        return [ordered[i] for i in v]
163
164
165
166
167
if __name__ == '__main__':
    # Demonstration / smoke test of the dbase class. Every print call takes a
    # single pre-formatted string so the output is the same under Python 2
    # and Python 3.

    # Create a small random data set with three named variables
    varnm = ['a', 'b', 'c']
    data = randn(5, 3)

    # Save it as a csv file with the variable names on the first line
    with open('data.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerow(varnm)
        writer.writerows(data)

    # Load the csv file into a dbase instance and dump it as a pickle
    a = dbase("data.csv")
    a.dump("data.pickle")

    print("\nLoading the dbase object from a pickle file\n")

    b = dbase("data.pickle")

    print("Data from dbase class\n%s" % (b.data,))
    print("\nVariable names from dbase class\n%s" % (b.varnm,))
    print("\nTwo columns selected using variable names\n%s" % (b.get('a', 'c'),))

    # dump() returns nothing, so announce the step and then perform it
    # instead of printing its return value.
    print("\nSaving data and variable names to a different csv file\n")
    b.dump("data_dump.csv")

    # Add two derived columns
    xtra1 = b.get('a') * b.get('b')
    xtra2 = b.get('a') * b.get('c')
    xtra = c_[xtra1, xtra2]
    xtra_varnm = ('x1', 'x2')

    b.addvar(xtra, xtra_varnm)
    print("\nTwo columns added\n%s" % (b.data,))
    print("\nTwo variable names added\n%s" % (b.varnm,))

    # Delete two columns by name
    b.delvar('a', 'x2')
    print("\nTwo columns deleted\n%s" % (b.data,))
    print("\nTwo variable names deleted\n%s" % (b.varnm,))

    # Show which file the instance was created from
    print("\nWorking on file: " + b.DBname)

    # Print descriptive statistics
    b.info()

    # Plot one of the series
    b.dataplot('b')