1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31 import math
32 import sets
33 from scipy import *
34 import Numeric
35 import cStringIO as StringIO
36 import csv
37 class access2000(csv.Dialect):
38 """A dialect to properly interpret Microsoft Access2000 CSV exports for international languages.
39 """
40 delimiter = ';'
41 quotechar = '"'
42 doublequote = True
43 quoting = csv.QUOTE_NONNUMERIC
44 lineterminator = '\n'
45 skipinitialspace = True
46
47 class DataFrame:
48 def __init__(self, value_dict=None, fields_order=None ):
49 if value_dict is None:
50 value_dict = {}
51 self.value_dict = value_dict
52 num_rows = 0
53 for column in self.value_dict.values():
54 try:
55 num_rows = max(num_rows,len(column))
56 except:
57 pass
58 for key in self.value_dict.keys():
59 if self.value_dict[key] is None:
60 self.value_dict[key] = (None,)*num_rows
61 for field in self.value_dict:
62 if len(self.value_dict[field]) != num_rows:
63 raise ValueError("field %s has wrong number of rows"%str(field))
64 self.num_rows = num_rows
65 if fields_order is None:
66 self.fields_order = self.value_dict.keys()
67 else:
68 for key in fields_order:
69 assert self.value_dict.has_key(key)
70 self.fields_order = fields_order
71
72 def copy_empty(self):
73 vd = {}
74 for k in self.value_dict.keys():
75 vd[k] = None
76 return DataFrame(vd,self.fields_order)
77
78 def __add__(self,other):
79 nd = {}
80 for k in other.fields_order:
81 if k not in self.fields_order:
82 raise NotImplementedError("no fix yet for when not all fields are in both frames")
83 for k in self.fields_order:
84 nd[k] = list(self.value_dict[k]) + list(other.value_dict[k])
85 res = DataFrame(nd,self.fields_order)
86 return res
87
88 def insert_row(self, value_dict, new_fields_ok=False):
89 if not new_fields_ok:
90 for v in value_dict:
91 assert(v in self.fields_order)
92 for v in self.fields_order:
93 assert(v in value_dict)
94 try:
95 self.value_dict[v].append( value_dict[v] )
96 except AttributeError:
97 tmp = list(self.value_dict[v])
98 tmp.append( value_dict[v] )
99 self.value_dict[v] = tmp
100 self.num_rows += 1
101 else:
102 all_fields = list(sets.Set(self.fields_order).union(value_dict.keys()))
103 all_fields.sort()
104 all_fields.reverse()
105 for v in all_fields:
106 if v in value_dict:
107 if v not in self.value_dict:
108 self.value_dict[v] = [None]*self.num_rows
109 try:
110 self.value_dict[v].append(value_dict[v])
111 except AttributeError:
112 tmp = list(self.value_dict[v])
113 tmp.append( value_dict[v] )
114 self.value_dict[v] = tmp
115 else:
116 try:
117 self.value_dict[v].append(None)
118 except AttributeError:
119 tmp = list(self.value_dict[v])
120 tmp.append( None )
121 self.value_dict[v] = tmp
122 self.fields_order = all_fields
123 self.num_rows += 1
124
125 def insert_column(self, field_name, values, position='last'):
126 assert len(values) == self.num_rows
127 if position == 'last':
128 self.fields_order.append(field_name)
129 else:
130 self.fields_order.insert(position,field_name)
131 self.value_dict[field_name] = values
132
133 def drop_column(self, field_name):
134 self.fields_order.remove(field_name)
135 del self.value_dict[field_name]
136
137 def drop_all_columns_except(self, *field_names):
138 save_names = list(field_names)
139 for field_name in self.fields_order[:]:
140 if field_name not in save_names:
141 self.drop_column( field_name )
142
143 def __str__(self):
144 def cc(s,width=10,just='center'):
145 if len(s) > width:
146 s = s[:width]
147 if just=='center':
148 return s.center(width)
149 elif just=='left':
150 return s.ljust(width)
151 elif just=='right':
152 return s.rjust(width)
153 buf = StringIO.StringIO()
154 print >> buf, cc('',width=5,just='right'),
155 for field in self.fields_order:
156 print >> buf, cc( field),
157 print >> buf
158 for row in range(self.num_rows):
159 print >> buf, cc(str(row),width=5,just='right'),
160 for field in self.fields_order:
161 v = self.value_dict[field][row]
162 if v is not None:
163 v_str = str(v)
164 else:
165 v_str = ''
166 print >> buf, cc( v_str ),
167 print >> buf
168 buf.seek(0)
169 return buf.read()
170
171 def get_row_dict(self,row_idx):
172 return self[row_idx]
173
174 def __getitem__(self, i):
175 result = {}
176 for field in self.fields_order:
177 result[field] = self.value_dict[field][i]
178 return result
179
180 def __getitems__(self, idxs):
181 result = []
182 for i in idxs:
183 result.append( self[i] )
184 return result
185
186 def __len__(self):
187 return self.num_rows
188
189 def __get_row(self,row_idx):
190 result = []
191 for field in self.fields_order:
192 result.append( self.value_dict[field][row_idx] )
193 return result
194
195 def __get_rows(self,row_idxs):
196 if len(row_idxs) == 0:
197 return
198 rows = [self.__get_row(row_idx) for row_idx in row_idxs]
199 by_col = zip(*rows)
200 result = {}
201 for i,field in enumerate(self.fields_order):
202 result[field] = by_col[i]
203 return result
204
205 def where_field_cmp(self, field, bool_func_of_value):
206 col = self.value_dict[field]
207 indices = []
208 for i in range(len(col)):
209 if bool_func_of_value(col[i]):
210 indices.append(i)
211 results = self.__get_rows(indices)
212 if results is not None:
213 return DataFrame(results,fields_order=self.fields_order)
214 else:
215 return None
216
217 def where_field_equal(self, field, value, eps=None):
218 col = self.value_dict[field]
219 indices = []
220 if eps is None:
221 for i in range(len(col)):
222 if col[i] == value:
223 indices.append(i)
224 else:
225
226 a = asarray(col)
227 abs_diff = abs(a-value)
228 indices = nonzero( less( abs_diff, eps ) )
229 results = self.__get_rows(indices)
230 if results is not None:
231 return DataFrame(results,fields_order=self.fields_order)
232 else:
233 return None
234
235 def where_field_not_equal(self, field, value, eps=None):
236 col = self.value_dict[field]
237 indices = []
238 if eps is None:
239 for i in range(len(col)):
240 if col[i] != value:
241 indices.append(i)
242 else:
243
244 a = numpy.asarray(col)
245 abs_diff = abs(a-value)
246 indices = numpy.nonzero( numpy.greater_equal( abs_diff, eps ) )
247 results = self.__get_rows(indices)
248 if results is not None:
249 return DataFrame(results,fields_order=self.fields_order)
250 else:
251 return None
252
253 def where_field_less(self, field, value):
254 col = self.value_dict[field]
255 indices = []
256 for i in range(len(col)):
257 if col[i] < value:
258 indices.append(i)
259 results = self.__get_rows(indices)
260 if results is not None:
261 return DataFrame(results,fields_order=self.fields_order)
262 else:
263 return None
264
265 def where_field_lessequal(self, field, value):
266 col = self.value_dict[field]
267 indices = []
268 for i in range(len(col)):
269 if col[i] <= value:
270 indices.append(i)
271 results = self.__get_rows(indices)
272 if results is not None:
273 return DataFrame(results,fields_order=self.fields_order)
274 else:
275 return None
276
277 def where_field_greater(self, field, value):
278 col = self.value_dict[field]
279 indices = []
280 for i in range(len(col)):
281 if col[i] > value:
282 indices.append(i)
283 results = self.__get_rows(indices)
284 if results is not None:
285 return DataFrame(results,fields_order=self.fields_order)
286 else:
287 return None
288
289 def where_field_greaterequal(self, field, value):
290 col = self.value_dict[field]
291 indices = []
292 for i in range(len(col)):
293 if col[i] >= value:
294 indices.append(i)
295 results = self.__get_rows(indices)
296 if results is not None:
297 return DataFrame(results,fields_order=self.fields_order)
298 else:
299 return None
300
301 def where_field_in(self, field, values):
302 col = self.value_dict[field]
303 indices = []
304 for i in range(len(col)):
305 if col[i] in values:
306 indices.append(i)
307 results = self.__get_rows(indices)
308 if results is not None:
309 return DataFrame(results,fields_order=self.fields_order)
310 else:
311 return None
312
313 def enumerate_on(self, field, cmp_func=None):
314 values = self.get_unique_values(field)
315 result_frames = []
316 values.sort(cmp_func)
317 for value in values:
318 result_frames.append( (value, self.where_field_equal(field,value)))
319 return iter(result_frames)
320
321 def enumerate_crude_bins(self, field, eps=None, eps_domain='linear'):
322 if eps is None:
323 return self.enumerate_on(field)
324 if eps_domain == 'linear':
325 def filt(x):
326 return x
327 elif eps_domain == 'log10':
328 def filt(x):
329 return numpy.log10(x)
330 else:
331 raise NotImplementedError
332 vs = self.get_unique_values(field)
333 vs.sort()
334 bins = {}
335 current_starter_v = None
336 for cv in vs:
337 if current_starter_v is not None and abs(filt(cv)-filt(current_starter_v))<eps:
338 bins[current_starter_v].append(cv)
339 else:
340 bins.setdefault(cv,[]).append(cv)
341 current_starter_v = cv
342 results = []
343 keys = bins.keys()
344 keys.sort()
345 for close_v in keys:
346 running_sum = 0
347 running_n = 0
348 accum = self.copy_empty()
349 for v in bins[close_v]:
350 add_frame = self.where_field_equal(field,v)
351 n = add_frame.num_rows
352 running_sum += (v*n)
353 running_n += n
354 accum = accum + add_frame
355 avg_value = running_sum/float(running_n)
356 results.append(( avg_value, accum ))
357 return iter(results)
358
359 def enumerate_crude_2_dims(self, field1, field2,
360 eps1=None, eps2=None,
361 eps1_domain='linear', eps2_domain='linear'):
362 axis1_vs = {}
363 for v,vf in self.enumerate_crude_bins(field1,eps=eps1,eps_domain=eps1_domain):
364 axis1_vs[v] = vf.get_unique_values(field1)
365
366 axis2_vs = {}
367 for v,vf in self.enumerate_crude_bins(field2,eps=eps2,eps_domain=eps2_domain):
368 axis2_vs[v] = vf.get_unique_values(field2)
369
370 results = []
371
372 v1s = axis1_vs.keys()
373 v1s.sort()
374
375 v2s = axis2_vs.keys()
376 v2s.sort()
377 for v1 in v1s:
378 for v2 in v2s:
379 this_result = self.copy_empty()
380 for v1r in axis1_vs[v1]:
381 for v2r in axis2_vs[v2]:
382 tmp1 = self.where_field_equal(field1,v1r)
383 if tmp1 is not None:
384 tmp2 = tmp1.where_field_equal(field2,v2r)
385 if tmp2 is not None:
386 this_result = this_result + tmp2
387 if this_result.num_rows > 0:
388 results.append((v1,v2,this_result))
389 return iter(results)
390
391 def mean(self, field):
392 return mean(self.value_dict[field])
393
394 def mean_and_std(self, field):
395 values = self.value_dict[field]
396 return mean(values), std(values)
397
398 def mean_and_sem(self, field):
399 values = self.value_dict[field]
400 n = len(values)
401 return mean(values), std(values)/float(numpy.sqrt(n))
402
403 def get_fields(self):
404 return self.fields_order[:]
405
406 def sort_by(self,field,ascending=True):
407 orig = self.value_dict[field]
408 sorted = list(orig[:])
409 sorted.sort()
410 if not ascending:
411 sorted.reverse()
412 my_copy = list(orig[:])
413 new_order = []
414 for i in range(len(sorted)):
415 sorted_value = sorted[i]
416 index = my_copy.index( sorted_value )
417 new_order.append( index )
418 my_copy[index] = int
419 for field in self.fields_order:
420 orig_list = self.value_dict[field]
421 new_list = []
422 for i in new_order:
423 new_list.append( orig_list[i] )
424 self.value_dict[field] = new_list
425
426 def sorted(self,field,ascending=True):
427 result = DataFrame(self.value_dict, self.fields_order)
428 result.sort_by(field,ascending=ascending)
429 return result
430
431 def get_unique_values(self, field):
432 unique_values = list(sets.Set(self.get_all_values(field)))
433 try:
434 unique_values.sort()
435 except:
436 pass
437 return unique_values
438
439 def get_all_values(self, field):
440 values = [ v for v in self.value_dict[field] ]
441 return values
442
443 def write_csv(self,filename,dialect='excel'):
444 writer = csv.writer(open(filename,'w'),dialect=dialect)
445 writer.writerow( self.fields_order )
446 for i in range(self.num_rows):
447 writer.writerow( self.__get_row(i) )
448
449 def read_csv(file,header=True,dialect='excel'):
450 if not hasattr(file,'readlines'):
451
452 reader = csv.reader(open(file,"r"),dialect=dialect)
453 else:
454 reader = csv.reader(file,dialect=dialect)
455 split_lines = [ row for row in reader ]
456 if header:
457 fields = split_lines.pop(0)
458 else:
459 num_fields = len(split_lines[0])
460 num_chars = int(math.ceil(math.log10(num_fields)))
461 name_string = "column%0"+str(num_chars)+"d"
462 fields = [ name_string%(i+1) for i in range(len(split_lines[0])) ]
463
464 for i in range(len(split_lines)):
465 split_line = split_lines[i]
466 for j in range(len(split_line)):
467 value = split_line[j]
468 if value == '':
469 converted_value = None
470 else:
471 try:
472 converted_value = int(value)
473 except ValueError:
474 try:
475 converted_value = float(value)
476 except:
477 converted_value = value
478 split_line[j] = converted_value
479 split_lines[i] = split_line
480 columns = zip(*split_lines)
481 results = {}
482 for i in range(len(fields)):
483 field = fields[i]
484 try:
485 results[field] = list(columns[i])
486 except IndexError:
487 results[field] = None
488
489 return DataFrame( results, fields_order=fields )