This is an archival dump of old wiki content --- see scipy.org for current material.
Please see http://scipy-cookbook.readthedocs.org/

Attachment 'DataFrame.py'


## Copyright (c) 2001-2006, Andrew Straw. All rights reserved.

## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are
## met:

##     * Redistributions of source code must retain the above copyright
##       notice, this list of conditions and the following disclaimer.

##     * Redistributions in binary form must reproduce the above
##       copyright notice, this list of conditions and the following
##       disclaimer in the documentation and/or other materials provided
##       with the distribution.

##     * Neither the name of the Andrew Straw nor the names of its
##       contributors may be used to endorse or promote products derived
##       from this software without specific prior written permission.

## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
## LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
## A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
## LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
## DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
## THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
## (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
## OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import math
import sets
from scipy import *
import numpy # used by the eps-based filters, log10 binning, and mean_and_sem below
import cStringIO as StringIO
import csv # the csv module ships with Python 2.3 and later

class access2000(csv.Dialect):
    """A dialect to properly interpret Microsoft Access 2000 CSV exports for international languages.
    """
    delimiter = ';'
    quotechar = '"'
    doublequote = True
    quoting = csv.QUOTE_NONNUMERIC
    lineterminator = '\n'
    skipinitialspace = True

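# Usage sketch (hypothetical file name 'export.csv'): register the dialect
# once, then pass its name to the read_csv() function defined at the bottom
# of this module, e.g.
#
#     csv.register_dialect('access2000', access2000)
#     frame = read_csv('export.csv', dialect='access2000')
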
class DataFrame:
    """A simple column-oriented table: a dict mapping each field name to a
    column (sequence) of values, with all columns the same length."""
    def __init__(self, value_dict=None, fields_order=None ):
        if value_dict is None:
            value_dict = {}
        self.value_dict = value_dict
        num_rows = 0
        for column in self.value_dict.values():
            try:
                num_rows = max(num_rows,len(column))
            except TypeError: # column is None or another object with no len()
                pass
        for key in self.value_dict.keys():
            if self.value_dict[key] is None:
                self.value_dict[key] = (None,)*num_rows
        for field in self.value_dict:
            if len(self.value_dict[field]) != num_rows:
                raise ValueError("field %s has wrong number of rows"%str(field))
        self.num_rows = num_rows
        if fields_order is None:
            self.fields_order = self.value_dict.keys()
        else:
            for key in fields_order:
                assert self.value_dict.has_key(key)
            self.fields_order = fields_order

    def copy_empty(self):
        vd = {}
        for k in self.value_dict.keys():
            vd[k] = None
        return DataFrame(vd,self.fields_order)

    def __add__(self,other):
        nd = {}
        for k in other.fields_order:
            if k not in self.fields_order:
                raise NotImplementedError("no fix yet for when not all fields are in both frames")
        for k in self.fields_order:
            nd[k] = list(self.value_dict[k]) + list(other.value_dict[k])
        res = DataFrame(nd,self.fields_order)
        return res

    def insert_row(self, value_dict, new_fields_ok=False):
        if not new_fields_ok:
            for v in value_dict:
                assert(v in self.fields_order)
            for v in self.fields_order:
                assert(v in value_dict)
                try:
                    self.value_dict[v].append( value_dict[v] )
                except AttributeError: # column is a tuple -- rebuild as a list
                    tmp = list(self.value_dict[v])
                    tmp.append( value_dict[v] )
                    self.value_dict[v] = tmp
            self.num_rows += 1
        else:
            all_fields = list(sets.Set(self.fields_order).union(value_dict.keys()))
            all_fields.sort()
            all_fields.reverse()
            for v in all_fields:
                if v in value_dict:
                    if v not in self.value_dict:
                        self.value_dict[v] = [None]*self.num_rows
                    try:
                        self.value_dict[v].append(value_dict[v])
                    except AttributeError:
                        tmp = list(self.value_dict[v])
                        tmp.append( value_dict[v] )
                        self.value_dict[v] = tmp
                else:
                    try:
                        self.value_dict[v].append(None)
                    except AttributeError:
                        tmp = list(self.value_dict[v])
                        tmp.append( None )
                        self.value_dict[v] = tmp
            self.fields_order = all_fields
            self.num_rows += 1

    def insert_column(self, field_name, values, position='last'):
        assert len(values) == self.num_rows
        if position == 'last':
            self.fields_order.append(field_name)
        else:
            self.fields_order.insert(position,field_name)
        self.value_dict[field_name] = values

    def drop_column(self, field_name):
        self.fields_order.remove(field_name)
        del self.value_dict[field_name]

    def drop_all_columns_except(self, *field_names):
        save_names = list(field_names)
        for field_name in self.fields_order[:]:
            if field_name not in save_names:
                self.drop_column( field_name )

    def __str__(self):
        def cc(s,width=10,just='center'):
            if len(s) > width:
                s = s[:width]
            if just=='center':
                return s.center(width)
            elif just=='left':
                return s.ljust(width)
            elif just=='right':
                return s.rjust(width)
        buf = StringIO.StringIO()
        print >> buf, cc('',width=5,just='right'),
        for field in self.fields_order:
            print >> buf, cc( field),
        print >> buf
        for row in range(self.num_rows):
            print >> buf, cc(str(row),width=5,just='right'), # row number
            for field in self.fields_order:
                v = self.value_dict[field][row]
                if v is not None:
                    v_str = str(v)
                else:
                    v_str = ''
                print >> buf, cc( v_str ),
            print >> buf
        buf.seek(0)
        return buf.read()

    def get_row_dict(self,row_idx):
        return self[row_idx]

    def __getitem__(self, i):
        result = {}
        for field in self.fields_order:
            result[field] = self.value_dict[field][i]
        return result

    def __getitems__(self, idxs):
        result = []
        for i in idxs:
            result.append( self[i] )
        return result

    def __len__(self):
        return self.num_rows

    def __get_row(self,row_idx):
        result = []
        for field in self.fields_order:
            result.append( self.value_dict[field][row_idx] )
        return result

    def __get_rows(self,row_idxs):
        if len(row_idxs) == 0:
            return
        rows = [self.__get_row(row_idx) for row_idx in row_idxs]
        by_col = zip(*rows)
        result = {}
        for i,field in enumerate(self.fields_order):
            result[field] = by_col[i]
        return result

    def where_field_cmp(self, field, bool_func_of_value):
        col = self.value_dict[field]
        indices = []
        for i in range(len(col)):
            if bool_func_of_value(col[i]):
                indices.append(i)
        results = self.__get_rows(indices)
        if results is not None:
            return DataFrame(results,fields_order=self.fields_order)
        else:
            return None

    def where_field_equal(self, field, value, eps=None):
        col = self.value_dict[field]
        indices = []
        if eps is None:
            for i in range(len(col)):
                if col[i] == value:
                    indices.append(i)
        else:
            # faster path for numeric data: vectorized comparison against eps
            a = numpy.asarray(col)
            abs_diff = abs(a-value)
            # numpy.nonzero returns a tuple of index arrays; keep the only axis
            indices = list(numpy.nonzero( numpy.less( abs_diff, eps ) )[0])
        results = self.__get_rows(indices)
        if results is not None:
            return DataFrame(results,fields_order=self.fields_order)
        else:
            return None

    def where_field_not_equal(self, field, value, eps=None):
        col = self.value_dict[field]
        indices = []
        if eps is None:
            for i in range(len(col)):
                if col[i] != value:
                    indices.append(i)
        else:
            # faster path for numeric data: vectorized comparison against eps
            a = numpy.asarray(col)
            abs_diff = abs(a-value)
            indices = list(numpy.nonzero( numpy.greater_equal( abs_diff, eps ) )[0])
        results = self.__get_rows(indices)
        if results is not None:
            return DataFrame(results,fields_order=self.fields_order)
        else:
            return None

    def where_field_less(self, field, value):
        col = self.value_dict[field]
        indices = []
        for i in range(len(col)):
            if col[i] < value:
                indices.append(i)
        results = self.__get_rows(indices)
        if results is not None:
            return DataFrame(results,fields_order=self.fields_order)
        else:
            return None

    def where_field_lessequal(self, field, value):
        col = self.value_dict[field]
        indices = []
        for i in range(len(col)):
            if col[i] <= value:
                indices.append(i)
        results = self.__get_rows(indices)
        if results is not None:
            return DataFrame(results,fields_order=self.fields_order)
        else:
            return None

    def where_field_greater(self, field, value):
        col = self.value_dict[field]
        indices = []
        for i in range(len(col)):
            if col[i] > value:
                indices.append(i)
        results = self.__get_rows(indices)
        if results is not None:
            return DataFrame(results,fields_order=self.fields_order)
        else:
            return None

    def where_field_greaterequal(self, field, value):
        col = self.value_dict[field]
        indices = []
        for i in range(len(col)):
            if col[i] >= value:
                indices.append(i)
        results = self.__get_rows(indices)
        if results is not None:
            return DataFrame(results,fields_order=self.fields_order)
        else:
            return None

    def where_field_in(self, field, values):
        col = self.value_dict[field]
        indices = []
        for i in range(len(col)):
            if col[i] in values:
                indices.append(i)
        results = self.__get_rows(indices)
        if results is not None:
            return DataFrame(results,fields_order=self.fields_order)
        else:
            return None

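    # Filtering sketch (hypothetical field names): each where_field_* method
    # returns a new DataFrame (or None when nothing matches), so calls can be
    # chained, e.g.
    #
    #     hot = frame.where_field_greater('temperature', 25.0)
    #     if hot is not None:
    #         recent = hot.where_field_in('year', [2005, 2006])
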
    def enumerate_on(self, field, cmp_func=None):
        values = self.get_unique_values(field)
        result_frames = []
        values.sort(cmp_func)
        for value in values:
            result_frames.append( (value, self.where_field_equal(field,value)))
        return iter(result_frames)

    def enumerate_crude_bins(self, field, eps=None, eps_domain='linear'):
        if eps is None:
            return self.enumerate_on(field)
        if eps_domain == 'linear':
            def filt(x):
                return x
        elif eps_domain == 'log10':
            def filt(x):
                return numpy.log10(x)
        else:
            raise NotImplementedError
        vs = self.get_unique_values(field)
        vs.sort()
        bins = {}
        current_starter_v = None
        for cv in vs:
            if current_starter_v is not None and abs(filt(cv)-filt(current_starter_v))<eps:
                bins[current_starter_v].append(cv)
            else:
                bins.setdefault(cv,[]).append(cv)
                current_starter_v = cv
        results = []
        keys = bins.keys()
        keys.sort()
        for close_v in keys:
            running_sum = 0
            running_n = 0
            accum = self.copy_empty() # new DataFrame with same fields
            for v in bins[close_v]:
                add_frame = self.where_field_equal(field,v)
                n = add_frame.num_rows
                running_sum += (v*n)
                running_n += n
                accum = accum + add_frame
            avg_value = running_sum/float(running_n)
            results.append(( avg_value, accum ))
        return iter(results)

    def enumerate_crude_2_dims(self, field1, field2,
                               eps1=None, eps2=None,
                               eps1_domain='linear', eps2_domain='linear'):
        axis1_vs = {}
        for v,vf in self.enumerate_crude_bins(field1,eps=eps1,eps_domain=eps1_domain):
            axis1_vs[v] = vf.get_unique_values(field1)

        axis2_vs = {}
        for v,vf in self.enumerate_crude_bins(field2,eps=eps2,eps_domain=eps2_domain):
            axis2_vs[v] = vf.get_unique_values(field2)

        results = []

        v1s = axis1_vs.keys()
        v1s.sort()

        v2s = axis2_vs.keys()
        v2s.sort()
        for v1 in v1s:
            for v2 in v2s:
                this_result = self.copy_empty()
                for v1r in axis1_vs[v1]:
                    for v2r in axis2_vs[v2]:
                        tmp1 = self.where_field_equal(field1,v1r)
                        if tmp1 is not None:
                            tmp2 = tmp1.where_field_equal(field2,v2r)
                            if tmp2 is not None:
                                this_result = this_result + tmp2
                if this_result.num_rows > 0:
                    results.append((v1,v2,this_result))
        return iter(results)

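    # Grouping sketch (hypothetical field name): enumerate_on() yields one
    # (value, sub-frame) pair per distinct value, while enumerate_crude_bins()
    # first merges values that lie within eps of each other into a single bin,
    # e.g.
    #
    #     for voltage, sub in frame.enumerate_crude_bins('voltage', eps=0.05):
    #         print voltage, sub.num_rows
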
    def mean(self, field):
        return mean(self.value_dict[field]) # mean() comes from the scipy star import above
    
    def mean_and_std(self, field):
        values = self.value_dict[field]
        return mean(values), std(values)

    def mean_and_sem(self, field):
        values = self.value_dict[field]
        n = len(values)
        return mean(values), std(values)/float(numpy.sqrt(n))

    def get_fields(self):
        return self.fields_order[:] # return copy

    def sort_by(self,field,ascending=True):
        orig = self.value_dict[field]
        sorted_values = list(orig[:]) # play with copy
        sorted_values.sort() # make it sorted
        if not ascending:
            sorted_values.reverse() # descending order
        my_copy = list(orig[:]) # play with copy
        new_order = []
        for i in range(len(sorted_values)):
            sorted_value = sorted_values[i]
            index = my_copy.index( sorted_value )
            new_order.append( index )
            my_copy[index] = int # just set it equal to anything that can't be in list so it won't match again
        for field in self.fields_order:
            orig_list = self.value_dict[field]
            new_list = []
            for i in new_order:
                new_list.append( orig_list[i] )
            self.value_dict[field] = new_list

    def sorted(self,field,ascending=True):
        # sort a shallow copy so the original frame's columns are left untouched
        result = DataFrame(dict(self.value_dict), list(self.fields_order))
        result.sort_by(field,ascending=ascending)
        return result

    def get_unique_values(self, field):
        unique_values = list(sets.Set(self.get_all_values(field)))
        try:
            unique_values.sort()
        except: # not all value types can be ordered
            pass
        return unique_values

    def get_all_values(self, field):
        values = [ v for v in self.value_dict[field] ]
        return values

    def write_csv(self,filename,dialect='excel'):
        writer = csv.writer(open(filename,'wb'),dialect=dialect) # binary mode, as the csv module expects
        writer.writerow( self.fields_order )
        for i in range(self.num_rows):
            writer.writerow( self.__get_row(i) )

def read_csv(file,header=True,dialect='excel'):
    if not hasattr(file,'readlines'):
        # inspired by R's read.csv
        reader = csv.reader(open(file,"rb"),dialect=dialect)
    else:
        reader = csv.reader(file,dialect=dialect)
    split_lines = [ row for row in reader ] # newline problems sometimes happen here - de-Macify...
    if header:
        fields = split_lines.pop(0)
    else:
        num_fields = len(split_lines[0])
        num_chars = int(math.ceil(math.log10(num_fields)))
        name_string = "column%0"+str(num_chars)+"d"
        fields = [ name_string%(i+1) for i in range(len(split_lines[0])) ]
    # check/convert values: empty string -> None, then try int, then float,
    # otherwise keep the original string
    for i in range(len(split_lines)):
        split_line = split_lines[i]
        for j in range(len(split_line)):
            value = split_line[j]
            if value == '':
                converted_value = None
            else:
                try:
                    converted_value = int(value)
                except ValueError:
                    try:
                        converted_value = float(value)
                    except ValueError:
                        converted_value = value
            split_line[j] = converted_value
        split_lines[i] = split_line
    columns = zip(*split_lines)
    results = {}
    for i in range(len(fields)):
        field = fields[i]
        try:
            results[field] = list(columns[i])
        except IndexError: # no data rows for this column
            results[field] = None
#        print i, field, results[field], columns[i]
    return DataFrame( results, fields_order=fields )
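
# A minimal end-to-end sketch (hypothetical data and output file 'demo.csv'):
# build a frame, add a row, filter it, sort it, and round-trip it through CSV.
# This only runs when the module is executed directly.
if __name__ == '__main__':
    frame = DataFrame({'name': ['a', 'b', 'c'],
                       'score': [1.0, 3.5, 2.0]},
                      fields_order=['name', 'score'])
    frame.insert_row({'name': 'd', 'score': 4.5})
    print frame                        # aligned text table with row numbers
    good = frame.where_field_greater('score', 2.0)
    if good is not None:
        print good.get_all_values('name')
    frame.sort_by('score', ascending=False)
    frame.write_csv('demo.csv')
    again = read_csv('demo.csv')
    print len(again), 'rows re-read'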
