attachment:histogram.py of DavidHuard

Attachment 'histogram.py'

   1 """ Fast histogram """
   2 
   3 from numpy import array, zeros, asarray, sort, int32, digitize, bincount,\
   4     concatenate,ones, atleast_1d, iterable, linspace, diff,log10,around,\
   5     arange, apply_along_axis, hsplit, argsort, sum
   6 from scipy import weave
   7 from typed_array_converter import converters
   8 
   9 def histogram(a, bins=10, range=None, normed=False, weights=None, axis=None, strategy=None):
  10     """histogram(a, bins=10, range=None, normed=False, weights=None, axis=None) 
  11                                                                    -> H, dict
  12     
  13     Return the distribution of sample.
  14     
  15     :Parameters:
  16       - `a` : Array sample.
  17       - `bins` : Number of bins, or an array of bin edges, in which case the 
  18                 range is not used.
  19       - `range` : Lower and upper bin edges, default: [min, max].
  20       - `normed` :Boolean, if False, return the number of samples in each bin,
  21                 if True, return the density.  
  22       - `weights` : Sample weights. The weights are normed only if normed is 
  23                 True. Should weights.sum() not equal len(a), the total bin count 
  24                 will not be equal to the number of samples.
  25       - `axis` : Specifies the dimension along which the histogram is computed. 
  26                 Defaults to None, which aggregates the entire sample array.
  27       - `strategy` : Histogramming method (binsize, searchsorted or digitize).
  28     
  29     :Return:
  30       - `H` : The number of samples in each bin. 
  31         If normed is True, H is a frequency distribution.
  32       - dict{ 'edges':      The bin edges, including the rightmost edge.
  33         'upper':      Upper outliers.
  34         'lower':      Lower outliers.
  35         'bincenters': Center of bins. 
  36         'strategy': the histogramming method employed.}
  37     
  38     :Examples:
  39       >>> x = random.rand(100,10)
  40       >>> H, D = histogram(x, bins=10, range=[0,1], normed=True)
  41       >>> H2, D = histogram(x, bins=10, range=[0,1], normed=True, axis=0)
  42     
  43     :SeeAlso: histogramnd
  44     """
  45     weighted = weights is not None
  46     
  47     a = asarray(a)
  48     if axis is None:
  49         a = atleast_1d(a.ravel())
  50         if weighted:
  51             weights = atleast_1d(weights.ravel())
  52         axis = 0 
  53         
  54     # Bin edges.   
  55     if not iterable(bins):
  56         if range is None:
  57             range = (a.min(), a.max())
  58         mn, mx = [mi+0.0 for mi in range]
  59         if mn == mx:
  60             mn -= 0.5
  61             mx += 0.5
  62         edges = linspace(mn, mx, bins+1, endpoint=True)
  63     else:
  64         edges = asarray(bins, float)
  65     
  66     nbin = len(edges)-1
  67     dedges = diff(edges)
  68     bincenters = edges[:-1] + dedges/2.
  69         
  70     # Measure of bin precision.
  71     decimal = int(-log10(dedges.min())+10)
  72     
  73     # Choose the fastest histogramming method
  74     even = (len(set(around(dedges, decimal))) == 1)
  75     if strategy is None:
  76         if even:
  77             strategy = 'binsize'
  78         else:
  79             if nbin > 30: # approximative threshold
  80                 strategy = 'searchsort'
  81             else:
  82                 strategy = 'digitize'
  83     else:
  84         if strategy not in ['binsize', 'digitize', 'searchsort']:
  85             raise 'Unknown histogramming strategy.', strategy
  86         if strategy == 'binsize' and not even:
  87             raise 'This binsize strategy cannot be used for uneven bins.'
  88         
  89     # Parameters for the even functions.
  90     start = float(edges[0])
  91     binwidth = float(dedges[0])
  92     
  93     # For the rightmost bin, we want values equal to the right 
  94     # edge to be counted in the last bin, and not as an outlier. 
  95     # Hence, we shift the last bin by a tiny amount.
  96     if not iterable(bins):
  97         binwidth += pow(10, -decimal)
  98         edges[-1] += pow(10, -decimal)
  99     
 100     # Looping to reduce memory usage
 101     block = 66600 
 102     slices = [slice(None)]*a.ndim
 103     for i in arange(0,len(a),block):
 104         slices[axis] = slice(i,i+block)
 105         at = a[slices]
 106         if weighted:
 107             at = concatenate((at, weights[slices]), axis)        
 108             if strategy == 'binsize':
 109                 count = apply_along_axis(splitinmiddle,axis,at,
 110                     histogram_binsize_weighted,start,binwidth,nbin)               
 111             elif strategy == 'searchsort':
 112                 count = apply_along_axis(splitinmiddle,axis,at, \
 113                         histogram_searchsort_weighted, edges)
 114             elif strategy == 'digitize':
 115                     count = apply_along_axis(splitinmiddle,axis,at,\
 116                         histogram_digitize,edges,decimal,normed)
 117         else:
 118             if strategy == 'binsize':
 119                 count = apply_along_axis(histogram_binsize,axis,at,start,binwidth,nbin)
 120             elif strategy == 'searchsort':
 121                 count = apply_along_axis(histogram_searchsort,axis,at,edges)
 122             elif strategy == 'digitize':
 123                 count = apply_along_axis(histogram_digitize,axis,at,None,edges,
 124                         decimal, normed)
 125                     
 126         if i == 0:
 127             total = count
 128         else:
 129             total += count
 130         
 131     # Outlier count
 132     upper = total.take(array([-1]), axis)
 133     lower = total.take(array([0]), axis)
 134     
 135     # Non-outlier count
 136     core = a.ndim*[slice(None)]
 137     core[axis] = slice(1, -1)
 138     hist = total[core]
 139     
 140     if normed:
 141         normalize = lambda x: atleast_1d(x/(x*dedges).sum())
 142         hist = apply_along_axis(normalize, axis, hist)
 143 
 144     return hist, {'edges':edges, 'lower':lower, 'upper':upper, \
 145         'bincenters':bincenters, 'strategy':strategy}
 146         
 147 
 148 
 149 def histogram_binsize(a, start, width, n):
 150     """histogram_even(a, start, width, n) -> histogram
 151     
 152     Return an histogram where the first bin counts the number of lower
 153     outliers and the last bin the number of upper outliers. Works only with 
 154     fixed width bins. 
 155     
 156     :Parameters:
 157       a : array
 158         Array of samples.
 159       start : float
 160         Left-most bin edge.
 161       width : float
 162         Width of the bins. All bins are considered to have the same width.
 163       n : int
 164         Number of bins. 
 165     
 166     :Return:
 167       H : array
 168         Array containing the number of elements in each bin. H[0] is the number
 169         of samples smaller than start and H[-1] the number of samples 
 170         greater than start + n*width.
 171     """    
 172     ary = asarray(a)
 173     
 174     # Create an array to hold the histogram count results, including outliers.
 175     count = zeros(n+2,dtype=int32)
 176     
 177     # The C++ code that actually does the histogramming.    
 178     code = """
 179            PyArrayIterObject *iter = (PyArrayIterObject*)PyArray_IterNew(py_ary);
 180            
 181            while(iter->index < iter->size)
 182            {
 183                           
 184                // This requires an update to weave 
 185                ary_data_type value = *((ary_data_type*)iter->dataptr);
 186                if (value>=start)
 187                {
 188                    int bin_index = (int)((value-start)/width);
 189                
 190                    //////////////////////////////////////////////////////////
 191                    // Bin counter increment
 192                    //////////////////////////////////////////////////////////
 193     
 194                    // If the value was found, increment the counter for that bin.
 195                    if (bin_index < n)
 196                    {
 197                        count[bin_index+1]++;
 198                    }
 199                    else {
 200                        count[n+1]++; }
 201                }
 202                else {
 203                   count[0]++;}
 204                PyArray_ITER_NEXT(iter);
 205            }
 206            """
 207     weave.inline(code, ['ary', 'start', 'width','n', 'count'], 
 208                  type_converters=converters, 
 209                  compiler='gcc')
 210                  
 211     return count
 212 
 213 
 214 def histogram_binsize_weighted(a, w, start, width, n):
 215     """histogram_even_weighted(a, start, width, n) -> histogram
 216     
 217     Return an histogram where the first bin counts the number of lower
 218     outliers and the last bin the number of upper outliers. Works only with 
 219     fixed width bins. 
 220     
 221     :Parameters:
 222       a : array
 223         Array of samples.
 224       w : array
 225         Weights of samples.
 226       start : float
 227         Left-most bin edge.
 228       width : float
 229         Width of the bins. All bins are considered to have the same width.
 230       n : int
 231         Number of bins. 
 232     
 233     :Return:
 234       H : array
 235         Array containing the number of elements in each bin. H[0] is the number
 236         of samples smaller than start and H[-1] the number of samples 
 237         greater than start + n*width.
 238     """    
 239     ary = asarray(a)
 240     w = asarray(w)
 241     # Create an array to hold the histogram count results, including outliers.
 242     count = zeros(n+2,dtype=w.dtype)
 243     
 244     # The C++ code that actually does the histogramming.    
 245     code = """
 246            PyArrayIterObject *iter = (PyArrayIterObject*)PyArray_IterNew(py_ary);
 247            
 248            while(iter->index < iter->size)
 249            {
 250                           
 251                // This requires an update to weave 
 252                ary_data_type value = *((ary_data_type*)iter->dataptr);
 253                if (value>=start)
 254                {
 255                    int bin_index = (int)((value-start)/width);
 256                
 257                    //////////////////////////////////////////////////////////
 258                    // Bin counter increment
 259                    //////////////////////////////////////////////////////////
 260     
 261                    // If the value was found, increment the counter for that bin.
 262                    if (bin_index < n)
 263                    {
 264                        count[bin_index+1]+=w[iter->index];
 265                    }
 266                    else {
 267                        count[n+1]+=w[iter->index]; }
 268                }
 269                else {
 270                   count[0]+=w[iter->index];}
 271                PyArray_ITER_NEXT(iter);
 272            }
 273            """
 274     weave.inline(code, ['ary', 'w', 'start', 'width','n', 'count'], 
 275                  type_converters=converters, 
 276                  compiler='gcc')
 277                  
 278     return count
 279        
 280 def histogram_searchsort(a, bins):
 281     n = sort(a).searchsorted(bins)
 282     n = concatenate([n, [len(a)]])
 283     count = concatenate([[n[0]], n[1:]-n[:-1]])
 284     return count
 285     
 286 def histogram_searchsort_weighted(a, w, bins):
 287     i = sort(a).searchsorted(bins)
 288     sw = w[argsort(a)]
 289     i = concatenate([i, [len(a)]])
 290     n = concatenate([[0],sw.cumsum()])[i]
 291     count = concatenate([[n[0]], n[1:]-n[:-1]])
 292     return count
 293 
 294 def splitinmiddle(x, function, *args, **kwds):
 295     x1,x2 = hsplit(x, 2)
 296     return function(x1,x2,*args, **kwds)
 297 
 298 def histogram_digitize(a, w, edges, decimal, normed):
 299     """Internal routine to compute the 1d weighted histogram.
 300     a: sample
 301     w: weights
 302     edges: bin edges
 303     decimal: approximation to put values lying on the rightmost edge in the last
 304              bin.
 305     weighted: Means that the weights are appended to array a. 
 306     Return the bin count or frequency if normed.
 307     """
 308     weighted = w is not None
 309     nbin = edges.shape[0]+1
 310     if weighted:
 311         count = zeros(nbin, dtype=w.dtype)
 312         if normed:    
 313             count = zeros(nbin, dtype=float)
 314             w = w/w.mean()
 315     else:
 316         count = zeros(nbin, dtype=int32)
 317             
 318     binindex = digitize(a, edges)
 319         
 320     # Count the number of identical indices.
 321     flatcount = bincount(binindex, w)
 322     
 323     # Place the count in the histogram array.
 324     count[:len(flatcount)] = flatcount
 325        
 326     return count
 327     
 328 
 329 from numpy.testing import *
 330 from numpy.random import rand
 331 
 332 class test_histogram1d_functions(NumpyTestCase):
 333     def check_consistency(self):
 334         n = 100
 335         r = rand(n)*12-1
 336         bins = range(11)
 337         a = histogram_binsize(r, bins[0], bins[1]-bins[0], len(bins)-1)
 338         b = histogram_digitize(r, None, array(bins), 6, False)
 339         c = histogram_searchsort(r,bins)
 340         assert_array_equal(a,b)
 341         assert_array_equal(c,b)
 342         
 343 class test_histogram(NumpyTestCase):
 344     def check_simple(self):
 345         n=100
 346         v=rand(n)
 347         (a,b)=histogram(v)
 348         #check if the sum of the bins equals the number of samples
 349         assert_equal(sum(a,axis=0),n)
 350         #check that the bin counts are evenly spaced when the data is from a linear function
 351         (a,b)=histogram(linspace(0,10,100))
 352         assert_array_equal(a,10)
 353         #Check the construction of the bin array
 354         a, b = histogram(v, bins=4, range=[.2,.8])
 355         assert_array_almost_equal(b['edges'],linspace(.2, .8, 5),8)
 356         #Check the number of outliers
 357         assert_equal((v<.2).sum(), b['lower'])
 358         assert_equal((v>.8).sum(),b['upper'])
 359         #Check the normalization
 360         bins = [0,.5,.75,1]
 361         a,b = histogram(v, bins, normed=True)
 362         assert_almost_equal((a*diff(bins)).sum(), 1)
 363         
 364     def check_axis(self):
 365         n,m = 100,20
 366         v = rand(n,m)
 367         a,b = histogram(v, bins=5)
 368         # Check dimension is reduced (axis=None).
 369         assert_equal(a.ndim, 1)
 370         #Check total number of count is equal to the number of samples.
 371         assert_equal(a.sum(), n*m)
 372         a,b = histogram(v, bins = 7, axis=0)
 373         # Check shape of new array is ok.
 374         assert(a.ndim == 2)
 375         assert_array_equal(a.shape,[7, m])
 376         # Check normalization is consistent 
 377         a,b = histogram(v, bins = 7, axis=0, normed=True)
 378         assert_array_almost_equal((a.T*diff(b['edges'])).sum(1), ones((m)),5)
 379         a,b = histogram(v, bins = 7, axis=1, normed=True)
 380         assert_array_equal(a.shape, [n,7])
 381         assert_array_almost_equal((a*diff(b['edges'])).sum(1), ones((n)))
 382         # Check results are consistent with 1d estimate
 383         a1, b1 = histogram(v[0,:], bins=b['edges'], normed=True)
 384         assert_array_almost_equal(a1, a[0,:],7)
 385             
 386     def check_weights(self):
 387         # Check weights = constant gives the same answer as no weights.
 388         v = rand(100)
 389         w = ones(100)*5
 390         a,b = histogram(v)
 391         na,nb = histogram(v, normed=True)
 392         wa,wb = histogram(v, weights=w)
 393         nwa,nwb = histogram(v, weights=w, normed=True)
 394         assert_array_equal(a*5, wa)
 395         assert_array_almost_equal(na, nwa,8)
 396         # Check weights are properly applied.
 397         v = linspace(0,10,10)
 398         w = concatenate((zeros(5), ones(5)))
 399         wa,wb = histogram(v, bins=linspace(0,10.01, 11),weights=w)
 400         assert_array_almost_equal(wa, w)
 401         
 402     def check_strategies(self):
 403         v = rand(100)
 404         ae,be = histogram(v, strategy='binsize')
 405         ab,bb = histogram(v, strategy='digitize')
 406         as,bs = histogram(v, strategy='searchsort')
 407         assert_array_equal(ae, ab)
 408         assert_array_equal(ae, as)
 409         
 410         w = rand(100)
 411         ae,be = histogram(v, weights=w, strategy='binsize')
 412         ab,bb = histogram(v, weights=w, strategy='digitize')
 413         as,bs = histogram(v, weights=w, strategy='searchsort')
 414         assert_array_almost_equal(ae, ab,8)
 415         assert_array_almost_equal(ae, as,8)
 416         
 417         
 418 def compare_strategies():
 419     """Time each strategy in the case of even bins. 
 420     
 421     The fastest method is the C based function histogram_even, which
 422     however cannot be used for uneven bins. In this case, searchsort is faster 
 423     if there are more than 30 bins, otherwise, digitize should be used. 
 424     """
 425     from time import clock
 426     from numpy.random import randint
 427     import pylab as P
 428     N = 2**arange(12,24)
 429     B = arange(10,150,20)
 430     strategies = ['binsize', 'searchsort', 'digitize']
 431     tn = zeros((len(N), len(strategies)))
 432     tb = zeros((len(B), len(strategies)))
 433     s1 = P.subplot(211)
 434     s2 = P.subplot(212)
 435     for i,s in enumerate(strategies):
 436         for j,n in enumerate(N):
 437             r = randint(0,100, n)
 438             t1 = clock()
 439             a,d = histogram(r,30, strategy=s)
 440             t2 = clock()
 441             tn[j,i]=t2-t1
 442         s1.loglog(N, tn[:,i], '-', marker='o', label=s)
 443         
 444         r = randint(0,100, 1e6)
 445         for j,b in enumerate(B):
 446             t1 = clock()
 447             a,d = histogram(r,b, strategy=s)
 448             t2 = clock()
 449             tb[j,i]=t2-t1
 450         s2.plot(B, tb[:,i], '-', marker='o',label=s)
 451     s1.set_xlabel('Number of random variables')
 452     s1.set_ylabel('Computation time (s)')
 453     s2.set_xlabel('Bin array length')
 454     s2.set_ylabel('Computation time (s)')
 455     P.legend(loc=2)
 456     P.savefig('timing')
 457     P.close()
 458     
 459     
 460 
 461 if __name__ == "__main__":
 462     NumpyTest().run()
New Attachment

Attached Files

To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.
[get | view] (2007-03-15 18:48:51, 16.1 KB) [[attachment:histogram.py]]
[get | view] (2007-03-15 18:49:25, 1.5 KB) [[attachment:typed_array_converter.py]]