1 """ Fast histogram """
2
3 from numpy import array, zeros, asarray, sort, int32, digitize, bincount,\
4 concatenate,ones, atleast_1d, iterable, linspace, diff,log10,around,\
5 arange, apply_along_axis, hsplit, argsort, sum
6 from scipy import weave
7 from typed_array_converter import converters
8
9 def histogram(a, bins=10, range=None, normed=False, weights=None, axis=None, strategy=None):
10 """histogram(a, bins=10, range=None, normed=False, weights=None, axis=None)
11 -> H, dict
12
13 Return the distribution of sample.
14
15 :Parameters:
16 - `a` : Array sample.
17 - `bins` : Number of bins, or an array of bin edges, in which case the
18 range is not used.
19 - `range` : Lower and upper bin edges, default: [min, max].
20 - `normed` :Boolean, if False, return the number of samples in each bin,
21 if True, return the density.
22 - `weights` : Sample weights. The weights are normed only if normed is
23 True. Should weights.sum() not equal len(a), the total bin count
24 will not be equal to the number of samples.
25 - `axis` : Specifies the dimension along which the histogram is computed.
26 Defaults to None, which aggregates the entire sample array.
27 - `strategy` : Histogramming method (binsize, searchsorted or digitize).
28
29 :Return:
30 - `H` : The number of samples in each bin.
31 If normed is True, H is a frequency distribution.
32 - dict{ 'edges': The bin edges, including the rightmost edge.
33 'upper': Upper outliers.
34 'lower': Lower outliers.
35 'bincenters': Center of bins.
36 'strategy': the histogramming method employed.}
37
38 :Examples:
39 >>> x = random.rand(100,10)
40 >>> H, D = histogram(x, bins=10, range=[0,1], normed=True)
41 >>> H2, D = histogram(x, bins=10, range=[0,1], normed=True, axis=0)
42
43 :SeeAlso: histogramnd
44 """
45 weighted = weights is not None
46
47 a = asarray(a)
48 if axis is None:
49 a = atleast_1d(a.ravel())
50 if weighted:
51 weights = atleast_1d(weights.ravel())
52 axis = 0
53
54
55 if not iterable(bins):
56 if range is None:
57 range = (a.min(), a.max())
58 mn, mx = [mi+0.0 for mi in range]
59 if mn == mx:
60 mn -= 0.5
61 mx += 0.5
62 edges = linspace(mn, mx, bins+1, endpoint=True)
63 else:
64 edges = asarray(bins, float)
65
66 nbin = len(edges)-1
67 dedges = diff(edges)
68 bincenters = edges[:-1] + dedges/2.
69
70
71 decimal = int(-log10(dedges.min())+10)
72
73
74 even = (len(set(around(dedges, decimal))) == 1)
75 if strategy is None:
76 if even:
77 strategy = 'binsize'
78 else:
79 if nbin > 30:
80 strategy = 'searchsort'
81 else:
82 strategy = 'digitize'
83 else:
84 if strategy not in ['binsize', 'digitize', 'searchsort']:
85 raise 'Unknown histogramming strategy.', strategy
86 if strategy == 'binsize' and not even:
87 raise 'This binsize strategy cannot be used for uneven bins.'
88
89
90 start = float(edges[0])
91 binwidth = float(dedges[0])
92
93
94
95
96 if not iterable(bins):
97 binwidth += pow(10, -decimal)
98 edges[-1] += pow(10, -decimal)
99
100
101 block = 66600
102 slices = [slice(None)]*a.ndim
103 for i in arange(0,len(a),block):
104 slices[axis] = slice(i,i+block)
105 at = a[slices]
106 if weighted:
107 at = concatenate((at, weights[slices]), axis)
108 if strategy == 'binsize':
109 count = apply_along_axis(splitinmiddle,axis,at,
110 histogram_binsize_weighted,start,binwidth,nbin)
111 elif strategy == 'searchsort':
112 count = apply_along_axis(splitinmiddle,axis,at, \
113 histogram_searchsort_weighted, edges)
114 elif strategy == 'digitize':
115 count = apply_along_axis(splitinmiddle,axis,at,\
116 histogram_digitize,edges,decimal,normed)
117 else:
118 if strategy == 'binsize':
119 count = apply_along_axis(histogram_binsize,axis,at,start,binwidth,nbin)
120 elif strategy == 'searchsort':
121 count = apply_along_axis(histogram_searchsort,axis,at,edges)
122 elif strategy == 'digitize':
123 count = apply_along_axis(histogram_digitize,axis,at,None,edges,
124 decimal, normed)
125
126 if i == 0:
127 total = count
128 else:
129 total += count
130
131
132 upper = total.take(array([-1]), axis)
133 lower = total.take(array([0]), axis)
134
135
136 core = a.ndim*[slice(None)]
137 core[axis] = slice(1, -1)
138 hist = total[core]
139
140 if normed:
141 normalize = lambda x: atleast_1d(x/(x*dedges).sum())
142 hist = apply_along_axis(normalize, axis, hist)
143
144 return hist, {'edges':edges, 'lower':lower, 'upper':upper, \
145 'bincenters':bincenters, 'strategy':strategy}
146
147
148
149 def histogram_binsize(a, start, width, n):
150 """histogram_even(a, start, width, n) -> histogram
151
152 Return an histogram where the first bin counts the number of lower
153 outliers and the last bin the number of upper outliers. Works only with
154 fixed width bins.
155
156 :Parameters:
157 a : array
158 Array of samples.
159 start : float
160 Left-most bin edge.
161 width : float
162 Width of the bins. All bins are considered to have the same width.
163 n : int
164 Number of bins.
165
166 :Return:
167 H : array
168 Array containing the number of elements in each bin. H[0] is the number
169 of samples smaller than start and H[-1] the number of samples
170 greater than start + n*width.
171 """
172 ary = asarray(a)
173
174
175 count = zeros(n+2,dtype=int32)
176
177
178 code = """
179 PyArrayIterObject *iter = (PyArrayIterObject*)PyArray_IterNew(py_ary);
180
181 while(iter->index < iter->size)
182 {
183
184 // This requires an update to weave
185 ary_data_type value = *((ary_data_type*)iter->dataptr);
186 if (value>=start)
187 {
188 int bin_index = (int)((value-start)/width);
189
190 //////////////////////////////////////////////////////////
191 // Bin counter increment
192 //////////////////////////////////////////////////////////
193
194 // If the value was found, increment the counter for that bin.
195 if (bin_index < n)
196 {
197 count[bin_index+1]++;
198 }
199 else {
200 count[n+1]++; }
201 }
202 else {
203 count[0]++;}
204 PyArray_ITER_NEXT(iter);
205 }
206 """
207 weave.inline(code, ['ary', 'start', 'width','n', 'count'],
208 type_converters=converters,
209 compiler='gcc')
210
211 return count
212
213
214 def histogram_binsize_weighted(a, w, start, width, n):
215 """histogram_even_weighted(a, start, width, n) -> histogram
216
217 Return an histogram where the first bin counts the number of lower
218 outliers and the last bin the number of upper outliers. Works only with
219 fixed width bins.
220
221 :Parameters:
222 a : array
223 Array of samples.
224 w : array
225 Weights of samples.
226 start : float
227 Left-most bin edge.
228 width : float
229 Width of the bins. All bins are considered to have the same width.
230 n : int
231 Number of bins.
232
233 :Return:
234 H : array
235 Array containing the number of elements in each bin. H[0] is the number
236 of samples smaller than start and H[-1] the number of samples
237 greater than start + n*width.
238 """
239 ary = asarray(a)
240 w = asarray(w)
241
242 count = zeros(n+2,dtype=w.dtype)
243
244
245 code = """
246 PyArrayIterObject *iter = (PyArrayIterObject*)PyArray_IterNew(py_ary);
247
248 while(iter->index < iter->size)
249 {
250
251 // This requires an update to weave
252 ary_data_type value = *((ary_data_type*)iter->dataptr);
253 if (value>=start)
254 {
255 int bin_index = (int)((value-start)/width);
256
257 //////////////////////////////////////////////////////////
258 // Bin counter increment
259 //////////////////////////////////////////////////////////
260
261 // If the value was found, increment the counter for that bin.
262 if (bin_index < n)
263 {
264 count[bin_index+1]+=w[iter->index];
265 }
266 else {
267 count[n+1]+=w[iter->index]; }
268 }
269 else {
270 count[0]+=w[iter->index];}
271 PyArray_ITER_NEXT(iter);
272 }
273 """
274 weave.inline(code, ['ary', 'w', 'start', 'width','n', 'count'],
275 type_converters=converters,
276 compiler='gcc')
277
278 return count
279
280 def histogram_searchsort(a, bins):
281 n = sort(a).searchsorted(bins)
282 n = concatenate([n, [len(a)]])
283 count = concatenate([[n[0]], n[1:]-n[:-1]])
284 return count
285
286 def histogram_searchsort_weighted(a, w, bins):
287 i = sort(a).searchsorted(bins)
288 sw = w[argsort(a)]
289 i = concatenate([i, [len(a)]])
290 n = concatenate([[0],sw.cumsum()])[i]
291 count = concatenate([[n[0]], n[1:]-n[:-1]])
292 return count
293
294 def splitinmiddle(x, function, *args, **kwds):
295 x1,x2 = hsplit(x, 2)
296 return function(x1,x2,*args, **kwds)
297
298 def histogram_digitize(a, w, edges, decimal, normed):
299 """Internal routine to compute the 1d weighted histogram.
300 a: sample
301 w: weights
302 edges: bin edges
303 decimal: approximation to put values lying on the rightmost edge in the last
304 bin.
305 weighted: Means that the weights are appended to array a.
306 Return the bin count or frequency if normed.
307 """
308 weighted = w is not None
309 nbin = edges.shape[0]+1
310 if weighted:
311 count = zeros(nbin, dtype=w.dtype)
312 if normed:
313 count = zeros(nbin, dtype=float)
314 w = w/w.mean()
315 else:
316 count = zeros(nbin, dtype=int32)
317
318 binindex = digitize(a, edges)
319
320
321 flatcount = bincount(binindex, w)
322
323
324 count[:len(flatcount)] = flatcount
325
326 return count
327
328
329 from numpy.testing import *
330 from numpy.random import rand
331
332 class test_histogram1d_functions(NumpyTestCase):
333 def check_consistency(self):
334 n = 100
335 r = rand(n)*12-1
336 bins = range(11)
337 a = histogram_binsize(r, bins[0], bins[1]-bins[0], len(bins)-1)
338 b = histogram_digitize(r, None, array(bins), 6, False)
339 c = histogram_searchsort(r,bins)
340 assert_array_equal(a,b)
341 assert_array_equal(c,b)
342
343 class test_histogram(NumpyTestCase):
344 def check_simple(self):
345 n=100
346 v=rand(n)
347 (a,b)=histogram(v)
348
349 assert_equal(sum(a,axis=0),n)
350
351 (a,b)=histogram(linspace(0,10,100))
352 assert_array_equal(a,10)
353
354 a, b = histogram(v, bins=4, range=[.2,.8])
355 assert_array_almost_equal(b['edges'],linspace(.2, .8, 5),8)
356
357 assert_equal((v<.2).sum(), b['lower'])
358 assert_equal((v>.8).sum(),b['upper'])
359
360 bins = [0,.5,.75,1]
361 a,b = histogram(v, bins, normed=True)
362 assert_almost_equal((a*diff(bins)).sum(), 1)
363
364 def check_axis(self):
365 n,m = 100,20
366 v = rand(n,m)
367 a,b = histogram(v, bins=5)
368
369 assert_equal(a.ndim, 1)
370
371 assert_equal(a.sum(), n*m)
372 a,b = histogram(v, bins = 7, axis=0)
373
374 assert(a.ndim == 2)
375 assert_array_equal(a.shape,[7, m])
376
377 a,b = histogram(v, bins = 7, axis=0, normed=True)
378 assert_array_almost_equal((a.T*diff(b['edges'])).sum(1), ones((m)),5)
379 a,b = histogram(v, bins = 7, axis=1, normed=True)
380 assert_array_equal(a.shape, [n,7])
381 assert_array_almost_equal((a*diff(b['edges'])).sum(1), ones((n)))
382
383 a1, b1 = histogram(v[0,:], bins=b['edges'], normed=True)
384 assert_array_almost_equal(a1, a[0,:],7)
385
386 def check_weights(self):
387
388 v = rand(100)
389 w = ones(100)*5
390 a,b = histogram(v)
391 na,nb = histogram(v, normed=True)
392 wa,wb = histogram(v, weights=w)
393 nwa,nwb = histogram(v, weights=w, normed=True)
394 assert_array_equal(a*5, wa)
395 assert_array_almost_equal(na, nwa,8)
396
397 v = linspace(0,10,10)
398 w = concatenate((zeros(5), ones(5)))
399 wa,wb = histogram(v, bins=linspace(0,10.01, 11),weights=w)
400 assert_array_almost_equal(wa, w)
401
402 def check_strategies(self):
403 v = rand(100)
404 ae,be = histogram(v, strategy='binsize')
405 ab,bb = histogram(v, strategy='digitize')
406 as,bs = histogram(v, strategy='searchsort')
407 assert_array_equal(ae, ab)
408 assert_array_equal(ae, as)
409
410 w = rand(100)
411 ae,be = histogram(v, weights=w, strategy='binsize')
412 ab,bb = histogram(v, weights=w, strategy='digitize')
413 as,bs = histogram(v, weights=w, strategy='searchsort')
414 assert_array_almost_equal(ae, ab,8)
415 assert_array_almost_equal(ae, as,8)
416
417
418 def compare_strategies():
419 """Time each strategy in the case of even bins.
420
421 The fastest method is the C based function histogram_even, which
422 however cannot be used for uneven bins. In this case, searchsort is faster
423 if there are more than 30 bins, otherwise, digitize should be used.
424 """
425 from time import clock
426 from numpy.random import randint
427 import pylab as P
428 N = 2**arange(12,24)
429 B = arange(10,150,20)
430 strategies = ['binsize', 'searchsort', 'digitize']
431 tn = zeros((len(N), len(strategies)))
432 tb = zeros((len(B), len(strategies)))
433 s1 = P.subplot(211)
434 s2 = P.subplot(212)
435 for i,s in enumerate(strategies):
436 for j,n in enumerate(N):
437 r = randint(0,100, n)
438 t1 = clock()
439 a,d = histogram(r,30, strategy=s)
440 t2 = clock()
441 tn[j,i]=t2-t1
442 s1.loglog(N, tn[:,i], '-', marker='o', label=s)
443
444 r = randint(0,100, 1e6)
445 for j,b in enumerate(B):
446 t1 = clock()
447 a,d = histogram(r,b, strategy=s)
448 t2 = clock()
449 tb[j,i]=t2-t1
450 s2.plot(B, tb[:,i], '-', marker='o',label=s)
451 s1.set_xlabel('Number of random variables')
452 s1.set_ylabel('Computation time (s)')
453 s2.set_xlabel('Bin array length')
454 s2.set_ylabel('Computation time (s)')
455 P.legend(loc=2)
456 P.savefig('timing')
457 P.close()
458
459
460
461 if __name__ == "__main__":
462 NumpyTest().run()