Chris@87
|
1 """
|
Chris@87
|
2 A buffered iterator for big arrays.
|
Chris@87
|
3
|
Chris@87
|
4 This module solves the problem of iterating over a big file-based array
|
Chris@87
|
5 without having to read it into memory. The `Arrayterator` class wraps
|
Chris@87
|
6 an array object, and when iterated it will return sub-arrays with at most
|
Chris@87
|
7 a user-specified number of elements.
|
Chris@87
|
8
|
Chris@87
|
9 """
|
Chris@87
|
10 from __future__ import division, absolute_import, print_function
|
Chris@87
|
11
|
Chris@87
|
12 from operator import mul
|
Chris@87
|
13 from functools import reduce
|
Chris@87
|
14
|
Chris@87
|
15 from numpy.compat import long
|
Chris@87
|
16
|
Chris@87
|
17 __all__ = ['Arrayterator']
|
Chris@87
|
18
|
Chris@87
|
19
|
Chris@87
|
class Arrayterator(object):
    """
    Buffered iterator for big arrays.

    `Arrayterator` creates a buffered iterator for reading big arrays in small
    contiguous blocks. The class is useful for objects stored in the
    file system. It allows iteration over the object *without* reading
    everything in memory; instead, small blocks are read and iterated over.

    `Arrayterator` can be used with any object that supports multidimensional
    slices. This includes NumPy arrays, but also variables from
    Scientific.IO.NetCDF or pynetcdf for example.

    Parameters
    ----------
    var : array_like
        The object to iterate over. It must expose a ``shape`` attribute and
        accept a tuple of slices as an index.
    buf_size : int, optional
        The buffer size. If `buf_size` is supplied, the maximum amount of
        data that will be read into memory is `buf_size` elements.
        Default is None, which will read as many elements as possible
        into memory.

    Attributes
    ----------
    var
        The wrapped object.
    buf_size
        Maximum number of elements per block (None or 0 means no limit).
    start
        List with, per axis, the first selected index into `var`.
    stop
        List with, per axis, the exclusive end index into `var`.
    step
        List with, per axis, the stride into `var`.
    shape
    flat

    See Also
    --------
    ndenumerate : Multidimensional array iterator.
    flatiter : Flat array iterator.
    memmap : Create a memory-map to an array stored in a binary file on disk.

    Notes
    -----
    The algorithm works by finding a "running dimension", along which
    the blocks will be extracted. Axes are examined from the last one to the
    first one. If `buf_size` is not larger than the length of the last axis,
    blocks are runs of at most `buf_size` elements along that axis.
    Otherwise the whole last axis is read and the remaining budget
    (`buf_size` floor-divided by the axis length) is applied to the
    preceding axis, and so on. Blocks are extracted along the running
    dimension, and when the last block along it is returned the process
    continues from the next position of the preceding axis, until all
    elements have been read.

    Negative indices and negative steps are not supported by
    ``__getitem__``.

    Examples
    --------
    >>> import numpy as np
    >>> a = np.arange(3 * 4 * 5 * 6).reshape(3, 4, 5, 6)
    >>> a_itor = np.lib.arrayterator.Arrayterator(a, 2)
    >>> a_itor.shape
    (3, 4, 5, 6)

    Now we can iterate over ``a_itor``, and it will return arrays of size
    two. Since `buf_size` is smaller than the length of the last axis, each
    block is a run of two elements along that axis:

    >>> for subarr in a_itor:
    ...     if not subarr.all():
    ...         print(subarr, subarr.shape)
    ...
    [[[[0 1]]]] (1, 1, 1, 2)

    """

    def __init__(self, var, buf_size=None):
        self.var = var
        self.buf_size = buf_size

        # Start out selecting the whole of `var` with unit stride.
        ndim = len(var.shape)
        self.start = [0] * ndim
        self.stop = list(var.shape)
        self.step = [1] * ndim

    def __getattr__(self, attr):
        """
        Delegate unknown attribute lookups to the wrapped object.

        """
        # `__getattr__` is only called when normal lookup fails.  If `var`
        # itself is missing (e.g. on a bare instance created by `copy` or
        # `__new__`), evaluating `self.var` below would recurse without
        # bound, so fail with a regular AttributeError instead.
        if attr == 'var':
            raise AttributeError(attr)
        return getattr(self.var, attr)

    def __getitem__(self, index):
        """
        Return a new arrayterator restricted to `index`.

        Parameters
        ----------
        index : int, slice, Ellipsis or tuple of these
            Selection expressed in the coordinates of this arrayterator.
            Integers keep their axis (with length one).  Negative indices
            and negative steps are not supported.

        Returns
        -------
        out : Arrayterator
            A new object of the same class wrapping the same `var` with the
            same `buf_size`; no data is read.

        """
        # Imported locally so the block does not depend on a module-level
        # import.  `Integral` covers int, Python 2 long and NumPy integer
        # scalars.
        from numbers import Integral

        # Fix index, handling ellipsis and incomplete slices.
        if not isinstance(index, tuple):
            index = (index,)
        fixed = []
        length, dims = len(index), len(self.shape)
        for slice_ in index:
            if slice_ is Ellipsis:
                fixed.extend([slice(None)] * (dims-length+1))
                length = len(fixed)
            elif isinstance(slice_, Integral):
                fixed.append(slice(slice_, slice_+1, 1))
            else:
                fixed.append(slice_)
        index = tuple(fixed)
        if len(index) < dims:
            index += (slice(None),) * (dims-len(index))

        # Return a new arrayterator object.
        out = self.__class__(self.var, self.buf_size)
        for i, (start, stop, step, slice_) in enumerate(
                zip(self.start, self.stop, self.step, index)):
            # Position k of this view is element ``start + k*step`` of
            # `var`, so requested offsets are scaled by the current step.
            out.start[i] = start + (slice_.start or 0) * step
            out.step[i] = step * (slice_.step or 1)
            # Test against None explicitly: a stop of 0 is a valid (empty)
            # selection and must not be mistaken for "no stop given".
            if slice_.stop is None:
                out.stop[i] = stop
            else:
                out.stop[i] = min(stop, start + slice_.stop * step)
        return out

    def __array__(self, dtype=None, copy=None):
        """
        Return corresponding data.

        `dtype` and `copy` are accepted so the method can be called with the
        optional arguments of the array protocol; they are not used here and
        the selection is returned exactly as `var` provides it.

        """
        slice_ = tuple(slice(*t) for t in zip(
                self.start, self.stop, self.step))
        return self.var[slice_]

    @property
    def flat(self):
        """
        A 1-D flat iterator for Arrayterator objects.

        This iterator returns elements of the array to be iterated over in
        `Arrayterator` one by one. It is similar to `flatiter`.

        See Also
        --------
        Arrayterator
        flatiter

        Examples
        --------
        >>> a = np.arange(3 * 4 * 5 * 6).reshape(3, 4, 5, 6)
        >>> a_itor = np.lib.arrayterator.Arrayterator(a, 2)

        >>> for subarr in a_itor.flat:
        ...     if not subarr:
        ...         print(subarr, type(subarr))
        ...
        0 <class 'numpy.int64'>

        """
        for block in self:
            for value in block.flat:
                yield value

    @property
    def shape(self):
        """
        The shape of the array to be iterated over.

        Empty selections are reported with length 0 along the affected axis.

        For an example, see `Arrayterator`.

        """
        # Number of positions in range(start, stop, step), clamped at zero
        # so an empty selection never yields a negative extent.
        return tuple(max(0, (stop-start-1)//step+1) for start, stop, step in
                     zip(self.start, self.stop, self.step))

    def __iter__(self):
        """
        Yield consecutive blocks of `var` covering the current selection.

        Each block is ``var[...]`` indexed with a tuple of slices and holds
        at most `buf_size` elements (or everything when `buf_size` is None
        or 0).

        """
        shape = self.shape

        # Skip arrays with degenerate dimensions.  This is a generator, so
        # finish with a plain ``return``: raising StopIteration here is
        # converted into RuntimeError by PEP 479.
        if any(dim <= 0 for dim in shape):
            return

        start = self.start[:]
        stop = self.stop[:]
        step = self.step[:]
        ndims = len(self.var.shape)
        max_count = self.buf_size or reduce(mul, shape)

        while True:
            count = max_count

            # iterate over each dimension, looking for the
            # running dimension (ie, the dimension along which
            # the blocks will be built from)
            rundim = 0
            for i in range(ndims-1, -1, -1):
                # if count is zero we ran out of elements to read
                # along higher dimensions, so we read only a single position
                if count == 0:
                    stop[i] = start[i]+1
                elif count <= shape[i]:
                    # limit along this dimension
                    stop[i] = start[i] + count*step[i]
                    rundim = i
                else:
                    # read everything along this dimension
                    stop[i] = self.stop[i]
                stop[i] = min(self.stop[i], stop[i])
                count = count//shape[i]

            # yield a block
            slice_ = tuple(slice(*t) for t in zip(start, stop, step))
            yield self.var[slice_]

            # Update start position, taking care of overflow to
            # other dimensions
            start[rundim] = stop[rundim]  # start where we stopped
            for i in range(ndims-1, 0, -1):
                if start[i] >= self.stop[i]:
                    start[i] = self.start[i]
                    start[i-1] += self.step[i-1]
            if start[0] >= self.stop[0]:
                return