Chris@87: """
Chris@87: A buffered iterator for big arrays.
Chris@87: 
Chris@87: This module solves the problem of iterating over a big file-based array
Chris@87: without having to read it into memory. The `Arrayterator` class wraps
Chris@87: an array object, and when iterated it will return sub-arrays with at most
Chris@87: a user-specified number of elements.
Chris@87: 
Chris@87: """
Chris@87: from __future__ import division, absolute_import, print_function
Chris@87: 
Chris@87: from operator import mul
Chris@87: from functools import reduce
Chris@87: 
Chris@87: from numpy.compat import long
Chris@87: 
Chris@87: __all__ = ['Arrayterator']
Chris@87: 
Chris@87: 
class Arrayterator(object):
    """
    Buffered iterator for big arrays.

    `Arrayterator` creates a buffered iterator for reading big arrays in small
    contiguous blocks. The class is useful for objects stored in the
    file system. It allows iteration over the object *without* reading
    everything in memory; instead, small blocks are read and iterated over.

    `Arrayterator` can be used with any object that supports multidimensional
    slices. This includes NumPy arrays, but also variables from
    Scientific.IO.NetCDF or pynetcdf for example.

    Parameters
    ----------
    var : array_like
        The object to iterate over. It must expose a ``shape`` attribute
        and accept a tuple of slices as an index.
    buf_size : int, optional
        The buffer size. If `buf_size` is supplied, the maximum amount of
        data that will be read into memory is `buf_size` elements.
        Default is None, which will read as many elements as possible
        into memory.

    Attributes
    ----------
    var
        The wrapped object.
    buf_size
        Maximum number of elements per block (or None).
    start
        List with the first index of the selected window, one per axis.
    stop
        List with the exclusive end index of the window, one per axis.
    step
        List with the stride of the window, one per axis.
    shape
    flat

    See Also
    --------
    ndenumerate : Multidimensional array iterator.
    flatiter : Flat array iterator.
    memmap : Create a memory-map to an array stored in a binary file on disk.

    Notes
    -----
    The algorithm works by first finding a "running dimension", along which
    the blocks will be extracted. The axes are scanned from the last one to
    the first one while keeping track of how many elements still fit in the
    buffer: trailing axes that fit completely are read in full, the first
    axis that does not fit completely becomes the running dimension and is
    read in chunks, and every axis before it is read one position at a time.
    When the last block along the running dimension has been returned, the
    position overflows into the preceding axis, until all elements have
    been read.

    Any attribute that is not defined on the `Arrayterator` itself is looked
    up on the wrapped object.

    Examples
    --------
    >>> import numpy as np
    >>> a = np.arange(3 * 4 * 5 * 6).reshape(3, 4, 5, 6)
    >>> a_itor = np.lib.arrayterator.Arrayterator(a, 2)
    >>> a_itor.shape
    (3, 4, 5, 6)

    Now we can iterate over ``a_itor``, and it will return arrays of size
    two. Since `buf_size` was smaller than any dimension, the blocks are
    taken along the last dimension:

    >>> for subarr in a_itor:
    ...     if not subarr.all():
    ...         print(subarr, subarr.shape)
    ...
    [[[[0 1]]]] (1, 1, 1, 2)

    """

    def __init__(self, var, buf_size=None):
        self.var = var
        self.buf_size = buf_size

        # The initial window covers the whole of `var`.
        self.stop = list(var.shape)
        self.start = [0] * len(self.stop)
        self.step = [1] * len(self.stop)

    def __getattr__(self, attr):
        """
        Delegate unknown attributes to the wrapped object.

        Raises
        ------
        AttributeError
            If `attr` is missing on the wrapped object, or if the wrapped
            object itself has not been set yet.

        """
        # `__getattr__` is only called when normal lookup failed. If that
        # happens for 'var' itself (e.g. on an instance created with
        # `__new__` during copying or unpickling), delegating to `self.var`
        # would recurse forever.
        if attr == 'var':
            raise AttributeError(attr)
        return getattr(self.var, attr)

    def __getitem__(self, index):
        """
        Return a new arrayterator restricted to `index`.

        Parameters
        ----------
        index : int, slice, Ellipsis or tuple of those
            Selection relative to the current window. Integers select a
            single position but keep the axis (with length one). Missing
            trailing axes are selected in full.

        Returns
        -------
        out : Arrayterator
            A new object of the same class wrapping the same `var` with the
            same `buf_size`; no data is read.

        Notes
        -----
        Negative indices are not normalized, and a slice ``start``/``stop``
        of 0 is treated like an omitted bound.

        """
        # Local import: `Integral` also matches NumPy integer scalars, which
        # a plain `int` check would miss.
        from numbers import Integral

        # Fix index, handling ellipsis and incomplete slices.
        if not isinstance(index, tuple):
            index = (index,)
        fixed = []
        length, dims = len(index), len(self.shape)
        for slice_ in index:
            if slice_ is Ellipsis:
                # Expand the ellipsis into as many full slices as needed.
                fixed.extend([slice(None)] * (dims-length+1))
                length = len(fixed)
            elif isinstance(slice_, Integral):
                position = int(slice_)
                fixed.append(slice(position, position+1, 1))
            else:
                fixed.append(slice_)
        index = tuple(fixed)
        if len(index) < dims:
            index += (slice(None),) * (dims-len(index))

        # Return a new arrayterator object.
        out = self.__class__(self.var, self.buf_size)
        for i, (start, stop, step, slice_) in enumerate(
                zip(self.start, self.stop, self.step, index)):
            out.start[i] = start + (slice_.start or 0)
            out.step[i] = step * (slice_.step or 1)
            out.stop[i] = start + (slice_.stop or stop-start)
            # Never reach beyond the window of the parent.
            out.stop[i] = min(stop, out.stop[i])
        return out

    def __array__(self):
        """
        Return corresponding data.

        The whole current window is read from `var` with a single index
        operation, regardless of `buf_size`.

        """
        slice_ = tuple(slice(*t) for t in zip(
                self.start, self.stop, self.step))
        return self.var[slice_]

    @property
    def flat(self):
        """
        A 1-D flat iterator for Arrayterator objects.

        This iterator returns elements of the array to be iterated over in
        `Arrayterator` one by one. It is similar to `flatiter`.

        See Also
        --------
        `Arrayterator`
        flatiter

        Examples
        --------
        >>> a = np.arange(3 * 4 * 5 * 6).reshape(3, 4, 5, 6)
        >>> a_itor = np.lib.arrayterator.Arrayterator(a, 2)

        >>> for subarr in a_itor.flat:
        ...     if not subarr:
        ...         print(subarr)
        ...
        0

        """
        for block in self:
            for value in block.flat:
                yield value

    @property
    def shape(self):
        """
        The shape of the array to be iterated over.

        For an example, see `Arrayterator`.

        """
        return tuple(((stop-start-1)//step+1) for start, stop, step in
                zip(self.start, self.stop, self.step))

    def __iter__(self):
        """
        Yield the selected window block by block.

        Each block is the result of indexing `var` with a tuple of slices.
        Nothing is yielded when any dimension of `shape` is not positive.

        """
        # The window does not change while iterating, so compute it once.
        shape = self.shape

        # Skip arrays with degenerate dimensions. A plain `return` ends the
        # generator; raising StopIteration here would be turned into a
        # RuntimeError by PEP 479.
        if any(dim <= 0 for dim in shape):
            return

        start = self.start[:]
        stop = self.stop[:]
        step = self.step[:]
        ndims = len(self.var.shape)
        budget = self.buf_size or reduce(mul, shape)

        while True:
            count = budget

            # iterate over each dimension, looking for the
            # running dimension (ie, the dimension along which
            # the blocks will be built from)
            rundim = 0
            for i in range(ndims-1, -1, -1):
                # if count is zero we ran out of elements to read
                # along higher dimensions, so we read only a single position
                if count == 0:
                    stop[i] = start[i]+1
                elif count <= shape[i]:
                    # limit along this dimension
                    stop[i] = start[i] + count*step[i]
                    rundim = i
                else:
                    # read everything along this dimension
                    stop[i] = self.stop[i]
                stop[i] = min(self.stop[i], stop[i])
                count = count//shape[i]

            # yield a block
            slice_ = tuple(slice(*t) for t in zip(start, stop, step))
            yield self.var[slice_]

            # Update start position, taking care of overflow to
            # other dimensions
            start[rundim] = stop[rundim]  # start where we stopped
            for i in range(ndims-1, 0, -1):
                if start[i] >= self.stop[i]:
                    start[i] = self.start[i]
                    start[i-1] += self.step[i-1]
            if start[0] >= self.stop[0]:
                return